diff --git a/.gitignore b/.gitignore
index 305e025f..b41e10c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ build
 __pycache__
 .hypothesis
 .vscode
+*.log
\ No newline at end of file
diff --git a/cgra/CgraRTL.py b/cgra/CgraRTL.py
index 87777384..c9dffcea 100644
--- a/cgra/CgraRTL.py
+++ b/cgra/CgraRTL.py
@@ -138,8 +138,8 @@ def construct(s, CgraPayloadType,
     s.data_mem.address_upper //= s.address_upper
 
     # Connects data memory with controller.
-    s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request
-    s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request
+    s.data_mem.recv_from_noc_load_request //= s.controller.send_to_sram_load_request_from_noc
+    s.data_mem.recv_from_noc_store_request //= s.controller.send_to_sram_store_request_from_noc
     s.data_mem.recv_from_noc_load_response_pkt //= s.controller.send_to_tile_load_response
     s.data_mem.send_to_noc_load_request_pkt //= s.controller.recv_from_tile_load_request_pkt
     s.data_mem.send_to_noc_load_response_pkt //= s.controller.recv_from_tile_load_response_pkt
diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py
index 00788487..986efe06 100644
--- a/cgra/CgraTemplateRTL.py
+++ b/cgra/CgraTemplateRTL.py
@@ -83,7 +83,10 @@ def construct(s, CgraPayloadType,
                 provided_max_per_cgra_rows = None,
                 provided_max_per_cgra_cols = None,
                 provided_max_num_rd_tiles = None,
-                provided_max_num_wr_tiles = None):
+                provided_max_num_wr_tiles = None,
+                has_dma_ports = False,
+                DmaDataType = mk_dma_data(),
+                DmaCmdType = mk_dma_cmd()):
     """
     provided_max_per_cgra_rows: the row number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch.
     provided_max_per_cgra_cols: the column number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch.
@@ -126,6 +129,14 @@ def construct(s, CgraPayloadType,
     CtrlRingPos = mk_ring_pos(max_num_tiles + 1)
     CtrlAddrType = mk_bits(clog2(ctrl_mem_size))
     DataAddrType = mk_bits(clog2(data_mem_size_global))
+    DmaTagType = DmaCmdType.get_field_type(kAttrDmaTag)
+    DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData)
+    DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr)
+    DmaDoneType = mk_dma_done(DmaTagType.nbits)
+    DmaSpmWriteReqType = mk_dma_spm_write_req(DmaSpmAddrType.nbits,
+                                              DmaSpmDataType.nbits)
+    DmaSpmReadReqType = mk_dma_spm_read_req(DmaSpmAddrType.nbits)
+    DmaSpmReadRespType = mk_dma_spm_read_resp(DmaSpmDataType.nbits)
     assert(data_mem_size_per_bank * num_banks_per_cgra <= \
            data_mem_size_global)
 
@@ -135,6 +146,21 @@ def construct(s, CgraPayloadType,
     s.recv_from_inter_cgra_noc = RecvIfcRTL(NocPktType)
     s.send_to_inter_cgra_noc = SendIfcRTL(NocPktType)
 
+    # Optional DMA engine-facing ports. The controller owns command decode and
+    # forwards DMA SPM access to the data memory.
+    if has_dma_ports:
+      s.dma_cmd = SendIfcRTL(DmaCmdType)
+
+      s.dma_done = RecvIfcRTL(DmaDoneType)
+
+      # Receives SPM write requests from the DMA.
+      s.recv_from_dma_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType)
+      # Receives SPM read requests from the DMA.
+      s.recv_from_dma_spm_rd_req  = RecvIfcRTL(DmaSpmReadReqType)
+      # Sends SPM read responses to the DMA.
+      s.send_to_dma_spm_rd_resp   = SendIfcRTL(DmaSpmReadRespType)
+
+
     if is_multi_cgra:
       # Use the largest CGRA shape to set the boundary ports for compatibility in the case of heterogeneous multi-cgra.
       # Remember to ground the remaining boundary ports of the current CGRA when the current CGRA has fewer rows or columns than the largest CGRA.
@@ -168,11 +194,17 @@ def construct(s, CgraPayloadType,
                                       multi_cgra_columns,
                                       max_num_tiles,
                                       mem_access_is_combinational,
-                                      idTo2d_map)
+                                      idTo2d_map,
+                                      has_dma_ports,
+                                      DmaCmdType,
+                                      DmaDataType)
     s.cgra_id = InPort(CgraIdType)
     s.controller = ControllerRTL(NocPktType,
                                   multi_cgra_rows, multi_cgra_columns,
-                                  max_num_tiles, controller2addr_map, idTo2d_map)
+                                  max_num_tiles, controller2addr_map, idTo2d_map,
+                                  has_dma_ports,
+                                  DmaDataType,
+                                  DmaCmdType)
     # Connects controller id.
     s.controller.cgra_id //= s.cgra_id
     # An additional router for controller to receive CMD_COMPLETE signal from Ring to CPU.
@@ -190,9 +222,35 @@ def construct(s, CgraPayloadType,
     s.data_mem.address_lower //= s.address_lower
     s.data_mem.address_upper //= s.address_upper
 
+    if has_dma_ports:
+      # CPU packets are decoded by the controller before becoming DMA commands.
+      s.dma_cmd  //= s.controller.dma_cmd
+      s.dma_done //= s.controller.dma_done
+
+      s.recv_from_dma_spm_wr_req //= s.controller.recv_from_dma_spm_wr_req
+      s.recv_from_dma_spm_rd_req  //= s.controller.recv_from_dma_spm_rd_req
+      s.send_to_dma_spm_rd_resp   //= s.controller.send_to_dma_spm_rd_resp
+
+    else:
+      # Grounds the DMA ports when no DMA engine is attached.
+      s.controller.dma_cmd.rdy //= 0
+      s.controller.dma_done.val //= 0
+      s.controller.dma_done.msg //= DmaDoneType()
+
+      s.controller.recv_from_dma_spm_wr_req.val //= 0
+      s.controller.recv_from_dma_spm_wr_req.msg //= DmaSpmWriteReqType()
+      s.controller.recv_from_dma_spm_rd_req.val //= 0
+      s.controller.recv_from_dma_spm_rd_req.msg //= DmaSpmReadReqType()
+      s.controller.send_to_dma_spm_rd_resp.rdy //= 0
+
+    # Connects the controller's DMA SPM access path to the data memory (SPM).
+    s.controller.send_to_sram_store_request_from_dma   //= s.data_mem.recv_from_controller_spm_wr_req
+    s.controller.send_to_sram_load_request_from_dma    //= s.data_mem.recv_from_controller_spm_rd_req
+    s.controller.recv_from_sram_load_response //= s.data_mem.send_to_controller_spm_rd_resp
+    
     # Connects data memory with controller.
-    s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request
-    s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request
+    s.data_mem.recv_from_noc_load_request //= s.controller.send_to_sram_load_request_from_noc
+    s.data_mem.recv_from_noc_store_request //= s.controller.send_to_sram_store_request_from_noc
     s.data_mem.recv_from_noc_load_response_pkt //= s.controller.send_to_tile_load_response
     s.data_mem.send_to_noc_load_request_pkt //= s.controller.recv_from_tile_load_request_pkt
     s.data_mem.send_to_noc_load_response_pkt //= s.controller.recv_from_tile_load_response_pkt
diff --git a/cgra/CgraWithContextSwitchRTL.py b/cgra/CgraWithContextSwitchRTL.py
index 361c0a9b..47bf8478 100644
--- a/cgra/CgraWithContextSwitchRTL.py
+++ b/cgra/CgraWithContextSwitchRTL.py
@@ -131,8 +131,8 @@ def construct(s, CgraPayloadType,
     s.data_mem.address_upper //= s.address_upper
 
     # Connects data memory with controller.
-    s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request
-    s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request
+    s.data_mem.recv_from_noc_load_request //= s.controller.send_to_sram_load_request_from_noc
+    s.data_mem.recv_from_noc_store_request //= s.controller.send_to_sram_store_request_from_noc
     s.data_mem.recv_from_noc_load_response_pkt //= s.controller.send_to_tile_load_response
     s.data_mem.send_to_noc_load_request_pkt //= s.controller.recv_from_tile_load_request_pkt
     s.data_mem.send_to_noc_load_response_pkt //= s.controller.recv_from_tile_load_response_pkt
diff --git a/cgra/CgraWithStreamingLoadRTL.py b/cgra/CgraWithStreamingLoadRTL.py
index 6e7dcbf5..e7b6b64e 100644
--- a/cgra/CgraWithStreamingLoadRTL.py
+++ b/cgra/CgraWithStreamingLoadRTL.py
@@ -138,8 +138,8 @@ def construct(s, CgraPayloadType,
     s.data_mem.address_upper //= s.address_upper
 
     # Connects data memory with controller.
-    s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request
-    s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request
+    s.data_mem.recv_from_noc_load_request //= s.controller.send_to_sram_load_request_from_noc
+    s.data_mem.recv_from_noc_store_request //= s.controller.send_to_sram_store_request_from_noc
     s.data_mem.recv_from_noc_load_response_pkt //= s.controller.send_to_tile_load_response
     s.data_mem.send_to_noc_load_request_pkt //= s.controller.recv_from_tile_load_request_pkt
     s.data_mem.send_to_noc_load_response_pkt //= s.controller.recv_from_tile_load_response_pkt
diff --git a/cgra/IntegratedCgraWithDmaRTL.py b/cgra/IntegratedCgraWithDmaRTL.py
new file mode 100644
index 00000000..0aafa364
--- /dev/null
+++ b/cgra/IntegratedCgraWithDmaRTL.py
@@ -0,0 +1,194 @@
+"""
+=========================================================================
+IntegratedCgraWithDmaRTL.py
+=========================================================================
+
+Wrapper that composes a CGRA template with a DMA engine attached to the
+CGRA data SPM.
+"""
+
+from pymtl3 import *
+
+from .CgraTemplateRTL import CgraTemplateRTL
+from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL
+from ..lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL
+from ..lib.messages import *
+from ..lib.util.data_struct_attr import *
+from ..mem.dma.DmaEngineRTL import DmaEngineRTL
+
+
+class IntegratedCgraWithDmaRTL( Component ):
+  """
+  IntegratedCgraWithDmaRTL is a top-level wrapper that integrates a CGRA instance with a
+  DMA engine.
+
+  Architectural Design:
+  - It instantiates a standard CGRA template (`CgraTemplateRTL`) and a
+    DMA engine (`DmaEngineRTL`).
+  - CPU control packets are passed through to the CGRA's controller.
+    DMA commands are decoded there.
+  - The DMA engine accesses the CGRA's internal data SPM through controller-
+    forwarded ports; it is not connected directly to `DataMemControllerRTL`.
+  - External memory requests from the DMA engine are exposed at the top level
+    to be connected to a DRAM model or an AXI adapter.
+  - Boundary data ports for multi-CGRA configurations are also passed through
+    if enabled.
+  """
+
+  def construct(s, CgraPayloadType,
+                multi_cgra_rows,
+                multi_cgra_columns,
+                per_cgra_rows, per_cgra_columns,
+                ctrl_mem_size, data_mem_size_global,
+                data_mem_size_per_bank, num_banks_per_cgra,
+                num_registers_per_reg_bank, num_ctrl,
+                total_steps, mem_access_is_combinational,
+                FunctionUnit, FuList, TileList, LinkList,
+                dataSPM, controller2addr_map, idTo2d_map,
+                is_multi_cgra = True, cgra_id = 0,
+                # For heterogeneous multi-CGRA support. TODO: consider removing these from IntegratedCgraWithDmaRTL for simplicity.
+                provided_max_per_cgra_rows = None,
+                provided_max_per_cgra_cols = None,
+                provided_max_num_rd_tiles = None,
+                provided_max_num_wr_tiles = None):
+
+    DataType = CgraPayloadType.get_field_type(kAttrData)
+    data_bitwidth = DataType.get_field_type(kAttrPayload).nbits
+    assert data_bitwidth == 32
+
+    max_per_cgra_rows = provided_max_per_cgra_rows if provided_max_per_cgra_rows is not None else per_cgra_rows
+    max_per_cgra_cols = provided_max_per_cgra_cols if provided_max_per_cgra_cols is not None else per_cgra_columns
+    max_num_tiles = max_per_cgra_rows * max_per_cgra_cols
+    max_num_rd_tiles = provided_max_num_rd_tiles if provided_max_num_rd_tiles is not None else dataSPM.getNumOfValidReadPorts()
+
+    CtrlPktType = mk_intra_cgra_pkt(multi_cgra_columns, multi_cgra_rows,
+                                    max_num_tiles, CgraPayloadType)
+    NocPktType = mk_inter_cgra_pkt(multi_cgra_columns, multi_cgra_rows,
+                                   max_num_tiles, max_num_rd_tiles,
+                                   CgraPayloadType)
+
+    CgraIdType = mk_cgra_id_type(multi_cgra_columns, multi_cgra_rows)
+    DataAddrType = mk_bits(clog2(data_mem_size_global))
+    DmaCmdType = mk_dma_cmd(dram_addr_nbits = 64,
+                            spm_addr_nbits = 32,
+                            bytes_nbits = 32,
+                            tag_nbits = 8)
+
+    DmaDataType = mk_dma_data(dram_data_nbits = 128,
+                              dram_mask_nbits = 16,
+                              spm_data_nbits = 32)
+
+    DmaDramAddrType = DmaCmdType.get_field_type(kAttrDramAddr)
+    DmaMemDataType  = DmaDataType.get_field_type(kAttrDramData)
+    DmaMemMaskType  = DmaDataType.get_field_type(kAttrDramMask)
+    DmaDramWrReqType = mk_dma_dram_wr_req(DmaDramAddrType.nbits, DmaMemDataType.nbits, DmaMemMaskType.nbits)
+
+    # Existing CGRA-facing interfaces.
+    # CGRA <-> CPU
+    s.recv_from_cpu_pkt = RecvIfcRTL(CtrlPktType)
+    s.send_to_cpu_pkt = SendIfcRTL(CtrlPktType)
+
+    if is_multi_cgra:
+      s.recv_from_inter_cgra_noc = RecvIfcRTL(NocPktType)
+      s.send_to_inter_cgra_noc = SendIfcRTL(NocPktType)
+
+      s.recv_data_on_boundary_north = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_cols)]
+      s.send_data_on_boundary_north = [SendIfcRTL(DataType) for _ in range(max_per_cgra_cols)]
+      s.recv_data_on_boundary_south = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_cols)]
+      s.send_data_on_boundary_south = [SendIfcRTL(DataType) for _ in range(max_per_cgra_cols)]
+      s.recv_data_on_boundary_west  = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_rows)]
+      s.send_data_on_boundary_west  = [SendIfcRTL(DataType) for _ in range(max_per_cgra_rows)]
+      s.recv_data_on_boundary_east  = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_rows)]
+      s.send_data_on_boundary_east  = [SendIfcRTL(DataType) for _ in range(max_per_cgra_rows)]
+
+    s.cgra_id = InPort(CgraIdType)
+    # The local address range of current CGRA.
+    # Any address out of this range will be assumed as remote address.
+    s.address_lower = InPort(DataAddrType)
+    s.address_upper = InPort(DataAddrType)
+
+    # Abstract external DRAM interfaces used by the internal DMA engine.
+
+    s.send_to_dram_rd_req = SendIfcRTL(DmaDramAddrType)
+    s.recv_from_dram_rd_resp = RecvIfcRTL(DmaMemDataType)
+
+    s.send_to_dram_wr_req = SendIfcRTL(DmaDramWrReqType)
+    s.recv_from_dram_wr_resp = RecvIfcRTL(mk_bits(1))
+
+    # Components.
+
+    s.cgra = CgraTemplateRTL(CgraPayloadType,
+                             multi_cgra_rows,
+                             multi_cgra_columns,
+                             per_cgra_rows, per_cgra_columns,
+                             ctrl_mem_size, data_mem_size_global,
+                             data_mem_size_per_bank, num_banks_per_cgra,
+                             num_registers_per_reg_bank, num_ctrl,
+                             total_steps, mem_access_is_combinational,
+                             FunctionUnit, FuList, TileList, LinkList,
+                             dataSPM, controller2addr_map, idTo2d_map,
+                             is_multi_cgra, cgra_id,
+                             provided_max_per_cgra_rows,
+                             provided_max_per_cgra_cols,
+                             provided_max_num_rd_tiles,
+                             provided_max_num_wr_tiles,
+                             has_dma_ports = True,
+                             DmaDataType = DmaDataType,
+                             DmaCmdType = DmaCmdType)
+
+    DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData)
+    DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr)
+    DmaBytesType = DmaCmdType.get_field_type(kAttrNBytes)
+    DmaTagType = DmaCmdType.get_field_type(kAttrDmaTag)
+    s.dma = DmaEngineRTL(spm_data_nbits = DmaSpmDataType.nbits,
+                         dram_data_nbits = DmaMemDataType.nbits,
+                         dram_addr_nbits = DmaDramAddrType.nbits,
+                         spm_addr_nbits = DmaSpmAddrType.nbits,
+                         bytes_nbits = DmaBytesType.nbits,
+                         tag_nbits = DmaTagType.nbits)
+
+    # CGRA passthrough connections.
+
+    s.recv_from_cpu_pkt //= s.cgra.recv_from_cpu_pkt
+    s.send_to_cpu_pkt //= s.cgra.send_to_cpu_pkt
+
+    if is_multi_cgra:
+      s.recv_from_inter_cgra_noc //= s.cgra.recv_from_inter_cgra_noc
+      s.send_to_inter_cgra_noc //= s.cgra.send_to_inter_cgra_noc
+
+      for i in range(max_per_cgra_cols):
+        s.recv_data_on_boundary_north[i] //= s.cgra.recv_data_on_boundary_north[i]
+        s.send_data_on_boundary_north[i] //= s.cgra.send_data_on_boundary_north[i]
+        s.recv_data_on_boundary_south[i] //= s.cgra.recv_data_on_boundary_south[i]
+        s.send_data_on_boundary_south[i] //= s.cgra.send_data_on_boundary_south[i]
+
+      for i in range(max_per_cgra_rows):
+        s.recv_data_on_boundary_west[i] //= s.cgra.recv_data_on_boundary_west[i]
+        s.send_data_on_boundary_west[i] //= s.cgra.send_data_on_boundary_west[i]
+        s.recv_data_on_boundary_east[i] //= s.cgra.recv_data_on_boundary_east[i]
+        s.send_data_on_boundary_east[i] //= s.cgra.send_data_on_boundary_east[i]
+
+    s.cgra_id //= s.cgra.cgra_id
+    s.address_lower //= s.cgra.address_lower
+    s.address_upper //= s.cgra.address_upper
+
+
+    # Connections between CGRA and DMA engine.
+    # CGRA communicates with DMA engine through the controller.
+    s.cgra.dma_cmd  //= s.dma.dma_cmd
+    s.dma.dma_done  //= s.cgra.dma_done
+
+    s.send_to_dram_rd_req  //= s.dma.send_to_dram_rd_req
+    s.recv_from_dram_rd_resp //= s.dma.recv_from_dram_rd_resp
+
+    s.send_to_dram_wr_req       //= s.dma.send_to_dram_wr_req
+    s.recv_from_dram_wr_resp      //= s.dma.recv_from_dram_wr_resp
+
+    # DMA to controller-forwarded SPM connections.
+
+    s.dma.send_to_spm_wr_req //= s.cgra.recv_from_dma_spm_wr_req
+    s.dma.send_to_spm_rd_req  //= s.cgra.recv_from_dma_spm_rd_req
+    s.dma.recv_from_spm_rd_resp //= s.cgra.send_to_dma_spm_rd_resp
+
+  def line_trace(s):
+    return f"{s.dma.line_trace()} || {s.cgra.line_trace()}"
diff --git a/cgra/test/IntegratedCgraWithDmaRTL_test.py b/cgra/test/IntegratedCgraWithDmaRTL_test.py
new file mode 100644
index 00000000..96aa8846
--- /dev/null
+++ b/cgra/test/IntegratedCgraWithDmaRTL_test.py
@@ -0,0 +1,306 @@
+"""
+==========================================================================
+IntegratedCgraWithDmaRTL_test.py
+==========================================================================
+"""
+
+from pymtl3 import *
+from pymtl3.passes.backends.verilog import VerilogTranslationPass
+from pymtl3.stdlib.test_utils import config_model_with_cmdline_opts
+
+from ..IntegratedCgraWithDmaRTL import IntegratedCgraWithDmaRTL
+from ...fu.single.AdderRTL import AdderRTL
+from ...fu.single.MemUnitRTL import MemUnitRTL
+from ...fu.single.RetRTL import RetRTL
+from ...lib.cmd_type import *
+from ...lib.messages import *
+from ...lib.opt_type import *
+from ...lib.util.cgra.DataSPM import DataSPM
+from ...lib.util.cgra.Tile import Tile
+from ...lib.util.cgra.cgra_helper import get_links
+
+
+ctrl_mem_size = 8
+data_mem_size_global = 64
+data_mem_size_per_bank = 16
+num_banks_per_cgra = 4
+num_registers_per_reg_bank = 16
+num_ctrl = 1
+total_steps = 1
+
+DataType = mk_data(32, 1)
+DataAddrType = mk_bits(clog2(data_mem_size_global))
+CtrlAddrType = mk_bits(clog2(ctrl_mem_size))
+CtrlType = mk_ctrl(4, 2, 8, 8, num_registers_per_reg_bank)
+CgraPayloadType = mk_cgra_payload(DataType, DataAddrType, CtrlType,
+                                  CtrlAddrType)
+CtrlPktType = mk_intra_cgra_pkt(1, 1, 4, CgraPayloadType)
+WordType = mk_bits(32)
+
+
+def make_dut():
+  # 2x2 tiles with add/mem/return functional units
+  tiles_2d = [[Tile(x, y, num_registers_per_reg_bank, ["add", "mem", "return"])
+               for x in range(2)] for y in range(2)]
+  TileList = [t for row in tiles_2d for t in row]
+  LinkList = get_links(tiles_2d)
+  dataSPM = DataSPM(3, 3)
+
+  dut = IntegratedCgraWithDmaRTL(
+    CgraPayloadType,
+    1, 1,  # multi_cgra_rows, multi_cgra_columns
+    2, 2,  # per_cgra_rows, per_cgra_columns
+    ctrl_mem_size, data_mem_size_global,
+    data_mem_size_per_bank, num_banks_per_cgra,
+    num_registers_per_reg_bank, num_ctrl,
+    total_steps, True,
+    None, [AdderRTL, MemUnitRTL, RetRTL],
+    TileList, LinkList, dataSPM,
+    {0: [0, 15]},  # controller to address map
+    {0: [0, 0]},   # cgra id to 2D coordinate
+    is_multi_cgra=False)
+
+  return dut
+
+def issue_cpu_pkt(dut, pkt, max_cycles = 20):
+  """
+  Drives one CPU packet into the CGRA, waiting up to max_cycles for acceptance.
+  """
+  dut.recv_from_cpu_pkt.val @= 1
+  dut.recv_from_cpu_pkt.msg @= pkt
+
+  for _ in range(max_cycles):
+    dut.sim_eval_combinational()
+    if dut.recv_from_cpu_pkt.rdy:
+      dut.sim_tick()
+      dut.recv_from_cpu_pkt.val @= 0
+      dut.sim_eval_combinational()
+      return
+    dut.sim_tick()
+
+  assert False, "CPU packet was not accepted by the CGRA"
+
+
+def issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType,
+                  dma_cmd, dram_addr, spm_addr, nbytes, tag):
+
+  """
+  Issues a DMA command to the CGRA.
+  Args:
+    dut: The CGRA instance.
+    CtrlPktType: The type of the control packet.
+    CgraPayloadType: The type of the CGRA payload.
+    DataType: The type of the data.
+    DataAddrType: The type of the data address.
+
+    dma_cmd: The DMA command to issue (CMD_DMA_MVIN or CMD_DMA_MVOUT).
+    dram_addr: The DRAM address to transfer data from or to (64 bits).
+    spm_addr: The SPM address to transfer data from or to (32 bits).
+    nbytes: The number of bytes to transfer; must be a multiple of 4.
+    tag: The tag of the DMA command. It is not used yet, but may later be used to distinguish different DMA commands.
+  """
+  # NOTE nbytes is the number of bytes to transfer.
+  # Currently, only nbytes that are multiples of 4 are supported.
+  assert nbytes % 4 == 0, \
+    f"DMA nbytes must be a multiple of 4, got {nbytes}"
+  config_pkts = [
+    # The DRAM address is 64 bits wide, so it is split into two 32-bit halves.
+    # The lower 32 bits are sent first.
+    CtrlPktType(0, 0, payload = CgraPayloadType(
+      CMD_DMA_CONFIG_DRAM_ADDR_LO,
+      data = DataType(dram_addr & 0xffffffff, 1))),
+    
+    # The upper 32 bits are sent second.
+    CtrlPktType(0, 0, payload = CgraPayloadType(
+      CMD_DMA_CONFIG_DRAM_ADDR_HI,
+      data = DataType((dram_addr >> 32) & 0xffffffff, 1))),
+    
+    # The SPM address to read from or write to.
+    CtrlPktType(0, 0, payload = CgraPayloadType(
+      CMD_DMA_CONFIG_SPM_ADDR,
+      data_addr = DataAddrType(spm_addr))),
+
+    # The number of bytes to transfer.
+    CtrlPktType(0, 0, payload = CgraPayloadType(
+      CMD_DMA_CONFIG_BYTES,
+      data = DataType(nbytes, 1))),
+    
+    # The tag of the DMA command.
+    CtrlPktType(0, 0, payload = CgraPayloadType(
+      CMD_DMA_CONFIG_TAG,
+      data = DataType(tag, 1))),
+    CtrlPktType(0, 0, payload = CgraPayloadType(dma_cmd)),
+  ]
+
+  for pkt in config_pkts:
+    issue_cpu_pkt(dut, pkt)
+
+
+def observed_dma_done(dut, expected_tag):
+  dut.sim_eval_combinational()
+  if dut.send_to_cpu_pkt.val and dut.send_to_cpu_pkt.msg.payload.cmd == CMD_DMA_DONE:
+    assert int(dut.send_to_cpu_pkt.msg.opaque) == expected_tag
+    assert int(dut.send_to_cpu_pkt.msg.payload.data.payload) == expected_tag
+    return True
+  return False
+
+
+def test_cgra_dma_mvin_to_local_spm():
+  """
+  Integration test for the IntegratedCgraWithDmaRTL wrapper.
+  It simulates a DMA MVIN command that moves data from external DRAM into
+  the CGRA's dataSPM. It then checks the SPM contents to ensure the
+  transfer was successful.
+  """
+  dut = make_dut()
+
+  dut.apply(DefaultPassGroup())
+  dut.sim_reset()
+
+  dut.cgra_id @= 0
+  # Address range: [0:15]
+  dut.address_lower @= DataAddrType(0)
+  dut.address_upper @= DataAddrType(15)
+
+  dut.recv_from_cpu_pkt.val @= 0
+  dut.recv_from_cpu_pkt.msg @= CtrlPktType()
+  dut.send_to_cpu_pkt.rdy @= 1
+  dut.send_to_dram_rd_req.rdy @= 1
+  dut.recv_from_dram_rd_resp.val @= 0
+  dut.recv_from_dram_rd_resp.msg @= 0
+  dut.send_to_dram_wr_req.rdy @= 1
+  dut.recv_from_dram_wr_resp.val @= 0
+  dut.recv_from_dram_wr_resp.msg @= 0
+
+  # Read 16 bytes from DRAM address 0x1000 and write them to SPM words 0..3.
+  issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType,
+                CMD_DMA_MVIN, 0x1000, 0, 16, 0x33)
+
+  beat = concat(WordType(0x44444444), WordType(0x33333333),
+                WordType(0x22222222), WordType(0x11111111))
+  pending_resp = False
+
+  for _ in range(40):
+    dut.recv_from_dram_rd_resp.val @= 0
+    if pending_resp:
+      dut.recv_from_dram_rd_resp.val @= 1
+      # Simulate the read response from DRAM.
+      dut.recv_from_dram_rd_resp.msg @= beat
+
+    dut.sim_eval_combinational()
+
+    pending_resp = bool(dut.send_to_dram_rd_req.val & dut.send_to_dram_rd_req.rdy)
+
+    if observed_dma_done(dut, 0x33):
+      break
+
+    dut.sim_tick()
+
+  assert observed_dma_done(dut, 0x33)
+  # Check the data in the dataSPM.
+  assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[0] == DataType(0x11111111, 1, 0, 0)
+  assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[1] == DataType(0x22222222, 1, 0, 0)
+  assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[2] == DataType(0x33333333, 1, 0, 0)
+  assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[3] == DataType(0x44444444, 1, 0, 0)
+
+
+def test_cgra_dma_mvout_from_local_spm():
+  """
+  Integration test for the IntegratedCgraWithDmaRTL wrapper.
+  It simulates a DMA MVOUT command that moves data from the local SPM
+  into external DRAM.
+  """
+  dut = make_dut()
+
+  dut.apply(DefaultPassGroup())
+  dut.sim_reset()
+
+  # Pre-load SPM with data
+  dut.cgra.data_mem.memory_wrapper[0].memory.regs[0] <<= DataType(0x11111111, 1, 0, 0)
+  dut.cgra.data_mem.memory_wrapper[0].memory.regs[1] <<= DataType(0x22222222, 1, 0, 0)
+  dut.cgra.data_mem.memory_wrapper[0].memory.regs[2] <<= DataType(0x33333333, 1, 0, 0)
+  dut.cgra.data_mem.memory_wrapper[0].memory.regs[3] <<= DataType(0x44444444, 1, 0, 0)
+  dut.sim_tick()
+
+  dut.cgra_id @= 0
+  # Address range: [0:15]
+  dut.address_lower @= DataAddrType(0)
+  dut.address_upper @= DataAddrType(15)
+
+  dut.recv_from_cpu_pkt.val @= 0
+  dut.recv_from_cpu_pkt.msg @= CtrlPktType()
+  dut.send_to_cpu_pkt.rdy @= 1
+  dut.send_to_dram_rd_req.rdy @= 1
+  dut.recv_from_dram_rd_resp.val @= 0
+  dut.recv_from_dram_rd_resp.msg @= 0
+  dut.send_to_dram_wr_req.rdy @= 1
+  dut.recv_from_dram_wr_resp.val @= 0
+  dut.recv_from_dram_wr_resp.msg @= 0
+
+  # Read SPM words 0..3 and write 16 bytes to DRAM address 0x2000.
+  issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType,
+                CMD_DMA_MVOUT, 0x2000, 0, 16, 0x44)
+
+  # Expected 128-bit beat
+  expected_beat = concat(WordType(0x44444444), WordType(0x33333333),
+                         WordType(0x22222222), WordType(0x11111111))
+
+  done = False
+  pending_wr_resp = False
+  for _ in range(40):
+    dut.recv_from_dram_wr_resp.val @= 0
+    if pending_wr_resp:
+      dut.recv_from_dram_wr_resp.val @= 1
+      pending_wr_resp = False
+
+    dut.sim_eval_combinational()
+
+    if dut.send_to_dram_wr_req.val & dut.send_to_dram_wr_req.rdy:
+      assert dut.send_to_dram_wr_req.msg.addr == 0x2000
+      assert dut.send_to_dram_wr_req.msg.data == expected_beat
+      pending_wr_resp = True
+
+    if observed_dma_done(dut, 0x44):
+      done = True
+      break
+
+    dut.sim_tick()
+
+  assert done
+
+def test_gen_verilog_integrated_cgra_with_dma(cmdline_opts):
+  """
+  Translate IntegratedCgraWithDmaRTL to Verilog.
+  """
+  dut = make_dut()
+
+  if cmdline_opts['test_verilog']:
+    # Standard flow: config_model_with_cmdline_opts handles elaboration,
+    # translation, and Verilator import.
+    try:
+      config_model_with_cmdline_opts(dut, cmdline_opts, duts=[])
+    except Exception as e:
+      print(f"Note (Verilator import may have failed): {e}")
+
+    try:
+      fname = dut.get_metadata(VerilogTranslationPass.translated_filename)
+      print(f"Verilog generated: {fname}")
+    except Exception as e:
+      print(f"Could not retrieve translation metadata: {e}")
+  else:
+    # Standalone flow: apply VerilogTranslationPass directly (no Verilator).
+    print("Generating Verilog without --test-verilog flag...")
+    print("Use 'pytest --test-verilog' to also run Verilator co-simulation.")
+
+    dut.elaborate()
+
+    dut.set_metadata(VerilogTranslationPass.enable, True)
+    dut.set_metadata(VerilogTranslationPass.explicit_module_name,
+                     'IntegratedCgraWithDmaRTL')
+    dut.set_metadata(VerilogTranslationPass.explicit_file_name,
+                     'IntegratedCgraWithDmaRTL.v')
+
+    dut.apply(VerilogTranslationPass())
+
+    fname = dut.get_metadata(VerilogTranslationPass.translated_filename)
+    print(f"Verilog generated: {fname}")
diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py
index 83b41068..5a312c7f 100644
--- a/controller/ControllerRTL.py
+++ b/controller/ControllerRTL.py
@@ -29,11 +29,15 @@ def construct(s,
                 multi_cgra_columns,
                 num_tiles,
                 controller2addr_map,
-                idTo2d_map):
+                idTo2d_map,
+                has_dma_ports = False,
+                DmaDataType = mk_dma_data(),
+                DmaCmdType = mk_dma_cmd()):
 
     # Derives types from InterCgraPktType.
     CgraPayloadType = InterCgraPktType.get_field_type(kAttrPayload)
     DataType = CgraPayloadType.get_field_type(kAttrData)
+    DataPayloadType = DataType.get_field_type(kAttrPayload)
     DataAddrType = CgraPayloadType.get_field_type(kAttrDataAddr)
     
     # Derives CgraIdType from grid dimensions.
@@ -52,6 +56,22 @@ def construct(s,
     YType = mk_bits(max(clog2(multi_cgra_rows), 1))
     TileIdType = mk_bits(clog2(num_tiles + 1))
     ControllerXbarPktType = mk_controller_noc_xbar_pkt(InterCgraPktType)
+    DmaOpcodeType = DmaCmdType.get_field_type(kAttrOpcode)
+    DmaDramAddrType = DmaCmdType.get_field_type(kAttrDramAddr)
+    DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr)
+    DmaBytesType = DmaCmdType.get_field_type(kAttrNBytes)
+    DmaTagType = DmaCmdType.get_field_type(kAttrDmaTag)
+    DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData)
+    # Type of each half (lower/upper) of the DRAM address, i.e., half its bitwidth.
+    DmaDramAddrPartType = mk_bits(DmaDramAddrType.nbits // 2)
+    DmaDoneType = mk_dma_done(DmaTagType.nbits)
+    DmaSpmWriteReqType = mk_dma_spm_write_req(DmaSpmAddrType.nbits,
+                                              DmaSpmDataType.nbits)
+    DmaSpmReadReqType = mk_dma_spm_read_req(DmaSpmAddrType.nbits)
+    DmaSpmReadRespType = mk_dma_spm_read_resp(DmaSpmDataType.nbits)
+
+    if has_dma_ports:
+      assert DmaSpmDataType.nbits == 32
 
     # Interface
     s.cgra_id = InPort(CgraIdType)
@@ -71,9 +91,38 @@ def construct(s,
     s.recv_from_tile_load_response_pkt = RecvIfcRTL(InterCgraPktType)
     s.recv_from_tile_store_request_pkt = RecvIfcRTL(InterCgraPktType)
 
-    s.send_to_mem_load_request = SendIfcRTL(InterCgraPktType)
+    s.send_to_sram_load_request_from_noc = SendIfcRTL(InterCgraPktType)
     s.send_to_tile_load_response = SendIfcRTL(InterCgraPktType)
-    s.send_to_mem_store_request = SendIfcRTL(InterCgraPktType)
+    s.send_to_sram_store_request_from_noc = SendIfcRTL(InterCgraPktType)
+
+    # Controller-owned command path from CPU packets to the DMA engine.
+    # Send the decoded DMA command to the DMA engine.
+    s.dma_cmd = SendIfcRTL(DmaCmdType)
+    # Receive the DMA done signal from the DMA engine.
+    s.dma_done = RecvIfcRTL(DmaDoneType)
+
+    # -------------------------------------------------------
+    # SPM (SRAM) access path from the DMA engine.
+    # The DMA and the inter-tile NoC (above) each have their own
+    # dedicated SPM access interfaces to the data memory controller.
+    # They are kept separate because the DMA can perform burst data
+    # movement.
+    # -------------------------------------------------------
+    # Receive the request of writing into SPM from the DMA.
+    s.recv_from_dma_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType)
+    # Receive the request of reading from SPM from the DMA.
+    s.recv_from_dma_spm_rd_req  = RecvIfcRTL(DmaSpmReadReqType)
+    # Send the response of reading from SPM to the DMA.
+    s.send_to_dma_spm_rd_resp   = SendIfcRTL(DmaSpmReadRespType)
+
+    # SRAM data memory side of the SPM access path (DMA).
+    # Send the request of writing into SPM to the data_mem controller.
+    s.send_to_sram_store_request_from_dma   = SendIfcRTL(DmaSpmWriteReqType)
+    # Send the request of reading from SPM to the data_mem controller.
+    s.send_to_sram_load_request_from_dma    = SendIfcRTL(DmaSpmReadReqType)
+    # Receive the response of reading from SPM from the data_mem controller.
+    s.recv_from_sram_load_response = RecvIfcRTL(DmaSpmReadRespType)
+
 
     # Component
     s.recv_from_tile_load_request_pkt_queue = ChannelRTL(InterCgraPktType, latency = 1)
@@ -123,6 +172,12 @@ def construct(s,
 
     s.addr_dst_id = Wire(CgraIdType)
 
+    s.dma_dram_addr_lo = Wire(DmaDramAddrPartType)
+    s.dma_dram_addr_hi = Wire(DmaDramAddrPartType)
+    s.dma_spm_addr     = Wire(DmaSpmAddrType)
+    s.dma_bytes        = Wire(DmaBytesType)
+    s.dma_tag          = Wire(DmaTagType)
+
     # Connections.
     # Requests towards others, 1 cycle delay to improve timing.
     s.recv_from_tile_load_request_pkt_queue.recv //= s.recv_from_tile_load_request_pkt
@@ -130,14 +185,62 @@ def construct(s,
     s.recv_from_tile_store_request_pkt_queue.recv //= s.recv_from_tile_store_request_pkt
 
     # Requests towards local from others, 1 cycle delay to improve timing.
-    s.send_to_mem_load_request_queue.send //= s.send_to_mem_load_request
+    s.send_to_mem_load_request_queue.send //= s.send_to_sram_load_request_from_noc
     s.send_to_tile_load_response_queue.send //= s.send_to_tile_load_response
-    s.send_to_mem_store_request_queue.send //= s.send_to_mem_store_request
+    s.send_to_mem_store_request_queue.send //= s.send_to_sram_store_request_from_noc
 
     # For control signals delivery from CPU to tiles.
     s.recv_from_cpu_pkt //= s.recv_from_cpu_pkt_queue.recv
     s.send_to_cpu_pkt //= s.send_to_cpu_pkt_queue.send
 
+    @update_ff
+    def update_dma_cmd_regs():  # Sequential block holding the CPU-configured DMA command fields.
+      if s.reset:  # Reset clears every DMA command register to zero.
+        s.dma_dram_addr_lo <<= DmaDramAddrPartType(0)
+        s.dma_dram_addr_hi <<= DmaDramAddrPartType(0)
+        s.dma_spm_addr     <<= DmaSpmAddrType(0)
+        s.dma_bytes        <<= DmaBytesType(0)
+        s.dma_tag          <<= DmaTagType(0)
+      elif has_dma_ports:  # Without DMA ports nothing is ever written after reset.
+        cpu_payload = s.recv_from_cpu_pkt_queue.send.msg.payload
+        cpu_cmd = cpu_payload.cmd
+        cpu_data = cpu_payload.data.payload
+        if s.recv_from_cpu_pkt_queue.send.val & s.recv_from_cpu_pkt_queue.send.rdy:  # Latch only on a val/rdy handshake of the CPU packet.
+          if cpu_cmd == CMD_DMA_CONFIG_DRAM_ADDR_LO:
+            s.dma_dram_addr_lo <<= DmaDramAddrPartType(cpu_data)
+          elif cpu_cmd == CMD_DMA_CONFIG_DRAM_ADDR_HI:
+            s.dma_dram_addr_hi <<= DmaDramAddrPartType(cpu_data)
+          elif cpu_cmd == CMD_DMA_CONFIG_SPM_ADDR:
+            s.dma_spm_addr <<= zext(cpu_payload.data_addr, DmaSpmAddrType)  # Taken from data_addr, unlike the other fields which use data.payload.
+          elif cpu_cmd == CMD_DMA_CONFIG_BYTES:
+            s.dma_bytes <<= DmaBytesType(cpu_data)
+          elif cpu_cmd == CMD_DMA_CONFIG_TAG:
+            s.dma_tag <<= trunc(cpu_data, DmaTagType)  # Payload is narrowed to the DmaTagType width.
+
+    @update
+    def update_dma_spm_forwarding():  # Combinationally passes DMA SPM traffic through the controller.
+      if has_dma_ports:
+        s.send_to_sram_store_request_from_dma.val @= s.recv_from_dma_spm_wr_req.val  # Write request: DMA side -> SRAM side.
+        s.recv_from_dma_spm_wr_req.rdy @= s.send_to_sram_store_request_from_dma.rdy
+        s.send_to_sram_store_request_from_dma.msg @= s.recv_from_dma_spm_wr_req.msg
+        # Read request (DMA side -> SRAM side), then read response (SRAM side -> DMA side).
+        s.send_to_sram_load_request_from_dma.val     @= s.recv_from_dma_spm_rd_req.val
+        s.recv_from_dma_spm_rd_req.rdy   @= s.send_to_sram_load_request_from_dma.rdy
+        s.send_to_sram_load_request_from_dma.msg     @= s.recv_from_dma_spm_rd_req.msg
+        s.send_to_dma_spm_rd_resp.val @= s.recv_from_sram_load_response.val
+        s.recv_from_sram_load_response.rdy   @= s.send_to_dma_spm_rd_resp.rdy
+        s.send_to_dma_spm_rd_resp.msg @= s.recv_from_sram_load_response.msg
+      else:  # No DMA ports: val/rdy are tied to 0 and messages are default-constructed.
+        s.send_to_sram_store_request_from_dma.val @= 0
+        s.send_to_sram_store_request_from_dma.msg @= DmaSpmWriteReqType()
+        s.send_to_sram_load_request_from_dma.val @= 0
+        s.send_to_sram_load_request_from_dma.msg @= DmaSpmReadReqType()
+        s.recv_from_sram_load_response.rdy @= 0
+        s.recv_from_dma_spm_wr_req.rdy @= 0
+        s.recv_from_dma_spm_rd_req.rdy @= 0
+        s.send_to_dma_spm_rd_resp.val @= 0
+        s.send_to_dma_spm_rd_resp.msg @= DmaSpmReadRespType()
+
     @update
     def update_received_msg():
       kLoadRequestInportIdx = 0
@@ -151,6 +254,15 @@ def update_received_msg():
       s.send_to_cpu_pkt_queue.recv.msg @= IntraCgraPktType(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
       s.recv_from_ctrl_ring_pkt.rdy @= 0
 
+      s.dma_cmd.val       @= 0
+      s.dma_cmd.msg       @= DmaCmdType(
+        DmaOpcodeType(DMA_MVIN),
+        concat(s.dma_dram_addr_hi, s.dma_dram_addr_lo),
+        s.dma_spm_addr,
+        s.dma_bytes,
+        s.dma_tag)
+      s.dma_done.rdy      @= 0
+
       for i in range(CONTROLLER_CROSSBAR_INPORTS):
         s.crossbar.recv[i].val @= 0
         s.crossbar.recv[i].msg @= ControllerXbarPktType(0, 0)
@@ -201,24 +313,56 @@ def update_received_msg():
       s.global_reduce_unit.send.rdy @= s.crossbar.recv[kFromReduceUnitIdx].rdy
       s.crossbar.recv[kFromReduceUnitIdx].msg @= s.global_reduce_unit.send.msg
 
-      # For the ctrl and data preloading.
-      s.crossbar.recv[kFromCpuCtrlAndDataIdx].val @= \
-          s.recv_from_cpu_pkt_queue.send.val
-      s.recv_from_cpu_pkt_queue.send.rdy @= s.crossbar.recv[kFromCpuCtrlAndDataIdx].rdy
-      s.crossbar.recv[kFromCpuCtrlAndDataIdx].msg @= \
-          ControllerXbarPktType(0, # dst (always 0 to align with the single outport of the crossbar, i.e., NoC)
-                                InterCgraPktType(s.cgra_id, # src
-                                                 s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id, # dst
-                                                 0, # src_x
-                                                 0, # src_y
-                                                 s.idTo2d_x_lut[s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id], # dst_x
-                                                 s.idTo2d_y_lut[s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id], # dst_y
-                                                 num_tiles, # src_tile_id, num_tiles is used to indicate the request is from CPU, so the LOAD response can come back.
-                                                 s.recv_from_cpu_pkt_queue.send.msg.dst, # dst_tile_id
-                                                 0, # remote_src_port, only used for inter-cgra remote load request/response.
-                                                 0, # opaque
-                                                 0, # vc_id
-                                                 s.recv_from_cpu_pkt_queue.send.msg.payload))
+      cpu_payload = s.recv_from_cpu_pkt_queue.send.msg.payload
+      cpu_cmd = cpu_payload.cmd
+
+      if has_dma_ports & (
+          (cpu_cmd == CMD_DMA_CONFIG_DRAM_ADDR_LO) |
+          (cpu_cmd == CMD_DMA_CONFIG_DRAM_ADDR_HI) |
+          (cpu_cmd == CMD_DMA_CONFIG_SPM_ADDR) |
+          (cpu_cmd == CMD_DMA_CONFIG_BYTES) |
+          (cpu_cmd == CMD_DMA_CONFIG_TAG)):
+        s.recv_from_cpu_pkt_queue.send.rdy @= 1
+
+      elif has_dma_ports & (
+          (cpu_cmd == CMD_DMA_MVIN) |
+          (cpu_cmd == CMD_DMA_MVOUT)):
+        s.dma_cmd.val @= s.recv_from_cpu_pkt_queue.send.val
+        if cpu_cmd == CMD_DMA_MVIN:
+          s.dma_cmd.msg @= DmaCmdType(
+            DmaOpcodeType(DMA_MVIN),
+            concat(s.dma_dram_addr_hi, s.dma_dram_addr_lo),
+            s.dma_spm_addr,
+            s.dma_bytes,
+            s.dma_tag)
+        else:
+          s.dma_cmd.msg @= DmaCmdType(
+            DmaOpcodeType(DMA_MVOUT),
+            concat(s.dma_dram_addr_hi, s.dma_dram_addr_lo),
+            s.dma_spm_addr,
+            s.dma_bytes,
+            s.dma_tag)
+        s.recv_from_cpu_pkt_queue.send.rdy @= s.dma_cmd.rdy
+
+      else:
+        # For the ctrl and data preloading.
+        s.crossbar.recv[kFromCpuCtrlAndDataIdx].val @= \
+            s.recv_from_cpu_pkt_queue.send.val
+        s.recv_from_cpu_pkt_queue.send.rdy @= s.crossbar.recv[kFromCpuCtrlAndDataIdx].rdy
+        s.crossbar.recv[kFromCpuCtrlAndDataIdx].msg @= \
+            ControllerXbarPktType(0, # dst (always 0 to align with the single outport of the crossbar, i.e., NoC)
+                                  InterCgraPktType(s.cgra_id, # src
+                                                   s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id, # dst
+                                                   0, # src_x
+                                                   0, # src_y
+                                                   s.idTo2d_x_lut[s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id], # dst_x
+                                                   s.idTo2d_y_lut[s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id], # dst_y
+                                                   num_tiles, # src_tile_id, num_tiles is used to indicate the request is from CPU, so the LOAD response can come back.
+                                                   s.recv_from_cpu_pkt_queue.send.msg.dst, # dst_tile_id
+                                                   0, # remote_src_port, only used for inter-cgra remote load request/response.
+                                                   0, # opaque
+                                                   0, # vc_id
+                                                   s.recv_from_cpu_pkt_queue.send.msg.payload))
 
       # TODO: For the other cmd types.
 
@@ -358,6 +502,30 @@ def update_received_msg():
         #   # TODO: Handle other cmd types.
         #   assert(False)
 
+      # WARNING
+      # A possible conflict occurs when dma_done.val is True and the received message is CMD_COMPLETE at the same time,
+      # that is, when a DMA done signal and CMD_COMPLETE appear in the same clock cycle.
+      # In this case, both require the CGRA to send a return signal to the CPU, which may cause a conflict.
+      # Related discussion: https://github.com/tancheng/VectorCGRA/pull/293#discussion_r3418482217
+      if has_dma_ports & s.dma_done.val:
+        s.dma_done.rdy @= s.send_to_cpu_pkt_queue.recv.rdy
+        s.send_to_cpu_pkt_queue.recv.val @= 1
+        s.send_to_cpu_pkt_queue.recv.msg @= \
+            IntraCgraPktType(num_tiles, # src_tile_id: controller/DMA sideband source
+                             num_tiles, # dst_tile_id: CPU-facing controller endpoint
+                             s.cgra_id,
+                             s.cgra_id,
+                             s.idTo2d_x_lut[s.cgra_id],
+                             s.idTo2d_y_lut[s.cgra_id],
+                             s.idTo2d_x_lut[s.cgra_id],
+                             s.idTo2d_y_lut[s.cgra_id],
+                             s.dma_done.msg.dma_tag,
+                             0,
+                             CgraPayloadType(
+                               CMD_DMA_DONE,
+                              DataType(zext(s.dma_done.msg.dma_tag, DataPayloadType), 1, 0, 0),
+                               0, 0, 0))
+
     @update
     def update_sending_to_noc_msg():
       s.send_to_inter_cgra_noc.val @= s.crossbar.send[0].val
@@ -383,8 +551,8 @@ def line_trace(s):
     recv_from_tile_load_response_pkt_str = "recv_from_tile_load_response_pkt: " + str(s.recv_from_tile_load_response_pkt.msg)
     recv_from_tile_store_request_pkt_str = "recv_from_tile_store_request_pkt: " + str(s.recv_from_tile_store_request_pkt.msg)
     crossbar_str = "crossbar: {" + s.crossbar.line_trace() + "}"
-    send_to_mem_load_request_str = "send_to_mem_load_request: " + str(s.send_to_mem_load_request.msg)
-    send_to_mem_store_request_str = "send_to_mem_store_request: " + str(s.send_to_mem_store_request.msg)
+    send_to_mem_load_request_str = "send_to_sram_load_request_from_noc: " + str(s.send_to_sram_load_request_from_noc.msg)
+    send_to_mem_store_request_str = "send_to_sram_store_request_from_noc: " + str(s.send_to_sram_store_request_from_noc.msg)
     recv_from_noc_str ="recv_from_noc_pkt.val: " + str(s.recv_from_inter_cgra_noc.val) + " recv_from_noc_pkt.msg: " + str(s.recv_from_inter_cgra_noc.msg) + " recv_from_noc_pkt.rdy: " + str(s.recv_from_inter_cgra_noc.rdy)
     send_to_noc_str = "send_to_noc_pkt: " + str(s.send_to_inter_cgra_noc.msg) + "; rdy: " + str(s.send_to_inter_cgra_noc.rdy) + "; val: " + str(s.send_to_inter_cgra_noc.val)
     return f'{recv_from_cpu_pkt_str} || {recv_from_cpu_pkt_queue_str} || {crossbar_recv_str} ||  {send_to_ctrl_ring_pkt_str} || {recv_from_tile_load_request_pkt_str} || {recv_from_tile_load_response_pkt_str} || {recv_from_tile_store_request_pkt_str} || {crossbar_str} || {send_to_mem_load_request_str} || {send_to_mem_store_request_str} || {recv_from_noc_str} || {send_to_noc_str}\n'
diff --git a/controller/test/ControllerRTL_test.py b/controller/test/ControllerRTL_test.py
index 42d4eda9..1ad2500f 100644
--- a/controller/test/ControllerRTL_test.py
+++ b/controller/test/ControllerRTL_test.py
@@ -78,9 +78,9 @@ def construct(s,
     s.src_from_tile_load_response_pkt.send //= s.dut.recv_from_tile_load_response_pkt
     s.src_from_tile_store_request_pkt.send //= s.dut.recv_from_tile_store_request_pkt
 
-    s.dut.send_to_mem_store_request //= s.sink_to_mem_store_request.recv
+    s.dut.send_to_sram_store_request_from_noc //= s.sink_to_mem_store_request.recv
     s.dut.send_to_tile_load_response //= s.sink_to_mem_load_response.recv
-    s.dut.send_to_mem_load_request //= s.sink_to_mem_load_request.recv
+    s.dut.send_to_sram_load_request_from_noc //= s.sink_to_mem_load_request.recv
 
     s.src_from_noc.send //= s.dut.recv_from_inter_cgra_noc
     s.dut.send_to_inter_cgra_noc //= s.sink_to_noc.recv
diff --git a/fu/single/ExtractPredicateRTL.py b/fu/single/ExtractPredicateRTL.py
index 460e598e..15e5c562 100644
--- a/fu/single/ExtractPredicateRTL.py
+++ b/fu/single/ExtractPredicateRTL.py
@@ -15,7 +15,7 @@
 from pymtl3 import *
 from ..basic.Fu import Fu
 from ...lib.opt_type import *
-
+from ...lib.util.data_struct_attr import *
 class ExtractPredicateRTL(Fu):
 
   def construct(s, CtrlPktType, num_inports, num_outports, vector_factor_power = 0):
@@ -60,7 +60,7 @@ def comb_logic():
           # When loop is running (predicate=1) -> payload=1
           # When loop terminates (predicate=0) -> payload=0
           # Downstream NOT will invert: running->0 (no RET), done->1 (trigger RET)
-          s.send_out[0].msg.payload @= zext(s.recv_in[s.in0_idx].msg.predicate, s.DataType.get_field_type('payload'))
+          s.send_out[0].msg.payload @= zext(s.recv_in[s.in0_idx].msg.predicate, s.DataType.get_field_type(kAttrPayload))
           s.send_out[0].msg.predicate @= 1
           
           s.send_out[0].val @= s.recv_in[s.in0_idx].val
diff --git a/fu/single/LoopControlRTL.py b/fu/single/LoopControlRTL.py
index 5ac13f70..fbcf362e 100644
--- a/fu/single/LoopControlRTL.py
+++ b/fu/single/LoopControlRTL.py
@@ -15,7 +15,7 @@
 from pymtl3 import *
 from ..basic.Fu import Fu
 from ...lib.opt_type import OPT_LOOP_CONTROL, OPT_SYMBOL_DICT
-
+from ...lib.util.data_struct_attr import *
 class LoopControlRTL(Fu):
 
   def construct(s, CtrlPktType, num_inports, num_outports, vector_factor_power = 0):
@@ -34,8 +34,8 @@ def construct(s, CtrlPktType, num_inports, num_outports, vector_factor_power = 0
 
     super(LoopControlRTL, s).construct(CtrlPktType, num_inports, num_outports, 1, vector_factor_power)
 
-    PayloadType = s.DataType.get_field_type('payload')
-    PredicateType = s.DataType.get_field_type('predicate')
+    PayloadType = s.DataType.get_field_type(kAttrPayload)
+    PredicateType = s.DataType.get_field_type(kAttrPredicate)
     FuInType = mk_bits(clog2(num_inports + 1))
     
     # Internal state for loop control
diff --git a/lib/cmd_type.py b/lib/cmd_type.py
index a24078d7..13590ca3 100644
--- a/lib/cmd_type.py
+++ b/lib/cmd_type.py
@@ -14,7 +14,7 @@
 
 # Total number of commands that are supported/recognized by controller.
 # Needs to be updated once more commands are added/supported.
-NUM_CMDS = 44
+NUM_CMDS = 52
 
 CMD_LAUNCH                           = 0
 CMD_PAUSE                            = 1
@@ -69,6 +69,17 @@
 # GEP FU Configuration Commands.
 CMD_CONFIG_GEP_STRIDE                = 43  # Controller -> GEP FU: Configures stride for 2D GEP
 
+# DMA commands. The CPU configures the controller-side command registers
+# before issuing CMD_DMA_MVIN/CMD_DMA_MVOUT.
+CMD_DMA_CONFIG_DRAM_ADDR_LO          = 44  # Configures lower 32 bits of DRAM address
+CMD_DMA_CONFIG_DRAM_ADDR_HI          = 45  # Configures higher 32 bits of DRAM address
+CMD_DMA_CONFIG_SPM_ADDR              = 46  # Configures SPM address
+CMD_DMA_CONFIG_BYTES                 = 47  # Configures number of bytes to transfer
+CMD_DMA_CONFIG_TAG                   = 48  # Configures tag of the DMA command
+CMD_DMA_MVIN                         = 49  # Issues a DMA_MVIN command
+CMD_DMA_MVOUT                        = 50  # Issues a DMA_MVOUT command
+CMD_DMA_DONE                         = 51  # Signals that the DMA command is complete
+
 CMD_SYMBOL_DICT = {
   CMD_LAUNCH:                           "(LAUNCH_KERNEL)",
   CMD_PAUSE:                            "(PAUSE_EXECUTION)",
@@ -114,5 +125,13 @@
   CMD_LC_CHILD_RESET:                   "(LC_CHILD_RESET)",
   CMD_LC_ALL_COMPLETE:                  "(LC_ALL_COMPLETE)",
   CMD_CONFIG_GEP_STRIDE:                "(CONFIG_GEP_STRIDE)",
+  CMD_DMA_CONFIG_DRAM_ADDR_LO:          "(DMA_CONFIG_DRAM_ADDR_LO)",
+  CMD_DMA_CONFIG_DRAM_ADDR_HI:          "(DMA_CONFIG_DRAM_ADDR_HI)",
+  CMD_DMA_CONFIG_SPM_ADDR:              "(DMA_CONFIG_SPM_ADDR)",
+  CMD_DMA_CONFIG_BYTES:                 "(DMA_CONFIG_BYTES)",
+  CMD_DMA_CONFIG_TAG:                   "(DMA_CONFIG_TAG)",
+  CMD_DMA_MVIN:                         "(DMA_MVIN)",
+  CMD_DMA_MVOUT:                        "(DMA_MVOUT)",
+  CMD_DMA_DONE:                         "(DMA_DONE)",
 }
 
diff --git a/lib/messages.py b/lib/messages.py
index 49182f98..92748885 100644
--- a/lib/messages.py
+++ b/lib/messages.py
@@ -160,7 +160,7 @@ def str_func(s):
 
   field_dict[kAttrVectorFactorPower] = VectorFactorPowerType
 
-  field_dict[kAttrIsLastCtrl] = b1
+  field_dict[kAttrIsLastCtrl] = mk_bits(1)
 
   # Register file related signals.
   # Indicates whether to write data into the register bank, and the
@@ -199,6 +199,185 @@ def str_func(s):
     namespace = {'__str__': str_func}
   )
 
+#=========================================================================
+# DMA messages
+#=========================================================================
+
+def mk_dma_cmd(dram_addr_nbits = 64,
+               spm_addr_nbits = 32,
+               bytes_nbits = 32,
+               tag_nbits = 8,
+               prefix = "DmaCmd"):
+  # Builds the DMA command bitstruct. The four configurable widths are
+  # encoded in the type name; the opcode field is fixed at 3 bits and is
+  # not part of the name.
+  struct_name = f"{prefix}_{dram_addr_nbits}_{spm_addr_nbits}_{bytes_nbits}_{tag_nbits}"
+
+  def str_func(s):
+    return f"dma_cmd(op={s.opcode},dram={s.dram_addr},spm={s.spm_addr},bytes={s.nbytes},tag={s.dma_tag})"
+
+  # Fields in declaration order: opcode, dram_addr, spm_addr, nbytes,
+  # dma_tag.
+  fields = {
+    'opcode'   : mk_bits(3),
+    'dram_addr': mk_bits(dram_addr_nbits),
+    'spm_addr' : mk_bits(spm_addr_nbits),
+    # NOTE: nbytes is the number of bytes to transfer.
+    # Currently, only nbytes that are multiples of 4 are supported.
+    'nbytes'   : mk_bits(bytes_nbits),
+    # This dma_tag isn't used now. We may use it to distinguish different DMA commands.
+    'dma_tag'  : mk_bits(tag_nbits),
+  }
+
+  return mk_bitstruct(struct_name,
+                      fields,
+                      namespace = {'__str__': str_func})
+
+# A data structure to represent the data to be transferred by DMA.
+#
+# === Mask Design ===
+# Data transfer granularity between DRAM and SPM is 1 word (4 bytes)
+# The `dram_mask` and `spm_mask` fields define the bitwidth of byte
+# masks for DRAM and SPM data respectively.
+#
+# Actual mask *values* are generated independently by the DMA engine
+# FSM (see DmaEngineRTL), NOT carried in this struct:
+#
+# - dram_mask (16-bit, one bit per byte of 128-bit(16 bytes) DRAM beat):
+#   Dynamically computed during MVOUT (SPM -> DRAM) based on the
+#   number of valid words in the last beat. Values range from 0x000f
+#   (1 word) to 0xffff (full beat). For example, if the DMA moves 1 word from SPM to DRAM, the mask is 0x000f.
+#   If the DMA moves 2 words from SPM to DRAM, the mask is 0x00ff.
+#   If the DMA moves 3 words from SPM to DRAM, the mask is 0x0fff.
+#   If the DMA moves 4 words from SPM to DRAM, the mask is 0xffff.
+#
+# - spm_mask (4-bit, one bit per byte of 32-bit SPM word):
+#   SPM writes always write full words, so the mask is
+#   hardcoded to 0xf. This field is reserved for
+#   future byte-granular SPM write support.
+def mk_dma_data(dram_data_nbits = 128,
+                dram_mask_nbits = 16,
+                spm_data_nbits = 32,
+                spm_mask_nbits = 4,
+                prefix = "DmaData"):
+  # Builds the DMA data bitstruct: one DRAM beat and one SPM word, each
+  # with a byte mask.
+  # All four widths go into the type name so that two layouts differing
+  # only in `spm_mask_nbits` never share one name.
+  new_name = f"{prefix}_{dram_data_nbits}_{dram_mask_nbits}_{spm_data_nbits}_{spm_mask_nbits}"
+
+  def str_func(s):
+    return f"dma_data(dram_data={s.dram_data},dram_mask={s.dram_mask},spm_data={s.spm_data})"
+
+  return mk_bitstruct(new_name, {
+    'dram_data': mk_bits(dram_data_nbits),
+    # Byte mask for the DRAM beat (16 bits for a 16-byte beat by default).
+    'dram_mask': mk_bits(dram_mask_nbits),
+    'spm_data': mk_bits(spm_data_nbits),
+    # Byte mask for the SPM word (4 bits for a 4-byte word by default).
+    # Always 0xf in current implementation (full-word writes only).
+    'spm_mask': mk_bits(spm_mask_nbits),
+  },
+  namespace = {'__str__': str_func}
+  )
+
+def mk_dma_done(tag_nbits = 8,
+                prefix = "DmaDone"):
+  # Builds the bitstruct that reports DMA completion; it carries only
+  # the `dma_tag` field, `tag_nbits` wide.
+  def str_func(s):
+    return f"dma_done(dma_tag={s.dma_tag})"
+
+  # The tag width is part of the type name.
+  struct_name = f"{prefix}_{tag_nbits}"
+  fields = {
+    'dma_tag': mk_bits(tag_nbits),
+  }
+  return mk_bitstruct(struct_name,
+                      fields,
+                      namespace = {'__str__': str_func})
+
+#=========================================================================
+# The type of write request signal from DMA to DRAM
+#=========================================================================
+def mk_dma_dram_wr_req(addr_nbits = 64,
+                       data_nbits = 128,
+                       mask_nbits = 16,
+                       prefix = "DmaDramWrReq"):
+  # Builds the bitstruct for a DMA -> DRAM write request: address, data,
+  # and mask fields with the given widths (all encoded in the type name).
+  struct_name = f"{prefix}_{addr_nbits}_{data_nbits}_{mask_nbits}"
+
+  def str_func(s):
+    return f"dma_dram_wr(addr={s.addr},data={s.data},mask={s.mask})"
+
+  # Fields in declaration order: addr, data, mask.
+  fields = {
+    'addr': mk_bits(addr_nbits),
+    'data': mk_bits(data_nbits),
+    'mask': mk_bits(mask_nbits),
+  }
+
+  return mk_bitstruct(struct_name,
+                      fields,
+                      namespace = {'__str__': str_func})
+
+# The type of write request signal from DMA to SPM
+def mk_dma_spm_write_req(addr_nbits = 32,
+                         data_nbits = 32,
+                         prefix = "DmaSpmWriteReq"):
+  # Builds the bitstruct for a DMA -> SPM write request.
+  struct_name = f"{prefix}_{addr_nbits}_{data_nbits}"
+  # One mask bit per 8 data bits, with a floor of a single bit.
+  mask_nbits = max(1, data_nbits // 8)
+
+  def str_func(s):
+    return f"dma_spm_wr(addr={s.addr},data={s.data},mask={s.mask})"
+
+  fields = {
+    'addr': mk_bits(addr_nbits),
+    'data': mk_bits(data_nbits),
+    'mask': mk_bits(mask_nbits),
+  }
+
+  return mk_bitstruct(struct_name,
+                      fields,
+                      namespace = {'__str__': str_func})
+
+# The type of read request signal from DMA to SPM
+def mk_dma_spm_read_req(addr_nbits = 32,
+                        prefix = "DmaSpmReadReq"):
+  # Builds the bitstruct for a DMA -> SPM read request; it carries only
+  # the `addr` field, `addr_nbits` wide.
+  def str_func(s):
+    return f"dma_spm_rd(addr={s.addr})"
+
+  struct_name = f"{prefix}_{addr_nbits}"
+  fields = {
+    'addr': mk_bits(addr_nbits),
+  }
+
+  return mk_bitstruct(struct_name,
+                      fields,
+                      namespace = {'__str__': str_func})
+
+# The type of read response signal from SPM to DMA
+def mk_dma_spm_read_resp(data_nbits = 32,
+                         prefix = "DmaSpmReadResp"):
+  # Builds the bitstruct for an SPM -> DMA read response; it carries only
+  # the `data` field, `data_nbits` wide.
+  def str_func(s):
+    return f"dma_spm_rd_resp(data={s.data})"
+
+  struct_name = f"{prefix}_{data_nbits}"
+  fields = {
+    'data': mk_bits(data_nbits),
+  }
+
+  return mk_bitstruct(struct_name,
+                      fields,
+                      namespace = {'__str__': str_func})
+
 #=========================================================================
 # Multi-cgra oriented inter-/intra-cgra data/config/cmd packet payload
 #=========================================================================
diff --git a/lib/util/common.py b/lib/util/common.py
index 51650d67..5b65174e 100644
--- a/lib/util/common.py
+++ b/lib/util/common.py
@@ -65,3 +65,28 @@
 READ_TOWARDS_FU           = 1
 READ_TOWARDS_ROUTING_XBAR = 2
 READ_TOWARDS_BOTH         = 3
+
+############################
+# Constants for DMA engine.
+############################
+# DMA Move In and Out
+# DMA_MVIN  : DRAM -> DMA Engine -> SPM
+# DMA_MVOUT : SPM -> DMA Engine -> DRAM
+DMA_MVIN  = 0
+DMA_MVOUT = 1
+
+# 1 byte = 8 bits
+CHAR_BIT = 8
+
+# State machine definitions of DMA engine.
+from pymtl3 import mk_bits
+StateType = mk_bits( 4 )
+STATE_DMA_IDLE          = StateType( 0 ) # Waiting for a new DMA command
+STATE_DMA_MVIN_REQ      = StateType( 1 ) # MVIN: Issuing DRAM read request
+STATE_DMA_MVIN_RESP     = StateType( 2 ) # MVIN: Waiting for DRAM read response
+STATE_DMA_MVIN_WRITE    = StateType( 3 ) # MVIN: Writing unpacked words to SPM
+STATE_DMA_MVOUT_READ    = StateType( 4 ) # MVOUT: Issuing SPM read request
+STATE_DMA_MVOUT_RESP    = StateType( 5 ) # MVOUT: Receiving SPM read response and packing
+STATE_DMA_MVOUT_WRITE   = StateType( 6 ) # MVOUT: Issuing DRAM write request
+STATE_DMA_MVOUT_WAIT    = StateType( 7 ) # MVOUT: Waiting for DRAM write response
+STATE_DMA_DONE          = StateType( 8 ) # Signaling command completion
diff --git a/lib/util/data_struct_attr.py b/lib/util/data_struct_attr.py
index 989378d1..615ef246 100644
--- a/lib/util/data_struct_attr.py
+++ b/lib/util/data_struct_attr.py
@@ -39,3 +39,16 @@
 kAttrDstCgraX = 'dst_cgra_x'
 kAttrDstCgraY = 'dst_cgra_y'
 kAttrAddr = 'addr'
+
+# DMA attributes
+kAttrOpcode = 'opcode'
+kAttrDramAddr = 'dram_addr'
+kAttrNBytes = 'nbytes'
+# This dma_tag isn't used now. We may use it to distinguish different DMA commands.
+kAttrDmaTag = 'dma_tag'
+# TODO: https://github.com/tancheng/VectorCGRA/issues/316 -- Consolidates attributes.
+kAttrSpmAddr = 'spm_addr'
+kAttrSpmData = 'spm_data'
+kAttrSpmMask = 'spm_mask'
+kAttrDramData = 'dram_data'
+kAttrDramMask = 'dram_mask'
\ No newline at end of file
diff --git a/local_CI.py b/local_CI.py
new file mode 100644
index 00000000..f35198f8
--- /dev/null
+++ b/local_CI.py
@@ -0,0 +1,77 @@
+"""
+local_CI.py is a script that runs the CI tests locally.
+Usage:
+```shell
+cd /path/to/VectorCGRA/
+mkdir -p build && cd build
+python3 ../local_CI.py
+```
+The log will be saved to the `local_CI.log` file.
+"""
+import subprocess
+import os
+import sys
+
+def run_tests():
+    """Run every CI pytest command in order and return how many failed.
+
+    The combined stdout/stderr of each command is echoed to the terminal and
+    written to ``local_CI.log`` next to this script. Test paths are relative
+    (``..``), so the script must be started from the ``build`` directory.
+    """
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    log_file = os.path.join(current_dir, "local_CI.log")
+    num_failed = 0
+    commands = [
+        ["pytest", "..", "-v", "--tb=short"],
+        ["pytest", "../mem/ctrl/test/CtrlMemDynamicRTL_test.py", "-xvs"],
+        ["pytest", "../tile/test/TileRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
+        ["pytest", "../controller/test/ControllerRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
+        ["pytest", "../cgra/test/CgraTemplateRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
+        ["pytest", "../cgra/test/CgraRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
+        ["pytest", "../noc/PyOCN/pymtl3_net/ringnet/test/RingNetworkRTL_test.py"],
+        ["pytest", "../multi_cgra/test/RingMultiCgraRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
+        ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_verilog_homo_2x2_4x4", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
+        ["pytest", "../mem/const/test/ConstQueueDynamicRTL_test.py", "-xvs"],
+        ["pytest", "../mem/data/test/DataMemControllerRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
+        ["pytest", "../multi_cgra/test/MeshMultiCgraTemplateRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
+        ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_multi_CGRA_fir_scalar_translation", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
+        ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_multi_CGRA_fir_vector_global_reduce_translation", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
+        ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_multi_CGRA_systolic_2x2_2x2_translation", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"]
+    ]
+
+    with open(log_file, "w", encoding="utf-8") as f:
+        for cmd in commands:
+            cmd_str = " ".join(cmd)
+            header = f"\n{'='*80}\nExecuting: {cmd_str}\n{'='*80}\n"
+            print(header)
+            f.write(header)
+            f.flush()
+
+            try:
+                # `with` closes the pipe and waits for the child to exit.
+                with subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                                      stderr=subprocess.STDOUT, text=True,
+                                      bufsize=1) as process:
+                    for line in process.stdout:
+                        print(line, end="")
+                        f.write(line)
+                returncode = process.returncode
+            except Exception as e:
+                # Best effort: log the error and continue with the next command.
+                num_failed += 1
+                status = f"\nERROR executing {cmd_str}: {str(e)}\n"
+            else:
+                if returncode == 0:
+                    status = f"\nSUCCESS: {cmd_str}\n"
+                else:
+                    num_failed += 1
+                    status = f"\nFAILED (Exit Code {returncode}): {cmd_str}\n"
+            print(status)
+            f.write(status)
+
+    print(f"\n\nAll tests completed. Log saved to: {os.path.abspath(log_file)}")
+    return num_failed
+
+if __name__ == "__main__":
+    sys.exit(1 if run_tests() else 0)
\ No newline at end of file
diff --git a/mem/data/DataMemControllerRTL.py b/mem/data/DataMemControllerRTL.py
index 356a0ea2..58508945 100644
--- a/mem/data/DataMemControllerRTL.py
+++ b/mem/data/DataMemControllerRTL.py
@@ -32,8 +32,27 @@
 from ...lib.messages import *
 from ...noc.PyOCN.pymtl3_net.xbar.XbarBypassQueueRTL import XbarBypassQueueRTL
 from ...lib.util.data_struct_attr import *
+from ...lib.util.common import CHAR_BIT
 
 class DataMemControllerRTL(Component):
+  """
+  DataMemControllerRTL manages access to the multi-banked data SPM.
+  It arbitrates between multiple request sources:
+  1. Local tiles (via `recv_raddr`, `recv_waddr`, `recv_wdata`)
+  2. Inter-CGRA NoC (via `recv_from_noc_load_request`, etc.)
+  3. Optional controller-forwarded DMA access
+     (via `recv_from_controller_spm_wr_req`, `recv_from_controller_spm_rd_req`, etc.)
+
+  Architectural Design:
+  - Uses crossbars to route requests to the correct memory bank based on the
+    address.
+  - Supports an optional controller-forwarded DMA SPM interface. When
+    `has_dma_ports` is True, extra ports are added to the read and write
+    crossbars.
+  - DMA-originated requests are treated as another master on the memory bus,
+    competing with tiles and NoC traffic after they pass through the
+    controller.
+  """
   def construct(s,
                 NocPktType,
                 data_mem_size_global,
@@ -45,10 +64,14 @@ def construct(s,
                 multi_cgra_columns = 2,
                 num_tiles = 16,
                 mem_access_is_combinational = True,
-                idTo2d_map = {0: [0, 0]}):
+                idTo2d_map = {0: [0, 0]},
+                has_dma_ports = False,
+                DmaCmdType = mk_dma_cmd(),
+                DmaDataType = mk_dma_data()):
 
     CgraPayloadType = NocPktType.get_field_type(kAttrPayload)
     DataType = CgraPayloadType.get_field_type(kAttrData)
+    PayloadType = DataType.get_field_type(kAttrPayload)
     # Constants.
     global_addr_nbits = clog2(data_mem_size_global)
     per_bank_addr_nbits = clog2(data_mem_size_per_bank)
@@ -58,19 +81,32 @@ def construct(s,
     YType = mk_bits(max(clog2(multi_cgra_rows), 1))
     AddrType = mk_bits(global_addr_nbits)
     PerBankAddrType = mk_bits(per_bank_addr_nbits)
+
+    DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr)
+    DmaMaskType = DmaDataType.get_field_type(kAttrSpmMask)
+    DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData)
+    DmaSpmWriteReqType = mk_dma_spm_write_req(DmaSpmAddrType.nbits, DmaSpmDataType.nbits)
+    DmaSpmReadReqType = mk_dma_spm_read_req(DmaSpmAddrType.nbits)
+    DmaSpmReadRespType = mk_dma_spm_read_resp(DmaSpmDataType.nbits)
+    NocRemoteSrcPortType = NocPktType.get_field_type(kAttrRemoteSrcPort)
     s.num_banks_per_cgra = num_banks_per_cgra
-    LocalBankIndexType = mk_bits(clog2(num_banks_per_cgra))
+    s.has_dma_ports = has_dma_ports
+    LocalBankIndexType = mk_bits(max(1, clog2(num_banks_per_cgra)))
     s.num_rd_tiles = num_rd_tiles
     s.num_wr_tiles = num_wr_tiles
-    RdTileIdType = mk_bits(clog2(num_rd_tiles))
+    RdTileIdType = mk_bits(max(1, clog2(num_rd_tiles)))
     # The additional port is for the request from inter-cgra NoC via controller.
-    num_xbar_in_rd_ports = num_rd_tiles + 1
-    num_xbar_in_wr_ports = num_wr_tiles + 1
+    # If DMA is enabled, we add one more port for the DMA engine.
+    dma_port_offset = 1 if has_dma_ports else 0
+    num_xbar_in_rd_ports = num_rd_tiles + 1 + dma_port_offset
+    num_xbar_in_wr_ports = num_wr_tiles + 1 + dma_port_offset
     num_xbar_out_rd_ports = num_banks_per_cgra + 1
     num_xbar_out_wr_ports = num_banks_per_cgra + 1
     num_cgras = multi_cgra_rows * multi_cgra_columns
     XbarOutRdType = mk_bits(clog2(num_xbar_out_rd_ports))
     XbarOutWrType = mk_bits(clog2(num_xbar_out_wr_ports))
+    XbarInRdType = mk_bits(clog2(num_xbar_in_rd_ports))
+    XbarInWrType = mk_bits(clog2(num_xbar_in_wr_ports))
     MemReadPktType = \
         mk_mem_access_pkt(DataType,
                           num_xbar_in_rd_ports,
@@ -120,7 +156,12 @@ def construct(s,
     s.send_to_noc_load_request_pkt = SendIfcRTL(NocPktType)
     s.send_to_noc_store_pkt = SendIfcRTL(NocPktType)
 
+    s.recv_from_controller_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType)
+    s.recv_from_controller_spm_rd_req = RecvIfcRTL(DmaSpmReadReqType)
+    s.send_to_controller_spm_rd_resp = SendIfcRTL(DmaSpmReadRespType)
+
     # Components.
+    # A list of DataMemWrapperRTL instances. Each one is a single memory bank.
     s.memory_wrapper = [DataMemWrapperRTL(DataType, MemReadPktType, MemWritePktType, MemResponsePktType,
                                           data_mem_size_global, data_mem_size_per_bank, mem_access_is_combinational)
                   for _ in range(num_banks_per_cgra)]
@@ -159,10 +200,10 @@ def construct(s,
     @update
     def assemble_xbar_pkt():
       for i in range(num_xbar_in_rd_ports):
-        s.rd_pkt[i] @= MemReadPktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, i)
+        s.rd_pkt[i] @= MemReadPktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0)
 
       for i in range(num_xbar_in_wr_ports):
-        s.wr_pkt[i] @= MemWritePktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, i)
+        s.wr_pkt[i] @= MemWritePktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0)
 
       for i in range(num_rd_tiles):
         recv_raddr = s.recv_raddr[i].msg
@@ -223,6 +264,41 @@ def assemble_xbar_pkt():
                                                 0,                          # src_tile
                                                 num_wr_tiles)               # remote_src_port
 
+      if has_dma_ports:
+
+        # When `has_dma_ports` is True, each input crossbar has one extra (last) port for DMA:
+        # dma_rd_idx = num_xbar_in_rd_ports - 1 and dma_wr_idx = num_xbar_in_wr_ports - 1.
+        # NOTE Don't use `num_rd_tiles + 1` / `num_wr_tiles + 1` directly here since it will cause
+        # the bit mismatch error; wrap the index in `XbarInRdType` / `XbarInWrType` instead.
+        dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1)
+        dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1)
+
+        recv_raddr_from_dma = trunc(s.recv_from_controller_spm_rd_req.msg.addr, AddrType)
+        if (recv_raddr_from_dma >= s.address_lower) & (recv_raddr_from_dma <= s.address_upper):
+          bank_index_load_from_dma = trunc((recv_raddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutRdType)
+        else:
+          bank_index_load_from_dma = XbarOutRdType(num_banks_per_cgra)
+        s.rd_pkt[dma_rd_idx] @= MemReadPktType(dma_rd_idx,                  # src
+                                               bank_index_load_from_dma,    # dst
+                                               recv_raddr_from_dma,         # addr
+                                               DataType(0, 0, 0, 0),        # data
+                                               s.cgra_id,                   # src_cgra
+                                               0,                           # src_tile
+                                               0)                           # remote_src_port
+
+        recv_waddr_from_dma = trunc(s.recv_from_controller_spm_wr_req.msg.addr, AddrType)
+        if (recv_waddr_from_dma >= s.address_lower) & (recv_waddr_from_dma <= s.address_upper):
+          bank_index_store_from_dma = trunc((recv_waddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutWrType)
+        else:
+          bank_index_store_from_dma = XbarOutWrType(num_banks_per_cgra)
+        s.wr_pkt[dma_wr_idx] @= MemWritePktType(dma_wr_idx,                 # src
+                                                bank_index_store_from_dma,  # dst
+                                                recv_waddr_from_dma,        # addr
+                                                DataType(zext(s.recv_from_controller_spm_wr_req.msg.data, PayloadType), 1, 0, 0),
+                                                0,                          # src_cgra
+                                                0,                          # src_tile
+                                                0)                          # remote_src_port
+
     # Connects xbar with the memory wrapper.
     @update
     def update_all():
@@ -286,6 +362,11 @@ def update_all():
         s.write_crossbar.recv[i].val @= 0
         s.write_crossbar.recv[i].msg @= MemWritePktType(0, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0)
 
+      s.recv_from_controller_spm_wr_req.rdy          @= 0
+      s.recv_from_controller_spm_rd_req.rdy           @= 0
+      s.send_to_controller_spm_rd_resp.val      @= 0
+      s.send_to_controller_spm_rd_resp.msg      @= DmaSpmReadRespType(DmaSpmDataType(0))
+
       s.send_to_noc_load_request_pkt.msg @= \
           NocPktType(0, # src
                      0, # dst
@@ -310,6 +391,16 @@ def update_all():
       s.read_crossbar.recv[num_rd_tiles].val @= s.recv_from_noc_load_request.val
       s.read_crossbar.recv[num_rd_tiles].msg @= s.rd_pkt[num_rd_tiles]
       s.recv_from_noc_load_request.rdy @= s.read_crossbar.recv[num_rd_tiles].rdy
+
+      if has_dma_ports:
+        # When `has_dma_ports` is True, num_xbar_in_rd_ports = num_rd_tiles + 1 + 1(dma_port_offset).
+        # Use dma_rd_idx = num_rd_tiles + 1 = num_xbar_in_rd_ports - 1
+        # NOTE Don't use `dma_rd_idx = num_rd_tiles + 1` here since it will cause the bit mismatch error 
+        # between `dma_rd_idx` and `num_xbar_in_rd_ports`.
+        dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1)
+        s.read_crossbar.recv[dma_rd_idx].val @= s.recv_from_controller_spm_rd_req.val
+        s.read_crossbar.recv[dma_rd_idx].msg @= s.rd_pkt[dma_rd_idx]
+        s.recv_from_controller_spm_rd_req.rdy @= s.read_crossbar.recv[dma_rd_idx].rdy
       
       # Connects the store request ports (from tiles and NoC) to the xbar targetting memory and NoC.
       for i in range(num_wr_tiles):
@@ -321,6 +412,16 @@ def update_all():
       s.write_crossbar.recv[num_wr_tiles].msg @= s.wr_pkt[num_wr_tiles]
       s.recv_from_noc_store_request.rdy @= s.write_crossbar.recv[num_wr_tiles].rdy
 
+      if has_dma_ports:
+        # When `has_dma_ports` is True, num_xbar_in_wr_ports = num_wr_tiles + 1 + 1(dma_port_offset).
+        # Use dma_wr_idx = num_wr_tiles + 1 = num_xbar_in_wr_ports - 1
+        # NOTE Don't use `dma_wr_idx = num_wr_tiles + 1` here since it will cause the bit mismatch error 
+        # between `dma_wr_idx` and `num_xbar_in_wr_ports`.
+        dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1)
+        s.write_crossbar.recv[dma_wr_idx].val @= s.recv_from_controller_spm_wr_req.val
+        s.write_crossbar.recv[dma_wr_idx].msg @= s.wr_pkt[dma_wr_idx]
+        s.recv_from_controller_spm_wr_req.rdy @= s.write_crossbar.recv[dma_wr_idx].rdy
+
       # Connects the response ports to tiles and NoC from the xbar.
       # Number of load responses is expected to be the same as the number of load requests.
       for i in range(num_xbar_in_rd_ports):
@@ -328,7 +429,7 @@ def update_all():
           s.send_rdata[RdTileIdType(i)].msg @= s.response_crossbar.send[i].msg.data
           s.send_rdata[RdTileIdType(i)].val @= s.response_crossbar.send[i].val
           s.response_crossbar.send[i].rdy @= s.send_rdata[RdTileIdType(i)].rdy
-        else:
+        elif i == num_rd_tiles:
           from_cgra_id = s.response_crossbar.send[i].msg.src_cgra
           from_tile_id = s.response_crossbar.send[i].msg.src_tile
           s.send_to_noc_load_response_pkt.msg @= \
@@ -351,6 +452,11 @@ def update_all():
 
           s.send_to_noc_load_response_pkt.val @= s.response_crossbar.send[i].val
           s.response_crossbar.send[i].rdy @= s.send_to_noc_load_response_pkt.rdy
+        elif has_dma_ports:
+          s.send_to_controller_spm_rd_resp.msg      @= DmaSpmReadRespType(
+            trunc(s.response_crossbar.send[i].msg.data.payload, DmaSpmDataType))
+          s.send_to_controller_spm_rd_resp.val      @= s.response_crossbar.send[i].val
+          s.response_crossbar.send[i].rdy @= s.send_to_controller_spm_rd_resp.rdy
 
       # Handles the request (not response) towards the others via the NoC. The dst would be
       # updated in the controller.
@@ -363,7 +469,7 @@ def update_all():
                       0, # dst_y
                       0, # src_tile_id
                       0, # dst_tile_id
-                      s.read_crossbar.send[num_banks_per_cgra].msg.src, # remote_src_port
+                      trunc(s.read_crossbar.send[num_banks_per_cgra].msg.src, NocRemoteSrcPortType), # remote_src_port
                       0, # opaque
                       0, # vc_id
                       CgraPayloadType(
@@ -378,7 +484,7 @@ def update_all():
       s.response_crossbar.recv[num_banks_per_cgra].val @= s.recv_from_noc_load_response_pkt.val
       s.response_crossbar.recv[num_banks_per_cgra].msg @= \
           MemResponsePktType(num_banks_per_cgra,
-                             s.recv_from_noc_load_response_pkt.msg.remote_src_port,
+                             zext(s.recv_from_noc_load_response_pkt.msg.remote_src_port, XbarInRdType),
                              s.recv_from_noc_load_response_pkt.msg.payload.data_addr,
                              s.recv_from_noc_load_response_pkt.msg.payload.data,
                              s.recv_from_noc_load_response_pkt.msg.src,
@@ -399,7 +505,7 @@ def update_all():
                       0, # dst_y
                       0, # src_tile_id
                       0, # dst_tile_id
-                      s.write_crossbar.send[num_banks_per_cgra].msg.src, # remote_src_port
+                      trunc(s.write_crossbar.send[num_banks_per_cgra].msg.src, NocRemoteSrcPortType), # remote_src_port
                       0, # opaque
                       0, # vc_id
                       CgraPayloadType(
diff --git a/mem/data/test/DataMemControllerRTL_dma_test.py b/mem/data/test/DataMemControllerRTL_dma_test.py
new file mode 100644
index 00000000..cf39a756
--- /dev/null
+++ b/mem/data/test/DataMemControllerRTL_dma_test.py
@@ -0,0 +1,117 @@
+"""
+==========================================================================
+DataMemControllerRTL_dma_test.py
+==========================================================================
+"""
+
+from pymtl3 import *
+
+from ..DataMemControllerRTL import DataMemControllerRTL
+from ....lib.messages import *
+from ....lib.opt_type import *
+
+
+def make_types(data_mem_size_global, ctrl_mem_size, num_tiles, num_rd_tiles):
+  DataType = mk_data(32, 1)
+  DataAddrType = mk_bits(clog2(data_mem_size_global))
+  CtrlAddrType = mk_bits(clog2(ctrl_mem_size))
+  CtrlType = mk_ctrl(4, 2, 4, 4, 16)
+  CgraPayloadType = mk_cgra_payload(DataType, DataAddrType, CtrlType, CtrlAddrType)
+  NocPktType = mk_inter_cgra_pkt(1, 1, num_tiles, num_rd_tiles, CgraPayloadType)
+  return DataType, DataAddrType, NocPktType
+
+
+def drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles):
+  for i in range(num_rd_tiles):
+    dut.recv_raddr[i].val @= 0
+    dut.recv_raddr[i].msg @= DataAddrType(0)
+    dut.send_rdata[i].rdy @= 1
+
+  for i in range(num_wr_tiles):
+    dut.recv_waddr[i].val @= 0
+    dut.recv_waddr[i].msg @= DataAddrType(0)
+    dut.recv_wdata[i].val @= 0
+    dut.recv_wdata[i].msg @= DataType(0, 0, 0, 0)
+
+  dut.recv_from_noc_load_request.val @= 0
+  dut.recv_from_noc_load_request.msg @= NocPktType()
+  dut.recv_from_noc_store_request.val @= 0
+  dut.recv_from_noc_store_request.msg @= NocPktType()
+  dut.recv_from_noc_load_response_pkt.val @= 0
+  dut.recv_from_noc_load_response_pkt.msg @= NocPktType()
+  dut.send_to_noc_load_request_pkt.rdy @= 1
+  dut.send_to_noc_load_response_pkt.rdy @= 1
+  dut.send_to_noc_store_pkt.rdy @= 1
+
+  DmaSpmAddrType = mk_dma_cmd().get_field_type(kAttrSpmAddr)
+  dut.recv_from_controller_spm_wr_req.val @= 0
+  dut.recv_from_controller_spm_wr_req.msg.addr @= DmaSpmAddrType(0)
+  dut.recv_from_controller_spm_wr_req.msg.data @= 0
+  dut.recv_from_controller_spm_wr_req.msg.mask @= 0
+  dut.recv_from_controller_spm_rd_req.val @= 0
+  dut.recv_from_controller_spm_rd_req.msg.addr @= DmaSpmAddrType(0)
+  dut.send_to_controller_spm_rd_resp.rdy @= 1
+
+  dut.cgra_id @= 0
+  dut.address_lower @= DataAddrType(0)
+  dut.address_upper @= DataAddrType(15)
+
+
+def test_dma_ports_write_then_read():
+  """
+  Verifies that the DataMemController correctly handles requests from the
+  DMA ports. It performs a DMA write to a specific address and then a
+  DMA read from the same address to verify the data.
+  """
+  data_mem_size_global = 64
+  data_mem_size_per_bank = 16
+  num_banks = 4
+  num_rd_tiles = 2
+  num_wr_tiles = 2
+  num_tiles = 4
+  ctrl_mem_size = 16
+
+  DataType, DataAddrType, NocPktType = make_types(
+      data_mem_size_global, ctrl_mem_size, num_tiles, num_rd_tiles)
+
+  dut = DataMemControllerRTL(NocPktType,
+                             data_mem_size_global,
+                             data_mem_size_per_bank,
+                             num_banks,
+                             num_rd_tiles,
+                             num_wr_tiles,
+                             1,
+                             1,
+                             num_tiles,
+                             True,
+                             {0: [0, 0]},
+                             has_dma_ports = True)
+  dut.apply(DefaultPassGroup())
+  dut.sim_reset()
+  drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles)
+
+  DmaSpmAddrType = mk_dma_cmd().get_field_type(kAttrSpmAddr)
+  dut.recv_from_controller_spm_wr_req.val @= 1
+  dut.recv_from_controller_spm_wr_req.msg.addr @= DmaSpmAddrType(3)
+  dut.recv_from_controller_spm_wr_req.msg.data @= 0xaaaabbbb
+  dut.recv_from_controller_spm_wr_req.msg.mask @= 0xf
+  dut.sim_eval_combinational()
+  assert dut.recv_from_controller_spm_wr_req.rdy
+  dut.sim_tick()
+  dut.recv_from_controller_spm_wr_req.val @= 0
+
+  dut.recv_from_controller_spm_rd_req.val @= 1
+  dut.recv_from_controller_spm_rd_req.msg.addr @= DmaSpmAddrType(3)
+
+  seen_response = False
+  for _ in range(10):
+    dut.sim_eval_combinational()
+    if dut.recv_from_controller_spm_rd_req.val & dut.recv_from_controller_spm_rd_req.rdy:
+      dut.recv_from_controller_spm_rd_req.val @= 0
+    if dut.send_to_controller_spm_rd_resp.val:
+      assert int(dut.send_to_controller_spm_rd_resp.msg.data) == 0xaaaabbbb
+      seen_response = True
+      break
+    dut.sim_tick()
+
+  assert seen_response
diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py
new file mode 100644
index 00000000..a7fff5d8
--- /dev/null
+++ b/mem/dma/DmaEngineRTL.py
@@ -0,0 +1,307 @@
+"""
+==========================================================================
+DmaEngineRTL.py
+==========================================================================
+
+Simple DMA engine for moving opaque words between an abstract external
+memory interface and the CGRA dataSPM.
+"""
+
+from pymtl3 import *
+from ...lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL
+from ...lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL
+from ...lib.messages import *
+from ...lib.util.common import DMA_MVIN, DMA_MVOUT, CHAR_BIT, StateType, STATE_DMA_IDLE, STATE_DMA_MVIN_REQ, STATE_DMA_MVIN_RESP, STATE_DMA_MVIN_WRITE, STATE_DMA_MVOUT_READ, STATE_DMA_MVOUT_RESP, STATE_DMA_MVOUT_WRITE, STATE_DMA_MVOUT_WAIT, STATE_DMA_DONE
+
+
+class DmaEngineRTL( Component ):
+  """
+  The DmaEngineRTL module is responsible for bulk data movement between an
+  external DRAM-like memory and the on-chip Scratchpad Memory (dataSPM).
+
+  It supports two main operations:
+  - DMA_MVIN:  DRAM -> DMA Engine -> SPM
+  - DMA_MVOUT: SPM -> DMA Engine -> DRAM
+
+  Architectural Design:
+  - 1 word = 4 bytes = 32 bits in this system.
+  - DRAM is byte-addressed which means each unique address points to a byte(8 bits).
+  - SPM is word-addressed which means each unique address points to a word(32 bits).
+  - The engine uses a 128-bit interface to external memory (4 words per beat)
+    and a 32-bit interface to the dataSPM (1 word per cycle).
+  - A finite state machine (FSM) manages the command execution flow, including
+    requesting memory, waiting for responses, and performing SPM accesses.
+  - MVIN logic: Requests 128-bit beats from DRAM, then unpacks them into four
+    sequential 32-bit SPM writes.
+  - MVOUT logic: Reads four 32-bit words from SPM, packs them into a 128-bit
+    beat, and issues a single write request to DRAM.
+  """
+
+  def construct( s,
+                 spm_data_nbits = 32,  # Bitwidth of a single SPM word
+                 dram_data_nbits = 128, # Bitwidth of an external memory beat
+                 dram_addr_nbits = 64, # Bitwidth of DRAM addresses
+                 spm_addr_nbits = 32,  # Bitwidth of SPM addresses
+                 bytes_nbits = 32,     # Bitwidth for transfer size in bytes
+                 tag_nbits = 8 ):      # Bitwidth for command tracking tags
+
+    assert dram_data_nbits == spm_data_nbits * 4
+
+    OpcodeType   = mk_bits( 3 )
+    DramAddrType = mk_bits( dram_addr_nbits )
+    SpmAddrType  = mk_bits( spm_addr_nbits )
+    BytesType    = mk_bits( bytes_nbits )
+    TagType      = mk_bits( tag_nbits )
+    SpmDataType  = mk_bits( spm_data_nbits )
+    MemDataType  = mk_bits( dram_data_nbits )
+    # Byte mask for SPM write
+    SpmMaskType  = mk_bits( spm_data_nbits // CHAR_BIT )
+    MemMaskType  = mk_bits( dram_data_nbits // CHAR_BIT )
+    DmaCmdType = mk_dma_cmd(dram_addr_nbits, spm_addr_nbits, bytes_nbits, tag_nbits)
+    DmaDoneType = mk_dma_done(tag_nbits)
+    DmaSpmWriteReqType = mk_dma_spm_write_req(spm_addr_nbits, spm_data_nbits)
+    DmaSpmReadReqType = mk_dma_spm_read_req(spm_addr_nbits)
+    DmaSpmReadRespType = mk_dma_spm_read_resp(spm_data_nbits)
+    DmaDramWrReqType = mk_dma_dram_wr_req(dram_addr_nbits, dram_data_nbits, dram_data_nbits // CHAR_BIT)
+
+    # Command interface
+    # Receives a DMA command from the controller.
+    s.dma_cmd = RecvIfcRTL(DmaCmdType)
+
+    # Sends a DMA done signal to the controller.
+    s.dma_done = SendIfcRTL(DmaDoneType)
+
+    # Abstract external memory interface
+    # Request to read from DRAM
+    s.send_to_dram_rd_req = SendIfcRTL( DramAddrType )
+    # Response from DRAM
+    s.recv_from_dram_rd_resp = RecvIfcRTL( MemDataType )
+
+    # Request to write to DRAM
+    s.send_to_dram_wr_req = SendIfcRTL(DmaDramWrReqType)
+    s.recv_from_dram_wr_resp = RecvIfcRTL(mk_bits(1))
+
+    # Send write request to SPM.
+    s.send_to_spm_wr_req = SendIfcRTL(DmaSpmWriteReqType)
+    # Send read request to SPM.
+    s.send_to_spm_rd_req = SendIfcRTL(DmaSpmReadReqType)
+    # Receive read response from SPM.
+    s.recv_from_spm_rd_resp = RecvIfcRTL(DmaSpmReadRespType)
+
+    # State machine definitions
+
+    s.state             = Wire( StateType )
+    s.state_next        = Wire( StateType )    # NOTE(review): never driven or read in this class
+
+    # Read-side views of the registers (tied to the *_ff wires below)
+    s.opcode_reg        = Wire( OpcodeType )   # Current operation (MVIN/MVOUT)
+    s.dram_addr_reg     = Wire( DramAddrType ) # Current DRAM byte address
+    s.spm_addr_reg      = Wire( SpmAddrType )  # Current SPM word address
+    s.words_left_reg    = Wire( BytesType )    # Number of 32-bit words remaining to transfer
+    s.tag_reg           = Wire( TagType )      # Tag of the active command
+    s.beat_reg          = Wire( MemDataType )  # Buffer for 128-bit DRAM beat
+    s.word_idx_reg      = Wire( Bits2 )        # Index (0-3) of the word within a beat
+    s.wr_mask_reg       = Wire( MemMaskType )  # Byte mask for DRAM write
+
+    # Register wires, written only in seq_state()
+    s.state_ff          = Wire( StateType )
+    s.opcode_ff         = Wire( OpcodeType )
+    s.dram_addr_ff      = Wire( DramAddrType )
+    s.spm_addr_ff       = Wire( SpmAddrType )
+    s.words_left_ff     = Wire( BytesType )
+    s.tag_ff            = Wire( TagType )
+    s.beat_ff           = Wire( MemDataType )
+    s.word_idx_ff       = Wire( Bits2 )
+    s.wr_mask_ff        = Wire( MemMaskType )
+
+    # Connections
+    s.state             //= s.state_ff
+    s.opcode_reg        //= s.opcode_ff
+    s.dram_addr_reg     //= s.dram_addr_ff
+    s.spm_addr_reg      //= s.spm_addr_ff
+    s.words_left_reg    //= s.words_left_ff
+    s.tag_reg           //= s.tag_ff
+    s.beat_reg          //= s.beat_ff
+    s.word_idx_reg      //= s.word_idx_ff
+    s.wr_mask_reg       //= s.wr_mask_ff
+
+    # Precompute commonly used values at construct time (not inside any
+    # @update block) to avoid PyMTL3 AST translation limitations on the
+    # floor-division operator.
+    spm_word_nbytes = (spm_data_nbits // CHAR_BIT)
+    # SPM write mask: always all byte lanes enabled (0xf) because the DMA
+    # writes full 32-bit words to SPM. Byte-granular SPM writes are not
+    # needed in the current design.
+    spm_word_mask = SpmMaskType( (1 << spm_word_nbytes) - 1 )
+    dram_beat_nbytes = (dram_data_nbits // CHAR_BIT)
+
+    @update
+    def comb_outputs():
+      s.dma_cmd.rdy        @= s.state == STATE_DMA_IDLE
+      s.dma_done.val       @= s.state == STATE_DMA_DONE
+      s.dma_done.msg       @= DmaDoneType(s.tag_reg)
+
+      s.send_to_dram_rd_req.val    @= s.state == STATE_DMA_MVIN_REQ
+      s.send_to_dram_rd_req.msg    @= s.dram_addr_reg
+      s.recv_from_dram_rd_resp.rdy   @= s.state == STATE_DMA_MVIN_RESP
+
+      s.send_to_dram_wr_req.val    @= s.state == STATE_DMA_MVOUT_WRITE
+      s.send_to_dram_wr_req.msg.addr   @= s.dram_addr_reg
+      s.send_to_dram_wr_req.msg.data   @= s.beat_reg
+      s.send_to_dram_wr_req.msg.mask   @= s.wr_mask_reg
+
+      s.recv_from_dram_wr_resp.rdy   @= s.state == STATE_DMA_MVOUT_WAIT
+
+      spm_wdata = SpmDataType(0)
+
+      if s.word_idx_reg == b2( 0 ): # Writes the first word of the beat to SPM
+        spm_wdata = s.beat_reg[0:spm_data_nbits]
+      elif s.word_idx_reg == b2( 1 ): # Writes the second word of the beat to SPM
+        spm_wdata = s.beat_reg[spm_data_nbits:spm_data_nbits*2]
+      elif s.word_idx_reg == b2( 2 ): # 3rd word
+        spm_wdata = s.beat_reg[spm_data_nbits*2:spm_data_nbits*3]
+      else: # 4th word
+        spm_wdata = s.beat_reg[spm_data_nbits*3:spm_data_nbits*4]
+
+      s.send_to_spm_wr_req.val @= s.state == STATE_DMA_MVIN_WRITE
+      s.send_to_spm_wr_req.msg @= DmaSpmWriteReqType(
+        s.spm_addr_reg,
+        spm_wdata,
+        spm_word_mask )
+
+      s.send_to_spm_rd_req.val       @= s.state == STATE_DMA_MVOUT_READ
+      s.send_to_spm_rd_req.msg       @= DmaSpmReadReqType(s.spm_addr_reg)
+      s.recv_from_spm_rd_resp.rdy  @= s.state == STATE_DMA_MVOUT_RESP
+
+    @update_ff
+    def seq_state():
+      if s.reset:
+        s.state_ff      <<= STATE_DMA_IDLE
+        s.opcode_ff     <<= OpcodeType( 0 )
+        s.dram_addr_ff  <<= DramAddrType( 0 )
+        s.spm_addr_ff   <<= SpmAddrType( 0 )
+        s.words_left_ff <<= BytesType( 0 )
+        s.tag_ff        <<= TagType( 0 )
+        s.beat_ff       <<= MemDataType( 0 )
+        s.word_idx_ff   <<= b2( 0 )
+        s.wr_mask_ff    <<= MemMaskType( 0 )
+      else:
+        if s.state == STATE_DMA_IDLE:
+          if s.dma_cmd.val & s.dma_cmd.rdy: # Receives a new DMA command.
+            # Note: nbytes % 4 cannot be asserted inside this update block
+            # because PyMTL3's AST translator does not support assert
+            # statements; the command issuer must guarantee the alignment.
+            s.opcode_ff     <<= s.dma_cmd.msg.opcode
+            s.dram_addr_ff  <<= s.dma_cmd.msg.dram_addr
+            s.spm_addr_ff   <<= s.dma_cmd.msg.spm_addr
+            # Converts the transfer size from bytes to words.
+            # NOTE We only support nbytes that are multiples of 4 now.
+            # If nbytes is not a multiple of 4, the shift drops the trailing 1-3 bytes.
+            s.words_left_ff <<= (s.dma_cmd.msg.nbytes >> 2)
+            s.tag_ff        <<= s.dma_cmd.msg.dma_tag
+            s.beat_ff       <<= MemDataType( 0 )
+            s.word_idx_ff   <<= b2( 0 )
+            s.wr_mask_ff    <<= MemMaskType( 0 )
+
+            if (s.dma_cmd.msg.nbytes >> 2) == BytesType( 0 ): # No whole word to transfer.
+              s.state_ff    <<= STATE_DMA_DONE
+            # Still has words to transfer.
+            elif s.dma_cmd.msg.opcode == OpcodeType( DMA_MVIN ):
+              s.state_ff    <<= STATE_DMA_MVIN_REQ # Move to the next state: to issue a read request to DRAM.
+            else: # DMA_MVOUT
+              s.state_ff    <<= STATE_DMA_MVOUT_READ # Move to the next state: to issue a read request to SPM.
+
+        elif s.state == STATE_DMA_MVIN_REQ: # Issues a read request to DRAM.
+          if s.send_to_dram_rd_req.val & s.send_to_dram_rd_req.rdy:
+            s.dram_addr_ff  <<= s.dram_addr_reg + DramAddrType( dram_beat_nbytes )
+            s.state_ff      <<= STATE_DMA_MVIN_RESP
+
+        elif s.state == STATE_DMA_MVIN_RESP: # Receives a response from DRAM.
+          if s.recv_from_dram_rd_resp.val & s.recv_from_dram_rd_resp.rdy:
+            s.beat_ff       <<= s.recv_from_dram_rd_resp.msg
+            s.word_idx_ff   <<= b2( 0 )
+            s.state_ff      <<= STATE_DMA_MVIN_WRITE # Move to the next state: to write to SPM.
+
+        elif s.state == STATE_DMA_MVIN_WRITE: # Writes to SPM.
+          if s.send_to_spm_wr_req.val & s.send_to_spm_wr_req.rdy:
+            # Advance the SPM word address for the next write (+1).
+            s.spm_addr_ff   <<= s.spm_addr_reg + SpmAddrType( 1 )
+            # Update the number of words remaining to write to SPM.
+            s.words_left_ff <<= s.words_left_reg - BytesType( 1 )
+
+            if s.words_left_reg == BytesType( 1 ):
+              s.state_ff    <<= STATE_DMA_DONE
+            elif s.word_idx_reg == b2( 3 ):
+              s.word_idx_ff <<= b2( 0 )
+              s.state_ff    <<= STATE_DMA_MVIN_REQ
+            else:
+              s.word_idx_ff <<= s.word_idx_reg + b2( 1 )
+
+        elif s.state == STATE_DMA_MVOUT_READ:
+          if s.send_to_spm_rd_req.val & s.send_to_spm_rd_req.rdy:
+            s.state_ff      <<= STATE_DMA_MVOUT_RESP # Move to the next state: to receive a response from SPM.
+
+        elif s.state == STATE_DMA_MVOUT_RESP:
+          if s.recv_from_spm_rd_resp.val & s.recv_from_spm_rd_resp.rdy:
+            # Insert the SPM word into slot word_idx of the 128-bit beat (other slots are kept).
+            if s.word_idx_reg == b2( 0 ): # 1st word
+              s.beat_ff <<= concat( s.beat_reg[spm_data_nbits : spm_data_nbits<<2],
+                                    s.recv_from_spm_rd_resp.msg.data )
+            elif s.word_idx_reg == b2( 1 ):
+              s.beat_ff <<= concat( s.beat_reg[spm_data_nbits<<1 : spm_data_nbits<<2],
+                                    s.recv_from_spm_rd_resp.msg.data,
+                                    s.beat_reg[0:spm_data_nbits] )
+            elif s.word_idx_reg == b2( 2 ):
+              s.beat_ff <<= concat( s.beat_reg[(spm_data_nbits<<1)+spm_data_nbits : spm_data_nbits<<2],
+                                    s.recv_from_spm_rd_resp.msg.data,
+                                    s.beat_reg[0:spm_data_nbits<<1] )
+            else:
+              s.beat_ff <<= concat( s.recv_from_spm_rd_resp.msg.data,
+                                    s.beat_reg[0 : (spm_data_nbits<<1)+spm_data_nbits] )
+
+            s.spm_addr_ff   <<= s.spm_addr_reg + SpmAddrType( 1 )
+            s.words_left_ff <<= s.words_left_reg - BytesType( 1 )
+
+            if s.words_left_reg == BytesType( 1 ):
+              # Last beat of MVOUT: compute byte-mask based on how many
+              # valid 32-bit words are in this final beat.
+              if s.word_idx_reg == b2( 0 ):
+                s.wr_mask_ff <<= MemMaskType( 0x000f )  # 1 word  (bytes 0-3)
+              elif s.word_idx_reg == b2( 1 ):
+                s.wr_mask_ff <<= MemMaskType( 0x00ff )  # 2 words (bytes 0-7)
+              elif s.word_idx_reg == b2( 2 ):
+                s.wr_mask_ff <<= MemMaskType( 0x0fff )  # 3 words (bytes 0-11)
+              else:
+                s.wr_mask_ff <<= MemMaskType( 0xffff )  # 4 words (bytes 0-15)
+              s.state_ff    <<= STATE_DMA_MVOUT_WRITE
+            elif s.word_idx_reg == b2( 3 ):
+              # Full beat (4 words): all 16 bytes are valid.
+              s.wr_mask_ff  <<= MemMaskType( 0xffff )
+              s.state_ff    <<= STATE_DMA_MVOUT_WRITE
+            else:
+              s.word_idx_ff <<= s.word_idx_reg + b2( 1 )
+              s.state_ff    <<= STATE_DMA_MVOUT_READ
+
+        elif s.state == STATE_DMA_MVOUT_WRITE:
+          if s.send_to_dram_wr_req.val & s.send_to_dram_wr_req.rdy:
+            s.state_ff    <<= STATE_DMA_MVOUT_WAIT
+
+        elif s.state == STATE_DMA_MVOUT_WAIT:
+          if s.recv_from_dram_wr_resp.val & s.recv_from_dram_wr_resp.rdy:
+            # Advance the DRAM byte address by one beat (dram_beat_nbytes) after the write completes.
+            s.dram_addr_ff  <<= s.dram_addr_reg + DramAddrType( dram_beat_nbytes )
+            s.beat_ff       <<= MemDataType( 0 )
+            s.word_idx_ff   <<= b2( 0 )
+            s.wr_mask_ff    <<= MemMaskType( 0 )
+
+            if s.words_left_reg == BytesType( 0 ):
+              s.state_ff    <<= STATE_DMA_DONE
+            else:
+              s.state_ff    <<= STATE_DMA_MVOUT_READ
+
+        elif s.state == STATE_DMA_DONE:
+          if s.dma_done.val & s.dma_done.rdy:
+            s.state_ff      <<= STATE_DMA_IDLE
+
+  def line_trace( s ):
+    return f"dma(state={int(s.state)},tag={int(s.tag_reg)},left={int(s.words_left_reg)})"
diff --git a/mem/dma/__init__.py b/mem/dma/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/mem/dma/__init__.py
@@ -0,0 +1 @@
+
diff --git a/mem/dma/test/DmaEngineRTL_test.py b/mem/dma/test/DmaEngineRTL_test.py
new file mode 100644
index 00000000..186abb97
--- /dev/null
+++ b/mem/dma/test/DmaEngineRTL_test.py
@@ -0,0 +1,245 @@
+"""
+==========================================================================
+DmaEngineRTL_test.py
+==========================================================================
+"""
+
+from pymtl3 import *
+
+from ..DmaEngineRTL import DmaEngineRTL, DMA_MVIN, DMA_MVOUT
+
+
+def make_dut():
+  dut = DmaEngineRTL()
+  dut.apply(DefaultPassGroup())
+  dut.sim_reset()
+
+  dut.dma_cmd.val @= 0
+  dut.dma_cmd.msg.opcode @= 0
+  dut.dma_cmd.msg.dram_addr @= 0
+  dut.dma_cmd.msg.spm_addr @= 0
+  dut.dma_cmd.msg.nbytes @= 0
+  dut.dma_cmd.msg.dma_tag @= 0
+  dut.dma_done.rdy @= 1
+
+  dut.send_to_dram_rd_req.rdy @= 1
+  dut.recv_from_dram_rd_resp.val @= 0
+  dut.recv_from_dram_rd_resp.msg @= 0
+  dut.send_to_dram_wr_req.rdy @= 1
+  dut.recv_from_dram_wr_resp.val @= 1
+  dut.recv_from_dram_wr_resp.msg @= 0
+
+  dut.send_to_spm_wr_req.rdy @= 1
+  dut.send_to_spm_rd_req.rdy @= 1
+  dut.recv_from_spm_rd_resp.val @= 0
+  dut.recv_from_spm_rd_resp.msg.data @= 0
+  dut.sim_eval_combinational()
+  return dut
+
+
+def issue_cmd(dut, opcode, dram_addr, spm_addr, nbytes, tag):
+  """
+  Issues a DMA command to the DUT.
+  Args:
+    dut: The DUT instance.
+    opcode: The opcode of the DMA command. DMA_MVIN or DMA_MVOUT.
+    dram_addr: The DRAM address of the DMA command.
+    spm_addr: The SPM address of the DMA command.
+    nbytes: The number of bytes to transfer.
+    tag: The tag of the DMA command.
+  """
+  # NOTE nbytes is the number of bytes to transfer.
+  # Currently, only nbytes that are multiples of 4 are supported.
+  assert nbytes % 4 == 0, \
+    f"DMA nbytes must be a multiple of 4, got {nbytes}"
+  dut.dma_cmd.val @= 1
+  dut.dma_cmd.msg.opcode @= opcode
+  dut.dma_cmd.msg.dram_addr @= dram_addr
+  dut.dma_cmd.msg.spm_addr @= spm_addr
+  dut.dma_cmd.msg.nbytes @= nbytes
+  dut.dma_cmd.msg.dma_tag @= tag
+  dut.sim_eval_combinational()
+  assert dut.dma_cmd.rdy
+  dut.sim_tick()
+  dut.dma_cmd.val @= 0
+
+
+def test_dma_mvin_one_beat():
+  """
+  Tests DMA_MVIN operation.
+  The DRAM contains 2 beats of data, which should be unpacked into 8
+  sequential SPM writes.
+  """
+  dut = make_dut()
+  issue_cmd(dut, DMA_MVIN, 
+           0x1000, # dram_addr
+           4, # spm_addr
+           32, # nbytes(number of bytes to transfer)
+           0x5a) # tag
+
+  dram = {
+    0x1000: concat(Bits32(0x44444444), Bits32(0x33333333),
+                   Bits32(0x22222222), Bits32(0x11111111)), # 4 x 4 bytes = 16 bytes in total.
+    
+    # Address bias: +16, since DRAM is byte-addressed(each address points to a byte).
+    0x1010: concat(Bits32(0x88888888), Bits32(0x77777777),
+                   Bits32(0x66666666), Bits32(0x55555555)),
+  }
+  pending_resp = None
+  spm_writes = []
+
+  for _ in range(20):
+    dut.recv_from_dram_rd_resp.val @= 0
+    if pending_resp is not None:
+      dut.recv_from_dram_rd_resp.val @= 1
+      dut.recv_from_dram_rd_resp.msg @= pending_resp
+
+    dut.sim_eval_combinational()
+
+    if dut.send_to_dram_rd_req.val & dut.send_to_dram_rd_req.rdy:
+      pending_resp = dram[int(dut.send_to_dram_rd_req.msg)]
+    else:
+      pending_resp = None
+
+    if dut.send_to_spm_wr_req.val & dut.send_to_spm_wr_req.rdy:
+      spm_writes.append((int(dut.send_to_spm_wr_req.msg.addr), int(dut.send_to_spm_wr_req.msg.data)))
+
+    if dut.dma_done.val:
+      assert int(dut.dma_done.msg.dma_tag) == 0x5a
+      break
+
+    dut.sim_tick()
+
+  for elem in spm_writes:
+    print(f'{elem[0]}: 0x{elem[1]:08x}')
+
+  assert spm_writes == [
+    (4, 0x11111111),
+    (5, 0x22222222),
+    (6, 0x33333333),
+    (7, 0x44444444),
+
+    (8, 0x55555555),
+    (9, 0x66666666),
+    (10, 0x77777777),
+    (11, 0x88888888),
+  ]
+
+
+def test_dma_mvout_partial_beat():
+  """
+  Tests a partial beat MVOUT operation (12 bytes / 3 words).
+  The DMA should read three words from SPM, pack them into a 128-bit beat
+  with a proper byte mask, and write it to DRAM.
+  """
+  dut = make_dut()
+  issue_cmd(dut, DMA_MVOUT, 
+            0x2000, # dram_addr
+            8, # spm_addr
+            12, # nbytes(number of bytes to transfer)
+            0xa5) # tag
+
+  spm = {
+    8: 0xaaaabbbb,
+    9: 0xccccdddd,
+    10: 0xeeeeffff,
+  }
+  pending_rresp = None
+  mem_writes = []
+
+  for _ in range(30):
+    dut.recv_from_spm_rd_resp.val @= 0
+    if pending_rresp is not None:
+      dut.recv_from_spm_rd_resp.val @= 1
+      dut.recv_from_spm_rd_resp.msg.data @= pending_rresp
+
+    dut.sim_eval_combinational()
+
+    if dut.send_to_spm_rd_req.val & dut.send_to_spm_rd_req.rdy:
+      pending_rresp = spm[int(dut.send_to_spm_rd_req.msg.addr)]
+    else:
+      pending_rresp = None
+
+    if dut.send_to_dram_wr_req.val & dut.send_to_dram_wr_req.rdy:
+      mem_writes.append((int(dut.send_to_dram_wr_req.msg.addr),
+                         int(dut.send_to_dram_wr_req.msg.data),
+                         int(dut.send_to_dram_wr_req.msg.mask)))
+
+    if dut.dma_done.val:
+      assert int(dut.dma_done.msg.dma_tag) == 0xa5
+      break
+
+    dut.sim_tick()
+
+  assert mem_writes == [
+    (0x2000,
+     int(concat(Bits32(0), Bits32(0xeeeeffff),
+                Bits32(0xccccdddd), Bits32(0xaaaabbbb))),
+     0x0fff), # mask
+  ]
+
+def test_dma_mvout_full_beat():
+  """
+  Tests a full beat MVOUT operation (16 bytes / 4 words).
+  The DMA should read four words from SPM, pack them into a 128-bit beat
+  with a proper byte mask, and write it to DRAM.
+  """
+  dut = make_dut()
+  issue_cmd(dut, DMA_MVOUT, 
+            0x2000, # dram_addr
+            8, # spm_addr
+            32, # nbytes(number of bytes to transfer)
+            0xa5) # tag
+
+  spm = {
+    8 : 0x11112222,
+    9 : 0x33334444,
+    10: 0x55556666,
+    11: 0x77778888,
+    12: 0x9999aaaa,
+    13: 0xbbbbcccc,
+    14: 0xddddeeee,
+    15: 0xffff0000,
+  }
+  pending_rresp = None
+  mem_writes = []
+
+  for _ in range(30):
+    dut.recv_from_spm_rd_resp.val @= 0
+    if pending_rresp is not None:
+      dut.recv_from_spm_rd_resp.val @= 1
+      dut.recv_from_spm_rd_resp.msg.data @= pending_rresp
+
+    dut.sim_eval_combinational()
+
+    if dut.send_to_spm_rd_req.val & dut.send_to_spm_rd_req.rdy:
+      pending_rresp = spm[int(dut.send_to_spm_rd_req.msg.addr)]
+    else:
+      pending_rresp = None
+
+    if dut.send_to_dram_wr_req.val & dut.send_to_dram_wr_req.rdy:
+      mem_writes.append((int(dut.send_to_dram_wr_req.msg.addr),
+                         int(dut.send_to_dram_wr_req.msg.data),
+                         int(dut.send_to_dram_wr_req.msg.mask)))
+
+    if dut.dma_done.val:
+      assert int(dut.dma_done.msg.dma_tag) == 0xa5
+      break
+
+    dut.sim_tick()
+
+  for elem in mem_writes:
+    print(f'{elem[0]}: 0x{elem[1]:08x}')
+    print(f'mask: 0x{elem[2]:08x}')
+
+  assert mem_writes == [
+    (0x2000,
+     int(concat(Bits32(0x77778888), Bits32(0x55556666),
+                Bits32(0x33334444), Bits32(0x11112222))),
+     0xffff), # mask
+
+     (0x2010,
+      int(concat(Bits32(0xffff0000), Bits32(0xddddeeee),
+                Bits32(0xbbbbcccc), Bits32(0x9999aaaa))),
+     0xffff),
+  ]
\ No newline at end of file
diff --git a/mem/dma/test/__init__.py b/mem/dma/test/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/mem/dma/test/__init__.py
@@ -0,0 +1 @@
+