diff --git a/.gitignore b/.gitignore index 305e025f..b41e10c2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ build __pycache__ .hypothesis .vscode +*.log \ No newline at end of file diff --git a/cgra/CgraRTL.py b/cgra/CgraRTL.py index 87777384..c9dffcea 100644 --- a/cgra/CgraRTL.py +++ b/cgra/CgraRTL.py @@ -138,8 +138,8 @@ def construct(s, CgraPayloadType, s.data_mem.address_upper //= s.address_upper # Connects data memory with controller. - s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request - s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request + s.data_mem.recv_from_noc_load_request //= s.controller.send_to_sram_load_request_from_noc + s.data_mem.recv_from_noc_store_request //= s.controller.send_to_sram_store_request_from_noc s.data_mem.recv_from_noc_load_response_pkt //= s.controller.send_to_tile_load_response s.data_mem.send_to_noc_load_request_pkt //= s.controller.recv_from_tile_load_request_pkt s.data_mem.send_to_noc_load_response_pkt //= s.controller.recv_from_tile_load_response_pkt diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index 00788487..986efe06 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -83,7 +83,10 @@ def construct(s, CgraPayloadType, provided_max_per_cgra_rows = None, provided_max_per_cgra_cols = None, provided_max_num_rd_tiles = None, - provided_max_num_wr_tiles = None): + provided_max_num_wr_tiles = None, + has_dma_ports = False, + DmaDataType = mk_dma_data(), + DmaCmdType = mk_dma_cmd()): """ provided_max_per_cgra_rows: the row number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch. provided_max_per_cgra_cols: the column number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch. 
@@ -126,6 +129,14 @@ def construct(s, CgraPayloadType, CtrlRingPos = mk_ring_pos(max_num_tiles + 1) CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) DataAddrType = mk_bits(clog2(data_mem_size_global)) + DmaTagType = DmaCmdType.get_field_type(kAttrDmaTag) + DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData) + DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr) + DmaDoneType = mk_dma_done(DmaTagType.nbits) + DmaSpmWriteReqType = mk_dma_spm_write_req(DmaSpmAddrType.nbits, + DmaSpmDataType.nbits) + DmaSpmReadReqType = mk_dma_spm_read_req(DmaSpmAddrType.nbits) + DmaSpmReadRespType = mk_dma_spm_read_resp(DmaSpmDataType.nbits) assert(data_mem_size_per_bank * num_banks_per_cgra <= \ data_mem_size_global) @@ -135,6 +146,21 @@ def construct(s, CgraPayloadType, s.recv_from_inter_cgra_noc = RecvIfcRTL(NocPktType) s.send_to_inter_cgra_noc = SendIfcRTL(NocPktType) + # Optional DMA engine-facing ports. The controller owns command decode and + # forwards DMA SPM access to the data memory. + if has_dma_ports: + s.dma_cmd = SendIfcRTL(DmaCmdType) + + s.dma_done = RecvIfcRTL(DmaDoneType) + + # Receive the request of writing into SPM from the DMA. + s.recv_from_dma_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType) + # Receive the request of reading from SPM from the DMA. + s.recv_from_dma_spm_rd_req = RecvIfcRTL(DmaSpmReadReqType) + # Send the response of reading from SPM to the DMA. + s.send_to_dma_spm_rd_resp = SendIfcRTL(DmaSpmReadRespType) + + if is_multi_cgra: # Use the largest CGRA shape to set the boundary ports for compatibility in the case of heterogeneous multi-cgra. # Remember to ground the remaining boundary ports of the current CGRA when the current CGRA has fewer rows or columns than the largest CGRA. 
@@ -168,11 +194,17 @@ def construct(s, CgraPayloadType, multi_cgra_columns, max_num_tiles, mem_access_is_combinational, - idTo2d_map) + idTo2d_map, + has_dma_ports, + DmaCmdType, + DmaDataType) s.cgra_id = InPort(CgraIdType) s.controller = ControllerRTL(NocPktType, multi_cgra_rows, multi_cgra_columns, - max_num_tiles, controller2addr_map, idTo2d_map) + max_num_tiles, controller2addr_map, idTo2d_map, + has_dma_ports, + DmaDataType, + DmaCmdType) # Connects controller id. s.controller.cgra_id //= s.cgra_id # An additional router for controller to receive CMD_COMPLETE signal from Ring to CPU. @@ -190,9 +222,35 @@ def construct(s, CgraPayloadType, s.data_mem.address_lower //= s.address_lower s.data_mem.address_upper //= s.address_upper + if has_dma_ports: + # CPU packets are decoded by the controller before becoming DMA commands. + s.dma_cmd //= s.controller.dma_cmd + s.dma_done //= s.controller.dma_done + + s.recv_from_dma_spm_wr_req //= s.controller.recv_from_dma_spm_wr_req + s.recv_from_dma_spm_rd_req //= s.controller.recv_from_dma_spm_rd_req + s.send_to_dma_spm_rd_resp //= s.controller.send_to_dma_spm_rd_resp + + else: + # Grounds the DMA ports when no DMA engine is attached. + s.controller.dma_cmd.rdy //= 0 + s.controller.dma_done.val //= 0 + s.controller.dma_done.msg //= DmaDoneType() + + s.controller.recv_from_dma_spm_wr_req.val //= 0 + s.controller.recv_from_dma_spm_wr_req.msg //= DmaSpmWriteReqType() + s.controller.recv_from_dma_spm_rd_req.val //= 0 + s.controller.recv_from_dma_spm_rd_req.msg //= DmaSpmReadReqType() + s.controller.send_to_dma_spm_rd_resp.rdy //= 0 + + # Controller <-> SPM/data_mem + s.controller.send_to_sram_store_request_from_dma //= s.data_mem.recv_from_controller_spm_wr_req + s.controller.send_to_sram_load_request_from_dma //= s.data_mem.recv_from_controller_spm_rd_req + s.controller.recv_from_sram_load_response //= s.data_mem.send_to_controller_spm_rd_resp + # Connects data memory with controller. 
- s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request - s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request + s.data_mem.recv_from_noc_load_request //= s.controller.send_to_sram_load_request_from_noc + s.data_mem.recv_from_noc_store_request //= s.controller.send_to_sram_store_request_from_noc s.data_mem.recv_from_noc_load_response_pkt //= s.controller.send_to_tile_load_response s.data_mem.send_to_noc_load_request_pkt //= s.controller.recv_from_tile_load_request_pkt s.data_mem.send_to_noc_load_response_pkt //= s.controller.recv_from_tile_load_response_pkt diff --git a/cgra/CgraWithContextSwitchRTL.py b/cgra/CgraWithContextSwitchRTL.py index 361c0a9b..47bf8478 100644 --- a/cgra/CgraWithContextSwitchRTL.py +++ b/cgra/CgraWithContextSwitchRTL.py @@ -131,8 +131,8 @@ def construct(s, CgraPayloadType, s.data_mem.address_upper //= s.address_upper # Connects data memory with controller. - s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request - s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request + s.data_mem.recv_from_noc_load_request //= s.controller.send_to_sram_load_request_from_noc + s.data_mem.recv_from_noc_store_request //= s.controller.send_to_sram_store_request_from_noc s.data_mem.recv_from_noc_load_response_pkt //= s.controller.send_to_tile_load_response s.data_mem.send_to_noc_load_request_pkt //= s.controller.recv_from_tile_load_request_pkt s.data_mem.send_to_noc_load_response_pkt //= s.controller.recv_from_tile_load_response_pkt diff --git a/cgra/CgraWithStreamingLoadRTL.py b/cgra/CgraWithStreamingLoadRTL.py index 6e7dcbf5..e7b6b64e 100644 --- a/cgra/CgraWithStreamingLoadRTL.py +++ b/cgra/CgraWithStreamingLoadRTL.py @@ -138,8 +138,8 @@ def construct(s, CgraPayloadType, s.data_mem.address_upper //= s.address_upper # Connects data memory with controller. 
- s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request - s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request + s.data_mem.recv_from_noc_load_request //= s.controller.send_to_sram_load_request_from_noc + s.data_mem.recv_from_noc_store_request //= s.controller.send_to_sram_store_request_from_noc s.data_mem.recv_from_noc_load_response_pkt //= s.controller.send_to_tile_load_response s.data_mem.send_to_noc_load_request_pkt //= s.controller.recv_from_tile_load_request_pkt s.data_mem.send_to_noc_load_response_pkt //= s.controller.recv_from_tile_load_response_pkt diff --git a/cgra/IntegratedCgraWithDmaRTL.py b/cgra/IntegratedCgraWithDmaRTL.py new file mode 100644 index 00000000..0aafa364 --- /dev/null +++ b/cgra/IntegratedCgraWithDmaRTL.py @@ -0,0 +1,194 @@ +""" +========================================================================= +IntegratedCgraWithDmaRTL.py +========================================================================= + +Wrapper that composes a CGRA template with a DMA engine attached to the +CGRA data SPM. +""" + +from pymtl3 import * + +from .CgraTemplateRTL import CgraTemplateRTL +from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL +from ..lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL +from ..lib.messages import * +from ..lib.util.data_struct_attr import * +from ..mem.dma.DmaEngineRTL import DmaEngineRTL + + +class IntegratedCgraWithDmaRTL( Component ): + """ + IntegratedCgraWithDmaRTL is a top-level wrapper that integrates a CGRA instance with a + DMA engine. + + Architectural Design: + - It instantiates a standard CGRA template (`CgraTemplateRTL`) and a + DMA engine (`DmaEngineRTL`). + - CPU control packets are passed through to the CGRA's controller. + DMA commands are decoded there. + - The DMA engine accesses the CGRA's internal data SPM through controller- + forwarded ports; it is not connected directly to `DataMemControllerRTL`. 
+ - External memory requests from the DMA engine are exposed at the top level + to be connected to a DRAM model or an AXI adapter. + - Boundary data ports for multi-CGRA configurations are also passed through + if enabled. + """ + + def construct(s, CgraPayloadType, + multi_cgra_rows, + multi_cgra_columns, + per_cgra_rows, per_cgra_columns, + ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, + num_registers_per_reg_bank, num_ctrl, + total_steps, mem_access_is_combinational, + FunctionUnit, FuList, TileList, LinkList, + dataSPM, controller2addr_map, idTo2d_map, + is_multi_cgra = True, cgra_id = 0, + # For heterogeneous multi-cgra support.(maybe remove it in IntegratedCgraWithDmaRTL for simplicity?) + provided_max_per_cgra_rows = None, + provided_max_per_cgra_cols = None, + provided_max_num_rd_tiles = None, + provided_max_num_wr_tiles = None): + + DataType = CgraPayloadType.get_field_type(kAttrData) + data_bitwidth = DataType.get_field_type(kAttrPayload).nbits + assert data_bitwidth == 32 + + max_per_cgra_rows = provided_max_per_cgra_rows if provided_max_per_cgra_rows is not None else per_cgra_rows + max_per_cgra_cols = provided_max_per_cgra_cols if provided_max_per_cgra_cols is not None else per_cgra_columns + max_num_tiles = max_per_cgra_rows * max_per_cgra_cols + max_num_rd_tiles = provided_max_num_rd_tiles if provided_max_num_rd_tiles is not None else dataSPM.getNumOfValidReadPorts() + + CtrlPktType = mk_intra_cgra_pkt(multi_cgra_columns, multi_cgra_rows, + max_num_tiles, CgraPayloadType) + NocPktType = mk_inter_cgra_pkt(multi_cgra_columns, multi_cgra_rows, + max_num_tiles, max_num_rd_tiles, + CgraPayloadType) + + CgraIdType = mk_cgra_id_type(multi_cgra_columns, multi_cgra_rows) + DataAddrType = mk_bits(clog2(data_mem_size_global)) + DmaCmdType = mk_dma_cmd(dram_addr_nbits = 64, + spm_addr_nbits = 32, + bytes_nbits = 32, + tag_nbits = 8) + + DmaDataType = mk_dma_data(dram_data_nbits = 128, + dram_mask_nbits = 16, + spm_data_nbits 
= 32) + + DmaDramAddrType = DmaCmdType.get_field_type(kAttrDramAddr) + DmaMemDataType = DmaDataType.get_field_type(kAttrDramData) + DmaMemMaskType = DmaDataType.get_field_type(kAttrDramMask) + DmaDramWrReqType = mk_dma_dram_wr_req(DmaDramAddrType.nbits, DmaMemDataType.nbits, DmaMemMaskType.nbits) + + # Existing CGRA-facing interfaces. + # CGRA <-> CPU + s.recv_from_cpu_pkt = RecvIfcRTL(CtrlPktType) + s.send_to_cpu_pkt = SendIfcRTL(CtrlPktType) + + if is_multi_cgra: + s.recv_from_inter_cgra_noc = RecvIfcRTL(NocPktType) + s.send_to_inter_cgra_noc = SendIfcRTL(NocPktType) + + s.recv_data_on_boundary_north = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_cols)] + s.send_data_on_boundary_north = [SendIfcRTL(DataType) for _ in range(max_per_cgra_cols)] + s.recv_data_on_boundary_south = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_cols)] + s.send_data_on_boundary_south = [SendIfcRTL(DataType) for _ in range(max_per_cgra_cols)] + s.recv_data_on_boundary_west = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_rows)] + s.send_data_on_boundary_west = [SendIfcRTL(DataType) for _ in range(max_per_cgra_rows)] + s.recv_data_on_boundary_east = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_rows)] + s.send_data_on_boundary_east = [SendIfcRTL(DataType) for _ in range(max_per_cgra_rows)] + + s.cgra_id = InPort(CgraIdType) + # The local address range of current CGRA. + # Any address out of this range will be assumed as remote address. + s.address_lower = InPort(DataAddrType) + s.address_upper = InPort(DataAddrType) + + # Abstract external dram memory interfaces for the internal DMA engine. + + s.send_to_dram_rd_req = SendIfcRTL(DmaDramAddrType) + s.recv_from_dram_rd_resp = RecvIfcRTL(DmaMemDataType) + + s.send_to_dram_wr_req = SendIfcRTL(DmaDramWrReqType) + s.recv_from_dram_wr_resp = RecvIfcRTL(mk_bits(1)) + + # Components. 
+ + s.cgra = CgraTemplateRTL(CgraPayloadType, + multi_cgra_rows, + multi_cgra_columns, + per_cgra_rows, per_cgra_columns, + ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, + num_registers_per_reg_bank, num_ctrl, + total_steps, mem_access_is_combinational, + FunctionUnit, FuList, TileList, LinkList, + dataSPM, controller2addr_map, idTo2d_map, + is_multi_cgra, cgra_id, + provided_max_per_cgra_rows, + provided_max_per_cgra_cols, + provided_max_num_rd_tiles, + provided_max_num_wr_tiles, + has_dma_ports = True, + DmaDataType = DmaDataType, + DmaCmdType = DmaCmdType) + + DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData) + DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr) + DmaBytesType = DmaCmdType.get_field_type(kAttrNBytes) + DmaTagType = DmaCmdType.get_field_type(kAttrDmaTag) + s.dma = DmaEngineRTL(spm_data_nbits = DmaSpmDataType.nbits, + dram_data_nbits = DmaMemDataType.nbits, + dram_addr_nbits = DmaDramAddrType.nbits, + spm_addr_nbits = DmaSpmAddrType.nbits, + bytes_nbits = DmaBytesType.nbits, + tag_nbits = DmaTagType.nbits) + + # CGRA passthrough connections. 
+ + s.recv_from_cpu_pkt //= s.cgra.recv_from_cpu_pkt + s.send_to_cpu_pkt //= s.cgra.send_to_cpu_pkt + + if is_multi_cgra: + s.recv_from_inter_cgra_noc //= s.cgra.recv_from_inter_cgra_noc + s.send_to_inter_cgra_noc //= s.cgra.send_to_inter_cgra_noc + + for i in range(max_per_cgra_cols): + s.recv_data_on_boundary_north[i] //= s.cgra.recv_data_on_boundary_north[i] + s.send_data_on_boundary_north[i] //= s.cgra.send_data_on_boundary_north[i] + s.recv_data_on_boundary_south[i] //= s.cgra.recv_data_on_boundary_south[i] + s.send_data_on_boundary_south[i] //= s.cgra.send_data_on_boundary_south[i] + + for i in range(max_per_cgra_rows): + s.recv_data_on_boundary_west[i] //= s.cgra.recv_data_on_boundary_west[i] + s.send_data_on_boundary_west[i] //= s.cgra.send_data_on_boundary_west[i] + s.recv_data_on_boundary_east[i] //= s.cgra.recv_data_on_boundary_east[i] + s.send_data_on_boundary_east[i] //= s.cgra.send_data_on_boundary_east[i] + + s.cgra_id //= s.cgra.cgra_id + s.address_lower //= s.cgra.address_lower + s.address_upper //= s.cgra.address_upper + + + # Connections between CGRA and DMA engine. + # CGRA communicates with DMA engine through the controller. + s.cgra.dma_cmd //= s.dma.dma_cmd + s.dma.dma_done //= s.cgra.dma_done + + s.send_to_dram_rd_req //= s.dma.send_to_dram_rd_req + s.recv_from_dram_rd_resp //= s.dma.recv_from_dram_rd_resp + + s.send_to_dram_wr_req //= s.dma.send_to_dram_wr_req + s.recv_from_dram_wr_resp //= s.dma.recv_from_dram_wr_resp + + # DMA to controller-forwarded SPM connections. 
+ + s.dma.send_to_spm_wr_req //= s.cgra.recv_from_dma_spm_wr_req + s.dma.send_to_spm_rd_req //= s.cgra.recv_from_dma_spm_rd_req + s.dma.recv_from_spm_rd_resp //= s.cgra.send_to_dma_spm_rd_resp + + def line_trace(s): + return f"{s.dma.line_trace()} || {s.cgra.line_trace()}" diff --git a/cgra/test/IntegratedCgraWithDmaRTL_test.py b/cgra/test/IntegratedCgraWithDmaRTL_test.py new file mode 100644 index 00000000..96aa8846 --- /dev/null +++ b/cgra/test/IntegratedCgraWithDmaRTL_test.py @@ -0,0 +1,306 @@ +""" +========================================================================== +IntegratedCgraWithDmaRTL_test.py +========================================================================== +""" + +from pymtl3 import * +from pymtl3.passes.backends.verilog import VerilogTranslationPass +from pymtl3.stdlib.test_utils import config_model_with_cmdline_opts + +from ..IntegratedCgraWithDmaRTL import IntegratedCgraWithDmaRTL +from ...fu.single.AdderRTL import AdderRTL +from ...fu.single.MemUnitRTL import MemUnitRTL +from ...fu.single.RetRTL import RetRTL +from ...lib.cmd_type import * +from ...lib.messages import * +from ...lib.opt_type import * +from ...lib.util.cgra.DataSPM import DataSPM +from ...lib.util.cgra.Tile import Tile +from ...lib.util.cgra.cgra_helper import get_links + + +ctrl_mem_size = 8 +data_mem_size_global = 64 +data_mem_size_per_bank = 16 +num_banks_per_cgra = 4 +num_registers_per_reg_bank = 16 +num_ctrl = 1 +total_steps = 1 + +DataType = mk_data(32, 1) +DataAddrType = mk_bits(clog2(data_mem_size_global)) +CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) +CtrlType = mk_ctrl(4, 2, 8, 8, num_registers_per_reg_bank) +CgraPayloadType = mk_cgra_payload(DataType, DataAddrType, CtrlType, + CtrlAddrType) +CtrlPktType = mk_intra_cgra_pkt(1, 1, 4, CgraPayloadType) +WordType = mk_bits(32) + + +def make_dut(): + # 2x2 tiles with add/mem/return functional units + tiles_2d = [[Tile(x, y, num_registers_per_reg_bank, ["add", "mem", "return"]) + for x in range(2)] for y in 
range(2)] + TileList = [t for row in tiles_2d for t in row] + LinkList = get_links(tiles_2d) + dataSPM = DataSPM(3, 3) + + dut = IntegratedCgraWithDmaRTL( + CgraPayloadType, + 1, 1, # multi_cgra_rows, multi_cgra_columns + 2, 2, # per_cgra_rows, per_cgra_columns + ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, + num_registers_per_reg_bank, num_ctrl, + total_steps, True, + None, [AdderRTL, MemUnitRTL, RetRTL], + TileList, LinkList, dataSPM, + {0: [0, 15]}, # controller to address map + {0: [0, 0]}, # cgra id to 2D coordinate + is_multi_cgra=False) + + return dut + +def issue_cpu_pkt(dut, pkt, max_cycles = 20): + """ + CPU issues a packet to the CGRA. + """ + dut.recv_from_cpu_pkt.val @= 1 + dut.recv_from_cpu_pkt.msg @= pkt + + for _ in range(max_cycles): + dut.sim_eval_combinational() + if dut.recv_from_cpu_pkt.rdy: + dut.sim_tick() + dut.recv_from_cpu_pkt.val @= 0 + dut.sim_eval_combinational() + return + dut.sim_tick() + + assert False, "CPU packet was not accepted by the CGRA" + + +def issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType, + dma_cmd, dram_addr, spm_addr, nbytes, tag): + + """ + Issues a DMA command to the CGRA. + Args: + dut: The CGRA instance. + CtrlPktType: The type of the control packet. + CgraPayloadType: The type of the CGRA payload. + DataType: The type of the data. + DataAddrType: The type of the data address. + + dma_cmd: The DMA command to issue.(CMD_DMA_MVIN or CMD_DMA_MVOUT) + dram_addr: The DRAM address to transfer data from or to.(64 bits) + spm_addr: The SPM address to transfer data from or to.(32 bits) + nbytes: The number of bytes to transfer. + tag: The tag of the DMA command. This tag isn't used now. We may use it to distinguish different DMA commands. + """ + # NOTE nbytes is the number of bytes to transfer. + # Currently, only nbytes that are multiples of 4 are supported. 
+ assert nbytes % 4 == 0, \ + f"DMA nbytes must be a multiple of 4, got {nbytes}" + config_pkts = [ + # The bitwidth of dram address is 64 bits, so we need to split it into two 32-bit parts. + # Lower 32 bits are sent first. + CtrlPktType(0, 0, payload = CgraPayloadType( + CMD_DMA_CONFIG_DRAM_ADDR_LO, + data = DataType(dram_addr & 0xffffffff, 1))), + + # Higher 32 bits are sent second. + CtrlPktType(0, 0, payload = CgraPayloadType( + CMD_DMA_CONFIG_DRAM_ADDR_HI, + data = DataType((dram_addr >> 32) & 0xffffffff, 1))), + + # The SPM address to read from or write to. + CtrlPktType(0, 0, payload = CgraPayloadType( + CMD_DMA_CONFIG_SPM_ADDR, + data_addr = DataAddrType(spm_addr))), + + # The number of bytes to transfer. + CtrlPktType(0, 0, payload = CgraPayloadType( + CMD_DMA_CONFIG_BYTES, + data = DataType(nbytes, 1))), + + # The tag of the DMA command. + CtrlPktType(0, 0, payload = CgraPayloadType( + CMD_DMA_CONFIG_TAG, + data = DataType(tag, 1))), + CtrlPktType(0, 0, payload = CgraPayloadType(dma_cmd)), + ] + + for pkt in config_pkts: + issue_cpu_pkt(dut, pkt) + + +def observed_dma_done(dut, expected_tag): + dut.sim_eval_combinational() + if dut.send_to_cpu_pkt.val and dut.send_to_cpu_pkt.msg.payload.cmd == CMD_DMA_DONE: + assert int(dut.send_to_cpu_pkt.msg.opaque) == expected_tag + assert int(dut.send_to_cpu_pkt.msg.payload.data.payload) == expected_tag + return True + return False + + +def test_cgra_dma_mvin_to_local_spm(): + """ + Integration test for the IntegratedCgraWithDmaRTL wrapper. + It simulates a DMA MVIN command that moves data from external DRAM into + the CGRA's dataSPM. It then checks the SPM contents to ensure the + transfer was successful. 
+ """ + dut = make_dut() + + dut.apply(DefaultPassGroup()) + dut.sim_reset() + + dut.cgra_id @= 0 + # Address range: [0:15] + dut.address_lower @= DataAddrType(0) + dut.address_upper @= DataAddrType(15) + + dut.recv_from_cpu_pkt.val @= 0 + dut.recv_from_cpu_pkt.msg @= CtrlPktType() + dut.send_to_cpu_pkt.rdy @= 1 + dut.send_to_dram_rd_req.rdy @= 1 + dut.recv_from_dram_rd_resp.val @= 0 + dut.recv_from_dram_rd_resp.msg @= 0 + dut.send_to_dram_wr_req.rdy @= 1 + dut.recv_from_dram_wr_resp.val @= 0 + dut.recv_from_dram_wr_resp.msg @= 0 + + # Read 16 bytes from DRAM address 0x1000 and write them to SPM words 0..3. + issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType, + CMD_DMA_MVIN, 0x1000, 0, 16, 0x33) + + beat = concat(WordType(0x44444444), WordType(0x33333333), + WordType(0x22222222), WordType(0x11111111)) + pending_resp = False + + for _ in range(40): + dut.recv_from_dram_rd_resp.val @= 0 + if pending_resp: + dut.recv_from_dram_rd_resp.val @= 1 + # Simulate the read response from DRAM. + dut.recv_from_dram_rd_resp.msg @= beat + + dut.sim_eval_combinational() + + pending_resp = bool(dut.send_to_dram_rd_req.val & dut.send_to_dram_rd_req.rdy) + + if observed_dma_done(dut, 0x33): + break + + dut.sim_tick() + + assert observed_dma_done(dut, 0x33) + # Check the data in the dataSPM. + assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[0] == DataType(0x11111111, 1, 0, 0) + assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[1] == DataType(0x22222222, 1, 0, 0) + assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[2] == DataType(0x33333333, 1, 0, 0) + assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[3] == DataType(0x44444444, 1, 0, 0) + + +def test_cgra_dma_mvout_from_local_spm(): + """ + Integration test for the IntegratedCgraWithDmaRTL wrapper. + It simulates a DMA MVOUT command that moves data from the local SPM + into external DRAM. 
+ """ + dut = make_dut() + + dut.apply(DefaultPassGroup()) + dut.sim_reset() + + # Pre-load SPM with data + dut.cgra.data_mem.memory_wrapper[0].memory.regs[0] <<= DataType(0x11111111, 1, 0, 0) + dut.cgra.data_mem.memory_wrapper[0].memory.regs[1] <<= DataType(0x22222222, 1, 0, 0) + dut.cgra.data_mem.memory_wrapper[0].memory.regs[2] <<= DataType(0x33333333, 1, 0, 0) + dut.cgra.data_mem.memory_wrapper[0].memory.regs[3] <<= DataType(0x44444444, 1, 0, 0) + dut.sim_tick() + + dut.cgra_id @= 0 + # Address range: [0:15] + dut.address_lower @= DataAddrType(0) + dut.address_upper @= DataAddrType(15) + + dut.recv_from_cpu_pkt.val @= 0 + dut.recv_from_cpu_pkt.msg @= CtrlPktType() + dut.send_to_cpu_pkt.rdy @= 1 + dut.send_to_dram_rd_req.rdy @= 1 + dut.recv_from_dram_rd_resp.val @= 0 + dut.recv_from_dram_rd_resp.msg @= 0 + dut.send_to_dram_wr_req.rdy @= 1 + dut.recv_from_dram_wr_resp.val @= 0 + dut.recv_from_dram_wr_resp.msg @= 0 + + # Read SPM words 0..3 and write 16 bytes to DRAM address 0x2000. + issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType, + CMD_DMA_MVOUT, 0x2000, 0, 16, 0x44) + + # Expected 128-bit beat + expected_beat = concat(WordType(0x44444444), WordType(0x33333333), + WordType(0x22222222), WordType(0x11111111)) + + done = False + pending_wr_resp = False + for _ in range(40): + dut.recv_from_dram_wr_resp.val @= 0 + if pending_wr_resp: + dut.recv_from_dram_wr_resp.val @= 1 + pending_wr_resp = False + + dut.sim_eval_combinational() + + if dut.send_to_dram_wr_req.val & dut.send_to_dram_wr_req.rdy: + assert dut.send_to_dram_wr_req.msg.addr == 0x2000 + assert dut.send_to_dram_wr_req.msg.data == expected_beat + pending_wr_resp = True + + if observed_dma_done(dut, 0x44): + done = True + break + + dut.sim_tick() + + assert done + +def test_gen_verilog_integrated_cgra_with_dma(cmdline_opts): + """ + Translate IntegratedCgraWithDmaRTL to Verilog. 
+ """ + dut = make_dut() + + if cmdline_opts['test_verilog']: + # Standard flow: config_model_with_cmdline_opts handles elaboration, + # translation, and Verilator import. + try: + config_model_with_cmdline_opts(dut, cmdline_opts, duts=[]) + except Exception as e: + print(f"Note (Verilator import may have failed): {e}") + + try: + fname = dut.get_metadata(VerilogTranslationPass.translated_filename) + print(f"Verilog generated: {fname}") + except Exception as e: + print(f"Could not retrieve translation metadata: {e}") + else: + # Standalone flow: apply VerilogTranslationPass directly (no Verilator). + print("Generating Verilog without --test-verilog flag...") + print("Use 'pytest --test-verilog' to also run Verilator co-simulation.") + + dut.elaborate() + + dut.set_metadata(VerilogTranslationPass.enable, True) + dut.set_metadata(VerilogTranslationPass.explicit_module_name, + 'IntegratedCgraWithDmaRTL') + dut.set_metadata(VerilogTranslationPass.explicit_file_name, + 'IntegratedCgraWithDmaRTL.v') + + dut.apply(VerilogTranslationPass()) + + fname = dut.get_metadata(VerilogTranslationPass.translated_filename) + print(f"Verilog generated: {fname}") diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index 83b41068..5a312c7f 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -29,11 +29,15 @@ def construct(s, multi_cgra_columns, num_tiles, controller2addr_map, - idTo2d_map): + idTo2d_map, + has_dma_ports = False, + DmaDataType = mk_dma_data(), + DmaCmdType = mk_dma_cmd()): # Derives types from InterCgraPktType. CgraPayloadType = InterCgraPktType.get_field_type(kAttrPayload) DataType = CgraPayloadType.get_field_type(kAttrData) + DataPayloadType = DataType.get_field_type(kAttrPayload) DataAddrType = CgraPayloadType.get_field_type(kAttrDataAddr) # Derives CgraIdType from grid dimensions. 
@@ -52,6 +56,22 @@ def construct(s, YType = mk_bits(max(clog2(multi_cgra_rows), 1)) TileIdType = mk_bits(clog2(num_tiles + 1)) ControllerXbarPktType = mk_controller_noc_xbar_pkt(InterCgraPktType) + DmaOpcodeType = DmaCmdType.get_field_type(kAttrOpcode) + DmaDramAddrType = DmaCmdType.get_field_type(kAttrDramAddr) + DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr) + DmaBytesType = DmaCmdType.get_field_type(kAttrNBytes) + DmaTagType = DmaCmdType.get_field_type(kAttrDmaTag) + DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData) + # Lower and higher 32 bits of the DRAM address. + DmaDramAddrPartType = mk_bits(DmaDramAddrType.nbits // 2) + DmaDoneType = mk_dma_done(DmaTagType.nbits) + DmaSpmWriteReqType = mk_dma_spm_write_req(DmaSpmAddrType.nbits, + DmaSpmDataType.nbits) + DmaSpmReadReqType = mk_dma_spm_read_req(DmaSpmAddrType.nbits) + DmaSpmReadRespType = mk_dma_spm_read_resp(DmaSpmDataType.nbits) + + if has_dma_ports: + assert DmaSpmDataType.nbits == 32 # Interface s.cgra_id = InPort(CgraIdType) @@ -71,9 +91,38 @@ def construct(s, s.recv_from_tile_load_response_pkt = RecvIfcRTL(InterCgraPktType) s.recv_from_tile_store_request_pkt = RecvIfcRTL(InterCgraPktType) - s.send_to_mem_load_request = SendIfcRTL(InterCgraPktType) + s.send_to_sram_load_request_from_noc = SendIfcRTL(InterCgraPktType) s.send_to_tile_load_response = SendIfcRTL(InterCgraPktType) - s.send_to_mem_store_request = SendIfcRTL(InterCgraPktType) + s.send_to_sram_store_request_from_noc = SendIfcRTL(InterCgraPktType) + + # Controller-owned command path from CPU packets to the DMA engine. + # Send the decoded DMA command to the DMA engine. + s.dma_cmd = SendIfcRTL(DmaCmdType) + # Receive the DMA done signal from the DMA engine. + s.dma_done = RecvIfcRTL(DmaDoneType) + + # ------------------------------------------------------- + # SPM (SRAM) access path from the DMA engine. 
+ # The DMA and the inter-tile NoC (above) each have their own + # dedicated SPM access interfaces to the data memory controller. + # They are kept separate because the DMA can perform burst data + # movement. + # ------------------------------------------------------- + # Receive the request of writing into SPM from the DMA. + s.recv_from_dma_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType) + # Receive the request of reading from SPM from the DMA. + s.recv_from_dma_spm_rd_req = RecvIfcRTL(DmaSpmReadReqType) + # Send the response of reading from SPM to the DMA. + s.send_to_dma_spm_rd_resp = SendIfcRTL(DmaSpmReadRespType) + + # SRAM data memory side of the SPM access path (DMA). + # Send the request of writing into SPM to the data_mem controller. + s.send_to_sram_store_request_from_dma = SendIfcRTL(DmaSpmWriteReqType) + # Send the request of reading from SPM to the data_mem controller. + s.send_to_sram_load_request_from_dma = SendIfcRTL(DmaSpmReadReqType) + # Receive the response of reading from SPM from the data_mem controller. + s.recv_from_sram_load_response = RecvIfcRTL(DmaSpmReadRespType) + # Component s.recv_from_tile_load_request_pkt_queue = ChannelRTL(InterCgraPktType, latency = 1) @@ -123,6 +172,12 @@ def construct(s, s.addr_dst_id = Wire(CgraIdType) + s.dma_dram_addr_lo = Wire(DmaDramAddrPartType) + s.dma_dram_addr_hi = Wire(DmaDramAddrPartType) + s.dma_spm_addr = Wire(DmaSpmAddrType) + s.dma_bytes = Wire(DmaBytesType) + s.dma_tag = Wire(DmaTagType) + # Connections. # Requests towards others, 1 cycle delay to improve timing. s.recv_from_tile_load_request_pkt_queue.recv //= s.recv_from_tile_load_request_pkt @@ -130,14 +185,62 @@ def construct(s, s.recv_from_tile_store_request_pkt_queue.recv //= s.recv_from_tile_store_request_pkt # Requests towards local from others, 1 cycle delay to improve timing. 
# Sequential block holding the DMA command fields (DRAM address halves, SPM
# address, byte count, tag) that the CPU configures one packet at a time.
@update_ff
def update_dma_cmd_regs():
  if s.reset:
    # All command registers are cleared on reset.
    s.dma_dram_addr_lo <<= DmaDramAddrPartType(0)
    s.dma_dram_addr_hi <<= DmaDramAddrPartType(0)
    s.dma_spm_addr <<= DmaSpmAddrType(0)
    s.dma_bytes <<= DmaBytesType(0)
    s.dma_tag <<= DmaTagType(0)
  elif has_dma_ports:
    cpu_payload = s.recv_from_cpu_pkt_queue.send.msg.payload
    cpu_cmd = cpu_payload.cmd
    cpu_data = cpu_payload.data.payload
    # A register is only updated in the cycle the CPU packet is actually
    # handshaked (val & rdy) out of the CPU packet queue.
    if s.recv_from_cpu_pkt_queue.send.val & s.recv_from_cpu_pkt_queue.send.rdy:
      if cpu_cmd == CMD_DMA_CONFIG_DRAM_ADDR_LO:
        s.dma_dram_addr_lo <<= DmaDramAddrPartType(cpu_data)
      elif cpu_cmd == CMD_DMA_CONFIG_DRAM_ADDR_HI:
        s.dma_dram_addr_hi <<= DmaDramAddrPartType(cpu_data)
      elif cpu_cmd == CMD_DMA_CONFIG_SPM_ADDR:
        # Unlike the other fields, the SPM address comes from the payload's
        # data_addr field (zero-extended), not from the data payload.
        s.dma_spm_addr <<= zext(cpu_payload.data_addr, DmaSpmAddrType)
      elif cpu_cmd == CMD_DMA_CONFIG_BYTES:
        s.dma_bytes <<= DmaBytesType(cpu_data)
      elif cpu_cmd == CMD_DMA_CONFIG_TAG:
        # The tag keeps only the low bits of the data payload.
        s.dma_tag <<= trunc(cpu_data, DmaTagType)

# Combinational pass-through between the DMA-facing SPM interfaces and the
# SRAM-facing ones. Without DMA ports every involved signal is tied off.
@update
def update_dma_spm_forwarding():
  if has_dma_ports:
    # DMA write request -> SRAM store request.
    s.send_to_sram_store_request_from_dma.val @= s.recv_from_dma_spm_wr_req.val
    s.recv_from_dma_spm_wr_req.rdy @= s.send_to_sram_store_request_from_dma.rdy
    s.send_to_sram_store_request_from_dma.msg @= s.recv_from_dma_spm_wr_req.msg

    # DMA read request -> SRAM load request.
    s.send_to_sram_load_request_from_dma.val @= s.recv_from_dma_spm_rd_req.val
    s.recv_from_dma_spm_rd_req.rdy @= s.send_to_sram_load_request_from_dma.rdy
    s.send_to_sram_load_request_from_dma.msg @= s.recv_from_dma_spm_rd_req.msg
    # SRAM load response -> DMA read response.
    s.send_to_dma_spm_rd_resp.val @= s.recv_from_sram_load_response.val
    s.recv_from_sram_load_response.rdy @= s.send_to_dma_spm_rd_resp.rdy
    s.send_to_dma_spm_rd_resp.msg @= s.recv_from_sram_load_response.msg
  else:
    # No DMA: never assert valid towards the SRAM or the DMA, and never
    # accept anything on the receive interfaces.
    s.send_to_sram_store_request_from_dma.val @= 0
    s.send_to_sram_store_request_from_dma.msg @= DmaSpmWriteReqType()
    s.send_to_sram_load_request_from_dma.val @= 0
    s.send_to_sram_load_request_from_dma.msg @= DmaSpmReadReqType()
    s.recv_from_sram_load_response.rdy @= 0
    s.recv_from_dma_spm_wr_req.rdy @= 0
    s.recv_from_dma_spm_rd_req.rdy @= 0
    s.send_to_dma_spm_rd_resp.val @= 0
    s.send_to_dma_spm_rd_resp.msg @= DmaSpmReadRespType()
- s.crossbar.recv[kFromCpuCtrlAndDataIdx].val @= \ - s.recv_from_cpu_pkt_queue.send.val - s.recv_from_cpu_pkt_queue.send.rdy @= s.crossbar.recv[kFromCpuCtrlAndDataIdx].rdy - s.crossbar.recv[kFromCpuCtrlAndDataIdx].msg @= \ - ControllerXbarPktType(0, # dst (always 0 to align with the single outport of the crossbar, i.e., NoC) - InterCgraPktType(s.cgra_id, # src - s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id, # dst - 0, # src_x - 0, # src_y - s.idTo2d_x_lut[s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id], # dst_x - s.idTo2d_y_lut[s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id], # dst_y - num_tiles, # src_tile_id, num_tiles is used to indicate the request is from CPU, so the LOAD response can come back. - s.recv_from_cpu_pkt_queue.send.msg.dst, # dst_tile_id - 0, # remote_src_port, only used for inter-cgra remote load request/response. - 0, # opaque - 0, # vc_id - s.recv_from_cpu_pkt_queue.send.msg.payload)) + cpu_payload = s.recv_from_cpu_pkt_queue.send.msg.payload + cpu_cmd = cpu_payload.cmd + + if has_dma_ports & ( + (cpu_cmd == CMD_DMA_CONFIG_DRAM_ADDR_LO) | + (cpu_cmd == CMD_DMA_CONFIG_DRAM_ADDR_HI) | + (cpu_cmd == CMD_DMA_CONFIG_SPM_ADDR) | + (cpu_cmd == CMD_DMA_CONFIG_BYTES) | + (cpu_cmd == CMD_DMA_CONFIG_TAG)): + s.recv_from_cpu_pkt_queue.send.rdy @= 1 + + elif has_dma_ports & ( + (cpu_cmd == CMD_DMA_MVIN) | + (cpu_cmd == CMD_DMA_MVOUT)): + s.dma_cmd.val @= s.recv_from_cpu_pkt_queue.send.val + if cpu_cmd == CMD_DMA_MVIN: + s.dma_cmd.msg @= DmaCmdType( + DmaOpcodeType(DMA_MVIN), + concat(s.dma_dram_addr_hi, s.dma_dram_addr_lo), + s.dma_spm_addr, + s.dma_bytes, + s.dma_tag) + else: + s.dma_cmd.msg @= DmaCmdType( + DmaOpcodeType(DMA_MVOUT), + concat(s.dma_dram_addr_hi, s.dma_dram_addr_lo), + s.dma_spm_addr, + s.dma_bytes, + s.dma_tag) + s.recv_from_cpu_pkt_queue.send.rdy @= s.dma_cmd.rdy + + else: + # For the ctrl and data preloading. 
+ s.crossbar.recv[kFromCpuCtrlAndDataIdx].val @= \ + s.recv_from_cpu_pkt_queue.send.val + s.recv_from_cpu_pkt_queue.send.rdy @= s.crossbar.recv[kFromCpuCtrlAndDataIdx].rdy + s.crossbar.recv[kFromCpuCtrlAndDataIdx].msg @= \ + ControllerXbarPktType(0, # dst (always 0 to align with the single outport of the crossbar, i.e., NoC) + InterCgraPktType(s.cgra_id, # src + s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id, # dst + 0, # src_x + 0, # src_y + s.idTo2d_x_lut[s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id], # dst_x + s.idTo2d_y_lut[s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id], # dst_y + num_tiles, # src_tile_id, num_tiles is used to indicate the request is from CPU, so the LOAD response can come back. + s.recv_from_cpu_pkt_queue.send.msg.dst, # dst_tile_id + 0, # remote_src_port, only used for inter-cgra remote load request/response. + 0, # opaque + 0, # vc_id + s.recv_from_cpu_pkt_queue.send.msg.payload)) # TODO: For the other cmd types. @@ -358,6 +502,30 @@ def update_received_msg(): # # TODO: Handle other cmd types. # assert(False) + # WARNING + # A possible conflict occurs when dma_done.val is True and the received message is CMD_COMPLETE at the same time, + # that is, when a DMA command and CMD_COMPLETE appear in the same clock cycle. + # In this case, both require the CGRA to send a return signal to the CPU, which may cause a conflict.
+ # Related discussion: https://github.com/tancheng/VectorCGRA/pull/293#discussion_r3418482217 + if has_dma_ports & s.dma_done.val: + s.dma_done.rdy @= s.send_to_cpu_pkt_queue.recv.rdy + s.send_to_cpu_pkt_queue.recv.val @= 1 + s.send_to_cpu_pkt_queue.recv.msg @= \ + IntraCgraPktType(num_tiles, # src_tile_id: controller/DMA sideband source + num_tiles, # dst_tile_id: CPU-facing controller endpoint + s.cgra_id, + s.cgra_id, + s.idTo2d_x_lut[s.cgra_id], + s.idTo2d_y_lut[s.cgra_id], + s.idTo2d_x_lut[s.cgra_id], + s.idTo2d_y_lut[s.cgra_id], + s.dma_done.msg.dma_tag, + 0, + CgraPayloadType( + CMD_DMA_DONE, + DataType(zext(s.dma_done.msg.dma_tag, DataPayloadType), 1, 0, 0), + 0, 0, 0)) + @update def update_sending_to_noc_msg(): s.send_to_inter_cgra_noc.val @= s.crossbar.send[0].val @@ -383,8 +551,8 @@ def line_trace(s): recv_from_tile_load_response_pkt_str = "recv_from_tile_load_response_pkt: " + str(s.recv_from_tile_load_response_pkt.msg) recv_from_tile_store_request_pkt_str = "recv_from_tile_store_request_pkt: " + str(s.recv_from_tile_store_request_pkt.msg) crossbar_str = "crossbar: {" + s.crossbar.line_trace() + "}" - send_to_mem_load_request_str = "send_to_mem_load_request: " + str(s.send_to_mem_load_request.msg) - send_to_mem_store_request_str = "send_to_mem_store_request: " + str(s.send_to_mem_store_request.msg) + send_to_mem_load_request_str = "send_to_sram_load_request_from_noc: " + str(s.send_to_sram_load_request_from_noc.msg) + send_to_mem_store_request_str = "send_to_sram_store_request_from_noc: " + str(s.send_to_sram_store_request_from_noc.msg) recv_from_noc_str ="recv_from_noc_pkt.val: " + str(s.recv_from_inter_cgra_noc.val) + " recv_from_noc_pkt.msg: " + str(s.recv_from_inter_cgra_noc.msg) + " recv_from_noc_pkt.rdy: " + str(s.recv_from_inter_cgra_noc.rdy) send_to_noc_str = "send_to_noc_pkt: " + str(s.send_to_inter_cgra_noc.msg) + "; rdy: " + str(s.send_to_inter_cgra_noc.rdy) + "; val: " + str(s.send_to_inter_cgra_noc.val) return f'{recv_from_cpu_pkt_str} || 
{recv_from_cpu_pkt_queue_str} || {crossbar_recv_str} || {send_to_ctrl_ring_pkt_str} || {recv_from_tile_load_request_pkt_str} || {recv_from_tile_load_response_pkt_str} || {recv_from_tile_store_request_pkt_str} || {crossbar_str} || {send_to_mem_load_request_str} || {send_to_mem_store_request_str} || {recv_from_noc_str} || {send_to_noc_str}\n' diff --git a/controller/test/ControllerRTL_test.py b/controller/test/ControllerRTL_test.py index 42d4eda9..1ad2500f 100644 --- a/controller/test/ControllerRTL_test.py +++ b/controller/test/ControllerRTL_test.py @@ -78,9 +78,9 @@ def construct(s, s.src_from_tile_load_response_pkt.send //= s.dut.recv_from_tile_load_response_pkt s.src_from_tile_store_request_pkt.send //= s.dut.recv_from_tile_store_request_pkt - s.dut.send_to_mem_store_request //= s.sink_to_mem_store_request.recv + s.dut.send_to_sram_store_request_from_noc //= s.sink_to_mem_store_request.recv s.dut.send_to_tile_load_response //= s.sink_to_mem_load_response.recv - s.dut.send_to_mem_load_request //= s.sink_to_mem_load_request.recv + s.dut.send_to_sram_load_request_from_noc //= s.sink_to_mem_load_request.recv s.src_from_noc.send //= s.dut.recv_from_inter_cgra_noc s.dut.send_to_inter_cgra_noc //= s.sink_to_noc.recv diff --git a/fu/single/ExtractPredicateRTL.py b/fu/single/ExtractPredicateRTL.py index 460e598e..15e5c562 100644 --- a/fu/single/ExtractPredicateRTL.py +++ b/fu/single/ExtractPredicateRTL.py @@ -15,7 +15,7 @@ from pymtl3 import * from ..basic.Fu import Fu from ...lib.opt_type import * - +from ...lib.util.data_struct_attr import * class ExtractPredicateRTL(Fu): def construct(s, CtrlPktType, num_inports, num_outports, vector_factor_power = 0): @@ -60,7 +60,7 @@ def comb_logic(): # When loop is running (predicate=1) -> payload=1 # When loop terminates (predicate=0) -> payload=0 # Downstream NOT will invert: running->0 (no RET), done->1 (trigger RET) - s.send_out[0].msg.payload @= zext(s.recv_in[s.in0_idx].msg.predicate, s.DataType.get_field_type('payload')) + 
s.send_out[0].msg.payload @= zext(s.recv_in[s.in0_idx].msg.predicate, s.DataType.get_field_type(kAttrPayload)) s.send_out[0].msg.predicate @= 1 s.send_out[0].val @= s.recv_in[s.in0_idx].val diff --git a/fu/single/LoopControlRTL.py b/fu/single/LoopControlRTL.py index 5ac13f70..fbcf362e 100644 --- a/fu/single/LoopControlRTL.py +++ b/fu/single/LoopControlRTL.py @@ -15,7 +15,7 @@ from pymtl3 import * from ..basic.Fu import Fu from ...lib.opt_type import OPT_LOOP_CONTROL, OPT_SYMBOL_DICT - +from ...lib.util.data_struct_attr import * class LoopControlRTL(Fu): def construct(s, CtrlPktType, num_inports, num_outports, vector_factor_power = 0): @@ -34,8 +34,8 @@ def construct(s, CtrlPktType, num_inports, num_outports, vector_factor_power = 0 super(LoopControlRTL, s).construct(CtrlPktType, num_inports, num_outports, 1, vector_factor_power) - PayloadType = s.DataType.get_field_type('payload') - PredicateType = s.DataType.get_field_type('predicate') + PayloadType = s.DataType.get_field_type(kAttrPayload) + PredicateType = s.DataType.get_field_type(kAttrPredicate) FuInType = mk_bits(clog2(num_inports + 1)) # Internal state for loop control diff --git a/lib/cmd_type.py b/lib/cmd_type.py index a24078d7..13590ca3 100644 --- a/lib/cmd_type.py +++ b/lib/cmd_type.py @@ -14,7 +14,7 @@ # Total number of commands that are supported/recognized by controller. # Needs to be updated once more commands are added/supported. -NUM_CMDS = 44 +NUM_CMDS = 52 CMD_LAUNCH = 0 CMD_PAUSE = 1 @@ -69,6 +69,17 @@ # GEP FU Configuration Commands. CMD_CONFIG_GEP_STRIDE = 43 # Controller -> GEP FU: Configures stride for 2D GEP +# DMA commands. The CPU configures the controller-side command registers +# before issuing CMD_DMA_MVIN/CMD_DMA_MVOUT. 
+CMD_DMA_CONFIG_DRAM_ADDR_LO = 44 # Configures lower 32 bits of DRAM address +CMD_DMA_CONFIG_DRAM_ADDR_HI = 45 # Configures higher 32 bits of DRAM address +CMD_DMA_CONFIG_SPM_ADDR = 46 # Configures SPM address +CMD_DMA_CONFIG_BYTES = 47 # Configures number of bytes to transfer +CMD_DMA_CONFIG_TAG = 48 # Configures tag of the DMA command +CMD_DMA_MVIN = 49 # Issues a DMA_MVIN command +CMD_DMA_MVOUT = 50 # Issues a DMA_MVOUT command +CMD_DMA_DONE = 51 # Signals that the DMA command is complete + CMD_SYMBOL_DICT = { CMD_LAUNCH: "(LAUNCH_KERNEL)", CMD_PAUSE: "(PAUSE_EXECUTION)", @@ -114,5 +125,13 @@ CMD_LC_CHILD_RESET: "(LC_CHILD_RESET)", CMD_LC_ALL_COMPLETE: "(LC_ALL_COMPLETE)", CMD_CONFIG_GEP_STRIDE: "(CONFIG_GEP_STRIDE)", + CMD_DMA_CONFIG_DRAM_ADDR_LO: "(DMA_CONFIG_DRAM_ADDR_LO)", + CMD_DMA_CONFIG_DRAM_ADDR_HI: "(DMA_CONFIG_DRAM_ADDR_HI)", + CMD_DMA_CONFIG_SPM_ADDR: "(DMA_CONFIG_SPM_ADDR)", + CMD_DMA_CONFIG_BYTES: "(DMA_CONFIG_BYTES)", + CMD_DMA_CONFIG_TAG: "(DMA_CONFIG_TAG)", + CMD_DMA_MVIN: "(DMA_MVIN)", + CMD_DMA_MVOUT: "(DMA_MVOUT)", + CMD_DMA_DONE: "(DMA_DONE)", } diff --git a/lib/messages.py b/lib/messages.py index 49182f98..92748885 100644 --- a/lib/messages.py +++ b/lib/messages.py @@ -160,7 +160,7 @@ def str_func(s): field_dict[kAttrVectorFactorPower] = VectorFactorPowerType - field_dict[kAttrIsLastCtrl] = b1 + field_dict[kAttrIsLastCtrl] = mk_bits(1) # Register file related signals. 
#=========================================================================
# DMA messages
#=========================================================================

def mk_dma_cmd(dram_addr_nbits = 64,
               spm_addr_nbits  = 32,
               bytes_nbits     = 32,
               tag_nbits       = 8,
               prefix          = "DmaCmd"):
  """Build the bitstruct type of a DMA command.

  Fields: opcode (3 bits), dram_addr, spm_addr, nbytes, dma_tag. All
  configurable widths are encoded in the generated type name.
  """

  OpcodeType   = mk_bits(3)
  DramAddrType = mk_bits(dram_addr_nbits)
  SpmAddrType  = mk_bits(spm_addr_nbits)
  BytesType    = mk_bits(bytes_nbits)
  TagType      = mk_bits(tag_nbits)

  new_name = f"{prefix}_{dram_addr_nbits}_{spm_addr_nbits}_{bytes_nbits}_{tag_nbits}"

  def str_func(s):
    return f"dma_cmd(op={s.opcode},dram={s.dram_addr},spm={s.spm_addr},bytes={s.nbytes},tag={s.dma_tag})"

  return mk_bitstruct(new_name, {
      'opcode'   : OpcodeType,
      'dram_addr': DramAddrType,
      'spm_addr' : SpmAddrType,
      # NOTE: nbytes is the number of bytes to transfer.
      # Currently, only values that are multiples of 4 are supported.
      'nbytes'   : BytesType,
      # The dma_tag is not used yet. It may later be used to distinguish
      # different DMA commands.
      'dma_tag'  : TagType,
    },
    namespace = {'__str__': str_func}
  )

# A data structure to represent the data to be transferred by DMA.
#
# === Mask Design ===
# The data transfer granularity between DRAM and SPM is 1 word (4 bytes).
# The `dram_mask` and `spm_mask` fields define the bitwidth of the byte
# masks for DRAM and SPM data respectively.
#
# Actual mask *values* are generated independently by the DMA engine
# FSM (see DmaEngineRTL), NOT carried in this struct:
#
#   - dram_mask (16-bit, one bit per byte of a 128-bit (16-byte) DRAM beat):
#     Dynamically computed during MVOUT (SPM -> DRAM) based on the
#     number of valid words in the last beat. Values range from 0x000f
#     (1 word) to 0xffff (full beat). For example:
#       1 word  moved from SPM to DRAM -> mask 0x000f
#       2 words moved from SPM to DRAM -> mask 0x00ff
#       3 words moved from SPM to DRAM -> mask 0x0fff
#       4 words moved from SPM to DRAM -> mask 0xffff
#
#   - spm_mask (4-bit, one bit per byte of a 32-bit SPM word):
#     SPM writes always write full words, so the mask is hardcoded to
#     0xf. This field is reserved for future byte-granular SPM write
#     support.
def mk_dma_data(dram_data_nbits = 128,
                dram_mask_nbits = 16,
                spm_data_nbits  = 32,
                spm_mask_nbits  = 4,
                prefix          = "DmaData"):
  """Build the bitstruct type describing DMA data and mask widths.

  Fields: dram_data, dram_mask, spm_data, spm_mask.
  """
  DramDataType = mk_bits(dram_data_nbits)
  DramMaskType = mk_bits(dram_mask_nbits)
  SpmDataType  = mk_bits(spm_data_nbits)
  SpmMaskType  = mk_bits(spm_mask_nbits)
  # Every width that shapes the struct must be part of the type name.
  # Otherwise two structs differing only in spm_mask_nbits would share one
  # name while having different layouts.
  new_name = f"{prefix}_{dram_data_nbits}_{dram_mask_nbits}_{spm_data_nbits}_{spm_mask_nbits}"

  def str_func(s):
    return f"dma_data(dram_data={s.dram_data},dram_mask={s.dram_mask},spm_data={s.spm_data})"

  return mk_bitstruct(new_name, {
      'dram_data': DramDataType,
      # Byte mask for the DRAM beat (16 bits for a 16-byte beat by default).
      'dram_mask': DramMaskType,
      'spm_data' : SpmDataType,
      # Byte mask for the SPM word (4 bits for a 4-byte word by default).
      # Always 0xf in the current implementation (full-word writes only).
      'spm_mask' : SpmMaskType,
    },
    namespace = {'__str__': str_func}
  )

def mk_dma_done(tag_nbits = 8,
                prefix    = "DmaDone"):
  """Build the bitstruct type of a DMA completion message (dma_tag only)."""

  TagType = mk_bits(tag_nbits)

  new_name = f"{prefix}_{tag_nbits}"

  def str_func(s):
    return f"dma_done(dma_tag={s.dma_tag})"

  return mk_bitstruct(new_name, {
      'dma_tag': TagType,
    },
    namespace = {'__str__': str_func}
  )

#=========================================================================
# The type of write request signal from DMA to DRAM
#=========================================================================
def mk_dma_dram_wr_req(addr_nbits = 64,
                       data_nbits = 128,
                       mask_nbits = 16,
                       prefix     = "DmaDramWrReq"):
  """Build the bitstruct type of a DRAM write request (addr, data, mask)."""

  AddrType = mk_bits(addr_nbits)
  DataType = mk_bits(data_nbits)
  MaskType = mk_bits(mask_nbits)

  new_name = f"{prefix}_{addr_nbits}_{data_nbits}_{mask_nbits}"

  def str_func(s):
    return f"dma_dram_wr(addr={s.addr},data={s.data},mask={s.mask})"

  return mk_bitstruct(new_name, {
      'addr': AddrType,
      'data': DataType,
      'mask': MaskType,
    },
    namespace = {'__str__': str_func}
  )

# The type of write request signal from DMA to SPM
def mk_dma_spm_write_req(addr_nbits = 32,
                         data_nbits = 32,
                         prefix     = "DmaSpmWriteReq"):
  """Build the bitstruct type of an SPM write request (addr, data, mask).

  The mask width is derived from the data width: one bit per 8 data bits,
  with a minimum of 1 bit. It is therefore fully determined by the widths
  already present in the type name.
  """

  AddrType = mk_bits(addr_nbits)
  DataType = mk_bits(data_nbits)
  MaskType = mk_bits(max(1, data_nbits // 8))

  new_name = f"{prefix}_{addr_nbits}_{data_nbits}"

  def str_func(s):
    return f"dma_spm_wr(addr={s.addr},data={s.data},mask={s.mask})"

  return mk_bitstruct(new_name, {
      'addr': AddrType,
      'data': DataType,
      'mask': MaskType,
    },
    namespace = {'__str__': str_func}
  )

# The type of read request signal from DMA to SPM
def mk_dma_spm_read_req(addr_nbits = 32,
                        prefix     = "DmaSpmReadReq"):
  """Build the bitstruct type of an SPM read request (addr only)."""

  AddrType = mk_bits(addr_nbits)

  new_name = f"{prefix}_{addr_nbits}"

  def str_func(s):
    return f"dma_spm_rd(addr={s.addr})"

  return mk_bitstruct(new_name, {
      'addr': AddrType,
    },
    namespace = {'__str__': str_func}
  )
# The type of read response signal from SPM to DMA
def mk_dma_spm_read_resp(data_nbits = 32,
                         prefix = "DmaSpmReadResp"):
  """Build the bitstruct type of an SPM read response (data only).

  The generated type is named `<prefix>_<data_nbits>`.
  """

  def _format_resp(s):
    return f"dma_spm_rd_resp(data={s.data})"

  # Single-field struct: the word read back from the SPM.
  resp_fields = {'data': mk_bits(data_nbits)}

  return mk_bitstruct(f"{prefix}_{data_nbits}",
                      resp_fields,
                      namespace = {'__str__': _format_resp})
"""
local_CI.py runs the CI test commands locally and records their output.

Usage (the default pytest targets are written relative to the `build`
directory, so the script has to be launched from there):
```shell
cd /path/to/VectorCGRA/
mkdir -p build && cd build
python3 ../local_CI.py
```
By default the log is saved to `local_CI.log` in the directory that contains
this script. The process exits with status 1 if any command failed and 0
otherwise.
"""
import os
import subprocess
import sys

# Commands executed by default, in order. Paths are relative to `build/`.
DEFAULT_COMMANDS = [
    ["pytest", "..", "-v", "--tb=short"],
    ["pytest", "../mem/ctrl/test/CtrlMemDynamicRTL_test.py", "-xvs"],
    ["pytest", "../tile/test/TileRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
    ["pytest", "../controller/test/ControllerRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
    ["pytest", "../cgra/test/CgraTemplateRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
    ["pytest", "../cgra/test/CgraRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
    ["pytest", "../noc/PyOCN/pymtl3_net/ringnet/test/RingNetworkRTL_test.py"],
    ["pytest", "../multi_cgra/test/RingMultiCgraRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
    ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_verilog_homo_2x2_4x4", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
    ["pytest", "../mem/const/test/ConstQueueDynamicRTL_test.py", "-xvs"],
    ["pytest", "../mem/data/test/DataMemControllerRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
    ["pytest", "../multi_cgra/test/MeshMultiCgraTemplateRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
    ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_multi_CGRA_fir_scalar_translation", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
    ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_multi_CGRA_fir_vector_global_reduce_translation", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
    ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_multi_CGRA_systolic_2x2_2x2_translation", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"],
]


def run_tests(commands=None, log_file=None):
    """Run each command, mirroring its output to stdout and to a log file.

    Args:
        commands: Iterable of argv lists to execute in order. Defaults to
            `DEFAULT_COMMANDS`.
        log_file: Path of the log file to (over)write. Defaults to
            `local_CI.log` in the directory containing this script.

    Returns:
        The number of commands that failed, i.e. exited with a non-zero
        status or could not be executed at all.
    """
    if commands is None:
        commands = DEFAULT_COMMANDS
    if log_file is None:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        log_file = os.path.join(script_dir, "local_CI.log")

    failures = 0
    with open(log_file, "w", encoding="utf-8") as f:
        for cmd in commands:
            cmd_str = " ".join(cmd)
            header = f"\n{'='*80}\nExecuting: {cmd_str}\n{'='*80}\n"

            print(header)
            f.write(header)
            f.flush()

            try:
                # The context manager closes the pipe and waits for the
                # child even if streaming its output raises.
                with subprocess.Popen(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    text=True,
                    bufsize=1,
                ) as process:
                    for line in process.stdout:
                        print(line, end="")
                        f.write(line)
                returncode = process.returncode
            except Exception as e:
                # Best-effort runner: a command that cannot be launched or
                # streamed is logged and counted, then the run continues.
                failures += 1
                error_msg = f"\nERROR executing {cmd_str}: {str(e)}\n"
                print(error_msg)
                f.write(error_msg)
                continue

            if returncode == 0:
                status = f"\nSUCCESS: {cmd_str}\n"
            else:
                failures += 1
                status = f"\nFAILED (Exit Code {returncode}): {cmd_str}\n"

            print(status)
            f.write(status)

    print(f"\n\nAll tests completed. Log saved to: {os.path.abspath(log_file)}")
    return failures


if __name__ == "__main__":
    # Propagate failures so the script can gate commits or CI steps.
    sys.exit(1 if run_tests() else 0)
+ """ def construct(s, NocPktType, data_mem_size_global, @@ -45,10 +64,14 @@ def construct(s, multi_cgra_columns = 2, num_tiles = 16, mem_access_is_combinational = True, - idTo2d_map = {0: [0, 0]}): + idTo2d_map = {0: [0, 0]}, + has_dma_ports = False, + DmaCmdType = mk_dma_cmd(), + DmaDataType = mk_dma_data()): CgraPayloadType = NocPktType.get_field_type(kAttrPayload) DataType = CgraPayloadType.get_field_type(kAttrData) + PayloadType = DataType.get_field_type(kAttrPayload) # Constants. global_addr_nbits = clog2(data_mem_size_global) per_bank_addr_nbits = clog2(data_mem_size_per_bank) @@ -58,19 +81,32 @@ def construct(s, YType = mk_bits(max(clog2(multi_cgra_rows), 1)) AddrType = mk_bits(global_addr_nbits) PerBankAddrType = mk_bits(per_bank_addr_nbits) + + DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr) + DmaMaskType = DmaDataType.get_field_type(kAttrSpmMask) + DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData) + DmaSpmWriteReqType = mk_dma_spm_write_req(DmaSpmAddrType.nbits, DmaSpmDataType.nbits) + DmaSpmReadReqType = mk_dma_spm_read_req(DmaSpmAddrType.nbits) + DmaSpmReadRespType = mk_dma_spm_read_resp(DmaSpmDataType.nbits) + NocRemoteSrcPortType = NocPktType.get_field_type(kAttrRemoteSrcPort) s.num_banks_per_cgra = num_banks_per_cgra - LocalBankIndexType = mk_bits(clog2(num_banks_per_cgra)) + s.has_dma_ports = has_dma_ports + LocalBankIndexType = mk_bits(max(1, clog2(num_banks_per_cgra))) s.num_rd_tiles = num_rd_tiles s.num_wr_tiles = num_wr_tiles - RdTileIdType = mk_bits(clog2(num_rd_tiles)) + RdTileIdType = mk_bits(max(1, clog2(num_rd_tiles))) # The additional port is for the request from inter-cgra NoC via controller. - num_xbar_in_rd_ports = num_rd_tiles + 1 - num_xbar_in_wr_ports = num_wr_tiles + 1 + # If DMA is enabled, we add one more port for the DMA engine. 
+ dma_port_offset = 1 if has_dma_ports else 0 + num_xbar_in_rd_ports = num_rd_tiles + 1 + dma_port_offset + num_xbar_in_wr_ports = num_wr_tiles + 1 + dma_port_offset num_xbar_out_rd_ports = num_banks_per_cgra + 1 num_xbar_out_wr_ports = num_banks_per_cgra + 1 num_cgras = multi_cgra_rows * multi_cgra_columns XbarOutRdType = mk_bits(clog2(num_xbar_out_rd_ports)) XbarOutWrType = mk_bits(clog2(num_xbar_out_wr_ports)) + XbarInRdType = mk_bits(clog2(num_xbar_in_rd_ports)) + XbarInWrType = mk_bits(clog2(num_xbar_in_wr_ports)) MemReadPktType = \ mk_mem_access_pkt(DataType, num_xbar_in_rd_ports, @@ -120,7 +156,12 @@ def construct(s, s.send_to_noc_load_request_pkt = SendIfcRTL(NocPktType) s.send_to_noc_store_pkt = SendIfcRTL(NocPktType) + s.recv_from_controller_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType) + s.recv_from_controller_spm_rd_req = RecvIfcRTL(DmaSpmReadReqType) + s.send_to_controller_spm_rd_resp = SendIfcRTL(DmaSpmReadRespType) + # Components. + # A list of DataMemWrapperRTL instances. Each one is a single memory bank. s.memory_wrapper = [DataMemWrapperRTL(DataType, MemReadPktType, MemWritePktType, MemResponsePktType, data_mem_size_global, data_mem_size_per_bank, mem_access_is_combinational) for _ in range(num_banks_per_cgra)] @@ -159,10 +200,10 @@ def construct(s, @update def assemble_xbar_pkt(): for i in range(num_xbar_in_rd_ports): - s.rd_pkt[i] @= MemReadPktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, i) + s.rd_pkt[i] @= MemReadPktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) for i in range(num_xbar_in_wr_ports): - s.wr_pkt[i] @= MemWritePktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, i) + s.wr_pkt[i] @= MemWritePktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) for i in range(num_rd_tiles): recv_raddr = s.recv_raddr[i].msg @@ -223,6 +264,41 @@ def assemble_xbar_pkt(): 0, # src_tile num_wr_tiles) # remote_src_port + if has_dma_ports: + + # When `has_dma_ports` is True, num_xbar_in_wr_ports = num_wr_tiles + 1 + 1(dma_port_offset). 
+ # Use dma_wr_idx = num_wr_tiles + 1 = num_xbar_in_wr_ports - 1 + # NOTE Don't use `dma_wr_idx = num_wr_tiles + 1` here since it will cause the bit mismatch error + # between `dma_wr_idx` and `num_xbar_in_wr_ports`. + dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) + dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) + + recv_raddr_from_dma = trunc(s.recv_from_controller_spm_rd_req.msg.addr, AddrType) + if (recv_raddr_from_dma >= s.address_lower) & (recv_raddr_from_dma <= s.address_upper): + bank_index_load_from_dma = trunc((recv_raddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutRdType) + else: + bank_index_load_from_dma = XbarOutRdType(num_banks_per_cgra) + s.rd_pkt[dma_rd_idx] @= MemReadPktType(dma_rd_idx, # src + bank_index_load_from_dma, # dst + recv_raddr_from_dma, # addr + DataType(0, 0, 0, 0), # data + s.cgra_id, # src_cgra + 0, # src_tile + 0) # remote_src_port + + recv_waddr_from_dma = trunc(s.recv_from_controller_spm_wr_req.msg.addr, AddrType) + if (recv_waddr_from_dma >= s.address_lower) & (recv_waddr_from_dma <= s.address_upper): + bank_index_store_from_dma = trunc((recv_waddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutWrType) + else: + bank_index_store_from_dma = XbarOutWrType(num_banks_per_cgra) + s.wr_pkt[dma_wr_idx] @= MemWritePktType(dma_wr_idx, # src + bank_index_store_from_dma, # dst + recv_waddr_from_dma, # addr + DataType(zext(s.recv_from_controller_spm_wr_req.msg.data, PayloadType), 1, 0, 0), + 0, # src_cgra + 0, # src_tile + 0) # remote_src_port + # Connects xbar with the memory wrapper. 
@update def update_all(): @@ -286,6 +362,11 @@ def update_all(): s.write_crossbar.recv[i].val @= 0 s.write_crossbar.recv[i].msg @= MemWritePktType(0, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) + s.recv_from_controller_spm_wr_req.rdy @= 0 + s.recv_from_controller_spm_rd_req.rdy @= 0 + s.send_to_controller_spm_rd_resp.val @= 0 + s.send_to_controller_spm_rd_resp.msg @= DmaSpmReadRespType(DmaSpmDataType(0)) + s.send_to_noc_load_request_pkt.msg @= \ NocPktType(0, # src 0, # dst @@ -310,6 +391,16 @@ def update_all(): s.read_crossbar.recv[num_rd_tiles].val @= s.recv_from_noc_load_request.val s.read_crossbar.recv[num_rd_tiles].msg @= s.rd_pkt[num_rd_tiles] s.recv_from_noc_load_request.rdy @= s.read_crossbar.recv[num_rd_tiles].rdy + + if has_dma_ports: + # When `has_dma_ports` is True, num_xbar_in_rd_ports = num_rd_tiles + 1 + 1(dma_port_offset). + # Use dma_rd_idx = num_rd_tiles + 1 = num_xbar_in_rd_ports - 1 + # NOTE Don't use `dma_rd_idx = num_rd_tiles + 1` here since it will cause the bit mismatch error + # between `dma_rd_idx` and `num_xbar_in_rd_ports`. + dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) + s.read_crossbar.recv[dma_rd_idx].val @= s.recv_from_controller_spm_rd_req.val + s.read_crossbar.recv[dma_rd_idx].msg @= s.rd_pkt[dma_rd_idx] + s.recv_from_controller_spm_rd_req.rdy @= s.read_crossbar.recv[dma_rd_idx].rdy # Connects the store request ports (from tiles and NoC) to the xbar targetting memory and NoC. for i in range(num_wr_tiles): @@ -321,6 +412,16 @@ def update_all(): s.write_crossbar.recv[num_wr_tiles].msg @= s.wr_pkt[num_wr_tiles] s.recv_from_noc_store_request.rdy @= s.write_crossbar.recv[num_wr_tiles].rdy + if has_dma_ports: + # When `has_dma_ports` is True, num_xbar_in_wr_ports = num_wr_tiles + 1 + 1(dma_port_offset). + # Use dma_wr_idx = num_wr_tiles + 1 = num_xbar_in_wr_ports - 1 + # NOTE Don't use `dma_wr_idx = num_wr_tiles + 1` here since it will cause the bit mismatch error + # between `dma_wr_idx` and `num_xbar_in_wr_ports`. 
+ dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) + s.write_crossbar.recv[dma_wr_idx].val @= s.recv_from_controller_spm_wr_req.val + s.write_crossbar.recv[dma_wr_idx].msg @= s.wr_pkt[dma_wr_idx] + s.recv_from_controller_spm_wr_req.rdy @= s.write_crossbar.recv[dma_wr_idx].rdy + # Connects the response ports to tiles and NoC from the xbar. # Number of load responses is expected to be the same as the number of load requests. for i in range(num_xbar_in_rd_ports): @@ -328,7 +429,7 @@ def update_all(): s.send_rdata[RdTileIdType(i)].msg @= s.response_crossbar.send[i].msg.data s.send_rdata[RdTileIdType(i)].val @= s.response_crossbar.send[i].val s.response_crossbar.send[i].rdy @= s.send_rdata[RdTileIdType(i)].rdy - else: + elif i == num_rd_tiles: from_cgra_id = s.response_crossbar.send[i].msg.src_cgra from_tile_id = s.response_crossbar.send[i].msg.src_tile s.send_to_noc_load_response_pkt.msg @= \ @@ -351,6 +452,11 @@ def update_all(): s.send_to_noc_load_response_pkt.val @= s.response_crossbar.send[i].val s.response_crossbar.send[i].rdy @= s.send_to_noc_load_response_pkt.rdy + elif has_dma_ports: + s.send_to_controller_spm_rd_resp.msg @= DmaSpmReadRespType( + trunc(s.response_crossbar.send[i].msg.data.payload, DmaSpmDataType)) + s.send_to_controller_spm_rd_resp.val @= s.response_crossbar.send[i].val + s.response_crossbar.send[i].rdy @= s.send_to_controller_spm_rd_resp.rdy # Handles the request (not response) towards the others via the NoC. The dst would be # updated in the controller. 
@@ -363,7 +469,7 @@ def update_all(): 0, # dst_y 0, # src_tile_id 0, # dst_tile_id - s.read_crossbar.send[num_banks_per_cgra].msg.src, # remote_src_port + trunc(s.read_crossbar.send[num_banks_per_cgra].msg.src, NocRemoteSrcPortType), # remote_src_port 0, # opaque 0, # vc_id CgraPayloadType( @@ -378,7 +484,7 @@ def update_all(): s.response_crossbar.recv[num_banks_per_cgra].val @= s.recv_from_noc_load_response_pkt.val s.response_crossbar.recv[num_banks_per_cgra].msg @= \ MemResponsePktType(num_banks_per_cgra, - s.recv_from_noc_load_response_pkt.msg.remote_src_port, + zext(s.recv_from_noc_load_response_pkt.msg.remote_src_port, XbarInRdType), s.recv_from_noc_load_response_pkt.msg.payload.data_addr, s.recv_from_noc_load_response_pkt.msg.payload.data, s.recv_from_noc_load_response_pkt.msg.src, @@ -399,7 +505,7 @@ def update_all(): 0, # dst_y 0, # src_tile_id 0, # dst_tile_id - s.write_crossbar.send[num_banks_per_cgra].msg.src, # remote_src_port + trunc(s.write_crossbar.send[num_banks_per_cgra].msg.src, NocRemoteSrcPortType), # remote_src_port 0, # opaque 0, # vc_id CgraPayloadType( diff --git a/mem/data/test/DataMemControllerRTL_dma_test.py b/mem/data/test/DataMemControllerRTL_dma_test.py new file mode 100644 index 00000000..cf39a756 --- /dev/null +++ b/mem/data/test/DataMemControllerRTL_dma_test.py @@ -0,0 +1,117 @@ +""" +========================================================================== +DataMemControllerRTL_dma_test.py +========================================================================== +""" + +from pymtl3 import * + +from ..DataMemControllerRTL import DataMemControllerRTL +from ....lib.messages import * +from ....lib.opt_type import * + + +def make_types(data_mem_size_global, ctrl_mem_size, num_tiles, num_rd_tiles): + DataType = mk_data(32, 1) + DataAddrType = mk_bits(clog2(data_mem_size_global)) + CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) + CtrlType = mk_ctrl(4, 2, 4, 4, 16) + CgraPayloadType = mk_cgra_payload(DataType, DataAddrType, CtrlType, 
CtrlAddrType) + NocPktType = mk_inter_cgra_pkt(1, 1, num_tiles, num_rd_tiles, CgraPayloadType) + return DataType, DataAddrType, NocPktType + + +def drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles): + for i in range(num_rd_tiles): + dut.recv_raddr[i].val @= 0 + dut.recv_raddr[i].msg @= DataAddrType(0) + dut.send_rdata[i].rdy @= 1 + + for i in range(num_wr_tiles): + dut.recv_waddr[i].val @= 0 + dut.recv_waddr[i].msg @= DataAddrType(0) + dut.recv_wdata[i].val @= 0 + dut.recv_wdata[i].msg @= DataType(0, 0, 0, 0) + + dut.recv_from_noc_load_request.val @= 0 + dut.recv_from_noc_load_request.msg @= NocPktType() + dut.recv_from_noc_store_request.val @= 0 + dut.recv_from_noc_store_request.msg @= NocPktType() + dut.recv_from_noc_load_response_pkt.val @= 0 + dut.recv_from_noc_load_response_pkt.msg @= NocPktType() + dut.send_to_noc_load_request_pkt.rdy @= 1 + dut.send_to_noc_load_response_pkt.rdy @= 1 + dut.send_to_noc_store_pkt.rdy @= 1 + + DmaSpmAddrType = mk_dma_cmd().get_field_type(kAttrSpmAddr) + dut.recv_from_controller_spm_wr_req.val @= 0 + dut.recv_from_controller_spm_wr_req.msg.addr @= DmaSpmAddrType(0) + dut.recv_from_controller_spm_wr_req.msg.data @= 0 + dut.recv_from_controller_spm_wr_req.msg.mask @= 0 + dut.recv_from_controller_spm_rd_req.val @= 0 + dut.recv_from_controller_spm_rd_req.msg.addr @= DmaSpmAddrType(0) + dut.send_to_controller_spm_rd_resp.rdy @= 1 + + dut.cgra_id @= 0 + dut.address_lower @= DataAddrType(0) + dut.address_upper @= DataAddrType(15) + + +def test_dma_ports_write_then_read(): + """ + Verifies that the DataMemController correctly handles requests from the + DMA ports. It performs a DMA write to a specific address and then a + DMA read from the same address to verify the data. 
+ """ + data_mem_size_global = 64 + data_mem_size_per_bank = 16 + num_banks = 4 + num_rd_tiles = 2 + num_wr_tiles = 2 + num_tiles = 4 + ctrl_mem_size = 16 + + DataType, DataAddrType, NocPktType = make_types( + data_mem_size_global, ctrl_mem_size, num_tiles, num_rd_tiles) + + dut = DataMemControllerRTL(NocPktType, + data_mem_size_global, + data_mem_size_per_bank, + num_banks, + num_rd_tiles, + num_wr_tiles, + 1, + 1, + num_tiles, + True, + {0: [0, 0]}, + has_dma_ports = True) + dut.apply(DefaultPassGroup()) + dut.sim_reset() + drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles) + + DmaSpmAddrType = mk_dma_cmd().get_field_type(kAttrSpmAddr) + dut.recv_from_controller_spm_wr_req.val @= 1 + dut.recv_from_controller_spm_wr_req.msg.addr @= DmaSpmAddrType(3) + dut.recv_from_controller_spm_wr_req.msg.data @= 0xaaaabbbb + dut.recv_from_controller_spm_wr_req.msg.mask @= 0xf + dut.sim_eval_combinational() + assert dut.recv_from_controller_spm_wr_req.rdy + dut.sim_tick() + dut.recv_from_controller_spm_wr_req.val @= 0 + + dut.recv_from_controller_spm_rd_req.val @= 1 + dut.recv_from_controller_spm_rd_req.msg.addr @= DmaSpmAddrType(3) + + seen_response = False + for _ in range(10): + dut.sim_eval_combinational() + if dut.recv_from_controller_spm_rd_req.val & dut.recv_from_controller_spm_rd_req.rdy: + dut.recv_from_controller_spm_rd_req.val @= 0 + if dut.send_to_controller_spm_rd_resp.val: + assert int(dut.send_to_controller_spm_rd_resp.msg.data) == 0xaaaabbbb + seen_response = True + break + dut.sim_tick() + + assert seen_response diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py new file mode 100644 index 00000000..a7fff5d8 --- /dev/null +++ b/mem/dma/DmaEngineRTL.py @@ -0,0 +1,307 @@ +""" +========================================================================== +DmaEngineRTL.py +========================================================================== + +Simple DMA engine for moving opaque words between an abstract external 
+memory interface and the CGRA dataSPM. +""" + +from pymtl3 import * +from ...lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL +from ...lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL +from ...lib.messages import * +from ...lib.util.common import DMA_MVIN, DMA_MVOUT, CHAR_BIT, StateType, STATE_DMA_IDLE, STATE_DMA_MVIN_REQ, STATE_DMA_MVIN_RESP, STATE_DMA_MVIN_WRITE, STATE_DMA_MVOUT_READ, STATE_DMA_MVOUT_RESP, STATE_DMA_MVOUT_WRITE, STATE_DMA_MVOUT_WAIT, STATE_DMA_DONE + + +class DmaEngineRTL( Component ): + """ + The DmaEngineRTL module is responsible for bulk data movement between an + external DRAM-like memory and the on-chip Scratchpad Memory (dataSPM). + + It supports two main operations: + - DMA_MVIN: DRAM -> DMA Engine -> SPM + - DMA_MVOUT: SPM -> DMA Engine -> DRAM + + Architectural Design: + - 1 word = 4 bytes = 32 bits in this system. + - DRAM is byte-addressed which means each unique address points to a byte(8 bits). + - SPM is word-addressed which means each unique address points to a word(32 bits). + - The engine uses a 128-bit interface to external memory (4 words per beat) + and a 32-bit interface to the dataSPM (1 word per cycle). + - A finite state machine (FSM) manages the command execution flow, including + requesting memory, waiting for responses, and performing SPM accesses. + - MVIN logic: Requests 128-bit beats from DRAM, then unpacks them into four + sequential 32-bit SPM writes. + - MVOUT logic: Reads four 32-bit words from SPM, packs them into a 128-bit + beat, and issues a single write request to DRAM. 
+ """ + + def construct( s, + spm_data_nbits = 32, # Bitwidth of a single SPM word + dram_data_nbits = 128, # Bitwidth of an external memory beat + dram_addr_nbits = 64, # Bitwidth of DRAM addresses + spm_addr_nbits = 32, # Bitwidth of SPM addresses + bytes_nbits = 32, # Bitwidth for transfer size in bytes + tag_nbits = 8 ): # Bitwidth for command tracking tags + + assert dram_data_nbits == spm_data_nbits * 4 + + OpcodeType = mk_bits( 3 ) + DramAddrType = mk_bits( dram_addr_nbits ) + SpmAddrType = mk_bits( spm_addr_nbits ) + BytesType = mk_bits( bytes_nbits ) + TagType = mk_bits( tag_nbits ) + SpmDataType = mk_bits( spm_data_nbits ) + MemDataType = mk_bits( dram_data_nbits ) + # Byte mask for SPM write + SpmMaskType = mk_bits( spm_data_nbits // CHAR_BIT ) + MemMaskType = mk_bits( dram_data_nbits // CHAR_BIT ) + DmaCmdType = mk_dma_cmd(dram_addr_nbits, spm_addr_nbits, bytes_nbits, tag_nbits) + DmaDoneType = mk_dma_done(tag_nbits) + DmaSpmWriteReqType = mk_dma_spm_write_req(spm_addr_nbits, spm_data_nbits) + DmaSpmReadReqType = mk_dma_spm_read_req(spm_addr_nbits) + DmaSpmReadRespType = mk_dma_spm_read_resp(spm_data_nbits) + DmaDramWrReqType = mk_dma_dram_wr_req(dram_addr_nbits, dram_data_nbits, dram_data_nbits // 8) + + # Command interface + # Receives a DMA command from the controller. + s.dma_cmd = RecvIfcRTL(DmaCmdType) + + # Sends a DMA done signal to the controller. + s.dma_done = SendIfcRTL(DmaDoneType) + + # Abstract external memory interface + # Request to read from DRAM + s.send_to_dram_rd_req = SendIfcRTL( DramAddrType ) + # Response from DRAM + s.recv_from_dram_rd_resp = RecvIfcRTL( MemDataType ) + + # Request to write to DRAM + s.send_to_dram_wr_req = SendIfcRTL(DmaDramWrReqType) + s.recv_from_dram_wr_resp = RecvIfcRTL(mk_bits(1)) + + # Send write request to SPM. + s.send_to_spm_wr_req = SendIfcRTL(DmaSpmWriteReqType) + # Send read request to SPM. + s.send_to_spm_rd_req = SendIfcRTL(DmaSpmReadReqType) + # Receive read response from SPM. 
+ s.recv_from_spm_rd_resp = RecvIfcRTL(DmaSpmReadRespType) + + # State machine definitions + + s.state = Wire( StateType ) + s.state_next = Wire( StateType ) + + # Combinational logic + s.opcode_reg = Wire( OpcodeType ) # Current operation (MVIN/MVOUT) + s.dram_addr_reg = Wire( DramAddrType ) # Current DRAM byte address + s.spm_addr_reg = Wire( SpmAddrType ) # Current SPM word address + s.words_left_reg = Wire( BytesType ) # Number of 32-bit words remaining to transfer + s.tag_reg = Wire( TagType ) # Tag of the active command + s.beat_reg = Wire( MemDataType ) # Buffer for 128-bit DRAM beat + s.word_idx_reg = Wire( Bits2 ) # Index (0-3) of the word within a beat + s.wr_mask_reg = Wire( MemMaskType ) # Byte mask for DRAM write + + # Sequential logic + s.state_ff = Wire( StateType ) + s.opcode_ff = Wire( OpcodeType ) + s.dram_addr_ff = Wire( DramAddrType ) + s.spm_addr_ff = Wire( SpmAddrType ) + s.words_left_ff = Wire( BytesType ) + s.tag_ff = Wire( TagType ) + s.beat_ff = Wire( MemDataType ) + s.word_idx_ff = Wire( Bits2 ) + s.wr_mask_ff = Wire( MemMaskType ) + + # Connections + s.state //= s.state_ff + s.opcode_reg //= s.opcode_ff + s.dram_addr_reg //= s.dram_addr_ff + s.spm_addr_reg //= s.spm_addr_ff + s.words_left_reg //= s.words_left_ff + s.tag_reg //= s.tag_ff + s.beat_reg //= s.beat_ff + s.word_idx_reg //= s.word_idx_ff + s.wr_mask_reg //= s.wr_mask_ff + + # Precompute commonly used values at construct time (not inside any + # @update block) to avoid PyMTL3 AST translation limitations on the + # floor-division operator. + spm_word_nbytes = (spm_data_nbits // CHAR_BIT) + # SPM write mask: always all byte lanes enabled (0xf) because the DMA + # writes full 32-bit words to SPM. Byte-granular SPM writes are not + # needed in the current design. 
+ spm_word_mask = SpmMaskType( (1 << spm_word_nbytes) - 1 ) + dram_beat_nbytes = (dram_data_nbits // CHAR_BIT) + + @update + def comb_outputs(): + s.dma_cmd.rdy @= s.state == STATE_DMA_IDLE + s.dma_done.val @= s.state == STATE_DMA_DONE + s.dma_done.msg @= DmaDoneType(s.tag_reg) + + s.send_to_dram_rd_req.val @= s.state == STATE_DMA_MVIN_REQ + s.send_to_dram_rd_req.msg @= s.dram_addr_reg + s.recv_from_dram_rd_resp.rdy @= s.state == STATE_DMA_MVIN_RESP + + s.send_to_dram_wr_req.val @= s.state == STATE_DMA_MVOUT_WRITE + s.send_to_dram_wr_req.msg.addr @= s.dram_addr_reg + s.send_to_dram_wr_req.msg.data @= s.beat_reg + s.send_to_dram_wr_req.msg.mask @= s.wr_mask_reg + + s.recv_from_dram_wr_resp.rdy @= s.state == STATE_DMA_MVOUT_WAIT + + spm_wdata = SpmDataType(0) + + if s.word_idx_reg == b2( 0 ): # Writes the first word of the beat to SPM + spm_wdata = s.beat_reg[0:spm_data_nbits] + elif s.word_idx_reg == b2( 1 ): # Writes the second word of the beat to SPM + spm_wdata = s.beat_reg[spm_data_nbits:spm_data_nbits*2] + elif s.word_idx_reg == b2( 2 ): # 3rd word + spm_wdata = s.beat_reg[spm_data_nbits*2:spm_data_nbits*3] + else: # 4th word + spm_wdata = s.beat_reg[spm_data_nbits*3:spm_data_nbits*4] + + s.send_to_spm_wr_req.val @= s.state == STATE_DMA_MVIN_WRITE + s.send_to_spm_wr_req.msg @= DmaSpmWriteReqType( + s.spm_addr_reg, + spm_wdata, + spm_word_mask ) + + s.send_to_spm_rd_req.val @= s.state == STATE_DMA_MVOUT_READ + s.send_to_spm_rd_req.msg @= DmaSpmReadReqType(s.spm_addr_reg) + s.recv_from_spm_rd_resp.rdy @= s.state == STATE_DMA_MVOUT_RESP + + @update_ff + def seq_state(): + if s.reset: + s.state_ff <<= STATE_DMA_IDLE + s.opcode_ff <<= OpcodeType( 0 ) + s.dram_addr_ff <<= DramAddrType( 0 ) + s.spm_addr_ff <<= SpmAddrType( 0 ) + s.words_left_ff <<= BytesType( 0 ) + s.tag_ff <<= TagType( 0 ) + s.beat_ff <<= MemDataType( 0 ) + s.word_idx_ff <<= b2( 0 ) + s.wr_mask_ff <<= MemMaskType( 0 ) + else: + if s.state == STATE_DMA_IDLE: + if s.dma_cmd.val & s.dma_cmd.rdy: # 
Receives a new DMA command. + # Note: nbytes is not checked to be a multiple of 4 in this block + # because PyMTL3's AST translator does not support assert + # statements. The issuer of the command must guarantee it. + s.opcode_ff <<= s.dma_cmd.msg.opcode + s.dram_addr_ff <<= s.dma_cmd.msg.dram_addr + s.spm_addr_ff <<= s.dma_cmd.msg.spm_addr + # Converts the transfer size from bytes to words. + # NOTE: Only nbytes that are multiples of 4 are supported for now. + # Otherwise the shift rounds down and the trailing 1-3 bytes are not transferred; nbytes of 1-3 gives 0 words but still starts a transfer. + s.words_left_ff <<= (s.dma_cmd.msg.nbytes >> 2) + s.tag_ff <<= s.dma_cmd.msg.dma_tag + s.beat_ff <<= MemDataType( 0 ) + s.word_idx_ff <<= b2( 0 ) + s.wr_mask_ff <<= MemMaskType( 0 ) + + if s.dma_cmd.msg.nbytes == BytesType( 0 ): # No more bytes to transfer. + s.state_ff <<= STATE_DMA_DONE + # Still has bytes to transfer. + elif s.dma_cmd.msg.opcode == OpcodeType( DMA_MVIN ): + s.state_ff <<= STATE_DMA_MVIN_REQ # Move to the next state: to issue a read request to DRAM. + else: # DMA_MVOUT + s.state_ff <<= STATE_DMA_MVOUT_READ # Move to the next state: to issue a read request to SPM. + + elif s.state == STATE_DMA_MVIN_REQ: # Issues a read request to DRAM. + if s.send_to_dram_rd_req.val & s.send_to_dram_rd_req.rdy: + s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( dram_beat_nbytes ) + s.state_ff <<= STATE_DMA_MVIN_RESP + + elif s.state == STATE_DMA_MVIN_RESP: # Receives a response from DRAM. + if s.recv_from_dram_rd_resp.val & s.recv_from_dram_rd_resp.rdy: + s.beat_ff <<= s.recv_from_dram_rd_resp.msg + s.word_idx_ff <<= b2( 0 ) + s.state_ff <<= STATE_DMA_MVIN_WRITE # Move to the next state: to write to SPM. + + elif s.state == STATE_DMA_MVIN_WRITE: # Writes to SPM. + if s.send_to_spm_wr_req.val & s.send_to_spm_wr_req.rdy: + # Advance the SPM address by one word for the next write. + s.spm_addr_ff <<= s.spm_addr_reg + SpmAddrType( 1 ) + # Update the number of words remaining to write to SPM. 
+ s.words_left_ff <<= s.words_left_reg - BytesType( 1 ) + + if s.words_left_reg == BytesType( 1 ): + s.state_ff <<= STATE_DMA_DONE + elif s.word_idx_reg == b2( 3 ): + s.word_idx_ff <<= b2( 0 ) + s.state_ff <<= STATE_DMA_MVIN_REQ + else: + s.word_idx_ff <<= s.word_idx_reg + b2( 1 ) + + elif s.state == STATE_DMA_MVOUT_READ: + if s.send_to_spm_rd_req.val & s.send_to_spm_rd_req.rdy: + s.state_ff <<= STATE_DMA_MVOUT_RESP # Move to the next state: to receive a response from SPM. + + elif s.state == STATE_DMA_MVOUT_RESP: + if s.recv_from_spm_rd_resp.val & s.recv_from_spm_rd_resp.rdy: + # Pack the SPM response into the 128-bit beat by replacing the word lane selected by word_idx; the other lanes are kept. + if s.word_idx_reg == b2( 0 ): # 1st word + s.beat_ff <<= concat( s.beat_reg[spm_data_nbits : spm_data_nbits<<2], + s.recv_from_spm_rd_resp.msg.data ) + elif s.word_idx_reg == b2( 1 ): + s.beat_ff <<= concat( s.beat_reg[spm_data_nbits<<1 : spm_data_nbits<<2], + s.recv_from_spm_rd_resp.msg.data, + s.beat_reg[0:spm_data_nbits] ) + elif s.word_idx_reg == b2( 2 ): + s.beat_ff <<= concat( s.beat_reg[(spm_data_nbits<<1)+spm_data_nbits : spm_data_nbits<<2], + s.recv_from_spm_rd_resp.msg.data, + s.beat_reg[0:spm_data_nbits<<1] ) + else: + s.beat_ff <<= concat( s.recv_from_spm_rd_resp.msg.data, + s.beat_reg[0 : (spm_data_nbits<<1)+spm_data_nbits] ) + + s.spm_addr_ff <<= s.spm_addr_reg + SpmAddrType( 1 ) + s.words_left_ff <<= s.words_left_reg - BytesType( 1 ) + + if s.words_left_reg == BytesType( 1 ): + # Last beat of MVOUT: compute byte-mask based on how many + # valid 32-bit words are in this final beat. 
+ if s.word_idx_reg == b2( 0 ): + s.wr_mask_ff <<= MemMaskType( 0x000f ) # 1 word (bytes 0-3) + elif s.word_idx_reg == b2( 1 ): + s.wr_mask_ff <<= MemMaskType( 0x00ff ) # 2 words (bytes 0-7) + elif s.word_idx_reg == b2( 2 ): + s.wr_mask_ff <<= MemMaskType( 0x0fff ) # 3 words (bytes 0-11) + else: + s.wr_mask_ff <<= MemMaskType( 0xffff ) # 4 words (bytes 0-15) + s.state_ff <<= STATE_DMA_MVOUT_WRITE + elif s.word_idx_reg == b2( 3 ): + # Full beat (4 words): all 16 bytes are valid. + s.wr_mask_ff <<= MemMaskType( 0xffff ) + s.state_ff <<= STATE_DMA_MVOUT_WRITE + else: + s.word_idx_ff <<= s.word_idx_reg + b2( 1 ) + s.state_ff <<= STATE_DMA_MVOUT_READ + + elif s.state == STATE_DMA_MVOUT_WRITE: + if s.send_to_dram_wr_req.val & s.send_to_dram_wr_req.rdy: + s.state_ff <<= STATE_DMA_MVOUT_WAIT + + elif s.state == STATE_DMA_MVOUT_WAIT: + if s.recv_from_dram_wr_resp.val & s.recv_from_dram_wr_resp.rdy: + # Turn to the +16 address after writing 16 bytes data. + s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( dram_beat_nbytes ) + s.beat_ff <<= MemDataType( 0 ) + s.word_idx_ff <<= b2( 0 ) + s.wr_mask_ff <<= MemMaskType( 0 ) + + if s.words_left_reg == BytesType( 0 ): + s.state_ff <<= STATE_DMA_DONE + else: + s.state_ff <<= STATE_DMA_MVOUT_READ + + elif s.state == STATE_DMA_DONE: + if s.dma_done.val & s.dma_done.rdy: + s.state_ff <<= STATE_DMA_IDLE + + def line_trace( s ): + return f"dma(state={int(s.state)},tag={int(s.tag_reg)},left={int(s.words_left_reg)})" diff --git a/mem/dma/__init__.py b/mem/dma/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/mem/dma/__init__.py @@ -0,0 +1 @@ + diff --git a/mem/dma/test/DmaEngineRTL_test.py b/mem/dma/test/DmaEngineRTL_test.py new file mode 100644 index 00000000..186abb97 --- /dev/null +++ b/mem/dma/test/DmaEngineRTL_test.py @@ -0,0 +1,245 @@ +""" +========================================================================== +DmaEngineRTL_test.py 
+========================================================================== +""" + +from pymtl3 import * + +from ..DmaEngineRTL import DmaEngineRTL, DMA_MVIN, DMA_MVOUT + + +def make_dut(): + dut = DmaEngineRTL() + dut.apply(DefaultPassGroup()) + dut.sim_reset() + + dut.dma_cmd.val @= 0 + dut.dma_cmd.msg.opcode @= 0 + dut.dma_cmd.msg.dram_addr @= 0 + dut.dma_cmd.msg.spm_addr @= 0 + dut.dma_cmd.msg.nbytes @= 0 + dut.dma_cmd.msg.dma_tag @= 0 + dut.dma_done.rdy @= 1 + + dut.send_to_dram_rd_req.rdy @= 1 + dut.recv_from_dram_rd_resp.val @= 0 + dut.recv_from_dram_rd_resp.msg @= 0 + dut.send_to_dram_wr_req.rdy @= 1 + dut.recv_from_dram_wr_resp.val @= 1 + dut.recv_from_dram_wr_resp.msg @= 0 + + dut.send_to_spm_wr_req.rdy @= 1 + dut.send_to_spm_rd_req.rdy @= 1 + dut.recv_from_spm_rd_resp.val @= 0 + dut.recv_from_spm_rd_resp.msg.data @= 0 + dut.sim_eval_combinational() + return dut + + +def issue_cmd(dut, opcode, dram_addr, spm_addr, nbytes, tag): + """ + Issues a DMA command to the DUT. + Args: + dut: The DUT instance. + opcode: The opcode of the DMA command. DMA_MVIN or DMA_MVOUT. + dram_addr: The DRAM address of the DMA command. + spm_addr: The SPM address of the DMA command. + nbytes: The number of bytes to transfer. + tag: The tag of the DMA command. + """ + # NOTE nbytes is the number of bytes to transfer. + # Currently, only nbytes that are multiples of 4 are supported. + assert nbytes % 4 == 0, \ + f"DMA nbytes must be a multiple of 4, got {nbytes}" + dut.dma_cmd.val @= 1 + dut.dma_cmd.msg.opcode @= opcode + dut.dma_cmd.msg.dram_addr @= dram_addr + dut.dma_cmd.msg.spm_addr @= spm_addr + dut.dma_cmd.msg.nbytes @= nbytes + dut.dma_cmd.msg.dma_tag @= tag + dut.sim_eval_combinational() + assert dut.dma_cmd.rdy + dut.sim_tick() + dut.dma_cmd.val @= 0 + + +def test_dma_mvin_one_beat(): + """ + Tests DMA_MVIN operation. + The DRAM contains 2 beats of data, which should be unpacked into 8 + sequential SPM writes. 
+ """ + dut = make_dut() + issue_cmd(dut, DMA_MVIN, + 0x1000, # dram_addr + 4, # spm_addr + 32, # nbytes(number of bytes to transfer) + 0x5a) # tag + + dram = { + 0x1000: concat(Bits32(0x44444444), Bits32(0x33333333), + Bits32(0x22222222), Bits32(0x11111111)), # 4 x 4 bytes = 16 bytes in total. + + # Address bias: +16, since DRAM is byte-addressed(each address points to a byte). + 0x1010: concat(Bits32(0x88888888), Bits32(0x77777777), + Bits32(0x66666666), Bits32(0x55555555)), + } + pending_resp = None + spm_writes = [] + + for _ in range(20): + dut.recv_from_dram_rd_resp.val @= 0 + if pending_resp is not None: + dut.recv_from_dram_rd_resp.val @= 1 + dut.recv_from_dram_rd_resp.msg @= pending_resp + + dut.sim_eval_combinational() + + if dut.send_to_dram_rd_req.val & dut.send_to_dram_rd_req.rdy: + pending_resp = dram[int(dut.send_to_dram_rd_req.msg)] + else: + pending_resp = None + + if dut.send_to_spm_wr_req.val & dut.send_to_spm_wr_req.rdy: + spm_writes.append((int(dut.send_to_spm_wr_req.msg.addr), int(dut.send_to_spm_wr_req.msg.data))) + + if dut.dma_done.val: + assert int(dut.dma_done.msg.dma_tag) == 0x5a + break + + dut.sim_tick() + + for elem in spm_writes: + print(f'{elem[0]}: 0x{elem[1]:08x}') + + assert spm_writes == [ + (4, 0x11111111), + (5, 0x22222222), + (6, 0x33333333), + (7, 0x44444444), + + (8, 0x55555555), + (9, 0x66666666), + (10, 0x77777777), + (11, 0x88888888), + ] + + +def test_dma_mvout_partial_beat(): + """ + Tests a partial beat MVOUT operation (12 bytes / 3 words). + The DMA should read three words from SPM, pack them into a 128-bit beat + with a proper byte mask, and write it to DRAM. 
+ """ + dut = make_dut() + issue_cmd(dut, DMA_MVOUT, + 0x2000, # dram_addr + 8, # spm_addr + 12, # nbytes(number of bytes to transfer) + 0xa5) # tag + + spm = { + 8: 0xaaaabbbb, + 9: 0xccccdddd, + 10: 0xeeeeffff, + } + pending_rresp = None + mem_writes = [] + + for _ in range(30): + dut.recv_from_spm_rd_resp.val @= 0 + if pending_rresp is not None: + dut.recv_from_spm_rd_resp.val @= 1 + dut.recv_from_spm_rd_resp.msg.data @= pending_rresp + + dut.sim_eval_combinational() + + if dut.send_to_spm_rd_req.val & dut.send_to_spm_rd_req.rdy: + pending_rresp = spm[int(dut.send_to_spm_rd_req.msg.addr)] + else: + pending_rresp = None + + if dut.send_to_dram_wr_req.val & dut.send_to_dram_wr_req.rdy: + mem_writes.append((int(dut.send_to_dram_wr_req.msg.addr), + int(dut.send_to_dram_wr_req.msg.data), + int(dut.send_to_dram_wr_req.msg.mask))) + + if dut.dma_done.val: + assert int(dut.dma_done.msg.dma_tag) == 0xa5 + break + + dut.sim_tick() + + assert mem_writes == [ + (0x2000, + int(concat(Bits32(0), Bits32(0xeeeeffff), + Bits32(0xccccdddd), Bits32(0xaaaabbbb))), + 0x0fff), # mask + ] + +def test_dma_mvout_full_beat(): + """ + Tests a full-beat MVOUT operation (32 bytes / 8 words, i.e. two beats). + The DMA should read eight words from SPM, pack them into two 128-bit beats + with a full byte mask (0xffff), and write both beats to DRAM. 
+ """ + dut = make_dut() + issue_cmd(dut, DMA_MVOUT, + 0x2000, # dram_addr + 8, # spm_addr + 32, # nbytes(number of bytes to transfer) + 0xa5) # tag + + spm = { + 8 : 0x11112222, + 9 : 0x33334444, + 10: 0x55556666, + 11: 0x77778888, + 12: 0x9999aaaa, + 13: 0xbbbbcccc, + 14: 0xddddeeee, + 15: 0xffff0000, + } + pending_rresp = None + mem_writes = [] + + for _ in range(30): + dut.recv_from_spm_rd_resp.val @= 0 + if pending_rresp is not None: + dut.recv_from_spm_rd_resp.val @= 1 + dut.recv_from_spm_rd_resp.msg.data @= pending_rresp + + dut.sim_eval_combinational() + + if dut.send_to_spm_rd_req.val & dut.send_to_spm_rd_req.rdy: + pending_rresp = spm[int(dut.send_to_spm_rd_req.msg.addr)] + else: + pending_rresp = None + + if dut.send_to_dram_wr_req.val & dut.send_to_dram_wr_req.rdy: + mem_writes.append((int(dut.send_to_dram_wr_req.msg.addr), + int(dut.send_to_dram_wr_req.msg.data), + int(dut.send_to_dram_wr_req.msg.mask))) + + if dut.dma_done.val: + assert int(dut.dma_done.msg.dma_tag) == 0xa5 + break + + dut.sim_tick() + + for elem in mem_writes: + print(f'{elem[0]}: 0x{elem[1]:08x}') + print(f'mask: 0x{elem[2]:08x}') + + assert mem_writes == [ + (0x2000, + int(concat(Bits32(0x77778888), Bits32(0x55556666), + Bits32(0x33334444), Bits32(0x11112222))), + 0xffff), # mask + + (0x2010, + int(concat(Bits32(0xffff0000), Bits32(0xddddeeee), + Bits32(0xbbbbcccc), Bits32(0x9999aaaa))), + 0xffff), + ] \ No newline at end of file diff --git a/mem/dma/test/__init__.py b/mem/dma/test/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/mem/dma/test/__init__.py @@ -0,0 +1 @@ +