From 46e46a75a348d416e012d3095c2b4aa4ed5e0bd9 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Mon, 1 Jun 2026 22:42:03 +0800 Subject: [PATCH 01/46] Add the DmaEngine implementation and the test. --- mem/dma/DmaEngineRTL.py | 315 ++++++++++++++++++++++++++++++ mem/dma/__init__.py | 1 + mem/dma/test/DmaEngineRTL_test.py | 157 +++++++++++++++ mem/dma/test/__init__.py | 1 + 4 files changed, 474 insertions(+) create mode 100644 mem/dma/DmaEngineRTL.py create mode 100644 mem/dma/__init__.py create mode 100644 mem/dma/test/DmaEngineRTL_test.py create mode 100644 mem/dma/test/__init__.py diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py new file mode 100644 index 00000000..adf767cd --- /dev/null +++ b/mem/dma/DmaEngineRTL.py @@ -0,0 +1,315 @@ +""" +========================================================================== +DmaEngineRTL.py +========================================================================== + +Simple DMA engine for moving opaque words between an abstract external +memory interface and the CGRA dataSPM. +""" + +from pymtl3 import * + +# DMA Move In and Out +# DMA_MVIN : DRAM -> DMA Engine -> SPM +# DMA_MVOUT : SPM -> DMA Engine -> DRAM +DMA_MVIN = 0 +DMA_MVOUT = 1 + + +class DmaEngineRTL( Component ): + """ + The DmaEngineRTL module is responsible for bulk data movement between an + external DRAM-like memory and the on-chip Scratchpad Memory (dataSPM). + + It supports two main operations: + - DMA_MVIN: DRAM -> DMA Engine -> SPM + - DMA_MVOUT: SPM -> DMA Engine -> DRAM + + Architectural Design: + - 1 word = 4 bytes = 32 bits in this system. + - The engine uses a 128-bit interface to external memory (4 words per beat) + and a 32-bit interface to the dataSPM (1 word per cycle). + - A finite state machine (FSM) manages the command execution flow, including + requesting memory, waiting for responses, and performing SPM accesses. + - MVIN logic: Requests 128-bit beats from DRAM, then unpacks them into four + sequential 32-bit SPM writes. 
+ - MVOUT logic: Reads four 32-bit words from SPM, packs them into a 128-bit + beat, and issues a single write request to DRAM. + """ + + def construct( s, + spm_data_nbits = 32, # Bitwidth of a single SPM word + mem_data_nbits = 128, # Bitwidth of an external memory beat + dram_addr_nbits = 64, # Bitwidth of DRAM addresses + spm_addr_nbits = 32, # Bitwidth of SPM addresses + bytes_nbits = 32, # Bitwidth for transfer size in bytes + tag_nbits = 8 ): # Bitwidth for command tracking tags + + assert mem_data_nbits == spm_data_nbits * 4 + + OpcodeType = mk_bits( 3 ) + DramAddrType = mk_bits( dram_addr_nbits ) + SpmAddrType = mk_bits( spm_addr_nbits ) + BytesType = mk_bits( bytes_nbits ) + TagType = mk_bits( tag_nbits ) + SpmDataType = mk_bits( spm_data_nbits ) + MemDataType = mk_bits( mem_data_nbits ) + # Byte mask for SPM write; 1 byte = 8 bits + SpmMaskType = mk_bits( spm_data_nbits // 8 ) + MemMaskType = mk_bits( mem_data_nbits // 8 ) + + # Command interface + s.dma_cmd_val = InPort() + s.dma_cmd_rdy = OutPort() + s.dma_cmd_opcode = InPort( OpcodeType ) + s.dma_cmd_dram_addr = InPort( DramAddrType ) + s.dma_cmd_spm_addr = InPort( SpmAddrType ) + # An input signal that specifies the number of bytes to transfer. 
+ s.dma_cmd_bytes = InPort( BytesType ) + s.dma_cmd_tag = InPort( TagType ) + + s.dma_done_val = OutPort() + s.dma_done_rdy = InPort() + s.dma_done_tag = OutPort( TagType ) + + # Abstract external memory interface + # Request to read from DRAM + s.mem_rd_req_val = OutPort() + s.mem_rd_req_rdy = InPort() + s.mem_rd_req_addr = OutPort( DramAddrType ) + # Response from DRAM + s.mem_rd_resp_val = InPort() + s.mem_rd_resp_rdy = OutPort() + s.mem_rd_resp_data = InPort( MemDataType ) + + # Request to write to DRAM + s.mem_wr_req_val = OutPort() + s.mem_wr_req_rdy = InPort() + s.mem_wr_req_addr = OutPort( DramAddrType ) + s.mem_wr_req_data = OutPort( MemDataType ) + s.mem_wr_req_mask = OutPort( MemMaskType ) + s.mem_wr_resp_val = InPort() + s.mem_wr_resp_rdy = OutPort() + + # SPM interface + # Request to write to SPM + s.spm_dma_wval = OutPort() + s.spm_dma_wrdy = InPort() + s.spm_dma_waddr = OutPort( SpmAddrType ) + s.spm_dma_wdata = OutPort( SpmDataType ) + s.spm_dma_wmask = OutPort( SpmMaskType ) + + # Request to read from SPM + s.spm_dma_rval = OutPort() + s.spm_dma_rrdy = InPort() + s.spm_dma_raddr = OutPort( SpmAddrType ) + + # Response from SPM + s.spm_dma_rresp_val = InPort() + s.spm_dma_rresp_rdy = OutPort() + s.spm_dma_rresp_data = InPort( SpmDataType ) + + # State machine definitions + StateType = mk_bits( 4 ) + + STATE_IDLE = StateType( 0 ) # Waiting for a new DMA command + STATE_MVIN_REQ = StateType( 1 ) # MVIN: Issuing DRAM read request + STATE_MVIN_RESP = StateType( 2 ) # MVIN: Waiting for DRAM read response + STATE_MVIN_WRITE = StateType( 3 ) # MVIN: Writing unpacked words to SPM + STATE_MVOUT_READ = StateType( 4 ) # MVOUT: Issuing SPM read request + STATE_MVOUT_RESP = StateType( 5 ) # MVOUT: Receiving SPM read response and packing + STATE_MVOUT_WRITE = StateType( 6 ) # MVOUT: Issuing DRAM write request + STATE_MVOUT_WAIT = StateType( 7 ) # MVOUT: Waiting for DRAM write response + STATE_DONE = StateType( 8 ) # Signaling command completion + + s.state = 
Wire( StateType ) + s.state_next = Wire( StateType ) + + # Combinational logic + s.opcode_reg = Wire( OpcodeType ) # Current operation (MVIN/MVOUT) + s.dram_addr_reg = Wire( DramAddrType ) # Current DRAM byte address + s.spm_addr_reg = Wire( SpmAddrType ) # Current SPM word address + s.words_left_reg = Wire( BytesType ) # Number of 32-bit words remaining to transfer + s.tag_reg = Wire( TagType ) # Tag of the active command + s.beat_reg = Wire( MemDataType ) # Buffer for 128-bit DRAM beat + s.word_idx_reg = Wire( Bits2 ) # Index (0-3) of the word within a beat + s.wr_mask_reg = Wire( MemMaskType ) # Byte mask for DRAM write + + # Sequential logic + s.state_ff = Wire( StateType ) + s.opcode_ff = Wire( OpcodeType ) + s.dram_addr_ff = Wire( DramAddrType ) + s.spm_addr_ff = Wire( SpmAddrType ) + s.words_left_ff = Wire( BytesType ) + s.tag_ff = Wire( TagType ) + s.beat_ff = Wire( MemDataType ) + s.word_idx_ff = Wire( Bits2 ) + s.wr_mask_ff = Wire( MemMaskType ) + + # Connections + s.state //= s.state_ff + s.opcode_reg //= s.opcode_ff + s.dram_addr_reg //= s.dram_addr_ff + s.spm_addr_reg //= s.spm_addr_ff + s.words_left_reg //= s.words_left_ff + s.tag_reg //= s.tag_ff + s.beat_reg //= s.beat_ff + s.word_idx_reg //= s.word_idx_ff + s.wr_mask_reg //= s.wr_mask_ff + + @update + def comb_outputs(): + s.dma_cmd_rdy @= s.state == STATE_IDLE + s.dma_done_val @= s.state == STATE_DONE + s.dma_done_tag @= s.tag_reg + + s.mem_rd_req_val @= s.state == STATE_MVIN_REQ + s.mem_rd_req_addr @= s.dram_addr_reg + s.mem_rd_resp_rdy @= s.state == STATE_MVIN_RESP + + s.mem_wr_req_val @= s.state == STATE_MVOUT_WRITE + s.mem_wr_req_addr @= s.dram_addr_reg + s.mem_wr_req_data @= s.beat_reg + s.mem_wr_req_mask @= s.wr_mask_reg + s.mem_wr_resp_rdy @= s.state == STATE_MVOUT_WAIT + + s.spm_dma_wval @= s.state == STATE_MVIN_WRITE + s.spm_dma_waddr @= s.spm_addr_reg + s.spm_dma_wmask @= SpmMaskType( (1 << (spm_data_nbits // 8)) - 1 ) # Write mask for SPM write; always be 0b1111 + + if s.word_idx_reg == 
b2( 0 ): # Writes the first word of the beat to SPM + s.spm_dma_wdata @= s.beat_reg[0:spm_data_nbits] + elif s.word_idx_reg == b2( 1 ): # Writes the second word of the beat to SPM + s.spm_dma_wdata @= s.beat_reg[spm_data_nbits:spm_data_nbits*2] + elif s.word_idx_reg == b2( 2 ): # 3rd word + s.spm_dma_wdata @= s.beat_reg[spm_data_nbits*2:spm_data_nbits*3] + else: # 4th word + s.spm_dma_wdata @= s.beat_reg[spm_data_nbits*3:spm_data_nbits*4] + + s.spm_dma_rval @= s.state == STATE_MVOUT_READ + s.spm_dma_raddr @= s.spm_addr_reg + s.spm_dma_rresp_rdy @= s.state == STATE_MVOUT_RESP + + @update_ff + def seq_state(): + if s.reset: + s.state_ff <<= STATE_IDLE + s.opcode_ff <<= OpcodeType( 0 ) + s.dram_addr_ff <<= DramAddrType( 0 ) + s.spm_addr_ff <<= SpmAddrType( 0 ) + s.words_left_ff <<= BytesType( 0 ) + s.tag_ff <<= TagType( 0 ) + s.beat_ff <<= MemDataType( 0 ) + s.word_idx_ff <<= b2( 0 ) + s.wr_mask_ff <<= MemMaskType( 0 ) + else: + if s.state == STATE_IDLE: + if s.dma_cmd_val & s.dma_cmd_rdy: # Receives a new DMA command. + s.opcode_ff <<= s.dma_cmd_opcode + s.dram_addr_ff <<= s.dma_cmd_dram_addr + s.spm_addr_ff <<= s.dma_cmd_spm_addr + s.words_left_ff <<= s.dma_cmd_bytes >> 2 # Converts the transfer size from bytes to words. + s.tag_ff <<= s.dma_cmd_tag + s.beat_ff <<= MemDataType( 0 ) + s.word_idx_ff <<= b2( 0 ) + s.wr_mask_ff <<= MemMaskType( 0 ) + + if s.dma_cmd_bytes == BytesType( 0 ): # No more bytes to transfer. + s.state_ff <<= STATE_DONE + # Still has bytes to transfer. + elif s.dma_cmd_opcode == OpcodeType( DMA_MVIN ): + s.state_ff <<= STATE_MVIN_REQ # Move to the next state: to issue a read request to DRAM. + else: # DMA_MVOUT + s.state_ff <<= STATE_MVOUT_READ # Move to the next state: to issue a read request to SPM. + + elif s.state == STATE_MVIN_REQ: # Issues a read request to DRAM. 
+ if s.mem_rd_req_val & s.mem_rd_req_rdy: + s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( mem_data_nbits // 8 ) + s.state_ff <<= STATE_MVIN_RESP + + elif s.state == STATE_MVIN_RESP: # Receives a response from DRAM. + if s.mem_rd_resp_val & s.mem_rd_resp_rdy: + s.beat_ff <<= s.mem_rd_resp_data + s.word_idx_ff <<= b2( 0 ) + s.state_ff <<= STATE_MVIN_WRITE # Move to the next state: to write to SPM. + + elif s.state == STATE_MVIN_WRITE: # Writes to SPM. + if s.spm_dma_wval & s.spm_dma_wrdy: + # Update the SPM address where write next cycle(+1) + s.spm_addr_ff <<= s.spm_addr_reg + SpmAddrType( 1 ) + # Update the number of words remaining to write to SPM. + s.words_left_ff <<= s.words_left_reg - BytesType( 1 ) + + if s.words_left_reg == BytesType( 1 ): + s.state_ff <<= STATE_DONE + elif s.word_idx_reg == b2( 3 ): + s.word_idx_ff <<= b2( 0 ) + s.state_ff <<= STATE_MVIN_REQ + else: + s.word_idx_ff <<= s.word_idx_reg + b2( 1 ) + + elif s.state == STATE_MVOUT_READ: + if s.spm_dma_rval & s.spm_dma_rrdy: + s.state_ff <<= STATE_MVOUT_RESP # Move to the next state: to receive a response from SPM. + + elif s.state == STATE_MVOUT_RESP: + if s.spm_dma_rresp_val & s.spm_dma_rresp_rdy: + # Pack the response from SPM into a 128-bit beat by left-shifting. 
+ if s.word_idx_reg == b2( 0 ): # 1st word + s.beat_ff <<= concat( s.beat_reg[spm_data_nbits:spm_data_nbits*4], + s.spm_dma_rresp_data ) + elif s.word_idx_reg == b2( 1 ): + s.beat_ff <<= concat( s.beat_reg[spm_data_nbits*2:spm_data_nbits*4], + s.spm_dma_rresp_data, + s.beat_reg[0:spm_data_nbits] ) + elif s.word_idx_reg == b2( 2 ): + s.beat_ff <<= concat( s.beat_reg[spm_data_nbits*3:spm_data_nbits*4], + s.spm_dma_rresp_data, + s.beat_reg[0:spm_data_nbits*2] ) + else: + s.beat_ff <<= concat( s.spm_dma_rresp_data, + s.beat_reg[0:spm_data_nbits*3] ) + + s.spm_addr_ff <<= s.spm_addr_reg + SpmAddrType( 1 ) + s.words_left_ff <<= s.words_left_reg - BytesType( 1 ) + + if s.words_left_reg == BytesType( 1 ): + if s.word_idx_reg == b2( 0 ): + s.wr_mask_ff <<= MemMaskType( 0x000f ) + elif s.word_idx_reg == b2( 1 ): + s.wr_mask_ff <<= MemMaskType( 0x00ff ) + elif s.word_idx_reg == b2( 2 ): + s.wr_mask_ff <<= MemMaskType( 0x0fff ) + else: + s.wr_mask_ff <<= MemMaskType( 0xffff ) + s.state_ff <<= STATE_MVOUT_WRITE + elif s.word_idx_reg == b2( 3 ): + s.wr_mask_ff <<= MemMaskType( 0xffff ) + s.state_ff <<= STATE_MVOUT_WRITE + else: + s.word_idx_ff <<= s.word_idx_reg + b2( 1 ) + s.state_ff <<= STATE_MVOUT_READ + + elif s.state == STATE_MVOUT_WRITE: + if s.mem_wr_req_val & s.mem_wr_req_rdy: + s.state_ff <<= STATE_MVOUT_WAIT + + elif s.state == STATE_MVOUT_WAIT: + if s.mem_wr_resp_val & s.mem_wr_resp_rdy: + # DRAM byte-addresses and transfer 128-bit per beat. 
+ s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( mem_data_nbits // 8 ) + s.beat_ff <<= MemDataType( 0 ) + s.word_idx_ff <<= b2( 0 ) + s.wr_mask_ff <<= MemMaskType( 0 ) + + if s.words_left_reg == BytesType( 0 ): + s.state_ff <<= STATE_DONE + else: + s.state_ff <<= STATE_MVOUT_READ + + elif s.state == STATE_DONE: + if s.dma_done_val & s.dma_done_rdy: + s.state_ff <<= STATE_IDLE + + def line_trace( s ): + return f"dma(state={int(s.state)},tag={int(s.tag_reg)},left={int(s.words_left_reg)})" diff --git a/mem/dma/__init__.py b/mem/dma/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/mem/dma/__init__.py @@ -0,0 +1 @@ + diff --git a/mem/dma/test/DmaEngineRTL_test.py b/mem/dma/test/DmaEngineRTL_test.py new file mode 100644 index 00000000..b85c037b --- /dev/null +++ b/mem/dma/test/DmaEngineRTL_test.py @@ -0,0 +1,157 @@ +""" +========================================================================== +DmaEngineRTL_test.py +========================================================================== +""" + +from pymtl3 import * + +from ..DmaEngineRTL import DmaEngineRTL, DMA_MVIN, DMA_MVOUT + + +def make_dut(): + dut = DmaEngineRTL() + dut.apply(DefaultPassGroup()) + dut.sim_reset() + + dut.dma_cmd_val @= 0 + dut.dma_cmd_opcode @= 0 + dut.dma_cmd_dram_addr @= 0 + dut.dma_cmd_spm_addr @= 0 + dut.dma_cmd_bytes @= 0 + dut.dma_cmd_tag @= 0 + dut.dma_done_rdy @= 1 + + dut.mem_rd_req_rdy @= 1 + dut.mem_rd_resp_val @= 0 + dut.mem_rd_resp_data @= 0 + dut.mem_wr_req_rdy @= 1 + dut.mem_wr_resp_val @= 1 + + dut.spm_dma_wrdy @= 1 + dut.spm_dma_rrdy @= 1 + dut.spm_dma_rresp_val @= 0 + dut.spm_dma_rresp_data @= 0 + dut.sim_eval_combinational() + return dut + + +def issue_cmd(dut, opcode, dram_addr, spm_addr, nbytes, tag): + """ + Issues a DMA command to the DUT. + Args: + dut: The DUT instance. + opcode: The opcode of the DMA command. DMA_MVIN or DMA_MVOUT. + dram_addr: The DRAM address of the DMA command. + spm_addr: The SPM address of the DMA command. 
+ nbytes: The number of bytes to transfer. + tag: The tag of the DMA command. + """ + dut.dma_cmd_val @= 1 + dut.dma_cmd_opcode @= opcode + dut.dma_cmd_dram_addr @= dram_addr + dut.dma_cmd_spm_addr @= spm_addr + dut.dma_cmd_bytes @= nbytes + dut.dma_cmd_tag @= tag + dut.sim_eval_combinational() + assert dut.dma_cmd_rdy + dut.sim_tick() + dut.dma_cmd_val @= 0 + + +def test_dma_mvin_one_beat(): + """ + Tests a single 128-bit beat MVIN operation. + The DRAM contains one beat of data, which should be unpacked into four + sequential SPM writes. + """ + dut = make_dut() + issue_cmd(dut, DMA_MVIN, 0x1000, 4, 16, 0x5a) + + dram = { + 0x1000: concat(Bits32(0x44444444), Bits32(0x33333333), + Bits32(0x22222222), Bits32(0x11111111)), + } + pending_resp = None + spm_writes = [] + + for _ in range(20): + dut.mem_rd_resp_val @= 0 + if pending_resp is not None: + dut.mem_rd_resp_val @= 1 + dut.mem_rd_resp_data @= pending_resp + + dut.sim_eval_combinational() + + if dut.mem_rd_req_val & dut.mem_rd_req_rdy: + pending_resp = dram[int(dut.mem_rd_req_addr)] + else: + pending_resp = None + + if dut.spm_dma_wval & dut.spm_dma_wrdy: + spm_writes.append((int(dut.spm_dma_waddr), int(dut.spm_dma_wdata))) + + if dut.dma_done_val: + assert int(dut.dma_done_tag) == 0x5a + break + + dut.sim_tick() + + for elem in spm_writes: + print(f'{elem[0]}: 0x{elem[1]:08x}') + + assert spm_writes == [ + (4, 0x11111111), + (5, 0x22222222), + (6, 0x33333333), + (7, 0x44444444), + ] + + +def test_dma_mvout_partial_beat(): + """ + Tests a partial beat MVOUT operation (12 bytes / 3 words). + The DMA should read three words from SPM, pack them into a 128-bit beat + with a proper byte mask, and write it to DRAM. 
+ """ + dut = make_dut() + issue_cmd(dut, DMA_MVOUT, 0x2000, 8, 12, 0xa5) + + spm = { + 8: 0xaaaabbbb, + 9: 0xccccdddd, + 10: 0xeeeeffff, + } + pending_rresp = None + mem_writes = [] + + for _ in range(30): + dut.spm_dma_rresp_val @= 0 + if pending_rresp is not None: + dut.spm_dma_rresp_val @= 1 + dut.spm_dma_rresp_data @= pending_rresp + + dut.sim_eval_combinational() + + if dut.spm_dma_rval & dut.spm_dma_rrdy: + pending_rresp = spm[int(dut.spm_dma_raddr)] + else: + pending_rresp = None + + if dut.mem_wr_req_val & dut.mem_wr_req_rdy: + mem_writes.append((int(dut.mem_wr_req_addr), + int(dut.mem_wr_req_data), + int(dut.mem_wr_req_mask))) + + if dut.dma_done_val: + assert int(dut.dma_done_tag) == 0xa5 + break + + dut.sim_tick() + + assert mem_writes == [ + (0x2000, + int(concat(Bits32(0), Bits32(0xeeeeffff), + Bits32(0xccccdddd), Bits32(0xaaaabbbb))), + 0x0fff), + ] diff --git a/mem/dma/test/__init__.py b/mem/dma/test/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/mem/dma/test/__init__.py @@ -0,0 +1 @@ + From d64eded1bcf5b5321015adc87ed6dface8343a55 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Mon, 1 Jun 2026 23:12:08 +0800 Subject: [PATCH 02/46] [Test] Update the test of DmaEngine. --- mem/dma/DmaEngineRTL.py | 4 +- mem/dma/test/DmaEngineRTL_test.py | 91 +++++++++++++++++++++++++++++-- 2 files changed, 88 insertions(+), 7 deletions(-) diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index adf767cd..41e2562b 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -27,6 +27,8 @@ class DmaEngineRTL( Component ): Architectural Design: - 1 word = 4 bytes = 32 bits in this system. + - DRAM is byte-addressed which means each unique address points to a byte(8 bits). + - SPM is word-addressed which means each unique address points to a word(32 bits). - The engine uses a 128-bit interface to external memory (4 words per beat) and a 32-bit interface to the dataSPM (1 word per cycle). 
- A finite state machine (FSM) manages the command execution flow, including @@ -296,7 +298,7 @@ def seq_state(): elif s.state == STATE_MVOUT_WAIT: if s.mem_wr_resp_val & s.mem_wr_resp_rdy: - # DRAM byte-addresses and transfer 128-bit per beat. + # Turn to the +16 address after writing 16 bytes data. s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( mem_data_nbits // 8 ) s.beat_ff <<= MemDataType( 0 ) s.word_idx_ff <<= b2( 0 ) diff --git a/mem/dma/test/DmaEngineRTL_test.py b/mem/dma/test/DmaEngineRTL_test.py index b85c037b..12392465 100644 --- a/mem/dma/test/DmaEngineRTL_test.py +++ b/mem/dma/test/DmaEngineRTL_test.py @@ -61,16 +61,24 @@ def issue_cmd(dut, opcode, dram_addr, spm_addr, nbytes, tag): def test_dma_mvin_one_beat(): """ - Tests a single 128-bit beat MVIN operation. - The DRAM contains one beat of data, which should be unpacked into four + Tests DMA_MVIN operation. + The DRAM contains 2 beats of data, which should be unpacked into 8 sequential SPM writes. """ dut = make_dut() - issue_cmd(dut, DMA_MVIN, 0x1000, 4, 16, 0x5a) + issue_cmd(dut, DMA_MVIN, + 0x1000, # dram_addr + 4, # spm_addr + 32, # nbytes(number of bytes to transfer) + 0x5a) # tag dram = { 0x1000: concat(Bits32(0x44444444), Bits32(0x33333333), - Bits32(0x22222222), Bits32(0x11111111)), + Bits32(0x22222222), Bits32(0x11111111)), # 4 x 4 bytes = 16 bytes in total. + + # Address bias: +16, since DRAM is byte-addressed(each address points to a byte). + 0x1010: concat(Bits32(0x88888888), Bits32(0x77777777), + Bits32(0x66666666), Bits32(0x55555555)), } pending_resp = None spm_writes = [] @@ -105,6 +113,11 @@ def test_dma_mvin_one_beat(): (5, 0x22222222), (6, 0x33333333), (7, 0x44444444), + + (8, 0x55555555), + (9, 0x66666666), + (10, 0x77777777), + (11, 0x88888888), ] @@ -115,7 +128,11 @@ def test_dma_mvout_partial_beat(): with a proper byte mask, and write it to DRAM. 
""" dut = make_dut() - issue_cmd(dut, DMA_MVOUT, 0x2000, 8, 12, 0xa5) + issue_cmd(dut, DMA_MVOUT, + 0x2000, # dram_addr + 8, # spm_addr + 12, # nbytes(number of bytes to transfer) + 0xa5) # tag spm = { 8: 0xaaaabbbb, @@ -153,5 +170,67 @@ def test_dma_mvout_partial_beat(): (0x2000, int(concat(Bits32(0), Bits32(0xeeeeffff), Bits32(0xccccdddd), Bits32(0xaaaabbbb))), - 0x0fff), + 0x0fff), # mask ] + +def test_dma_mvout_full_beat(): + """ + Tests a full beat MVOUT operation (16 bytes / 4 words). + The DMA should read four words from SPM, pack them into a 128-bit beat + with a proper byte mask, and write it to DRAM. + """ + dut = make_dut() + issue_cmd(dut, DMA_MVOUT, + 0x2000, # dram_addr + 8, # spm_addr + 32, # nbytes(number of bytes to transfer) + 0xa5) # tag + + spm = { + 8 : 0x11112222, + 9 : 0x33334444, + 10: 0x55556666, + 11: 0x77778888, + 12: 0x9999aaaa, + 13: 0xbbbbcccc, + 14: 0xddddeeee, + 15: 0xffff0000, + } + pending_rresp = None + mem_writes = [] + + for _ in range(30): + dut.spm_dma_rresp_val @= 0 + if pending_rresp is not None: + dut.spm_dma_rresp_val @= 1 + dut.spm_dma_rresp_data @= pending_rresp + + dut.sim_eval_combinational() + + if dut.spm_dma_rval & dut.spm_dma_rrdy: + pending_rresp = spm[int(dut.spm_dma_raddr)] + else: + pending_rresp = None + + if dut.mem_wr_req_val & dut.mem_wr_req_rdy: + mem_writes.append((int(dut.mem_wr_req_addr), + int(dut.mem_wr_req_data), + int(dut.mem_wr_req_mask))) + + if dut.dma_done_val: + assert int(dut.dma_done_tag) == 0xa5 + break + + dut.sim_tick() + + assert mem_writes == [ + (0x2000, + int(concat(Bits32(0x77778888), Bits32(0x55556666), + Bits32(0x33334444), Bits32(0x11112222))), + 0xffff), # mask + + (0x2010, + int(concat(Bits32(0xffff0000), Bits32(0xddddeeee), + Bits32(0xbbbbcccc), Bits32(0x9999aaaa))), + 0xffff), + ] \ No newline at end of file From f96823931e89f4f1fc9b164c062ffe614878ee53 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 2 Jun 2026 16:56:28 +0800 Subject: [PATCH 03/46] Add DMA support to 
DataMemControllerRTL and implement corresponding tests. --- mem/data/DataMemControllerRTL.py | 137 ++++++++++++++++-- .../test/DataMemControllerRTL_dma_test.py | 115 +++++++++++++++ 2 files changed, 241 insertions(+), 11 deletions(-) create mode 100644 mem/data/test/DataMemControllerRTL_dma_test.py diff --git a/mem/data/DataMemControllerRTL.py b/mem/data/DataMemControllerRTL.py index 356a0ea2..4f017d53 100644 --- a/mem/data/DataMemControllerRTL.py +++ b/mem/data/DataMemControllerRTL.py @@ -34,6 +34,21 @@ from ...lib.util.data_struct_attr import * class DataMemControllerRTL(Component): + """ + DataMemControllerRTL manages access to the multi-banked data SPM. + It arbitrates between multiple request sources: + 1. Local tiles (via `recv_raddr`, `recv_waddr`, `recv_wdata`) + 2. Inter-CGRA NoC (via `recv_from_noc_load_request`, etc.) + 3. Optional DMA engine (via `spm_dma_wval`, `spm_dma_rval`, etc.) + + Architectural Design: + - Uses crossbars to route requests to the correct memory bank based on the + address. + - Supports an optional DMA interface. When `has_dma_ports` is True, extra + ports are added to the read and write crossbars. + - DMA requests are treated as another master on the memory bus, competing + with tiles and NoC traffic. 
+ """ def construct(s, NocPktType, data_mem_size_global, @@ -45,7 +60,8 @@ def construct(s, multi_cgra_columns = 2, num_tiles = 16, mem_access_is_combinational = True, - idTo2d_map = {0: [0, 0]}): + idTo2d_map = {0: [0, 0]}, + has_dma_ports = False): CgraPayloadType = NocPktType.get_field_type(kAttrPayload) DataType = CgraPayloadType.get_field_type(kAttrData) @@ -58,19 +74,26 @@ def construct(s, YType = mk_bits(max(clog2(multi_cgra_rows), 1)) AddrType = mk_bits(global_addr_nbits) PerBankAddrType = mk_bits(per_bank_addr_nbits) + DmaDataType = DataType.get_field_type(kAttrPayload) + DmaMaskType = mk_bits(max(1, DmaDataType.nbits // 8)) + NocRemoteSrcPortType = NocPktType.get_field_type(kAttrRemoteSrcPort) s.num_banks_per_cgra = num_banks_per_cgra - LocalBankIndexType = mk_bits(clog2(num_banks_per_cgra)) + s.has_dma_ports = has_dma_ports + LocalBankIndexType = mk_bits(max(1, clog2(num_banks_per_cgra))) s.num_rd_tiles = num_rd_tiles s.num_wr_tiles = num_wr_tiles - RdTileIdType = mk_bits(clog2(num_rd_tiles)) + RdTileIdType = mk_bits(max(1, clog2(num_rd_tiles))) # The additional port is for the request from inter-cgra NoC via controller. - num_xbar_in_rd_ports = num_rd_tiles + 1 - num_xbar_in_wr_ports = num_wr_tiles + 1 + # If DMA is enabled, we add one more port for the DMA engine. 
+ dma_port_offset = 1 if has_dma_ports else 0 + num_xbar_in_rd_ports = num_rd_tiles + 1 + dma_port_offset + num_xbar_in_wr_ports = num_wr_tiles + 1 + dma_port_offset num_xbar_out_rd_ports = num_banks_per_cgra + 1 num_xbar_out_wr_ports = num_banks_per_cgra + 1 num_cgras = multi_cgra_rows * multi_cgra_columns XbarOutRdType = mk_bits(clog2(num_xbar_out_rd_ports)) XbarOutWrType = mk_bits(clog2(num_xbar_out_wr_ports)) + XbarInRdType = mk_bits(clog2(num_xbar_in_rd_ports)) MemReadPktType = \ mk_mem_access_pkt(DataType, num_xbar_in_rd_ports, @@ -120,7 +143,47 @@ def construct(s, s.send_to_noc_load_request_pkt = SendIfcRTL(NocPktType) s.send_to_noc_store_pkt = SendIfcRTL(NocPktType) + if has_dma_ports: + # DMA writes SPM: used by DMA_MVIN. + s.spm_dma_wval = InPort() + s.spm_dma_wrdy = OutPort() + s.spm_dma_waddr = InPort(AddrType) + s.spm_dma_wdata = InPort(DmaDataType) + s.spm_dma_wmask = InPort(DmaMaskType) + + # DMA reads SPM: used by DMA_MVOUT. + s.spm_dma_rval = InPort() + s.spm_dma_rrdy = OutPort() + s.spm_dma_raddr = InPort(AddrType) + s.spm_dma_rresp_val = OutPort() + s.spm_dma_rresp_rdy = InPort() + s.spm_dma_rresp_data = OutPort(DmaDataType) + else: + # Keep these as internal wires so PyMTL's static update-block analysis + # can see declared objects even when the optional DMA interface is off. + s.spm_dma_wval = Wire() + s.spm_dma_wrdy = Wire() + s.spm_dma_waddr = Wire(AddrType) + s.spm_dma_wdata = Wire(DmaDataType) + s.spm_dma_wmask = Wire(DmaMaskType) + + s.spm_dma_rval = Wire() + s.spm_dma_rrdy = Wire() + s.spm_dma_raddr = Wire(AddrType) + s.spm_dma_rresp_val = Wire() + s.spm_dma_rresp_rdy = Wire() + s.spm_dma_rresp_data = Wire(DmaDataType) + + s.spm_dma_wval //= 0 + s.spm_dma_waddr //= AddrType(0) + s.spm_dma_wdata //= DmaDataType(0) + s.spm_dma_wmask //= DmaMaskType(0) + s.spm_dma_rval //= 0 + s.spm_dma_raddr //= AddrType(0) + s.spm_dma_rresp_rdy //= 0 + # Components. + # A list of DataMemWrapperRTL instances. Each one is a single memory bank. 
s.memory_wrapper = [DataMemWrapperRTL(DataType, MemReadPktType, MemWritePktType, MemResponsePktType, data_mem_size_global, data_mem_size_per_bank, mem_access_is_combinational) for _ in range(num_banks_per_cgra)] @@ -159,10 +222,10 @@ def construct(s, @update def assemble_xbar_pkt(): for i in range(num_xbar_in_rd_ports): - s.rd_pkt[i] @= MemReadPktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, i) + s.rd_pkt[i] @= MemReadPktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) for i in range(num_xbar_in_wr_ports): - s.wr_pkt[i] @= MemWritePktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, i) + s.wr_pkt[i] @= MemWritePktType(i, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) for i in range(num_rd_tiles): recv_raddr = s.recv_raddr[i].msg @@ -223,6 +286,36 @@ def assemble_xbar_pkt(): 0, # src_tile num_wr_tiles) # remote_src_port + if has_dma_ports: + dma_rd_idx = num_rd_tiles + 1 + dma_wr_idx = num_wr_tiles + 1 + + recv_raddr_from_dma = s.spm_dma_raddr + if (recv_raddr_from_dma >= s.address_lower) & (recv_raddr_from_dma <= s.address_upper): + bank_index_load_from_dma = trunc((recv_raddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutRdType) + else: + bank_index_load_from_dma = XbarOutRdType(num_banks_per_cgra) + s.rd_pkt[dma_rd_idx] @= MemReadPktType(dma_rd_idx, # src + bank_index_load_from_dma, # dst + recv_raddr_from_dma, # addr + DataType(0, 0, 0, 0), # data + s.cgra_id, # src_cgra + 0, # src_tile + 0) # remote_src_port + + recv_waddr_from_dma = s.spm_dma_waddr + if (recv_waddr_from_dma >= s.address_lower) & (recv_waddr_from_dma <= s.address_upper): + bank_index_store_from_dma = trunc((recv_waddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutWrType) + else: + bank_index_store_from_dma = XbarOutWrType(num_banks_per_cgra) + s.wr_pkt[dma_wr_idx] @= MemWritePktType(dma_wr_idx, # src + bank_index_store_from_dma, # dst + recv_waddr_from_dma, # addr + DataType(s.spm_dma_wdata, 1, 0, 0), + 0, # src_cgra + 0, # src_tile + 0) # remote_src_port + # Connects xbar with the memory 
wrapper. @update def update_all(): @@ -286,6 +379,12 @@ def update_all(): s.write_crossbar.recv[i].val @= 0 s.write_crossbar.recv[i].msg @= MemWritePktType(0, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) + if has_dma_ports: + s.spm_dma_wrdy @= 0 + s.spm_dma_rrdy @= 0 + s.spm_dma_rresp_val @= 0 + s.spm_dma_rresp_data @= DmaDataType(0) + s.send_to_noc_load_request_pkt.msg @= \ NocPktType(0, # src 0, # dst @@ -310,6 +409,12 @@ def update_all(): s.read_crossbar.recv[num_rd_tiles].val @= s.recv_from_noc_load_request.val s.read_crossbar.recv[num_rd_tiles].msg @= s.rd_pkt[num_rd_tiles] s.recv_from_noc_load_request.rdy @= s.read_crossbar.recv[num_rd_tiles].rdy + + if has_dma_ports: + dma_rd_idx = num_rd_tiles + 1 + s.read_crossbar.recv[dma_rd_idx].val @= s.spm_dma_rval + s.read_crossbar.recv[dma_rd_idx].msg @= s.rd_pkt[dma_rd_idx] + s.spm_dma_rrdy @= s.read_crossbar.recv[dma_rd_idx].rdy # Connects the store request ports (from tiles and NoC) to the xbar targetting memory and NoC. for i in range(num_wr_tiles): @@ -321,6 +426,12 @@ def update_all(): s.write_crossbar.recv[num_wr_tiles].msg @= s.wr_pkt[num_wr_tiles] s.recv_from_noc_store_request.rdy @= s.write_crossbar.recv[num_wr_tiles].rdy + if has_dma_ports: + dma_wr_idx = num_wr_tiles + 1 + s.write_crossbar.recv[dma_wr_idx].val @= s.spm_dma_wval + s.write_crossbar.recv[dma_wr_idx].msg @= s.wr_pkt[dma_wr_idx] + s.spm_dma_wrdy @= s.write_crossbar.recv[dma_wr_idx].rdy + # Connects the response ports to tiles and NoC from the xbar. # Number of load responses is expected to be the same as the number of load requests. 
for i in range(num_xbar_in_rd_ports): @@ -328,7 +439,7 @@ def update_all(): s.send_rdata[RdTileIdType(i)].msg @= s.response_crossbar.send[i].msg.data s.send_rdata[RdTileIdType(i)].val @= s.response_crossbar.send[i].val s.response_crossbar.send[i].rdy @= s.send_rdata[RdTileIdType(i)].rdy - else: + elif i == num_rd_tiles: from_cgra_id = s.response_crossbar.send[i].msg.src_cgra from_tile_id = s.response_crossbar.send[i].msg.src_tile s.send_to_noc_load_response_pkt.msg @= \ @@ -351,6 +462,10 @@ def update_all(): s.send_to_noc_load_response_pkt.val @= s.response_crossbar.send[i].val s.response_crossbar.send[i].rdy @= s.send_to_noc_load_response_pkt.rdy + elif has_dma_ports: + s.spm_dma_rresp_data @= s.response_crossbar.send[i].msg.data.payload + s.spm_dma_rresp_val @= s.response_crossbar.send[i].val + s.response_crossbar.send[i].rdy @= s.spm_dma_rresp_rdy # Handles the request (not response) towards the others via the NoC. The dst would be # updated in the controller. @@ -363,7 +478,7 @@ def update_all(): 0, # dst_y 0, # src_tile_id 0, # dst_tile_id - s.read_crossbar.send[num_banks_per_cgra].msg.src, # remote_src_port + trunc(s.read_crossbar.send[num_banks_per_cgra].msg.src, NocRemoteSrcPortType), # remote_src_port 0, # opaque 0, # vc_id CgraPayloadType( @@ -378,7 +493,7 @@ def update_all(): s.response_crossbar.recv[num_banks_per_cgra].val @= s.recv_from_noc_load_response_pkt.val s.response_crossbar.recv[num_banks_per_cgra].msg @= \ MemResponsePktType(num_banks_per_cgra, - s.recv_from_noc_load_response_pkt.msg.remote_src_port, + zext(s.recv_from_noc_load_response_pkt.msg.remote_src_port, XbarInRdType), s.recv_from_noc_load_response_pkt.msg.payload.data_addr, s.recv_from_noc_load_response_pkt.msg.payload.data, s.recv_from_noc_load_response_pkt.msg.src, @@ -399,7 +514,7 @@ def update_all(): 0, # dst_y 0, # src_tile_id 0, # dst_tile_id - s.write_crossbar.send[num_banks_per_cgra].msg.src, # remote_src_port + trunc(s.write_crossbar.send[num_banks_per_cgra].msg.src, 
NocRemoteSrcPortType), # remote_src_port 0, # opaque 0, # vc_id CgraPayloadType( diff --git a/mem/data/test/DataMemControllerRTL_dma_test.py b/mem/data/test/DataMemControllerRTL_dma_test.py new file mode 100644 index 00000000..b4cf1495 --- /dev/null +++ b/mem/data/test/DataMemControllerRTL_dma_test.py @@ -0,0 +1,115 @@ +""" +========================================================================== +DataMemControllerRTL_dma_test.py +========================================================================== +""" + +from pymtl3 import * + +from ..DataMemControllerRTL import DataMemControllerRTL +from ....lib.messages import * +from ....lib.opt_type import * + + +def make_types(data_mem_size_global, ctrl_mem_size, num_tiles, num_rd_tiles): + DataType = mk_data(32, 1) + DataAddrType = mk_bits(clog2(data_mem_size_global)) + CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) + CtrlType = mk_ctrl(4, 2, 4, 4, 16) + CgraPayloadType = mk_cgra_payload(DataType, DataAddrType, CtrlType, CtrlAddrType) + NocPktType = mk_inter_cgra_pkt(1, 1, num_tiles, num_rd_tiles, CgraPayloadType) + return DataType, DataAddrType, NocPktType + + +def drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles): + for i in range(num_rd_tiles): + dut.recv_raddr[i].val @= 0 + dut.recv_raddr[i].msg @= DataAddrType(0) + dut.send_rdata[i].rdy @= 1 + + for i in range(num_wr_tiles): + dut.recv_waddr[i].val @= 0 + dut.recv_waddr[i].msg @= DataAddrType(0) + dut.recv_wdata[i].val @= 0 + dut.recv_wdata[i].msg @= DataType(0, 0, 0, 0) + + dut.recv_from_noc_load_request.val @= 0 + dut.recv_from_noc_load_request.msg @= NocPktType() + dut.recv_from_noc_store_request.val @= 0 + dut.recv_from_noc_store_request.msg @= NocPktType() + dut.recv_from_noc_load_response_pkt.val @= 0 + dut.recv_from_noc_load_response_pkt.msg @= NocPktType() + dut.send_to_noc_load_request_pkt.rdy @= 1 + dut.send_to_noc_load_response_pkt.rdy @= 1 + dut.send_to_noc_store_pkt.rdy @= 1 + + dut.spm_dma_wval @= 0 + 
dut.spm_dma_waddr @= DataAddrType(0) + dut.spm_dma_wdata @= 0 + dut.spm_dma_wmask @= 0 + dut.spm_dma_rval @= 0 + dut.spm_dma_raddr @= DataAddrType(0) + dut.spm_dma_rresp_rdy @= 1 + + dut.cgra_id @= 0 + dut.address_lower @= DataAddrType(0) + dut.address_upper @= DataAddrType(15) + + +def test_dma_ports_write_then_read(): + """ + Verifies that the DataMemController correctly handles requests from the + DMA ports. It performs a DMA write to a specific address and then a + DMA read from the same address to verify the data. + """ + data_mem_size_global = 64 + data_mem_size_per_bank = 16 + num_banks = 4 + num_rd_tiles = 2 + num_wr_tiles = 2 + num_tiles = 4 + ctrl_mem_size = 16 + + DataType, DataAddrType, NocPktType = make_types( + data_mem_size_global, ctrl_mem_size, num_tiles, num_rd_tiles) + + dut = DataMemControllerRTL(NocPktType, + data_mem_size_global, + data_mem_size_per_bank, + num_banks, + num_rd_tiles, + num_wr_tiles, + 1, + 1, + num_tiles, + True, + {0: [0, 0]}, + has_dma_ports = True) + dut.apply(DefaultPassGroup()) + dut.sim_reset() + drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles) + + dut.spm_dma_wval @= 1 + dut.spm_dma_waddr @= DataAddrType(3) + dut.spm_dma_wdata @= 0xaaaabbbb + dut.spm_dma_wmask @= 0xf + dut.sim_eval_combinational() + assert dut.spm_dma_wrdy + dut.sim_tick() + dut.spm_dma_wval @= 0 + + dut.spm_dma_rval @= 1 + dut.spm_dma_raddr @= DataAddrType(3) + + seen_response = False + for _ in range(10): + dut.sim_eval_combinational() + if dut.spm_dma_rval & dut.spm_dma_rrdy: + dut.spm_dma_rval @= 0 + if dut.spm_dma_rresp_val: + assert int(dut.spm_dma_rresp_data) == 0xaaaabbbb + seen_response = True + break + dut.sim_tick() + + assert seen_response From 7bd704e109538bdeb01e7d6f587043b6ba14d9e8 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 2 Jun 2026 17:01:27 +0800 Subject: [PATCH 04/46] Add the dma ports into CgraTemplateRTL --- cgra/CgraTemplateRTL.py | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file 
changed, 39 insertions(+), 2 deletions(-) diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index 00788487..f613485f 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -83,7 +83,8 @@ def construct(s, CgraPayloadType, provided_max_per_cgra_rows = None, provided_max_per_cgra_cols = None, provided_max_num_rd_tiles = None, - provided_max_num_wr_tiles = None): + provided_max_num_wr_tiles = None, + has_dma_ports = False): """ provided_max_per_cgra_rows: the row number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch. provided_max_per_cgra_cols: the column number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch. @@ -126,6 +127,8 @@ def construct(s, CgraPayloadType, CtrlRingPos = mk_ring_pos(max_num_tiles + 1) CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) DataAddrType = mk_bits(clog2(data_mem_size_global)) + DmaDataType = DataType.get_field_type(kAttrPayload) + DmaMaskType = mk_bits(max(1, DmaDataType.nbits // 8)) assert(data_mem_size_per_bank * num_banks_per_cgra <= \ data_mem_size_global) @@ -135,6 +138,23 @@ def construct(s, CgraPayloadType, s.recv_from_inter_cgra_noc = RecvIfcRTL(NocPktType) s.send_to_inter_cgra_noc = SendIfcRTL(NocPktType) + # Optional DMA interface ports. These are exposed at the template level + # to allow a top-level wrapper (like CgraDmaRTL) to connect a DMA engine + # directly to the internal DataMemController. 
+ if has_dma_ports: + s.spm_dma_wval = InPort() + s.spm_dma_wrdy = OutPort() + s.spm_dma_waddr = InPort(DataAddrType) + s.spm_dma_wdata = InPort(DmaDataType) + s.spm_dma_wmask = InPort(DmaMaskType) + + s.spm_dma_rval = InPort() + s.spm_dma_rrdy = OutPort() + s.spm_dma_raddr = InPort(DataAddrType) + s.spm_dma_rresp_val = OutPort() + s.spm_dma_rresp_rdy = InPort() + s.spm_dma_rresp_data = OutPort(DmaDataType) + if is_multi_cgra: # Use the largest CGRA shape to set the boundary ports for compatibility in the case of heterogeneous multi-cgra. # Remember to ground the remaining boundary ports of the current CGRA when the current CGRA has fewer rows or columns than the largest CGRA. @@ -168,7 +188,8 @@ def construct(s, CgraPayloadType, multi_cgra_columns, max_num_tiles, mem_access_is_combinational, - idTo2d_map) + idTo2d_map, + has_dma_ports) s.cgra_id = InPort(CgraIdType) s.controller = ControllerRTL(NocPktType, multi_cgra_rows, multi_cgra_columns, @@ -190,6 +211,22 @@ def construct(s, CgraPayloadType, s.data_mem.address_lower //= s.address_lower s.data_mem.address_upper //= s.address_upper + if has_dma_ports: + # DMA_MVIN: dram -> dma -> spm + s.data_mem.spm_dma_wval //= s.spm_dma_wval + s.data_mem.spm_dma_wrdy //= s.spm_dma_wrdy + s.data_mem.spm_dma_waddr //= s.spm_dma_waddr + s.data_mem.spm_dma_wdata //= s.spm_dma_wdata + s.data_mem.spm_dma_wmask //= s.spm_dma_wmask + + # DMA_MVOUT: spm -> dma -> dram + s.data_mem.spm_dma_rval //= s.spm_dma_rval + s.data_mem.spm_dma_rrdy //= s.spm_dma_rrdy + s.data_mem.spm_dma_raddr //= s.spm_dma_raddr + s.data_mem.spm_dma_rresp_val //= s.spm_dma_rresp_val + s.data_mem.spm_dma_rresp_rdy //= s.spm_dma_rresp_rdy + s.data_mem.spm_dma_rresp_data //= s.spm_dma_rresp_data + # Connects data memory with controller. 
s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request From 72d95313780db6deb1282797eaedbe20aa0dfb01 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 2 Jun 2026 17:31:46 +0800 Subject: [PATCH 05/46] Wrap the Cgra and Dma into one single module. --- cgra/CgraDmaRTL.py | 225 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 cgra/CgraDmaRTL.py diff --git a/cgra/CgraDmaRTL.py b/cgra/CgraDmaRTL.py new file mode 100644 index 00000000..8a950f61 --- /dev/null +++ b/cgra/CgraDmaRTL.py @@ -0,0 +1,225 @@ +""" +========================================================================= +CgraDmaRTL.py +========================================================================= + +Wrapper that composes a CGRA template with a DMA engine attached to the +CGRA data SPM. +""" + +from pymtl3 import * + +from .CgraTemplateRTL import CgraTemplateRTL +from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL +from ..lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL +from ..lib.messages import * +from ..lib.util.data_struct_attr import * +from ..mem.dma.DmaEngineRTL import DmaEngineRTL + + +class CgraDmaRTL( Component ): + """ + CgraDmaRTL is a top-level wrapper that integrates a CGRA instance with a + DMA engine. + + Architectural Design: + - It instantiates a standard CGRA template (`CgraTemplateRTL`) and a + DMA engine (`DmaEngineRTL`). + - The DMA engine is connected to the CGRA's internal data SPM through a + dedicated master port on the `DataMemControllerRTL`. + - CPU control packets are passed through to the CGRA's controller. + - External memory requests from the DMA engine are exposed at the top level + to be connected to a DRAM model or an AXI adapter. + - Boundary data ports for multi-CGRA configurations are also passed through + if enabled. 
+ """ + + def construct(s, CgraPayloadType, + multi_cgra_rows, + multi_cgra_columns, + per_cgra_rows, per_cgra_columns, + ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, + num_registers_per_reg_bank, num_ctrl, + total_steps, mem_access_is_combinational, + FunctionUnit, FuList, TileList, LinkList, + dataSPM, controller2addr_map, idTo2d_map, + is_multi_cgra = True, cgra_id = 0, + # For heterogeneous multi-cgra support.(maybe remove it in CgraDmaRTL for simplicity?) + provided_max_per_cgra_rows = None, + provided_max_per_cgra_cols = None, + provided_max_num_rd_tiles = None, + provided_max_num_wr_tiles = None): + + DataType = CgraPayloadType.get_field_type(kAttrData) + data_bitwidth = DataType.get_field_type(kAttrPayload).nbits + assert data_bitwidth == 32 + + max_per_cgra_rows = provided_max_per_cgra_rows if provided_max_per_cgra_rows is not None else per_cgra_rows + max_per_cgra_cols = provided_max_per_cgra_cols if provided_max_per_cgra_cols is not None else per_cgra_columns + max_num_tiles = max_per_cgra_rows * max_per_cgra_cols + max_num_rd_tiles = provided_max_num_rd_tiles if provided_max_num_rd_tiles is not None else dataSPM.getNumOfValidReadPorts() + + CtrlPktType = mk_intra_cgra_pkt(multi_cgra_columns, multi_cgra_rows, + max_num_tiles, CgraPayloadType) + NocPktType = mk_inter_cgra_pkt(multi_cgra_columns, multi_cgra_rows, + max_num_tiles, max_num_rd_tiles, + CgraPayloadType) + + CgraIdType = mk_cgra_id_type(multi_cgra_columns, multi_cgra_rows) + DataAddrType = mk_bits(clog2(data_mem_size_global)) + DmaOpcodeType = mk_bits(3) + DmaDramAddrType = mk_bits(64) + DmaBytesType = mk_bits(32) + DmaTagType = mk_bits(8) + DmaMemDataType = mk_bits(128) + DmaMemMaskType = mk_bits(16) + + # Existing CGRA-facing interfaces. 
+ + s.recv_from_cpu_pkt = RecvIfcRTL(CtrlPktType) + s.send_to_cpu_pkt = SendIfcRTL(CtrlPktType) + + if is_multi_cgra: + s.recv_from_inter_cgra_noc = RecvIfcRTL(NocPktType) + s.send_to_inter_cgra_noc = SendIfcRTL(NocPktType) + + s.recv_data_on_boundary_north = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_cols)] + s.send_data_on_boundary_north = [SendIfcRTL(DataType) for _ in range(max_per_cgra_cols)] + s.recv_data_on_boundary_south = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_cols)] + s.send_data_on_boundary_south = [SendIfcRTL(DataType) for _ in range(max_per_cgra_cols)] + s.recv_data_on_boundary_west = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_rows)] + s.send_data_on_boundary_west = [SendIfcRTL(DataType) for _ in range(max_per_cgra_rows)] + s.recv_data_on_boundary_east = [RecvIfcRTL(DataType) for _ in range(max_per_cgra_rows)] + s.send_data_on_boundary_east = [SendIfcRTL(DataType) for _ in range(max_per_cgra_rows)] + + s.cgra_id = InPort(CgraIdType) + s.address_lower = InPort(DataAddrType) + s.address_upper = InPort(DataAddrType) + + # DMA command/done and abstract external memory interfaces. 
+ + s.dma_cmd_val = InPort() + s.dma_cmd_rdy = OutPort() + s.dma_cmd_opcode = InPort(DmaOpcodeType) + s.dma_cmd_dram_addr = InPort(DmaDramAddrType) + s.dma_cmd_spm_addr = InPort(DataAddrType) + s.dma_cmd_bytes = InPort(DmaBytesType) + s.dma_cmd_tag = InPort(DmaTagType) + + s.dma_done_val = OutPort() + s.dma_done_rdy = InPort() + s.dma_done_tag = OutPort(DmaTagType) + + s.mem_rd_req_val = OutPort() + s.mem_rd_req_rdy = InPort() + s.mem_rd_req_addr = OutPort(DmaDramAddrType) + + s.mem_rd_resp_val = InPort() + s.mem_rd_resp_rdy = OutPort() + s.mem_rd_resp_data = InPort(DmaMemDataType) + + s.mem_wr_req_val = OutPort() + s.mem_wr_req_rdy = InPort() + s.mem_wr_req_addr = OutPort(DmaDramAddrType) + s.mem_wr_req_data = OutPort(DmaMemDataType) + s.mem_wr_req_mask = OutPort(DmaMemMaskType) + + s.mem_wr_resp_val = InPort() + s.mem_wr_resp_rdy = OutPort() + + # Components. + + s.cgra = CgraTemplateRTL(CgraPayloadType, + multi_cgra_rows, + multi_cgra_columns, + per_cgra_rows, per_cgra_columns, + ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, + num_registers_per_reg_bank, num_ctrl, + total_steps, mem_access_is_combinational, + FunctionUnit, FuList, TileList, LinkList, + dataSPM, controller2addr_map, idTo2d_map, + is_multi_cgra, cgra_id, + provided_max_per_cgra_rows, + provided_max_per_cgra_cols, + provided_max_num_rd_tiles, + provided_max_num_wr_tiles, + has_dma_ports = True) + + s.dma = DmaEngineRTL(spm_data_nbits = data_bitwidth, + spm_addr_nbits = clog2(data_mem_size_global)) + + # CGRA passthrough connections. 
+ + s.recv_from_cpu_pkt //= s.cgra.recv_from_cpu_pkt + s.send_to_cpu_pkt //= s.cgra.send_to_cpu_pkt + + if is_multi_cgra: + s.recv_from_inter_cgra_noc //= s.cgra.recv_from_inter_cgra_noc + s.send_to_inter_cgra_noc //= s.cgra.send_to_inter_cgra_noc + + for i in range(max_per_cgra_cols): + s.recv_data_on_boundary_north[i] //= s.cgra.recv_data_on_boundary_north[i] + s.send_data_on_boundary_north[i] //= s.cgra.send_data_on_boundary_north[i] + s.recv_data_on_boundary_south[i] //= s.cgra.recv_data_on_boundary_south[i] + s.send_data_on_boundary_south[i] //= s.cgra.send_data_on_boundary_south[i] + + for i in range(max_per_cgra_rows): + s.recv_data_on_boundary_west[i] //= s.cgra.recv_data_on_boundary_west[i] + s.send_data_on_boundary_west[i] //= s.cgra.send_data_on_boundary_west[i] + s.recv_data_on_boundary_east[i] //= s.cgra.recv_data_on_boundary_east[i] + s.send_data_on_boundary_east[i] //= s.cgra.send_data_on_boundary_east[i] + + s.cgra_id //= s.cgra.cgra_id + s.address_lower //= s.cgra.address_lower + s.address_upper //= s.cgra.address_upper + + # DMA top-level connections. 
+ + s.dma_cmd_val //= s.dma.dma_cmd_val + s.dma_cmd_rdy //= s.dma.dma_cmd_rdy + s.dma_cmd_opcode //= s.dma.dma_cmd_opcode + s.dma_cmd_dram_addr //= s.dma.dma_cmd_dram_addr + s.dma_cmd_spm_addr //= s.dma.dma_cmd_spm_addr + s.dma_cmd_bytes //= s.dma.dma_cmd_bytes + s.dma_cmd_tag //= s.dma.dma_cmd_tag + + s.dma_done_val //= s.dma.dma_done_val + s.dma_done_rdy //= s.dma.dma_done_rdy + s.dma_done_tag //= s.dma.dma_done_tag + + s.mem_rd_req_val //= s.dma.mem_rd_req_val + s.mem_rd_req_rdy //= s.dma.mem_rd_req_rdy + s.mem_rd_req_addr //= s.dma.mem_rd_req_addr + + s.mem_rd_resp_val //= s.dma.mem_rd_resp_val + s.mem_rd_resp_rdy //= s.dma.mem_rd_resp_rdy + s.mem_rd_resp_data //= s.dma.mem_rd_resp_data + + s.mem_wr_req_val //= s.dma.mem_wr_req_val + s.mem_wr_req_rdy //= s.dma.mem_wr_req_rdy + s.mem_wr_req_addr //= s.dma.mem_wr_req_addr + s.mem_wr_req_data //= s.dma.mem_wr_req_data + s.mem_wr_req_mask //= s.dma.mem_wr_req_mask + + s.mem_wr_resp_val //= s.dma.mem_wr_resp_val + s.mem_wr_resp_rdy //= s.dma.mem_wr_resp_rdy + + # DMA to SPM connections. 
+ + s.dma.spm_dma_wval //= s.cgra.spm_dma_wval + s.dma.spm_dma_wrdy //= s.cgra.spm_dma_wrdy + s.dma.spm_dma_waddr //= s.cgra.spm_dma_waddr + s.dma.spm_dma_wdata //= s.cgra.spm_dma_wdata + s.dma.spm_dma_wmask //= s.cgra.spm_dma_wmask + + s.dma.spm_dma_rval //= s.cgra.spm_dma_rval + s.dma.spm_dma_rrdy //= s.cgra.spm_dma_rrdy + s.dma.spm_dma_raddr //= s.cgra.spm_dma_raddr + s.dma.spm_dma_rresp_val //= s.cgra.spm_dma_rresp_val + s.dma.spm_dma_rresp_rdy //= s.cgra.spm_dma_rresp_rdy + s.dma.spm_dma_rresp_data //= s.cgra.spm_dma_rresp_data + + def line_trace(s): + return f"{s.dma.line_trace()} || {s.cgra.line_trace()}" From 46cfb8e52f74ce27cce16b51005469e7cbe7210c Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 2 Jun 2026 17:34:32 +0800 Subject: [PATCH 06/46] [Script] Add the local_CI script file --- local_CI.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 local_CI.py diff --git a/local_CI.py b/local_CI.py new file mode 100644 index 00000000..f35198f8 --- /dev/null +++ b/local_CI.py @@ -0,0 +1,77 @@ +""" +local_CI.py is a script that runs the CI tests locally. +Usage: +```shell +cd /path/to/VectorCGRA/ +mkdir -p build && cd build +python3 ../local_CI.py +``` +The log will be saved to `local_CI.log` next to this script.
+""" +import subprocess +import os +import sys + +def run_tests(): + current_dir = os.path.dirname(os.path.abspath(__file__)) + log_file = os.path.join(current_dir, "local_CI.log") + + commands = [ + ["pytest", "..", "-v", "--tb=short"], + ["pytest", "../mem/ctrl/test/CtrlMemDynamicRTL_test.py", "-xvs"], + ["pytest", "../tile/test/TileRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../controller/test/ControllerRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../cgra/test/CgraTemplateRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../cgra/test/CgraRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../noc/PyOCN/pymtl3_net/ringnet/test/RingNetworkRTL_test.py"], + ["pytest", "../multi_cgra/test/RingMultiCgraRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_verilog_homo_2x2_4x4", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../mem/const/test/ConstQueueDynamicRTL_test.py", "-xvs"], + ["pytest", "../mem/data/test/DataMemControllerRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../multi_cgra/test/MeshMultiCgraTemplateRTL_test.py", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_multi_CGRA_fir_scalar_translation", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_multi_CGRA_fir_vector_global_reduce_translation", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"], + ["pytest", "../multi_cgra/test/MeshMultiCgraRTL_test.py::test_multi_CGRA_systolic_2x2_2x2_translation", "-xvs", "--test-verilog", "--dump-vtb", "--dump-vcd"] + ] + + with open(log_file, "w", encoding="utf-8") as f: + for cmd in commands: + cmd_str = " ".join(cmd) + header = f"\n{'='*80}\nExecuting: 
{cmd_str}\n{'='*80}\n" + + print(header) + f.write(header) + f.flush() + + try: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1 + ) + + for line in process.stdout: + print(line, end="") + f.write(line) + + process.wait() + + if process.returncode == 0: + status = f"\nSUCCESS: {cmd_str}\n" + else: + status = f"\nFAILED (Exit Code {process.returncode}): {cmd_str}\n" + + print(status) + f.write(status) + + except Exception as e: + error_msg = f"\nERROR executing {cmd_str}: {str(e)}\n" + print(error_msg) + f.write(error_msg) + + print(f"\n\nAll tests completed. Log saved to: {os.path.abspath(log_file)}") + +if __name__ == "__main__": + run_tests() \ No newline at end of file From 3d4ec4eb9680d37b5d4b4b8c6af0d9ccf258a029 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 2 Jun 2026 20:22:46 +0800 Subject: [PATCH 07/46] Update .gitignore to ignore the log file --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 305e025f..b41e10c2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ build __pycache__ .hypothesis .vscode +*.log \ No newline at end of file From e90d45b6258966a275e2161627363de6d8e108ea Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 2 Jun 2026 20:31:09 +0800 Subject: [PATCH 08/46] [Test] Add the test for CgraDmaRTL --- cgra/test/CgraDmaRTL_test.py | 231 +++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 cgra/test/CgraDmaRTL_test.py diff --git a/cgra/test/CgraDmaRTL_test.py b/cgra/test/CgraDmaRTL_test.py new file mode 100644 index 00000000..22189865 --- /dev/null +++ b/cgra/test/CgraDmaRTL_test.py @@ -0,0 +1,231 @@ +""" +========================================================================== +CgraDmaRTL_test.py +========================================================================== +""" + +from pymtl3 import * + +from ..CgraDmaRTL import CgraDmaRTL +from ...fu.single.AdderRTL import AdderRTL 
+from ...fu.single.MemUnitRTL import MemUnitRTL +from ...fu.single.RetRTL import RetRTL +from ...lib.messages import * +from ...lib.opt_type import * +from ...lib.util.cgra.DataSPM import DataSPM +from ...lib.util.cgra.Tile import Tile +from ...lib.util.cgra.cgra_helper import get_links +from ...mem.dma.DmaEngineRTL import DMA_MVIN, DMA_MVOUT + + +def test_cgra_dma_mvin_to_local_spm(): + """ + Integration test for the CgraDmaRTL wrapper. + It simulates a DMA MVIN command that moves data from external DRAM into + the CGRA's dataSPM. It then checks the SPM contents to ensure the + transfer was successful. + """ + ctrl_mem_size = 8 + data_mem_size_global = 64 + data_mem_size_per_bank = 16 + num_banks_per_cgra = 4 + num_registers_per_reg_bank = 16 + num_ctrl = 1 + total_steps = 1 + + DataType = mk_data(32, 1) + WordType = mk_bits(32) + DataAddrType = mk_bits(clog2(data_mem_size_global)) + CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) + CtrlType = mk_ctrl(4, 2, 8, 8, num_registers_per_reg_bank) + CgraPayloadType = mk_cgra_payload(DataType, DataAddrType, CtrlType, CtrlAddrType) + CtrlPktType = mk_intra_cgra_pkt(1, 1, 4, CgraPayloadType) + + # 2x2 tiles + tiles_2d = [[Tile(x, y, num_registers_per_reg_bank, ["add", "mem", "return"]) + for x in range(2)] for y in range(2)] + TileList = [t for row in tiles_2d for t in row] + LinkList = get_links(tiles_2d) + # The first row and the first column of the 2x2 tiles are connected to the data SPM. 
+ dataSPM = DataSPM(3, 3) + + dut = CgraDmaRTL(CgraPayloadType, + 1, 1, # multi_cgra_rows, multi_cgra_columns + 2, 2, # per_cgra_rows, per_cgra_columns + ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, + num_registers_per_reg_bank, num_ctrl, + total_steps, True, + None, [AdderRTL, MemUnitRTL, RetRTL], + TileList, LinkList, dataSPM, + {0: [0, 15]}, # controller to address map + {0: [0, 0]}, # cgra id to 2D coordinate + is_multi_cgra = False) + + dut.apply(DefaultPassGroup()) + dut.sim_reset() + + dut.cgra_id @= 0 + # Address range: [0:15] + dut.address_lower @= DataAddrType(0) + dut.address_upper @= DataAddrType(15) + + dut.recv_from_cpu_pkt.val @= 0 + dut.recv_from_cpu_pkt.msg @= CtrlPktType() + dut.send_to_cpu_pkt.rdy @= 1 + dut.mem_rd_req_rdy @= 1 + dut.mem_rd_resp_val @= 0 + dut.mem_rd_resp_data @= 0 + dut.mem_wr_req_rdy @= 1 + dut.mem_wr_resp_val @= 0 + dut.dma_done_rdy @= 1 + + dut.dma_cmd_val @= 1 + dut.dma_cmd_opcode @= DMA_MVIN + # Read the data of DRAM from address 0x1000(16 bytes in total), + # then write the data to SPM from address 0x0 to 0x3. + dut.dma_cmd_dram_addr @= 0x1000 + dut.dma_cmd_spm_addr @= DataAddrType(0) + dut.dma_cmd_bytes @= 16 + dut.dma_cmd_tag @= 0x33 + dut.sim_eval_combinational() + assert dut.dma_cmd_rdy + dut.sim_tick() + dut.dma_cmd_val @= 0 + + beat = concat(WordType(0x44444444), WordType(0x33333333), + WordType(0x22222222), WordType(0x11111111)) + pending_resp = False + + for _ in range(40): + dut.mem_rd_resp_val @= 0 + if pending_resp: + dut.mem_rd_resp_val @= 1 + dut.mem_rd_resp_data @= beat + + dut.sim_eval_combinational() + + pending_resp = bool(dut.mem_rd_req_val & dut.mem_rd_req_rdy) + + if dut.dma_done_val: + # Transfer finished, check the tag. 
+ assert int(dut.dma_done_tag) == 0x33 + break + + dut.sim_tick() + + assert dut.dma_done_val + assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[0] == DataType(0x11111111, 1, 0, 0) + assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[1] == DataType(0x22222222, 1, 0, 0) + assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[2] == DataType(0x33333333, 1, 0, 0) + assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[3] == DataType(0x44444444, 1, 0, 0) + + +def test_cgra_dma_mvout_from_local_spm(): + """ + Integration test for the CgraDmaRTL wrapper. + It simulates a DMA MVOUT command that moves data from the local SPM + into external DRAM. + """ + ctrl_mem_size = 8 + data_mem_size_global = 64 + data_mem_size_per_bank = 16 + num_banks_per_cgra = 4 + num_registers_per_reg_bank = 16 + num_ctrl = 1 + total_steps = 1 + + DataType = mk_data(32, 1) + WordType = mk_bits(32) + DataAddrType = mk_bits(clog2(data_mem_size_global)) + CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) + CtrlType = mk_ctrl(4, 2, 8, 8, num_registers_per_reg_bank) + CgraPayloadType = mk_cgra_payload(DataType, DataAddrType, CtrlType, CtrlAddrType) + CtrlPktType = mk_intra_cgra_pkt(1, 1, 4, CgraPayloadType) + NocPktType = mk_inter_cgra_pkt(1, 1, 4, 3, CgraPayloadType) + + tiles_2d = [[Tile(x, y, num_registers_per_reg_bank, ["add", "mem", "return"]) + for x in range(2)] for y in range(2)] + TileList = [t for row in tiles_2d for t in row] + LinkList = get_links(tiles_2d) + dataSPM = DataSPM(3, 3) + + dut = CgraDmaRTL(CgraPayloadType, + 1, 1, # multi_cgra_rows, multi_cgra_columns + 2, 2, # per_cgra_rows, per_cgra_columns + ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, + num_registers_per_reg_bank, num_ctrl, + total_steps, True, + None, [AdderRTL, MemUnitRTL, RetRTL], + TileList, LinkList, dataSPM, + {0: [0, 15]}, # controller to address map + {0: [0, 0]}, # cgra id to 2D coordinate + is_multi_cgra = False) + + dut.apply(DefaultPassGroup()) + dut.sim_reset() + + # 
Pre-load SPM with data + dut.cgra.data_mem.memory_wrapper[0].memory.regs[0] <<= DataType(0x11111111, 1, 0, 0) + dut.cgra.data_mem.memory_wrapper[0].memory.regs[1] <<= DataType(0x22222222, 1, 0, 0) + dut.cgra.data_mem.memory_wrapper[0].memory.regs[2] <<= DataType(0x33333333, 1, 0, 0) + dut.cgra.data_mem.memory_wrapper[0].memory.regs[3] <<= DataType(0x44444444, 1, 0, 0) + dut.sim_tick() + + dut.cgra_id @= 0 + # Address range: [0:15] + dut.address_lower @= DataAddrType(0) + dut.address_upper @= DataAddrType(15) + + dut.recv_from_cpu_pkt.val @= 0 + dut.recv_from_cpu_pkt.msg @= CtrlPktType() + dut.send_to_cpu_pkt.rdy @= 1 + dut.mem_rd_req_rdy @= 1 + dut.mem_rd_resp_val @= 0 + dut.mem_rd_resp_data @= 0 + dut.mem_wr_req_rdy @= 1 + dut.mem_wr_resp_val @= 0 + dut.dma_done_rdy @= 1 + + # Issue DMA MVOUT command + dut.dma_cmd_val @= 1 + dut.dma_cmd_opcode @= DMA_MVOUT + # Read the data of SPM from address 0x0 to 0x3(16 bytes in total), + # then write the data to DRAM address 0x2000. + dut.dma_cmd_dram_addr @= 0x2000 + dut.dma_cmd_spm_addr @= DataAddrType(0) + dut.dma_cmd_bytes @= 16 + dut.dma_cmd_tag @= 0x44 + dut.sim_eval_combinational() + assert dut.dma_cmd_rdy + dut.sim_tick() + dut.dma_cmd_val @= 0 + + # Expected 128-bit beat + expected_beat = concat(WordType(0x44444444), WordType(0x33333333), + WordType(0x22222222), WordType(0x11111111)) + + done = False + pending_wr_resp = False + for _ in range(40): + dut.mem_wr_resp_val @= 0 + if pending_wr_resp: + dut.mem_wr_resp_val @= 1 + pending_wr_resp = False + + if dut.mem_wr_req_val: + assert dut.mem_wr_req_addr == 0x2000 + assert dut.mem_wr_req_data == expected_beat + pending_wr_resp = True + + dut.sim_eval_combinational() + + if dut.dma_done_val: + assert int(dut.dma_done_tag) == 0x44 + done = True + break + + dut.sim_tick() + + assert done From 8fe1e7607265c34876d0dd78f7225ece95f2e622 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 2 Jun 2026 21:17:02 +0800 Subject: [PATCH 09/46] [Fix] Fix the bit mismatch error 
between dma_idx and num_xbar_in_ports. --- mem/data/DataMemControllerRTL.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/mem/data/DataMemControllerRTL.py b/mem/data/DataMemControllerRTL.py index 4f017d53..7feb7d4c 100644 --- a/mem/data/DataMemControllerRTL.py +++ b/mem/data/DataMemControllerRTL.py @@ -287,8 +287,13 @@ def assemble_xbar_pkt(): num_wr_tiles) # remote_src_port if has_dma_ports: - dma_rd_idx = num_rd_tiles + 1 - dma_wr_idx = num_wr_tiles + 1 + + # When `has_dma_ports` is True, num_xbar_in_wr_ports = num_wr_tiles + 1 + 1(dma_port_offset). + # Use dma_wr_idx = num_wr_tiles + 1 = num_xbar_in_wr_ports - 1 + # NOTE Don't use `dma_wr_idx = num_wr_tiles + 1` here since it will cause the bit mismatch error + # between `dma_wr_idx` and `num_xbar_in_wr_ports`. + dma_rd_idx = num_xbar_in_rd_ports - 1 + dma_wr_idx = num_xbar_in_wr_ports - 1 recv_raddr_from_dma = s.spm_dma_raddr if (recv_raddr_from_dma >= s.address_lower) & (recv_raddr_from_dma <= s.address_upper): @@ -411,7 +416,11 @@ def update_all(): s.recv_from_noc_load_request.rdy @= s.read_crossbar.recv[num_rd_tiles].rdy if has_dma_ports: - dma_rd_idx = num_rd_tiles + 1 + # When `has_dma_ports` is True, num_xbar_in_rd_ports = num_rd_tiles + 1 + 1(dma_port_offset). + # Use dma_rd_idx = num_rd_tiles + 1 = num_xbar_in_rd_ports - 1 + # NOTE Don't use `dma_rd_idx = num_rd_tiles + 1` here since it will cause the bit mismatch error + # between `dma_rd_idx` and `num_xbar_in_rd_ports`. + dma_rd_idx = num_xbar_in_rd_ports - 1 s.read_crossbar.recv[dma_rd_idx].val @= s.spm_dma_rval s.read_crossbar.recv[dma_rd_idx].msg @= s.rd_pkt[dma_rd_idx] s.spm_dma_rrdy @= s.read_crossbar.recv[dma_rd_idx].rdy @@ -427,7 +436,11 @@ def update_all(): s.recv_from_noc_store_request.rdy @= s.write_crossbar.recv[num_wr_tiles].rdy if has_dma_ports: - dma_wr_idx = num_wr_tiles + 1 + # When `has_dma_ports` is True, num_xbar_in_wr_ports = num_wr_tiles + 1 + 1(dma_port_offset). 
+ # Use dma_wr_idx = num_wr_tiles + 1 = num_xbar_in_wr_ports - 1 + # NOTE Don't use `dma_wr_idx = num_wr_tiles + 1` here since it will cause the bit mismatch error + # between `dma_wr_idx` and `num_xbar_in_wr_ports`. + dma_wr_idx = num_xbar_in_wr_ports - 1 s.write_crossbar.recv[dma_wr_idx].val @= s.spm_dma_wval s.write_crossbar.recv[dma_wr_idx].msg @= s.wr_pkt[dma_wr_idx] s.spm_dma_wrdy @= s.write_crossbar.recv[dma_wr_idx].rdy From bf28acc1a3d07ffc52129d42e3b744c834db0dea Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 2 Jun 2026 21:47:03 +0800 Subject: [PATCH 10/46] [Doc] Add some comments --- cgra/CgraDmaRTL.py | 30 ++++++++++++++++-------------- cgra/CgraTemplateRTL.py | 4 +++- cgra/test/CgraDmaRTL_test.py | 3 ++- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/cgra/CgraDmaRTL.py b/cgra/CgraDmaRTL.py index 8a950f61..6b0160ad 100644 --- a/cgra/CgraDmaRTL.py +++ b/cgra/CgraDmaRTL.py @@ -68,15 +68,15 @@ def construct(s, CgraPayloadType, CgraIdType = mk_cgra_id_type(multi_cgra_columns, multi_cgra_rows) DataAddrType = mk_bits(clog2(data_mem_size_global)) - DmaOpcodeType = mk_bits(3) + DmaOpcodeType = mk_bits(3) #DMA_MVIN: 0, DMA_MVOUT: 1 DmaDramAddrType = mk_bits(64) DmaBytesType = mk_bits(32) DmaTagType = mk_bits(8) - DmaMemDataType = mk_bits(128) + DmaMemDataType = mk_bits(128) # Write/Read 128 bits data per beat from/to DRAM DmaMemMaskType = mk_bits(16) # Existing CGRA-facing interfaces. - + # CGRA <-> CPU s.recv_from_cpu_pkt = RecvIfcRTL(CtrlPktType) s.send_to_cpu_pkt = SendIfcRTL(CtrlPktType) @@ -94,36 +94,38 @@ def construct(s, CgraPayloadType, s.send_data_on_boundary_east = [SendIfcRTL(DataType) for _ in range(max_per_cgra_rows)] s.cgra_id = InPort(CgraIdType) + # The local address range of current CGRA. + # Any address out of this range will be assumed as remote address. s.address_lower = InPort(DataAddrType) s.address_upper = InPort(DataAddrType) # DMA command/done and abstract external memory interfaces. 
- s.dma_cmd_val = InPort() - s.dma_cmd_rdy = OutPort() + s.dma_cmd_val = InPort() # dma_command_valid + s.dma_cmd_rdy = OutPort() # dma_command_ready s.dma_cmd_opcode = InPort(DmaOpcodeType) s.dma_cmd_dram_addr = InPort(DmaDramAddrType) s.dma_cmd_spm_addr = InPort(DataAddrType) - s.dma_cmd_bytes = InPort(DmaBytesType) - s.dma_cmd_tag = InPort(DmaTagType) + s.dma_cmd_bytes = InPort(DmaBytesType) # The number of bytes to transfer. + s.dma_cmd_tag = InPort(DmaTagType) # Doesn't use it now, but keep it for future use(e.g., distinguish different DMA commands). s.dma_done_val = OutPort() s.dma_done_rdy = InPort() - s.dma_done_tag = OutPort(DmaTagType) + s.dma_done_tag = OutPort(DmaTagType) # Must be same as the input `dma_cmd_tag` - s.mem_rd_req_val = OutPort() - s.mem_rd_req_rdy = InPort() + s.mem_rd_req_val = OutPort() # dma_read_request_valid + s.mem_rd_req_rdy = InPort() # dma_read_request_ready s.mem_rd_req_addr = OutPort(DmaDramAddrType) - s.mem_rd_resp_val = InPort() - s.mem_rd_resp_rdy = OutPort() - s.mem_rd_resp_data = InPort(DmaMemDataType) + s.mem_rd_resp_val = InPort() # dma_read_response_valid + s.mem_rd_resp_rdy = OutPort() # dma_read_response_ready + s.mem_rd_resp_data = InPort(DmaMemDataType) # dma_read_response_data s.mem_wr_req_val = OutPort() s.mem_wr_req_rdy = InPort() s.mem_wr_req_addr = OutPort(DmaDramAddrType) s.mem_wr_req_data = OutPort(DmaMemDataType) - s.mem_wr_req_mask = OutPort(DmaMemMaskType) + s.mem_wr_req_mask = OutPort(DmaMemMaskType) # Masks for writing DRAM s.mem_wr_resp_val = InPort() s.mem_wr_resp_rdy = OutPort() diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index f613485f..ce3c6416 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -142,12 +142,14 @@ def construct(s, CgraPayloadType, # to allow a top-level wrapper (like CgraDmaRTL) to connect a DMA engine # directly to the internal DataMemController. if has_dma_ports: - s.spm_dma_wval = InPort() + # DMA write request interface.
+ s.spm_dma_wval = InPort() # dma write request valid(write data into SPM) s.spm_dma_wrdy = OutPort() s.spm_dma_waddr = InPort(DataAddrType) s.spm_dma_wdata = InPort(DmaDataType) s.spm_dma_wmask = InPort(DmaMaskType) + # DMA read response interface. s.spm_dma_rval = InPort() s.spm_dma_rrdy = OutPort() s.spm_dma_raddr = InPort(DataAddrType) diff --git a/cgra/test/CgraDmaRTL_test.py b/cgra/test/CgraDmaRTL_test.py index 22189865..62e111fd 100644 --- a/cgra/test/CgraDmaRTL_test.py +++ b/cgra/test/CgraDmaRTL_test.py @@ -101,6 +101,7 @@ def test_cgra_dma_mvin_to_local_spm(): dut.mem_rd_resp_val @= 0 if pending_resp: dut.mem_rd_resp_val @= 1 + # Simulate the read response from DRAM. dut.mem_rd_resp_data @= beat dut.sim_eval_combinational() @@ -115,6 +116,7 @@ def test_cgra_dma_mvin_to_local_spm(): dut.sim_tick() assert dut.dma_done_val + # Check the data in the dataSPM. assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[0] == DataType(0x11111111, 1, 0, 0) assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[1] == DataType(0x22222222, 1, 0, 0) assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[2] == DataType(0x33333333, 1, 0, 0) @@ -142,7 +144,6 @@ def test_cgra_dma_mvout_from_local_spm(): CtrlType = mk_ctrl(4, 2, 8, 8, num_registers_per_reg_bank) CgraPayloadType = mk_cgra_payload(DataType, DataAddrType, CtrlType, CtrlAddrType) CtrlPktType = mk_intra_cgra_pkt(1, 1, 4, CgraPayloadType) - NocPktType = mk_inter_cgra_pkt(1, 1, 4, 3, CgraPayloadType) tiles_2d = [[Tile(x, y, num_registers_per_reg_bank, ["add", "mem", "return"]) for x in range(2)] for y in range(2)] From 8480563aae2a345bde41f21ee0a297d1c012545a Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Wed, 3 Jun 2026 17:19:29 +0800 Subject: [PATCH 11/46] [Fix] Fix the bit mismatch by type conversion --- mem/data/DataMemControllerRTL.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mem/data/DataMemControllerRTL.py b/mem/data/DataMemControllerRTL.py index 7feb7d4c..e7897422 100644
--- a/mem/data/DataMemControllerRTL.py +++ b/mem/data/DataMemControllerRTL.py @@ -32,6 +32,7 @@ from ...lib.messages import * from ...noc.PyOCN.pymtl3_net.xbar.XbarBypassQueueRTL import XbarBypassQueueRTL from ...lib.util.data_struct_attr import * +from ...lib.util.common import CHAR_BIT class DataMemControllerRTL(Component): """ @@ -75,7 +76,7 @@ def construct(s, AddrType = mk_bits(global_addr_nbits) PerBankAddrType = mk_bits(per_bank_addr_nbits) DmaDataType = DataType.get_field_type(kAttrPayload) - DmaMaskType = mk_bits(max(1, DmaDataType.nbits // 8)) + DmaMaskType = mk_bits(max(1, DmaDataType.nbits // CHAR_BIT)) NocRemoteSrcPortType = NocPktType.get_field_type(kAttrRemoteSrcPort) s.num_banks_per_cgra = num_banks_per_cgra s.has_dma_ports = has_dma_ports @@ -94,6 +95,7 @@ def construct(s, XbarOutRdType = mk_bits(clog2(num_xbar_out_rd_ports)) XbarOutWrType = mk_bits(clog2(num_xbar_out_wr_ports)) XbarInRdType = mk_bits(clog2(num_xbar_in_rd_ports)) + XbarInWrType = mk_bits(clog2(num_xbar_in_wr_ports)) MemReadPktType = \ mk_mem_access_pkt(DataType, num_xbar_in_rd_ports, @@ -292,8 +294,8 @@ def assemble_xbar_pkt(): # Use dma_wr_idx = num_wr_tiles + 1 = num_xbar_in_wr_ports - 1 # NOTE Don't use `dma_wr_idx = num_wr_tiles + 1` here since it will cause the bit mismatch error # between `dma_wr_idx` and `num_xbar_in_wr_ports`. - dma_rd_idx = num_xbar_in_rd_ports - 1 - dma_wr_idx = num_xbar_in_wr_ports - 1 + dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) + dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) recv_raddr_from_dma = s.spm_dma_raddr if (recv_raddr_from_dma >= s.address_lower) & (recv_raddr_from_dma <= s.address_upper): @@ -420,7 +422,7 @@ def update_all(): # Use dma_rd_idx = num_rd_tiles + 1 = num_xbar_in_rd_ports - 1 # NOTE Don't use `dma_rd_idx = num_rd_tiles + 1` here since it will cause the bit mismatch error # between `dma_rd_idx` and `num_xbar_in_rd_ports`. 
- dma_rd_idx = num_xbar_in_rd_ports - 1 + dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) s.read_crossbar.recv[dma_rd_idx].val @= s.spm_dma_rval s.read_crossbar.recv[dma_rd_idx].msg @= s.rd_pkt[dma_rd_idx] s.spm_dma_rrdy @= s.read_crossbar.recv[dma_rd_idx].rdy @@ -440,7 +442,7 @@ def update_all(): # Use dma_wr_idx = num_wr_tiles + 1 = num_xbar_in_wr_ports - 1 # NOTE Don't use `dma_wr_idx = num_wr_tiles + 1` here since it will cause the bit mismatch error # between `dma_wr_idx` and `num_xbar_in_wr_ports`. - dma_wr_idx = num_xbar_in_wr_ports - 1 + dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) s.write_crossbar.recv[dma_wr_idx].val @= s.spm_dma_wval s.write_crossbar.recv[dma_wr_idx].msg @= s.wr_pkt[dma_wr_idx] s.spm_dma_wrdy @= s.write_crossbar.recv[dma_wr_idx].rdy From 25c17cb2db2781480da7e8a5321084992428cde7 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Wed, 3 Jun 2026 17:20:52 +0800 Subject: [PATCH 12/46] Move some constant into common header file --- cgra/CgraTemplateRTL.py | 2 +- lib/util/common.py | 24 ++++++++++++++++++++++++ mem/dma/DmaEngineRTL.py | 29 +++++++---------------------- 3 files changed, 32 insertions(+), 23 deletions(-) diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index ce3c6416..eddfd105 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -128,7 +128,7 @@ def construct(s, CgraPayloadType, CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) DataAddrType = mk_bits(clog2(data_mem_size_global)) DmaDataType = DataType.get_field_type(kAttrPayload) - DmaMaskType = mk_bits(max(1, DmaDataType.nbits // 8)) + DmaMaskType = mk_bits(max(1, DmaDataType.nbits // CHAR_BIT)) assert(data_mem_size_per_bank * num_banks_per_cgra <= \ data_mem_size_global) diff --git a/lib/util/common.py b/lib/util/common.py index 51650d67..840cd7b6 100644 --- a/lib/util/common.py +++ b/lib/util/common.py @@ -65,3 +65,27 @@ READ_TOWARDS_FU = 1 READ_TOWARDS_ROUTING_XBAR = 2 READ_TOWARDS_BOTH = 3 + +############################ +# Constants for 
DMA engine. +############################ +# DMA Move In and Out +# DMA_MVIN : DRAM -> DMA Engine -> SPM +# DMA_MVOUT : SPM -> DMA Engine -> DRAM +DMA_MVIN = 0 +DMA_MVOUT = 1 + +# 1 byte = 8 bits +CHAR_BIT = 8 + +# State machine definitions of DMA engine. +StateType = mk_bits( 4 ) +STATE_IDLE = StateType( 0 ) # Waiting for a new DMA command +STATE_MVIN_REQ = StateType( 1 ) # MVIN: Issuing DRAM read request +STATE_MVIN_RESP = StateType( 2 ) # MVIN: Waiting for DRAM read response +STATE_MVIN_WRITE = StateType( 3 ) # MVIN: Writing unpacked words to SPM +STATE_MVOUT_READ = StateType( 4 ) # MVOUT: Issuing SPM read request +STATE_MVOUT_RESP = StateType( 5 ) # MVOUT: Receiving SPM read response and packing +STATE_MVOUT_WRITE = StateType( 6 ) # MVOUT: Issuing DRAM write request +STATE_MVOUT_WAIT = StateType( 7 ) # MVOUT: Waiting for DRAM write response +STATE_DONE = StateType( 8 ) # Signaling command completion diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index 41e2562b..f21ffd9d 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -9,11 +9,7 @@ from pymtl3 import * -# DMA Move In and Out -# DMA_MVIN : DRAM -> DMA Engine -> SPM -# DMA_MVOUT : SPM -> DMA Engine -> DRAM -DMA_MVIN = 0 -DMA_MVOUT = 1 +from lib.util.common import DMA_MVIN, DMA_MVOUT, CHAR_BIT, StateType, STATE_IDLE, STATE_MVIN_REQ, STATE_MVIN_RESP, STATE_MVIN_WRITE, STATE_MVOUT_READ, STATE_MVOUT_RESP, STATE_MVOUT_WRITE, STATE_MVOUT_WAIT, STATE_DONE class DmaEngineRTL( Component ): @@ -56,9 +52,9 @@ def construct( s, TagType = mk_bits( tag_nbits ) SpmDataType = mk_bits( spm_data_nbits ) MemDataType = mk_bits( mem_data_nbits ) - # Byte mask for SPM write; 1 byte = 8 bits - SpmMaskType = mk_bits( spm_data_nbits // 8 ) - MemMaskType = mk_bits( mem_data_nbits // 8 ) + # Byte mask for SPM write + SpmMaskType = mk_bits( spm_data_nbits // CHAR_BIT ) + MemMaskType = mk_bits( mem_data_nbits // CHAR_BIT ) # Command interface s.dma_cmd_val = InPort() @@ -112,17 +108,6 @@ def construct( s, 
s.spm_dma_rresp_data = InPort( SpmDataType ) # State machine definitions - StateType = mk_bits( 4 ) - - STATE_IDLE = StateType( 0 ) # Waiting for a new DMA command - STATE_MVIN_REQ = StateType( 1 ) # MVIN: Issuing DRAM read request - STATE_MVIN_RESP = StateType( 2 ) # MVIN: Waiting for DRAM read response - STATE_MVIN_WRITE = StateType( 3 ) # MVIN: Writing unpacked words to SPM - STATE_MVOUT_READ = StateType( 4 ) # MVOUT: Issuing SPM read request - STATE_MVOUT_RESP = StateType( 5 ) # MVOUT: Receiving SPM read response and packing - STATE_MVOUT_WRITE = StateType( 6 ) # MVOUT: Issuing DRAM write request - STATE_MVOUT_WAIT = StateType( 7 ) # MVOUT: Waiting for DRAM write response - STATE_DONE = StateType( 8 ) # Signaling command completion s.state = Wire( StateType ) s.state_next = Wire( StateType ) @@ -177,7 +162,7 @@ def comb_outputs(): s.spm_dma_wval @= s.state == STATE_MVIN_WRITE s.spm_dma_waddr @= s.spm_addr_reg - s.spm_dma_wmask @= SpmMaskType( (1 << (spm_data_nbits // 8)) - 1 ) # Write mask for SPM write; always be 0b1111 + s.spm_dma_wmask @= SpmMaskType( (1 << (spm_data_nbits // CHAR_BIT)) - 1 ) # Write mask for SPM write; always be 0b1111 if s.word_idx_reg == b2( 0 ): # Writes the first word of the beat to SPM s.spm_dma_wdata @= s.beat_reg[0:spm_data_nbits] @@ -226,7 +211,7 @@ def seq_state(): elif s.state == STATE_MVIN_REQ: # Issues a read request to DRAM. if s.mem_rd_req_val & s.mem_rd_req_rdy: - s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( mem_data_nbits // 8 ) + s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( mem_data_nbits // CHAR_BIT ) s.state_ff <<= STATE_MVIN_RESP elif s.state == STATE_MVIN_RESP: # Receives a response from DRAM. @@ -299,7 +284,7 @@ def seq_state(): elif s.state == STATE_MVOUT_WAIT: if s.mem_wr_resp_val & s.mem_wr_resp_rdy: # Turn to the +16 address after writing 16 bytes data. 
- s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( mem_data_nbits // 8 ) + s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( mem_data_nbits // CHAR_BIT ) s.beat_ff <<= MemDataType( 0 ) s.word_idx_ff <<= b2( 0 ) s.wr_mask_ff <<= MemMaskType( 0 ) From e59d782c1d3e6ab74342f0e4bd5e6cca7ce37406 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Wed, 3 Jun 2026 17:54:16 +0800 Subject: [PATCH 13/46] [Refactor] Wrap the signals between dma and dram with SendIfcRTL and RecvIfcRTL. Replace `mem` with `dram` for clarity. --- cgra/CgraDmaRTL.py | 46 +++++++++++---------------- cgra/test/CgraDmaRTL_test.py | 38 +++++++++++----------- lib/util/common.py | 1 + mem/dma/DmaEngineRTL.py | 53 +++++++++++++++---------------- mem/dma/test/DmaEngineRTL_test.py | 36 ++++++++++----------- 5 files changed, 81 insertions(+), 93 deletions(-) diff --git a/cgra/CgraDmaRTL.py b/cgra/CgraDmaRTL.py index 6b0160ad..27d04985 100644 --- a/cgra/CgraDmaRTL.py +++ b/cgra/CgraDmaRTL.py @@ -113,22 +113,17 @@ def construct(s, CgraPayloadType, s.dma_done_rdy = InPort() s.dma_done_tag = OutPort(DmaTagType) # Must be same as the input `dma_cmd_tag` - s.mem_rd_req_val = OutPort() # dma_read_request_valid - s.mem_rd_req_rdy = InPort() # dma_read_request_ready - s.mem_rd_req_addr = OutPort(DmaDramAddrType) + s.dram_rd_req = SendIfcRTL(DmaDramAddrType) + s.dram_rd_resp = RecvIfcRTL(DmaMemDataType) - s.mem_rd_resp_val = InPort() # dma_read_response_valid - s.mem_rd_resp_rdy = OutPort() # dma_read_response_ready - s.mem_rd_resp_data = InPort(DmaMemDataType) # dma_read_response_data + s.dram_wr_req_val = OutPort() + s.dram_wr_req_rdy = InPort() + s.dram_wr_req_addr = OutPort(DmaDramAddrType) + s.dram_wr_req_data = OutPort(DmaMemDataType) + s.dram_wr_req_mask = OutPort(DmaMemMaskType) # Masks for wrting DRAM - s.mem_wr_req_val = OutPort() - s.mem_wr_req_rdy = InPort() - s.mem_wr_req_addr = OutPort(DmaDramAddrType) - s.mem_wr_req_data = OutPort(DmaMemDataType) - s.mem_wr_req_mask = OutPort(DmaMemMaskType) # Masks 
for wrting DRAM - - s.mem_wr_resp_val = InPort() - s.mem_wr_resp_rdy = OutPort() + s.dram_wr_resp_val = InPort() + s.dram_wr_resp_rdy = OutPort() # Components. @@ -191,22 +186,17 @@ def construct(s, CgraPayloadType, s.dma_done_rdy //= s.dma.dma_done_rdy s.dma_done_tag //= s.dma.dma_done_tag - s.mem_rd_req_val //= s.dma.mem_rd_req_val - s.mem_rd_req_rdy //= s.dma.mem_rd_req_rdy - s.mem_rd_req_addr //= s.dma.mem_rd_req_addr - - s.mem_rd_resp_val //= s.dma.mem_rd_resp_val - s.mem_rd_resp_rdy //= s.dma.mem_rd_resp_rdy - s.mem_rd_resp_data //= s.dma.mem_rd_resp_data + s.dram_rd_req //= s.dma.dram_rd_req + s.dram_rd_resp //= s.dma.dram_rd_resp - s.mem_wr_req_val //= s.dma.mem_wr_req_val - s.mem_wr_req_rdy //= s.dma.mem_wr_req_rdy - s.mem_wr_req_addr //= s.dma.mem_wr_req_addr - s.mem_wr_req_data //= s.dma.mem_wr_req_data - s.mem_wr_req_mask //= s.dma.mem_wr_req_mask + s.dram_wr_req_val //= s.dma.dram_wr_req_val + s.dram_wr_req_rdy //= s.dma.dram_wr_req_rdy + s.dram_wr_req_addr //= s.dma.dram_wr_req_addr + s.dram_wr_req_data //= s.dma.dram_wr_req_data + s.dram_wr_req_mask //= s.dma.dram_wr_req_mask - s.mem_wr_resp_val //= s.dma.mem_wr_resp_val - s.mem_wr_resp_rdy //= s.dma.mem_wr_resp_rdy + s.dram_wr_resp_val //= s.dma.dram_wr_resp_val + s.dram_wr_resp_rdy //= s.dma.dram_wr_resp_rdy # DMA to SPM connections. 
diff --git a/cgra/test/CgraDmaRTL_test.py b/cgra/test/CgraDmaRTL_test.py index 62e111fd..c9d61e41 100644 --- a/cgra/test/CgraDmaRTL_test.py +++ b/cgra/test/CgraDmaRTL_test.py @@ -73,11 +73,11 @@ def test_cgra_dma_mvin_to_local_spm(): dut.recv_from_cpu_pkt.val @= 0 dut.recv_from_cpu_pkt.msg @= CtrlPktType() dut.send_to_cpu_pkt.rdy @= 1 - dut.mem_rd_req_rdy @= 1 - dut.mem_rd_resp_val @= 0 - dut.mem_rd_resp_data @= 0 - dut.mem_wr_req_rdy @= 1 - dut.mem_wr_resp_val @= 0 + dut.dram_rd_req.rdy @= 1 + dut.dram_rd_resp.val @= 0 + dut.dram_rd_resp.msg @= 0 + dut.dram_wr_req_rdy @= 1 + dut.dram_wr_resp_val @= 0 dut.dma_done_rdy @= 1 dut.dma_cmd_val @= 1 @@ -98,15 +98,15 @@ def test_cgra_dma_mvin_to_local_spm(): pending_resp = False for _ in range(40): - dut.mem_rd_resp_val @= 0 + dut.dram_rd_resp.val @= 0 if pending_resp: - dut.mem_rd_resp_val @= 1 + dut.dram_rd_resp.val @= 1 # Simulate the read response from DRAM. - dut.mem_rd_resp_data @= beat + dut.dram_rd_resp.msg @= beat dut.sim_eval_combinational() - pending_resp = bool(dut.mem_rd_req_val & dut.mem_rd_req_rdy) + pending_resp = bool(dut.dram_rd_req.val & dut.dram_rd_req.rdy) if dut.dma_done_val: # Transfer finished, check the tag. 
@@ -182,11 +182,11 @@ def test_cgra_dma_mvout_from_local_spm(): dut.recv_from_cpu_pkt.val @= 0 dut.recv_from_cpu_pkt.msg @= CtrlPktType() dut.send_to_cpu_pkt.rdy @= 1 - dut.mem_rd_req_rdy @= 1 - dut.mem_rd_resp_val @= 0 - dut.mem_rd_resp_data @= 0 - dut.mem_wr_req_rdy @= 1 - dut.mem_wr_resp_val @= 0 + dut.dram_rd_req.rdy @= 1 + dut.dram_rd_resp.val @= 0 + dut.dram_rd_resp.msg @= 0 + dut.dram_wr_req_rdy @= 1 + dut.dram_wr_resp_val @= 0 dut.dma_done_rdy @= 1 # Issue DMA MVOUT command @@ -210,14 +210,14 @@ def test_cgra_dma_mvout_from_local_spm(): done = False pending_wr_resp = False for _ in range(40): - dut.mem_wr_resp_val @= 0 + dut.dram_wr_resp_val @= 0 if pending_wr_resp: - dut.mem_wr_resp_val @= 1 + dut.dram_wr_resp_val @= 1 pending_wr_resp = False - if dut.mem_wr_req_val: - assert dut.mem_wr_req_addr == 0x2000 - assert dut.mem_wr_req_data == expected_beat + if dut.dram_wr_req_val: + assert dut.dram_wr_req_addr == 0x2000 + assert dut.dram_wr_req_data == expected_beat pending_wr_resp = True dut.sim_eval_combinational() diff --git a/lib/util/common.py b/lib/util/common.py index 840cd7b6..eedb056e 100644 --- a/lib/util/common.py +++ b/lib/util/common.py @@ -79,6 +79,7 @@ CHAR_BIT = 8 # State machine definitions of DMA engine. 
+from pymtl3 import mk_bits StateType = mk_bits( 4 ) STATE_IDLE = StateType( 0 ) # Waiting for a new DMA command STATE_MVIN_REQ = StateType( 1 ) # MVIN: Issuing DRAM read request diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index f21ffd9d..efb10827 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -8,8 +8,9 @@ """ from pymtl3 import * - -from lib.util.common import DMA_MVIN, DMA_MVOUT, CHAR_BIT, StateType, STATE_IDLE, STATE_MVIN_REQ, STATE_MVIN_RESP, STATE_MVIN_WRITE, STATE_MVOUT_READ, STATE_MVOUT_RESP, STATE_MVOUT_WRITE, STATE_MVOUT_WAIT, STATE_DONE +from ...lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL +from ...lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL +from ...lib.util.common import DMA_MVIN, DMA_MVOUT, CHAR_BIT, StateType, STATE_IDLE, STATE_MVIN_REQ, STATE_MVIN_RESP, STATE_MVIN_WRITE, STATE_MVOUT_READ, STATE_MVOUT_RESP, STATE_MVOUT_WRITE, STATE_MVOUT_WAIT, STATE_DONE class DmaEngineRTL( Component ): @@ -72,22 +73,18 @@ def construct( s, # Abstract external memory interface # Request to read from DRAM - s.mem_rd_req_val = OutPort() - s.mem_rd_req_rdy = InPort() - s.mem_rd_req_addr = OutPort( DramAddrType ) + s.dram_rd_req = SendIfcRTL( DramAddrType ) # Response from DRAM - s.mem_rd_resp_val = InPort() - s.mem_rd_resp_rdy = OutPort() - s.mem_rd_resp_data = InPort( MemDataType ) + s.dram_rd_resp = RecvIfcRTL( MemDataType ) # Request to write to DRAM - s.mem_wr_req_val = OutPort() - s.mem_wr_req_rdy = InPort() - s.mem_wr_req_addr = OutPort( DramAddrType ) - s.mem_wr_req_data = OutPort( MemDataType ) - s.mem_wr_req_mask = OutPort( MemMaskType ) - s.mem_wr_resp_val = InPort() - s.mem_wr_resp_rdy = OutPort() + s.dram_wr_req_val = OutPort() + s.dram_wr_req_rdy = InPort() + s.dram_wr_req_addr = OutPort( DramAddrType ) + s.dram_wr_req_data = OutPort( MemDataType ) + s.dram_wr_req_mask = OutPort( MemMaskType ) + s.dram_wr_resp_val = InPort() + s.dram_wr_resp_rdy = OutPort() # SPM interface # Request 
to write to SPM @@ -150,15 +147,15 @@ def comb_outputs(): s.dma_done_val @= s.state == STATE_DONE s.dma_done_tag @= s.tag_reg - s.mem_rd_req_val @= s.state == STATE_MVIN_REQ - s.mem_rd_req_addr @= s.dram_addr_reg - s.mem_rd_resp_rdy @= s.state == STATE_MVIN_RESP + s.dram_rd_req.val @= s.state == STATE_MVIN_REQ + s.dram_rd_req.msg @= s.dram_addr_reg + s.dram_rd_resp.rdy @= s.state == STATE_MVIN_RESP - s.mem_wr_req_val @= s.state == STATE_MVOUT_WRITE - s.mem_wr_req_addr @= s.dram_addr_reg - s.mem_wr_req_data @= s.beat_reg - s.mem_wr_req_mask @= s.wr_mask_reg - s.mem_wr_resp_rdy @= s.state == STATE_MVOUT_WAIT + s.dram_wr_req_val @= s.state == STATE_MVOUT_WRITE + s.dram_wr_req_addr @= s.dram_addr_reg + s.dram_wr_req_data @= s.beat_reg + s.dram_wr_req_mask @= s.wr_mask_reg + s.dram_wr_resp_rdy @= s.state == STATE_MVOUT_WAIT s.spm_dma_wval @= s.state == STATE_MVIN_WRITE s.spm_dma_waddr @= s.spm_addr_reg @@ -210,13 +207,13 @@ def seq_state(): s.state_ff <<= STATE_MVOUT_READ # Move to the next state: to issue a read request to SPM. elif s.state == STATE_MVIN_REQ: # Issues a read request to DRAM. - if s.mem_rd_req_val & s.mem_rd_req_rdy: + if s.dram_rd_req.val & s.dram_rd_req.rdy: s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( mem_data_nbits // CHAR_BIT ) s.state_ff <<= STATE_MVIN_RESP elif s.state == STATE_MVIN_RESP: # Receives a response from DRAM. - if s.mem_rd_resp_val & s.mem_rd_resp_rdy: - s.beat_ff <<= s.mem_rd_resp_data + if s.dram_rd_resp.val & s.dram_rd_resp.rdy: + s.beat_ff <<= s.dram_rd_resp.msg s.word_idx_ff <<= b2( 0 ) s.state_ff <<= STATE_MVIN_WRITE # Move to the next state: to write to SPM. 
@@ -278,11 +275,11 @@ def seq_state(): s.state_ff <<= STATE_MVOUT_READ elif s.state == STATE_MVOUT_WRITE: - if s.mem_wr_req_val & s.mem_wr_req_rdy: + if s.dram_wr_req_val & s.dram_wr_req_rdy: s.state_ff <<= STATE_MVOUT_WAIT elif s.state == STATE_MVOUT_WAIT: - if s.mem_wr_resp_val & s.mem_wr_resp_rdy: + if s.dram_wr_resp_val & s.dram_wr_resp_rdy: # Turn to the +16 address after writing 16 bytes data. s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( mem_data_nbits // CHAR_BIT ) s.beat_ff <<= MemDataType( 0 ) diff --git a/mem/dma/test/DmaEngineRTL_test.py b/mem/dma/test/DmaEngineRTL_test.py index 12392465..28f30cc1 100644 --- a/mem/dma/test/DmaEngineRTL_test.py +++ b/mem/dma/test/DmaEngineRTL_test.py @@ -22,11 +22,11 @@ def make_dut(): dut.dma_cmd_tag @= 0 dut.dma_done_rdy @= 1 - dut.mem_rd_req_rdy @= 1 - dut.mem_rd_resp_val @= 0 - dut.mem_rd_resp_data @= 0 - dut.mem_wr_req_rdy @= 1 - dut.mem_wr_resp_val @= 1 + dut.dram_rd_req.rdy @= 1 + dut.dram_rd_resp.val @= 0 + dut.dram_rd_resp.msg @= 0 + dut.dram_wr_req_rdy @= 1 + dut.dram_wr_resp_val @= 1 dut.spm_dma_wrdy @= 1 dut.spm_dma_rrdy @= 1 @@ -84,15 +84,15 @@ def test_dma_mvin_one_beat(): spm_writes = [] for _ in range(20): - dut.mem_rd_resp_val @= 0 + dut.dram_rd_resp.val @= 0 if pending_resp is not None: - dut.mem_rd_resp_val @= 1 - dut.mem_rd_resp_data @= pending_resp + dut.dram_rd_resp.val @= 1 + dut.dram_rd_resp.msg @= pending_resp dut.sim_eval_combinational() - if dut.mem_rd_req_val & dut.mem_rd_req_rdy: - pending_resp = dram[int(dut.mem_rd_req_addr)] + if dut.dram_rd_req.val & dut.dram_rd_req.rdy: + pending_resp = dram[int(dut.dram_rd_req.msg)] else: pending_resp = None @@ -155,10 +155,10 @@ def test_dma_mvout_partial_beat(): else: pending_rresp = None - if dut.mem_wr_req_val & dut.mem_wr_req_rdy: - mem_writes.append((int(dut.mem_wr_req_addr), - int(dut.mem_wr_req_data), - int(dut.mem_wr_req_mask))) + if dut.dram_wr_req_val & dut.dram_wr_req_rdy: + mem_writes.append((int(dut.dram_wr_req_addr), + 
int(dut.dram_wr_req_data), + int(dut.dram_wr_req_mask))) if dut.dma_done_val: assert int(dut.dma_done_tag) == 0xa5 @@ -212,10 +212,10 @@ def test_dma_mvout_full_beat(): else: pending_rresp = None - if dut.mem_wr_req_val & dut.mem_wr_req_rdy: - mem_writes.append((int(dut.mem_wr_req_addr), - int(dut.mem_wr_req_data), - int(dut.mem_wr_req_mask))) + if dut.dram_wr_req_val & dut.dram_wr_req_rdy: + mem_writes.append((int(dut.dram_wr_req_addr), + int(dut.dram_wr_req_data), + int(dut.dram_wr_req_mask))) if dut.dma_done_val: assert int(dut.dma_done_tag) == 0xa5 From 4b994dee8f3cf78d3f11b65610344c02299c85d2 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Sat, 13 Jun 2026 20:56:02 +0800 Subject: [PATCH 14/46] [Refactor] Update DMA command handling in CgraDmaRTL and CgraTemplateRTL. The DMA is connected to the data memory controller indirectly via the controller, with the decoding logic integrated into the controller. --- cgra/CgraDmaRTL.py | 46 +++---- cgra/CgraTemplateRTL.py | 78 ++++++++--- cgra/test/CgraDmaRTL_test.py | 133 ++++++++++++------ controller/ControllerRTL.py | 225 ++++++++++++++++++++++++++++--- lib/cmd_type.py | 21 ++- mem/data/DataMemControllerRTL.py | 13 +- 6 files changed, 404 insertions(+), 112 deletions(-) diff --git a/cgra/CgraDmaRTL.py b/cgra/CgraDmaRTL.py index 27d04985..daa5315d 100644 --- a/cgra/CgraDmaRTL.py +++ b/cgra/CgraDmaRTL.py @@ -25,9 +25,10 @@ class CgraDmaRTL( Component ): Architectural Design: - It instantiates a standard CGRA template (`CgraTemplateRTL`) and a DMA engine (`DmaEngineRTL`). - - The DMA engine is connected to the CGRA's internal data SPM through a - dedicated master port on the `DataMemControllerRTL`. - CPU control packets are passed through to the CGRA's controller. + DMA commands are decoded there. + - The DMA engine accesses the CGRA's internal data SPM through controller- + forwarded ports; it is not connected directly to `DataMemControllerRTL`. 
- External memory requests from the DMA engine are exposed at the top level to be connected to a DRAM model or an AXI adapter. - Boundary data ports for multi-CGRA configurations are also passed through @@ -68,10 +69,7 @@ def construct(s, CgraPayloadType, CgraIdType = mk_cgra_id_type(multi_cgra_columns, multi_cgra_rows) DataAddrType = mk_bits(clog2(data_mem_size_global)) - DmaOpcodeType = mk_bits(3) #DMA_MVIN: 0, DMA_MVOUT: 1 DmaDramAddrType = mk_bits(64) - DmaBytesType = mk_bits(32) - DmaTagType = mk_bits(8) DmaMemDataType = mk_bits(128) # Write/Read 128 bits data per beat from/to DRAM DmaMemMaskType = mk_bits(16) @@ -99,19 +97,7 @@ def construct(s, CgraPayloadType, s.address_lower = InPort(DataAddrType) s.address_upper = InPort(DataAddrType) - # DMA command/done and abstract external memory interfaces. - - s.dma_cmd_val = InPort() # dma_command_valid - s.dma_cmd_rdy = OutPort() # dma_command_ready - s.dma_cmd_opcode = InPort(DmaOpcodeType) - s.dma_cmd_dram_addr = InPort(DmaDramAddrType) - s.dma_cmd_spm_addr = InPort(DataAddrType) - s.dma_cmd_bytes = InPort(DmaBytesType) # The number of bytes to transfer. - s.dma_cmd_tag = InPort(DmaTagType) # Doesn't use it now, but keep it for future use(e.g., distinguish different DMA commands). - - s.dma_done_val = OutPort() - s.dma_done_rdy = InPort() - s.dma_done_tag = OutPort(DmaTagType) # Must be same as the input `dma_cmd_tag` + # Abstract external dram memory interfaces for the internal DMA engine. s.dram_rd_req = SendIfcRTL(DmaDramAddrType) s.dram_rd_resp = RecvIfcRTL(DmaMemDataType) @@ -172,19 +158,19 @@ def construct(s, CgraPayloadType, s.address_lower //= s.cgra.address_lower s.address_upper //= s.cgra.address_upper - # DMA top-level connections. + # Controller-decoded DMA command/done connections. 
- s.dma_cmd_val //= s.dma.dma_cmd_val - s.dma_cmd_rdy //= s.dma.dma_cmd_rdy - s.dma_cmd_opcode //= s.dma.dma_cmd_opcode - s.dma_cmd_dram_addr //= s.dma.dma_cmd_dram_addr - s.dma_cmd_spm_addr //= s.dma.dma_cmd_spm_addr - s.dma_cmd_bytes //= s.dma.dma_cmd_bytes - s.dma_cmd_tag //= s.dma.dma_cmd_tag + s.cgra.dma_cmd_val //= s.dma.dma_cmd_val + s.cgra.dma_cmd_rdy //= s.dma.dma_cmd_rdy + s.cgra.dma_cmd_opcode //= s.dma.dma_cmd_opcode + s.cgra.dma_cmd_dram_addr //= s.dma.dma_cmd_dram_addr + s.cgra.dma_cmd_spm_addr //= s.dma.dma_cmd_spm_addr + s.cgra.dma_cmd_bytes //= s.dma.dma_cmd_bytes + s.cgra.dma_cmd_tag //= s.dma.dma_cmd_tag - s.dma_done_val //= s.dma.dma_done_val - s.dma_done_rdy //= s.dma.dma_done_rdy - s.dma_done_tag //= s.dma.dma_done_tag + s.dma.dma_done_val //= s.cgra.dma_done_val + s.dma.dma_done_rdy //= s.cgra.dma_done_rdy + s.dma.dma_done_tag //= s.cgra.dma_done_tag s.dram_rd_req //= s.dma.dram_rd_req s.dram_rd_resp //= s.dma.dram_rd_resp @@ -198,7 +184,7 @@ def construct(s, CgraPayloadType, s.dram_wr_resp_val //= s.dma.dram_wr_resp_val s.dram_wr_resp_rdy //= s.dma.dram_wr_resp_rdy - # DMA to SPM connections. + # DMA to controller-forwarded SPM connections. 
s.dma.spm_dma_wval //= s.cgra.spm_dma_wval s.dma.spm_dma_wrdy //= s.cgra.spm_dma_wrdy diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index eddfd105..afeb44b3 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -129,6 +129,10 @@ def construct(s, CgraPayloadType, DataAddrType = mk_bits(clog2(data_mem_size_global)) DmaDataType = DataType.get_field_type(kAttrPayload) DmaMaskType = mk_bits(max(1, DmaDataType.nbits // CHAR_BIT)) + DmaOpcodeType = mk_bits(3) + DmaDramAddrType = mk_bits(64) + DmaBytesType = mk_bits(32) + DmaTagType = mk_bits(8) assert(data_mem_size_per_bank * num_banks_per_cgra <= \ data_mem_size_global) @@ -138,10 +142,21 @@ def construct(s, CgraPayloadType, s.recv_from_inter_cgra_noc = RecvIfcRTL(NocPktType) s.send_to_inter_cgra_noc = SendIfcRTL(NocPktType) - # Optional DMA interface ports. These are exposed at the template level - # to allow a top-level wrapper (like CgraDmaRTL) to connect a DMA engine - # directly to the internal DataMemController. + # Optional DMA engine-facing ports. The controller owns command decode and + # forwards DMA SPM access to the data memory. if has_dma_ports: + s.dma_cmd_val = OutPort() + s.dma_cmd_rdy = InPort() + s.dma_cmd_opcode = OutPort(DmaOpcodeType) + s.dma_cmd_dram_addr = OutPort(DmaDramAddrType) + s.dma_cmd_spm_addr = OutPort(DataAddrType) + s.dma_cmd_bytes = OutPort(DmaBytesType) + s.dma_cmd_tag = OutPort(DmaTagType) + + s.dma_done_val = InPort() + s.dma_done_rdy = OutPort() + s.dma_done_tag = InPort(DmaTagType) + # DMA write request interface. s.spm_dma_wval = InPort() # dma write request valid(write data into SPM) s.spm_dma_wrdy = OutPort() @@ -195,7 +210,8 @@ def construct(s, CgraPayloadType, s.cgra_id = InPort(CgraIdType) s.controller = ControllerRTL(NocPktType, multi_cgra_rows, multi_cgra_columns, - max_num_tiles, controller2addr_map, idTo2d_map) + max_num_tiles, controller2addr_map, idTo2d_map, + has_dma_ports) # Connects controller id. 
s.controller.cgra_id //= s.cgra_id # An additional router for controller to receive CMD_COMPLETE signal from Ring to CPU. @@ -214,20 +230,46 @@ def construct(s, CgraPayloadType, s.data_mem.address_upper //= s.address_upper if has_dma_ports: - # DMA_MVIN: dram -> dma -> spm - s.data_mem.spm_dma_wval //= s.spm_dma_wval - s.data_mem.spm_dma_wrdy //= s.spm_dma_wrdy - s.data_mem.spm_dma_waddr //= s.spm_dma_waddr - s.data_mem.spm_dma_wdata //= s.spm_dma_wdata - s.data_mem.spm_dma_wmask //= s.spm_dma_wmask - - # DMA_MVOUT: spm -> dma -> dram - s.data_mem.spm_dma_rval //= s.spm_dma_rval - s.data_mem.spm_dma_rrdy //= s.spm_dma_rrdy - s.data_mem.spm_dma_raddr //= s.spm_dma_raddr - s.data_mem.spm_dma_rresp_val //= s.spm_dma_rresp_val - s.data_mem.spm_dma_rresp_rdy //= s.spm_dma_rresp_rdy - s.data_mem.spm_dma_rresp_data //= s.spm_dma_rresp_data + # CPU packets are decoded by the controller before becoming DMA commands. + s.dma_cmd_val //= s.controller.dma_cmd_val + s.dma_cmd_rdy //= s.controller.dma_cmd_rdy + s.dma_cmd_opcode //= s.controller.dma_cmd_opcode + s.dma_cmd_dram_addr //= s.controller.dma_cmd_dram_addr + s.dma_cmd_spm_addr //= s.controller.dma_cmd_spm_addr + s.dma_cmd_bytes //= s.controller.dma_cmd_bytes + s.dma_cmd_tag //= s.controller.dma_cmd_tag + + s.dma_done_val //= s.controller.dma_done_val + s.dma_done_rdy //= s.controller.dma_done_rdy + s.dma_done_tag //= s.controller.dma_done_tag + + # DMA engine <-> controller side of the SPM path. 
+ s.spm_dma_wval //= s.controller.spm_dma_wval + s.spm_dma_wrdy //= s.controller.spm_dma_wrdy + s.spm_dma_waddr //= s.controller.spm_dma_waddr + s.spm_dma_wdata //= s.controller.spm_dma_wdata + s.spm_dma_wmask //= s.controller.spm_dma_wmask + + s.spm_dma_rval //= s.controller.spm_dma_rval + s.spm_dma_rrdy //= s.controller.spm_dma_rrdy + s.spm_dma_raddr //= s.controller.spm_dma_raddr + s.spm_dma_rresp_val //= s.controller.spm_dma_rresp_val + s.spm_dma_rresp_rdy //= s.controller.spm_dma_rresp_rdy + s.spm_dma_rresp_data //= s.controller.spm_dma_rresp_data + + # Controller <-> data memory side of the SPM path. + s.controller.send_to_mem_dma_wval //= s.data_mem.spm_dma_wval + s.controller.recv_from_mem_dma_wrdy //= s.data_mem.spm_dma_wrdy + s.controller.send_to_mem_dma_waddr //= s.data_mem.spm_dma_waddr + s.controller.send_to_mem_dma_wdata //= s.data_mem.spm_dma_wdata + s.controller.send_to_mem_dma_wmask //= s.data_mem.spm_dma_wmask + + s.controller.send_to_mem_dma_rval //= s.data_mem.spm_dma_rval + s.controller.recv_from_mem_dma_rrdy //= s.data_mem.spm_dma_rrdy + s.controller.send_to_mem_dma_raddr //= s.data_mem.spm_dma_raddr + s.controller.recv_from_mem_dma_rresp_val //= s.data_mem.spm_dma_rresp_val + s.controller.send_to_mem_dma_rresp_rdy //= s.data_mem.spm_dma_rresp_rdy + s.controller.recv_from_mem_dma_rresp_data //= s.data_mem.spm_dma_rresp_data # Connects data memory with controller. 
s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request diff --git a/cgra/test/CgraDmaRTL_test.py b/cgra/test/CgraDmaRTL_test.py index c9d61e41..4f4781bb 100644 --- a/cgra/test/CgraDmaRTL_test.py +++ b/cgra/test/CgraDmaRTL_test.py @@ -10,12 +10,91 @@ from ...fu.single.AdderRTL import AdderRTL from ...fu.single.MemUnitRTL import MemUnitRTL from ...fu.single.RetRTL import RetRTL +from ...lib.cmd_type import * from ...lib.messages import * from ...lib.opt_type import * from ...lib.util.cgra.DataSPM import DataSPM from ...lib.util.cgra.Tile import Tile from ...lib.util.cgra.cgra_helper import get_links -from ...mem.dma.DmaEngineRTL import DMA_MVIN, DMA_MVOUT + + +def issue_cpu_pkt(dut, pkt, max_cycles = 20): + """ + CPU issues a packet to the CGRA. + """ + dut.recv_from_cpu_pkt.val @= 1 + dut.recv_from_cpu_pkt.msg @= pkt + + for _ in range(max_cycles): + dut.sim_eval_combinational() + if dut.recv_from_cpu_pkt.rdy: + dut.sim_tick() + dut.recv_from_cpu_pkt.val @= 0 + dut.sim_eval_combinational() + return + dut.sim_tick() + + assert False, "CPU packet was not accepted by the CGRA" + + +def issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType, + dma_cmd, dram_addr, spm_addr, nbytes, tag): + + """ + Issues a DMA command to the CGRA. + Args: + dut: The CGRA instance. + CtrlPktType: The type of the control packet. + CgraPayloadType: The type of the CGRA payload. + DataType: The type of the data. + DataAddrType: The type of the data address. + + dma_cmd: The DMA command to issue.(CMD_DMA_MVIN or CMD_DMA_MVOUT) + dram_addr: The DRAM address to transfer data from or to.(64 bits) + spm_addr: The SPM address to transfer data from or to.(32 bits) + nbytes: The number of bytes to transfer. + tag: The tag of the DMA command. + """ + config_pkts = [ + # The bindwidth of dram address is 64 bits, so we need to split it into two 32 bits parts. + # Lower 32 bits are sent first. 
+ CtrlPktType(0, 0, payload = CgraPayloadType( + CMD_DMA_CONFIG_DRAM_ADDR_LO, + data = DataType(dram_addr & 0xffffffff, 1))), + + # Higher 32 bits are sent second. + CtrlPktType(0, 0, payload = CgraPayloadType( + CMD_DMA_CONFIG_DRAM_ADDR_HI, + data = DataType((dram_addr >> 32) & 0xffffffff, 1))), + + # The SPM address to read from or write to. + CtrlPktType(0, 0, payload = CgraPayloadType( + CMD_DMA_CONFIG_SPM_ADDR, + data_addr = DataAddrType(spm_addr))), + + # The number of bytes to transfer. + CtrlPktType(0, 0, payload = CgraPayloadType( + CMD_DMA_CONFIG_BYTES, + data = DataType(nbytes, 1))), + + # The tag of the DMA command. + CtrlPktType(0, 0, payload = CgraPayloadType( + CMD_DMA_CONFIG_TAG, + data = DataType(tag, 1))), + CtrlPktType(0, 0, payload = CgraPayloadType(dma_cmd)), + ] + + for pkt in config_pkts: + issue_cpu_pkt(dut, pkt) + + +def observed_dma_done(dut, expected_tag): + dut.sim_eval_combinational() + if dut.send_to_cpu_pkt.val and dut.send_to_cpu_pkt.msg.payload.cmd == CMD_DMA_DONE: + assert int(dut.send_to_cpu_pkt.msg.opaque) == expected_tag + assert int(dut.send_to_cpu_pkt.msg.payload.data.payload) == expected_tag + return True + return False def test_cgra_dma_mvin_to_local_spm(): @@ -78,20 +157,10 @@ def test_cgra_dma_mvin_to_local_spm(): dut.dram_rd_resp.msg @= 0 dut.dram_wr_req_rdy @= 1 dut.dram_wr_resp_val @= 0 - dut.dma_done_rdy @= 1 - - dut.dma_cmd_val @= 1 - dut.dma_cmd_opcode @= DMA_MVIN - # Read the data of DRAM from address 0x1000(16 bytes in total), - # then write the data to SPM from address 0x0 to 0x3. - dut.dma_cmd_dram_addr @= 0x1000 - dut.dma_cmd_spm_addr @= DataAddrType(0) - dut.dma_cmd_bytes @= 16 - dut.dma_cmd_tag @= 0x33 - dut.sim_eval_combinational() - assert dut.dma_cmd_rdy - dut.sim_tick() - dut.dma_cmd_val @= 0 + + # Read 16 bytes from DRAM address 0x1000 and write them to SPM words 0..3. 
+ issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType, + CMD_DMA_MVIN, 0x1000, 0, 16, 0x33) beat = concat(WordType(0x44444444), WordType(0x33333333), WordType(0x22222222), WordType(0x11111111)) @@ -108,14 +177,12 @@ def test_cgra_dma_mvin_to_local_spm(): pending_resp = bool(dut.dram_rd_req.val & dut.dram_rd_req.rdy) - if dut.dma_done_val: - # Transfer finished, check the tag. - assert int(dut.dma_done_tag) == 0x33 + if observed_dma_done(dut, 0x33): break dut.sim_tick() - assert dut.dma_done_val + assert observed_dma_done(dut, 0x33) # Check the data in the dataSPM. assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[0] == DataType(0x11111111, 1, 0, 0) assert dut.cgra.data_mem.memory_wrapper[0].memory.regs[1] == DataType(0x22222222, 1, 0, 0) @@ -187,21 +254,10 @@ def test_cgra_dma_mvout_from_local_spm(): dut.dram_rd_resp.msg @= 0 dut.dram_wr_req_rdy @= 1 dut.dram_wr_resp_val @= 0 - dut.dma_done_rdy @= 1 - - # Issue DMA MVOUT command - dut.dma_cmd_val @= 1 - dut.dma_cmd_opcode @= DMA_MVOUT - # Read the data of SPM from address 0x0 to 0x3(16 bytes in total), - # then write the data to DRAM address 0x2000. - dut.dma_cmd_dram_addr @= 0x2000 - dut.dma_cmd_spm_addr @= DataAddrType(0) - dut.dma_cmd_bytes @= 16 - dut.dma_cmd_tag @= 0x44 - dut.sim_eval_combinational() - assert dut.dma_cmd_rdy - dut.sim_tick() - dut.dma_cmd_val @= 0 + + # Read SPM words 0..3 and write 16 bytes to DRAM address 0x2000. 
+ issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType, + CMD_DMA_MVOUT, 0x2000, 0, 16, 0x44) # Expected 128-bit beat expected_beat = concat(WordType(0x44444444), WordType(0x33333333), @@ -215,15 +271,14 @@ def test_cgra_dma_mvout_from_local_spm(): dut.dram_wr_resp_val @= 1 pending_wr_resp = False - if dut.dram_wr_req_val: + dut.sim_eval_combinational() + + if dut.dram_wr_req_val & dut.dram_wr_req_rdy: assert dut.dram_wr_req_addr == 0x2000 assert dut.dram_wr_req_data == expected_beat pending_wr_resp = True - dut.sim_eval_combinational() - - if dut.dma_done_val: - assert int(dut.dma_done_tag) == 0x44 + if observed_dma_done(dut, 0x44): done = True break diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index 83b41068..d925c555 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -29,11 +29,13 @@ def construct(s, multi_cgra_columns, num_tiles, controller2addr_map, - idTo2d_map): + idTo2d_map, + has_dma_ports = False): # Derives types from InterCgraPktType. CgraPayloadType = InterCgraPktType.get_field_type(kAttrPayload) DataType = CgraPayloadType.get_field_type(kAttrData) + DataPayloadType = DataType.get_field_type(kAttrPayload) DataAddrType = CgraPayloadType.get_field_type(kAttrDataAddr) # Derives CgraIdType from grid dimensions. 
@@ -52,6 +54,15 @@ def construct(s, YType = mk_bits(max(clog2(multi_cgra_rows), 1)) TileIdType = mk_bits(clog2(num_tiles + 1)) ControllerXbarPktType = mk_controller_noc_xbar_pkt(InterCgraPktType) + DmaOpcodeType = mk_bits(3) + DmaDramAddrType = mk_bits(64) + DmaBytesType = mk_bits(32) + DmaTagType = mk_bits(8) + DmaDramAddrPartType = mk_bits(32) + DmaMaskType = mk_bits(max(1, DataPayloadType.nbits // CHAR_BIT)) + + if has_dma_ports: + assert DataPayloadType.nbits == 32 # Interface s.cgra_id = InPort(CgraIdType) @@ -75,6 +86,86 @@ def construct(s, s.send_to_tile_load_response = SendIfcRTL(InterCgraPktType) s.send_to_mem_store_request = SendIfcRTL(InterCgraPktType) + if has_dma_ports: + # Controller-owned command path from CPU packets to the DMA engine. + s.dma_cmd_val = OutPort() + s.dma_cmd_rdy = InPort() + s.dma_cmd_opcode = OutPort(DmaOpcodeType) + s.dma_cmd_dram_addr = OutPort(DmaDramAddrType) + s.dma_cmd_spm_addr = OutPort(DataAddrType) + s.dma_cmd_bytes = OutPort(DmaBytesType) + s.dma_cmd_tag = OutPort(DmaTagType) + + s.dma_done_val = InPort() + s.dma_done_rdy = OutPort() + s.dma_done_tag = InPort(DmaTagType) + + # DMA engine side of the controller-forwarded SPM access path. + s.spm_dma_wval = InPort() + s.spm_dma_wrdy = OutPort() + s.spm_dma_waddr = InPort(DataAddrType) + s.spm_dma_wdata = InPort(DataPayloadType) + s.spm_dma_wmask = InPort(DmaMaskType) + + s.spm_dma_rval = InPort() + s.spm_dma_rrdy = OutPort() + s.spm_dma_raddr = InPort(DataAddrType) + s.spm_dma_rresp_val = OutPort() + s.spm_dma_rresp_rdy = InPort() + s.spm_dma_rresp_data = OutPort(DataPayloadType) + + # Data memory side of the same SPM access path. 
+ s.send_to_mem_dma_wval = OutPort() + s.recv_from_mem_dma_wrdy = InPort() + s.send_to_mem_dma_waddr = OutPort(DataAddrType) + s.send_to_mem_dma_wdata = OutPort(DataPayloadType) + s.send_to_mem_dma_wmask = OutPort(DmaMaskType) + + s.send_to_mem_dma_rval = OutPort() + s.recv_from_mem_dma_rrdy = InPort() + s.send_to_mem_dma_raddr = OutPort(DataAddrType) + s.recv_from_mem_dma_rresp_val = InPort() + s.send_to_mem_dma_rresp_rdy = OutPort() + s.recv_from_mem_dma_rresp_data = InPort(DataPayloadType) + else: + s.dma_cmd_val = Wire() + s.dma_cmd_rdy = Wire() + s.dma_cmd_opcode = Wire(DmaOpcodeType) + s.dma_cmd_dram_addr = Wire(DmaDramAddrType) + s.dma_cmd_spm_addr = Wire(DataAddrType) + s.dma_cmd_bytes = Wire(DmaBytesType) + s.dma_cmd_tag = Wire(DmaTagType) + + s.dma_done_val = Wire() + s.dma_done_rdy = Wire() + s.dma_done_tag = Wire(DmaTagType) + + s.spm_dma_wval = Wire() + s.spm_dma_wrdy = Wire() + s.spm_dma_waddr = Wire(DataAddrType) + s.spm_dma_wdata = Wire(DataPayloadType) + s.spm_dma_wmask = Wire(DmaMaskType) + + s.spm_dma_rval = Wire() + s.spm_dma_rrdy = Wire() + s.spm_dma_raddr = Wire(DataAddrType) + s.spm_dma_rresp_val = Wire() + s.spm_dma_rresp_rdy = Wire() + s.spm_dma_rresp_data = Wire(DataPayloadType) + + s.send_to_mem_dma_wval = Wire() + s.recv_from_mem_dma_wrdy = Wire() + s.send_to_mem_dma_waddr = Wire(DataAddrType) + s.send_to_mem_dma_wdata = Wire(DataPayloadType) + s.send_to_mem_dma_wmask = Wire(DmaMaskType) + + s.send_to_mem_dma_rval = Wire() + s.recv_from_mem_dma_rrdy = Wire() + s.send_to_mem_dma_raddr = Wire(DataAddrType) + s.recv_from_mem_dma_rresp_val = Wire() + s.send_to_mem_dma_rresp_rdy = Wire() + s.recv_from_mem_dma_rresp_data = Wire(DataPayloadType) + # Component s.recv_from_tile_load_request_pkt_queue = ChannelRTL(InterCgraPktType, latency = 1) s.recv_from_tile_load_response_pkt_queue = ChannelRTL(InterCgraPktType, latency = 1) @@ -123,6 +214,12 @@ def construct(s, s.addr_dst_id = Wire(CgraIdType) + s.dma_dram_addr_lo = Wire(DmaDramAddrPartType) + 
s.dma_dram_addr_hi = Wire(DmaDramAddrPartType) + s.dma_spm_addr = Wire(DataAddrType) + s.dma_bytes = Wire(DmaBytesType) + s.dma_tag = Wire(DmaTagType) + # Connections. # Requests towards others, 1 cycle delay to improve timing. s.recv_from_tile_load_request_pkt_queue.recv //= s.recv_from_tile_load_request_pkt @@ -138,6 +235,46 @@ def construct(s, s.recv_from_cpu_pkt //= s.recv_from_cpu_pkt_queue.recv s.send_to_cpu_pkt //= s.send_to_cpu_pkt_queue.send + @update_ff + def update_dma_cmd_regs(): + if s.reset: + s.dma_dram_addr_lo <<= DmaDramAddrPartType(0) + s.dma_dram_addr_hi <<= DmaDramAddrPartType(0) + s.dma_spm_addr <<= DataAddrType(0) + s.dma_bytes <<= DmaBytesType(0) + s.dma_tag <<= DmaTagType(0) + elif has_dma_ports: + cpu_payload = s.recv_from_cpu_pkt_queue.send.msg.payload + cpu_cmd = cpu_payload.cmd + cpu_data = cpu_payload.data.payload + if s.recv_from_cpu_pkt_queue.send.val & s.recv_from_cpu_pkt_queue.send.rdy: + if cpu_cmd == CMD_DMA_CONFIG_DRAM_ADDR_LO: + s.dma_dram_addr_lo <<= DmaDramAddrPartType(cpu_data) + elif cpu_cmd == CMD_DMA_CONFIG_DRAM_ADDR_HI: + s.dma_dram_addr_hi <<= DmaDramAddrPartType(cpu_data) + elif cpu_cmd == CMD_DMA_CONFIG_SPM_ADDR: + s.dma_spm_addr <<= cpu_payload.data_addr + elif cpu_cmd == CMD_DMA_CONFIG_BYTES: + s.dma_bytes <<= DmaBytesType(cpu_data) + elif cpu_cmd == CMD_DMA_CONFIG_TAG: + s.dma_tag <<= trunc(cpu_data, DmaTagType) + + @update + def update_dma_spm_forwarding(): + if has_dma_ports: + s.send_to_mem_dma_wval @= s.spm_dma_wval + s.spm_dma_wrdy @= s.recv_from_mem_dma_wrdy + s.send_to_mem_dma_waddr @= s.spm_dma_waddr + s.send_to_mem_dma_wdata @= s.spm_dma_wdata + s.send_to_mem_dma_wmask @= s.spm_dma_wmask + + s.send_to_mem_dma_rval @= s.spm_dma_rval + s.spm_dma_rrdy @= s.recv_from_mem_dma_rrdy + s.send_to_mem_dma_raddr @= s.spm_dma_raddr + s.spm_dma_rresp_val @= s.recv_from_mem_dma_rresp_val + s.send_to_mem_dma_rresp_rdy @= s.spm_dma_rresp_rdy + s.spm_dma_rresp_data @= s.recv_from_mem_dma_rresp_data + @update def 
update_received_msg(): kLoadRequestInportIdx = 0 @@ -151,6 +288,15 @@ def update_received_msg(): s.send_to_cpu_pkt_queue.recv.msg @= IntraCgraPktType(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) s.recv_from_ctrl_ring_pkt.rdy @= 0 + if has_dma_ports: + s.dma_cmd_val @= 0 + s.dma_cmd_opcode @= DmaOpcodeType(DMA_MVIN) + s.dma_cmd_dram_addr @= concat(s.dma_dram_addr_hi, s.dma_dram_addr_lo) + s.dma_cmd_spm_addr @= s.dma_spm_addr + s.dma_cmd_bytes @= s.dma_bytes + s.dma_cmd_tag @= s.dma_tag + s.dma_done_rdy @= 0 + for i in range(CONTROLLER_CROSSBAR_INPORTS): s.crossbar.recv[i].val @= 0 s.crossbar.recv[i].msg @= ControllerXbarPktType(0, 0) @@ -201,24 +347,46 @@ def update_received_msg(): s.global_reduce_unit.send.rdy @= s.crossbar.recv[kFromReduceUnitIdx].rdy s.crossbar.recv[kFromReduceUnitIdx].msg @= s.global_reduce_unit.send.msg - # For the ctrl and data preloading. - s.crossbar.recv[kFromCpuCtrlAndDataIdx].val @= \ - s.recv_from_cpu_pkt_queue.send.val - s.recv_from_cpu_pkt_queue.send.rdy @= s.crossbar.recv[kFromCpuCtrlAndDataIdx].rdy - s.crossbar.recv[kFromCpuCtrlAndDataIdx].msg @= \ - ControllerXbarPktType(0, # dst (always 0 to align with the single outport of the crossbar, i.e., NoC) - InterCgraPktType(s.cgra_id, # src - s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id, # dst - 0, # src_x - 0, # src_y - s.idTo2d_x_lut[s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id], # dst_x - s.idTo2d_y_lut[s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id], # dst_y - num_tiles, # src_tile_id, num_tiles is used to indicate the request is from CPU, so the LOAD response can come back. - s.recv_from_cpu_pkt_queue.send.msg.dst, # dst_tile_id - 0, # remote_src_port, only used for inter-cgra remote load request/response. 
- 0, # opaque - 0, # vc_id - s.recv_from_cpu_pkt_queue.send.msg.payload)) + cpu_payload = s.recv_from_cpu_pkt_queue.send.msg.payload + cpu_cmd = cpu_payload.cmd + + if has_dma_ports & ( + (cpu_cmd == CMD_DMA_CONFIG_DRAM_ADDR_LO) | + (cpu_cmd == CMD_DMA_CONFIG_DRAM_ADDR_HI) | + (cpu_cmd == CMD_DMA_CONFIG_SPM_ADDR) | + (cpu_cmd == CMD_DMA_CONFIG_BYTES) | + (cpu_cmd == CMD_DMA_CONFIG_TAG)): + s.recv_from_cpu_pkt_queue.send.rdy @= 1 + + elif has_dma_ports & ( + (cpu_cmd == CMD_DMA_MVIN) | + (cpu_cmd == CMD_DMA_MVOUT)): + s.dma_cmd_val @= s.recv_from_cpu_pkt_queue.send.val + if cpu_cmd == CMD_DMA_MVIN: + s.dma_cmd_opcode @= DmaOpcodeType(DMA_MVIN) + else: + s.dma_cmd_opcode @= DmaOpcodeType(DMA_MVOUT) + s.recv_from_cpu_pkt_queue.send.rdy @= s.dma_cmd_rdy + + else: + # For the ctrl and data preloading. + s.crossbar.recv[kFromCpuCtrlAndDataIdx].val @= \ + s.recv_from_cpu_pkt_queue.send.val + s.recv_from_cpu_pkt_queue.send.rdy @= s.crossbar.recv[kFromCpuCtrlAndDataIdx].rdy + s.crossbar.recv[kFromCpuCtrlAndDataIdx].msg @= \ + ControllerXbarPktType(0, # dst (always 0 to align with the single outport of the crossbar, i.e., NoC) + InterCgraPktType(s.cgra_id, # src + s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id, # dst + 0, # src_x + 0, # src_y + s.idTo2d_x_lut[s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id], # dst_x + s.idTo2d_y_lut[s.recv_from_cpu_pkt_queue.send.msg.dst_cgra_id], # dst_y + num_tiles, # src_tile_id, num_tiles is used to indicate the request is from CPU, so the LOAD response can come back. + s.recv_from_cpu_pkt_queue.send.msg.dst, # dst_tile_id + 0, # remote_src_port, only used for inter-cgra remote load request/response. + 0, # opaque + 0, # vc_id + s.recv_from_cpu_pkt_queue.send.msg.payload)) # TODO: For the other cmd types. @@ -358,6 +526,25 @@ def update_received_msg(): # # TODO: Handle other cmd types. 
# assert(False) + if has_dma_ports & s.dma_done_val: + s.dma_done_rdy @= s.send_to_cpu_pkt_queue.recv.rdy + s.send_to_cpu_pkt_queue.recv.val @= 1 + s.send_to_cpu_pkt_queue.recv.msg @= \ + IntraCgraPktType(num_tiles, # src_tile_id: controller/DMA sideband source + num_tiles, # dst_tile_id: CPU-facing controller endpoint + s.cgra_id, + s.cgra_id, + s.idTo2d_x_lut[s.cgra_id], + s.idTo2d_y_lut[s.cgra_id], + s.idTo2d_x_lut[s.cgra_id], + s.idTo2d_y_lut[s.cgra_id], + s.dma_done_tag, + 0, + CgraPayloadType( + CMD_DMA_DONE, + DataType(zext(s.dma_done_tag, DataPayloadType), 1, 0, 0), + 0, 0, 0)) + @update def update_sending_to_noc_msg(): s.send_to_inter_cgra_noc.val @= s.crossbar.send[0].val diff --git a/lib/cmd_type.py b/lib/cmd_type.py index a24078d7..13590ca3 100644 --- a/lib/cmd_type.py +++ b/lib/cmd_type.py @@ -14,7 +14,7 @@ # Total number of commands that are supported/recognized by controller. # Needs to be updated once more commands are added/supported. -NUM_CMDS = 44 +NUM_CMDS = 52 CMD_LAUNCH = 0 CMD_PAUSE = 1 @@ -69,6 +69,17 @@ # GEP FU Configuration Commands. CMD_CONFIG_GEP_STRIDE = 43 # Controller -> GEP FU: Configures stride for 2D GEP +# DMA commands. The CPU configures the controller-side command registers +# before issuing CMD_DMA_MVIN/CMD_DMA_MVOUT. 
+CMD_DMA_CONFIG_DRAM_ADDR_LO = 44 # Configures lower 32 bits of DRAM address +CMD_DMA_CONFIG_DRAM_ADDR_HI = 45 # Configures higher 32 bits of DRAM address +CMD_DMA_CONFIG_SPM_ADDR = 46 # Configures SPM address +CMD_DMA_CONFIG_BYTES = 47 # Configures number of bytes to transfer +CMD_DMA_CONFIG_TAG = 48 # Configures tag of the DMA command +CMD_DMA_MVIN = 49 # Issues a DMA_MVIN command +CMD_DMA_MVOUT = 50 # Issues a DMA_MVOUT command +CMD_DMA_DONE = 51 # Signals that the DMA command is complete + CMD_SYMBOL_DICT = { CMD_LAUNCH: "(LAUNCH_KERNEL)", CMD_PAUSE: "(PAUSE_EXECUTION)", @@ -114,5 +125,13 @@ CMD_LC_CHILD_RESET: "(LC_CHILD_RESET)", CMD_LC_ALL_COMPLETE: "(LC_ALL_COMPLETE)", CMD_CONFIG_GEP_STRIDE: "(CONFIG_GEP_STRIDE)", + CMD_DMA_CONFIG_DRAM_ADDR_LO: "(DMA_CONFIG_DRAM_ADDR_LO)", + CMD_DMA_CONFIG_DRAM_ADDR_HI: "(DMA_CONFIG_DRAM_ADDR_HI)", + CMD_DMA_CONFIG_SPM_ADDR: "(DMA_CONFIG_SPM_ADDR)", + CMD_DMA_CONFIG_BYTES: "(DMA_CONFIG_BYTES)", + CMD_DMA_CONFIG_TAG: "(DMA_CONFIG_TAG)", + CMD_DMA_MVIN: "(DMA_MVIN)", + CMD_DMA_MVOUT: "(DMA_MVOUT)", + CMD_DMA_DONE: "(DMA_DONE)", } diff --git a/mem/data/DataMemControllerRTL.py b/mem/data/DataMemControllerRTL.py index e7897422..e83de7f5 100644 --- a/mem/data/DataMemControllerRTL.py +++ b/mem/data/DataMemControllerRTL.py @@ -40,15 +40,18 @@ class DataMemControllerRTL(Component): It arbitrates between multiple request sources: 1. Local tiles (via `recv_raddr`, `recv_waddr`, `recv_wdata`) 2. Inter-CGRA NoC (via `recv_from_noc_load_request`, etc.) - 3. Optional DMA engine (via `spm_dma_wval`, `spm_dma_rval`, etc.) + 3. Optional controller-forwarded DMA access + (via `spm_dma_wval`, `spm_dma_rval`, etc.) Architectural Design: - Uses crossbars to route requests to the correct memory bank based on the address. - - Supports an optional DMA interface. When `has_dma_ports` is True, extra - ports are added to the read and write crossbars. 
- - DMA requests are treated as another master on the memory bus, competing - with tiles and NoC traffic. + - Supports an optional controller-forwarded DMA SPM interface. When + `has_dma_ports` is True, extra ports are added to the read and write + crossbars. + - DMA-originated requests are treated as another master on the memory bus, + competing with tiles and NoC traffic after they pass through the + controller. """ def construct(s, NocPktType, From 241bee72dd94b99de22e23ed50be71c0ce998d32 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Sat, 13 Jun 2026 22:15:06 +0800 Subject: [PATCH 15/46] [Refactor] Simplify DMA interface connections in CgraDmaRTL, CgraTemplateRTL, and ControllerRTL. --- cgra/CgraDmaRTL.py | 29 +-- cgra/CgraTemplateRTL.py | 99 +++++----- controller/ControllerRTL.py | 176 ++++++++++++------ lib/basic/val_rdy/ifcs.py | 71 +++++++ lib/messages.py | 102 +++++++++- mem/data/DataMemControllerRTL.py | 59 +++--- .../test/DataMemControllerRTL_dma_test.py | 38 ++-- mem/dma/DmaEngineRTL.py | 105 +++++------ mem/dma/test/DmaEngineRTL_test.py | 74 ++++---- 9 files changed, 468 insertions(+), 285 deletions(-) diff --git a/cgra/CgraDmaRTL.py b/cgra/CgraDmaRTL.py index daa5315d..a63d6331 100644 --- a/cgra/CgraDmaRTL.py +++ b/cgra/CgraDmaRTL.py @@ -158,19 +158,11 @@ def construct(s, CgraPayloadType, s.address_lower //= s.cgra.address_lower s.address_upper //= s.cgra.address_upper - # Controller-decoded DMA command/done connections. - s.cgra.dma_cmd_val //= s.dma.dma_cmd_val - s.cgra.dma_cmd_rdy //= s.dma.dma_cmd_rdy - s.cgra.dma_cmd_opcode //= s.dma.dma_cmd_opcode - s.cgra.dma_cmd_dram_addr //= s.dma.dma_cmd_dram_addr - s.cgra.dma_cmd_spm_addr //= s.dma.dma_cmd_spm_addr - s.cgra.dma_cmd_bytes //= s.dma.dma_cmd_bytes - s.cgra.dma_cmd_tag //= s.dma.dma_cmd_tag - - s.dma.dma_done_val //= s.cgra.dma_done_val - s.dma.dma_done_rdy //= s.cgra.dma_done_rdy - s.dma.dma_done_tag //= s.cgra.dma_done_tag + # Connections between CGRA and DMA engine. 
+ # CGRA communicates with DMA engine through the controller. + s.cgra.dma_cmd //= s.dma.dma_cmd + s.dma.dma_done //= s.cgra.dma_done s.dram_rd_req //= s.dma.dram_rd_req s.dram_rd_resp //= s.dma.dram_rd_resp @@ -186,18 +178,7 @@ def construct(s, CgraPayloadType, # DMA to controller-forwarded SPM connections. - s.dma.spm_dma_wval //= s.cgra.spm_dma_wval - s.dma.spm_dma_wrdy //= s.cgra.spm_dma_wrdy - s.dma.spm_dma_waddr //= s.cgra.spm_dma_waddr - s.dma.spm_dma_wdata //= s.cgra.spm_dma_wdata - s.dma.spm_dma_wmask //= s.cgra.spm_dma_wmask - - s.dma.spm_dma_rval //= s.cgra.spm_dma_rval - s.dma.spm_dma_rrdy //= s.cgra.spm_dma_rrdy - s.dma.spm_dma_raddr //= s.cgra.spm_dma_raddr - s.dma.spm_dma_rresp_val //= s.cgra.spm_dma_rresp_val - s.dma.spm_dma_rresp_rdy //= s.cgra.spm_dma_rresp_rdy - s.dma.spm_dma_rresp_data //= s.cgra.spm_dma_rresp_data + s.dma.spm //= s.cgra.dma_spm def line_trace(s): return f"{s.dma.line_trace()} || {s.cgra.line_trace()}" diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index afeb44b3..bccd7960 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -9,6 +9,7 @@ from ..controller.ControllerRTL import ControllerRTL from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL from ..lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL +from ..lib.basic.val_rdy.ifcs import DmaSpmMinionIfcRTL from ..lib.basic.val_rdy.queues import BypassQueueRTL from ..lib.opt_type import * from ..lib.util.common import * @@ -133,6 +134,15 @@ def construct(s, CgraPayloadType, DmaDramAddrType = mk_bits(64) DmaBytesType = mk_bits(32) DmaTagType = mk_bits(8) + DmaCmdType = mk_dma_cmd(DmaDramAddrType.nbits, + DataAddrType.nbits, + DmaBytesType.nbits, + DmaTagType.nbits) + DmaDoneType = mk_dma_done(DmaTagType.nbits) + DmaSpmWriteReqType = mk_dma_spm_write_req(DataAddrType.nbits, + DmaDataType.nbits) + DmaSpmReadReqType = mk_dma_spm_read_req(DataAddrType.nbits) + DmaSpmReadRespType = mk_dma_spm_read_resp(DmaDataType.nbits) 
assert(data_mem_size_per_bank * num_banks_per_cgra <= \ data_mem_size_global) @@ -145,32 +155,38 @@ def construct(s, CgraPayloadType, # Optional DMA engine-facing ports. The controller owns command decode and # forwards DMA SPM access to the data memory. if has_dma_ports: - s.dma_cmd_val = OutPort() - s.dma_cmd_rdy = InPort() - s.dma_cmd_opcode = OutPort(DmaOpcodeType) - s.dma_cmd_dram_addr = OutPort(DmaDramAddrType) - s.dma_cmd_spm_addr = OutPort(DataAddrType) - s.dma_cmd_bytes = OutPort(DmaBytesType) - s.dma_cmd_tag = OutPort(DmaTagType) - - s.dma_done_val = InPort() - s.dma_done_rdy = OutPort() - s.dma_done_tag = InPort(DmaTagType) + s.dma_cmd = SendIfcRTL(DmaCmdType) + s.dma_cmd_val = s.dma_cmd.val + s.dma_cmd_rdy = s.dma_cmd.rdy + s.dma_cmd_opcode = s.dma_cmd.msg.opcode + s.dma_cmd_dram_addr = s.dma_cmd.msg.dram_addr + s.dma_cmd_spm_addr = s.dma_cmd.msg.spm_addr + s.dma_cmd_bytes = s.dma_cmd.msg.nbytes + s.dma_cmd_tag = s.dma_cmd.msg.tag + + s.dma_done = RecvIfcRTL(DmaDoneType) + s.dma_done_val = s.dma_done.val + s.dma_done_rdy = s.dma_done.rdy + s.dma_done_tag = s.dma_done.msg.tag + + s.dma_spm = DmaSpmMinionIfcRTL(DmaSpmWriteReqType, + DmaSpmReadReqType, + DmaSpmReadRespType) # DMA write request interface. - s.spm_dma_wval = InPort() # dma write request valid(write data into SPM) - s.spm_dma_wrdy = OutPort() - s.spm_dma_waddr = InPort(DataAddrType) - s.spm_dma_wdata = InPort(DmaDataType) - s.spm_dma_wmask = InPort(DmaMaskType) + s.spm_dma_wval = s.dma_spm.write.val # dma write request valid(write data into SPM) + s.spm_dma_wrdy = s.dma_spm.write.rdy + s.spm_dma_waddr = s.dma_spm.write.msg.addr + s.spm_dma_wdata = s.dma_spm.write.msg.data + s.spm_dma_wmask = s.dma_spm.write.msg.mask # DMA read response interface. 
- s.spm_dma_rval = InPort() - s.spm_dma_rrdy = OutPort() - s.spm_dma_raddr = InPort(DataAddrType) - s.spm_dma_rresp_val = OutPort() - s.spm_dma_rresp_rdy = InPort() - s.spm_dma_rresp_data = OutPort(DmaDataType) + s.spm_dma_rval = s.dma_spm.read.val + s.spm_dma_rrdy = s.dma_spm.read.rdy + s.spm_dma_raddr = s.dma_spm.read.msg.addr + s.spm_dma_rresp_val = s.dma_spm.read_resp.val + s.spm_dma_rresp_rdy = s.dma_spm.read_resp.rdy + s.spm_dma_rresp_data = s.dma_spm.read_resp.msg.data if is_multi_cgra: # Use the largest CGRA shape to set the boundary ports for compatibility in the case of heterogeneous multi-cgra. @@ -231,45 +247,14 @@ def construct(s, CgraPayloadType, if has_dma_ports: # CPU packets are decoded by the controller before becoming DMA commands. - s.dma_cmd_val //= s.controller.dma_cmd_val - s.dma_cmd_rdy //= s.controller.dma_cmd_rdy - s.dma_cmd_opcode //= s.controller.dma_cmd_opcode - s.dma_cmd_dram_addr //= s.controller.dma_cmd_dram_addr - s.dma_cmd_spm_addr //= s.controller.dma_cmd_spm_addr - s.dma_cmd_bytes //= s.controller.dma_cmd_bytes - s.dma_cmd_tag //= s.controller.dma_cmd_tag - - s.dma_done_val //= s.controller.dma_done_val - s.dma_done_rdy //= s.controller.dma_done_rdy - s.dma_done_tag //= s.controller.dma_done_tag + s.dma_cmd //= s.controller.dma_cmd + s.dma_done //= s.controller.dma_done # DMA engine <-> controller side of the SPM path. 
- s.spm_dma_wval //= s.controller.spm_dma_wval - s.spm_dma_wrdy //= s.controller.spm_dma_wrdy - s.spm_dma_waddr //= s.controller.spm_dma_waddr - s.spm_dma_wdata //= s.controller.spm_dma_wdata - s.spm_dma_wmask //= s.controller.spm_dma_wmask - - s.spm_dma_rval //= s.controller.spm_dma_rval - s.spm_dma_rrdy //= s.controller.spm_dma_rrdy - s.spm_dma_raddr //= s.controller.spm_dma_raddr - s.spm_dma_rresp_val //= s.controller.spm_dma_rresp_val - s.spm_dma_rresp_rdy //= s.controller.spm_dma_rresp_rdy - s.spm_dma_rresp_data //= s.controller.spm_dma_rresp_data + s.dma_spm //= s.controller.dma_spm_from_dma # Controller <-> data memory side of the SPM path. - s.controller.send_to_mem_dma_wval //= s.data_mem.spm_dma_wval - s.controller.recv_from_mem_dma_wrdy //= s.data_mem.spm_dma_wrdy - s.controller.send_to_mem_dma_waddr //= s.data_mem.spm_dma_waddr - s.controller.send_to_mem_dma_wdata //= s.data_mem.spm_dma_wdata - s.controller.send_to_mem_dma_wmask //= s.data_mem.spm_dma_wmask - - s.controller.send_to_mem_dma_rval //= s.data_mem.spm_dma_rval - s.controller.recv_from_mem_dma_rrdy //= s.data_mem.spm_dma_rrdy - s.controller.send_to_mem_dma_raddr //= s.data_mem.spm_dma_raddr - s.controller.recv_from_mem_dma_rresp_val //= s.data_mem.spm_dma_rresp_val - s.controller.send_to_mem_dma_rresp_rdy //= s.data_mem.spm_dma_rresp_rdy - s.controller.recv_from_mem_dma_rresp_data //= s.data_mem.spm_dma_rresp_data + s.controller.dma_spm_to_mem //= s.data_mem.dma_spm # Connects data memory with controller. 
s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index d925c555..078cda51 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -11,6 +11,8 @@ from ..lib.basic.val_rdy.ifcs import RecvIfcRTL from ..lib.basic.val_rdy.ifcs import SendIfcRTL +from ..lib.basic.val_rdy.ifcs import DmaSpmMasterIfcRTL, DmaSpmMinionIfcRTL +from ..lib.basic.val_rdy.ifcs import DmaWireIfcRTL, DmaSpmWireIfcRTL from ..lib.basic.val_rdy.queues import NormalQueueRTL from ..lib.messages import * from ..lib.opt_type import * @@ -60,6 +62,15 @@ def construct(s, DmaTagType = mk_bits(8) DmaDramAddrPartType = mk_bits(32) DmaMaskType = mk_bits(max(1, DataPayloadType.nbits // CHAR_BIT)) + DmaCmdType = mk_dma_cmd(DmaDramAddrType.nbits, + DataAddrType.nbits, + DmaBytesType.nbits, + DmaTagType.nbits) + DmaDoneType = mk_dma_done(DmaTagType.nbits) + DmaSpmWriteReqType = mk_dma_spm_write_req(DataAddrType.nbits, + DataPayloadType.nbits) + DmaSpmReadReqType = mk_dma_spm_read_req(DataAddrType.nbits) + DmaSpmReadRespType = mk_dma_spm_read_resp(DataPayloadType.nbits) if has_dma_ports: assert DataPayloadType.nbits == 32 @@ -88,46 +99,78 @@ def construct(s, if has_dma_ports: # Controller-owned command path from CPU packets to the DMA engine. 
- s.dma_cmd_val = OutPort() - s.dma_cmd_rdy = InPort() - s.dma_cmd_opcode = OutPort(DmaOpcodeType) - s.dma_cmd_dram_addr = OutPort(DmaDramAddrType) - s.dma_cmd_spm_addr = OutPort(DataAddrType) - s.dma_cmd_bytes = OutPort(DmaBytesType) - s.dma_cmd_tag = OutPort(DmaTagType) - - s.dma_done_val = InPort() - s.dma_done_rdy = OutPort() - s.dma_done_tag = InPort(DmaTagType) + s.dma_cmd = SendIfcRTL(DmaCmdType) + s.dma_cmd_val = s.dma_cmd.val + s.dma_cmd_rdy = s.dma_cmd.rdy + s.dma_cmd_opcode = s.dma_cmd.msg.opcode + s.dma_cmd_dram_addr = s.dma_cmd.msg.dram_addr + s.dma_cmd_spm_addr = s.dma_cmd.msg.spm_addr + s.dma_cmd_bytes = s.dma_cmd.msg.nbytes + s.dma_cmd_tag = s.dma_cmd.msg.tag + + s.dma_done = RecvIfcRTL(DmaDoneType) + s.dma_done_val = s.dma_done.val + s.dma_done_rdy = s.dma_done.rdy + s.dma_done_tag = s.dma_done.msg.tag # DMA engine side of the controller-forwarded SPM access path. - s.spm_dma_wval = InPort() - s.spm_dma_wrdy = OutPort() - s.spm_dma_waddr = InPort(DataAddrType) - s.spm_dma_wdata = InPort(DataPayloadType) - s.spm_dma_wmask = InPort(DmaMaskType) - - s.spm_dma_rval = InPort() - s.spm_dma_rrdy = OutPort() - s.spm_dma_raddr = InPort(DataAddrType) - s.spm_dma_rresp_val = OutPort() - s.spm_dma_rresp_rdy = InPort() - s.spm_dma_rresp_data = OutPort(DataPayloadType) + s.dma_spm_from_dma = DmaSpmMinionIfcRTL(DmaSpmWriteReqType, + DmaSpmReadReqType, + DmaSpmReadRespType) + s.spm_dma_wval = s.dma_spm_from_dma.write.val + s.spm_dma_wrdy = s.dma_spm_from_dma.write.rdy + s.spm_dma_waddr = s.dma_spm_from_dma.write.msg.addr + s.spm_dma_wdata = s.dma_spm_from_dma.write.msg.data + s.spm_dma_wmask = s.dma_spm_from_dma.write.msg.mask + + s.spm_dma_rval = s.dma_spm_from_dma.read.val + s.spm_dma_rrdy = s.dma_spm_from_dma.read.rdy + s.spm_dma_raddr = s.dma_spm_from_dma.read.msg.addr + s.spm_dma_rresp_val = s.dma_spm_from_dma.read_resp.val + s.spm_dma_rresp_rdy = s.dma_spm_from_dma.read_resp.rdy + s.spm_dma_rresp_data = s.dma_spm_from_dma.read_resp.msg.data # Data memory 
side of the same SPM access path. - s.send_to_mem_dma_wval = OutPort() - s.recv_from_mem_dma_wrdy = InPort() - s.send_to_mem_dma_waddr = OutPort(DataAddrType) - s.send_to_mem_dma_wdata = OutPort(DataPayloadType) - s.send_to_mem_dma_wmask = OutPort(DmaMaskType) - - s.send_to_mem_dma_rval = OutPort() - s.recv_from_mem_dma_rrdy = InPort() - s.send_to_mem_dma_raddr = OutPort(DataAddrType) - s.recv_from_mem_dma_rresp_val = InPort() - s.send_to_mem_dma_rresp_rdy = OutPort() - s.recv_from_mem_dma_rresp_data = InPort(DataPayloadType) + s.dma_spm_to_mem = DmaSpmMasterIfcRTL(DmaSpmWriteReqType, + DmaSpmReadReqType, + DmaSpmReadRespType) + s.send_to_mem_dma_wval = s.dma_spm_to_mem.write.val + s.recv_from_mem_dma_wrdy = s.dma_spm_to_mem.write.rdy + s.send_to_mem_dma_waddr = s.dma_spm_to_mem.write.msg.addr + s.send_to_mem_dma_wdata = s.dma_spm_to_mem.write.msg.data + s.send_to_mem_dma_wmask = s.dma_spm_to_mem.write.msg.mask + + s.send_to_mem_dma_rval = s.dma_spm_to_mem.read.val + s.recv_from_mem_dma_rrdy = s.dma_spm_to_mem.read.rdy + s.send_to_mem_dma_raddr = s.dma_spm_to_mem.read.msg.addr + s.recv_from_mem_dma_rresp_val = s.dma_spm_to_mem.read_resp.val + s.send_to_mem_dma_rresp_rdy = s.dma_spm_to_mem.read_resp.rdy + s.recv_from_mem_dma_rresp_data = s.dma_spm_to_mem.read_resp.msg.data else: + s.dma_cmd = DmaWireIfcRTL(DmaCmdType) + s.dma_cmd.rdy //= 0 + + s.dma_done = DmaWireIfcRTL(DmaDoneType) + s.dma_done.val //= 0 + s.dma_done.msg //= DmaDoneType() + + s.dma_spm_from_dma = DmaSpmWireIfcRTL(DmaSpmWriteReqType, + DmaSpmReadReqType, + DmaSpmReadRespType) + s.dma_spm_from_dma.write.val //= 0 + s.dma_spm_from_dma.write.msg //= DmaSpmWriteReqType() + s.dma_spm_from_dma.read.val //= 0 + s.dma_spm_from_dma.read.msg //= DmaSpmReadReqType() + s.dma_spm_from_dma.read_resp.rdy //= 0 + + s.dma_spm_to_mem = DmaSpmWireIfcRTL(DmaSpmWriteReqType, + DmaSpmReadReqType, + DmaSpmReadRespType) + s.dma_spm_to_mem.write.rdy //= 0 + s.dma_spm_to_mem.read.rdy //= 0 + s.dma_spm_to_mem.read_resp.val 
//= 0 + s.dma_spm_to_mem.read_resp.msg //= DmaSpmReadRespType() + s.dma_cmd_val = Wire() s.dma_cmd_rdy = Wire() s.dma_cmd_opcode = Wire(DmaOpcodeType) @@ -262,18 +305,16 @@ def update_dma_cmd_regs(): @update def update_dma_spm_forwarding(): if has_dma_ports: - s.send_to_mem_dma_wval @= s.spm_dma_wval - s.spm_dma_wrdy @= s.recv_from_mem_dma_wrdy - s.send_to_mem_dma_waddr @= s.spm_dma_waddr - s.send_to_mem_dma_wdata @= s.spm_dma_wdata - s.send_to_mem_dma_wmask @= s.spm_dma_wmask - - s.send_to_mem_dma_rval @= s.spm_dma_rval - s.spm_dma_rrdy @= s.recv_from_mem_dma_rrdy - s.send_to_mem_dma_raddr @= s.spm_dma_raddr - s.spm_dma_rresp_val @= s.recv_from_mem_dma_rresp_val - s.send_to_mem_dma_rresp_rdy @= s.spm_dma_rresp_rdy - s.spm_dma_rresp_data @= s.recv_from_mem_dma_rresp_data + s.dma_spm_to_mem.write.val @= s.dma_spm_from_dma.write.val + s.dma_spm_from_dma.write.rdy @= s.dma_spm_to_mem.write.rdy + s.dma_spm_to_mem.write.msg @= s.dma_spm_from_dma.write.msg + + s.dma_spm_to_mem.read.val @= s.dma_spm_from_dma.read.val + s.dma_spm_from_dma.read.rdy @= s.dma_spm_to_mem.read.rdy + s.dma_spm_to_mem.read.msg @= s.dma_spm_from_dma.read.msg + s.dma_spm_from_dma.read_resp.val @= s.dma_spm_to_mem.read_resp.val + s.dma_spm_to_mem.read_resp.rdy @= s.dma_spm_from_dma.read_resp.rdy + s.dma_spm_from_dma.read_resp.msg @= s.dma_spm_to_mem.read_resp.msg @update def update_received_msg(): @@ -289,13 +330,14 @@ def update_received_msg(): s.recv_from_ctrl_ring_pkt.rdy @= 0 if has_dma_ports: - s.dma_cmd_val @= 0 - s.dma_cmd_opcode @= DmaOpcodeType(DMA_MVIN) - s.dma_cmd_dram_addr @= concat(s.dma_dram_addr_hi, s.dma_dram_addr_lo) - s.dma_cmd_spm_addr @= s.dma_spm_addr - s.dma_cmd_bytes @= s.dma_bytes - s.dma_cmd_tag @= s.dma_tag - s.dma_done_rdy @= 0 + s.dma_cmd.val @= 0 + s.dma_cmd.msg @= DmaCmdType( + DmaOpcodeType(DMA_MVIN), + concat(s.dma_dram_addr_hi, s.dma_dram_addr_lo), + s.dma_spm_addr, + s.dma_bytes, + s.dma_tag) + s.dma_done.rdy @= 0 for i in range(CONTROLLER_CROSSBAR_INPORTS): 
s.crossbar.recv[i].val @= 0 @@ -361,12 +403,22 @@ def update_received_msg(): elif has_dma_ports & ( (cpu_cmd == CMD_DMA_MVIN) | (cpu_cmd == CMD_DMA_MVOUT)): - s.dma_cmd_val @= s.recv_from_cpu_pkt_queue.send.val + s.dma_cmd.val @= s.recv_from_cpu_pkt_queue.send.val if cpu_cmd == CMD_DMA_MVIN: - s.dma_cmd_opcode @= DmaOpcodeType(DMA_MVIN) + s.dma_cmd.msg @= DmaCmdType( + DmaOpcodeType(DMA_MVIN), + concat(s.dma_dram_addr_hi, s.dma_dram_addr_lo), + s.dma_spm_addr, + s.dma_bytes, + s.dma_tag) else: - s.dma_cmd_opcode @= DmaOpcodeType(DMA_MVOUT) - s.recv_from_cpu_pkt_queue.send.rdy @= s.dma_cmd_rdy + s.dma_cmd.msg @= DmaCmdType( + DmaOpcodeType(DMA_MVOUT), + concat(s.dma_dram_addr_hi, s.dma_dram_addr_lo), + s.dma_spm_addr, + s.dma_bytes, + s.dma_tag) + s.recv_from_cpu_pkt_queue.send.rdy @= s.dma_cmd.rdy else: # For the ctrl and data preloading. @@ -526,8 +578,8 @@ def update_received_msg(): # # TODO: Handle other cmd types. # assert(False) - if has_dma_ports & s.dma_done_val: - s.dma_done_rdy @= s.send_to_cpu_pkt_queue.recv.rdy + if has_dma_ports & s.dma_done.val: + s.dma_done.rdy @= s.send_to_cpu_pkt_queue.recv.rdy s.send_to_cpu_pkt_queue.recv.val @= 1 s.send_to_cpu_pkt_queue.recv.msg @= \ IntraCgraPktType(num_tiles, # src_tile_id: controller/DMA sideband source @@ -538,11 +590,11 @@ def update_received_msg(): s.idTo2d_y_lut[s.cgra_id], s.idTo2d_x_lut[s.cgra_id], s.idTo2d_y_lut[s.cgra_id], - s.dma_done_tag, + s.dma_done.msg.tag, 0, CgraPayloadType( CMD_DMA_DONE, - DataType(zext(s.dma_done_tag, DataPayloadType), 1, 0, 0), + DataType(zext(s.dma_done.msg.tag, DataPayloadType), 1, 0, 0), 0, 0, 0)) @update diff --git a/lib/basic/val_rdy/ifcs.py b/lib/basic/val_rdy/ifcs.py index 644b98f5..5c964665 100644 --- a/lib/basic/val_rdy/ifcs.py +++ b/lib/basic/val_rdy/ifcs.py @@ -85,3 +85,74 @@ def construct( s, ReqType, RespType ): s.resp = SendIfcRTL( Type=RespType ) def __str__( s ): return f"{s.req}|{s.resp}" + +class DmaWireIfcRTL( Interface ): + def construct( s, Type ): + s.msg 
= Wire( Type ) + s.val = Wire() + s.rdy = Wire() + s.trace_len = len(str(Type())) + def __str__( s ): + return valrdy_to_str( s.msg, s.val, s.rdy, s.trace_len ) + +class DmaSpmMasterIfcRTL( Interface ): + """ + DMA-to-SPM Master Interface. + + This interface is instantiated on the DMA side. + It initiates all transfer requests (both write and read) to the SPM + and receives the corresponding read data back. + + Direction: + - write : Output (Send). DMA sends write requests to SPM. + - read : Output (Send). DMA sends read requests to SPM. + - read_resp: Input (Recv). DMA receives read data from SPM. + """ + + def construct( s, WriteReqType, ReadReqType, ReadRespType ): + s.WriteReqType = WriteReqType + s.ReadReqType = ReadReqType + s.ReadRespType = ReadRespType + s.write = SendIfcRTL( WriteReqType ) + s.read = SendIfcRTL( ReadReqType ) + s.read_resp = RecvIfcRTL( ReadRespType ) + def __str__( s ): + return f"wr:{s.write}|rd:{s.read}|resp:{s.read_resp}" + +class DmaSpmMinionIfcRTL( Interface ): + """ + DMA-to-SPM Minion Interface. + + This interface is instantiated on the SPM side. + It passively accepts incoming transfer requests from the DMA master, + performs the requested memory operations, and returns read data if needed. + + Direction: + - write : Input (Recv). SPM receives write requests from DMA. + - read : Input (Recv). SPM receives read requests from DMA. + - read_resp: Output (Send). SPM sends read data back to DMA. + """ + def construct( s, WriteReqType, ReadReqType, ReadRespType ): + s.WriteReqType = WriteReqType + s.ReadReqType = ReadReqType + s.ReadRespType = ReadRespType + s.write = RecvIfcRTL( WriteReqType ) + s.read = RecvIfcRTL( ReadReqType ) + s.read_resp = SendIfcRTL( ReadRespType ) + def __str__( s ): + return f"wr:{s.write}|rd:{s.read}|resp:{s.read_resp}" + +class DmaSpmWireIfcRTL( Interface ): + """ + Wire interface/connection for DMA-to-SPM, no direction. 
+ + """ + def construct( s, WriteReqType, ReadReqType, ReadRespType ): + s.WriteReqType = WriteReqType + s.ReadReqType = ReadReqType + s.ReadRespType = ReadRespType + s.write = DmaWireIfcRTL( WriteReqType ) + s.read = DmaWireIfcRTL( ReadReqType ) + s.read_resp = DmaWireIfcRTL( ReadRespType ) + def __str__( s ): + return f"wr:{s.write}|rd:{s.read}|resp:{s.read_resp}" diff --git a/lib/messages.py b/lib/messages.py index 49182f98..e05ef1cd 100644 --- a/lib/messages.py +++ b/lib/messages.py @@ -160,7 +160,7 @@ def str_func(s): field_dict[kAttrVectorFactorPower] = VectorFactorPowerType - field_dict[kAttrIsLastCtrl] = b1 + field_dict[kAttrIsLastCtrl] = mk_bits(1) # Register file related signals. # Indicates whether to write data into the register bank, and the @@ -199,6 +199,106 @@ def str_func(s): namespace = {'__str__': str_func} ) +#========================================================================= +# DMA messages +#========================================================================= + +def mk_dma_cmd(dram_addr_nbits = 64, + spm_addr_nbits = 32, + bytes_nbits = 32, + tag_nbits = 8, + prefix = "DmaCmd"): + + OpcodeType = mk_bits(3) + DramAddrType = mk_bits(dram_addr_nbits) + SpmAddrType = mk_bits(spm_addr_nbits) + BytesType = mk_bits(bytes_nbits) + TagType = mk_bits(tag_nbits) + + new_name = f"{prefix}_{dram_addr_nbits}_{spm_addr_nbits}_{bytes_nbits}_{tag_nbits}" + + def str_func(s): + return f"dma_cmd(op={s.opcode},dram={s.dram_addr},spm={s.spm_addr},bytes={s.nbytes},tag={s.tag})" + + return mk_bitstruct(new_name, { + 'opcode' : OpcodeType, + 'dram_addr': DramAddrType, + 'spm_addr' : SpmAddrType, + 'nbytes' : BytesType, + 'tag' : TagType, + }, + namespace = {'__str__': str_func} + ) + +def mk_dma_done(tag_nbits = 8, + prefix = "DmaDone"): + + TagType = mk_bits(tag_nbits) + + new_name = f"{prefix}_{tag_nbits}" + + def str_func(s): + return f"dma_done(tag={s.tag})" + + return mk_bitstruct(new_name, { + 'tag': TagType, + }, + namespace = {'__str__': str_func} 
+ ) + +def mk_dma_spm_write_req(addr_nbits = 32, + data_nbits = 32, + prefix = "DmaSpmWriteReq"): + + AddrType = mk_bits(addr_nbits) + DataType = mk_bits(data_nbits) + MaskType = mk_bits(max(1, data_nbits // 8)) + + new_name = f"{prefix}_{addr_nbits}_{data_nbits}" + + def str_func(s): + return f"dma_spm_wr(addr={s.addr},data={s.data},mask={s.mask})" + + return mk_bitstruct(new_name, { + 'addr': AddrType, + 'data': DataType, + 'mask': MaskType, + }, + namespace = {'__str__': str_func} + ) + +def mk_dma_spm_read_req(addr_nbits = 32, + prefix = "DmaSpmReadReq"): + + AddrType = mk_bits(addr_nbits) + + new_name = f"{prefix}_{addr_nbits}" + + def str_func(s): + return f"dma_spm_rd(addr={s.addr})" + + return mk_bitstruct(new_name, { + 'addr': AddrType, + }, + namespace = {'__str__': str_func} + ) + +def mk_dma_spm_read_resp(data_nbits = 32, + prefix = "DmaSpmReadResp"): + + DataType = mk_bits(data_nbits) + + new_name = f"{prefix}_{data_nbits}" + + def str_func(s): + return f"dma_spm_rd_resp(data={s.data})" + + return mk_bitstruct(new_name, { + 'data': DataType, + }, + namespace = {'__str__': str_func} + ) + #========================================================================= # Multi-cgra oriented inter-/intra-cgra data/config/cmd packet payload #========================================================================= diff --git a/mem/data/DataMemControllerRTL.py b/mem/data/DataMemControllerRTL.py index e83de7f5..3031f72f 100644 --- a/mem/data/DataMemControllerRTL.py +++ b/mem/data/DataMemControllerRTL.py @@ -29,6 +29,7 @@ from .DataMemWrapperRTL import DataMemWrapperRTL from ...lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL from ...lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL +from ...lib.basic.val_rdy.ifcs import DmaSpmMinionIfcRTL, DmaSpmWireIfcRTL from ...lib.messages import * from ...noc.PyOCN.pymtl3_net.xbar.XbarBypassQueueRTL import XbarBypassQueueRTL from ...lib.util.data_struct_attr import * @@ -80,6 +81,9 @@ def construct(s, 
PerBankAddrType = mk_bits(per_bank_addr_nbits) DmaDataType = DataType.get_field_type(kAttrPayload) DmaMaskType = mk_bits(max(1, DmaDataType.nbits // CHAR_BIT)) + DmaSpmWriteReqType = mk_dma_spm_write_req(AddrType.nbits, DmaDataType.nbits) + DmaSpmReadReqType = mk_dma_spm_read_req(AddrType.nbits) + DmaSpmReadRespType = mk_dma_spm_read_resp(DmaDataType.nbits) NocRemoteSrcPortType = NocPktType.get_field_type(kAttrRemoteSrcPort) s.num_banks_per_cgra = num_banks_per_cgra s.has_dma_ports = has_dma_ports @@ -149,21 +153,19 @@ def construct(s, s.send_to_noc_store_pkt = SendIfcRTL(NocPktType) if has_dma_ports: - # DMA writes SPM: used by DMA_MVIN. - s.spm_dma_wval = InPort() - s.spm_dma_wrdy = OutPort() - s.spm_dma_waddr = InPort(AddrType) - s.spm_dma_wdata = InPort(DmaDataType) - s.spm_dma_wmask = InPort(DmaMaskType) - - # DMA reads SPM: used by DMA_MVOUT. - s.spm_dma_rval = InPort() - s.spm_dma_rrdy = OutPort() - s.spm_dma_raddr = InPort(AddrType) - s.spm_dma_rresp_val = OutPort() - s.spm_dma_rresp_rdy = InPort() - s.spm_dma_rresp_data = OutPort(DmaDataType) + s.dma_spm = DmaSpmMinionIfcRTL(DmaSpmWriteReqType, + DmaSpmReadReqType, + DmaSpmReadRespType) else: + s.dma_spm = DmaSpmWireIfcRTL(DmaSpmWriteReqType, + DmaSpmReadReqType, + DmaSpmReadRespType) + s.dma_spm.write.val //= 0 + s.dma_spm.write.msg //= DmaSpmWriteReqType() + s.dma_spm.read.val //= 0 + s.dma_spm.read.msg //= DmaSpmReadReqType() + s.dma_spm.read_resp.rdy //= 0 + # Keep these as internal wires so PyMTL's static update-block analysis # can see declared objects even when the optional DMA interface is off. 
s.spm_dma_wval = Wire() @@ -300,7 +302,7 @@ def assemble_xbar_pkt(): dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) - recv_raddr_from_dma = s.spm_dma_raddr + recv_raddr_from_dma = s.dma_spm.read.msg.addr if (recv_raddr_from_dma >= s.address_lower) & (recv_raddr_from_dma <= s.address_upper): bank_index_load_from_dma = trunc((recv_raddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutRdType) else: @@ -313,7 +315,7 @@ def assemble_xbar_pkt(): 0, # src_tile 0) # remote_src_port - recv_waddr_from_dma = s.spm_dma_waddr + recv_waddr_from_dma = s.dma_spm.write.msg.addr if (recv_waddr_from_dma >= s.address_lower) & (recv_waddr_from_dma <= s.address_upper): bank_index_store_from_dma = trunc((recv_waddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutWrType) else: @@ -321,7 +323,7 @@ def assemble_xbar_pkt(): s.wr_pkt[dma_wr_idx] @= MemWritePktType(dma_wr_idx, # src bank_index_store_from_dma, # dst recv_waddr_from_dma, # addr - DataType(s.spm_dma_wdata, 1, 0, 0), + DataType(s.dma_spm.write.msg.data, 1, 0, 0), 0, # src_cgra 0, # src_tile 0) # remote_src_port @@ -390,10 +392,10 @@ def update_all(): s.write_crossbar.recv[i].msg @= MemWritePktType(0, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) if has_dma_ports: - s.spm_dma_wrdy @= 0 - s.spm_dma_rrdy @= 0 - s.spm_dma_rresp_val @= 0 - s.spm_dma_rresp_data @= DmaDataType(0) + s.dma_spm.write.rdy @= 0 + s.dma_spm.read.rdy @= 0 + s.dma_spm.read_resp.val @= 0 + s.dma_spm.read_resp.msg @= DmaSpmReadRespType(DmaDataType(0)) s.send_to_noc_load_request_pkt.msg @= \ NocPktType(0, # src @@ -426,9 +428,9 @@ def update_all(): # NOTE Don't use `dma_rd_idx = num_rd_tiles + 1` here since it will cause the bit mismatch error # between `dma_rd_idx` and `num_xbar_in_rd_ports`. 
dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) - s.read_crossbar.recv[dma_rd_idx].val @= s.spm_dma_rval + s.read_crossbar.recv[dma_rd_idx].val @= s.dma_spm.read.val s.read_crossbar.recv[dma_rd_idx].msg @= s.rd_pkt[dma_rd_idx] - s.spm_dma_rrdy @= s.read_crossbar.recv[dma_rd_idx].rdy + s.dma_spm.read.rdy @= s.read_crossbar.recv[dma_rd_idx].rdy # Connects the store request ports (from tiles and NoC) to the xbar targetting memory and NoC. for i in range(num_wr_tiles): @@ -446,9 +448,9 @@ def update_all(): # NOTE Don't use `dma_wr_idx = num_wr_tiles + 1` here since it will cause the bit mismatch error # between `dma_wr_idx` and `num_xbar_in_wr_ports`. dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) - s.write_crossbar.recv[dma_wr_idx].val @= s.spm_dma_wval + s.write_crossbar.recv[dma_wr_idx].val @= s.dma_spm.write.val s.write_crossbar.recv[dma_wr_idx].msg @= s.wr_pkt[dma_wr_idx] - s.spm_dma_wrdy @= s.write_crossbar.recv[dma_wr_idx].rdy + s.dma_spm.write.rdy @= s.write_crossbar.recv[dma_wr_idx].rdy # Connects the response ports to tiles and NoC from the xbar. # Number of load responses is expected to be the same as the number of load requests. @@ -481,9 +483,10 @@ def update_all(): s.send_to_noc_load_response_pkt.val @= s.response_crossbar.send[i].val s.response_crossbar.send[i].rdy @= s.send_to_noc_load_response_pkt.rdy elif has_dma_ports: - s.spm_dma_rresp_data @= s.response_crossbar.send[i].msg.data.payload - s.spm_dma_rresp_val @= s.response_crossbar.send[i].val - s.response_crossbar.send[i].rdy @= s.spm_dma_rresp_rdy + s.dma_spm.read_resp.msg @= DmaSpmReadRespType( + s.response_crossbar.send[i].msg.data.payload) + s.dma_spm.read_resp.val @= s.response_crossbar.send[i].val + s.response_crossbar.send[i].rdy @= s.dma_spm.read_resp.rdy # Handles the request (not response) towards the others via the NoC. The dst would be # updated in the controller. 
diff --git a/mem/data/test/DataMemControllerRTL_dma_test.py b/mem/data/test/DataMemControllerRTL_dma_test.py index b4cf1495..add8ce17 100644 --- a/mem/data/test/DataMemControllerRTL_dma_test.py +++ b/mem/data/test/DataMemControllerRTL_dma_test.py @@ -43,13 +43,13 @@ def drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr dut.send_to_noc_load_response_pkt.rdy @= 1 dut.send_to_noc_store_pkt.rdy @= 1 - dut.spm_dma_wval @= 0 - dut.spm_dma_waddr @= DataAddrType(0) - dut.spm_dma_wdata @= 0 - dut.spm_dma_wmask @= 0 - dut.spm_dma_rval @= 0 - dut.spm_dma_raddr @= DataAddrType(0) - dut.spm_dma_rresp_rdy @= 1 + dut.dma_spm.write.val @= 0 + dut.dma_spm.write.msg.addr @= DataAddrType(0) + dut.dma_spm.write.msg.data @= 0 + dut.dma_spm.write.msg.mask @= 0 + dut.dma_spm.read.val @= 0 + dut.dma_spm.read.msg.addr @= DataAddrType(0) + dut.dma_spm.read_resp.rdy @= 1 dut.cgra_id @= 0 dut.address_lower @= DataAddrType(0) @@ -89,25 +89,25 @@ def test_dma_ports_write_then_read(): dut.sim_reset() drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles) - dut.spm_dma_wval @= 1 - dut.spm_dma_waddr @= DataAddrType(3) - dut.spm_dma_wdata @= 0xaaaabbbb - dut.spm_dma_wmask @= 0xf + dut.dma_spm.write.val @= 1 + dut.dma_spm.write.msg.addr @= DataAddrType(3) + dut.dma_spm.write.msg.data @= 0xaaaabbbb + dut.dma_spm.write.msg.mask @= 0xf dut.sim_eval_combinational() - assert dut.spm_dma_wrdy + assert dut.dma_spm.write.rdy dut.sim_tick() - dut.spm_dma_wval @= 0 + dut.dma_spm.write.val @= 0 - dut.spm_dma_rval @= 1 - dut.spm_dma_raddr @= DataAddrType(3) + dut.dma_spm.read.val @= 1 + dut.dma_spm.read.msg.addr @= DataAddrType(3) seen_response = False for _ in range(10): dut.sim_eval_combinational() - if dut.spm_dma_rval & dut.spm_dma_rrdy: - dut.spm_dma_rval @= 0 - if dut.spm_dma_rresp_val: - assert int(dut.spm_dma_rresp_data) == 0xaaaabbbb + if dut.dma_spm.read.val & dut.dma_spm.read.rdy: + dut.dma_spm.read.val @= 0 + if dut.dma_spm.read_resp.val: + 
assert int(dut.dma_spm.read_resp.msg.data) == 0xaaaabbbb seen_response = True break dut.sim_tick() diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index efb10827..f8d91568 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -10,6 +10,8 @@ from pymtl3 import * from ...lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL from ...lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL +from ...lib.basic.val_rdy.ifcs import DmaSpmMasterIfcRTL +from ...lib.messages import * from ...lib.util.common import DMA_MVIN, DMA_MVOUT, CHAR_BIT, StateType, STATE_IDLE, STATE_MVIN_REQ, STATE_MVIN_RESP, STATE_MVIN_WRITE, STATE_MVOUT_READ, STATE_MVOUT_RESP, STATE_MVOUT_WRITE, STATE_MVOUT_WAIT, STATE_DONE @@ -56,20 +58,18 @@ def construct( s, # Byte mask for SPM write SpmMaskType = mk_bits( spm_data_nbits // CHAR_BIT ) MemMaskType = mk_bits( mem_data_nbits // CHAR_BIT ) + DmaCmdType = mk_dma_cmd(dram_addr_nbits, spm_addr_nbits, bytes_nbits, tag_nbits) + DmaDoneType = mk_dma_done(tag_nbits) + DmaSpmWriteReqType = mk_dma_spm_write_req(spm_addr_nbits, spm_data_nbits) + DmaSpmReadReqType = mk_dma_spm_read_req(spm_addr_nbits) + DmaSpmReadRespType = mk_dma_spm_read_resp(spm_data_nbits) # Command interface - s.dma_cmd_val = InPort() - s.dma_cmd_rdy = OutPort() - s.dma_cmd_opcode = InPort( OpcodeType ) - s.dma_cmd_dram_addr = InPort( DramAddrType ) - s.dma_cmd_spm_addr = InPort( SpmAddrType ) - # An input signal that specifies the number of bytes to transfer. - s.dma_cmd_bytes = InPort( BytesType ) - s.dma_cmd_tag = InPort( TagType ) - - s.dma_done_val = OutPort() - s.dma_done_rdy = InPort() - s.dma_done_tag = OutPort( TagType ) + # Receives a DMA command from the controller. + s.dma_cmd = RecvIfcRTL(DmaCmdType) + + # Sends a DMA done signal to the controller. 
+ s.dma_done = SendIfcRTL(DmaDoneType) # Abstract external memory interface # Request to read from DRAM @@ -87,22 +87,9 @@ def construct( s, s.dram_wr_resp_rdy = OutPort() # SPM interface - # Request to write to SPM - s.spm_dma_wval = OutPort() - s.spm_dma_wrdy = InPort() - s.spm_dma_waddr = OutPort( SpmAddrType ) - s.spm_dma_wdata = OutPort( SpmDataType ) - s.spm_dma_wmask = OutPort( SpmMaskType ) - - # Request to read from SPM - s.spm_dma_rval = OutPort() - s.spm_dma_rrdy = InPort() - s.spm_dma_raddr = OutPort( SpmAddrType ) - - # Response from SPM - s.spm_dma_rresp_val = InPort() - s.spm_dma_rresp_rdy = OutPort() - s.spm_dma_rresp_data = InPort( SpmDataType ) + s.spm = DmaSpmMasterIfcRTL(DmaSpmWriteReqType, + DmaSpmReadReqType, + DmaSpmReadRespType) # State machine definitions @@ -143,9 +130,9 @@ def construct( s, @update def comb_outputs(): - s.dma_cmd_rdy @= s.state == STATE_IDLE - s.dma_done_val @= s.state == STATE_DONE - s.dma_done_tag @= s.tag_reg + s.dma_cmd.rdy @= s.state == STATE_IDLE + s.dma_done.val @= s.state == STATE_DONE + s.dma_done.msg @= DmaDoneType(s.tag_reg) s.dram_rd_req.val @= s.state == STATE_MVIN_REQ s.dram_rd_req.msg @= s.dram_addr_reg @@ -157,22 +144,26 @@ def comb_outputs(): s.dram_wr_req_mask @= s.wr_mask_reg s.dram_wr_resp_rdy @= s.state == STATE_MVOUT_WAIT - s.spm_dma_wval @= s.state == STATE_MVIN_WRITE - s.spm_dma_waddr @= s.spm_addr_reg - s.spm_dma_wmask @= SpmMaskType( (1 << (spm_data_nbits // CHAR_BIT)) - 1 ) # Write mask for SPM write; always be 0b1111 + spm_wdata = SpmDataType(0) if s.word_idx_reg == b2( 0 ): # Writes the first word of the beat to SPM - s.spm_dma_wdata @= s.beat_reg[0:spm_data_nbits] + spm_wdata = s.beat_reg[0:spm_data_nbits] elif s.word_idx_reg == b2( 1 ): # Writes the second word of the beat to SPM - s.spm_dma_wdata @= s.beat_reg[spm_data_nbits:spm_data_nbits*2] + spm_wdata = s.beat_reg[spm_data_nbits:spm_data_nbits*2] elif s.word_idx_reg == b2( 2 ): # 3rd word - s.spm_dma_wdata @= 
s.beat_reg[spm_data_nbits*2:spm_data_nbits*3] + spm_wdata = s.beat_reg[spm_data_nbits*2:spm_data_nbits*3] else: # 4th word - s.spm_dma_wdata @= s.beat_reg[spm_data_nbits*3:spm_data_nbits*4] + spm_wdata = s.beat_reg[spm_data_nbits*3:spm_data_nbits*4] + + s.spm.write.val @= s.state == STATE_MVIN_WRITE + s.spm.write.msg @= DmaSpmWriteReqType( + s.spm_addr_reg, + spm_wdata, + SpmMaskType( (1 << (spm_data_nbits // CHAR_BIT)) - 1 ) ) - s.spm_dma_rval @= s.state == STATE_MVOUT_READ - s.spm_dma_raddr @= s.spm_addr_reg - s.spm_dma_rresp_rdy @= s.state == STATE_MVOUT_RESP + s.spm.read.val @= s.state == STATE_MVOUT_READ + s.spm.read.msg @= DmaSpmReadReqType(s.spm_addr_reg) + s.spm.read_resp.rdy @= s.state == STATE_MVOUT_RESP @update_ff def seq_state(): @@ -188,20 +179,20 @@ def seq_state(): s.wr_mask_ff <<= MemMaskType( 0 ) else: if s.state == STATE_IDLE: - if s.dma_cmd_val & s.dma_cmd_rdy: # Receives a new DMA command. - s.opcode_ff <<= s.dma_cmd_opcode - s.dram_addr_ff <<= s.dma_cmd_dram_addr - s.spm_addr_ff <<= s.dma_cmd_spm_addr - s.words_left_ff <<= s.dma_cmd_bytes >> 2 # Converts the transfer size from bytes to words. - s.tag_ff <<= s.dma_cmd_tag + if s.dma_cmd.val & s.dma_cmd.rdy: # Receives a new DMA command. + s.opcode_ff <<= s.dma_cmd.msg.opcode + s.dram_addr_ff <<= s.dma_cmd.msg.dram_addr + s.spm_addr_ff <<= s.dma_cmd.msg.spm_addr + s.words_left_ff <<= s.dma_cmd.msg.nbytes >> 2 # Converts the transfer size from bytes to words. + s.tag_ff <<= s.dma_cmd.msg.tag s.beat_ff <<= MemDataType( 0 ) s.word_idx_ff <<= b2( 0 ) s.wr_mask_ff <<= MemMaskType( 0 ) - if s.dma_cmd_bytes == BytesType( 0 ): # No more bytes to transfer. + if s.dma_cmd.msg.nbytes == BytesType( 0 ): # No more bytes to transfer. s.state_ff <<= STATE_DONE # Still has bytes to transfer. - elif s.dma_cmd_opcode == OpcodeType( DMA_MVIN ): + elif s.dma_cmd.msg.opcode == OpcodeType( DMA_MVIN ): s.state_ff <<= STATE_MVIN_REQ # Move to the next state: to issue a read request to DRAM. 
else: # DMA_MVOUT s.state_ff <<= STATE_MVOUT_READ # Move to the next state: to issue a read request to SPM. @@ -218,7 +209,7 @@ def seq_state(): s.state_ff <<= STATE_MVIN_WRITE # Move to the next state: to write to SPM. elif s.state == STATE_MVIN_WRITE: # Writes to SPM. - if s.spm_dma_wval & s.spm_dma_wrdy: + if s.spm.write.val & s.spm.write.rdy: # Update the SPM address where write next cycle(+1) s.spm_addr_ff <<= s.spm_addr_reg + SpmAddrType( 1 ) # Update the number of words remaining to write to SPM. @@ -233,25 +224,25 @@ def seq_state(): s.word_idx_ff <<= s.word_idx_reg + b2( 1 ) elif s.state == STATE_MVOUT_READ: - if s.spm_dma_rval & s.spm_dma_rrdy: + if s.spm.read.val & s.spm.read.rdy: s.state_ff <<= STATE_MVOUT_RESP # Move to the next state: to receive a response from SPM. elif s.state == STATE_MVOUT_RESP: - if s.spm_dma_rresp_val & s.spm_dma_rresp_rdy: + if s.spm.read_resp.val & s.spm.read_resp.rdy: # Pack the response from SPM into a 128-bit beat by left-shifting. if s.word_idx_reg == b2( 0 ): # 1st word s.beat_ff <<= concat( s.beat_reg[spm_data_nbits:spm_data_nbits*4], - s.spm_dma_rresp_data ) + s.spm.read_resp.msg.data ) elif s.word_idx_reg == b2( 1 ): s.beat_ff <<= concat( s.beat_reg[spm_data_nbits*2:spm_data_nbits*4], - s.spm_dma_rresp_data, + s.spm.read_resp.msg.data, s.beat_reg[0:spm_data_nbits] ) elif s.word_idx_reg == b2( 2 ): s.beat_ff <<= concat( s.beat_reg[spm_data_nbits*3:spm_data_nbits*4], - s.spm_dma_rresp_data, + s.spm.read_resp.msg.data, s.beat_reg[0:spm_data_nbits*2] ) else: - s.beat_ff <<= concat( s.spm_dma_rresp_data, + s.beat_ff <<= concat( s.spm.read_resp.msg.data, s.beat_reg[0:spm_data_nbits*3] ) s.spm_addr_ff <<= s.spm_addr_reg + SpmAddrType( 1 ) @@ -292,7 +283,7 @@ def seq_state(): s.state_ff <<= STATE_MVOUT_READ elif s.state == STATE_DONE: - if s.dma_done_val & s.dma_done_rdy: + if s.dma_done.val & s.dma_done.rdy: s.state_ff <<= STATE_IDLE def line_trace( s ): diff --git a/mem/dma/test/DmaEngineRTL_test.py 
b/mem/dma/test/DmaEngineRTL_test.py index 28f30cc1..b81397da 100644 --- a/mem/dma/test/DmaEngineRTL_test.py +++ b/mem/dma/test/DmaEngineRTL_test.py @@ -14,13 +14,13 @@ def make_dut(): dut.apply(DefaultPassGroup()) dut.sim_reset() - dut.dma_cmd_val @= 0 - dut.dma_cmd_opcode @= 0 - dut.dma_cmd_dram_addr @= 0 - dut.dma_cmd_spm_addr @= 0 - dut.dma_cmd_bytes @= 0 - dut.dma_cmd_tag @= 0 - dut.dma_done_rdy @= 1 + dut.dma_cmd.val @= 0 + dut.dma_cmd.msg.opcode @= 0 + dut.dma_cmd.msg.dram_addr @= 0 + dut.dma_cmd.msg.spm_addr @= 0 + dut.dma_cmd.msg.nbytes @= 0 + dut.dma_cmd.msg.tag @= 0 + dut.dma_done.rdy @= 1 dut.dram_rd_req.rdy @= 1 dut.dram_rd_resp.val @= 0 @@ -28,10 +28,10 @@ def make_dut(): dut.dram_wr_req_rdy @= 1 dut.dram_wr_resp_val @= 1 - dut.spm_dma_wrdy @= 1 - dut.spm_dma_rrdy @= 1 - dut.spm_dma_rresp_val @= 0 - dut.spm_dma_rresp_data @= 0 + dut.spm.write.rdy @= 1 + dut.spm.read.rdy @= 1 + dut.spm.read_resp.val @= 0 + dut.spm.read_resp.msg.data @= 0 dut.sim_eval_combinational() return dut @@ -47,16 +47,16 @@ def issue_cmd(dut, opcode, dram_addr, spm_addr, nbytes, tag): nbytes: The number of bytes to transfer. tag: The tag of the DMA command. 
""" - dut.dma_cmd_val @= 1 - dut.dma_cmd_opcode @= opcode - dut.dma_cmd_dram_addr @= dram_addr - dut.dma_cmd_spm_addr @= spm_addr - dut.dma_cmd_bytes @= nbytes - dut.dma_cmd_tag @= tag + dut.dma_cmd.val @= 1 + dut.dma_cmd.msg.opcode @= opcode + dut.dma_cmd.msg.dram_addr @= dram_addr + dut.dma_cmd.msg.spm_addr @= spm_addr + dut.dma_cmd.msg.nbytes @= nbytes + dut.dma_cmd.msg.tag @= tag dut.sim_eval_combinational() - assert dut.dma_cmd_rdy + assert dut.dma_cmd.rdy dut.sim_tick() - dut.dma_cmd_val @= 0 + dut.dma_cmd.val @= 0 def test_dma_mvin_one_beat(): @@ -96,11 +96,11 @@ def test_dma_mvin_one_beat(): else: pending_resp = None - if dut.spm_dma_wval & dut.spm_dma_wrdy: - spm_writes.append((int(dut.spm_dma_waddr), int(dut.spm_dma_wdata))) + if dut.spm.write.val & dut.spm.write.rdy: + spm_writes.append((int(dut.spm.write.msg.addr), int(dut.spm.write.msg.data))) - if dut.dma_done_val: - assert int(dut.dma_done_tag) == 0x5a + if dut.dma_done.val: + assert int(dut.dma_done.msg.tag) == 0x5a break dut.sim_tick() @@ -143,15 +143,15 @@ def test_dma_mvout_partial_beat(): mem_writes = [] for _ in range(30): - dut.spm_dma_rresp_val @= 0 + dut.spm.read_resp.val @= 0 if pending_rresp is not None: - dut.spm_dma_rresp_val @= 1 - dut.spm_dma_rresp_data @= pending_rresp + dut.spm.read_resp.val @= 1 + dut.spm.read_resp.msg.data @= pending_rresp dut.sim_eval_combinational() - if dut.spm_dma_rval & dut.spm_dma_rrdy: - pending_rresp = spm[int(dut.spm_dma_raddr)] + if dut.spm.read.val & dut.spm.read.rdy: + pending_rresp = spm[int(dut.spm.read.msg.addr)] else: pending_rresp = None @@ -160,8 +160,8 @@ def test_dma_mvout_partial_beat(): int(dut.dram_wr_req_data), int(dut.dram_wr_req_mask))) - if dut.dma_done_val: - assert int(dut.dma_done_tag) == 0xa5 + if dut.dma_done.val: + assert int(dut.dma_done.msg.tag) == 0xa5 break dut.sim_tick() @@ -200,15 +200,15 @@ def test_dma_mvout_full_beat(): mem_writes = [] for _ in range(30): - dut.spm_dma_rresp_val @= 0 + dut.spm.read_resp.val @= 0 if 
pending_rresp is not None: - dut.spm_dma_rresp_val @= 1 - dut.spm_dma_rresp_data @= pending_rresp + dut.spm.read_resp.val @= 1 + dut.spm.read_resp.msg.data @= pending_rresp dut.sim_eval_combinational() - if dut.spm_dma_rval & dut.spm_dma_rrdy: - pending_rresp = spm[int(dut.spm_dma_raddr)] + if dut.spm.read.val & dut.spm.read.rdy: + pending_rresp = spm[int(dut.spm.read.msg.addr)] else: pending_rresp = None @@ -217,8 +217,8 @@ def test_dma_mvout_full_beat(): int(dut.dram_wr_req_data), int(dut.dram_wr_req_mask))) - if dut.dma_done_val: - assert int(dut.dma_done_tag) == 0xa5 + if dut.dma_done.val: + assert int(dut.dma_done.msg.tag) == 0xa5 break dut.sim_tick() From a1252028f0d1f26f00a1756b903eb532596b5a23 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Sun, 14 Jun 2026 11:10:36 +0800 Subject: [PATCH 16/46] [Fix] Use Outport instead of Wire in DmaWireIfcRTL to avoid the RTLIR error of pytml verilog backend. --- lib/basic/val_rdy/ifcs.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/basic/val_rdy/ifcs.py b/lib/basic/val_rdy/ifcs.py index 5c964665..388ac01a 100644 --- a/lib/basic/val_rdy/ifcs.py +++ b/lib/basic/val_rdy/ifcs.py @@ -87,10 +87,15 @@ def __str__( s ): return f"{s.req}|{s.resp}" class DmaWireIfcRTL( Interface ): + """Undirected val/rdy stub interface for disabled DMA paths. + + Uses OutPort for all signals so parent components can tie off unused + DMA stubs with structural connections during Verilog translation. + """ def construct( s, Type ): - s.msg = Wire( Type ) - s.val = Wire() - s.rdy = Wire() + s.msg = OutPort( Type ) + s.val = OutPort() + s.rdy = OutPort() s.trace_len = len(str(Type())) def __str__( s ): return valrdy_to_str( s.msg, s.val, s.rdy, s.trace_len ) From 4112ec3a18a33d121c59a77045942dc89686b93c Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Sun, 14 Jun 2026 11:23:25 +0800 Subject: [PATCH 17/46] [CleanUp] Remove the unnecessary ports. 
--- cgra/CgraTemplateRTL.py | 24 ------------- controller/ControllerRTL.py | 71 ------------------------------------- 2 files changed, 95 deletions(-) diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index bccd7960..0baea9b2 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -156,37 +156,13 @@ def construct(s, CgraPayloadType, # forwards DMA SPM access to the data memory. if has_dma_ports: s.dma_cmd = SendIfcRTL(DmaCmdType) - s.dma_cmd_val = s.dma_cmd.val - s.dma_cmd_rdy = s.dma_cmd.rdy - s.dma_cmd_opcode = s.dma_cmd.msg.opcode - s.dma_cmd_dram_addr = s.dma_cmd.msg.dram_addr - s.dma_cmd_spm_addr = s.dma_cmd.msg.spm_addr - s.dma_cmd_bytes = s.dma_cmd.msg.nbytes - s.dma_cmd_tag = s.dma_cmd.msg.tag s.dma_done = RecvIfcRTL(DmaDoneType) - s.dma_done_val = s.dma_done.val - s.dma_done_rdy = s.dma_done.rdy - s.dma_done_tag = s.dma_done.msg.tag s.dma_spm = DmaSpmMinionIfcRTL(DmaSpmWriteReqType, DmaSpmReadReqType, DmaSpmReadRespType) - # DMA write request interface. - s.spm_dma_wval = s.dma_spm.write.val # dma write request valid(write data into SPM) - s.spm_dma_wrdy = s.dma_spm.write.rdy - s.spm_dma_waddr = s.dma_spm.write.msg.addr - s.spm_dma_wdata = s.dma_spm.write.msg.data - s.spm_dma_wmask = s.dma_spm.write.msg.mask - - # DMA read response interface. - s.spm_dma_rval = s.dma_spm.read.val - s.spm_dma_rrdy = s.dma_spm.read.rdy - s.spm_dma_raddr = s.dma_spm.read.msg.addr - s.spm_dma_rresp_val = s.dma_spm.read_resp.val - s.spm_dma_rresp_rdy = s.dma_spm.read_resp.rdy - s.spm_dma_rresp_data = s.dma_spm.read_resp.msg.data if is_multi_cgra: # Use the largest CGRA shape to set the boundary ports for compatibility in the case of heterogeneous multi-cgra. diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index 078cda51..1ce27594 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -100,52 +100,18 @@ def construct(s, if has_dma_ports: # Controller-owned command path from CPU packets to the DMA engine. 
s.dma_cmd = SendIfcRTL(DmaCmdType) - s.dma_cmd_val = s.dma_cmd.val - s.dma_cmd_rdy = s.dma_cmd.rdy - s.dma_cmd_opcode = s.dma_cmd.msg.opcode - s.dma_cmd_dram_addr = s.dma_cmd.msg.dram_addr - s.dma_cmd_spm_addr = s.dma_cmd.msg.spm_addr - s.dma_cmd_bytes = s.dma_cmd.msg.nbytes - s.dma_cmd_tag = s.dma_cmd.msg.tag s.dma_done = RecvIfcRTL(DmaDoneType) - s.dma_done_val = s.dma_done.val - s.dma_done_rdy = s.dma_done.rdy - s.dma_done_tag = s.dma_done.msg.tag # DMA engine side of the controller-forwarded SPM access path. s.dma_spm_from_dma = DmaSpmMinionIfcRTL(DmaSpmWriteReqType, DmaSpmReadReqType, DmaSpmReadRespType) - s.spm_dma_wval = s.dma_spm_from_dma.write.val - s.spm_dma_wrdy = s.dma_spm_from_dma.write.rdy - s.spm_dma_waddr = s.dma_spm_from_dma.write.msg.addr - s.spm_dma_wdata = s.dma_spm_from_dma.write.msg.data - s.spm_dma_wmask = s.dma_spm_from_dma.write.msg.mask - - s.spm_dma_rval = s.dma_spm_from_dma.read.val - s.spm_dma_rrdy = s.dma_spm_from_dma.read.rdy - s.spm_dma_raddr = s.dma_spm_from_dma.read.msg.addr - s.spm_dma_rresp_val = s.dma_spm_from_dma.read_resp.val - s.spm_dma_rresp_rdy = s.dma_spm_from_dma.read_resp.rdy - s.spm_dma_rresp_data = s.dma_spm_from_dma.read_resp.msg.data # Data memory side of the same SPM access path. 
s.dma_spm_to_mem = DmaSpmMasterIfcRTL(DmaSpmWriteReqType, DmaSpmReadReqType, DmaSpmReadRespType) - s.send_to_mem_dma_wval = s.dma_spm_to_mem.write.val - s.recv_from_mem_dma_wrdy = s.dma_spm_to_mem.write.rdy - s.send_to_mem_dma_waddr = s.dma_spm_to_mem.write.msg.addr - s.send_to_mem_dma_wdata = s.dma_spm_to_mem.write.msg.data - s.send_to_mem_dma_wmask = s.dma_spm_to_mem.write.msg.mask - - s.send_to_mem_dma_rval = s.dma_spm_to_mem.read.val - s.recv_from_mem_dma_rrdy = s.dma_spm_to_mem.read.rdy - s.send_to_mem_dma_raddr = s.dma_spm_to_mem.read.msg.addr - s.recv_from_mem_dma_rresp_val = s.dma_spm_to_mem.read_resp.val - s.send_to_mem_dma_rresp_rdy = s.dma_spm_to_mem.read_resp.rdy - s.recv_from_mem_dma_rresp_data = s.dma_spm_to_mem.read_resp.msg.data else: s.dma_cmd = DmaWireIfcRTL(DmaCmdType) s.dma_cmd.rdy //= 0 @@ -171,43 +137,6 @@ def construct(s, s.dma_spm_to_mem.read_resp.val //= 0 s.dma_spm_to_mem.read_resp.msg //= DmaSpmReadRespType() - s.dma_cmd_val = Wire() - s.dma_cmd_rdy = Wire() - s.dma_cmd_opcode = Wire(DmaOpcodeType) - s.dma_cmd_dram_addr = Wire(DmaDramAddrType) - s.dma_cmd_spm_addr = Wire(DataAddrType) - s.dma_cmd_bytes = Wire(DmaBytesType) - s.dma_cmd_tag = Wire(DmaTagType) - - s.dma_done_val = Wire() - s.dma_done_rdy = Wire() - s.dma_done_tag = Wire(DmaTagType) - - s.spm_dma_wval = Wire() - s.spm_dma_wrdy = Wire() - s.spm_dma_waddr = Wire(DataAddrType) - s.spm_dma_wdata = Wire(DataPayloadType) - s.spm_dma_wmask = Wire(DmaMaskType) - - s.spm_dma_rval = Wire() - s.spm_dma_rrdy = Wire() - s.spm_dma_raddr = Wire(DataAddrType) - s.spm_dma_rresp_val = Wire() - s.spm_dma_rresp_rdy = Wire() - s.spm_dma_rresp_data = Wire(DataPayloadType) - - s.send_to_mem_dma_wval = Wire() - s.recv_from_mem_dma_wrdy = Wire() - s.send_to_mem_dma_waddr = Wire(DataAddrType) - s.send_to_mem_dma_wdata = Wire(DataPayloadType) - s.send_to_mem_dma_wmask = Wire(DmaMaskType) - - s.send_to_mem_dma_rval = Wire() - s.recv_from_mem_dma_rrdy = Wire() - s.send_to_mem_dma_raddr = 
Wire(DataAddrType) - s.recv_from_mem_dma_rresp_val = Wire() - s.send_to_mem_dma_rresp_rdy = Wire() - s.recv_from_mem_dma_rresp_data = Wire(DataPayloadType) # Component s.recv_from_tile_load_request_pkt_queue = ChannelRTL(InterCgraPktType, latency = 1) From 43da86dbb652b8e0483a546ffb287653978d63c8 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Sun, 14 Jun 2026 23:41:50 +0800 Subject: [PATCH 18/46] [Feature] Introduce DMA data structure and DMA-to-DRAM write request interface for enhanced data transfer capabilities. --- lib/basic/val_rdy/ifcs.py | 20 ++++++++++++++++++++ lib/messages.py | 24 ++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/lib/basic/val_rdy/ifcs.py b/lib/basic/val_rdy/ifcs.py index 388ac01a..44d1a508 100644 --- a/lib/basic/val_rdy/ifcs.py +++ b/lib/basic/val_rdy/ifcs.py @@ -161,3 +161,23 @@ def construct( s, WriteReqType, ReadReqType, ReadRespType ): s.read_resp = DmaWireIfcRTL( ReadRespType ) def __str__( s ): return f"wr:{s.write}|rd:{s.read}|resp:{s.read_resp}" + + +class DmaDramWrReqIfcRTL( Interface ): + """ + DMA-to-DRAM Write Request Interface. + + This interface is instantiated on the DMA side. + It initiates a write request to the DRAM. + + Direction: + - req: Output (Send). DMA sends write requests to DRAM. + """ + def construct( s, DramAddrType, DmaMemDataType, DmaMemMaskType ): + s.val = OutPort() + s.rdy = InPort() + s.addr = OutPort(DramAddrType) + s.data = OutPort(DmaMemDataType) + s.mask = OutPort(DmaMemMaskType) + def __str__( s ): + return f"val:{s.val}|rdy:{s.rdy}|addr:{s.addr}|data:{s.data}|mask:{s.mask}" \ No newline at end of file diff --git a/lib/messages.py b/lib/messages.py index e05ef1cd..509638d7 100644 --- a/lib/messages.py +++ b/lib/messages.py @@ -230,6 +230,30 @@ def str_func(s): namespace = {'__str__': str_func} ) +# A data structure to represent the data to be transferred by DMA. 
+def mk_dma_data(dram_data_nbits = 128, + dram_mask_nbits = 16, + spm_data_nbits = 32, + spm_mask_nbits = 4, + prefix = "DmaData"): + DramDataType = mk_bits(dram_data_nbits) + DramMaskType = mk_bits(dram_mask_nbits) + SpmDataType = mk_bits(spm_data_nbits) + SpmMaskType = mk_bits(spm_mask_nbits) + new_name = f"{prefix}_{dram_data_nbits}_{dram_mask_nbits}_{spm_data_nbits}" + + def str_func(s): + return f"dma_data(dram_data={s.dram_data},dram_mask={s.dram_mask},spm_data={s.spm_data})" + + return mk_bitstruct(new_name, { + 'dram_data': DramDataType, + 'dram_mask': DramMaskType, + 'spm_data': SpmDataType, + 'spm_mask': SpmMaskType, + }, + namespace = {'__str__': str_func} + ) + def mk_dma_done(tag_nbits = 8, prefix = "DmaDone"): From 6e647dd13d695137e87baef0653dde4532cb2d19 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Sun, 14 Jun 2026 23:43:41 +0800 Subject: [PATCH 19/46] [Refactor] Pass DmaCmdType and DmaDataType into DataMemController and then derives types from them --- mem/data/DataMemControllerRTL.py | 22 +++++++++++-------- .../test/DataMemControllerRTL_dma_test.py | 10 +++++---- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/mem/data/DataMemControllerRTL.py b/mem/data/DataMemControllerRTL.py index 3031f72f..0fdaa4c1 100644 --- a/mem/data/DataMemControllerRTL.py +++ b/mem/data/DataMemControllerRTL.py @@ -66,7 +66,9 @@ def construct(s, num_tiles = 16, mem_access_is_combinational = True, idTo2d_map = {0: [0, 0]}, - has_dma_ports = False): + has_dma_ports = False, + DmaCmdType = mk_dma_cmd(), + DmaDataType = mk_dma_data()): CgraPayloadType = NocPktType.get_field_type(kAttrPayload) DataType = CgraPayloadType.get_field_type(kAttrData) @@ -79,11 +81,13 @@ def construct(s, YType = mk_bits(max(clog2(multi_cgra_rows), 1)) AddrType = mk_bits(global_addr_nbits) PerBankAddrType = mk_bits(per_bank_addr_nbits) - DmaDataType = DataType.get_field_type(kAttrPayload) - DmaMaskType = mk_bits(max(1, DmaDataType.nbits // CHAR_BIT)) - DmaSpmWriteReqType = 
mk_dma_spm_write_req(AddrType.nbits, DmaDataType.nbits) - DmaSpmReadReqType = mk_dma_spm_read_req(AddrType.nbits) - DmaSpmReadRespType = mk_dma_spm_read_resp(DmaDataType.nbits) + + DmaSpmAddrType = DmaCmdType.get_field_type('spm_addr') + DmaMaskType = DmaDataType.get_field_type('spm_mask') + DmaSpmDataType = DmaDataType.get_field_type('spm_data') + DmaSpmWriteReqType = mk_dma_spm_write_req(DmaSpmAddrType.nbits, DmaSpmDataType.nbits) + DmaSpmReadReqType = mk_dma_spm_read_req(DmaSpmAddrType.nbits) + DmaSpmReadRespType = mk_dma_spm_read_resp(DmaSpmDataType.nbits) NocRemoteSrcPortType = NocPktType.get_field_type(kAttrRemoteSrcPort) s.num_banks_per_cgra = num_banks_per_cgra s.has_dma_ports = has_dma_ports @@ -302,7 +306,7 @@ def assemble_xbar_pkt(): dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) - recv_raddr_from_dma = s.dma_spm.read.msg.addr + recv_raddr_from_dma = trunc(s.dma_spm.read.msg.addr, AddrType) if (recv_raddr_from_dma >= s.address_lower) & (recv_raddr_from_dma <= s.address_upper): bank_index_load_from_dma = trunc((recv_raddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutRdType) else: @@ -315,7 +319,7 @@ def assemble_xbar_pkt(): 0, # src_tile 0) # remote_src_port - recv_waddr_from_dma = s.dma_spm.write.msg.addr + recv_waddr_from_dma = trunc(s.dma_spm.write.msg.addr, AddrType) if (recv_waddr_from_dma >= s.address_lower) & (recv_waddr_from_dma <= s.address_upper): bank_index_store_from_dma = trunc((recv_waddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutWrType) else: @@ -395,7 +399,7 @@ def update_all(): s.dma_spm.write.rdy @= 0 s.dma_spm.read.rdy @= 0 s.dma_spm.read_resp.val @= 0 - s.dma_spm.read_resp.msg @= DmaSpmReadRespType(DmaDataType(0)) + s.dma_spm.read_resp.msg @= DmaSpmReadRespType(DmaSpmDataType(0)) s.send_to_noc_load_request_pkt.msg @= \ NocPktType(0, # src diff --git a/mem/data/test/DataMemControllerRTL_dma_test.py b/mem/data/test/DataMemControllerRTL_dma_test.py 
index add8ce17..fa85d685 100644 --- a/mem/data/test/DataMemControllerRTL_dma_test.py +++ b/mem/data/test/DataMemControllerRTL_dma_test.py @@ -43,12 +43,13 @@ def drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr dut.send_to_noc_load_response_pkt.rdy @= 1 dut.send_to_noc_store_pkt.rdy @= 1 + DmaSpmAddrType = mk_dma_cmd().get_field_type('spm_addr') dut.dma_spm.write.val @= 0 - dut.dma_spm.write.msg.addr @= DataAddrType(0) + dut.dma_spm.write.msg.addr @= DmaSpmAddrType(0) dut.dma_spm.write.msg.data @= 0 dut.dma_spm.write.msg.mask @= 0 dut.dma_spm.read.val @= 0 - dut.dma_spm.read.msg.addr @= DataAddrType(0) + dut.dma_spm.read.msg.addr @= DmaSpmAddrType(0) dut.dma_spm.read_resp.rdy @= 1 dut.cgra_id @= 0 @@ -89,8 +90,9 @@ def test_dma_ports_write_then_read(): dut.sim_reset() drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles) + DmaSpmAddrType = mk_dma_cmd().get_field_type('spm_addr') dut.dma_spm.write.val @= 1 - dut.dma_spm.write.msg.addr @= DataAddrType(3) + dut.dma_spm.write.msg.addr @= DmaSpmAddrType(3) dut.dma_spm.write.msg.data @= 0xaaaabbbb dut.dma_spm.write.msg.mask @= 0xf dut.sim_eval_combinational() @@ -99,7 +101,7 @@ def test_dma_ports_write_then_read(): dut.dma_spm.write.val @= 0 dut.dma_spm.read.val @= 1 - dut.dma_spm.read.msg.addr @= DataAddrType(3) + dut.dma_spm.read.msg.addr @= DmaSpmAddrType(3) seen_response = False for _ in range(10): From 78a158722ad28e406d1f33e066af48de077b2fd4 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Sun, 14 Jun 2026 23:44:30 +0800 Subject: [PATCH 20/46] [Refactor] Update DmaEngineRTL to use DmaDramWrReqIfcRTL for DRAM write requests and adjust related signal handling for clarity and consistency. 
--- mem/dma/DmaEngineRTL.py | 34 ++++++++++++++++--------------- mem/dma/test/DmaEngineRTL_test.py | 18 ++++++++-------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index f8d91568..c4ffad5b 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -11,6 +11,7 @@ from ...lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL from ...lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL from ...lib.basic.val_rdy.ifcs import DmaSpmMasterIfcRTL +from ...lib.basic.val_rdy.ifcs import DmaDramWrReqIfcRTL from ...lib.messages import * from ...lib.util.common import DMA_MVIN, DMA_MVOUT, CHAR_BIT, StateType, STATE_IDLE, STATE_MVIN_REQ, STATE_MVIN_RESP, STATE_MVIN_WRITE, STATE_MVOUT_READ, STATE_MVOUT_RESP, STATE_MVOUT_WRITE, STATE_MVOUT_WAIT, STATE_DONE @@ -40,13 +41,13 @@ class DmaEngineRTL( Component ): def construct( s, spm_data_nbits = 32, # Bitwidth of a single SPM word - mem_data_nbits = 128, # Bitwidth of an external memory beat + dram_data_nbits = 128, # Bitwidth of an external memory beat dram_addr_nbits = 64, # Bitwidth of DRAM addresses spm_addr_nbits = 32, # Bitwidth of SPM addresses bytes_nbits = 32, # Bitwidth for transfer size in bytes tag_nbits = 8 ): # Bitwidth for command tracking tags - assert mem_data_nbits == spm_data_nbits * 4 + assert dram_data_nbits == spm_data_nbits * 4 OpcodeType = mk_bits( 3 ) DramAddrType = mk_bits( dram_addr_nbits ) @@ -54,10 +55,10 @@ def construct( s, BytesType = mk_bits( bytes_nbits ) TagType = mk_bits( tag_nbits ) SpmDataType = mk_bits( spm_data_nbits ) - MemDataType = mk_bits( mem_data_nbits ) + MemDataType = mk_bits( dram_data_nbits ) # Byte mask for SPM write SpmMaskType = mk_bits( spm_data_nbits // CHAR_BIT ) - MemMaskType = mk_bits( mem_data_nbits // CHAR_BIT ) + MemMaskType = mk_bits( dram_data_nbits // CHAR_BIT ) DmaCmdType = mk_dma_cmd(dram_addr_nbits, spm_addr_nbits, bytes_nbits, tag_nbits) DmaDoneType = 
mk_dma_done(tag_nbits) DmaSpmWriteReqType = mk_dma_spm_write_req(spm_addr_nbits, spm_data_nbits) @@ -78,11 +79,7 @@ def construct( s, s.dram_rd_resp = RecvIfcRTL( MemDataType ) # Request to write to DRAM - s.dram_wr_req_val = OutPort() - s.dram_wr_req_rdy = InPort() - s.dram_wr_req_addr = OutPort( DramAddrType ) - s.dram_wr_req_data = OutPort( MemDataType ) - s.dram_wr_req_mask = OutPort( MemMaskType ) + s.dram_wr_req = DmaDramWrReqIfcRTL(DramAddrType, MemDataType, MemMaskType) s.dram_wr_resp_val = InPort() s.dram_wr_resp_rdy = OutPort() @@ -138,10 +135,15 @@ def comb_outputs(): s.dram_rd_req.msg @= s.dram_addr_reg s.dram_rd_resp.rdy @= s.state == STATE_MVIN_RESP - s.dram_wr_req_val @= s.state == STATE_MVOUT_WRITE - s.dram_wr_req_addr @= s.dram_addr_reg - s.dram_wr_req_data @= s.beat_reg - s.dram_wr_req_mask @= s.wr_mask_reg + # s.dram_wr_req_val @= s.state == STATE_MVOUT_WRITE + # s.dram_wr_req_addr @= s.dram_addr_reg + # s.dram_wr_req_data @= s.beat_reg + # s.dram_wr_req_mask @= s.wr_mask_reg + s.dram_wr_req.val @= s.state == STATE_MVOUT_WRITE + s.dram_wr_req.addr @= s.dram_addr_reg + s.dram_wr_req.data @= s.beat_reg + s.dram_wr_req.mask @= s.wr_mask_reg + s.dram_wr_resp_rdy @= s.state == STATE_MVOUT_WAIT spm_wdata = SpmDataType(0) @@ -199,7 +201,7 @@ def seq_state(): elif s.state == STATE_MVIN_REQ: # Issues a read request to DRAM. if s.dram_rd_req.val & s.dram_rd_req.rdy: - s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( mem_data_nbits // CHAR_BIT ) + s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( dram_data_nbits // CHAR_BIT ) s.state_ff <<= STATE_MVIN_RESP elif s.state == STATE_MVIN_RESP: # Receives a response from DRAM. 
@@ -266,13 +268,13 @@ def seq_state(): s.state_ff <<= STATE_MVOUT_READ elif s.state == STATE_MVOUT_WRITE: - if s.dram_wr_req_val & s.dram_wr_req_rdy: + if s.dram_wr_req.val & s.dram_wr_req.rdy: s.state_ff <<= STATE_MVOUT_WAIT elif s.state == STATE_MVOUT_WAIT: if s.dram_wr_resp_val & s.dram_wr_resp_rdy: # Turn to the +16 address after writing 16 bytes data. - s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( mem_data_nbits // CHAR_BIT ) + s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( dram_data_nbits // CHAR_BIT ) s.beat_ff <<= MemDataType( 0 ) s.word_idx_ff <<= b2( 0 ) s.wr_mask_ff <<= MemMaskType( 0 ) diff --git a/mem/dma/test/DmaEngineRTL_test.py b/mem/dma/test/DmaEngineRTL_test.py index b81397da..285b6def 100644 --- a/mem/dma/test/DmaEngineRTL_test.py +++ b/mem/dma/test/DmaEngineRTL_test.py @@ -25,7 +25,7 @@ def make_dut(): dut.dram_rd_req.rdy @= 1 dut.dram_rd_resp.val @= 0 dut.dram_rd_resp.msg @= 0 - dut.dram_wr_req_rdy @= 1 + dut.dram_wr_req.rdy @= 1 dut.dram_wr_resp_val @= 1 dut.spm.write.rdy @= 1 @@ -155,10 +155,10 @@ def test_dma_mvout_partial_beat(): else: pending_rresp = None - if dut.dram_wr_req_val & dut.dram_wr_req_rdy: - mem_writes.append((int(dut.dram_wr_req_addr), - int(dut.dram_wr_req_data), - int(dut.dram_wr_req_mask))) + if dut.dram_wr_req.val & dut.dram_wr_req.rdy: + mem_writes.append((int(dut.dram_wr_req.addr), + int(dut.dram_wr_req.data), + int(dut.dram_wr_req.mask))) if dut.dma_done.val: assert int(dut.dma_done.msg.tag) == 0xa5 @@ -212,10 +212,10 @@ def test_dma_mvout_full_beat(): else: pending_rresp = None - if dut.dram_wr_req_val & dut.dram_wr_req_rdy: - mem_writes.append((int(dut.dram_wr_req_addr), - int(dut.dram_wr_req_data), - int(dut.dram_wr_req_mask))) + if dut.dram_wr_req.val & dut.dram_wr_req.rdy: + mem_writes.append((int(dut.dram_wr_req.addr), + int(dut.dram_wr_req.data), + int(dut.dram_wr_req.mask))) if dut.dma_done.val: assert int(dut.dma_done.msg.tag) == 0xa5 From 6fb7e50355cd9b8ebb419eb3da4770c758114286 Mon Sep 17 00:00:00 
2001 From: BenkangPeng Date: Sun, 14 Jun 2026 23:45:25 +0800 Subject: [PATCH 21/46] [Refactor] Enhance DMA integration in CgraTemplateRTL and ControllerRTL by passing DmaDataType and DmaCmdType as parameters, and updating related type definitions for improved clarity and consistency. --- cgra/CgraTemplateRTL.py | 33 ++++++++++++++++---------------- controller/ControllerRTL.py | 38 ++++++++++++++++++------------------- 2 files changed, 35 insertions(+), 36 deletions(-) diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index 0baea9b2..33c85c16 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -85,7 +85,9 @@ def construct(s, CgraPayloadType, provided_max_per_cgra_cols = None, provided_max_num_rd_tiles = None, provided_max_num_wr_tiles = None, - has_dma_ports = False): + has_dma_ports = False, + DmaDataType = mk_dma_data(), + DmaCmdType = mk_dma_cmd()): """ provided_max_per_cgra_rows: the row number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch. provided_max_per_cgra_cols: the column number of the largest cgra in the multi heterogeneous cgra architecture. None for single cgra arch or Homogeneous multi-cgra arch. 
@@ -128,21 +130,14 @@ def construct(s, CgraPayloadType, CtrlRingPos = mk_ring_pos(max_num_tiles + 1) CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) DataAddrType = mk_bits(clog2(data_mem_size_global)) - DmaDataType = DataType.get_field_type(kAttrPayload) - DmaMaskType = mk_bits(max(1, DmaDataType.nbits // CHAR_BIT)) - DmaOpcodeType = mk_bits(3) - DmaDramAddrType = mk_bits(64) - DmaBytesType = mk_bits(32) - DmaTagType = mk_bits(8) - DmaCmdType = mk_dma_cmd(DmaDramAddrType.nbits, - DataAddrType.nbits, - DmaBytesType.nbits, - DmaTagType.nbits) + DmaTagType = DmaCmdType.get_field_type('tag') + DmaSpmDataType = DmaDataType.get_field_type('spm_data') + DmaSpmAddrType = DmaCmdType.get_field_type('spm_addr') DmaDoneType = mk_dma_done(DmaTagType.nbits) - DmaSpmWriteReqType = mk_dma_spm_write_req(DataAddrType.nbits, - DmaDataType.nbits) - DmaSpmReadReqType = mk_dma_spm_read_req(DataAddrType.nbits) - DmaSpmReadRespType = mk_dma_spm_read_resp(DmaDataType.nbits) + DmaSpmWriteReqType = mk_dma_spm_write_req(DmaSpmAddrType.nbits, + DmaSpmDataType.nbits) + DmaSpmReadReqType = mk_dma_spm_read_req(DmaSpmAddrType.nbits) + DmaSpmReadRespType = mk_dma_spm_read_resp(DmaSpmDataType.nbits) assert(data_mem_size_per_bank * num_banks_per_cgra <= \ data_mem_size_global) @@ -198,12 +193,16 @@ def construct(s, CgraPayloadType, max_num_tiles, mem_access_is_combinational, idTo2d_map, - has_dma_ports) + has_dma_ports, + DmaCmdType, + DmaDataType) s.cgra_id = InPort(CgraIdType) s.controller = ControllerRTL(NocPktType, multi_cgra_rows, multi_cgra_columns, max_num_tiles, controller2addr_map, idTo2d_map, - has_dma_ports) + has_dma_ports, + DmaDataType, + DmaCmdType) # Connects controller id. s.controller.cgra_id //= s.cgra_id # An additional router for controller to receive CMD_COMPLETE signal from Ring to CPU. 
diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index 1ce27594..2591d1fa 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -32,7 +32,9 @@ def construct(s, num_tiles, controller2addr_map, idTo2d_map, - has_dma_ports = False): + has_dma_ports = False, + DmaDataType = mk_dma_data(), + DmaCmdType = mk_dma_cmd()): # Derives types from InterCgraPktType. CgraPayloadType = InterCgraPktType.get_field_type(kAttrPayload) @@ -56,24 +58,22 @@ def construct(s, YType = mk_bits(max(clog2(multi_cgra_rows), 1)) TileIdType = mk_bits(clog2(num_tiles + 1)) ControllerXbarPktType = mk_controller_noc_xbar_pkt(InterCgraPktType) - DmaOpcodeType = mk_bits(3) - DmaDramAddrType = mk_bits(64) - DmaBytesType = mk_bits(32) - DmaTagType = mk_bits(8) - DmaDramAddrPartType = mk_bits(32) - DmaMaskType = mk_bits(max(1, DataPayloadType.nbits // CHAR_BIT)) - DmaCmdType = mk_dma_cmd(DmaDramAddrType.nbits, - DataAddrType.nbits, - DmaBytesType.nbits, - DmaTagType.nbits) + DmaOpcodeType = DmaCmdType.get_field_type('opcode') + DmaDramAddrType = DmaCmdType.get_field_type('dram_addr') + DmaSpmAddrType = DmaCmdType.get_field_type('spm_addr') + DmaBytesType = DmaCmdType.get_field_type('nbytes') + DmaTagType = DmaCmdType.get_field_type('tag') + DmaSpmDataType = DmaDataType.get_field_type('spm_data') + # Lower and higher 32 bits of the DRAM address. 
+ DmaDramAddrPartType = mk_bits(DmaDramAddrType.nbits // 2) DmaDoneType = mk_dma_done(DmaTagType.nbits) - DmaSpmWriteReqType = mk_dma_spm_write_req(DataAddrType.nbits, - DataPayloadType.nbits) - DmaSpmReadReqType = mk_dma_spm_read_req(DataAddrType.nbits) - DmaSpmReadRespType = mk_dma_spm_read_resp(DataPayloadType.nbits) + DmaSpmWriteReqType = mk_dma_spm_write_req(DmaSpmAddrType.nbits, + DmaSpmDataType.nbits) + DmaSpmReadReqType = mk_dma_spm_read_req(DmaSpmAddrType.nbits) + DmaSpmReadRespType = mk_dma_spm_read_resp(DmaSpmDataType.nbits) if has_dma_ports: - assert DataPayloadType.nbits == 32 + assert DmaSpmDataType.nbits == 32 # Interface s.cgra_id = InPort(CgraIdType) @@ -188,7 +188,7 @@ def construct(s, s.dma_dram_addr_lo = Wire(DmaDramAddrPartType) s.dma_dram_addr_hi = Wire(DmaDramAddrPartType) - s.dma_spm_addr = Wire(DataAddrType) + s.dma_spm_addr = Wire(DmaSpmAddrType) s.dma_bytes = Wire(DmaBytesType) s.dma_tag = Wire(DmaTagType) @@ -212,7 +212,7 @@ def update_dma_cmd_regs(): if s.reset: s.dma_dram_addr_lo <<= DmaDramAddrPartType(0) s.dma_dram_addr_hi <<= DmaDramAddrPartType(0) - s.dma_spm_addr <<= DataAddrType(0) + s.dma_spm_addr <<= DmaSpmAddrType(0) s.dma_bytes <<= DmaBytesType(0) s.dma_tag <<= DmaTagType(0) elif has_dma_ports: @@ -225,7 +225,7 @@ def update_dma_cmd_regs(): elif cpu_cmd == CMD_DMA_CONFIG_DRAM_ADDR_HI: s.dma_dram_addr_hi <<= DmaDramAddrPartType(cpu_data) elif cpu_cmd == CMD_DMA_CONFIG_SPM_ADDR: - s.dma_spm_addr <<= cpu_payload.data_addr + s.dma_spm_addr <<= zext(cpu_payload.data_addr, DmaSpmAddrType) elif cpu_cmd == CMD_DMA_CONFIG_BYTES: s.dma_bytes <<= DmaBytesType(cpu_data) elif cpu_cmd == CMD_DMA_CONFIG_TAG: From a7618d8b3b59fabb9aaabda4d8c1371691e57cbe Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Sun, 14 Jun 2026 23:46:11 +0800 Subject: [PATCH 22/46] [Refactor] Update CgraDmaRTL to utilize DmaDramWrReqIfcRTL for DRAM write requests, enhancing type definitions for DmaCmdType and DmaDataType --- cgra/CgraDmaRTL.py | 46 
+++++++++++++++++++++++------------- cgra/test/CgraDmaRTL_test.py | 10 ++++---- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/cgra/CgraDmaRTL.py b/cgra/CgraDmaRTL.py index a63d6331..bf8a8a87 100644 --- a/cgra/CgraDmaRTL.py +++ b/cgra/CgraDmaRTL.py @@ -12,6 +12,7 @@ from .CgraTemplateRTL import CgraTemplateRTL from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL from ..lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL +from ..lib.basic.val_rdy.ifcs import DmaDramWrReqIfcRTL from ..lib.messages import * from ..lib.util.data_struct_attr import * from ..mem.dma.DmaEngineRTL import DmaEngineRTL @@ -69,9 +70,18 @@ def construct(s, CgraPayloadType, CgraIdType = mk_cgra_id_type(multi_cgra_columns, multi_cgra_rows) DataAddrType = mk_bits(clog2(data_mem_size_global)) - DmaDramAddrType = mk_bits(64) - DmaMemDataType = mk_bits(128) # Write/Read 128 bits data per beat from/to DRAM - DmaMemMaskType = mk_bits(16) + DmaCmdType = mk_dma_cmd(dram_addr_nbits = 64, + spm_addr_nbits = 32, + bytes_nbits = 32, + tag_nbits = 8) + + DmaDataType = mk_dma_data(dram_data_nbits = 128, + dram_mask_nbits = 16, + spm_data_nbits = 32) + + DmaDramAddrType = DmaCmdType.get_field_type('dram_addr') + DmaMemDataType = DmaDataType.get_field_type('dram_data') + DmaMemMaskType = DmaDataType.get_field_type('dram_mask') # Existing CGRA-facing interfaces. 
# CGRA <-> CPU @@ -102,11 +112,7 @@ def construct(s, CgraPayloadType, s.dram_rd_req = SendIfcRTL(DmaDramAddrType) s.dram_rd_resp = RecvIfcRTL(DmaMemDataType) - s.dram_wr_req_val = OutPort() - s.dram_wr_req_rdy = InPort() - s.dram_wr_req_addr = OutPort(DmaDramAddrType) - s.dram_wr_req_data = OutPort(DmaMemDataType) - s.dram_wr_req_mask = OutPort(DmaMemMaskType) # Masks for wrting DRAM + s.dram_wr_req = DmaDramWrReqIfcRTL(DmaDramAddrType, DmaMemDataType, DmaMemMaskType) s.dram_wr_resp_val = InPort() s.dram_wr_resp_rdy = OutPort() @@ -128,10 +134,20 @@ def construct(s, CgraPayloadType, provided_max_per_cgra_cols, provided_max_num_rd_tiles, provided_max_num_wr_tiles, - has_dma_ports = True) - - s.dma = DmaEngineRTL(spm_data_nbits = data_bitwidth, - spm_addr_nbits = clog2(data_mem_size_global)) + has_dma_ports = True, + DmaDataType = DmaDataType, + DmaCmdType = DmaCmdType) + + DmaSpmDataType = DmaDataType.get_field_type('spm_data') + DmaSpmAddrType = DmaCmdType.get_field_type('spm_addr') + DmaBytesType = DmaCmdType.get_field_type('nbytes') + DmaTagType = DmaCmdType.get_field_type('tag') + s.dma = DmaEngineRTL(spm_data_nbits = DmaSpmDataType.nbits, + dram_data_nbits = DmaMemDataType.nbits, + dram_addr_nbits = DmaDramAddrType.nbits, + spm_addr_nbits = DmaSpmAddrType.nbits, + bytes_nbits = DmaBytesType.nbits, + tag_nbits = DmaTagType.nbits) # CGRA passthrough connections. 
@@ -167,11 +183,7 @@ def construct(s, CgraPayloadType, s.dram_rd_req //= s.dma.dram_rd_req s.dram_rd_resp //= s.dma.dram_rd_resp - s.dram_wr_req_val //= s.dma.dram_wr_req_val - s.dram_wr_req_rdy //= s.dma.dram_wr_req_rdy - s.dram_wr_req_addr //= s.dma.dram_wr_req_addr - s.dram_wr_req_data //= s.dma.dram_wr_req_data - s.dram_wr_req_mask //= s.dma.dram_wr_req_mask + s.dram_wr_req //= s.dma.dram_wr_req s.dram_wr_resp_val //= s.dma.dram_wr_resp_val s.dram_wr_resp_rdy //= s.dma.dram_wr_resp_rdy diff --git a/cgra/test/CgraDmaRTL_test.py b/cgra/test/CgraDmaRTL_test.py index 4f4781bb..32f9f122 100644 --- a/cgra/test/CgraDmaRTL_test.py +++ b/cgra/test/CgraDmaRTL_test.py @@ -155,7 +155,7 @@ def test_cgra_dma_mvin_to_local_spm(): dut.dram_rd_req.rdy @= 1 dut.dram_rd_resp.val @= 0 dut.dram_rd_resp.msg @= 0 - dut.dram_wr_req_rdy @= 1 + dut.dram_wr_req.rdy @= 1 dut.dram_wr_resp_val @= 0 # Read 16 bytes from DRAM address 0x1000 and write them to SPM words 0..3. @@ -252,7 +252,7 @@ def test_cgra_dma_mvout_from_local_spm(): dut.dram_rd_req.rdy @= 1 dut.dram_rd_resp.val @= 0 dut.dram_rd_resp.msg @= 0 - dut.dram_wr_req_rdy @= 1 + dut.dram_wr_req.rdy @= 1 dut.dram_wr_resp_val @= 0 # Read SPM words 0..3 and write 16 bytes to DRAM address 0x2000. 
@@ -273,9 +273,9 @@ def test_cgra_dma_mvout_from_local_spm(): dut.sim_eval_combinational() - if dut.dram_wr_req_val & dut.dram_wr_req_rdy: - assert dut.dram_wr_req_addr == 0x2000 - assert dut.dram_wr_req_data == expected_beat + if dut.dram_wr_req.val & dut.dram_wr_req.rdy: + assert dut.dram_wr_req.addr == 0x2000 + assert dut.dram_wr_req.data == expected_beat pending_wr_resp = True if observed_dma_done(dut, 0x44): From 1bf3b79ecf1dfba18b1efd45efa366b27bb4967f Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Mon, 15 Jun 2026 10:19:08 +0800 Subject: [PATCH 23/46] [Fix] Fix the bitwidth mismatch error between DataType and DmaSpmDataType --- mem/data/DataMemControllerRTL.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mem/data/DataMemControllerRTL.py b/mem/data/DataMemControllerRTL.py index 0fdaa4c1..f2523700 100644 --- a/mem/data/DataMemControllerRTL.py +++ b/mem/data/DataMemControllerRTL.py @@ -72,6 +72,7 @@ def construct(s, CgraPayloadType = NocPktType.get_field_type(kAttrPayload) DataType = CgraPayloadType.get_field_type(kAttrData) + PayloadType = DataType.get_field_type(kAttrPayload) # Constants. 
global_addr_nbits = clog2(data_mem_size_global) per_bank_addr_nbits = clog2(data_mem_size_per_bank) @@ -82,9 +83,9 @@ def construct(s, AddrType = mk_bits(global_addr_nbits) PerBankAddrType = mk_bits(per_bank_addr_nbits) - DmaSpmAddrType = DmaCmdType.get_field_type('spm_addr') - DmaMaskType = DmaDataType.get_field_type('spm_mask') - DmaSpmDataType = DmaDataType.get_field_type('spm_data') + DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr) + DmaMaskType = DmaDataType.get_field_type(kAttrSpmMask) + DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData) DmaSpmWriteReqType = mk_dma_spm_write_req(DmaSpmAddrType.nbits, DmaSpmDataType.nbits) DmaSpmReadReqType = mk_dma_spm_read_req(DmaSpmAddrType.nbits) DmaSpmReadRespType = mk_dma_spm_read_resp(DmaSpmDataType.nbits) @@ -327,7 +328,7 @@ def assemble_xbar_pkt(): s.wr_pkt[dma_wr_idx] @= MemWritePktType(dma_wr_idx, # src bank_index_store_from_dma, # dst recv_waddr_from_dma, # addr - DataType(s.dma_spm.write.msg.data, 1, 0, 0), + DataType(zext(s.dma_spm.write.msg.data, PayloadType), 1, 0, 0), 0, # src_cgra 0, # src_tile 0) # remote_src_port @@ -488,7 +489,7 @@ def update_all(): s.response_crossbar.send[i].rdy @= s.send_to_noc_load_response_pkt.rdy elif has_dma_ports: s.dma_spm.read_resp.msg @= DmaSpmReadRespType( - s.response_crossbar.send[i].msg.data.payload) + trunc(s.response_crossbar.send[i].msg.data.payload, DmaSpmDataType)) s.dma_spm.read_resp.val @= s.response_crossbar.send[i].val s.response_crossbar.send[i].rdy @= s.dma_spm.read_resp.rdy From d4ce9814956e134e239169d3bcc53cfa052e4c8c Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Mon, 15 Jun 2026 10:22:18 +0800 Subject: [PATCH 24/46] [CleanUp] Update DMA attribute references to use new constants for improved clarity --- cgra/CgraDmaRTL.py | 14 +++++++------- cgra/CgraTemplateRTL.py | 6 +++--- controller/ControllerRTL.py | 12 ++++++------ fu/single/ExtractPredicateRTL.py | 4 ++-- fu/single/LoopControlRTL.py | 6 +++--- lib/util/data_struct_attr.py | 11 
+++++++++++ mem/data/test/DataMemControllerRTL_dma_test.py | 4 ++-- 7 files changed, 34 insertions(+), 23 deletions(-) diff --git a/cgra/CgraDmaRTL.py b/cgra/CgraDmaRTL.py index bf8a8a87..89cfb49e 100644 --- a/cgra/CgraDmaRTL.py +++ b/cgra/CgraDmaRTL.py @@ -79,9 +79,9 @@ def construct(s, CgraPayloadType, dram_mask_nbits = 16, spm_data_nbits = 32) - DmaDramAddrType = DmaCmdType.get_field_type('dram_addr') - DmaMemDataType = DmaDataType.get_field_type('dram_data') - DmaMemMaskType = DmaDataType.get_field_type('dram_mask') + DmaDramAddrType = DmaCmdType.get_field_type(kAttrDramAddr) + DmaMemDataType = DmaDataType.get_field_type(kAttrDramData) + DmaMemMaskType = DmaDataType.get_field_type(kAttrDramMask) # Existing CGRA-facing interfaces. # CGRA <-> CPU @@ -138,10 +138,10 @@ def construct(s, CgraPayloadType, DmaDataType = DmaDataType, DmaCmdType = DmaCmdType) - DmaSpmDataType = DmaDataType.get_field_type('spm_data') - DmaSpmAddrType = DmaCmdType.get_field_type('spm_addr') - DmaBytesType = DmaCmdType.get_field_type('nbytes') - DmaTagType = DmaCmdType.get_field_type('tag') + DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData) + DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr) + DmaBytesType = DmaCmdType.get_field_type(kAttrNBytes) + DmaTagType = DmaCmdType.get_field_type(kAttrTag) s.dma = DmaEngineRTL(spm_data_nbits = DmaSpmDataType.nbits, dram_data_nbits = DmaMemDataType.nbits, dram_addr_nbits = DmaDramAddrType.nbits, diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index 33c85c16..8c6fc719 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -130,9 +130,9 @@ def construct(s, CgraPayloadType, CtrlRingPos = mk_ring_pos(max_num_tiles + 1) CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) DataAddrType = mk_bits(clog2(data_mem_size_global)) - DmaTagType = DmaCmdType.get_field_type('tag') - DmaSpmDataType = DmaDataType.get_field_type('spm_data') - DmaSpmAddrType = DmaCmdType.get_field_type('spm_addr') + DmaTagType = 
DmaCmdType.get_field_type(kAttrTag) + DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData) + DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr) DmaDoneType = mk_dma_done(DmaTagType.nbits) DmaSpmWriteReqType = mk_dma_spm_write_req(DmaSpmAddrType.nbits, DmaSpmDataType.nbits) diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index 2591d1fa..94c79184 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -58,12 +58,12 @@ def construct(s, YType = mk_bits(max(clog2(multi_cgra_rows), 1)) TileIdType = mk_bits(clog2(num_tiles + 1)) ControllerXbarPktType = mk_controller_noc_xbar_pkt(InterCgraPktType) - DmaOpcodeType = DmaCmdType.get_field_type('opcode') - DmaDramAddrType = DmaCmdType.get_field_type('dram_addr') - DmaSpmAddrType = DmaCmdType.get_field_type('spm_addr') - DmaBytesType = DmaCmdType.get_field_type('nbytes') - DmaTagType = DmaCmdType.get_field_type('tag') - DmaSpmDataType = DmaDataType.get_field_type('spm_data') + DmaOpcodeType = DmaCmdType.get_field_type(kAttrOpcode) + DmaDramAddrType = DmaCmdType.get_field_type(kAttrDramAddr) + DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr) + DmaBytesType = DmaCmdType.get_field_type(kAttrNBytes) + DmaTagType = DmaCmdType.get_field_type(kAttrTag) + DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData) # Lower and higher 32 bits of the DRAM address. 
DmaDramAddrPartType = mk_bits(DmaDramAddrType.nbits // 2) DmaDoneType = mk_dma_done(DmaTagType.nbits) diff --git a/fu/single/ExtractPredicateRTL.py b/fu/single/ExtractPredicateRTL.py index 460e598e..15e5c562 100644 --- a/fu/single/ExtractPredicateRTL.py +++ b/fu/single/ExtractPredicateRTL.py @@ -15,7 +15,7 @@ from pymtl3 import * from ..basic.Fu import Fu from ...lib.opt_type import * - +from ...lib.util.data_struct_attr import * class ExtractPredicateRTL(Fu): def construct(s, CtrlPktType, num_inports, num_outports, vector_factor_power = 0): @@ -60,7 +60,7 @@ def comb_logic(): # When loop is running (predicate=1) -> payload=1 # When loop terminates (predicate=0) -> payload=0 # Downstream NOT will invert: running->0 (no RET), done->1 (trigger RET) - s.send_out[0].msg.payload @= zext(s.recv_in[s.in0_idx].msg.predicate, s.DataType.get_field_type('payload')) + s.send_out[0].msg.payload @= zext(s.recv_in[s.in0_idx].msg.predicate, s.DataType.get_field_type(kAttrPayload)) s.send_out[0].msg.predicate @= 1 s.send_out[0].val @= s.recv_in[s.in0_idx].val diff --git a/fu/single/LoopControlRTL.py b/fu/single/LoopControlRTL.py index 5ac13f70..fbcf362e 100644 --- a/fu/single/LoopControlRTL.py +++ b/fu/single/LoopControlRTL.py @@ -15,7 +15,7 @@ from pymtl3 import * from ..basic.Fu import Fu from ...lib.opt_type import OPT_LOOP_CONTROL, OPT_SYMBOL_DICT - +from ...lib.util.data_struct_attr import * class LoopControlRTL(Fu): def construct(s, CtrlPktType, num_inports, num_outports, vector_factor_power = 0): @@ -34,8 +34,8 @@ def construct(s, CtrlPktType, num_inports, num_outports, vector_factor_power = 0 super(LoopControlRTL, s).construct(CtrlPktType, num_inports, num_outports, 1, vector_factor_power) - PayloadType = s.DataType.get_field_type('payload') - PredicateType = s.DataType.get_field_type('predicate') + PayloadType = s.DataType.get_field_type(kAttrPayload) + PredicateType = s.DataType.get_field_type(kAttrPredicate) FuInType = mk_bits(clog2(num_inports + 1)) # Internal state for 
loop control diff --git a/lib/util/data_struct_attr.py b/lib/util/data_struct_attr.py index 989378d1..98ed2b74 100644 --- a/lib/util/data_struct_attr.py +++ b/lib/util/data_struct_attr.py @@ -39,3 +39,14 @@ kAttrDstCgraX = 'dst_cgra_x' kAttrDstCgraY = 'dst_cgra_y' kAttrAddr = 'addr' + +# DMA attributes +kAttrOpcode = 'opcode' +kAttrDramAddr = 'dram_addr' +kAttrNBytes = 'nbytes' +kAttrTag = 'tag' +kAttrSpmAddr = 'spm_addr' +kAttrSpmData = 'spm_data' +kAttrSpmMask = 'spm_mask' +kAttrDramData = 'dram_data' +kAttrDramMask = 'dram_mask' \ No newline at end of file diff --git a/mem/data/test/DataMemControllerRTL_dma_test.py b/mem/data/test/DataMemControllerRTL_dma_test.py index fa85d685..5a2d43de 100644 --- a/mem/data/test/DataMemControllerRTL_dma_test.py +++ b/mem/data/test/DataMemControllerRTL_dma_test.py @@ -43,7 +43,7 @@ def drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr dut.send_to_noc_load_response_pkt.rdy @= 1 dut.send_to_noc_store_pkt.rdy @= 1 - DmaSpmAddrType = mk_dma_cmd().get_field_type('spm_addr') + DmaSpmAddrType = mk_dma_cmd().get_field_type(kAttrSpmAddr) dut.dma_spm.write.val @= 0 dut.dma_spm.write.msg.addr @= DmaSpmAddrType(0) dut.dma_spm.write.msg.data @= 0 @@ -90,7 +90,7 @@ def test_dma_ports_write_then_read(): dut.sim_reset() drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles) - DmaSpmAddrType = mk_dma_cmd().get_field_type('spm_addr') + DmaSpmAddrType = mk_dma_cmd().get_field_type(kAttrSpmAddr) dut.dma_spm.write.val @= 1 dut.dma_spm.write.msg.addr @= DmaSpmAddrType(3) dut.dma_spm.write.msg.data @= 0xaaaabbbb From bca3100b09f7cf76db9d1ea0eba3ca3535ed3f2f Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Wed, 17 Jun 2026 10:17:03 +0800 Subject: [PATCH 25/46] [Rename][NFC] Rename some variables for clarity --- cgra/CgraDmaRTL.py | 8 +-- cgra/test/CgraDmaRTL_test.py | 20 ++++---- lib/util/common.py | 18 +++---- mem/dma/DmaEngineRTL.py | 84 +++++++++++++++---------------- 
mem/dma/test/DmaEngineRTL_test.py | 20 +++++--- 5 files changed, 77 insertions(+), 73 deletions(-) diff --git a/cgra/CgraDmaRTL.py b/cgra/CgraDmaRTL.py index 89cfb49e..846ba441 100644 --- a/cgra/CgraDmaRTL.py +++ b/cgra/CgraDmaRTL.py @@ -109,8 +109,8 @@ def construct(s, CgraPayloadType, # Abstract external dram memory interfaces for the internal DMA engine. - s.dram_rd_req = SendIfcRTL(DmaDramAddrType) - s.dram_rd_resp = RecvIfcRTL(DmaMemDataType) + s.send_dram_rd_req = SendIfcRTL(DmaDramAddrType) + s.recv_dram_rd_resp = RecvIfcRTL(DmaMemDataType) s.dram_wr_req = DmaDramWrReqIfcRTL(DmaDramAddrType, DmaMemDataType, DmaMemMaskType) @@ -180,8 +180,8 @@ def construct(s, CgraPayloadType, s.cgra.dma_cmd //= s.dma.dma_cmd s.dma.dma_done //= s.cgra.dma_done - s.dram_rd_req //= s.dma.dram_rd_req - s.dram_rd_resp //= s.dma.dram_rd_resp + s.send_dram_rd_req //= s.dma.send_dram_rd_req + s.recv_dram_rd_resp //= s.dma.recv_dram_rd_resp s.dram_wr_req //= s.dma.dram_wr_req diff --git a/cgra/test/CgraDmaRTL_test.py b/cgra/test/CgraDmaRTL_test.py index 32f9f122..869256d9 100644 --- a/cgra/test/CgraDmaRTL_test.py +++ b/cgra/test/CgraDmaRTL_test.py @@ -152,9 +152,9 @@ def test_cgra_dma_mvin_to_local_spm(): dut.recv_from_cpu_pkt.val @= 0 dut.recv_from_cpu_pkt.msg @= CtrlPktType() dut.send_to_cpu_pkt.rdy @= 1 - dut.dram_rd_req.rdy @= 1 - dut.dram_rd_resp.val @= 0 - dut.dram_rd_resp.msg @= 0 + dut.send_dram_rd_req.rdy @= 1 + dut.recv_dram_rd_resp.val @= 0 + dut.recv_dram_rd_resp.msg @= 0 dut.dram_wr_req.rdy @= 1 dut.dram_wr_resp_val @= 0 @@ -167,15 +167,15 @@ def test_cgra_dma_mvin_to_local_spm(): pending_resp = False for _ in range(40): - dut.dram_rd_resp.val @= 0 + dut.recv_dram_rd_resp.val @= 0 if pending_resp: - dut.dram_rd_resp.val @= 1 + dut.recv_dram_rd_resp.val @= 1 # Simulate the read response from DRAM. 
- dut.dram_rd_resp.msg @= beat + dut.recv_dram_rd_resp.msg @= beat dut.sim_eval_combinational() - pending_resp = bool(dut.dram_rd_req.val & dut.dram_rd_req.rdy) + pending_resp = bool(dut.send_dram_rd_req.val & dut.send_dram_rd_req.rdy) if observed_dma_done(dut, 0x33): break @@ -249,9 +249,9 @@ def test_cgra_dma_mvout_from_local_spm(): dut.recv_from_cpu_pkt.val @= 0 dut.recv_from_cpu_pkt.msg @= CtrlPktType() dut.send_to_cpu_pkt.rdy @= 1 - dut.dram_rd_req.rdy @= 1 - dut.dram_rd_resp.val @= 0 - dut.dram_rd_resp.msg @= 0 + dut.send_dram_rd_req.rdy @= 1 + dut.recv_dram_rd_resp.val @= 0 + dut.recv_dram_rd_resp.msg @= 0 dut.dram_wr_req.rdy @= 1 dut.dram_wr_resp_val @= 0 diff --git a/lib/util/common.py b/lib/util/common.py index eedb056e..5b65174e 100644 --- a/lib/util/common.py +++ b/lib/util/common.py @@ -81,12 +81,12 @@ # State machine definitions of DMA engine. from pymtl3 import mk_bits StateType = mk_bits( 4 ) -STATE_IDLE = StateType( 0 ) # Waiting for a new DMA command -STATE_MVIN_REQ = StateType( 1 ) # MVIN: Issuing DRAM read request -STATE_MVIN_RESP = StateType( 2 ) # MVIN: Waiting for DRAM read response -STATE_MVIN_WRITE = StateType( 3 ) # MVIN: Writing unpacked words to SPM -STATE_MVOUT_READ = StateType( 4 ) # MVOUT: Issuing SPM read request -STATE_MVOUT_RESP = StateType( 5 ) # MVOUT: Receiving SPM read response and packing -STATE_MVOUT_WRITE = StateType( 6 ) # MVOUT: Issuing DRAM write request -STATE_MVOUT_WAIT = StateType( 7 ) # MVOUT: Waiting for DRAM write response -STATE_DONE = StateType( 8 ) # Signaling command completion +STATE_DMA_IDLE = StateType( 0 ) # Waiting for a new DMA command +STATE_DMA_MVIN_REQ = StateType( 1 ) # MVIN: Issuing DRAM read request +STATE_DMA_MVIN_RESP = StateType( 2 ) # MVIN: Waiting for DRAM read response +STATE_DMA_MVIN_WRITE = StateType( 3 ) # MVIN: Writing unpacked words to SPM +STATE_DMA_MVOUT_READ = StateType( 4 ) # MVOUT: Issuing SPM read request +STATE_DMA_MVOUT_RESP = StateType( 5 ) # MVOUT: Receiving SPM read response and 
packing +STATE_DMA_MVOUT_WRITE = StateType( 6 ) # MVOUT: Issuing DRAM write request +STATE_DMA_MVOUT_WAIT = StateType( 7 ) # MVOUT: Waiting for DRAM write response +STATE_DMA_DONE = StateType( 8 ) # Signaling command completion diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index c4ffad5b..72b3afd0 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -13,7 +13,7 @@ from ...lib.basic.val_rdy.ifcs import DmaSpmMasterIfcRTL from ...lib.basic.val_rdy.ifcs import DmaDramWrReqIfcRTL from ...lib.messages import * -from ...lib.util.common import DMA_MVIN, DMA_MVOUT, CHAR_BIT, StateType, STATE_IDLE, STATE_MVIN_REQ, STATE_MVIN_RESP, STATE_MVIN_WRITE, STATE_MVOUT_READ, STATE_MVOUT_RESP, STATE_MVOUT_WRITE, STATE_MVOUT_WAIT, STATE_DONE +from ...lib.util.common import DMA_MVIN, DMA_MVOUT, CHAR_BIT, StateType, STATE_DMA_IDLE, STATE_DMA_MVIN_REQ, STATE_DMA_MVIN_RESP, STATE_DMA_MVIN_WRITE, STATE_DMA_MVOUT_READ, STATE_DMA_MVOUT_RESP, STATE_DMA_MVOUT_WRITE, STATE_DMA_MVOUT_WAIT, STATE_DMA_DONE class DmaEngineRTL( Component ): @@ -74,9 +74,9 @@ def construct( s, # Abstract external memory interface # Request to read from DRAM - s.dram_rd_req = SendIfcRTL( DramAddrType ) + s.send_dram_rd_req = SendIfcRTL( DramAddrType ) # Response from DRAM - s.dram_rd_resp = RecvIfcRTL( MemDataType ) + s.recv_dram_rd_resp = RecvIfcRTL( MemDataType ) # Request to write to DRAM s.dram_wr_req = DmaDramWrReqIfcRTL(DramAddrType, MemDataType, MemMaskType) @@ -127,24 +127,24 @@ def construct( s, @update def comb_outputs(): - s.dma_cmd.rdy @= s.state == STATE_IDLE - s.dma_done.val @= s.state == STATE_DONE + s.dma_cmd.rdy @= s.state == STATE_DMA_IDLE + s.dma_done.val @= s.state == STATE_DMA_DONE s.dma_done.msg @= DmaDoneType(s.tag_reg) - s.dram_rd_req.val @= s.state == STATE_MVIN_REQ - s.dram_rd_req.msg @= s.dram_addr_reg - s.dram_rd_resp.rdy @= s.state == STATE_MVIN_RESP + s.send_dram_rd_req.val @= s.state == STATE_DMA_MVIN_REQ + s.send_dram_rd_req.msg @= s.dram_addr_reg + 
s.recv_dram_rd_resp.rdy @= s.state == STATE_DMA_MVIN_RESP - # s.dram_wr_req_val @= s.state == STATE_MVOUT_WRITE + # s.dram_wr_req_val @= s.state == STATE_DMA_MVOUT_WRITE # s.dram_wr_req_addr @= s.dram_addr_reg # s.dram_wr_req_data @= s.beat_reg # s.dram_wr_req_mask @= s.wr_mask_reg - s.dram_wr_req.val @= s.state == STATE_MVOUT_WRITE + s.dram_wr_req.val @= s.state == STATE_DMA_MVOUT_WRITE s.dram_wr_req.addr @= s.dram_addr_reg s.dram_wr_req.data @= s.beat_reg s.dram_wr_req.mask @= s.wr_mask_reg - s.dram_wr_resp_rdy @= s.state == STATE_MVOUT_WAIT + s.dram_wr_resp_rdy @= s.state == STATE_DMA_MVOUT_WAIT spm_wdata = SpmDataType(0) @@ -157,20 +157,20 @@ def comb_outputs(): else: # 4th word spm_wdata = s.beat_reg[spm_data_nbits*3:spm_data_nbits*4] - s.spm.write.val @= s.state == STATE_MVIN_WRITE + s.spm.write.val @= s.state == STATE_DMA_MVIN_WRITE s.spm.write.msg @= DmaSpmWriteReqType( s.spm_addr_reg, spm_wdata, SpmMaskType( (1 << (spm_data_nbits // CHAR_BIT)) - 1 ) ) - s.spm.read.val @= s.state == STATE_MVOUT_READ + s.spm.read.val @= s.state == STATE_DMA_MVOUT_READ s.spm.read.msg @= DmaSpmReadReqType(s.spm_addr_reg) - s.spm.read_resp.rdy @= s.state == STATE_MVOUT_RESP + s.spm.read_resp.rdy @= s.state == STATE_DMA_MVOUT_RESP @update_ff def seq_state(): if s.reset: - s.state_ff <<= STATE_IDLE + s.state_ff <<= STATE_DMA_IDLE s.opcode_ff <<= OpcodeType( 0 ) s.dram_addr_ff <<= DramAddrType( 0 ) s.spm_addr_ff <<= SpmAddrType( 0 ) @@ -180,7 +180,7 @@ def seq_state(): s.word_idx_ff <<= b2( 0 ) s.wr_mask_ff <<= MemMaskType( 0 ) else: - if s.state == STATE_IDLE: + if s.state == STATE_DMA_IDLE: if s.dma_cmd.val & s.dma_cmd.rdy: # Receives a new DMA command. s.opcode_ff <<= s.dma_cmd.msg.opcode s.dram_addr_ff <<= s.dma_cmd.msg.dram_addr @@ -192,25 +192,25 @@ def seq_state(): s.wr_mask_ff <<= MemMaskType( 0 ) if s.dma_cmd.msg.nbytes == BytesType( 0 ): # No more bytes to transfer. - s.state_ff <<= STATE_DONE + s.state_ff <<= STATE_DMA_DONE # Still has bytes to transfer. 
elif s.dma_cmd.msg.opcode == OpcodeType( DMA_MVIN ): - s.state_ff <<= STATE_MVIN_REQ # Move to the next state: to issue a read request to DRAM. + s.state_ff <<= STATE_DMA_MVIN_REQ # Move to the next state: to issue a read request to DRAM. else: # DMA_MVOUT - s.state_ff <<= STATE_MVOUT_READ # Move to the next state: to issue a read request to SPM. + s.state_ff <<= STATE_DMA_MVOUT_READ # Move to the next state: to issue a read request to SPM. - elif s.state == STATE_MVIN_REQ: # Issues a read request to DRAM. - if s.dram_rd_req.val & s.dram_rd_req.rdy: + elif s.state == STATE_DMA_MVIN_REQ: # Issues a read request to DRAM. + if s.send_dram_rd_req.val & s.send_dram_rd_req.rdy: s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( dram_data_nbits // CHAR_BIT ) - s.state_ff <<= STATE_MVIN_RESP + s.state_ff <<= STATE_DMA_MVIN_RESP - elif s.state == STATE_MVIN_RESP: # Receives a response from DRAM. - if s.dram_rd_resp.val & s.dram_rd_resp.rdy: - s.beat_ff <<= s.dram_rd_resp.msg + elif s.state == STATE_DMA_MVIN_RESP: # Receives a response from DRAM. + if s.recv_dram_rd_resp.val & s.recv_dram_rd_resp.rdy: + s.beat_ff <<= s.recv_dram_rd_resp.msg s.word_idx_ff <<= b2( 0 ) - s.state_ff <<= STATE_MVIN_WRITE # Move to the next state: to write to SPM. + s.state_ff <<= STATE_DMA_MVIN_WRITE # Move to the next state: to write to SPM. - elif s.state == STATE_MVIN_WRITE: # Writes to SPM. + elif s.state == STATE_DMA_MVIN_WRITE: # Writes to SPM. 
if s.spm.write.val & s.spm.write.rdy: # Update the SPM address where write next cycle(+1) s.spm_addr_ff <<= s.spm_addr_reg + SpmAddrType( 1 ) @@ -218,18 +218,18 @@ def seq_state(): s.words_left_ff <<= s.words_left_reg - BytesType( 1 ) if s.words_left_reg == BytesType( 1 ): - s.state_ff <<= STATE_DONE + s.state_ff <<= STATE_DMA_DONE elif s.word_idx_reg == b2( 3 ): s.word_idx_ff <<= b2( 0 ) - s.state_ff <<= STATE_MVIN_REQ + s.state_ff <<= STATE_DMA_MVIN_REQ else: s.word_idx_ff <<= s.word_idx_reg + b2( 1 ) - elif s.state == STATE_MVOUT_READ: + elif s.state == STATE_DMA_MVOUT_READ: if s.spm.read.val & s.spm.read.rdy: - s.state_ff <<= STATE_MVOUT_RESP # Move to the next state: to receive a response from SPM. + s.state_ff <<= STATE_DMA_MVOUT_RESP # Move to the next state: to receive a response from SPM. - elif s.state == STATE_MVOUT_RESP: + elif s.state == STATE_DMA_MVOUT_RESP: if s.spm.read_resp.val & s.spm.read_resp.rdy: # Pack the response from SPM into a 128-bit beat by left-shifting. if s.word_idx_reg == b2( 0 ): # 1st word @@ -259,19 +259,19 @@ def seq_state(): s.wr_mask_ff <<= MemMaskType( 0x0fff ) else: s.wr_mask_ff <<= MemMaskType( 0xffff ) - s.state_ff <<= STATE_MVOUT_WRITE + s.state_ff <<= STATE_DMA_MVOUT_WRITE elif s.word_idx_reg == b2( 3 ): s.wr_mask_ff <<= MemMaskType( 0xffff ) - s.state_ff <<= STATE_MVOUT_WRITE + s.state_ff <<= STATE_DMA_MVOUT_WRITE else: s.word_idx_ff <<= s.word_idx_reg + b2( 1 ) - s.state_ff <<= STATE_MVOUT_READ + s.state_ff <<= STATE_DMA_MVOUT_READ - elif s.state == STATE_MVOUT_WRITE: + elif s.state == STATE_DMA_MVOUT_WRITE: if s.dram_wr_req.val & s.dram_wr_req.rdy: - s.state_ff <<= STATE_MVOUT_WAIT + s.state_ff <<= STATE_DMA_MVOUT_WAIT - elif s.state == STATE_MVOUT_WAIT: + elif s.state == STATE_DMA_MVOUT_WAIT: if s.dram_wr_resp_val & s.dram_wr_resp_rdy: # Turn to the +16 address after writing 16 bytes data. 
s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( dram_data_nbits // CHAR_BIT ) @@ -280,13 +280,13 @@ def seq_state(): s.wr_mask_ff <<= MemMaskType( 0 ) if s.words_left_reg == BytesType( 0 ): - s.state_ff <<= STATE_DONE + s.state_ff <<= STATE_DMA_DONE else: - s.state_ff <<= STATE_MVOUT_READ + s.state_ff <<= STATE_DMA_MVOUT_READ - elif s.state == STATE_DONE: + elif s.state == STATE_DMA_DONE: if s.dma_done.val & s.dma_done.rdy: - s.state_ff <<= STATE_IDLE + s.state_ff <<= STATE_DMA_IDLE def line_trace( s ): return f"dma(state={int(s.state)},tag={int(s.tag_reg)},left={int(s.words_left_reg)})" diff --git a/mem/dma/test/DmaEngineRTL_test.py b/mem/dma/test/DmaEngineRTL_test.py index 285b6def..04dca944 100644 --- a/mem/dma/test/DmaEngineRTL_test.py +++ b/mem/dma/test/DmaEngineRTL_test.py @@ -22,9 +22,9 @@ def make_dut(): dut.dma_cmd.msg.tag @= 0 dut.dma_done.rdy @= 1 - dut.dram_rd_req.rdy @= 1 - dut.dram_rd_resp.val @= 0 - dut.dram_rd_resp.msg @= 0 + dut.send_dram_rd_req.rdy @= 1 + dut.recv_dram_rd_resp.val @= 0 + dut.recv_dram_rd_resp.msg @= 0 dut.dram_wr_req.rdy @= 1 dut.dram_wr_resp_val @= 1 @@ -84,15 +84,15 @@ def test_dma_mvin_one_beat(): spm_writes = [] for _ in range(20): - dut.dram_rd_resp.val @= 0 + dut.recv_dram_rd_resp.val @= 0 if pending_resp is not None: - dut.dram_rd_resp.val @= 1 - dut.dram_rd_resp.msg @= pending_resp + dut.recv_dram_rd_resp.val @= 1 + dut.recv_dram_rd_resp.msg @= pending_resp dut.sim_eval_combinational() - if dut.dram_rd_req.val & dut.dram_rd_req.rdy: - pending_resp = dram[int(dut.dram_rd_req.msg)] + if dut.send_dram_rd_req.val & dut.send_dram_rd_req.rdy: + pending_resp = dram[int(dut.send_dram_rd_req.msg)] else: pending_resp = None @@ -223,6 +223,10 @@ def test_dma_mvout_full_beat(): dut.sim_tick() + for elem in mem_writes: + print(f'{elem[0]}: 0x{elem[1]:08x}') + print(f'mask: 0x{elem[2]:08x}') + assert mem_writes == [ (0x2000, int(concat(Bits32(0x77778888), Bits32(0x55556666), From 075f63f9ba9ddf27b2823975d83238d47916626c Mon Sep 17 
00:00:00 2001 From: BenkangPeng Date: Mon, 22 Jun 2026 15:58:54 +0800 Subject: [PATCH 26/46] Add the assertion to ensure the number of tranfer data is the multiple of 4 --- cgra/test/CgraDmaRTL_test.py | 2 ++ controller/ControllerRTL.py | 3 +++ mem/dma/DmaEngineRTL.py | 5 ++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cgra/test/CgraDmaRTL_test.py b/cgra/test/CgraDmaRTL_test.py index 869256d9..990a0abe 100644 --- a/cgra/test/CgraDmaRTL_test.py +++ b/cgra/test/CgraDmaRTL_test.py @@ -55,6 +55,8 @@ def issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType, nbytes: The number of bytes to transfer. tag: The tag of the DMA command. """ + assert nbytes % 4 == 0, \ + f"DMA nbytes must be a multiple of 4, got {nbytes}" config_pkts = [ # The bindwidth of dram address is 64 bits, so we need to split it into two 32 bits parts. # Lower 32 bits are sent first. diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index 94c79184..3392bf36 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -332,6 +332,9 @@ def update_received_msg(): elif has_dma_ports & ( (cpu_cmd == CMD_DMA_MVIN) | (cpu_cmd == CMD_DMA_MVOUT)): + if s.recv_from_cpu_pkt_queue.send.val: + assert int(s.dma_bytes) % 4 == 0, \ + f"DMA transfer size must be a multiple of 4 bytes, got {int(s.dma_bytes)}" s.dma_cmd.val @= s.recv_from_cpu_pkt_queue.send.val if cpu_cmd == CMD_DMA_MVIN: s.dma_cmd.msg @= DmaCmdType( diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index 72b3afd0..2e04daf5 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -182,10 +182,13 @@ def seq_state(): else: if s.state == STATE_DMA_IDLE: if s.dma_cmd.val & s.dma_cmd.rdy: # Receives a new DMA command. 
+ assert int(s.dma_cmd.msg.nbytes) % 4 == 0, \ + f"DMA nbytes must be a multiple of 4, got {int(s.dma_cmd.msg.nbytes)}" s.opcode_ff <<= s.dma_cmd.msg.opcode s.dram_addr_ff <<= s.dma_cmd.msg.dram_addr s.spm_addr_ff <<= s.dma_cmd.msg.spm_addr - s.words_left_ff <<= s.dma_cmd.msg.nbytes >> 2 # Converts the transfer size from bytes to words. + # Converts the transfer size from bytes to words. + s.words_left_ff <<= (s.dma_cmd.msg.nbytes >> 2) s.tag_ff <<= s.dma_cmd.msg.tag s.beat_ff <<= MemDataType( 0 ) s.word_idx_ff <<= b2( 0 ) From 628e2d3a087203e1221dc036e2c95a069862ad6c Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Mon, 22 Jun 2026 17:29:06 +0800 Subject: [PATCH 27/46] Add assertions to ensure that the number of bytes transferred by DMA is an integer multiple of 4 --- cgra/test/CgraDmaRTL_test.py | 2 ++ controller/ControllerRTL.py | 3 --- lib/messages.py | 2 ++ mem/dma/DmaEngineRTL.py | 4 +++- mem/dma/test/DmaEngineRTL_test.py | 4 ++++ 5 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cgra/test/CgraDmaRTL_test.py b/cgra/test/CgraDmaRTL_test.py index 990a0abe..810c11c2 100644 --- a/cgra/test/CgraDmaRTL_test.py +++ b/cgra/test/CgraDmaRTL_test.py @@ -55,6 +55,8 @@ def issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType, nbytes: The number of bytes to transfer. tag: The tag of the DMA command. """ + # NOTE nbytes is the number of bytes to transfer. + # Currently, only nbytes that are multiples of 4 are supported. 
assert nbytes % 4 == 0, \ f"DMA nbytes must be a multiple of 4, got {nbytes}" config_pkts = [ diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index 3392bf36..94c79184 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -332,9 +332,6 @@ def update_received_msg(): elif has_dma_ports & ( (cpu_cmd == CMD_DMA_MVIN) | (cpu_cmd == CMD_DMA_MVOUT)): - if s.recv_from_cpu_pkt_queue.send.val: - assert int(s.dma_bytes) % 4 == 0, \ - f"DMA transfer size must be a multiple of 4 bytes, got {int(s.dma_bytes)}" s.dma_cmd.val @= s.recv_from_cpu_pkt_queue.send.val if cpu_cmd == CMD_DMA_MVIN: s.dma_cmd.msg @= DmaCmdType( diff --git a/lib/messages.py b/lib/messages.py index 509638d7..9e433cf0 100644 --- a/lib/messages.py +++ b/lib/messages.py @@ -224,6 +224,8 @@ def str_func(s): 'opcode' : OpcodeType, 'dram_addr': DramAddrType, 'spm_addr' : SpmAddrType, + # NOTE nbytes is the number of bytes to transfer. + # Currently, only nbytes that are multiples of 4 are supported. 'nbytes' : BytesType, 'tag' : TagType, }, diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index 2e04daf5..713fecc3 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -188,7 +188,9 @@ def seq_state(): s.dram_addr_ff <<= s.dma_cmd.msg.dram_addr s.spm_addr_ff <<= s.dma_cmd.msg.spm_addr # Converts the transfer size from bytes to words. - s.words_left_ff <<= (s.dma_cmd.msg.nbytes >> 2) + # NOTE We only support nbytes that are multiples of 4 now. + # If nbytes is not a multiple of 4, we will add 1 to the number of words to transfer. 
+ s.words_left_ff <<= (s.dma_cmd.msg.nbytes >> 2) if (s.dma_cmd.msg.nbytes % 4 == 0) else (s.dma_cmd.msg.nbytes >> 2) + 1 s.tag_ff <<= s.dma_cmd.msg.tag s.beat_ff <<= MemDataType( 0 ) s.word_idx_ff <<= b2( 0 ) diff --git a/mem/dma/test/DmaEngineRTL_test.py b/mem/dma/test/DmaEngineRTL_test.py index 04dca944..062143a7 100644 --- a/mem/dma/test/DmaEngineRTL_test.py +++ b/mem/dma/test/DmaEngineRTL_test.py @@ -47,6 +47,10 @@ def issue_cmd(dut, opcode, dram_addr, spm_addr, nbytes, tag): nbytes: The number of bytes to transfer. tag: The tag of the DMA command. """ + # NOTE nbytes is the number of bytes to transfer. + # Currently, only nbytes that are multiples of 4 are supported. + assert nbytes % 4 == 0, \ + f"DMA nbytes must be a multiple of 4, got {nbytes}" dut.dma_cmd.val @= 1 dut.dma_cmd.msg.opcode @= opcode dut.dma_cmd.msg.dram_addr @= dram_addr From 90023f2ac4d9be8a6a2549b73cf69b3117f2b674 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 23 Jun 2026 10:38:35 +0800 Subject: [PATCH 28/46] [Refactor] Remove DmaWireIfcRTL and DmaSpmWireIfcRTL. Use ValRdyRecv/SendRTL to replace them. --- cgra/CgraTemplateRTL.py | 18 ++++++-- controller/ControllerRTL.py | 70 ++++++++++++-------------------- lib/basic/val_rdy/ifcs.py | 30 -------------- mem/data/DataMemControllerRTL.py | 46 +++------------------ 4 files changed, 48 insertions(+), 116 deletions(-) diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index 8c6fc719..a1bf4f86 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -228,9 +228,21 @@ def construct(s, CgraPayloadType, # DMA engine <-> controller side of the SPM path. s.dma_spm //= s.controller.dma_spm_from_dma - # Controller <-> data memory side of the SPM path. - s.controller.dma_spm_to_mem //= s.data_mem.dma_spm - + else: + # Grounds the DMA ports when no DMA engine is attached. 
+ s.controller.dma_cmd.rdy //= 0 + s.controller.dma_done.val //= 0 + s.controller.dma_done.msg //= DmaDoneType() + + s.controller.dma_spm_from_dma.write.val //= 0 + s.controller.dma_spm_from_dma.write.msg //= DmaSpmWriteReqType() + s.controller.dma_spm_from_dma.read.val //= 0 + s.controller.dma_spm_from_dma.read.msg //= DmaSpmReadReqType() + s.controller.dma_spm_from_dma.read_resp.rdy //= 0 + + # Controller <-> data memory side of the SPM path. + s.controller.dma_spm_to_mem //= s.data_mem.dma_spm + # Connects data memory with controller. s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index 94c79184..071d53c9 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -12,7 +12,6 @@ from ..lib.basic.val_rdy.ifcs import RecvIfcRTL from ..lib.basic.val_rdy.ifcs import SendIfcRTL from ..lib.basic.val_rdy.ifcs import DmaSpmMasterIfcRTL, DmaSpmMinionIfcRTL -from ..lib.basic.val_rdy.ifcs import DmaWireIfcRTL, DmaSpmWireIfcRTL from ..lib.basic.val_rdy.queues import NormalQueueRTL from ..lib.messages import * from ..lib.opt_type import * @@ -97,45 +96,21 @@ def construct(s, s.send_to_tile_load_response = SendIfcRTL(InterCgraPktType) s.send_to_mem_store_request = SendIfcRTL(InterCgraPktType) - if has_dma_ports: - # Controller-owned command path from CPU packets to the DMA engine. - s.dma_cmd = SendIfcRTL(DmaCmdType) - - s.dma_done = RecvIfcRTL(DmaDoneType) - - # DMA engine side of the controller-forwarded SPM access path. - s.dma_spm_from_dma = DmaSpmMinionIfcRTL(DmaSpmWriteReqType, - DmaSpmReadReqType, - DmaSpmReadRespType) + # Controller-owned command path from CPU packets to the DMA engine. + # Send the decoded DMA command to the DMA engine. + s.dma_cmd = SendIfcRTL(DmaCmdType) + # Receive the DMA done signal from the DMA engine. 
+ s.dma_done = RecvIfcRTL(DmaDoneType) - # Data memory side of the same SPM access path. - s.dma_spm_to_mem = DmaSpmMasterIfcRTL(DmaSpmWriteReqType, + # DMA engine side of the controller-forwarded SPM access path. + s.dma_spm_from_dma = DmaSpmMinionIfcRTL(DmaSpmWriteReqType, DmaSpmReadReqType, DmaSpmReadRespType) - else: - s.dma_cmd = DmaWireIfcRTL(DmaCmdType) - s.dma_cmd.rdy //= 0 - s.dma_done = DmaWireIfcRTL(DmaDoneType) - s.dma_done.val //= 0 - s.dma_done.msg //= DmaDoneType() - - s.dma_spm_from_dma = DmaSpmWireIfcRTL(DmaSpmWriteReqType, - DmaSpmReadReqType, - DmaSpmReadRespType) - s.dma_spm_from_dma.write.val //= 0 - s.dma_spm_from_dma.write.msg //= DmaSpmWriteReqType() - s.dma_spm_from_dma.read.val //= 0 - s.dma_spm_from_dma.read.msg //= DmaSpmReadReqType() - s.dma_spm_from_dma.read_resp.rdy //= 0 - - s.dma_spm_to_mem = DmaSpmWireIfcRTL(DmaSpmWriteReqType, + # Data memory side of the same SPM access path. + s.dma_spm_to_mem = DmaSpmMasterIfcRTL(DmaSpmWriteReqType, DmaSpmReadReqType, DmaSpmReadRespType) - s.dma_spm_to_mem.write.rdy //= 0 - s.dma_spm_to_mem.read.rdy //= 0 - s.dma_spm_to_mem.read_resp.val //= 0 - s.dma_spm_to_mem.read_resp.msg //= DmaSpmReadRespType() # Component @@ -244,6 +219,16 @@ def update_dma_spm_forwarding(): s.dma_spm_from_dma.read_resp.val @= s.dma_spm_to_mem.read_resp.val s.dma_spm_to_mem.read_resp.rdy @= s.dma_spm_from_dma.read_resp.rdy s.dma_spm_from_dma.read_resp.msg @= s.dma_spm_to_mem.read_resp.msg + else: + s.dma_spm_to_mem.write.val @= 0 + s.dma_spm_to_mem.write.msg @= DmaSpmWriteReqType() + s.dma_spm_to_mem.read.val @= 0 + s.dma_spm_to_mem.read.msg @= DmaSpmReadReqType() + s.dma_spm_to_mem.read_resp.rdy @= 0 + s.dma_spm_from_dma.write.rdy @= 0 + s.dma_spm_from_dma.read.rdy @= 0 + s.dma_spm_from_dma.read_resp.val @= 0 + s.dma_spm_from_dma.read_resp.msg @= DmaSpmReadRespType() @update def update_received_msg(): @@ -258,15 +243,14 @@ def update_received_msg(): s.send_to_cpu_pkt_queue.recv.msg @= IntraCgraPktType(0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0) s.recv_from_ctrl_ring_pkt.rdy @= 0 - if has_dma_ports: - s.dma_cmd.val @= 0 - s.dma_cmd.msg @= DmaCmdType( - DmaOpcodeType(DMA_MVIN), - concat(s.dma_dram_addr_hi, s.dma_dram_addr_lo), - s.dma_spm_addr, - s.dma_bytes, - s.dma_tag) - s.dma_done.rdy @= 0 + s.dma_cmd.val @= 0 + s.dma_cmd.msg @= DmaCmdType( + DmaOpcodeType(DMA_MVIN), + concat(s.dma_dram_addr_hi, s.dma_dram_addr_lo), + s.dma_spm_addr, + s.dma_bytes, + s.dma_tag) + s.dma_done.rdy @= 0 for i in range(CONTROLLER_CROSSBAR_INPORTS): s.crossbar.recv[i].val @= 0 diff --git a/lib/basic/val_rdy/ifcs.py b/lib/basic/val_rdy/ifcs.py index 44d1a508..9a30f8d7 100644 --- a/lib/basic/val_rdy/ifcs.py +++ b/lib/basic/val_rdy/ifcs.py @@ -86,20 +86,6 @@ def construct( s, ReqType, RespType ): def __str__( s ): return f"{s.req}|{s.resp}" -class DmaWireIfcRTL( Interface ): - """Undirected val/rdy stub interface for disabled DMA paths. - - Uses OutPort for all signals so parent components can tie off unused - DMA stubs with structural connections during Verilog translation. - """ - def construct( s, Type ): - s.msg = OutPort( Type ) - s.val = OutPort() - s.rdy = OutPort() - s.trace_len = len(str(Type())) - def __str__( s ): - return valrdy_to_str( s.msg, s.val, s.rdy, s.trace_len ) - class DmaSpmMasterIfcRTL( Interface ): """ DMA-to-SPM Master Interface. @@ -147,22 +133,6 @@ def construct( s, WriteReqType, ReadReqType, ReadRespType ): def __str__( s ): return f"wr:{s.write}|rd:{s.read}|resp:{s.read_resp}" -class DmaSpmWireIfcRTL( Interface ): - """ - Wire interface/connection for DMA-to-SPM, no direction. 
- - """ - def construct( s, WriteReqType, ReadReqType, ReadRespType ): - s.WriteReqType = WriteReqType - s.ReadReqType = ReadReqType - s.ReadRespType = ReadRespType - s.write = DmaWireIfcRTL( WriteReqType ) - s.read = DmaWireIfcRTL( ReadReqType ) - s.read_resp = DmaWireIfcRTL( ReadRespType ) - def __str__( s ): - return f"wr:{s.write}|rd:{s.read}|resp:{s.read_resp}" - - class DmaDramWrReqIfcRTL( Interface ): """ DMA-to-DRAM Write Request Interface. diff --git a/mem/data/DataMemControllerRTL.py b/mem/data/DataMemControllerRTL.py index f2523700..c8751ac8 100644 --- a/mem/data/DataMemControllerRTL.py +++ b/mem/data/DataMemControllerRTL.py @@ -29,7 +29,7 @@ from .DataMemWrapperRTL import DataMemWrapperRTL from ...lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL from ...lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL -from ...lib.basic.val_rdy.ifcs import DmaSpmMinionIfcRTL, DmaSpmWireIfcRTL +from ...lib.basic.val_rdy.ifcs import DmaSpmMinionIfcRTL from ...lib.messages import * from ...noc.PyOCN.pymtl3_net.xbar.XbarBypassQueueRTL import XbarBypassQueueRTL from ...lib.util.data_struct_attr import * @@ -157,42 +157,9 @@ def construct(s, s.send_to_noc_load_request_pkt = SendIfcRTL(NocPktType) s.send_to_noc_store_pkt = SendIfcRTL(NocPktType) - if has_dma_ports: - s.dma_spm = DmaSpmMinionIfcRTL(DmaSpmWriteReqType, - DmaSpmReadReqType, - DmaSpmReadRespType) - else: - s.dma_spm = DmaSpmWireIfcRTL(DmaSpmWriteReqType, + s.dma_spm = DmaSpmMinionIfcRTL(DmaSpmWriteReqType, DmaSpmReadReqType, DmaSpmReadRespType) - s.dma_spm.write.val //= 0 - s.dma_spm.write.msg //= DmaSpmWriteReqType() - s.dma_spm.read.val //= 0 - s.dma_spm.read.msg //= DmaSpmReadReqType() - s.dma_spm.read_resp.rdy //= 0 - - # Keep these as internal wires so PyMTL's static update-block analysis - # can see declared objects even when the optional DMA interface is off. 
- s.spm_dma_wval = Wire() - s.spm_dma_wrdy = Wire() - s.spm_dma_waddr = Wire(AddrType) - s.spm_dma_wdata = Wire(DmaDataType) - s.spm_dma_wmask = Wire(DmaMaskType) - - s.spm_dma_rval = Wire() - s.spm_dma_rrdy = Wire() - s.spm_dma_raddr = Wire(AddrType) - s.spm_dma_rresp_val = Wire() - s.spm_dma_rresp_rdy = Wire() - s.spm_dma_rresp_data = Wire(DmaDataType) - - s.spm_dma_wval //= 0 - s.spm_dma_waddr //= AddrType(0) - s.spm_dma_wdata //= DmaDataType(0) - s.spm_dma_wmask //= DmaMaskType(0) - s.spm_dma_rval //= 0 - s.spm_dma_raddr //= AddrType(0) - s.spm_dma_rresp_rdy //= 0 # Components. # A list of DataMemWrapperRTL instances. Each one is a single memory bank. @@ -396,11 +363,10 @@ def update_all(): s.write_crossbar.recv[i].val @= 0 s.write_crossbar.recv[i].msg @= MemWritePktType(0, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) - if has_dma_ports: - s.dma_spm.write.rdy @= 0 - s.dma_spm.read.rdy @= 0 - s.dma_spm.read_resp.val @= 0 - s.dma_spm.read_resp.msg @= DmaSpmReadRespType(DmaSpmDataType(0)) + s.dma_spm.write.rdy @= 0 + s.dma_spm.read.rdy @= 0 + s.dma_spm.read_resp.val @= 0 + s.dma_spm.read_resp.msg @= DmaSpmReadRespType(DmaSpmDataType(0)) s.send_to_noc_load_request_pkt.msg @= \ NocPktType(0, # src From 37a363e24e6516a7e556e3fae3baafe610bb1095 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 23 Jun 2026 19:31:21 +0800 Subject: [PATCH 29/46] Split the dma_spm_to_dram into 3 signals. 
--- controller/ControllerRTL.py | 58 ++++++++++++++++++++----------------- lib/basic/val_rdy/ifcs.py | 47 ------------------------------ 2 files changed, 31 insertions(+), 74 deletions(-) diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index 071d53c9..a5ff04a0 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -11,7 +11,6 @@ from ..lib.basic.val_rdy.ifcs import RecvIfcRTL from ..lib.basic.val_rdy.ifcs import SendIfcRTL -from ..lib.basic.val_rdy.ifcs import DmaSpmMasterIfcRTL, DmaSpmMinionIfcRTL from ..lib.basic.val_rdy.queues import NormalQueueRTL from ..lib.messages import * from ..lib.opt_type import * @@ -102,15 +101,20 @@ def construct(s, # Receive the DMA done signal from the DMA engine. s.dma_done = RecvIfcRTL(DmaDoneType) - # DMA engine side of the controller-forwarded SPM access path. - s.dma_spm_from_dma = DmaSpmMinionIfcRTL(DmaSpmWriteReqType, - DmaSpmReadReqType, - DmaSpmReadRespType) + # Receive the request of writing into SPM from the DMA. + s.recv_from_dma_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType) + # Receive the request of reading from SPM from the DMA. + s.recv_from_dma_spm_rd_req = RecvIfcRTL(DmaSpmReadReqType) + # Send the response of reading from SPM to the DMA. + s.send_to_dma_spm_rd_resp = SendIfcRTL(DmaSpmReadRespType) # Data memory side of the same SPM access path. - s.dma_spm_to_mem = DmaSpmMasterIfcRTL(DmaSpmWriteReqType, - DmaSpmReadReqType, - DmaSpmReadRespType) + # Send the request of writing into SPM to the data_mem controller. + s.send_to_mem_spm_wr_req = SendIfcRTL(DmaSpmWriteReqType) + # Send the request of reading from SPM to the data_mem controller. + s.send_to_mem_spm_rd_req = SendIfcRTL(DmaSpmReadReqType) + # Receive the response of reading from SPM from the data_mem controller. 
+ s.recv_from_mem_spm_rd_resp = RecvIfcRTL(DmaSpmReadRespType) # Component @@ -209,26 +213,26 @@ def update_dma_cmd_regs(): @update def update_dma_spm_forwarding(): if has_dma_ports: - s.dma_spm_to_mem.write.val @= s.dma_spm_from_dma.write.val - s.dma_spm_from_dma.write.rdy @= s.dma_spm_to_mem.write.rdy - s.dma_spm_to_mem.write.msg @= s.dma_spm_from_dma.write.msg - - s.dma_spm_to_mem.read.val @= s.dma_spm_from_dma.read.val - s.dma_spm_from_dma.read.rdy @= s.dma_spm_to_mem.read.rdy - s.dma_spm_to_mem.read.msg @= s.dma_spm_from_dma.read.msg - s.dma_spm_from_dma.read_resp.val @= s.dma_spm_to_mem.read_resp.val - s.dma_spm_to_mem.read_resp.rdy @= s.dma_spm_from_dma.read_resp.rdy - s.dma_spm_from_dma.read_resp.msg @= s.dma_spm_to_mem.read_resp.msg + s.send_to_mem_spm_wr_req.val @= s.recv_from_dma_spm_wr_req.val + s.recv_from_dma_spm_wr_req.rdy @= s.send_to_mem_spm_wr_req.rdy + s.send_to_mem_spm_wr_req.msg @= s.recv_from_dma_spm_wr_req.msg + + s.send_to_mem_spm_rd_req.val @= s.recv_from_dma_spm_rd_req.val + s.recv_from_dma_spm_rd_req.rdy @= s.send_to_mem_spm_rd_req.rdy + s.send_to_mem_spm_rd_req.msg @= s.recv_from_dma_spm_rd_req.msg + s.send_to_dma_spm_rd_resp.val @= s.recv_from_mem_spm_rd_resp.val + s.recv_from_mem_spm_rd_resp.rdy @= s.send_to_dma_spm_rd_resp.rdy + s.send_to_dma_spm_rd_resp.msg @= s.recv_from_mem_spm_rd_resp.msg else: - s.dma_spm_to_mem.write.val @= 0 - s.dma_spm_to_mem.write.msg @= DmaSpmWriteReqType() - s.dma_spm_to_mem.read.val @= 0 - s.dma_spm_to_mem.read.msg @= DmaSpmReadReqType() - s.dma_spm_to_mem.read_resp.rdy @= 0 - s.dma_spm_from_dma.write.rdy @= 0 - s.dma_spm_from_dma.read.rdy @= 0 - s.dma_spm_from_dma.read_resp.val @= 0 - s.dma_spm_from_dma.read_resp.msg @= DmaSpmReadRespType() + s.send_to_mem_spm_wr_req.val @= 0 + s.send_to_mem_spm_wr_req.msg @= DmaSpmWriteReqType() + s.send_to_mem_spm_rd_req.val @= 0 + s.send_to_mem_spm_rd_req.msg @= DmaSpmReadReqType() + s.recv_from_mem_spm_rd_resp.rdy @= 0 + s.recv_from_dma_spm_wr_req.rdy @= 0 + 
s.recv_from_dma_spm_rd_req.rdy @= 0 + s.send_to_dma_spm_rd_resp.val @= 0 + s.send_to_dma_spm_rd_resp.msg @= DmaSpmReadRespType() @update def update_received_msg(): diff --git a/lib/basic/val_rdy/ifcs.py b/lib/basic/val_rdy/ifcs.py index 9a30f8d7..22d328a8 100644 --- a/lib/basic/val_rdy/ifcs.py +++ b/lib/basic/val_rdy/ifcs.py @@ -86,53 +86,6 @@ def construct( s, ReqType, RespType ): def __str__( s ): return f"{s.req}|{s.resp}" -class DmaSpmMasterIfcRTL( Interface ): - """ - DMA-to-SPM Master Interface. - - This interface is instantiated on the DMA side. - It initiates all transfer requests (both write and read) to the SPM - and receives the corresponding read data back. - - Direction: - - write : Output (Send). DMA sends write requests to SPM. - - read : Output (Send). DMA sends read requests to SPM. - - read_resp: Input (Recv). DMA receives read data from SPM. - """ - - def construct( s, WriteReqType, ReadReqType, ReadRespType ): - s.WriteReqType = WriteReqType - s.ReadReqType = ReadReqType - s.ReadRespType = ReadRespType - s.write = SendIfcRTL( WriteReqType ) - s.read = SendIfcRTL( ReadReqType ) - s.read_resp = RecvIfcRTL( ReadRespType ) - def __str__( s ): - return f"wr:{s.write}|rd:{s.read}|resp:{s.read_resp}" - -class DmaSpmMinionIfcRTL( Interface ): - """ - DMA-to-SPM Minion Interface. - - This interface is instantiated on the SPM side. - It passively accepts incoming transfer requests from the DMA master, - performs the requested memory operations, and returns read data if needed. - - Direction: - - write : Input (Recv). SPM receives write requests from DMA. - - read : Input (Recv). SPM receives read requests from DMA. - - read_resp: Output (Send). SPM sends read data back to DMA. 
- """ - def construct( s, WriteReqType, ReadReqType, ReadRespType ): - s.WriteReqType = WriteReqType - s.ReadReqType = ReadReqType - s.ReadRespType = ReadRespType - s.write = RecvIfcRTL( WriteReqType ) - s.read = RecvIfcRTL( ReadReqType ) - s.read_resp = SendIfcRTL( ReadRespType ) - def __str__( s ): - return f"wr:{s.write}|rd:{s.read}|resp:{s.read_resp}" - class DmaDramWrReqIfcRTL( Interface ): """ DMA-to-DRAM Write Request Interface. From af3c0a68719a99830e193db61e93261957b22427 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 23 Jun 2026 19:34:12 +0800 Subject: [PATCH 30/46] Deprecate the DmaSpmMasterRTL in DMA module --- mem/dma/DmaEngineRTL.py | 39 ++++++++++++++----------------- mem/dma/test/DmaEngineRTL_test.py | 32 ++++++++++++------------- 2 files changed, 34 insertions(+), 37 deletions(-) diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index 713fecc3..252b00a5 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -10,7 +10,6 @@ from pymtl3 import * from ...lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL from ...lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL -from ...lib.basic.val_rdy.ifcs import DmaSpmMasterIfcRTL from ...lib.basic.val_rdy.ifcs import DmaDramWrReqIfcRTL from ...lib.messages import * from ...lib.util.common import DMA_MVIN, DMA_MVOUT, CHAR_BIT, StateType, STATE_DMA_IDLE, STATE_DMA_MVIN_REQ, STATE_DMA_MVIN_RESP, STATE_DMA_MVIN_WRITE, STATE_DMA_MVOUT_READ, STATE_DMA_MVOUT_RESP, STATE_DMA_MVOUT_WRITE, STATE_DMA_MVOUT_WAIT, STATE_DMA_DONE @@ -83,10 +82,12 @@ def construct( s, s.dram_wr_resp_val = InPort() s.dram_wr_resp_rdy = OutPort() - # SPM interface - s.spm = DmaSpmMasterIfcRTL(DmaSpmWriteReqType, - DmaSpmReadReqType, - DmaSpmReadRespType) + # Send write request to SPM. + s.send_spm_wr_req = SendIfcRTL(DmaSpmWriteReqType) + # Send read request to SPM. + s.send_spm_rd_req = SendIfcRTL(DmaSpmReadReqType) + # Receive read response from SPM. 
+ s.recv_spm_rd_resp = RecvIfcRTL(DmaSpmReadRespType) # State machine definitions @@ -135,10 +136,6 @@ def comb_outputs(): s.send_dram_rd_req.msg @= s.dram_addr_reg s.recv_dram_rd_resp.rdy @= s.state == STATE_DMA_MVIN_RESP - # s.dram_wr_req_val @= s.state == STATE_DMA_MVOUT_WRITE - # s.dram_wr_req_addr @= s.dram_addr_reg - # s.dram_wr_req_data @= s.beat_reg - # s.dram_wr_req_mask @= s.wr_mask_reg s.dram_wr_req.val @= s.state == STATE_DMA_MVOUT_WRITE s.dram_wr_req.addr @= s.dram_addr_reg s.dram_wr_req.data @= s.beat_reg @@ -157,15 +154,15 @@ def comb_outputs(): else: # 4th word spm_wdata = s.beat_reg[spm_data_nbits*3:spm_data_nbits*4] - s.spm.write.val @= s.state == STATE_DMA_MVIN_WRITE - s.spm.write.msg @= DmaSpmWriteReqType( + s.send_spm_wr_req.val @= s.state == STATE_DMA_MVIN_WRITE + s.send_spm_wr_req.msg @= DmaSpmWriteReqType( s.spm_addr_reg, spm_wdata, SpmMaskType( (1 << (spm_data_nbits // CHAR_BIT)) - 1 ) ) - s.spm.read.val @= s.state == STATE_DMA_MVOUT_READ - s.spm.read.msg @= DmaSpmReadReqType(s.spm_addr_reg) - s.spm.read_resp.rdy @= s.state == STATE_DMA_MVOUT_RESP + s.send_spm_rd_req.val @= s.state == STATE_DMA_MVOUT_READ + s.send_spm_rd_req.msg @= DmaSpmReadReqType(s.spm_addr_reg) + s.recv_spm_rd_resp.rdy @= s.state == STATE_DMA_MVOUT_RESP @update_ff def seq_state(): @@ -216,7 +213,7 @@ def seq_state(): s.state_ff <<= STATE_DMA_MVIN_WRITE # Move to the next state: to write to SPM. elif s.state == STATE_DMA_MVIN_WRITE: # Writes to SPM. - if s.spm.write.val & s.spm.write.rdy: + if s.send_spm_wr_req.val & s.send_spm_wr_req.rdy: # Update the SPM address where write next cycle(+1) s.spm_addr_ff <<= s.spm_addr_reg + SpmAddrType( 1 ) # Update the number of words remaining to write to SPM. 
@@ -231,25 +228,25 @@ def seq_state(): s.word_idx_ff <<= s.word_idx_reg + b2( 1 ) elif s.state == STATE_DMA_MVOUT_READ: - if s.spm.read.val & s.spm.read.rdy: + if s.send_spm_rd_req.val & s.send_spm_rd_req.rdy: s.state_ff <<= STATE_DMA_MVOUT_RESP # Move to the next state: to receive a response from SPM. elif s.state == STATE_DMA_MVOUT_RESP: - if s.spm.read_resp.val & s.spm.read_resp.rdy: + if s.recv_spm_rd_resp.val & s.recv_spm_rd_resp.rdy: # Pack the response from SPM into a 128-bit beat by left-shifting. if s.word_idx_reg == b2( 0 ): # 1st word s.beat_ff <<= concat( s.beat_reg[spm_data_nbits:spm_data_nbits*4], - s.spm.read_resp.msg.data ) + s.recv_spm_rd_resp.msg.data ) elif s.word_idx_reg == b2( 1 ): s.beat_ff <<= concat( s.beat_reg[spm_data_nbits*2:spm_data_nbits*4], - s.spm.read_resp.msg.data, + s.recv_spm_rd_resp.msg.data, s.beat_reg[0:spm_data_nbits] ) elif s.word_idx_reg == b2( 2 ): s.beat_ff <<= concat( s.beat_reg[spm_data_nbits*3:spm_data_nbits*4], - s.spm.read_resp.msg.data, + s.recv_spm_rd_resp.msg.data, s.beat_reg[0:spm_data_nbits*2] ) else: - s.beat_ff <<= concat( s.spm.read_resp.msg.data, + s.beat_ff <<= concat( s.recv_spm_rd_resp.msg.data, s.beat_reg[0:spm_data_nbits*3] ) s.spm_addr_ff <<= s.spm_addr_reg + SpmAddrType( 1 ) diff --git a/mem/dma/test/DmaEngineRTL_test.py b/mem/dma/test/DmaEngineRTL_test.py index 062143a7..2b73735b 100644 --- a/mem/dma/test/DmaEngineRTL_test.py +++ b/mem/dma/test/DmaEngineRTL_test.py @@ -28,10 +28,10 @@ def make_dut(): dut.dram_wr_req.rdy @= 1 dut.dram_wr_resp_val @= 1 - dut.spm.write.rdy @= 1 - dut.spm.read.rdy @= 1 - dut.spm.read_resp.val @= 0 - dut.spm.read_resp.msg.data @= 0 + dut.send_spm_wr_req.rdy @= 1 + dut.send_spm_rd_req.rdy @= 1 + dut.recv_spm_rd_resp.val @= 0 + dut.recv_spm_rd_resp.msg.data @= 0 dut.sim_eval_combinational() return dut @@ -100,8 +100,8 @@ def test_dma_mvin_one_beat(): else: pending_resp = None - if dut.spm.write.val & dut.spm.write.rdy: - spm_writes.append((int(dut.spm.write.msg.addr), 
int(dut.spm.write.msg.data))) + if dut.send_spm_wr_req.val & dut.send_spm_wr_req.rdy: + spm_writes.append((int(dut.send_spm_wr_req.msg.addr), int(dut.send_spm_wr_req.msg.data))) if dut.dma_done.val: assert int(dut.dma_done.msg.tag) == 0x5a @@ -147,15 +147,15 @@ def test_dma_mvout_partial_beat(): mem_writes = [] for _ in range(30): - dut.spm.read_resp.val @= 0 + dut.recv_spm_rd_resp.val @= 0 if pending_rresp is not None: - dut.spm.read_resp.val @= 1 - dut.spm.read_resp.msg.data @= pending_rresp + dut.recv_spm_rd_resp.val @= 1 + dut.recv_spm_rd_resp.msg.data @= pending_rresp dut.sim_eval_combinational() - if dut.spm.read.val & dut.spm.read.rdy: - pending_rresp = spm[int(dut.spm.read.msg.addr)] + if dut.send_spm_rd_req.val & dut.send_spm_rd_req.rdy: + pending_rresp = spm[int(dut.send_spm_rd_req.msg.addr)] else: pending_rresp = None @@ -204,15 +204,15 @@ def test_dma_mvout_full_beat(): mem_writes = [] for _ in range(30): - dut.spm.read_resp.val @= 0 + dut.recv_spm_rd_resp.val @= 0 if pending_rresp is not None: - dut.spm.read_resp.val @= 1 - dut.spm.read_resp.msg.data @= pending_rresp + dut.recv_spm_rd_resp.val @= 1 + dut.recv_spm_rd_resp.msg.data @= pending_rresp dut.sim_eval_combinational() - if dut.spm.read.val & dut.spm.read.rdy: - pending_rresp = spm[int(dut.spm.read.msg.addr)] + if dut.send_spm_rd_req.val & dut.send_spm_rd_req.rdy: + pending_rresp = spm[int(dut.send_spm_rd_req.msg.addr)] else: pending_rresp = None From 0fb1b5aec800cb41b7c9e72a2da5c4e33c80babd Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 23 Jun 2026 19:35:31 +0800 Subject: [PATCH 31/46] Refactor DataMemControllerRTL to replace DmaSpmMinionIfcRTL with ValRdyRecv/SendIfcRTL for improved clarity and consistency in DMA signal handling. 
--- mem/data/DataMemControllerRTL.py | 35 +++++++++-------- .../test/DataMemControllerRTL_dma_test.py | 38 +++++++++---------- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/mem/data/DataMemControllerRTL.py b/mem/data/DataMemControllerRTL.py index c8751ac8..55d4288f 100644 --- a/mem/data/DataMemControllerRTL.py +++ b/mem/data/DataMemControllerRTL.py @@ -29,7 +29,6 @@ from .DataMemWrapperRTL import DataMemWrapperRTL from ...lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL from ...lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL -from ...lib.basic.val_rdy.ifcs import DmaSpmMinionIfcRTL from ...lib.messages import * from ...noc.PyOCN.pymtl3_net.xbar.XbarBypassQueueRTL import XbarBypassQueueRTL from ...lib.util.data_struct_attr import * @@ -157,9 +156,9 @@ def construct(s, s.send_to_noc_load_request_pkt = SendIfcRTL(NocPktType) s.send_to_noc_store_pkt = SendIfcRTL(NocPktType) - s.dma_spm = DmaSpmMinionIfcRTL(DmaSpmWriteReqType, - DmaSpmReadReqType, - DmaSpmReadRespType) + s.recv_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType) + s.recv_spm_rd_req = RecvIfcRTL(DmaSpmReadReqType) + s.send_spm_rd_resp = SendIfcRTL(DmaSpmReadRespType) # Components. # A list of DataMemWrapperRTL instances. Each one is a single memory bank. 
@@ -274,7 +273,7 @@ def assemble_xbar_pkt(): dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) - recv_raddr_from_dma = trunc(s.dma_spm.read.msg.addr, AddrType) + recv_raddr_from_dma = trunc(s.recv_spm_rd_req.msg.addr, AddrType) if (recv_raddr_from_dma >= s.address_lower) & (recv_raddr_from_dma <= s.address_upper): bank_index_load_from_dma = trunc((recv_raddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutRdType) else: @@ -287,7 +286,7 @@ def assemble_xbar_pkt(): 0, # src_tile 0) # remote_src_port - recv_waddr_from_dma = trunc(s.dma_spm.write.msg.addr, AddrType) + recv_waddr_from_dma = trunc(s.recv_spm_wr_req.msg.addr, AddrType) if (recv_waddr_from_dma >= s.address_lower) & (recv_waddr_from_dma <= s.address_upper): bank_index_store_from_dma = trunc((recv_waddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutWrType) else: @@ -295,7 +294,7 @@ def assemble_xbar_pkt(): s.wr_pkt[dma_wr_idx] @= MemWritePktType(dma_wr_idx, # src bank_index_store_from_dma, # dst recv_waddr_from_dma, # addr - DataType(zext(s.dma_spm.write.msg.data, PayloadType), 1, 0, 0), + DataType(zext(s.recv_spm_wr_req.msg.data, PayloadType), 1, 0, 0), 0, # src_cgra 0, # src_tile 0) # remote_src_port @@ -363,10 +362,10 @@ def update_all(): s.write_crossbar.recv[i].val @= 0 s.write_crossbar.recv[i].msg @= MemWritePktType(0, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) - s.dma_spm.write.rdy @= 0 - s.dma_spm.read.rdy @= 0 - s.dma_spm.read_resp.val @= 0 - s.dma_spm.read_resp.msg @= DmaSpmReadRespType(DmaSpmDataType(0)) + s.recv_spm_wr_req.rdy @= 0 + s.recv_spm_rd_req.rdy @= 0 + s.send_spm_rd_resp.val @= 0 + s.send_spm_rd_resp.msg @= DmaSpmReadRespType(DmaSpmDataType(0)) s.send_to_noc_load_request_pkt.msg @= \ NocPktType(0, # src @@ -399,9 +398,9 @@ def update_all(): # NOTE Don't use `dma_rd_idx = num_rd_tiles + 1` here since it will cause the bit mismatch error # between `dma_rd_idx` and `num_xbar_in_rd_ports`. 
dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) - s.read_crossbar.recv[dma_rd_idx].val @= s.dma_spm.read.val + s.read_crossbar.recv[dma_rd_idx].val @= s.recv_spm_rd_req.val s.read_crossbar.recv[dma_rd_idx].msg @= s.rd_pkt[dma_rd_idx] - s.dma_spm.read.rdy @= s.read_crossbar.recv[dma_rd_idx].rdy + s.recv_spm_rd_req.rdy @= s.read_crossbar.recv[dma_rd_idx].rdy # Connects the store request ports (from tiles and NoC) to the xbar targetting memory and NoC. for i in range(num_wr_tiles): @@ -419,9 +418,9 @@ def update_all(): # NOTE Don't use `dma_wr_idx = num_wr_tiles + 1` here since it will cause the bit mismatch error # between `dma_wr_idx` and `num_xbar_in_wr_ports`. dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) - s.write_crossbar.recv[dma_wr_idx].val @= s.dma_spm.write.val + s.write_crossbar.recv[dma_wr_idx].val @= s.recv_spm_wr_req.val s.write_crossbar.recv[dma_wr_idx].msg @= s.wr_pkt[dma_wr_idx] - s.dma_spm.write.rdy @= s.write_crossbar.recv[dma_wr_idx].rdy + s.recv_spm_wr_req.rdy @= s.write_crossbar.recv[dma_wr_idx].rdy # Connects the response ports to tiles and NoC from the xbar. # Number of load responses is expected to be the same as the number of load requests. @@ -454,10 +453,10 @@ def update_all(): s.send_to_noc_load_response_pkt.val @= s.response_crossbar.send[i].val s.response_crossbar.send[i].rdy @= s.send_to_noc_load_response_pkt.rdy elif has_dma_ports: - s.dma_spm.read_resp.msg @= DmaSpmReadRespType( + s.send_spm_rd_resp.msg @= DmaSpmReadRespType( trunc(s.response_crossbar.send[i].msg.data.payload, DmaSpmDataType)) - s.dma_spm.read_resp.val @= s.response_crossbar.send[i].val - s.response_crossbar.send[i].rdy @= s.dma_spm.read_resp.rdy + s.send_spm_rd_resp.val @= s.response_crossbar.send[i].val + s.response_crossbar.send[i].rdy @= s.send_spm_rd_resp.rdy # Handles the request (not response) towards the others via the NoC. The dst would be # updated in the controller. 
diff --git a/mem/data/test/DataMemControllerRTL_dma_test.py b/mem/data/test/DataMemControllerRTL_dma_test.py index 5a2d43de..74d39901 100644 --- a/mem/data/test/DataMemControllerRTL_dma_test.py +++ b/mem/data/test/DataMemControllerRTL_dma_test.py @@ -44,13 +44,13 @@ def drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr dut.send_to_noc_store_pkt.rdy @= 1 DmaSpmAddrType = mk_dma_cmd().get_field_type(kAttrSpmAddr) - dut.dma_spm.write.val @= 0 - dut.dma_spm.write.msg.addr @= DmaSpmAddrType(0) - dut.dma_spm.write.msg.data @= 0 - dut.dma_spm.write.msg.mask @= 0 - dut.dma_spm.read.val @= 0 - dut.dma_spm.read.msg.addr @= DmaSpmAddrType(0) - dut.dma_spm.read_resp.rdy @= 1 + dut.recv_spm_wr_req.val @= 0 + dut.recv_spm_wr_req.msg.addr @= DmaSpmAddrType(0) + dut.recv_spm_wr_req.msg.data @= 0 + dut.recv_spm_wr_req.msg.mask @= 0 + dut.recv_spm_rd_req.val @= 0 + dut.recv_spm_rd_req.msg.addr @= DmaSpmAddrType(0) + dut.send_spm_rd_resp.rdy @= 1 dut.cgra_id @= 0 dut.address_lower @= DataAddrType(0) @@ -91,25 +91,25 @@ def test_dma_ports_write_then_read(): drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles) DmaSpmAddrType = mk_dma_cmd().get_field_type(kAttrSpmAddr) - dut.dma_spm.write.val @= 1 - dut.dma_spm.write.msg.addr @= DmaSpmAddrType(3) - dut.dma_spm.write.msg.data @= 0xaaaabbbb - dut.dma_spm.write.msg.mask @= 0xf + dut.recv_spm_wr_req.val @= 1 + dut.recv_spm_wr_req.msg.addr @= DmaSpmAddrType(3) + dut.recv_spm_wr_req.msg.data @= 0xaaaabbbb + dut.recv_spm_wr_req.msg.mask @= 0xf dut.sim_eval_combinational() - assert dut.dma_spm.write.rdy + assert dut.recv_spm_wr_req.rdy dut.sim_tick() - dut.dma_spm.write.val @= 0 + dut.recv_spm_wr_req.val @= 0 - dut.dma_spm.read.val @= 1 - dut.dma_spm.read.msg.addr @= DmaSpmAddrType(3) + dut.recv_spm_rd_req.val @= 1 + dut.recv_spm_rd_req.msg.addr @= DmaSpmAddrType(3) seen_response = False for _ in range(10): dut.sim_eval_combinational() - if dut.dma_spm.read.val & dut.dma_spm.read.rdy: - 
dut.dma_spm.read.val @= 0 - if dut.dma_spm.read_resp.val: - assert int(dut.dma_spm.read_resp.msg.data) == 0xaaaabbbb + if dut.recv_spm_rd_req.val & dut.recv_spm_rd_req.rdy: + dut.recv_spm_rd_req.val @= 0 + if dut.send_spm_rd_resp.val: + assert int(dut.send_spm_rd_resp.msg.data) == 0xaaaabbbb seen_response = True break dut.sim_tick() From 0bb2d9cc246a5bf8eb7eacb830e2af74a2b71bdb Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 23 Jun 2026 19:35:49 +0800 Subject: [PATCH 32/46] Refactor CgraDmaRTL and CgraTemplateRTL to replace DmaSpmMinionIfcRTL with ValRdyRecv/SendIfcRTL --- cgra/CgraDmaRTL.py | 4 +++- cgra/CgraTemplateRTL.py | 31 ++++++++++++++++++------------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/cgra/CgraDmaRTL.py b/cgra/CgraDmaRTL.py index 846ba441..f16b1fb4 100644 --- a/cgra/CgraDmaRTL.py +++ b/cgra/CgraDmaRTL.py @@ -190,7 +190,9 @@ def construct(s, CgraPayloadType, # DMA to controller-forwarded SPM connections. - s.dma.spm //= s.cgra.dma_spm + s.dma.send_spm_wr_req //= s.cgra.recv_from_dma_spm_wr_req + s.dma.send_spm_rd_req //= s.cgra.recv_from_dma_spm_rd_req + s.dma.recv_spm_rd_resp //= s.cgra.send_to_dma_spm_rd_resp def line_trace(s): return f"{s.dma.line_trace()} || {s.cgra.line_trace()}" diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index a1bf4f86..ff23154b 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -9,7 +9,6 @@ from ..controller.ControllerRTL import ControllerRTL from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL from ..lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL -from ..lib.basic.val_rdy.ifcs import DmaSpmMinionIfcRTL from ..lib.basic.val_rdy.queues import BypassQueueRTL from ..lib.opt_type import * from ..lib.util.common import * @@ -154,9 +153,12 @@ def construct(s, CgraPayloadType, s.dma_done = RecvIfcRTL(DmaDoneType) - s.dma_spm = DmaSpmMinionIfcRTL(DmaSpmWriteReqType, - DmaSpmReadReqType, - DmaSpmReadRespType) + # Receive the request of 
writing into SPM from the DMA. + s.recv_from_dma_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType) + # Receive the request of reading from SPM from the DMA. + s.recv_from_dma_spm_rd_req = RecvIfcRTL(DmaSpmReadReqType) + # Send the response of reading from SPM to the DMA. + s.send_to_dma_spm_rd_resp = SendIfcRTL(DmaSpmReadRespType) if is_multi_cgra: @@ -225,8 +227,9 @@ def construct(s, CgraPayloadType, s.dma_cmd //= s.controller.dma_cmd s.dma_done //= s.controller.dma_done - # DMA engine <-> controller side of the SPM path. - s.dma_spm //= s.controller.dma_spm_from_dma + s.recv_from_dma_spm_wr_req //= s.controller.recv_from_dma_spm_wr_req + s.recv_from_dma_spm_rd_req //= s.controller.recv_from_dma_spm_rd_req + s.send_to_dma_spm_rd_resp //= s.controller.send_to_dma_spm_rd_resp else: # Grounds the DMA ports when no DMA engine is attached. @@ -234,14 +237,16 @@ def construct(s, CgraPayloadType, s.controller.dma_done.val //= 0 s.controller.dma_done.msg //= DmaDoneType() - s.controller.dma_spm_from_dma.write.val //= 0 - s.controller.dma_spm_from_dma.write.msg //= DmaSpmWriteReqType() - s.controller.dma_spm_from_dma.read.val //= 0 - s.controller.dma_spm_from_dma.read.msg //= DmaSpmReadReqType() - s.controller.dma_spm_from_dma.read_resp.rdy //= 0 + s.controller.recv_from_dma_spm_wr_req.val //= 0 + s.controller.recv_from_dma_spm_wr_req.msg //= DmaSpmWriteReqType() + s.controller.recv_from_dma_spm_rd_req.val //= 0 + s.controller.recv_from_dma_spm_rd_req.msg //= DmaSpmReadReqType() + s.controller.send_to_dma_spm_rd_resp.rdy //= 0 - # Controller <-> data memory side of the SPM path. - s.controller.dma_spm_to_mem //= s.data_mem.dma_spm + # Controller <-> SPM/data_mem + s.controller.send_to_mem_spm_wr_req //= s.data_mem.recv_spm_wr_req + s.controller.send_to_mem_spm_rd_req //= s.data_mem.recv_spm_rd_req + s.controller.recv_from_mem_spm_rd_resp //= s.data_mem.send_spm_rd_resp # Connects data memory with controller. 
s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request From 06eeec43bbe459cb3989848d08be12dc26315594 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 23 Jun 2026 19:38:19 +0800 Subject: [PATCH 33/46] Add CgraDmaRTL wrapper integrating CGRA with DMA engine and corresponding tests --- cgra/{CgraDmaRTL.py => IntegratedCgraWithDmaRTL.py} | 0 .../test/{CgraDmaRTL_test.py => IntegratedCgraWithDmaRTL_test.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename cgra/{CgraDmaRTL.py => IntegratedCgraWithDmaRTL.py} (100%) rename cgra/test/{CgraDmaRTL_test.py => IntegratedCgraWithDmaRTL_test.py} (100%) diff --git a/cgra/CgraDmaRTL.py b/cgra/IntegratedCgraWithDmaRTL.py similarity index 100% rename from cgra/CgraDmaRTL.py rename to cgra/IntegratedCgraWithDmaRTL.py diff --git a/cgra/test/CgraDmaRTL_test.py b/cgra/test/IntegratedCgraWithDmaRTL_test.py similarity index 100% rename from cgra/test/CgraDmaRTL_test.py rename to cgra/test/IntegratedCgraWithDmaRTL_test.py From 33622ea2167b989ab2665c20576decc2219cbb6f Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 23 Jun 2026 20:02:48 +0800 Subject: [PATCH 34/46] Refactor CgraDmaRTL to replace DmaDramWrReqIfcRTL with new DMA DRAM write request type and update corresponding tests for consistency --- cgra/IntegratedCgraWithDmaRTL.py | 14 +++++------- cgra/test/IntegratedCgraWithDmaRTL_test.py | 22 ++++++++++--------- lib/basic/val_rdy/ifcs.py | 19 ---------------- lib/messages.py | 25 ++++++++++++++++++++++ mem/dma/DmaEngineRTL.py | 21 +++++++++--------- mem/dma/test/DmaEngineRTL_test.py | 21 +++++++++--------- 6 files changed, 63 insertions(+), 59 deletions(-) diff --git a/cgra/IntegratedCgraWithDmaRTL.py b/cgra/IntegratedCgraWithDmaRTL.py index f16b1fb4..b3b30ab0 100644 --- a/cgra/IntegratedCgraWithDmaRTL.py +++ b/cgra/IntegratedCgraWithDmaRTL.py @@ -12,7 +12,6 @@ from .CgraTemplateRTL import CgraTemplateRTL from ..lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL from 
..lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL -from ..lib.basic.val_rdy.ifcs import DmaDramWrReqIfcRTL from ..lib.messages import * from ..lib.util.data_struct_attr import * from ..mem.dma.DmaEngineRTL import DmaEngineRTL @@ -82,6 +81,7 @@ def construct(s, CgraPayloadType, DmaDramAddrType = DmaCmdType.get_field_type(kAttrDramAddr) DmaMemDataType = DmaDataType.get_field_type(kAttrDramData) DmaMemMaskType = DmaDataType.get_field_type(kAttrDramMask) + DmaDramWrReqType = mk_dma_dram_wr_req(DmaDramAddrType.nbits, DmaMemDataType.nbits, DmaMemMaskType.nbits) # Existing CGRA-facing interfaces. # CGRA <-> CPU @@ -112,10 +112,8 @@ def construct(s, CgraPayloadType, s.send_dram_rd_req = SendIfcRTL(DmaDramAddrType) s.recv_dram_rd_resp = RecvIfcRTL(DmaMemDataType) - s.dram_wr_req = DmaDramWrReqIfcRTL(DmaDramAddrType, DmaMemDataType, DmaMemMaskType) - - s.dram_wr_resp_val = InPort() - s.dram_wr_resp_rdy = OutPort() + s.send_to_dram_wr_req = SendIfcRTL(DmaDramWrReqType) + s.recv_from_dram_wr_resp = RecvIfcRTL(mk_bits(1)) # Components. @@ -183,10 +181,8 @@ def construct(s, CgraPayloadType, s.send_dram_rd_req //= s.dma.send_dram_rd_req s.recv_dram_rd_resp //= s.dma.recv_dram_rd_resp - s.dram_wr_req //= s.dma.dram_wr_req - - s.dram_wr_resp_val //= s.dma.dram_wr_resp_val - s.dram_wr_resp_rdy //= s.dma.dram_wr_resp_rdy + s.send_to_dram_wr_req //= s.dma.send_to_dram_wr_req + s.recv_from_dram_wr_resp //= s.dma.recv_from_dram_wr_resp # DMA to controller-forwarded SPM connections. 
diff --git a/cgra/test/IntegratedCgraWithDmaRTL_test.py b/cgra/test/IntegratedCgraWithDmaRTL_test.py index 810c11c2..3a955197 100644 --- a/cgra/test/IntegratedCgraWithDmaRTL_test.py +++ b/cgra/test/IntegratedCgraWithDmaRTL_test.py @@ -6,7 +6,7 @@ from pymtl3 import * -from ..CgraDmaRTL import CgraDmaRTL +from ..IntegratedCgraWithDmaRTL import CgraDmaRTL from ...fu.single.AdderRTL import AdderRTL from ...fu.single.MemUnitRTL import MemUnitRTL from ...fu.single.RetRTL import RetRTL @@ -159,8 +159,9 @@ def test_cgra_dma_mvin_to_local_spm(): dut.send_dram_rd_req.rdy @= 1 dut.recv_dram_rd_resp.val @= 0 dut.recv_dram_rd_resp.msg @= 0 - dut.dram_wr_req.rdy @= 1 - dut.dram_wr_resp_val @= 0 + dut.send_to_dram_wr_req.rdy @= 1 + dut.recv_from_dram_wr_resp.val @= 0 + dut.recv_from_dram_wr_resp.msg @= 0 # Read 16 bytes from DRAM address 0x1000 and write them to SPM words 0..3. issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType, @@ -256,8 +257,9 @@ def test_cgra_dma_mvout_from_local_spm(): dut.send_dram_rd_req.rdy @= 1 dut.recv_dram_rd_resp.val @= 0 dut.recv_dram_rd_resp.msg @= 0 - dut.dram_wr_req.rdy @= 1 - dut.dram_wr_resp_val @= 0 + dut.send_to_dram_wr_req.rdy @= 1 + dut.recv_from_dram_wr_resp.val @= 0 + dut.recv_from_dram_wr_resp.msg @= 0 # Read SPM words 0..3 and write 16 bytes to DRAM address 0x2000. 
issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType, @@ -270,16 +272,16 @@ def test_cgra_dma_mvout_from_local_spm(): done = False pending_wr_resp = False for _ in range(40): - dut.dram_wr_resp_val @= 0 + dut.recv_from_dram_wr_resp.val @= 0 if pending_wr_resp: - dut.dram_wr_resp_val @= 1 + dut.recv_from_dram_wr_resp.val @= 1 pending_wr_resp = False dut.sim_eval_combinational() - if dut.dram_wr_req.val & dut.dram_wr_req.rdy: - assert dut.dram_wr_req.addr == 0x2000 - assert dut.dram_wr_req.data == expected_beat + if dut.send_to_dram_wr_req.val & dut.send_to_dram_wr_req.rdy: + assert dut.send_to_dram_wr_req.msg.addr == 0x2000 + assert dut.send_to_dram_wr_req.msg.data == expected_beat pending_wr_resp = True if observed_dma_done(dut, 0x44): diff --git a/lib/basic/val_rdy/ifcs.py b/lib/basic/val_rdy/ifcs.py index 22d328a8..644b98f5 100644 --- a/lib/basic/val_rdy/ifcs.py +++ b/lib/basic/val_rdy/ifcs.py @@ -85,22 +85,3 @@ def construct( s, ReqType, RespType ): s.resp = SendIfcRTL( Type=RespType ) def __str__( s ): return f"{s.req}|{s.resp}" - -class DmaDramWrReqIfcRTL( Interface ): - """ - DMA-to-DRAM Write Request Interface. - - This interface is instantiated on the DMA side. - It initiates a write request to the DRAM. - - Direction: - - req: Output (Send). DMA sends write requests to DRAM. 
- """ - def construct( s, DramAddrType, DmaMemDataType, DmaMemMaskType ): - s.val = OutPort() - s.rdy = InPort() - s.addr = OutPort(DramAddrType) - s.data = OutPort(DmaMemDataType) - s.mask = OutPort(DmaMemMaskType) - def __str__( s ): - return f"val:{s.val}|rdy:{s.rdy}|addr:{s.addr}|data:{s.data}|mask:{s.mask}" \ No newline at end of file diff --git a/lib/messages.py b/lib/messages.py index 9e433cf0..0193d0e5 100644 --- a/lib/messages.py +++ b/lib/messages.py @@ -272,6 +272,31 @@ def str_func(s): namespace = {'__str__': str_func} ) +#========================================================================= +# DMA DRAM write request type +#========================================================================= +def mk_dma_dram_wr_req(addr_nbits = 64, + data_nbits = 128, + mask_nbits = 16, + prefix = "DmaDramWrReq"): + + AddrType = mk_bits(addr_nbits) + DataType = mk_bits(data_nbits) + MaskType = mk_bits(mask_nbits) + + new_name = f"{prefix}_{addr_nbits}_{data_nbits}_{mask_nbits}" + + def str_func(s): + return f"dma_dram_wr(addr={s.addr},data={s.data},mask={s.mask})" + + return mk_bitstruct(new_name, { + 'addr': AddrType, + 'data': DataType, + 'mask': MaskType, + }, + namespace = {'__str__': str_func} + ) + def mk_dma_spm_write_req(addr_nbits = 32, data_nbits = 32, prefix = "DmaSpmWriteReq"): diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index 252b00a5..7377dc50 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -10,7 +10,6 @@ from pymtl3 import * from ...lib.basic.val_rdy.ifcs import ValRdyRecvIfcRTL as RecvIfcRTL from ...lib.basic.val_rdy.ifcs import ValRdySendIfcRTL as SendIfcRTL -from ...lib.basic.val_rdy.ifcs import DmaDramWrReqIfcRTL from ...lib.messages import * from ...lib.util.common import DMA_MVIN, DMA_MVOUT, CHAR_BIT, StateType, STATE_DMA_IDLE, STATE_DMA_MVIN_REQ, STATE_DMA_MVIN_RESP, STATE_DMA_MVIN_WRITE, STATE_DMA_MVOUT_READ, STATE_DMA_MVOUT_RESP, STATE_DMA_MVOUT_WRITE, STATE_DMA_MVOUT_WAIT, STATE_DMA_DONE @@ -63,6 
+62,7 @@ def construct( s, DmaSpmWriteReqType = mk_dma_spm_write_req(spm_addr_nbits, spm_data_nbits) DmaSpmReadReqType = mk_dma_spm_read_req(spm_addr_nbits) DmaSpmReadRespType = mk_dma_spm_read_resp(spm_data_nbits) + DmaDramWrReqType = mk_dma_dram_wr_req(dram_addr_nbits, dram_data_nbits, dram_data_nbits // 8) # Command interface # Receives a DMA command from the controller. @@ -78,9 +78,8 @@ def construct( s, s.recv_dram_rd_resp = RecvIfcRTL( MemDataType ) # Request to write to DRAM - s.dram_wr_req = DmaDramWrReqIfcRTL(DramAddrType, MemDataType, MemMaskType) - s.dram_wr_resp_val = InPort() - s.dram_wr_resp_rdy = OutPort() + s.send_to_dram_wr_req = SendIfcRTL(DmaDramWrReqType) + s.recv_from_dram_wr_resp = RecvIfcRTL(mk_bits(1)) # Send write request to SPM. s.send_spm_wr_req = SendIfcRTL(DmaSpmWriteReqType) @@ -136,12 +135,12 @@ def comb_outputs(): s.send_dram_rd_req.msg @= s.dram_addr_reg s.recv_dram_rd_resp.rdy @= s.state == STATE_DMA_MVIN_RESP - s.dram_wr_req.val @= s.state == STATE_DMA_MVOUT_WRITE - s.dram_wr_req.addr @= s.dram_addr_reg - s.dram_wr_req.data @= s.beat_reg - s.dram_wr_req.mask @= s.wr_mask_reg + s.send_to_dram_wr_req.val @= s.state == STATE_DMA_MVOUT_WRITE + s.send_to_dram_wr_req.msg.addr @= s.dram_addr_reg + s.send_to_dram_wr_req.msg.data @= s.beat_reg + s.send_to_dram_wr_req.msg.mask @= s.wr_mask_reg - s.dram_wr_resp_rdy @= s.state == STATE_DMA_MVOUT_WAIT + s.recv_from_dram_wr_resp.rdy @= s.state == STATE_DMA_MVOUT_WAIT spm_wdata = SpmDataType(0) @@ -270,11 +269,11 @@ def seq_state(): s.state_ff <<= STATE_DMA_MVOUT_READ elif s.state == STATE_DMA_MVOUT_WRITE: - if s.dram_wr_req.val & s.dram_wr_req.rdy: + if s.send_to_dram_wr_req.val & s.send_to_dram_wr_req.rdy: s.state_ff <<= STATE_DMA_MVOUT_WAIT elif s.state == STATE_DMA_MVOUT_WAIT: - if s.dram_wr_resp_val & s.dram_wr_resp_rdy: + if s.recv_from_dram_wr_resp.val & s.recv_from_dram_wr_resp.rdy: # Turn to the +16 address after writing 16 bytes data. 
s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( dram_data_nbits // CHAR_BIT ) s.beat_ff <<= MemDataType( 0 ) diff --git a/mem/dma/test/DmaEngineRTL_test.py b/mem/dma/test/DmaEngineRTL_test.py index 2b73735b..4bcae68d 100644 --- a/mem/dma/test/DmaEngineRTL_test.py +++ b/mem/dma/test/DmaEngineRTL_test.py @@ -25,8 +25,9 @@ def make_dut(): dut.send_dram_rd_req.rdy @= 1 dut.recv_dram_rd_resp.val @= 0 dut.recv_dram_rd_resp.msg @= 0 - dut.dram_wr_req.rdy @= 1 - dut.dram_wr_resp_val @= 1 + dut.send_to_dram_wr_req.rdy @= 1 + dut.recv_from_dram_wr_resp.val @= 1 + dut.recv_from_dram_wr_resp.msg @= 0 dut.send_spm_wr_req.rdy @= 1 dut.send_spm_rd_req.rdy @= 1 @@ -159,10 +160,10 @@ def test_dma_mvout_partial_beat(): else: pending_rresp = None - if dut.dram_wr_req.val & dut.dram_wr_req.rdy: - mem_writes.append((int(dut.dram_wr_req.addr), - int(dut.dram_wr_req.data), - int(dut.dram_wr_req.mask))) + if dut.send_to_dram_wr_req.val & dut.send_to_dram_wr_req.rdy: + mem_writes.append((int(dut.send_to_dram_wr_req.msg.addr), + int(dut.send_to_dram_wr_req.msg.data), + int(dut.send_to_dram_wr_req.msg.mask))) if dut.dma_done.val: assert int(dut.dma_done.msg.tag) == 0xa5 @@ -216,10 +217,10 @@ def test_dma_mvout_full_beat(): else: pending_rresp = None - if dut.dram_wr_req.val & dut.dram_wr_req.rdy: - mem_writes.append((int(dut.dram_wr_req.addr), - int(dut.dram_wr_req.data), - int(dut.dram_wr_req.mask))) + if dut.send_to_dram_wr_req.val & dut.send_to_dram_wr_req.rdy: + mem_writes.append((int(dut.send_to_dram_wr_req.msg.addr), + int(dut.send_to_dram_wr_req.msg.data), + int(dut.send_to_dram_wr_req.msg.mask))) if dut.dma_done.val: assert int(dut.dma_done.msg.tag) == 0xa5 From 832f701d523ffc161c9971019af87f3955016cbf Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 23 Jun 2026 20:16:39 +0800 Subject: [PATCH 35/46] Refactor DMA signal handling across multiple components to improve clarity and consistency by renaming signals related to memory requests and responses. 
--- cgra/CgraTemplateRTL.py | 6 +-- cgra/IntegratedCgraWithDmaRTL.py | 14 +++--- cgra/test/IntegratedCgraWithDmaRTL_test.py | 20 ++++---- mem/data/DataMemControllerRTL.py | 34 ++++++------- .../test/DataMemControllerRTL_dma_test.py | 38 +++++++-------- mem/dma/DmaEngineRTL.py | 46 +++++++++--------- mem/dma/test/DmaEngineRTL_test.py | 48 +++++++++---------- 7 files changed, 103 insertions(+), 103 deletions(-) diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index ff23154b..223aa5e8 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -244,9 +244,9 @@ def construct(s, CgraPayloadType, s.controller.send_to_dma_spm_rd_resp.rdy //= 0 # Controller <-> SPM/data_mem - s.controller.send_to_mem_spm_wr_req //= s.data_mem.recv_spm_wr_req - s.controller.send_to_mem_spm_rd_req //= s.data_mem.recv_spm_rd_req - s.controller.recv_from_mem_spm_rd_resp //= s.data_mem.send_spm_rd_resp + s.controller.send_to_mem_spm_wr_req //= s.data_mem.recv_from_ctrl_spm_wr_req + s.controller.send_to_mem_spm_rd_req //= s.data_mem.recv_from_ctrl_spm_rd_req + s.controller.recv_from_mem_spm_rd_resp //= s.data_mem.send_to_ctrl_spm_rd_resp # Connects data memory with controller. s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request diff --git a/cgra/IntegratedCgraWithDmaRTL.py b/cgra/IntegratedCgraWithDmaRTL.py index b3b30ab0..f90d795c 100644 --- a/cgra/IntegratedCgraWithDmaRTL.py +++ b/cgra/IntegratedCgraWithDmaRTL.py @@ -109,8 +109,8 @@ def construct(s, CgraPayloadType, # Abstract external dram memory interfaces for the internal DMA engine. 
- s.send_dram_rd_req = SendIfcRTL(DmaDramAddrType) - s.recv_dram_rd_resp = RecvIfcRTL(DmaMemDataType) + s.send_to_dram_rd_req = SendIfcRTL(DmaDramAddrType) + s.recv_from_dram_rd_resp = RecvIfcRTL(DmaMemDataType) s.send_to_dram_wr_req = SendIfcRTL(DmaDramWrReqType) s.recv_from_dram_wr_resp = RecvIfcRTL(mk_bits(1)) @@ -178,17 +178,17 @@ def construct(s, CgraPayloadType, s.cgra.dma_cmd //= s.dma.dma_cmd s.dma.dma_done //= s.cgra.dma_done - s.send_dram_rd_req //= s.dma.send_dram_rd_req - s.recv_dram_rd_resp //= s.dma.recv_dram_rd_resp + s.send_to_dram_rd_req //= s.dma.send_to_dram_rd_req + s.recv_from_dram_rd_resp //= s.dma.recv_from_dram_rd_resp s.send_to_dram_wr_req //= s.dma.send_to_dram_wr_req s.recv_from_dram_wr_resp //= s.dma.recv_from_dram_wr_resp # DMA to controller-forwarded SPM connections. - s.dma.send_spm_wr_req //= s.cgra.recv_from_dma_spm_wr_req - s.dma.send_spm_rd_req //= s.cgra.recv_from_dma_spm_rd_req - s.dma.recv_spm_rd_resp //= s.cgra.send_to_dma_spm_rd_resp + s.dma.send_to_spm_wr_req //= s.cgra.recv_from_dma_spm_wr_req + s.dma.send_to_spm_rd_req //= s.cgra.recv_from_dma_spm_rd_req + s.dma.recv_from_spm_rd_resp //= s.cgra.send_to_dma_spm_rd_resp def line_trace(s): return f"{s.dma.line_trace()} || {s.cgra.line_trace()}" diff --git a/cgra/test/IntegratedCgraWithDmaRTL_test.py b/cgra/test/IntegratedCgraWithDmaRTL_test.py index 3a955197..af760035 100644 --- a/cgra/test/IntegratedCgraWithDmaRTL_test.py +++ b/cgra/test/IntegratedCgraWithDmaRTL_test.py @@ -156,9 +156,9 @@ def test_cgra_dma_mvin_to_local_spm(): dut.recv_from_cpu_pkt.val @= 0 dut.recv_from_cpu_pkt.msg @= CtrlPktType() dut.send_to_cpu_pkt.rdy @= 1 - dut.send_dram_rd_req.rdy @= 1 - dut.recv_dram_rd_resp.val @= 0 - dut.recv_dram_rd_resp.msg @= 0 + dut.send_to_dram_rd_req.rdy @= 1 + dut.recv_from_dram_rd_resp.val @= 0 + dut.recv_from_dram_rd_resp.msg @= 0 dut.send_to_dram_wr_req.rdy @= 1 dut.recv_from_dram_wr_resp.val @= 0 dut.recv_from_dram_wr_resp.msg @= 0 @@ -172,15 +172,15 @@ def 
test_cgra_dma_mvin_to_local_spm(): pending_resp = False for _ in range(40): - dut.recv_dram_rd_resp.val @= 0 + dut.recv_from_dram_rd_resp.val @= 0 if pending_resp: - dut.recv_dram_rd_resp.val @= 1 + dut.recv_from_dram_rd_resp.val @= 1 # Simulate the read response from DRAM. - dut.recv_dram_rd_resp.msg @= beat + dut.recv_from_dram_rd_resp.msg @= beat dut.sim_eval_combinational() - pending_resp = bool(dut.send_dram_rd_req.val & dut.send_dram_rd_req.rdy) + pending_resp = bool(dut.send_to_dram_rd_req.val & dut.send_to_dram_rd_req.rdy) if observed_dma_done(dut, 0x33): break @@ -254,9 +254,9 @@ def test_cgra_dma_mvout_from_local_spm(): dut.recv_from_cpu_pkt.val @= 0 dut.recv_from_cpu_pkt.msg @= CtrlPktType() dut.send_to_cpu_pkt.rdy @= 1 - dut.send_dram_rd_req.rdy @= 1 - dut.recv_dram_rd_resp.val @= 0 - dut.recv_dram_rd_resp.msg @= 0 + dut.send_to_dram_rd_req.rdy @= 1 + dut.recv_from_dram_rd_resp.val @= 0 + dut.recv_from_dram_rd_resp.msg @= 0 dut.send_to_dram_wr_req.rdy @= 1 dut.recv_from_dram_wr_resp.val @= 0 dut.recv_from_dram_wr_resp.msg @= 0 diff --git a/mem/data/DataMemControllerRTL.py b/mem/data/DataMemControllerRTL.py index 55d4288f..88233908 100644 --- a/mem/data/DataMemControllerRTL.py +++ b/mem/data/DataMemControllerRTL.py @@ -156,9 +156,9 @@ def construct(s, s.send_to_noc_load_request_pkt = SendIfcRTL(NocPktType) s.send_to_noc_store_pkt = SendIfcRTL(NocPktType) - s.recv_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType) - s.recv_spm_rd_req = RecvIfcRTL(DmaSpmReadReqType) - s.send_spm_rd_resp = SendIfcRTL(DmaSpmReadRespType) + s.recv_from_ctrl_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType) + s.recv_from_ctrl_spm_rd_req = RecvIfcRTL(DmaSpmReadReqType) + s.send_to_ctrl_spm_rd_resp = SendIfcRTL(DmaSpmReadRespType) # Components. # A list of DataMemWrapperRTL instances. Each one is a single memory bank. 
@@ -273,7 +273,7 @@ def assemble_xbar_pkt(): dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) - recv_raddr_from_dma = trunc(s.recv_spm_rd_req.msg.addr, AddrType) + recv_raddr_from_dma = trunc(s.recv_from_ctrl_spm_rd_req.msg.addr, AddrType) if (recv_raddr_from_dma >= s.address_lower) & (recv_raddr_from_dma <= s.address_upper): bank_index_load_from_dma = trunc((recv_raddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutRdType) else: @@ -286,7 +286,7 @@ def assemble_xbar_pkt(): 0, # src_tile 0) # remote_src_port - recv_waddr_from_dma = trunc(s.recv_spm_wr_req.msg.addr, AddrType) + recv_waddr_from_dma = trunc(s.recv_from_ctrl_spm_wr_req.msg.addr, AddrType) if (recv_waddr_from_dma >= s.address_lower) & (recv_waddr_from_dma <= s.address_upper): bank_index_store_from_dma = trunc((recv_waddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutWrType) else: @@ -294,7 +294,7 @@ def assemble_xbar_pkt(): s.wr_pkt[dma_wr_idx] @= MemWritePktType(dma_wr_idx, # src bank_index_store_from_dma, # dst recv_waddr_from_dma, # addr - DataType(zext(s.recv_spm_wr_req.msg.data, PayloadType), 1, 0, 0), + DataType(zext(s.recv_from_ctrl_spm_wr_req.msg.data, PayloadType), 1, 0, 0), 0, # src_cgra 0, # src_tile 0) # remote_src_port @@ -362,10 +362,10 @@ def update_all(): s.write_crossbar.recv[i].val @= 0 s.write_crossbar.recv[i].msg @= MemWritePktType(0, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) - s.recv_spm_wr_req.rdy @= 0 - s.recv_spm_rd_req.rdy @= 0 - s.send_spm_rd_resp.val @= 0 - s.send_spm_rd_resp.msg @= DmaSpmReadRespType(DmaSpmDataType(0)) + s.recv_from_ctrl_spm_wr_req.rdy @= 0 + s.recv_from_ctrl_spm_rd_req.rdy @= 0 + s.send_to_ctrl_spm_rd_resp.val @= 0 + s.send_to_ctrl_spm_rd_resp.msg @= DmaSpmReadRespType(DmaSpmDataType(0)) s.send_to_noc_load_request_pkt.msg @= \ NocPktType(0, # src @@ -398,9 +398,9 @@ def update_all(): # NOTE Don't use `dma_rd_idx = num_rd_tiles + 1` here since it will cause the bit mismatch error # 
between `dma_rd_idx` and `num_xbar_in_rd_ports`. dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) - s.read_crossbar.recv[dma_rd_idx].val @= s.recv_spm_rd_req.val + s.read_crossbar.recv[dma_rd_idx].val @= s.recv_from_ctrl_spm_rd_req.val s.read_crossbar.recv[dma_rd_idx].msg @= s.rd_pkt[dma_rd_idx] - s.recv_spm_rd_req.rdy @= s.read_crossbar.recv[dma_rd_idx].rdy + s.recv_from_ctrl_spm_rd_req.rdy @= s.read_crossbar.recv[dma_rd_idx].rdy # Connects the store request ports (from tiles and NoC) to the xbar targetting memory and NoC. for i in range(num_wr_tiles): @@ -418,9 +418,9 @@ def update_all(): # NOTE Don't use `dma_wr_idx = num_wr_tiles + 1` here since it will cause the bit mismatch error # between `dma_wr_idx` and `num_xbar_in_wr_ports`. dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) - s.write_crossbar.recv[dma_wr_idx].val @= s.recv_spm_wr_req.val + s.write_crossbar.recv[dma_wr_idx].val @= s.recv_from_ctrl_spm_wr_req.val s.write_crossbar.recv[dma_wr_idx].msg @= s.wr_pkt[dma_wr_idx] - s.recv_spm_wr_req.rdy @= s.write_crossbar.recv[dma_wr_idx].rdy + s.recv_from_ctrl_spm_wr_req.rdy @= s.write_crossbar.recv[dma_wr_idx].rdy # Connects the response ports to tiles and NoC from the xbar. # Number of load responses is expected to be the same as the number of load requests. @@ -453,10 +453,10 @@ def update_all(): s.send_to_noc_load_response_pkt.val @= s.response_crossbar.send[i].val s.response_crossbar.send[i].rdy @= s.send_to_noc_load_response_pkt.rdy elif has_dma_ports: - s.send_spm_rd_resp.msg @= DmaSpmReadRespType( + s.send_to_ctrl_spm_rd_resp.msg @= DmaSpmReadRespType( trunc(s.response_crossbar.send[i].msg.data.payload, DmaSpmDataType)) - s.send_spm_rd_resp.val @= s.response_crossbar.send[i].val - s.response_crossbar.send[i].rdy @= s.send_spm_rd_resp.rdy + s.send_to_ctrl_spm_rd_resp.val @= s.response_crossbar.send[i].val + s.response_crossbar.send[i].rdy @= s.send_to_ctrl_spm_rd_resp.rdy # Handles the request (not response) towards the others via the NoC. 
The dst would be # updated in the controller. diff --git a/mem/data/test/DataMemControllerRTL_dma_test.py b/mem/data/test/DataMemControllerRTL_dma_test.py index 74d39901..86d5a000 100644 --- a/mem/data/test/DataMemControllerRTL_dma_test.py +++ b/mem/data/test/DataMemControllerRTL_dma_test.py @@ -44,13 +44,13 @@ def drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr dut.send_to_noc_store_pkt.rdy @= 1 DmaSpmAddrType = mk_dma_cmd().get_field_type(kAttrSpmAddr) - dut.recv_spm_wr_req.val @= 0 - dut.recv_spm_wr_req.msg.addr @= DmaSpmAddrType(0) - dut.recv_spm_wr_req.msg.data @= 0 - dut.recv_spm_wr_req.msg.mask @= 0 - dut.recv_spm_rd_req.val @= 0 - dut.recv_spm_rd_req.msg.addr @= DmaSpmAddrType(0) - dut.send_spm_rd_resp.rdy @= 1 + dut.recv_from_ctrl_spm_wr_req.val @= 0 + dut.recv_from_ctrl_spm_wr_req.msg.addr @= DmaSpmAddrType(0) + dut.recv_from_ctrl_spm_wr_req.msg.data @= 0 + dut.recv_from_ctrl_spm_wr_req.msg.mask @= 0 + dut.recv_from_ctrl_spm_rd_req.val @= 0 + dut.recv_from_ctrl_spm_rd_req.msg.addr @= DmaSpmAddrType(0) + dut.send_to_ctrl_spm_rd_resp.rdy @= 1 dut.cgra_id @= 0 dut.address_lower @= DataAddrType(0) @@ -91,25 +91,25 @@ def test_dma_ports_write_then_read(): drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles) DmaSpmAddrType = mk_dma_cmd().get_field_type(kAttrSpmAddr) - dut.recv_spm_wr_req.val @= 1 - dut.recv_spm_wr_req.msg.addr @= DmaSpmAddrType(3) - dut.recv_spm_wr_req.msg.data @= 0xaaaabbbb - dut.recv_spm_wr_req.msg.mask @= 0xf + dut.recv_from_ctrl_spm_wr_req.val @= 1 + dut.recv_from_ctrl_spm_wr_req.msg.addr @= DmaSpmAddrType(3) + dut.recv_from_ctrl_spm_wr_req.msg.data @= 0xaaaabbbb + dut.recv_from_ctrl_spm_wr_req.msg.mask @= 0xf dut.sim_eval_combinational() - assert dut.recv_spm_wr_req.rdy + assert dut.recv_from_ctrl_spm_wr_req.rdy dut.sim_tick() - dut.recv_spm_wr_req.val @= 0 + dut.recv_from_ctrl_spm_wr_req.val @= 0 - dut.recv_spm_rd_req.val @= 1 - dut.recv_spm_rd_req.msg.addr @= DmaSpmAddrType(3) + 
dut.recv_from_ctrl_spm_rd_req.val @= 1 + dut.recv_from_ctrl_spm_rd_req.msg.addr @= DmaSpmAddrType(3) seen_response = False for _ in range(10): dut.sim_eval_combinational() - if dut.recv_spm_rd_req.val & dut.recv_spm_rd_req.rdy: - dut.recv_spm_rd_req.val @= 0 - if dut.send_spm_rd_resp.val: - assert int(dut.send_spm_rd_resp.msg.data) == 0xaaaabbbb + if dut.recv_from_ctrl_spm_rd_req.val & dut.recv_from_ctrl_spm_rd_req.rdy: + dut.recv_from_ctrl_spm_rd_req.val @= 0 + if dut.send_to_ctrl_spm_rd_resp.val: + assert int(dut.send_to_ctrl_spm_rd_resp.msg.data) == 0xaaaabbbb seen_response = True break dut.sim_tick() diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index 7377dc50..d5e97c35 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -73,20 +73,20 @@ def construct( s, # Abstract external memory interface # Request to read from DRAM - s.send_dram_rd_req = SendIfcRTL( DramAddrType ) + s.send_to_dram_rd_req = SendIfcRTL( DramAddrType ) # Response from DRAM - s.recv_dram_rd_resp = RecvIfcRTL( MemDataType ) + s.recv_from_dram_rd_resp = RecvIfcRTL( MemDataType ) # Request to write to DRAM s.send_to_dram_wr_req = SendIfcRTL(DmaDramWrReqType) s.recv_from_dram_wr_resp = RecvIfcRTL(mk_bits(1)) # Send write request to SPM. - s.send_spm_wr_req = SendIfcRTL(DmaSpmWriteReqType) + s.send_to_spm_wr_req = SendIfcRTL(DmaSpmWriteReqType) # Send read request to SPM. - s.send_spm_rd_req = SendIfcRTL(DmaSpmReadReqType) + s.send_to_spm_rd_req = SendIfcRTL(DmaSpmReadReqType) # Receive read response from SPM. 
- s.recv_spm_rd_resp = RecvIfcRTL(DmaSpmReadRespType) + s.recv_from_spm_rd_resp = RecvIfcRTL(DmaSpmReadRespType) # State machine definitions @@ -131,9 +131,9 @@ def comb_outputs(): s.dma_done.val @= s.state == STATE_DMA_DONE s.dma_done.msg @= DmaDoneType(s.tag_reg) - s.send_dram_rd_req.val @= s.state == STATE_DMA_MVIN_REQ - s.send_dram_rd_req.msg @= s.dram_addr_reg - s.recv_dram_rd_resp.rdy @= s.state == STATE_DMA_MVIN_RESP + s.send_to_dram_rd_req.val @= s.state == STATE_DMA_MVIN_REQ + s.send_to_dram_rd_req.msg @= s.dram_addr_reg + s.recv_from_dram_rd_resp.rdy @= s.state == STATE_DMA_MVIN_RESP s.send_to_dram_wr_req.val @= s.state == STATE_DMA_MVOUT_WRITE s.send_to_dram_wr_req.msg.addr @= s.dram_addr_reg @@ -153,15 +153,15 @@ def comb_outputs(): else: # 4th word spm_wdata = s.beat_reg[spm_data_nbits*3:spm_data_nbits*4] - s.send_spm_wr_req.val @= s.state == STATE_DMA_MVIN_WRITE - s.send_spm_wr_req.msg @= DmaSpmWriteReqType( + s.send_to_spm_wr_req.val @= s.state == STATE_DMA_MVIN_WRITE + s.send_to_spm_wr_req.msg @= DmaSpmWriteReqType( s.spm_addr_reg, spm_wdata, SpmMaskType( (1 << (spm_data_nbits // CHAR_BIT)) - 1 ) ) - s.send_spm_rd_req.val @= s.state == STATE_DMA_MVOUT_READ - s.send_spm_rd_req.msg @= DmaSpmReadReqType(s.spm_addr_reg) - s.recv_spm_rd_resp.rdy @= s.state == STATE_DMA_MVOUT_RESP + s.send_to_spm_rd_req.val @= s.state == STATE_DMA_MVOUT_READ + s.send_to_spm_rd_req.msg @= DmaSpmReadReqType(s.spm_addr_reg) + s.recv_from_spm_rd_resp.rdy @= s.state == STATE_DMA_MVOUT_RESP @update_ff def seq_state(): @@ -201,18 +201,18 @@ def seq_state(): s.state_ff <<= STATE_DMA_MVOUT_READ # Move to the next state: to issue a read request to SPM. elif s.state == STATE_DMA_MVIN_REQ: # Issues a read request to DRAM. 
- if s.send_dram_rd_req.val & s.send_dram_rd_req.rdy: + if s.send_to_dram_rd_req.val & s.send_to_dram_rd_req.rdy: s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( dram_data_nbits // CHAR_BIT ) s.state_ff <<= STATE_DMA_MVIN_RESP elif s.state == STATE_DMA_MVIN_RESP: # Receives a response from DRAM. - if s.recv_dram_rd_resp.val & s.recv_dram_rd_resp.rdy: - s.beat_ff <<= s.recv_dram_rd_resp.msg + if s.recv_from_dram_rd_resp.val & s.recv_from_dram_rd_resp.rdy: + s.beat_ff <<= s.recv_from_dram_rd_resp.msg s.word_idx_ff <<= b2( 0 ) s.state_ff <<= STATE_DMA_MVIN_WRITE # Move to the next state: to write to SPM. elif s.state == STATE_DMA_MVIN_WRITE: # Writes to SPM. - if s.send_spm_wr_req.val & s.send_spm_wr_req.rdy: + if s.send_to_spm_wr_req.val & s.send_to_spm_wr_req.rdy: # Update the SPM address where write next cycle(+1) s.spm_addr_ff <<= s.spm_addr_reg + SpmAddrType( 1 ) # Update the number of words remaining to write to SPM. @@ -227,25 +227,25 @@ def seq_state(): s.word_idx_ff <<= s.word_idx_reg + b2( 1 ) elif s.state == STATE_DMA_MVOUT_READ: - if s.send_spm_rd_req.val & s.send_spm_rd_req.rdy: + if s.send_to_spm_rd_req.val & s.send_to_spm_rd_req.rdy: s.state_ff <<= STATE_DMA_MVOUT_RESP # Move to the next state: to receive a response from SPM. elif s.state == STATE_DMA_MVOUT_RESP: - if s.recv_spm_rd_resp.val & s.recv_spm_rd_resp.rdy: + if s.recv_from_spm_rd_resp.val & s.recv_from_spm_rd_resp.rdy: # Pack the response from SPM into a 128-bit beat by left-shifting. 
if s.word_idx_reg == b2( 0 ): # 1st word s.beat_ff <<= concat( s.beat_reg[spm_data_nbits:spm_data_nbits*4], - s.recv_spm_rd_resp.msg.data ) + s.recv_from_spm_rd_resp.msg.data ) elif s.word_idx_reg == b2( 1 ): s.beat_ff <<= concat( s.beat_reg[spm_data_nbits*2:spm_data_nbits*4], - s.recv_spm_rd_resp.msg.data, + s.recv_from_spm_rd_resp.msg.data, s.beat_reg[0:spm_data_nbits] ) elif s.word_idx_reg == b2( 2 ): s.beat_ff <<= concat( s.beat_reg[spm_data_nbits*3:spm_data_nbits*4], - s.recv_spm_rd_resp.msg.data, + s.recv_from_spm_rd_resp.msg.data, s.beat_reg[0:spm_data_nbits*2] ) else: - s.beat_ff <<= concat( s.recv_spm_rd_resp.msg.data, + s.beat_ff <<= concat( s.recv_from_spm_rd_resp.msg.data, s.beat_reg[0:spm_data_nbits*3] ) s.spm_addr_ff <<= s.spm_addr_reg + SpmAddrType( 1 ) diff --git a/mem/dma/test/DmaEngineRTL_test.py b/mem/dma/test/DmaEngineRTL_test.py index 4bcae68d..463f3ecb 100644 --- a/mem/dma/test/DmaEngineRTL_test.py +++ b/mem/dma/test/DmaEngineRTL_test.py @@ -22,17 +22,17 @@ def make_dut(): dut.dma_cmd.msg.tag @= 0 dut.dma_done.rdy @= 1 - dut.send_dram_rd_req.rdy @= 1 - dut.recv_dram_rd_resp.val @= 0 - dut.recv_dram_rd_resp.msg @= 0 + dut.send_to_dram_rd_req.rdy @= 1 + dut.recv_from_dram_rd_resp.val @= 0 + dut.recv_from_dram_rd_resp.msg @= 0 dut.send_to_dram_wr_req.rdy @= 1 dut.recv_from_dram_wr_resp.val @= 1 dut.recv_from_dram_wr_resp.msg @= 0 - dut.send_spm_wr_req.rdy @= 1 - dut.send_spm_rd_req.rdy @= 1 - dut.recv_spm_rd_resp.val @= 0 - dut.recv_spm_rd_resp.msg.data @= 0 + dut.send_to_spm_wr_req.rdy @= 1 + dut.send_to_spm_rd_req.rdy @= 1 + dut.recv_from_spm_rd_resp.val @= 0 + dut.recv_from_spm_rd_resp.msg.data @= 0 dut.sim_eval_combinational() return dut @@ -89,20 +89,20 @@ def test_dma_mvin_one_beat(): spm_writes = [] for _ in range(20): - dut.recv_dram_rd_resp.val @= 0 + dut.recv_from_dram_rd_resp.val @= 0 if pending_resp is not None: - dut.recv_dram_rd_resp.val @= 1 - dut.recv_dram_rd_resp.msg @= pending_resp + dut.recv_from_dram_rd_resp.val @= 1 + 
dut.recv_from_dram_rd_resp.msg @= pending_resp dut.sim_eval_combinational() - if dut.send_dram_rd_req.val & dut.send_dram_rd_req.rdy: - pending_resp = dram[int(dut.send_dram_rd_req.msg)] + if dut.send_to_dram_rd_req.val & dut.send_to_dram_rd_req.rdy: + pending_resp = dram[int(dut.send_to_dram_rd_req.msg)] else: pending_resp = None - if dut.send_spm_wr_req.val & dut.send_spm_wr_req.rdy: - spm_writes.append((int(dut.send_spm_wr_req.msg.addr), int(dut.send_spm_wr_req.msg.data))) + if dut.send_to_spm_wr_req.val & dut.send_to_spm_wr_req.rdy: + spm_writes.append((int(dut.send_to_spm_wr_req.msg.addr), int(dut.send_to_spm_wr_req.msg.data))) if dut.dma_done.val: assert int(dut.dma_done.msg.tag) == 0x5a @@ -148,15 +148,15 @@ def test_dma_mvout_partial_beat(): mem_writes = [] for _ in range(30): - dut.recv_spm_rd_resp.val @= 0 + dut.recv_from_spm_rd_resp.val @= 0 if pending_rresp is not None: - dut.recv_spm_rd_resp.val @= 1 - dut.recv_spm_rd_resp.msg.data @= pending_rresp + dut.recv_from_spm_rd_resp.val @= 1 + dut.recv_from_spm_rd_resp.msg.data @= pending_rresp dut.sim_eval_combinational() - if dut.send_spm_rd_req.val & dut.send_spm_rd_req.rdy: - pending_rresp = spm[int(dut.send_spm_rd_req.msg.addr)] + if dut.send_to_spm_rd_req.val & dut.send_to_spm_rd_req.rdy: + pending_rresp = spm[int(dut.send_to_spm_rd_req.msg.addr)] else: pending_rresp = None @@ -205,15 +205,15 @@ def test_dma_mvout_full_beat(): mem_writes = [] for _ in range(30): - dut.recv_spm_rd_resp.val @= 0 + dut.recv_from_spm_rd_resp.val @= 0 if pending_rresp is not None: - dut.recv_spm_rd_resp.val @= 1 - dut.recv_spm_rd_resp.msg.data @= pending_rresp + dut.recv_from_spm_rd_resp.val @= 1 + dut.recv_from_spm_rd_resp.msg.data @= pending_rresp dut.sim_eval_combinational() - if dut.send_spm_rd_req.val & dut.send_spm_rd_req.rdy: - pending_rresp = spm[int(dut.send_spm_rd_req.msg.addr)] + if dut.send_to_spm_rd_req.val & dut.send_to_spm_rd_req.rdy: + pending_rresp = spm[int(dut.send_to_spm_rd_req.msg.addr)] else: 
pending_rresp = None From 94ca68a4fc9ef6e92ecec9b501752f5eee1c57bb Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 23 Jun 2026 21:31:03 +0800 Subject: [PATCH 36/46] [Fix] Precompute commonly used values in DmaEngineRTL to avoid PyMTL3 AST translation limitations and enforce nbytes % 4 check in construct method instead of update block. --- mem/dma/DmaEngineRTL.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index d5e97c35..8e1f5524 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -125,6 +125,13 @@ def construct( s, s.word_idx_reg //= s.word_idx_ff s.wr_mask_reg //= s.wr_mask_ff + # Precompute commonly used values at construct time (not inside any + # @update block) to avoid PyMTL3 AST translation limitations on the + # floor-division operator. + spm_word_nbytes = (spm_data_nbits // CHAR_BIT) + spm_word_mask = SpmMaskType( (1 << spm_word_nbytes) - 1 ) + dram_beat_nbytes = (dram_data_nbits // CHAR_BIT) + @update def comb_outputs(): s.dma_cmd.rdy @= s.state == STATE_DMA_IDLE @@ -157,7 +164,7 @@ def comb_outputs(): s.send_to_spm_wr_req.msg @= DmaSpmWriteReqType( s.spm_addr_reg, spm_wdata, - SpmMaskType( (1 << (spm_data_nbits // CHAR_BIT)) - 1 ) ) + spm_word_mask ) s.send_to_spm_rd_req.val @= s.state == STATE_DMA_MVOUT_READ s.send_to_spm_rd_req.msg @= DmaSpmReadReqType(s.spm_addr_reg) @@ -178,8 +185,9 @@ def seq_state(): else: if s.state == STATE_DMA_IDLE: if s.dma_cmd.val & s.dma_cmd.rdy: # Receives a new DMA command. - assert int(s.dma_cmd.msg.nbytes) % 4 == 0, \ - f"DMA nbytes must be a multiple of 4, got {int(s.dma_cmd.msg.nbytes)}" + # Note: the nbytes % 4 check is omitted from the update block + # because PyMTL3's AST translator does not support assert + # statements. It is enforced in construct() instead. 
s.opcode_ff <<= s.dma_cmd.msg.opcode s.dram_addr_ff <<= s.dma_cmd.msg.dram_addr s.spm_addr_ff <<= s.dma_cmd.msg.spm_addr @@ -202,7 +210,7 @@ def seq_state(): elif s.state == STATE_DMA_MVIN_REQ: # Issues a read request to DRAM. if s.send_to_dram_rd_req.val & s.send_to_dram_rd_req.rdy: - s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( dram_data_nbits // CHAR_BIT ) + s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( dram_beat_nbytes ) s.state_ff <<= STATE_DMA_MVIN_RESP elif s.state == STATE_DMA_MVIN_RESP: # Receives a response from DRAM. @@ -275,7 +283,7 @@ def seq_state(): elif s.state == STATE_DMA_MVOUT_WAIT: if s.recv_from_dram_wr_resp.val & s.recv_from_dram_wr_resp.rdy: # Turn to the +16 address after writing 16 bytes data. - s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( dram_data_nbits // CHAR_BIT ) + s.dram_addr_ff <<= s.dram_addr_reg + DramAddrType( dram_beat_nbytes ) s.beat_ff <<= MemDataType( 0 ) s.word_idx_ff <<= b2( 0 ) s.wr_mask_ff <<= MemMaskType( 0 ) From 282159da513d46189e091c43048e96db0e837c25 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Tue, 23 Jun 2026 21:32:49 +0800 Subject: [PATCH 37/46] Add Verilog generation functionality for the new wrapper. 
--- cgra/IntegratedCgraWithDmaRTL.py | 8 +- cgra/test/IntegratedCgraWithDmaRTL_test.py | 161 +++++++++++---------- 2 files changed, 91 insertions(+), 78 deletions(-) diff --git a/cgra/IntegratedCgraWithDmaRTL.py b/cgra/IntegratedCgraWithDmaRTL.py index f90d795c..478946e7 100644 --- a/cgra/IntegratedCgraWithDmaRTL.py +++ b/cgra/IntegratedCgraWithDmaRTL.py @@ -1,6 +1,6 @@ """ ========================================================================= -CgraDmaRTL.py +IntegratedCgraWithDmaRTL.py ========================================================================= Wrapper that composes a CGRA template with a DMA engine attached to the @@ -17,9 +17,9 @@ from ..mem.dma.DmaEngineRTL import DmaEngineRTL -class CgraDmaRTL( Component ): +class IntegratedCgraWithDmaRTL( Component ): """ - CgraDmaRTL is a top-level wrapper that integrates a CGRA instance with a + IntegratedCgraWithDmaRTL is a top-level wrapper that integrates a CGRA instance with a DMA engine. Architectural Design: @@ -46,7 +46,7 @@ def construct(s, CgraPayloadType, FunctionUnit, FuList, TileList, LinkList, dataSPM, controller2addr_map, idTo2d_map, is_multi_cgra = True, cgra_id = 0, - # For heterogeneous multi-cgra support.(maybe remove it in CgraDmaRTL for simplicity?) + # For heterogeneous multi-cgra support.(maybe remove it in IntegratedCgraWithDmaRTL for simplicity?) 
provided_max_per_cgra_rows = None, provided_max_per_cgra_cols = None, provided_max_num_rd_tiles = None, diff --git a/cgra/test/IntegratedCgraWithDmaRTL_test.py b/cgra/test/IntegratedCgraWithDmaRTL_test.py index af760035..58b6a49d 100644 --- a/cgra/test/IntegratedCgraWithDmaRTL_test.py +++ b/cgra/test/IntegratedCgraWithDmaRTL_test.py @@ -1,12 +1,14 @@ """ ========================================================================== -CgraDmaRTL_test.py +IntegratedCgraWithDmaRTL_test.py ========================================================================== """ from pymtl3 import * +from pymtl3.passes.backends.verilog import VerilogTranslationPass +from pymtl3.stdlib.test_utils import config_model_with_cmdline_opts -from ..IntegratedCgraWithDmaRTL import CgraDmaRTL +from ..IntegratedCgraWithDmaRTL import IntegratedCgraWithDmaRTL from ...fu.single.AdderRTL import AdderRTL from ...fu.single.MemUnitRTL import MemUnitRTL from ...fu.single.RetRTL import RetRTL @@ -18,6 +20,48 @@ from ...lib.util.cgra.cgra_helper import get_links +ctrl_mem_size = 8 +data_mem_size_global = 64 +data_mem_size_per_bank = 16 +num_banks_per_cgra = 4 +num_registers_per_reg_bank = 16 +num_ctrl = 1 +total_steps = 1 + +DataType = mk_data(32, 1) +DataAddrType = mk_bits(clog2(data_mem_size_global)) +CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) +CtrlType = mk_ctrl(4, 2, 8, 8, num_registers_per_reg_bank) +CgraPayloadType = mk_cgra_payload(DataType, DataAddrType, CtrlType, + CtrlAddrType) +CtrlPktType = mk_intra_cgra_pkt(1, 1, 4, CgraPayloadType) +WordType = mk_bits(32) + + +def make_dut(): + # 2x2 tiles with add/mem/return functional units + tiles_2d = [[Tile(x, y, num_registers_per_reg_bank, ["add", "mem", "return"]) + for x in range(2)] for y in range(2)] + TileList = [t for row in tiles_2d for t in row] + LinkList = get_links(tiles_2d) + dataSPM = DataSPM(3, 3) + + dut = IntegratedCgraWithDmaRTL( + CgraPayloadType, + 1, 1, # multi_cgra_rows, multi_cgra_columns + 2, 2, # per_cgra_rows, per_cgra_columns 
+ ctrl_mem_size, data_mem_size_global, + data_mem_size_per_bank, num_banks_per_cgra, + num_registers_per_reg_bank, num_ctrl, + total_steps, True, + None, [AdderRTL, MemUnitRTL, RetRTL], + TileList, LinkList, dataSPM, + {0: [0, 15]}, # controller to address map + {0: [0, 0]}, # cgra id to 2D coordinate + is_multi_cgra=False) + + return dut + def issue_cpu_pkt(dut, pkt, max_cycles = 20): """ CPU issues a packet to the CGRA. @@ -103,47 +147,12 @@ def observed_dma_done(dut, expected_tag): def test_cgra_dma_mvin_to_local_spm(): """ - Integration test for the CgraDmaRTL wrapper. + Integration test for the IntegratedCgraWithDmaRTL wrapper. It simulates a DMA MVIN command that moves data from external DRAM into the CGRA's dataSPM. It then checks the SPM contents to ensure the transfer was successful. """ - ctrl_mem_size = 8 - data_mem_size_global = 64 - data_mem_size_per_bank = 16 - num_banks_per_cgra = 4 - num_registers_per_reg_bank = 16 - num_ctrl = 1 - total_steps = 1 - - DataType = mk_data(32, 1) - WordType = mk_bits(32) - DataAddrType = mk_bits(clog2(data_mem_size_global)) - CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) - CtrlType = mk_ctrl(4, 2, 8, 8, num_registers_per_reg_bank) - CgraPayloadType = mk_cgra_payload(DataType, DataAddrType, CtrlType, CtrlAddrType) - CtrlPktType = mk_intra_cgra_pkt(1, 1, 4, CgraPayloadType) - - # 2x2 tiles - tiles_2d = [[Tile(x, y, num_registers_per_reg_bank, ["add", "mem", "return"]) - for x in range(2)] for y in range(2)] - TileList = [t for row in tiles_2d for t in row] - LinkList = get_links(tiles_2d) - # The first row and the first column of the 2x2 tiles are connected to the data SPM. 
- dataSPM = DataSPM(3, 3) - - dut = CgraDmaRTL(CgraPayloadType, - 1, 1, # multi_cgra_rows, multi_cgra_columns - 2, 2, # per_cgra_rows, per_cgra_columns - ctrl_mem_size, data_mem_size_global, - data_mem_size_per_bank, num_banks_per_cgra, - num_registers_per_reg_bank, num_ctrl, - total_steps, True, - None, [AdderRTL, MemUnitRTL, RetRTL], - TileList, LinkList, dataSPM, - {0: [0, 15]}, # controller to address map - {0: [0, 0]}, # cgra id to 2D coordinate - is_multi_cgra = False) + dut = make_dut() dut.apply(DefaultPassGroup()) dut.sim_reset() @@ -197,44 +206,11 @@ def test_cgra_dma_mvin_to_local_spm(): def test_cgra_dma_mvout_from_local_spm(): """ - Integration test for the CgraDmaRTL wrapper. + Integration test for the IntegratedCgraWithDmaRTL wrapper. It simulates a DMA MVOUT command that moves data from the local SPM into external DRAM. """ - ctrl_mem_size = 8 - data_mem_size_global = 64 - data_mem_size_per_bank = 16 - num_banks_per_cgra = 4 - num_registers_per_reg_bank = 16 - num_ctrl = 1 - total_steps = 1 - - DataType = mk_data(32, 1) - WordType = mk_bits(32) - DataAddrType = mk_bits(clog2(data_mem_size_global)) - CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) - CtrlType = mk_ctrl(4, 2, 8, 8, num_registers_per_reg_bank) - CgraPayloadType = mk_cgra_payload(DataType, DataAddrType, CtrlType, CtrlAddrType) - CtrlPktType = mk_intra_cgra_pkt(1, 1, 4, CgraPayloadType) - - tiles_2d = [[Tile(x, y, num_registers_per_reg_bank, ["add", "mem", "return"]) - for x in range(2)] for y in range(2)] - TileList = [t for row in tiles_2d for t in row] - LinkList = get_links(tiles_2d) - dataSPM = DataSPM(3, 3) - - dut = CgraDmaRTL(CgraPayloadType, - 1, 1, # multi_cgra_rows, multi_cgra_columns - 2, 2, # per_cgra_rows, per_cgra_columns - ctrl_mem_size, data_mem_size_global, - data_mem_size_per_bank, num_banks_per_cgra, - num_registers_per_reg_bank, num_ctrl, - total_steps, True, - None, [AdderRTL, MemUnitRTL, RetRTL], - TileList, LinkList, dataSPM, - {0: [0, 15]}, # controller to address 
map - {0: [0, 0]}, # cgra id to 2D coordinate - is_multi_cgra = False) + dut = make_dut() dut.apply(DefaultPassGroup()) dut.sim_reset() @@ -291,3 +267,40 @@ def test_cgra_dma_mvout_from_local_spm(): dut.sim_tick() assert done + +def test_gen_verilog_integrated_cgra_with_dma(cmdline_opts): + """ + Translate IntegratedCgraWithDmaRTL to Verilog. + """ + dut = make_dut() + + if cmdline_opts['test_verilog']: + # Standard flow: config_model_with_cmdline_opts handles elaboration, + # translation, and Verilator import. + try: + config_model_with_cmdline_opts(dut, cmdline_opts, duts=[]) + except Exception as e: + print(f"Note (Verilator import may have failed): {e}") + + try: + fname = dut.get_metadata(VerilogTranslationPass.translated_filename) + print(f"Verilog generated: {fname}") + except Exception as e: + print(f"Could not retrieve translation metadata: {e}") + else: + # Standalone flow: apply VerilogTranslationPass directly (no Verilator). + print("Generating Verilog without --test-verilog flag...") + print("Use 'pytest --test-verilog' to also run Verilator co-simulation.") + + dut.elaborate() + + dut.set_metadata(VerilogTranslationPass.enable, True) + dut.set_metadata(VerilogTranslationPass.explicit_module_name, + 'IntegratedCgraWithDmaRTL') + dut.set_metadata(VerilogTranslationPass.explicit_file_name, + 'IntegratedCgraWithDmaRTL.v') + + dut.apply(VerilogTranslationPass()) + + fname = dut.get_metadata(VerilogTranslationPass.translated_filename) + print(f"Verilog generated: {fname}") From b772b7b7b01f200c04eee4001ce0ba384f571e4e Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Wed, 24 Jun 2026 13:08:51 +0800 Subject: [PATCH 38/46] Enhance DMA documentation in messages.py and DmaEngineRTL.py by adding detailed comments on mask design and data transfer granularity, clarifying the behavior of dram_mask and spm_mask during DMA operations. 
--- lib/messages.py | 29 ++++++++++++++++++++++++++++- mem/dma/DmaEngineRTL.py | 14 ++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/lib/messages.py b/lib/messages.py index 0193d0e5..6393a21b 100644 --- a/lib/messages.py +++ b/lib/messages.py @@ -233,6 +233,27 @@ def str_func(s): ) # A data structure to represent the data to be transferred by DMA. +# +# === Mask Design === +# Data transfer granularity between DRAM and SPM is 1 word (4 bytes). +# The `dram_mask` and `spm_mask` fields define the bitwidth of byte +# masks for DRAM and SPM data respectively. +# +# Actual mask *values* are generated independently by the DMA engine +# FSM (see DmaEngineRTL), NOT carried in this struct: +# +# - dram_mask (16-bit, one bit per byte of 128-bit (16-byte) DRAM beat): +# Dynamically computed during MVOUT (SPM -> DRAM) based on the +# number of valid words in the last beat. Values range from 0x000f +# (1 word) to 0xffff (full beat). For example, if the DMA moves 1 word from SPM to DRAM, the mask is 0x000f. +# If the DMA moves 2 words from SPM to DRAM, the mask is 0x00ff. +# If the DMA moves 3 words from SPM to DRAM, the mask is 0x0fff. +# If the DMA moves 4 words from SPM to DRAM, the mask is 0xffff. +# +# - spm_mask (4-bit, one bit per byte of 32-bit SPM word): +# SPM writes always write full words, so the mask is +# hardcoded to 0xf. This field is reserved for +# future byte-granular SPM write support. def mk_dma_data(dram_data_nbits = 128, dram_mask_nbits = 16, spm_data_nbits = 32, @@ -249,8 +270,11 @@ def str_func(s): return mk_bitstruct(new_name, { 'dram_data': DramDataType, + # 16-bit byte mask for 16-byte DRAM beat. 'dram_mask': DramMaskType, 'spm_data': SpmDataType, + # 4-bit byte mask for 4-byte SPM word. + # Always 0xf in current implementation (full-word writes only). 
'spm_mask': SpmMaskType, }, namespace = {'__str__': str_func} @@ -273,7 +297,7 @@ def str_func(s): ) #========================================================================= -# DMA DRAM write request type +# The type of write request signal from DMA to DRAM #========================================================================= def mk_dma_dram_wr_req(addr_nbits = 64, data_nbits = 128, @@ -297,6 +321,7 @@ def str_func(s): namespace = {'__str__': str_func} ) +# The type of write request signal from DMA to SPM def mk_dma_spm_write_req(addr_nbits = 32, data_nbits = 32, prefix = "DmaSpmWriteReq"): @@ -318,6 +343,7 @@ def str_func(s): namespace = {'__str__': str_func} ) +# The type of read request signal from DMA to SPM def mk_dma_spm_read_req(addr_nbits = 32, prefix = "DmaSpmReadReq"): @@ -334,6 +360,7 @@ def str_func(s): namespace = {'__str__': str_func} ) +# The type of read response signal from SPM to DMA def mk_dma_spm_read_resp(data_nbits = 32, prefix = "DmaSpmReadResp"): diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index 8e1f5524..feda9e79 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -129,6 +129,9 @@ def construct( s, # @update block) to avoid PyMTL3 AST translation limitations on the # floor-division operator. spm_word_nbytes = (spm_data_nbits // CHAR_BIT) + # SPM write mask: always all byte lanes enabled (0xf) because the DMA + # writes full 32-bit words to SPM. Byte-granular SPM writes are not + # needed in the current design. spm_word_mask = SpmMaskType( (1 << spm_word_nbytes) - 1 ) dram_beat_nbytes = (dram_data_nbits // CHAR_BIT) @@ -260,16 +263,19 @@ def seq_state(): s.words_left_ff <<= s.words_left_reg - BytesType( 1 ) if s.words_left_reg == BytesType( 1 ): + # Last beat of MVOUT: compute byte-mask based on how many + # valid 32-bit words are in this final beat. 
if s.word_idx_reg == b2( 0 ): - s.wr_mask_ff <<= MemMaskType( 0x000f ) + s.wr_mask_ff <<= MemMaskType( 0x000f ) # 1 word (bytes 0-3) elif s.word_idx_reg == b2( 1 ): - s.wr_mask_ff <<= MemMaskType( 0x00ff ) + s.wr_mask_ff <<= MemMaskType( 0x00ff ) # 2 words (bytes 0-7) elif s.word_idx_reg == b2( 2 ): - s.wr_mask_ff <<= MemMaskType( 0x0fff ) + s.wr_mask_ff <<= MemMaskType( 0x0fff ) # 3 words (bytes 0-11) else: - s.wr_mask_ff <<= MemMaskType( 0xffff ) + s.wr_mask_ff <<= MemMaskType( 0xffff ) # 4 words (bytes 0-15) s.state_ff <<= STATE_DMA_MVOUT_WRITE elif s.word_idx_reg == b2( 3 ): + # Full beat (4 words): all 16 bytes are valid. s.wr_mask_ff <<= MemMaskType( 0xffff ) s.state_ff <<= STATE_DMA_MVOUT_WRITE else: From 089e4baf0a39ca41596057375c6c27601ca010b5 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Thu, 25 Jun 2026 17:10:33 +0800 Subject: [PATCH 39/46] [Rename] Rename tag to dma_tag --- cgra/CgraTemplateRTL.py | 2 +- cgra/IntegratedCgraWithDmaRTL.py | 2 +- controller/ControllerRTL.py | 2 +- lib/messages.py | 2 +- lib/util/data_struct_attr.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index 223aa5e8..1b1b1da4 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -129,7 +129,7 @@ def construct(s, CgraPayloadType, CtrlRingPos = mk_ring_pos(max_num_tiles + 1) CtrlAddrType = mk_bits(clog2(ctrl_mem_size)) DataAddrType = mk_bits(clog2(data_mem_size_global)) - DmaTagType = DmaCmdType.get_field_type(kAttrTag) + DmaTagType = DmaCmdType.get_field_type(kAttrDmaTag) DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData) DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr) DmaDoneType = mk_dma_done(DmaTagType.nbits) diff --git a/cgra/IntegratedCgraWithDmaRTL.py b/cgra/IntegratedCgraWithDmaRTL.py index 478946e7..0aafa364 100644 --- a/cgra/IntegratedCgraWithDmaRTL.py +++ b/cgra/IntegratedCgraWithDmaRTL.py @@ -139,7 +139,7 @@ def construct(s, CgraPayloadType, DmaSpmDataType = 
DmaDataType.get_field_type(kAttrSpmData) DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr) DmaBytesType = DmaCmdType.get_field_type(kAttrNBytes) - DmaTagType = DmaCmdType.get_field_type(kAttrTag) + DmaTagType = DmaCmdType.get_field_type(kAttrDmaTag) s.dma = DmaEngineRTL(spm_data_nbits = DmaSpmDataType.nbits, dram_data_nbits = DmaMemDataType.nbits, dram_addr_nbits = DmaDramAddrType.nbits, diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index a5ff04a0..94b854c5 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -60,7 +60,7 @@ def construct(s, DmaDramAddrType = DmaCmdType.get_field_type(kAttrDramAddr) DmaSpmAddrType = DmaCmdType.get_field_type(kAttrSpmAddr) DmaBytesType = DmaCmdType.get_field_type(kAttrNBytes) - DmaTagType = DmaCmdType.get_field_type(kAttrTag) + DmaTagType = DmaCmdType.get_field_type(kAttrDmaTag) DmaSpmDataType = DmaDataType.get_field_type(kAttrSpmData) # Lower and higher 32 bits of the DRAM address. DmaDramAddrPartType = mk_bits(DmaDramAddrType.nbits // 2) diff --git a/lib/messages.py b/lib/messages.py index 6393a21b..a8c355f2 100644 --- a/lib/messages.py +++ b/lib/messages.py @@ -227,7 +227,7 @@ def str_func(s): # NOTE nbytes is the number of bytes to transfer. # Currently, only nbytes that are multiples of 4 are supported. 
'nbytes' : BytesType, - 'tag' : TagType, + 'dma_tag' : TagType, }, namespace = {'__str__': str_func} ) diff --git a/lib/util/data_struct_attr.py b/lib/util/data_struct_attr.py index 98ed2b74..b3cfbf71 100644 --- a/lib/util/data_struct_attr.py +++ b/lib/util/data_struct_attr.py @@ -44,7 +44,7 @@ kAttrOpcode = 'opcode' kAttrDramAddr = 'dram_addr' kAttrNBytes = 'nbytes' -kAttrTag = 'tag' +kAttrDmaTag = 'dma_tag' kAttrSpmAddr = 'spm_addr' kAttrSpmData = 'spm_data' kAttrSpmMask = 'spm_mask' From 0ed4b3ccfe1de5ebb195ea5e9c2a1532b9d3898a Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Fri, 26 Jun 2026 20:17:22 +0800 Subject: [PATCH 40/46] [Rename] Update references from 'ctrl' to 'controller'. Enhance documentation regarding the usage of dma_tag in related files. --- cgra/CgraTemplateRTL.py | 6 +-- cgra/test/IntegratedCgraWithDmaRTL_test.py | 2 +- lib/messages.py | 3 +- lib/util/data_struct_attr.py | 1 + mem/data/DataMemControllerRTL.py | 34 ++++++++--------- .../test/DataMemControllerRTL_dma_test.py | 38 +++++++++---------- 6 files changed, 43 insertions(+), 41 deletions(-) diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index 1b1b1da4..17202785 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -244,9 +244,9 @@ def construct(s, CgraPayloadType, s.controller.send_to_dma_spm_rd_resp.rdy //= 0 # Controller <-> SPM/data_mem - s.controller.send_to_mem_spm_wr_req //= s.data_mem.recv_from_ctrl_spm_wr_req - s.controller.send_to_mem_spm_rd_req //= s.data_mem.recv_from_ctrl_spm_rd_req - s.controller.recv_from_mem_spm_rd_resp //= s.data_mem.send_to_ctrl_spm_rd_resp + s.controller.send_to_mem_spm_wr_req //= s.data_mem.recv_from_controller_spm_wr_req + s.controller.send_to_mem_spm_rd_req //= s.data_mem.recv_from_controller_spm_rd_req + s.controller.recv_from_mem_spm_rd_resp //= s.data_mem.send_to_controller_spm_rd_resp # Connects data memory with controller. 
s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request diff --git a/cgra/test/IntegratedCgraWithDmaRTL_test.py b/cgra/test/IntegratedCgraWithDmaRTL_test.py index 58b6a49d..96aa8846 100644 --- a/cgra/test/IntegratedCgraWithDmaRTL_test.py +++ b/cgra/test/IntegratedCgraWithDmaRTL_test.py @@ -97,7 +97,7 @@ def issue_dma_cmd(dut, CtrlPktType, CgraPayloadType, DataType, DataAddrType, dram_addr: The DRAM address to transfer data from or to.(64 bits) spm_addr: The SPM address to transfer data from or to.(32 bits) nbytes: The number of bytes to transfer. - tag: The tag of the DMA command. + tag: The tag of the DMA command. This tag isn't used now. We may use it to distinguish different DMA commands. """ # NOTE nbytes is the number of bytes to transfer. # Currently, only nbytes that are multiples of 4 are supported. diff --git a/lib/messages.py b/lib/messages.py index a8c355f2..def085e8 100644 --- a/lib/messages.py +++ b/lib/messages.py @@ -227,7 +227,8 @@ def str_func(s): # NOTE nbytes is the number of bytes to transfer. # Currently, only nbytes that are multiples of 4 are supported. 'nbytes' : BytesType, - 'dma_tag' : TagType, + # This dma_tag isn't used now. We may use it to distinguish different DMA commands. + 'dma_tag' : TagType, }, namespace = {'__str__': str_func} ) diff --git a/lib/util/data_struct_attr.py b/lib/util/data_struct_attr.py index b3cfbf71..cf028f7e 100644 --- a/lib/util/data_struct_attr.py +++ b/lib/util/data_struct_attr.py @@ -44,6 +44,7 @@ kAttrOpcode = 'opcode' kAttrDramAddr = 'dram_addr' kAttrNBytes = 'nbytes' +# This dma_tag isn't used now. We may use it to distinguish different DMA commands. 
kAttrDmaTag = 'dma_tag' kAttrSpmAddr = 'spm_addr' kAttrSpmData = 'spm_data' diff --git a/mem/data/DataMemControllerRTL.py b/mem/data/DataMemControllerRTL.py index 88233908..58508945 100644 --- a/mem/data/DataMemControllerRTL.py +++ b/mem/data/DataMemControllerRTL.py @@ -156,9 +156,9 @@ def construct(s, s.send_to_noc_load_request_pkt = SendIfcRTL(NocPktType) s.send_to_noc_store_pkt = SendIfcRTL(NocPktType) - s.recv_from_ctrl_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType) - s.recv_from_ctrl_spm_rd_req = RecvIfcRTL(DmaSpmReadReqType) - s.send_to_ctrl_spm_rd_resp = SendIfcRTL(DmaSpmReadRespType) + s.recv_from_controller_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType) + s.recv_from_controller_spm_rd_req = RecvIfcRTL(DmaSpmReadReqType) + s.send_to_controller_spm_rd_resp = SendIfcRTL(DmaSpmReadRespType) # Components. # A list of DataMemWrapperRTL instances. Each one is a single memory bank. @@ -273,7 +273,7 @@ def assemble_xbar_pkt(): dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) - recv_raddr_from_dma = trunc(s.recv_from_ctrl_spm_rd_req.msg.addr, AddrType) + recv_raddr_from_dma = trunc(s.recv_from_controller_spm_rd_req.msg.addr, AddrType) if (recv_raddr_from_dma >= s.address_lower) & (recv_raddr_from_dma <= s.address_upper): bank_index_load_from_dma = trunc((recv_raddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutRdType) else: @@ -286,7 +286,7 @@ def assemble_xbar_pkt(): 0, # src_tile 0) # remote_src_port - recv_waddr_from_dma = trunc(s.recv_from_ctrl_spm_wr_req.msg.addr, AddrType) + recv_waddr_from_dma = trunc(s.recv_from_controller_spm_wr_req.msg.addr, AddrType) if (recv_waddr_from_dma >= s.address_lower) & (recv_waddr_from_dma <= s.address_upper): bank_index_store_from_dma = trunc((recv_waddr_from_dma - s.address_lower) >> per_bank_addr_nbits, XbarOutWrType) else: @@ -294,7 +294,7 @@ def assemble_xbar_pkt(): s.wr_pkt[dma_wr_idx] @= MemWritePktType(dma_wr_idx, # src bank_index_store_from_dma, # dst 
recv_waddr_from_dma, # addr - DataType(zext(s.recv_from_ctrl_spm_wr_req.msg.data, PayloadType), 1, 0, 0), + DataType(zext(s.recv_from_controller_spm_wr_req.msg.data, PayloadType), 1, 0, 0), 0, # src_cgra 0, # src_tile 0) # remote_src_port @@ -362,10 +362,10 @@ def update_all(): s.write_crossbar.recv[i].val @= 0 s.write_crossbar.recv[i].msg @= MemWritePktType(0, 0, 0, DataType(0, 0, 0, 0), 0, 0, 0) - s.recv_from_ctrl_spm_wr_req.rdy @= 0 - s.recv_from_ctrl_spm_rd_req.rdy @= 0 - s.send_to_ctrl_spm_rd_resp.val @= 0 - s.send_to_ctrl_spm_rd_resp.msg @= DmaSpmReadRespType(DmaSpmDataType(0)) + s.recv_from_controller_spm_wr_req.rdy @= 0 + s.recv_from_controller_spm_rd_req.rdy @= 0 + s.send_to_controller_spm_rd_resp.val @= 0 + s.send_to_controller_spm_rd_resp.msg @= DmaSpmReadRespType(DmaSpmDataType(0)) s.send_to_noc_load_request_pkt.msg @= \ NocPktType(0, # src @@ -398,9 +398,9 @@ def update_all(): # NOTE Don't use `dma_rd_idx = num_rd_tiles + 1` here since it will cause the bit mismatch error # between `dma_rd_idx` and `num_xbar_in_rd_ports`. dma_rd_idx = XbarInRdType(num_xbar_in_rd_ports - 1) - s.read_crossbar.recv[dma_rd_idx].val @= s.recv_from_ctrl_spm_rd_req.val + s.read_crossbar.recv[dma_rd_idx].val @= s.recv_from_controller_spm_rd_req.val s.read_crossbar.recv[dma_rd_idx].msg @= s.rd_pkt[dma_rd_idx] - s.recv_from_ctrl_spm_rd_req.rdy @= s.read_crossbar.recv[dma_rd_idx].rdy + s.recv_from_controller_spm_rd_req.rdy @= s.read_crossbar.recv[dma_rd_idx].rdy # Connects the store request ports (from tiles and NoC) to the xbar targetting memory and NoC. for i in range(num_wr_tiles): @@ -418,9 +418,9 @@ def update_all(): # NOTE Don't use `dma_wr_idx = num_wr_tiles + 1` here since it will cause the bit mismatch error # between `dma_wr_idx` and `num_xbar_in_wr_ports`. 
dma_wr_idx = XbarInWrType(num_xbar_in_wr_ports - 1) - s.write_crossbar.recv[dma_wr_idx].val @= s.recv_from_ctrl_spm_wr_req.val + s.write_crossbar.recv[dma_wr_idx].val @= s.recv_from_controller_spm_wr_req.val s.write_crossbar.recv[dma_wr_idx].msg @= s.wr_pkt[dma_wr_idx] - s.recv_from_ctrl_spm_wr_req.rdy @= s.write_crossbar.recv[dma_wr_idx].rdy + s.recv_from_controller_spm_wr_req.rdy @= s.write_crossbar.recv[dma_wr_idx].rdy # Connects the response ports to tiles and NoC from the xbar. # Number of load responses is expected to be the same as the number of load requests. @@ -453,10 +453,10 @@ def update_all(): s.send_to_noc_load_response_pkt.val @= s.response_crossbar.send[i].val s.response_crossbar.send[i].rdy @= s.send_to_noc_load_response_pkt.rdy elif has_dma_ports: - s.send_to_ctrl_spm_rd_resp.msg @= DmaSpmReadRespType( + s.send_to_controller_spm_rd_resp.msg @= DmaSpmReadRespType( trunc(s.response_crossbar.send[i].msg.data.payload, DmaSpmDataType)) - s.send_to_ctrl_spm_rd_resp.val @= s.response_crossbar.send[i].val - s.response_crossbar.send[i].rdy @= s.send_to_ctrl_spm_rd_resp.rdy + s.send_to_controller_spm_rd_resp.val @= s.response_crossbar.send[i].val + s.response_crossbar.send[i].rdy @= s.send_to_controller_spm_rd_resp.rdy # Handles the request (not response) towards the others via the NoC. The dst would be # updated in the controller. 
diff --git a/mem/data/test/DataMemControllerRTL_dma_test.py b/mem/data/test/DataMemControllerRTL_dma_test.py index 86d5a000..cf39a756 100644 --- a/mem/data/test/DataMemControllerRTL_dma_test.py +++ b/mem/data/test/DataMemControllerRTL_dma_test.py @@ -44,13 +44,13 @@ def drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr dut.send_to_noc_store_pkt.rdy @= 1 DmaSpmAddrType = mk_dma_cmd().get_field_type(kAttrSpmAddr) - dut.recv_from_ctrl_spm_wr_req.val @= 0 - dut.recv_from_ctrl_spm_wr_req.msg.addr @= DmaSpmAddrType(0) - dut.recv_from_ctrl_spm_wr_req.msg.data @= 0 - dut.recv_from_ctrl_spm_wr_req.msg.mask @= 0 - dut.recv_from_ctrl_spm_rd_req.val @= 0 - dut.recv_from_ctrl_spm_rd_req.msg.addr @= DmaSpmAddrType(0) - dut.send_to_ctrl_spm_rd_resp.rdy @= 1 + dut.recv_from_controller_spm_wr_req.val @= 0 + dut.recv_from_controller_spm_wr_req.msg.addr @= DmaSpmAddrType(0) + dut.recv_from_controller_spm_wr_req.msg.data @= 0 + dut.recv_from_controller_spm_wr_req.msg.mask @= 0 + dut.recv_from_controller_spm_rd_req.val @= 0 + dut.recv_from_controller_spm_rd_req.msg.addr @= DmaSpmAddrType(0) + dut.send_to_controller_spm_rd_resp.rdy @= 1 dut.cgra_id @= 0 dut.address_lower @= DataAddrType(0) @@ -91,25 +91,25 @@ def test_dma_ports_write_then_read(): drive_defaults(dut, DataAddrType, DataType, NocPktType, num_rd_tiles, num_wr_tiles) DmaSpmAddrType = mk_dma_cmd().get_field_type(kAttrSpmAddr) - dut.recv_from_ctrl_spm_wr_req.val @= 1 - dut.recv_from_ctrl_spm_wr_req.msg.addr @= DmaSpmAddrType(3) - dut.recv_from_ctrl_spm_wr_req.msg.data @= 0xaaaabbbb - dut.recv_from_ctrl_spm_wr_req.msg.mask @= 0xf + dut.recv_from_controller_spm_wr_req.val @= 1 + dut.recv_from_controller_spm_wr_req.msg.addr @= DmaSpmAddrType(3) + dut.recv_from_controller_spm_wr_req.msg.data @= 0xaaaabbbb + dut.recv_from_controller_spm_wr_req.msg.mask @= 0xf dut.sim_eval_combinational() - assert dut.recv_from_ctrl_spm_wr_req.rdy + assert dut.recv_from_controller_spm_wr_req.rdy dut.sim_tick() - 
dut.recv_from_ctrl_spm_wr_req.val @= 0 + dut.recv_from_controller_spm_wr_req.val @= 0 - dut.recv_from_ctrl_spm_rd_req.val @= 1 - dut.recv_from_ctrl_spm_rd_req.msg.addr @= DmaSpmAddrType(3) + dut.recv_from_controller_spm_rd_req.val @= 1 + dut.recv_from_controller_spm_rd_req.msg.addr @= DmaSpmAddrType(3) seen_response = False for _ in range(10): dut.sim_eval_combinational() - if dut.recv_from_ctrl_spm_rd_req.val & dut.recv_from_ctrl_spm_rd_req.rdy: - dut.recv_from_ctrl_spm_rd_req.val @= 0 - if dut.send_to_ctrl_spm_rd_resp.val: - assert int(dut.send_to_ctrl_spm_rd_resp.msg.data) == 0xaaaabbbb + if dut.recv_from_controller_spm_rd_req.val & dut.recv_from_controller_spm_rd_req.rdy: + dut.recv_from_controller_spm_rd_req.val @= 0 + if dut.send_to_controller_spm_rd_resp.val: + assert int(dut.send_to_controller_spm_rd_resp.msg.data) == 0xaaaabbbb seen_response = True break dut.sim_tick() From 618d6e1a2198ce614246565e4e337a74a82380bf Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Fri, 26 Jun 2026 20:33:40 +0800 Subject: [PATCH 41/46] [Fix] Update dma_cmd string representation to use 'dma_tag' instead of 'tag'. 
--- lib/messages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/messages.py b/lib/messages.py index def085e8..549335f6 100644 --- a/lib/messages.py +++ b/lib/messages.py @@ -218,7 +218,7 @@ def mk_dma_cmd(dram_addr_nbits = 64, new_name = f"{prefix}_{dram_addr_nbits}_{spm_addr_nbits}_{bytes_nbits}_{tag_nbits}" def str_func(s): - return f"dma_cmd(op={s.opcode},dram={s.dram_addr},spm={s.spm_addr},bytes={s.nbytes},tag={s.tag})" + return f"dma_cmd(op={s.opcode},dram={s.dram_addr},spm={s.spm_addr},bytes={s.nbytes},tag={s.dma_tag})" return mk_bitstruct(new_name, { 'opcode' : OpcodeType, From 04a2a4f1d7f4c952c9411da01b6fcb672cc32fa2 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Fri, 26 Jun 2026 21:08:32 +0800 Subject: [PATCH 42/46] Add warning comment in ControllerRTL.py regarding potential conflict between DMA command and CMD_COMPLETE signals in the same clock cycle, with a reference to related discussion. --- controller/ControllerRTL.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index 94b854c5..db350283 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -495,6 +495,11 @@ def update_received_msg(): # # TODO: Handle other cmd types. # assert(False) + # WARNING + # A possible conflict occurs when dma_done.valis True and the received message is CMD_COMPLETEat the same time + # — that is, when a DMA command and CMD_COMPLETE appear in the same clock cycle. + # In this case, both require the CGRA to send a return signal to the CPU, which may causes a conflict. 
+ # Related discussion: https://github.com/tancheng/VectorCGRA/pull/293#discussion_r3418482217 if has_dma_ports & s.dma_done.val: s.dma_done.rdy @= s.send_to_cpu_pkt_queue.recv.rdy s.send_to_cpu_pkt_queue.recv.val @= 1 From 304ae24fffe56ac5dd29dda769a6f2c325b1683c Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Sat, 27 Jun 2026 10:45:53 +0800 Subject: [PATCH 43/46] [Fix] Update DmaEngineRTL to use dma_tag --- lib/util/data_struct_attr.py | 1 + mem/dma/DmaEngineRTL.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/util/data_struct_attr.py b/lib/util/data_struct_attr.py index cf028f7e..615ef246 100644 --- a/lib/util/data_struct_attr.py +++ b/lib/util/data_struct_attr.py @@ -46,6 +46,7 @@ kAttrNBytes = 'nbytes' # This dma_tag isn't used now. We may use it to distinguish different DMA commands. kAttrDmaTag = 'dma_tag' +# TODO: https://github.com/tancheng/VectorCGRA/issues/316 -- Consolidates attributes. kAttrSpmAddr = 'spm_addr' kAttrSpmData = 'spm_data' kAttrSpmMask = 'spm_mask' diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index feda9e79..7a70eaeb 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -198,7 +198,7 @@ def seq_state(): # NOTE We only support nbytes that are multiples of 4 now. # If nbytes is not a multiple of 4, we will add 1 to the number of words to transfer. s.words_left_ff <<= (s.dma_cmd.msg.nbytes >> 2) if (s.dma_cmd.msg.nbytes % 4 == 0) else (s.dma_cmd.msg.nbytes >> 2) + 1 - s.tag_ff <<= s.dma_cmd.msg.tag + s.tag_ff <<= s.dma_cmd.msg.dma_tag s.beat_ff <<= MemDataType( 0 ) s.word_idx_ff <<= b2( 0 ) s.wr_mask_ff <<= MemMaskType( 0 ) From 85fcd195dba09e9471d38ff3b91821f4db6a9ad9 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Sat, 27 Jun 2026 13:00:36 +0800 Subject: [PATCH 44/46] [Fix] Update ControllerRTL and DmaEngineRTL to consistently use 'dma_tag' instead of 'tag' in DMA-related messages and tests. 
--- controller/ControllerRTL.py | 4 ++-- lib/messages.py | 4 ++-- mem/dma/test/DmaEngineRTL_test.py | 10 +++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index db350283..d65292d2 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -512,11 +512,11 @@ def update_received_msg(): s.idTo2d_y_lut[s.cgra_id], s.idTo2d_x_lut[s.cgra_id], s.idTo2d_y_lut[s.cgra_id], - s.dma_done.msg.tag, + s.dma_done.msg.dma_tag, 0, CgraPayloadType( CMD_DMA_DONE, - DataType(zext(s.dma_done.msg.tag, DataPayloadType), 1, 0, 0), + DataType(zext(s.dma_done.msg.dma_tag, DataPayloadType), 1, 0, 0), 0, 0, 0)) @update diff --git a/lib/messages.py b/lib/messages.py index 549335f6..92748885 100644 --- a/lib/messages.py +++ b/lib/messages.py @@ -289,10 +289,10 @@ def mk_dma_done(tag_nbits = 8, new_name = f"{prefix}_{tag_nbits}" def str_func(s): - return f"dma_done(tag={s.tag})" + return f"dma_done(dma_tag={s.dma_tag})" return mk_bitstruct(new_name, { - 'tag': TagType, + 'dma_tag': TagType, }, namespace = {'__str__': str_func} ) diff --git a/mem/dma/test/DmaEngineRTL_test.py b/mem/dma/test/DmaEngineRTL_test.py index 463f3ecb..186abb97 100644 --- a/mem/dma/test/DmaEngineRTL_test.py +++ b/mem/dma/test/DmaEngineRTL_test.py @@ -19,7 +19,7 @@ def make_dut(): dut.dma_cmd.msg.dram_addr @= 0 dut.dma_cmd.msg.spm_addr @= 0 dut.dma_cmd.msg.nbytes @= 0 - dut.dma_cmd.msg.tag @= 0 + dut.dma_cmd.msg.dma_tag @= 0 dut.dma_done.rdy @= 1 dut.send_to_dram_rd_req.rdy @= 1 @@ -57,7 +57,7 @@ def issue_cmd(dut, opcode, dram_addr, spm_addr, nbytes, tag): dut.dma_cmd.msg.dram_addr @= dram_addr dut.dma_cmd.msg.spm_addr @= spm_addr dut.dma_cmd.msg.nbytes @= nbytes - dut.dma_cmd.msg.tag @= tag + dut.dma_cmd.msg.dma_tag @= tag dut.sim_eval_combinational() assert dut.dma_cmd.rdy dut.sim_tick() @@ -105,7 +105,7 @@ def test_dma_mvin_one_beat(): spm_writes.append((int(dut.send_to_spm_wr_req.msg.addr), 
int(dut.send_to_spm_wr_req.msg.data))) if dut.dma_done.val: - assert int(dut.dma_done.msg.tag) == 0x5a + assert int(dut.dma_done.msg.dma_tag) == 0x5a break dut.sim_tick() @@ -166,7 +166,7 @@ def test_dma_mvout_partial_beat(): int(dut.send_to_dram_wr_req.msg.mask))) if dut.dma_done.val: - assert int(dut.dma_done.msg.tag) == 0xa5 + assert int(dut.dma_done.msg.dma_tag) == 0xa5 break dut.sim_tick() @@ -223,7 +223,7 @@ def test_dma_mvout_full_beat(): int(dut.send_to_dram_wr_req.msg.mask))) if dut.dma_done.val: - assert int(dut.dma_done.msg.tag) == 0xa5 + assert int(dut.dma_done.msg.dma_tag) == 0xa5 break dut.sim_tick() From 559c419e6feada72fe436507150c7dbd35928c9f Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Sat, 27 Jun 2026 21:54:30 +0800 Subject: [PATCH 45/46] Refactor DmaEngineRTL to simplify word calculation logic --- controller/ControllerRTL.py | 4 ++-- mem/dma/DmaEngineRTL.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index d65292d2..a6130b4b 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -496,8 +496,8 @@ def update_received_msg(): # assert(False) # WARNING - # A possible conflict occurs when dma_done.valis True and the received message is CMD_COMPLETEat the same time - # — that is, when a DMA command and CMD_COMPLETE appear in the same clock cycle. + # A possible conflict occurs when dma_done.val is True and the received message is CMD_COMPLETE at the same time, + # that is, when a DMA command and CMD_COMPLETE appear in the same clock cycle. # In this case, both require the CGRA to send a return signal to the CPU, which may causes a conflict. 
# Related discussion: https://github.com/tancheng/VectorCGRA/pull/293#discussion_r3418482217 if has_dma_ports & s.dma_done.val: diff --git a/mem/dma/DmaEngineRTL.py b/mem/dma/DmaEngineRTL.py index 7a70eaeb..a7fff5d8 100644 --- a/mem/dma/DmaEngineRTL.py +++ b/mem/dma/DmaEngineRTL.py @@ -197,7 +197,7 @@ def seq_state(): # Converts the transfer size from bytes to words. # NOTE We only support nbytes that are multiples of 4 now. # If nbytes is not a multiple of 4, we will add 1 to the number of words to transfer. - s.words_left_ff <<= (s.dma_cmd.msg.nbytes >> 2) if (s.dma_cmd.msg.nbytes % 4 == 0) else (s.dma_cmd.msg.nbytes >> 2) + 1 + s.words_left_ff <<= (s.dma_cmd.msg.nbytes >> 2) s.tag_ff <<= s.dma_cmd.msg.dma_tag s.beat_ff <<= MemDataType( 0 ) s.word_idx_ff <<= b2( 0 ) @@ -245,19 +245,19 @@ def seq_state(): if s.recv_from_spm_rd_resp.val & s.recv_from_spm_rd_resp.rdy: # Pack the response from SPM into a 128-bit beat by left-shifting. if s.word_idx_reg == b2( 0 ): # 1st word - s.beat_ff <<= concat( s.beat_reg[spm_data_nbits:spm_data_nbits*4], + s.beat_ff <<= concat( s.beat_reg[spm_data_nbits : spm_data_nbits<<2], s.recv_from_spm_rd_resp.msg.data ) elif s.word_idx_reg == b2( 1 ): - s.beat_ff <<= concat( s.beat_reg[spm_data_nbits*2:spm_data_nbits*4], + s.beat_ff <<= concat( s.beat_reg[spm_data_nbits<<1 : spm_data_nbits<<2], s.recv_from_spm_rd_resp.msg.data, s.beat_reg[0:spm_data_nbits] ) elif s.word_idx_reg == b2( 2 ): - s.beat_ff <<= concat( s.beat_reg[spm_data_nbits*3:spm_data_nbits*4], + s.beat_ff <<= concat( s.beat_reg[(spm_data_nbits<<1)+spm_data_nbits : spm_data_nbits<<2], s.recv_from_spm_rd_resp.msg.data, - s.beat_reg[0:spm_data_nbits*2] ) + s.beat_reg[0:spm_data_nbits<<1] ) else: s.beat_ff <<= concat( s.recv_from_spm_rd_resp.msg.data, - s.beat_reg[0:spm_data_nbits*3] ) + s.beat_reg[0 : (spm_data_nbits<<1)+spm_data_nbits] ) s.spm_addr_ff <<= s.spm_addr_reg + SpmAddrType( 1 ) s.words_left_ff <<= s.words_left_reg - BytesType( 1 ) From 
39c56fbb2d27f3e01291c0045280bec245b74c14 Mon Sep 17 00:00:00 2001 From: BenkangPeng Date: Sun, 28 Jun 2026 22:57:31 +0800 Subject: [PATCH 46/46] [Rename] Update ControllerRTL and related components to use 'sram' terminology for memory requests and use suffix `from_noc`/`from_dma` to classify 2 different ports. --- cgra/CgraRTL.py | 4 +- cgra/CgraTemplateRTL.py | 10 ++--- cgra/CgraWithContextSwitchRTL.py | 4 +- cgra/CgraWithStreamingLoadRTL.py | 4 +- controller/ControllerRTL.py | 57 +++++++++++++++------------ controller/test/ControllerRTL_test.py | 4 +- 6 files changed, 45 insertions(+), 38 deletions(-) diff --git a/cgra/CgraRTL.py b/cgra/CgraRTL.py index 87777384..c9dffcea 100644 --- a/cgra/CgraRTL.py +++ b/cgra/CgraRTL.py @@ -138,8 +138,8 @@ def construct(s, CgraPayloadType, s.data_mem.address_upper //= s.address_upper # Connects data memory with controller. - s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request - s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request + s.data_mem.recv_from_noc_load_request //= s.controller.send_to_sram_load_request_from_noc + s.data_mem.recv_from_noc_store_request //= s.controller.send_to_sram_store_request_from_noc s.data_mem.recv_from_noc_load_response_pkt //= s.controller.send_to_tile_load_response s.data_mem.send_to_noc_load_request_pkt //= s.controller.recv_from_tile_load_request_pkt s.data_mem.send_to_noc_load_response_pkt //= s.controller.recv_from_tile_load_response_pkt diff --git a/cgra/CgraTemplateRTL.py b/cgra/CgraTemplateRTL.py index 17202785..986efe06 100644 --- a/cgra/CgraTemplateRTL.py +++ b/cgra/CgraTemplateRTL.py @@ -244,13 +244,13 @@ def construct(s, CgraPayloadType, s.controller.send_to_dma_spm_rd_resp.rdy //= 0 # Controller <-> SPM/data_mem - s.controller.send_to_mem_spm_wr_req //= s.data_mem.recv_from_controller_spm_wr_req - s.controller.send_to_mem_spm_rd_req //= s.data_mem.recv_from_controller_spm_rd_req - s.controller.recv_from_mem_spm_rd_resp //= 
s.data_mem.send_to_controller_spm_rd_resp + s.controller.send_to_sram_store_request_from_dma //= s.data_mem.recv_from_controller_spm_wr_req + s.controller.send_to_sram_load_request_from_dma //= s.data_mem.recv_from_controller_spm_rd_req + s.controller.recv_from_sram_load_response //= s.data_mem.send_to_controller_spm_rd_resp # Connects data memory with controller. - s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request - s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request + s.data_mem.recv_from_noc_load_request //= s.controller.send_to_sram_load_request_from_noc + s.data_mem.recv_from_noc_store_request //= s.controller.send_to_sram_store_request_from_noc s.data_mem.recv_from_noc_load_response_pkt //= s.controller.send_to_tile_load_response s.data_mem.send_to_noc_load_request_pkt //= s.controller.recv_from_tile_load_request_pkt s.data_mem.send_to_noc_load_response_pkt //= s.controller.recv_from_tile_load_response_pkt diff --git a/cgra/CgraWithContextSwitchRTL.py b/cgra/CgraWithContextSwitchRTL.py index 361c0a9b..47bf8478 100644 --- a/cgra/CgraWithContextSwitchRTL.py +++ b/cgra/CgraWithContextSwitchRTL.py @@ -131,8 +131,8 @@ def construct(s, CgraPayloadType, s.data_mem.address_upper //= s.address_upper # Connects data memory with controller. 
- s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request - s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request + s.data_mem.recv_from_noc_load_request //= s.controller.send_to_sram_load_request_from_noc + s.data_mem.recv_from_noc_store_request //= s.controller.send_to_sram_store_request_from_noc s.data_mem.recv_from_noc_load_response_pkt //= s.controller.send_to_tile_load_response s.data_mem.send_to_noc_load_request_pkt //= s.controller.recv_from_tile_load_request_pkt s.data_mem.send_to_noc_load_response_pkt //= s.controller.recv_from_tile_load_response_pkt diff --git a/cgra/CgraWithStreamingLoadRTL.py b/cgra/CgraWithStreamingLoadRTL.py index 6e7dcbf5..e7b6b64e 100644 --- a/cgra/CgraWithStreamingLoadRTL.py +++ b/cgra/CgraWithStreamingLoadRTL.py @@ -138,8 +138,8 @@ def construct(s, CgraPayloadType, s.data_mem.address_upper //= s.address_upper # Connects data memory with controller. - s.data_mem.recv_from_noc_load_request //= s.controller.send_to_mem_load_request - s.data_mem.recv_from_noc_store_request //= s.controller.send_to_mem_store_request + s.data_mem.recv_from_noc_load_request //= s.controller.send_to_sram_load_request_from_noc + s.data_mem.recv_from_noc_store_request //= s.controller.send_to_sram_store_request_from_noc s.data_mem.recv_from_noc_load_response_pkt //= s.controller.send_to_tile_load_response s.data_mem.send_to_noc_load_request_pkt //= s.controller.recv_from_tile_load_request_pkt s.data_mem.send_to_noc_load_response_pkt //= s.controller.recv_from_tile_load_response_pkt diff --git a/controller/ControllerRTL.py b/controller/ControllerRTL.py index a6130b4b..5a312c7f 100644 --- a/controller/ControllerRTL.py +++ b/controller/ControllerRTL.py @@ -91,9 +91,9 @@ def construct(s, s.recv_from_tile_load_response_pkt = RecvIfcRTL(InterCgraPktType) s.recv_from_tile_store_request_pkt = RecvIfcRTL(InterCgraPktType) - s.send_to_mem_load_request = SendIfcRTL(InterCgraPktType) + 
s.send_to_sram_load_request_from_noc = SendIfcRTL(InterCgraPktType) s.send_to_tile_load_response = SendIfcRTL(InterCgraPktType) - s.send_to_mem_store_request = SendIfcRTL(InterCgraPktType) + s.send_to_sram_store_request_from_noc = SendIfcRTL(InterCgraPktType) # Controller-owned command path from CPU packets to the DMA engine. # Send the decoded DMA command to the DMA engine. @@ -101,6 +101,13 @@ def construct(s, # Receive the DMA done signal from the DMA engine. s.dma_done = RecvIfcRTL(DmaDoneType) + # ------------------------------------------------------- + # SPM (SRAM) access path from the DMA engine. + # The DMA and the inter-tile NoC (above) each have their own + # dedicated SPM access interfaces to the data memory controller. + # They are kept separate because the DMA can perform burst data + # movement. + # ------------------------------------------------------- # Receive the request of writing into SPM from the DMA. s.recv_from_dma_spm_wr_req = RecvIfcRTL(DmaSpmWriteReqType) # Receive the request of reading from SPM from the DMA. @@ -108,13 +115,13 @@ def construct(s, # Send the response of reading from SPM to the DMA. s.send_to_dma_spm_rd_resp = SendIfcRTL(DmaSpmReadRespType) - # Data memory side of the same SPM access path. + # SRAM data memory side of the SPM access path (DMA). # Send the request of writing into SPM to the data_mem controller. - s.send_to_mem_spm_wr_req = SendIfcRTL(DmaSpmWriteReqType) + s.send_to_sram_store_request_from_dma = SendIfcRTL(DmaSpmWriteReqType) # Send the request of reading from SPM to the data_mem controller. - s.send_to_mem_spm_rd_req = SendIfcRTL(DmaSpmReadReqType) + s.send_to_sram_load_request_from_dma = SendIfcRTL(DmaSpmReadReqType) # Receive the response of reading from SPM from the data_mem controller. 
- s.recv_from_mem_spm_rd_resp = RecvIfcRTL(DmaSpmReadRespType) + s.recv_from_sram_load_response = RecvIfcRTL(DmaSpmReadRespType) # Component @@ -178,9 +185,9 @@ def construct(s, s.recv_from_tile_store_request_pkt_queue.recv //= s.recv_from_tile_store_request_pkt # Requests towards local from others, 1 cycle delay to improve timing. - s.send_to_mem_load_request_queue.send //= s.send_to_mem_load_request + s.send_to_mem_load_request_queue.send //= s.send_to_sram_load_request_from_noc s.send_to_tile_load_response_queue.send //= s.send_to_tile_load_response - s.send_to_mem_store_request_queue.send //= s.send_to_mem_store_request + s.send_to_mem_store_request_queue.send //= s.send_to_sram_store_request_from_noc # For control signals delivery from CPU to tiles. s.recv_from_cpu_pkt //= s.recv_from_cpu_pkt_queue.recv @@ -213,22 +220,22 @@ def update_dma_cmd_regs(): @update def update_dma_spm_forwarding(): if has_dma_ports: - s.send_to_mem_spm_wr_req.val @= s.recv_from_dma_spm_wr_req.val - s.recv_from_dma_spm_wr_req.rdy @= s.send_to_mem_spm_wr_req.rdy - s.send_to_mem_spm_wr_req.msg @= s.recv_from_dma_spm_wr_req.msg - - s.send_to_mem_spm_rd_req.val @= s.recv_from_dma_spm_rd_req.val - s.recv_from_dma_spm_rd_req.rdy @= s.send_to_mem_spm_rd_req.rdy - s.send_to_mem_spm_rd_req.msg @= s.recv_from_dma_spm_rd_req.msg - s.send_to_dma_spm_rd_resp.val @= s.recv_from_mem_spm_rd_resp.val - s.recv_from_mem_spm_rd_resp.rdy @= s.send_to_dma_spm_rd_resp.rdy - s.send_to_dma_spm_rd_resp.msg @= s.recv_from_mem_spm_rd_resp.msg + s.send_to_sram_store_request_from_dma.val @= s.recv_from_dma_spm_wr_req.val + s.recv_from_dma_spm_wr_req.rdy @= s.send_to_sram_store_request_from_dma.rdy + s.send_to_sram_store_request_from_dma.msg @= s.recv_from_dma_spm_wr_req.msg + + s.send_to_sram_load_request_from_dma.val @= s.recv_from_dma_spm_rd_req.val + s.recv_from_dma_spm_rd_req.rdy @= s.send_to_sram_load_request_from_dma.rdy + s.send_to_sram_load_request_from_dma.msg @= s.recv_from_dma_spm_rd_req.msg + 
s.send_to_dma_spm_rd_resp.val @= s.recv_from_sram_load_response.val + s.recv_from_sram_load_response.rdy @= s.send_to_dma_spm_rd_resp.rdy + s.send_to_dma_spm_rd_resp.msg @= s.recv_from_sram_load_response.msg else: - s.send_to_mem_spm_wr_req.val @= 0 - s.send_to_mem_spm_wr_req.msg @= DmaSpmWriteReqType() - s.send_to_mem_spm_rd_req.val @= 0 - s.send_to_mem_spm_rd_req.msg @= DmaSpmReadReqType() - s.recv_from_mem_spm_rd_resp.rdy @= 0 + s.send_to_sram_store_request_from_dma.val @= 0 + s.send_to_sram_store_request_from_dma.msg @= DmaSpmWriteReqType() + s.send_to_sram_load_request_from_dma.val @= 0 + s.send_to_sram_load_request_from_dma.msg @= DmaSpmReadReqType() + s.recv_from_sram_load_response.rdy @= 0 s.recv_from_dma_spm_wr_req.rdy @= 0 s.recv_from_dma_spm_rd_req.rdy @= 0 s.send_to_dma_spm_rd_resp.val @= 0 @@ -544,8 +551,8 @@ def line_trace(s): recv_from_tile_load_response_pkt_str = "recv_from_tile_load_response_pkt: " + str(s.recv_from_tile_load_response_pkt.msg) recv_from_tile_store_request_pkt_str = "recv_from_tile_store_request_pkt: " + str(s.recv_from_tile_store_request_pkt.msg) crossbar_str = "crossbar: {" + s.crossbar.line_trace() + "}" - send_to_mem_load_request_str = "send_to_mem_load_request: " + str(s.send_to_mem_load_request.msg) - send_to_mem_store_request_str = "send_to_mem_store_request: " + str(s.send_to_mem_store_request.msg) + send_to_mem_load_request_str = "send_to_sram_load_request_from_noc: " + str(s.send_to_sram_load_request_from_noc.msg) + send_to_mem_store_request_str = "send_to_sram_store_request_from_noc: " + str(s.send_to_sram_store_request_from_noc.msg) recv_from_noc_str ="recv_from_noc_pkt.val: " + str(s.recv_from_inter_cgra_noc.val) + " recv_from_noc_pkt.msg: " + str(s.recv_from_inter_cgra_noc.msg) + " recv_from_noc_pkt.rdy: " + str(s.recv_from_inter_cgra_noc.rdy) send_to_noc_str = "send_to_noc_pkt: " + str(s.send_to_inter_cgra_noc.msg) + "; rdy: " + str(s.send_to_inter_cgra_noc.rdy) + "; val: " + str(s.send_to_inter_cgra_noc.val) return 
f'{recv_from_cpu_pkt_str} || {recv_from_cpu_pkt_queue_str} || {crossbar_recv_str} || {send_to_ctrl_ring_pkt_str} || {recv_from_tile_load_request_pkt_str} || {recv_from_tile_load_response_pkt_str} || {recv_from_tile_store_request_pkt_str} || {crossbar_str} || {send_to_mem_load_request_str} || {send_to_mem_store_request_str} || {recv_from_noc_str} || {send_to_noc_str}\n' diff --git a/controller/test/ControllerRTL_test.py b/controller/test/ControllerRTL_test.py index 42d4eda9..1ad2500f 100644 --- a/controller/test/ControllerRTL_test.py +++ b/controller/test/ControllerRTL_test.py @@ -78,9 +78,9 @@ def construct(s, s.src_from_tile_load_response_pkt.send //= s.dut.recv_from_tile_load_response_pkt s.src_from_tile_store_request_pkt.send //= s.dut.recv_from_tile_store_request_pkt - s.dut.send_to_mem_store_request //= s.sink_to_mem_store_request.recv + s.dut.send_to_sram_store_request_from_noc //= s.sink_to_mem_store_request.recv s.dut.send_to_tile_load_response //= s.sink_to_mem_load_response.recv - s.dut.send_to_mem_load_request //= s.sink_to_mem_load_request.recv + s.dut.send_to_sram_load_request_from_noc //= s.sink_to_mem_load_request.recv s.src_from_noc.send //= s.dut.recv_from_inter_cgra_noc s.dut.send_to_inter_cgra_noc //= s.sink_to_noc.recv