From fb0d005e5e0cd7f970d2e76401c1c3353cfcdcfa Mon Sep 17 00:00:00 2001 From: Arpan Suravi Prasad Date: Fri, 25 Jul 2025 13:11:48 +0200 Subject: [PATCH 1/5] [multi-precision] adding multi-precision support. By default, the TCDM is assumed to be 32b wide with element width=8b. However, low-precision workloads might have different memory requirements. This PR handles this issue. Additionally, another stride is added to the address generator to enable input tiling useful for output stationary dataflow. d0_stride-> LAYER_C_IN (Next element in W to fill input buffer -> PE_W dimension) d1_stride-> LAYER_C_IN * W_IN (Next element in H to fill input buffer -> PE_H dimension) d2_stride-> TP_IN (number of input channels supported in a tile at a go -> ceil(LAYER_C_IN//TP_IN) dimension) d3_stride-> LAYER_C_IN (Next element in W -> FS dimension) d4_stride-> LAYER_C_IN * W_IN (Next element in H -> FS dimension) --- rtl/common/hci_package.sv | 5 +- rtl/core/hci_core_sink.sv | 94 +++++++++++++++++++++---------------- rtl/core/hci_core_source.sv | 18 ++++--- 3 files changed, 67 insertions(+), 50 deletions(-) diff --git a/rtl/common/hci_package.sv b/rtl/common/hci_package.sv index 1e3d1e1..f4cd701 100644 --- a/rtl/common/hci_package.sv +++ b/rtl/common/hci_package.sv @@ -56,13 +56,14 @@ package hci_package; typedef struct packed { logic req_start; - hwpe_stream_package::ctrl_addressgen_v3_t addressgen_ctrl; + hwpe_stream_package::ctrl_addressgen_v4_t addressgen_ctrl; } hci_streamer_ctrl_t; typedef struct packed { logic ready_start; logic done; - hwpe_stream_package::flags_addressgen_v3_t addressgen_flags; + logic no_valid_transfers; + hwpe_stream_package::flags_addressgen_v4_t addressgen_flags; } hci_streamer_flags_t; typedef enum { diff --git a/rtl/core/hci_core_sink.sv b/rtl/core/hci_core_sink.sv index 049a33d..3bfe4f5 100644 --- a/rtl/core/hci_core_sink.sv +++ b/rtl/core/hci_core_sink.sv @@ -83,11 +83,15 @@ module hci_core_sink import hci_package::*; #( // Stream interface params - 
parameter int unsigned TCDM_FIFO_DEPTH = 0, - parameter int unsigned TRANS_CNT = 16, - parameter int unsigned MISALIGNED_ACCESSES = 1, - parameter hci_size_parameter_t `HCI_SIZE_PARAM(tcdm) = '0, - parameter bit [2:0] DIM_ENABLE_1H = 3'b011 // Number of dimensions enabled in the address generator + parameter int unsigned TCDM_FIFO_DEPTH = 0, + parameter int unsigned TRANS_CNT = 16, + parameter int unsigned MISALIGNED_ACCESSES = 1, + parameter int unsigned ELEMENT_WIDTH = 8, // e.g., 8 bits per element + parameter int unsigned ELEMENTS_PER_BANK = 4, // number of elements in one memory bank + localparam int unsigned BANK_DATA_WIDTH = ELEMENT_WIDTH * ELEMENTS_PER_BANK, + localparam int unsigned ELEMENT_INDEX_WIDTH = $clog2(ELEMENTS_PER_BANK), + parameter bit [3:0] DIM_ENABLE_1H = 4'b011 // Number of dimensions enabled in the address generator + parameter hci_size_parameter_t `HCI_SIZE_PARAM(tcdm) = '0 ) ( input logic clk_i, @@ -106,6 +110,7 @@ module hci_core_sink localparam int unsigned DATA_WIDTH = `HCI_SIZE_GET_DW(tcdm); localparam int unsigned EHW = `HCI_SIZE_GET_EHW(tcdm); + localparam int unsigned BW = `HCI_SIZE_GET_BW(tcdm); hci_streamer_state_t cs, ns; flags_fifo_t addr_fifo_flags; @@ -116,6 +121,8 @@ module hci_core_sink logic tcdm_inflight; + assign flags_o.no_valid_transfers = (~tcdm_inflight); + hwpe_stream_intf_stream #( .DATA_WIDTH ( 36 ) ) addr_push ( @@ -131,7 +138,7 @@ module hci_core_sink localparam hci_size_parameter_t `HCI_SIZE_PARAM(tcdm_target) = '{ DW: DATA_WIDTH, AW: DEFAULT_AW, - BW: DEFAULT_BW, + BW: BW, UW: DEFAULT_UW, IW: DEFAULT_IW, EW: DEFAULT_EW, @@ -139,7 +146,7 @@ module hci_core_sink }; `HCI_INTF(tcdm_target, clk_i); - hwpe_stream_addressgen_v3 #( + hwpe_stream_addressgen_v4 #( .DIM_ENABLE_1H ( DIM_ENABLE_1H ) ) i_addressgen ( .clk_i ( clk_i ), @@ -168,46 +175,51 @@ module hci_core_sink logic [TRANS_CNT-1:0] address_cnt_d, address_cnt_q; logic [DATA_WIDTH-1:0] stream_data_misaligned; - logic [DATA_WIDTH/8-1:0] stream_strb_misaligned; 
+ logic [DATA_WIDTH/ELEMENT_WIDTH-1:0] stream_strb_misaligned; + logic [ELEMENTS_PER_BANK-1:0][DATA_WIDTH-1:0] stream_data_aligned_array; + logic [ELEMENTS_PER_BANK-1:0][DATA_WIDTH/ELEMENT_WIDTH-1:0] stream_strb_aligned_array; logic [DATA_WIDTH-1:0] stream_data_aligned; - logic [DATA_WIDTH/8-1:0] stream_strb_aligned; + logic [DATA_WIDTH/ELEMENT_WIDTH-1:0] stream_strb_aligned; - assign stream_data_misaligned = stream.data; - assign stream_strb_misaligned = stream.strb; + logic [ELEMENT_INDEX_WIDTH-1:0] bank_offset; - if (MISALIGNED_ACCESSES==1 ) begin : missaligned_access_gen - always_comb - begin - stream_data_aligned = '0; - stream_strb_aligned = '0; - case(addr_pop.data[1:0]) - 2'b00: begin - stream_data_aligned[DATA_WIDTH-32-1:0] = stream_data_misaligned[DATA_WIDTH-32-1:0]; - stream_strb_aligned[(DATA_WIDTH-32)/8-1:0] = stream_strb_misaligned[(DATA_WIDTH-32)/8-1:0]; - end - 2'b01: begin - stream_data_aligned[DATA_WIDTH-24-1:8] = stream_data_misaligned[DATA_WIDTH-32-1:0]; - stream_strb_aligned[(DATA_WIDTH-24)/8-1:1] = stream_strb_misaligned[(DATA_WIDTH-32)/8-1:0]; - end - 2'b10: begin - stream_data_aligned[DATA_WIDTH-16-1:16] = stream_data_misaligned[DATA_WIDTH-32-1:0]; - stream_strb_aligned[(DATA_WIDTH-16)/8-1:2] = stream_strb_misaligned[(DATA_WIDTH-32)/8-1:0]; - end - 2'b11: begin - stream_data_aligned[DATA_WIDTH-8-1:24] = stream_data_misaligned[DATA_WIDTH-32-1:0]; - stream_strb_aligned[(DATA_WIDTH-8)/8-1:3] = stream_strb_misaligned[(DATA_WIDTH-32)/8-1:0]; + assign bank_offset = addr_pop.data[ELEMENT_INDEX_WIDTH-1:0]; + + generate + if (MISALIGNED_ACCESSES == 1) begin: missaligned_access_gen + for (genvar offs = 0; offs < ELEMENTS_PER_BANK; offs++) begin : aligned_stream_gen + if (offs > 0) begin + assign stream_data_aligned_array[offs][offs*ELEMENT_WIDTH-1:0] = '0; + assign stream_strb_aligned_array[offs][offs-1:0] = '0; end - endcase - end - end - else begin - assign stream_data_aligned[DATA_WIDTH-1:0] = stream_data_misaligned[DATA_WIDTH-1:0]; - assign 
stream_strb_aligned[DATA_WIDTH/8-1:0] = stream_strb_misaligned[DATA_WIDTH/8-1:0]; - end + + localparam int unsigned DATA_OFFSET_MSB = DATA_WIDTH - BANK_DATA_WIDTH + offs * ELEMENT_WIDTH; + localparam int unsigned DATA_OFFSET_LSB = DATA_OFFSET_MSB - (DATA_WIDTH - BANK_DATA_WIDTH); + localparam int unsigned STRB_OFFSET_MSB = DATA_OFFSET_MSB / ELEMENT_WIDTH; + localparam int unsigned STRB_OFFSET_LSB = DATA_OFFSET_LSB / ELEMENT_WIDTH; + + assign stream_data_aligned_array[offs][DATA_WIDTH-1:DATA_OFFSET_MSB] = '0; + assign stream_data_aligned_array[offs][DATA_OFFSET_MSB-1:DATA_OFFSET_LSB] = stream_data_misaligned[DATA_WIDTH-BANK_DATA_WIDTH-1:0]; + + assign stream_strb_aligned_array[offs][DATA_WIDTH/ELEMENT_WIDTH-1:STRB_OFFSET_MSB] = '0; + assign stream_strb_aligned_array[offs][STRB_OFFSET_MSB-1:STRB_OFFSET_LSB] = stream_strb_misaligned[(DATA_WIDTH-BANK_DATA_WIDTH)/ELEMENT_WIDTH-1:0]; + end + assign stream_data_aligned = stream_data_aligned_array[bank_offset]; + assign stream_strb_aligned = stream_strb_aligned_array[bank_offset]; + end else begin + assign stream_data_aligned[DATA_WIDTH-1:0] = stream_data_misaligned[DATA_WIDTH-1:0]; + assign stream_strb_aligned[DATA_WIDTH/ELEMENT_WIDTH-1:0] = stream_strb_misaligned[DATA_WIDTH/ELEMENT_WIDTH-1:0]; + end + endgenerate + + + + assign stream_data_misaligned = stream.data; + assign stream_strb_misaligned = stream.strb; // hci port binding assign tcdm_target.req = (cs != STREAMER_IDLE) ? stream.valid & addr_pop.valid : '0; - assign tcdm_target.add = (cs != STREAMER_IDLE) ? {addr_pop.data[31:2],2'b0} : '0; + assign tcdm_target.add = (cs != STREAMER_IDLE) ? {addr_pop.data[31:ELEMENT_INDEX_WIDTH],{ELEMENT_INDEX_WIDTH{1'b0}}} : '0; assign tcdm_target.wen = '0; assign tcdm_target.be = (cs != STREAMER_IDLE) ? stream_strb_aligned : '0; assign tcdm_target.data = (cs != STREAMER_IDLE) ? 
stream_data_aligned : '0; @@ -349,7 +361,7 @@ module hci_core_sink end else begin initial - dw : assert(stream.DATA_WIDTH+32 == tcdm.DW); + dw : assert(stream.DATA_WIDTH+BANK_DATA_WIDTH == tcdm.DW); end `HCI_SIZE_CHECK_ASSERTS(tcdm); diff --git a/rtl/core/hci_core_source.sv b/rtl/core/hci_core_source.sv index ae16383..5920394 100644 --- a/rtl/core/hci_core_source.sv +++ b/rtl/core/hci_core_source.sv @@ -92,13 +92,17 @@ module hci_core_source import hci_package::*; #( // Stream interface params - parameter int unsigned LATCH_FIFO = 0, - parameter int unsigned TRANS_CNT = 16, - parameter int unsigned ADDR_MIS_DEPTH = 8, // Beware: this must be >= the maximum latency between TCDM gnt and TCDM r_valid!!! - parameter int unsigned MISALIGNED_ACCESSES = 1, - parameter int unsigned PASSTHROUGH_FIFO = 0, - parameter hci_size_parameter_t `HCI_SIZE_PARAM(tcdm) = '0, + parameter int unsigned LATCH_FIFO = 0, + parameter int unsigned TRANS_CNT = 16, + parameter int unsigned ADDR_MIS_DEPTH = 8, // Beware: this must be >= the maximum latency between TCDM gnt and TCDM r_valid!!! 
+ parameter int unsigned MISALIGNED_ACCESSES = 1, + parameter int unsigned PASSTHROUGH_FIFO = 0, + parameter int unsigned ELEMENT_WIDTH = 8, // e.g., 8 bits per element + parameter int unsigned ELEMENTS_PER_BANK = 4, // number of elements in one memory bank + localparam int unsigned BANK_DATA_WIDTH = ELEMENT_WIDTH * ELEMENTS_PER_BANK, + localparam int unsigned ELEMENT_INDEX_WIDTH = $clog2(ELEMENTS_PER_BANK), parameter bit [2:0] DIM_ENABLE_1H = 3'b011 // Number of dimensions enabled in the address generator + parameter hci_size_parameter_t `HCI_SIZE_PARAM(tcdm) = '0 ) ( input logic clk_i, @@ -138,7 +142,7 @@ module hci_core_source ); // generate addresses - hwpe_stream_addressgen_v3 #( + hwpe_stream_addressgen_v4 #( .DIM_ENABLE_1H ( DIM_ENABLE_1H ) ) i_addressgen ( .clk_i ( clk_i ), From e27804bbe9588b0261469dc2ff7dbecdf7067ea0 Mon Sep 17 00:00:00 2001 From: Arpan Suravi Prasad Date: Tue, 12 Aug 2025 10:00:17 +0200 Subject: [PATCH 2/5] [refactor] added parameters for multi precision support --- rtl/core/hci_core_sink.sv | 14 ++++++------- rtl/core/hci_core_source.sv | 23 +++++++++++----------- rtl/interco/hci_router.sv | 39 ++++++++++++++++++++++++------------- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/rtl/core/hci_core_sink.sv b/rtl/core/hci_core_sink.sv index 3bfe4f5..0b50d4e 100644 --- a/rtl/core/hci_core_sink.sv +++ b/rtl/core/hci_core_sink.sv @@ -83,14 +83,14 @@ module hci_core_sink import hci_package::*; #( // Stream interface params - parameter int unsigned TCDM_FIFO_DEPTH = 0, - parameter int unsigned TRANS_CNT = 16, - parameter int unsigned MISALIGNED_ACCESSES = 1, - parameter int unsigned ELEMENT_WIDTH = 8, // e.g., 8 bits per element - parameter int unsigned ELEMENTS_PER_BANK = 4, // number of elements in one memory bank - localparam int unsigned BANK_DATA_WIDTH = ELEMENT_WIDTH * ELEMENTS_PER_BANK, + parameter int unsigned TCDM_FIFO_DEPTH = 0, + parameter int unsigned TRANS_CNT = 16, + parameter int unsigned MISALIGNED_ACCESSES = 1, 
+ parameter int unsigned ELEMENT_WIDTH = 8, // e.g., 8 bits per element + parameter int unsigned ELEMENTS_PER_BANK = 4, // number of elements in one memory bank + localparam int unsigned BANK_DATA_WIDTH = ELEMENT_WIDTH * ELEMENTS_PER_BANK, localparam int unsigned ELEMENT_INDEX_WIDTH = $clog2(ELEMENTS_PER_BANK), - parameter bit [3:0] DIM_ENABLE_1H = 4'b011 // Number of dimensions enabled in the address generator + parameter bit [3:0] DIM_ENABLE_1H = 4'b1111, // Number of dimensions enabled in the address generator parameter hci_size_parameter_t `HCI_SIZE_PARAM(tcdm) = '0 ) ( diff --git a/rtl/core/hci_core_source.sv b/rtl/core/hci_core_source.sv index 5920394..f0f2d82 100644 --- a/rtl/core/hci_core_source.sv +++ b/rtl/core/hci_core_source.sv @@ -92,16 +92,16 @@ module hci_core_source import hci_package::*; #( // Stream interface params - parameter int unsigned LATCH_FIFO = 0, - parameter int unsigned TRANS_CNT = 16, - parameter int unsigned ADDR_MIS_DEPTH = 8, // Beware: this must be >= the maximum latency between TCDM gnt and TCDM r_valid!!! - parameter int unsigned MISALIGNED_ACCESSES = 1, - parameter int unsigned PASSTHROUGH_FIFO = 0, - parameter int unsigned ELEMENT_WIDTH = 8, // e.g., 8 bits per element - parameter int unsigned ELEMENTS_PER_BANK = 4, // number of elements in one memory bank - localparam int unsigned BANK_DATA_WIDTH = ELEMENT_WIDTH * ELEMENTS_PER_BANK, + parameter int unsigned LATCH_FIFO = 0, + parameter int unsigned TRANS_CNT = 16, + parameter int unsigned ADDR_MIS_DEPTH = 8, // Beware: this must be >= the maximum latency between TCDM gnt and TCDM r_valid!!! 
+ parameter int unsigned MISALIGNED_ACCESSES = 1, + parameter int unsigned PASSTHROUGH_FIFO = 0, + parameter int unsigned ELEMENT_WIDTH = 8, // e.g., 8 bits per element + parameter int unsigned ELEMENTS_PER_BANK = 4, // number of elements in one memory bank + localparam int unsigned BANK_DATA_WIDTH = ELEMENT_WIDTH * ELEMENTS_PER_BANK, localparam int unsigned ELEMENT_INDEX_WIDTH = $clog2(ELEMENTS_PER_BANK), - parameter bit [2:0] DIM_ENABLE_1H = 3'b011 // Number of dimensions enabled in the address generator + parameter bit [3:0] DIM_ENABLE_1H = 4'b011, // Number of dimensions enabled in the address generator parameter hci_size_parameter_t `HCI_SIZE_PARAM(tcdm) = '0 ) ( @@ -221,9 +221,9 @@ module hci_core_source assign tcdm.r_ready = stream.ready; assign tcdm.req = (cs != STREAMER_IDLE) ? addr_pop.valid & stream.ready : '0; - assign tcdm.add = (cs != STREAMER_IDLE) ? {addr_pop.data[31:2],2'b0} : '0; + assign tcdm.add = (cs != STREAMER_IDLE) ? {addr_pop.data[31:ELEMENT_INDEX_WIDTH],{ELEMENT_INDEX_WIDTH{1'b0}}} : '0; assign tcdm.wen = 1'b1; - assign tcdm.be = 4'h0; + assign tcdm.be = {ELEMENTS_PER_BANK{1'b0}}; assign tcdm.data = '0; assign tcdm.user = '0; assign tcdm.id = '0; @@ -231,6 +231,7 @@ module hci_core_source assign stream.strb = '1; assign stream.data = stream_data_aligned; assign stream.valid = enable_i & (tcdm.r_valid | stream_valid_q); // is this strictly necessary to keep the HWPE-Stream protocol? or can be avoided with a FIFO q? + // assign stream.valid = enable_i & tcdm.r_valid; // is this strictly necessary to keep the HWPE-Stream protocol? or can be avoided with a FIFO q? assign addr_pop.ready = (cs != STREAMER_IDLE) ? 
addr_pop.valid & stream.ready & tcdm.gnt : 1'b0; hwpe_stream_intf_stream #( diff --git a/rtl/interco/hci_router.sv b/rtl/interco/hci_router.sv index a7250a4..d5b8f82 100644 --- a/rtl/interco/hci_router.sv +++ b/rtl/interco/hci_router.sv @@ -55,7 +55,10 @@ module hci_router #( parameter int unsigned FIFO_DEPTH = 0, parameter int unsigned NB_OUT_CHAN = 8, - parameter bit USE_ECC = 0, + parameter int unsigned BANK_WORD_WIDTH = 32, + parameter int unsigned BANK_ELEM_WIDTH = 8, + localparam int unsigned NUM_ELEM_WORD = BANK_WORD_WIDTH / BANK_ELEM_WIDTH, + parameter bit USE_ECC = 0, parameter int unsigned FILTER_WRITE_R_VALID = 0, parameter hci_size_parameter_t `HCI_SIZE_PARAM(in) = '0, parameter hci_size_parameter_t `HCI_SIZE_PARAM(out) = '0 @@ -79,12 +82,13 @@ module hci_router //There is only one input port, but with variable data width. //NB_IN_CHAN states, to how many standard (32-bit) ports the input port is equivalent - localparam int unsigned NB_IN_CHAN = DWH / 32; + localparam int unsigned NB_IN_CHAN = DWH / BANK_WORD_WIDTH; //Word-interleaved scheme: // - First bits of requested address are shared // - Lowest 2 bits are byte offset within a DWORD -> ignored // - The bits inbetween designate the selected bank - localparam int unsigned LSB_COMMON_ADDR = $clog2(NB_OUT_CHAN) + 2; + localparam int unsigned ELEM_ADDR_OFFSET = NUM_ELEM_WORD == 1 ? 
0 : $clog2(NUM_ELEM_WORD); + localparam int unsigned LSB_COMMON_ADDR = $clog2(NB_OUT_CHAN) + ELEM_ADDR_OFFSET; localparam int unsigned AWC = AWM+$clog2(NB_OUT_CHAN); logic [$clog2(NB_OUT_CHAN)-1:0] bank_offset_s; @@ -105,9 +109,9 @@ module hci_router // Hsiao SEC-DED ECC needs $clog2(DW)+2 check bits // At this level only data are ECC-protected and with DW fixed at 32, EW is 5+2 = 7 localparam hci_size_parameter_t `HCI_SIZE_PARAM(virt_in) = '{ - DW: 32, + DW: BANK_WORD_WIDTH, AW: 32, - BW: 8, + BW: BANK_ELEM_WIDTH, UW: 0, IW: 0, EW: 7*USE_ECC, @@ -173,15 +177,15 @@ module hci_router // unimplemented operation code = 0 assign postfifo.r_opc = '0; - assign bank_offset_s = postfifo.add[LSB_COMMON_ADDR-1:2]; + assign bank_offset_s = postfifo.add[LSB_COMMON_ADDR-1:ELEM_ADDR_OFFSET]; for(genvar ii=0; ii= NB_OUT_CHAN) - virt_in[ii].add = {postfifo.add[AWC-1:LSB_COMMON_ADDR] + 1, 2'b0}; //bank level address - else - virt_in[ii].add = {postfifo.add[AWC-1:LSB_COMMON_ADDR], 2'b0}; //bank level address + if(bank_offset_s + ii >= NB_OUT_CHAN) begin + if(ELEM_ADDR_OFFSET == 0) begin + virt_in[ii].add = {postfifo.add[AWC-1:LSB_COMMON_ADDR] + 1}; //bank level address + end else begin + virt_in[ii].add = {postfifo.add[AWC-1:LSB_COMMON_ADDR] + 1, {ELEM_ADDR_OFFSET{1'b0}}}; //bank level address + end + end else begin + if(ELEM_ADDR_OFFSET == 0) begin + virt_in[ii].add = {postfifo.add[AWC-1:LSB_COMMON_ADDR]}; //bank level address + end else begin + virt_in[ii].add = {postfifo.add[AWC-1:LSB_COMMON_ADDR], {ELEM_ADDR_OFFSET{1'b0}}}; //bank level address + end + end end : bank_level_address_generation assign virt_in[ii].r_ready = postfifo.r_ready; From 9844a893c34612ac66697e8dcd60214296f2634c Mon Sep 17 00:00:00 2001 From: Arpan Suravi Prasad Date: Tue, 21 Oct 2025 13:52:36 +0200 Subject: [PATCH 3/5] [addr-offset] The banks were hardcoded to 32b each providing 4 bytes of data. But in theory a bank doesn't necessarily need to accommodate 4 bytes or 4 elements. 
This commit fixes these assumptions by making the bank size configurable via ADDR_OFFSET parameter derived from ELEMENTS_PER_BANK. --- rtl/core/hci_core_source.sv | 45 ++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/rtl/core/hci_core_source.sv b/rtl/core/hci_core_source.sv index f0f2d82..c393a96 100644 --- a/rtl/core/hci_core_source.sv +++ b/rtl/core/hci_core_source.sv @@ -100,7 +100,7 @@ module hci_core_source parameter int unsigned ELEMENT_WIDTH = 8, // e.g., 8 bits per element parameter int unsigned ELEMENTS_PER_BANK = 4, // number of elements in one memory bank localparam int unsigned BANK_DATA_WIDTH = ELEMENT_WIDTH * ELEMENTS_PER_BANK, - localparam int unsigned ELEMENT_INDEX_WIDTH = $clog2(ELEMENTS_PER_BANK), + localparam int unsigned ADDR_OFFSET = ELEMENTS_PER_BANK == 1 ? 1 : $clog2(ELEMENTS_PER_BANK), parameter bit [3:0] DIM_ENABLE_1H = 4'b011, // Number of dimensions enabled in the address generator parameter hci_size_parameter_t `HCI_SIZE_PARAM(tcdm) = '0 ) @@ -184,8 +184,7 @@ module hci_core_source logic stream_valid_q; logic [DATA_WIDTH-1:0] stream_data_q; - logic [1:0] addr_misaligned_q; - logic addr_misaligned_valid; + logic [ADDR_OFFSET-1:0]addr_misaligned_q; logic [DATA_WIDTH-1:0] stream_data_misaligned; logic [DATA_WIDTH-1:0] stream_data_aligned; @@ -199,20 +198,10 @@ module hci_core_source always_comb begin stream_data_aligned = '0; - case(addr_misaligned_q) - 2'b00: begin - stream_data_aligned[DATA_WIDTH-1:0] = stream_data_misaligned[DATA_WIDTH-1:0]; - end - 2'b01: begin - stream_data_aligned[DATA_WIDTH-32-1:0] = stream_data_misaligned[DATA_WIDTH-24-1:8]; - end - 2'b10: begin - stream_data_aligned[DATA_WIDTH-32-1:0] = stream_data_misaligned[DATA_WIDTH-16-1:16]; - end - 2'b11: begin - stream_data_aligned[DATA_WIDTH-32-1:0] = stream_data_misaligned[DATA_WIDTH-8-1:24]; - end - endcase + if(addr_misaligned_q == 0) + stream_data_aligned[DATA_WIDTH-1:0] = stream_data_misaligned[DATA_WIDTH-1:0]; + else 
+ stream_data_aligned[0+:BANK_DATA_WIDTH] = stream_data_misaligned[addr_misaligned_q*ELEMENT_WIDTH+:BANK_DATA_WIDTH]; end end else begin @@ -221,7 +210,10 @@ module hci_core_source assign tcdm.r_ready = stream.ready; assign tcdm.req = (cs != STREAMER_IDLE) ? addr_pop.valid & stream.ready : '0; - assign tcdm.add = (cs != STREAMER_IDLE) ? {addr_pop.data[31:ELEMENT_INDEX_WIDTH],{ELEMENT_INDEX_WIDTH{1'b0}}} : '0; + if(ADDR_OFFSET == 1) + assign tcdm.add = (cs != STREAMER_IDLE) ? addr_pop.data[31:0] : '0; + else + assign tcdm.add = (cs != STREAMER_IDLE) ? {addr_pop.data[31:ADDR_OFFSET],{ADDR_OFFSET{1'b0}}} : '0; assign tcdm.wen = 1'b1; assign tcdm.be = {ELEMENTS_PER_BANK{1'b0}}; assign tcdm.data = '0; @@ -233,25 +225,32 @@ module hci_core_source assign stream.valid = enable_i & (tcdm.r_valid | stream_valid_q); // is this strictly necessary to keep the HWPE-Stream protocol? or can be avoided with a FIFO q? // assign stream.valid = enable_i & tcdm.r_valid; // is this strictly necessary to keep the HWPE-Stream protocol? or can be avoided with a FIFO q? assign addr_pop.ready = (cs != STREAMER_IDLE) ? addr_pop.valid & stream.ready & tcdm.gnt : 1'b0; + + // hwpe stream is a factor of 8 hardcoded. 
Until this is fixed have to use this + localparam int unsigned ADDR_OFFSET_BYTE = 8*((ADDR_OFFSET + 8 - 1) / 8); hwpe_stream_intf_stream #( - .DATA_WIDTH ( 8 ) // only 2 significant + .DATA_WIDTH ( ADDR_OFFSET_BYTE ) // only 2 significant ) addr_misaligned_push ( .clk ( clk_i ) ); hwpe_stream_intf_stream #( - .DATA_WIDTH ( 8 ) // only 2 significant + .DATA_WIDTH ( ADDR_OFFSET_BYTE ) // only 2 significant ) addr_misaligned_pop ( .clk ( clk_i ) ); - assign addr_misaligned_push.data = {6'b0, addr_pop.data[1:0]}; + if(ADDR_OFFSET_BYTE == ADDR_OFFSET) + assign addr_misaligned_push.data = addr_pop.data[ADDR_OFFSET-1:0]; + else + assign addr_misaligned_push.data = {{(ADDR_OFFSET_BYTE-ADDR_OFFSET){1'b0}}, addr_pop.data[ADDR_OFFSET-1:0]}; + assign addr_misaligned_push.strb = '1; assign addr_misaligned_push.valid = enable_i & tcdm.req & tcdm.gnt; // BEWARE: considered always ready!!! assign addr_misaligned_pop.ready = (tcdm.r_valid | stream_valid_q) & stream.ready; - assign addr_misaligned_q = addr_misaligned_pop.data[1:0]; + assign addr_misaligned_q = addr_misaligned_pop.data[ADDR_OFFSET-1:0]; hwpe_stream_fifo #( - .DATA_WIDTH ( 8 ), // only [1:0] significant + .DATA_WIDTH ( ADDR_OFFSET_BYTE), // only [1:0] significant .FIFO_DEPTH ( ADDR_MIS_DEPTH ) ) i_addr_misaligned_fifo ( .clk_i ( clk_i ), From d2762784f4fce686d1a4756ebc6aab29b92ab719 Mon Sep 17 00:00:00 2001 From: Lionnus Kesting Date: Thu, 5 Feb 2026 11:58:13 +0100 Subject: [PATCH 4/5] :bug: Extract full data stream instead of bankwidth of data for misaligned access --- rtl/core/hci_core_source.sv | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/rtl/core/hci_core_source.sv b/rtl/core/hci_core_source.sv index c393a96..acae07d 100644 --- a/rtl/core/hci_core_source.sv +++ b/rtl/core/hci_core_source.sv @@ -121,6 +121,7 @@ module hci_core_source localparam int unsigned DATA_WIDTH = `HCI_SIZE_GET_DW(tcdm); localparam int unsigned EHW = `HCI_SIZE_GET_EHW(tcdm); + localparam int unsigned 
STREAM_MISALIGNED_DW = DATA_WIDTH - BANK_DATA_WIDTH; hci_streamer_state_t cs, ns; flags_fifo_t addr_fifo_flags; @@ -194,14 +195,14 @@ module hci_core_source // this is simply exploiting the fact that we can make a wider data access than strictly necessary! assign stream_data_misaligned = tcdm.r_valid ? tcdm.r_data : stream_data_q; // is this strictly necessary to keep the HWPE-Stream protocol? or can be avoided with a FIFO q? - if (MISALIGNED_ACCESSES==1 ) begin : missaligned_access_gen - always_comb - begin + if (MISALIGNED_ACCESSES==1) begin : misaligned_access_gen + always_comb begin stream_data_aligned = '0; - if(addr_misaligned_q == 0) + if (addr_misaligned_q == 0) stream_data_aligned[DATA_WIDTH-1:0] = stream_data_misaligned[DATA_WIDTH-1:0]; - else - stream_data_aligned[0+:BANK_DATA_WIDTH] = stream_data_misaligned[addr_misaligned_q*ELEMENT_WIDTH+:BANK_DATA_WIDTH]; + else begin + stream_data_aligned[STREAM_MISALIGNED_DW-1:0] = stream_data_misaligned[addr_misaligned_q*ELEMENT_WIDTH +: STREAM_MISALIGNED_DW]; + end end end else begin From e7a0ebd219efd819bb5bc91ea1b24fdcfcfde6c6 Mon Sep 17 00:00:00 2001 From: Lionnus Kesting Date: Wed, 11 Feb 2026 15:37:21 +0100 Subject: [PATCH 5/5] :art: Fix style issues --- rtl/core/hci_core_source.sv | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rtl/core/hci_core_source.sv b/rtl/core/hci_core_source.sv index acae07d..bd718fe 100644 --- a/rtl/core/hci_core_source.sv +++ b/rtl/core/hci_core_source.sv @@ -196,10 +196,12 @@ module hci_core_source assign stream_data_misaligned = tcdm.r_valid ? tcdm.r_data : stream_data_q; // is this strictly necessary to keep the HWPE-Stream protocol? or can be avoided with a FIFO q? 
if (MISALIGNED_ACCESSES==1) begin : misaligned_access_gen - always_comb begin + always_comb + begin stream_data_aligned = '0; - if (addr_misaligned_q == 0) + if (addr_misaligned_q == 0) begin stream_data_aligned[DATA_WIDTH-1:0] = stream_data_misaligned[DATA_WIDTH-1:0]; + end else begin stream_data_aligned[STREAM_MISALIGNED_DW-1:0] = stream_data_misaligned[addr_misaligned_q*ELEMENT_WIDTH +: STREAM_MISALIGNED_DW]; end