From de6e3af14b99440d107ab7aa6d8791321ee4c357 Mon Sep 17 00:00:00 2001 From: Arpan Suravi Prasad Date: Thu, 31 Jul 2025 10:33:59 +0200 Subject: [PATCH 1/5] [new-addrgen] needed for the input feature --- Bender.yml | 1 + rtl/basic/hwpe_stream_serialize.sv | 4 +- rtl/hwpe_stream_interfaces.sv | 8 +- rtl/hwpe_stream_package.sv | 19 ++ rtl/streamer/hwpe_stream_addressgen_v4.sv | 346 ++++++++++++++++++++++ 5 files changed, 374 insertions(+), 4 deletions(-) create mode 100644 rtl/streamer/hwpe_stream_addressgen_v4.sv diff --git a/Bender.yml b/Bender.yml index 0c18234..cd11e60 100644 --- a/Bender.yml +++ b/Bender.yml @@ -32,6 +32,7 @@ sources: - rtl/streamer/hwpe_stream_addressgen.sv - rtl/streamer/hwpe_stream_addressgen_v2.sv - rtl/streamer/hwpe_stream_addressgen_v3.sv + - rtl/streamer/hwpe_stream_addressgen_v4.sv - rtl/streamer/hwpe_stream_sink_realign.sv - rtl/streamer/hwpe_stream_source_realign.sv - rtl/streamer/hwpe_stream_strbgen.sv diff --git a/rtl/basic/hwpe_stream_serialize.sv b/rtl/basic/hwpe_stream_serialize.sv index ed8c388..877522b 100644 --- a/rtl/basic/hwpe_stream_serialize.sv +++ b/rtl/basic/hwpe_stream_serialize.sv @@ -61,6 +61,8 @@ module hwpe_stream_serialize #( parameter int unsigned NB_IN_STREAMS = 2, parameter int unsigned CONTIG_LIMIT = 1024, parameter int unsigned DATA_WIDTH = 32, + parameter int unsigned ELEMENT_WIDTH = 8, + localparam int unsigned NUM_ELEMENTS = DATA_WIDTH/ELEMENT_WIDTH, parameter logic SYNC_READY = 1'b0 ) ( @@ -80,7 +82,7 @@ module hwpe_stream_serialize #( // boilerplate for SystemVerilog compliance logic [NB_IN_STREAMS-1:0][DATA_WIDTH-1:0] push_data; logic [NB_IN_STREAMS-1:0] push_valid; - logic [NB_IN_STREAMS-1:0][DATA_WIDTH/8-1:0] push_strb; + logic [NB_IN_STREAMS-1:0][NUM_ELEMENTS-1:0] push_strb; logic [NB_IN_STREAMS-1:0] push_ready; generate diff --git a/rtl/hwpe_stream_interfaces.sv b/rtl/hwpe_stream_interfaces.sv index 2314290..fae0e2d 100644 --- a/rtl/hwpe_stream_interfaces.sv +++ b/rtl/hwpe_stream_interfaces.sv @@ -20,6 
+20,8 @@ interface hwpe_stream_intf_tcdm ( input logic clk ); + parameter int unsigned DATA_WIDTH = 32; // used to default to -1 and always overridden --> not well supported by some tools + parameter int unsigned STRB_WIDTH = DATA_WIDTH/8; `ifndef SYNTHESIS // the TRVR assert is disabled by default, as it is only valid for zero-latency // accesses (e.g. using FIFO queues breaks this assumption) @@ -30,9 +32,9 @@ interface hwpe_stream_intf_tcdm ( logic gnt; logic [31:0] add; logic wen; - logic [3:0] be; - logic [31:0] data; - logic [31:0] r_data; + logic [STRB_WIDTH-1:0] be; + logic [DATA_WIDTH-1:0] data; + logic [DATA_WIDTH-1:0] r_data; logic r_valid; modport master ( diff --git a/rtl/hwpe_stream_package.sv b/rtl/hwpe_stream_package.sv index a42fcc2..82b132d 100644 --- a/rtl/hwpe_stream_package.sv +++ b/rtl/hwpe_stream_package.sv @@ -64,6 +64,21 @@ package hwpe_stream_package; logic [2:0] dim_enable_1h; } ctrl_addressgen_v3_t; + typedef struct packed { + logic [31:0] base_addr; + logic [31:0] tot_len; // former word_length + logic [31:0] d0_len; // former line_length + logic signed [31:0] d0_stride; // former word_stride + logic [31:0] d1_len; // former block_length + logic signed [31:0] d1_stride; // former line_stride + logic [31:0] d2_len; + logic signed [31:0] d2_stride; // former block_stride + logic [31:0] d3_len; + logic signed [31:0] d3_stride; + logic signed [31:0] d4_stride; + logic [3:0] dim_enable_1h; + } ctrl_addressgen_v4_t; + typedef struct packed { logic enable; logic strb_valid; @@ -94,6 +109,10 @@ package hwpe_stream_package; logic done; } flags_addressgen_v3_t; + typedef struct packed { + logic done; + } flags_addressgen_v4_t; + typedef struct packed { logic empty; logic full; diff --git a/rtl/streamer/hwpe_stream_addressgen_v4.sv b/rtl/streamer/hwpe_stream_addressgen_v4.sv new file mode 100644 index 0000000..4f66a51 --- /dev/null +++ b/rtl/streamer/hwpe_stream_addressgen_v4.sv @@ -0,0 +1,346 @@ +/* + * hwpe_stream_addressgen_v3.sv + * Francesco 
Conti + * + * Copyright (C) 2014-2020 ETH Zurich, University of Bologna + * Copyright and related rights are licensed under the Solderpad Hardware + * License, Version 0.51 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law + * or agreed to in writing, software, hardware and materials distributed under + * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ + +/** + * The **hwpe_stream_addressgen_v4** module is used to generate addresses to + * load or store a HWPE-Stream stream. In this version of the address generator, + * the address is itself carried within a HWPE-Stream, making it easily stallable. + * The address generator can be used to generate addresses from a + * five-dimensional space, which can be visited with configurable strides in all + * five dimensions. + * + * The multiple loop functionality is partially overlapped by the functionality + * provided by the microcode processor `hwce_ctrl_ucode` that can be embedded + * in HWPEs. The latter is much more flexible and smaller, but less fast. + * + * One iteration is performed per cycle when `enable_i` is 1 and the output + * `addr_o` stream is ready. `presample_i` should be 1 in the first cycle in which + * the address generator can start generating addresses, and no further. + * The following piece of pseudo-C code (written for hwpe_stream_addressgen_v3, which v4 extends by one more dimension) summarizes the basic functionality provided by + * the address generator. + * + * .. 
code-block:: C + * + * hwpe_stream_addressgen_v3( + * int base_addr, // base address (byte-aligned) + * int d0_len, int d1_len, int tot_len // d0,d1,total length (in number of transactions) + * int d0_stride, int d1_stride, int d2_stride, // d0,d1,d2 strides (in bytes) + * int *d0_addr, int *d1_addr, int *d2_addr, // d0,d1,d2 addresses (by reference) + * int *d0_cnt, int *d1_cnt, int *ov_cnt // d0,d1,overall counters (by reference) + * ) { + * // compute current address + * int current_addr = 0; + * int done = 0; + * if (dim_enable & 0x1 == 0) { // 1-dimensional streaming + * current_addr = base_addr + *d0_addr; + * } + * else if(dim_enable & 0x2 == 0) { // 2-dimensional streaming + * current_addr = base_addr + *d1_addr + *d0_addr; + * } + * else { // 3-dimensional streaming + * current_addr = base_addr + *d2_addr + *d1_addr + *d0_addr; + * } + * // update counters and dimensional addresses + * if(*ov_cnt == tot_len) { + * done = 1; + * } + * if((*d0_cnt < d0_len) || (dim_enable & 0x1 == 0)) { + * *d0_addr = *d0_addr + d0_stride; + * *d0_cnt = *d0_cnt + 1; + * } + * else if ((*d1_cnt < d1_len) || (dim_enable & 0x2 == 0)) { + * *d0_addr = 0; + * *d1_addr = *d1_addr + d1_stride; + * *d0_cnt = 1; + * *d1_cnt = *d1_cnt + 1; + * } + * else if ((*d2_cnt < d2_len) || (dim_enable & 0x4 == 0)) { + * *d0_addr = 0; + * *d1_addr = 0; + * *d2_addr = *d2_addr + d2_stride; + * *d0_cnt = 1; + * *d1_cnt = 1; + * *d2_cnt = *d2_cnt + 1; + * } + * else { + * *d0_addr = 0; + * *d1_addr = 0; + * *d2_addr = 0; + * *d3_addr = *d3_addr + d3_stride; + * *d0_cnt = 1; + * *d1_cnt = 1; + * *d2_cnt = 1; + * } + * *ov_cnt = *ov_cnt + 1; + * return current_addr, done; + * } + * + * .. tabularcolumns:: |l|l|J| + * .. _hwpe_stream_addressgen_v3_params: + * .. table:: **hwpe_stream_addressgen_v3** design-time parameters. 
+ * + * +-------------------------+------------------------------------+---------------------------------------------------------------------------------------------+ + * | **Name** | **Default** | **Description** | + * +-------------------------+------------------------------------+---------------------------------------------------------------------------------------------+ + * | *TRANS_CNT* | 32 | Number of bits supported in the transaction counter, which will overflow at 2^ `TRANS_CNT`. | + * +-------------------------+------------------------------------+---------------------------------------------------------------------------------------------+ + * | *CNT* | 32 | Number of bits supported in non-transaction counters, which will overflow at 2^ `CNT`. | + * +-------------------------+------------------------------------+---------------------------------------------------------------------------------------------+ + * + * .. tabularcolumns:: |l|l|J| + * .. _hwpe_stream_addressgen_v3_ctrl: + * .. table:: **hwpe_stream_addressgen_v3** input control signals. + * + * +----------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------+ + * | **Name** | **Type** | **Description** | + * +----------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------+ + * | *base_addr* | `logic[31:0]` | Byte-aligned base address of the stream in the HWPE-accessible memory. | + * +----------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------+ + * | *tot_len* | `logic[31:0]` | Total number of transactions in stream; only the `TRANS_CNT` LSB are actually used. 
| + * +----------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------+ + * | *d0_len* | `logic[31:0]` | d0 length in number of transactions | + * +----------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------+ + * | *d0_stride* | `logic[31:0]` | d0 stride in bytes | + * +----------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------+ + * | *d0_len* | `logic[31:0]` | d0 length in number of transactions | + * +----------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------+ + * | *d1_stride* | `logic[31:0]` | d1 stride in bytes | + * +----------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------+ + * | *d1_len* | `logic[31:0]` | d1 length in number of transactions | + * +----------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------+ + * | *d2_stride* | `logic[31:0]` | d2 stride in bytes | + * +----------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------+ + * | *d2_len* | `logic[31:0]` | d2 length in number of transactions | + * +----------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------+ + * | *d3_stride* | `logic[31:0]` | d3 stride in bytes | + * * 
+----------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------+ + * | *dim_enable_1h* | `logic[2:0]` | One-hot switch to enable 4-d counting (111), 3-d (011), 2-d (001), or 1-d (000). | + * +----------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------+ + * + * .. tabularcolumns:: |l|l|J| + * .. _hwpe_stream_addressgen_v3_flags: + * .. table:: **hwpe_stream_addressgen_v3** output flags. + * + * +-----------------+------------------+-----------------------------------------------+ + * | **Name** | **Type** | **Description** | + * +-----------------+------------------+-----------------------------------------------+ + * | *done* | `logic` | 1 when the address generation has finished. | + * +-----------------+------------------+-----------------------------------------------+ + * + */ + + +import hwpe_stream_package::*; + +module hwpe_stream_addressgen_v4 +#( + parameter int unsigned TRANS_CNT = 32, + parameter int unsigned CNT = 32, // number of bits used within the internal counter + parameter bit [3:0] DIM_ENABLE_1H = 4'b1111 // Number of dimensions enabled on HW side +) +( + // global signals + input logic clk_i, + input logic rst_ni, + // local enable and clear + input logic enable_i, + input logic clear_i, + input logic presample_i, + // generated output address + hwpe_stream_intf_stream.source addr_o, + // control channel + input ctrl_addressgen_v4_t ctrl_i, + output flags_addressgen_v4_t flags_o +); + + logic signed [31:0] d0_stride; + logic signed [31:0] d1_stride; + logic signed [31:0] d2_stride; + logic signed [31:0] d3_stride; + logic signed [31:0] d4_stride; + + + + logic [31:0] gen_addr_int; + logic done; + + logic [TRANS_CNT-1:0] overall_counter_d; + logic [CNT-1:0] d0_counter_d; + logic [CNT-1:0] d1_counter_d; + logic [CNT-1:0] d2_counter_d; + 
logic [CNT-1:0] d3_counter_d; + logic [CNT-1:0] d4_counter_d; + logic [31:0] d0_addr_d; + logic [31:0] d1_addr_d; + logic [31:0] d2_addr_d; + logic [31:0] d3_addr_d; + logic [31:0] d4_addr_d; + logic [TRANS_CNT-1:0] overall_counter_q; + logic [CNT-1:0] d0_counter_q; + logic [CNT-1:0] d1_counter_q; + logic [CNT-1:0] d2_counter_q; + logic [CNT-1:0] d3_counter_q; + logic [CNT-1:0] d4_counter_q; + logic [31:0] d0_addr_q; + logic [31:0] d1_addr_q; + logic [31:0] d2_addr_q; + logic [31:0] d3_addr_q; + logic [31:0] d4_addr_q; + + logic addr_valid_d, addr_valid_q; + + assign d0_stride = $signed(ctrl_i.d0_stride); + assign d1_stride = $signed(ctrl_i.d1_stride); + assign d2_stride = $signed(ctrl_i.d2_stride); + assign d3_stride = $signed(ctrl_i.d3_stride); + assign d4_stride = $signed(ctrl_i.d4_stride); + + // address generation + always_comb + begin : address_gen_counters_comb + d0_addr_d = d0_addr_q; + d1_addr_d = d1_addr_q; + d2_addr_d = d2_addr_q; + d3_addr_d = d3_addr_q; + d4_addr_d = d4_addr_q; + d0_counter_d = d0_counter_q; + d1_counter_d = d1_counter_q; + d2_counter_d = d2_counter_q; + d3_counter_d = d3_counter_q; + d4_counter_d = d4_counter_q; + overall_counter_d = overall_counter_q; + addr_valid_d = addr_valid_q; + done = '0; + if(addr_o.ready) begin + if(overall_counter_q < ctrl_i.tot_len) begin + addr_valid_d = 1'b1; + if((d0_counter_q < ctrl_i.d0_len) || (ctrl_i.dim_enable_1h[0] == 1'b0) || (DIM_ENABLE_1H[0] == 1'b0)) begin + d0_addr_d = d0_addr_q + d0_stride; + d0_counter_d = d0_counter_q + 1; + end + else if ((d1_counter_q < ctrl_i.d1_len) || (ctrl_i.dim_enable_1h[1] == 1'b0) || (DIM_ENABLE_1H[1] == 1'b0)) begin + d0_addr_d = '0; + d1_addr_d = d1_addr_q + d1_stride; + d0_counter_d = 1; + d1_counter_d = d1_counter_q + 1; + end + else if ((d2_counter_q < ctrl_i.d2_len) || (ctrl_i.dim_enable_1h[2] == 1'b0) || (DIM_ENABLE_1H[2] == 1'b0)) begin + d0_addr_d = '0; + d1_addr_d = '0; + d2_addr_d = d2_addr_q + d2_stride; + d0_counter_d = 1; + d1_counter_d = 1; + 
d2_counter_d = d2_counter_q + 1; + end + else if ((d3_counter_q < ctrl_i.d3_len) || (ctrl_i.dim_enable_1h[3] == 1'b0) || (DIM_ENABLE_1H[3] == 1'b0)) begin + d0_addr_d = '0; + d1_addr_d = '0; + d2_addr_d = '0; + d3_addr_d = d3_addr_q + d3_stride; + d0_counter_d = 1; + d1_counter_d = 1; + d2_counter_d = 1; + d3_counter_d = d3_counter_q + 1; + end + else begin + d0_addr_d = '0; + d1_addr_d = '0; + d2_addr_d = '0; + d3_addr_d = '0; + d4_addr_d = d4_addr_q + d4_stride; + d0_counter_d = 1; + d1_counter_d = 1; + d2_counter_d = 1; + d3_counter_d = 1; + d4_counter_d = d4_counter_q + 1; + end + overall_counter_d = overall_counter_q + 1; + end + else begin + addr_valid_d = 1'b0; + done = 1'b1; + end + end + end + + // address generation + always_ff @(posedge clk_i or negedge rst_ni) + begin : address_gen_counters_d0_ff + if (~rst_ni) begin + d0_addr_q <= '0; + end + else if (clear_i) begin + d0_addr_q <= '0; + end + else if (presample_i) begin + d0_addr_q <= '0; + end + else if (enable_i) begin + d0_addr_q <= d0_addr_d; + end + end + + always_ff @(posedge clk_i or negedge rst_ni) + begin : address_gen_counters_ff + if (~rst_ni) begin + d1_addr_q <= '0; + d2_addr_q <= '0; + d3_addr_q <= '0; + d4_addr_q <= '0; + d0_counter_q <= '0; + d1_counter_q <= 1; + d2_counter_q <= 1; + d3_counter_q <= 1; + d4_counter_q <= 1; + overall_counter_q <= '0; + addr_valid_q <= '0; + end + else if (clear_i) begin + d1_addr_q <= '0; + d2_addr_q <= '0; + d3_addr_q <= '0; + d4_addr_q <= '0; + d0_counter_q <= '0; + d1_counter_q <= 1; + d2_counter_q <= 1; + d3_counter_q <= 1; + d4_counter_q <= 1; + overall_counter_q <= '0; + addr_valid_q <= '0; + end + else if(enable_i) begin + d1_addr_q <= d1_addr_d; + d2_addr_q <= d2_addr_d; + d3_addr_q <= d3_addr_d; + d4_addr_q <= d4_addr_d; + d0_counter_q <= d0_counter_d; + d1_counter_q <= d1_counter_d; + d2_counter_q <= d2_counter_d; + d3_counter_q <= d3_counter_d; + d4_counter_q <= d4_counter_d; + overall_counter_q <= overall_counter_d; + addr_valid_q <= 
addr_valid_d; + end + end + + assign gen_addr_int = ctrl_i.base_addr + d4_addr_q + d3_addr_q + d2_addr_q + d1_addr_q + d0_addr_q; + + assign addr_o.data = gen_addr_int; + assign addr_o.strb = '1; + assign addr_o.valid = addr_valid_q; + + assign flags_o.done = done; + +endmodule // hwpe_stream_addressgen_v4 From 14c0c0c2bb1cdf57a49dcecadcc4e63e1dd96a2d Mon Sep 17 00:00:00 2001 From: Arpan Suravi Prasad Date: Sat, 30 Aug 2025 21:24:29 +0200 Subject: [PATCH 2/5] [asymmetric-fence] This is different from the generic fence. First the widths are different Second, this fence could be bypassed. this is used in conjunction with weight and normalization streams. In the last tile the streams are fenced otherwise it is bypassed allowing only weight to propagate --- Bender.yml | 1 + rtl/basic/hwpe_stream_fence_asymmetric.sv | 141 ++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 rtl/basic/hwpe_stream_fence_asymmetric.sv diff --git a/Bender.yml b/Bender.yml index cd11e60..f5c1501 100644 --- a/Bender.yml +++ b/Bender.yml @@ -23,6 +23,7 @@ sources: - rtl/basic/hwpe_stream_demux_static.sv - rtl/basic/hwpe_stream_deserialize.sv - rtl/basic/hwpe_stream_fence.sv + - rtl/basic/hwpe_stream_fence_asymmetric.sv - rtl/basic/hwpe_stream_merge.sv - rtl/basic/hwpe_stream_mux_static.sv - rtl/basic/hwpe_stream_serialize.sv diff --git a/rtl/basic/hwpe_stream_fence_asymmetric.sv b/rtl/basic/hwpe_stream_fence_asymmetric.sv new file mode 100644 index 0000000..d13142b --- /dev/null +++ b/rtl/basic/hwpe_stream_fence_asymmetric.sv @@ -0,0 +1,141 @@ +/* + * hwpe_stream_fence_asymmetric.sv + * Arpan Suravi Prasad + * + * Copyright (C) 2014-2018 ETH Zurich, University of Bologna + * Copyright and related rights are licensed under the Solderpad Hardware + * License, Version 0.51 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * http://solderpad.org/licenses/SHL-0.51. 
Unless required by applicable law + * or agreed to in writing, software, hardware and materials distributed under + * this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ + +/** + * The **hwpe_stream_fence_asymmetric** module is used to synchronize the handshake between + * `NB_STREAMS` streams. + * This is necessary, for example, when two asymmetric (different data width) streams are produced + * from separate TCDM accesses and have to be joined into a single, wider + * stream. When `bypass_i` is 1 the streams are not synchronized and each one propagates independently of the other. + * + * .. _wavedrom_hwpe_stream_fence_asymmetric: + * .. wavedrom:: wavedrom/hwpe_stream_fence.json + * :width: 85 % + * :caption: Example of **hwpe_stream_fence** operation. + * + * .. tabularcolumns:: |l|l|J| + * .. _hwpe_stream_fence_asymmetric_params: + * .. table:: **hwpe_stream_fence_asymmetric** design-time parameters. + * + * +------------------+-------------+---------------------------------------------+ + * | **Name** | **Default** | **Description** | + * +------------------+-------------+---------------------------------------------+ + * | *NB_STREAMS* | 2 | Number of input/output HWPE-Stream streams. | + * +------------------+-------------+---------------------------------------------+ + * | *DATA_WIDTH* | 32 | Width of the HWPE-Stream streams. 
| + * +------------------+-------------+---------------------------------------------+ + */ + +import hwpe_stream_package::*; + +module hwpe_stream_fence_asymmetric #( + localparam int unsigned NB_STREAMS = 2, + parameter int unsigned ELEM_WIDTH_0 = 32, + parameter int unsigned ELEM_WIDTH_1 = 32, + parameter int unsigned DATA_WIDTH_0 = 32, + parameter int unsigned DATA_WIDTH_1 = 32 +) +( + input logic clk_i, + input logic rst_ni, + input logic clear_i, + input logic bypass_i, + + hwpe_stream_intf_stream.sink push_0_i, + hwpe_stream_intf_stream.sink push_1_i, + hwpe_stream_intf_stream.source pop_0_o, + hwpe_stream_intf_stream.source pop_1_o +); + + logic [NB_STREAMS-1:0] in_valid; + logic out_valid; + logic [NB_STREAMS-1:0] fence_state_q, fence_state_d; + logic [DATA_WIDTH_0-1:0] data_0_q; + logic [DATA_WIDTH_1-1:0] data_1_q; + logic [DATA_WIDTH_0/ELEM_WIDTH_0-1:0] strb_0_q; + logic [DATA_WIDTH_1/ELEM_WIDTH_1-1:0] strb_1_q; + + logic [NB_STREAMS-1:0] in_strm_hs, out_strm_hs; + + assign in_strm_hs[0] = push_0_i.ready & push_0_i.valid; + assign in_strm_hs[1] = push_1_i.ready & push_1_i.valid; + + assign out_strm_hs[0] = pop_0_o.ready & pop_0_o.valid; + assign out_strm_hs[1] = pop_1_o.ready & pop_1_o.valid; + + assign in_valid[0] = push_0_i.valid; + assign in_valid[1] = push_1_i.valid; + + assign push_0_i.ready = bypass_i ? fence_state_q[0] ? 1'b0 : pop_0_o.ready : pop_0_o.ready & (~fence_state_q[0] | out_valid); + assign push_1_i.ready = bypass_i ? fence_state_q[1] ? 1'b0 : pop_1_o.ready : pop_1_o.ready & (~fence_state_q[1] | out_valid); + + assign pop_0_o.valid = bypass_i ? fence_state_q[0] ? 1'b1 : push_0_i.valid : out_valid; + assign pop_1_o.valid = bypass_i ? fence_state_q[1] ? 1'b1 : push_1_i.valid : out_valid; + + assign pop_0_o.data = bypass_i ? (fence_state_q[0] ? data_0_q : push_0_i.data) : fence_state_q[0] ? data_0_q : push_0_i.data; + assign pop_1_o.data = bypass_i ? (fence_state_q[1] ? data_1_q : push_1_i.data) : fence_state_q[1] ? 
data_1_q : push_1_i.data; + + assign pop_0_o.strb = bypass_i ? fence_state_q[0] ? strb_0_q : push_0_i.strb : fence_state_q[0] ? strb_0_q : push_0_i.strb; + assign pop_1_o.strb = bypass_i ? fence_state_q[1] ? strb_1_q : push_1_i.strb : fence_state_q[1] ? strb_1_q : push_1_i.strb; + + always_ff @(posedge clk_i or negedge rst_ni) + begin + if(~rst_ni) begin + data_0_q <= '0; + data_1_q <= '0; + strb_0_q <= '0; + strb_1_q <= '0; + end else if(clear_i) begin + data_0_q <= '0; + data_1_q <= '0; + strb_0_q <= '0; + strb_1_q <= '0; + end else begin + data_0_q <= bypass_i ? fence_state_q[0] ? data_0_q: '0 : (in_valid[0] && ~fence_state_q[0]) | (fence_state_q[0] & in_strm_hs[0]) ? push_0_i.data : data_0_q; + data_1_q <= bypass_i ? fence_state_q[1] ? data_1_q: '0 : (in_valid[1] && ~fence_state_q[1]) | (fence_state_q[1] & in_strm_hs[1]) ? push_1_i.data : data_1_q; + strb_0_q <= bypass_i ? fence_state_q[0] ? strb_0_q: '0 : (in_valid[0] && ~fence_state_q[0]) | (fence_state_q[0] & in_strm_hs[0]) ? push_0_i.strb : strb_0_q; + strb_1_q <= bypass_i ? fence_state_q[1] ? strb_1_q: '0 : (in_valid[1] && ~fence_state_q[1]) | (fence_state_q[1] & in_strm_hs[1]) ? push_1_i.strb : strb_1_q; + end + end + + + always_comb + begin + out_valid = 1'b0; + fence_state_d[0] = bypass_i ? (fence_state_q[0] & out_strm_hs[0] ? 1'b0 : fence_state_q[0]): in_strm_hs[0] ? fence_state_q[0] : out_strm_hs[0] ?'0 : fence_state_q[0]; + fence_state_d[1] = bypass_i ? (fence_state_q[1] & out_strm_hs[1] ? 1'b0 : fence_state_q[1]): in_strm_hs[1] ? fence_state_q[1] : out_strm_hs[1] ?'0 : fence_state_q[1]; + if(&(in_valid | fence_state_q)) + out_valid = bypass_i ? 1'b0 : 1'b1; + else begin + if(~bypass_i) begin + fence_state_d = fence_state_q | in_valid; + end else begin + fence_state_d[0] = fence_state_q[0] ? out_strm_hs[0] ? 1'b0 : fence_state_q[0] : 1'b0; + fence_state_d[1] = fence_state_q[1] ? out_strm_hs[1] ? 
1'b0 : fence_state_q[1] : 1'b0; + end + end + end + + always_ff @(posedge clk_i or negedge rst_ni) + begin + if(~rst_ni) + fence_state_q <= '0; + else if(clear_i) + fence_state_q <= '0; + else + fence_state_q <= fence_state_d; + end + +endmodule // hwpe_stream_fence From 460aad6994f527c1de0dd8353df38d31fff10f55 Mon Sep 17 00:00:00 2001 From: Arpan Suravi Prasad Date: Sun, 31 Aug 2025 00:59:24 +0200 Subject: [PATCH 3/5] [fix] fix to the fence module. Cleaner implementation --- rtl/basic/hwpe_stream_fence_asymmetric.sv | 102 +++++++++++++--------- 1 file changed, 63 insertions(+), 39 deletions(-) diff --git a/rtl/basic/hwpe_stream_fence_asymmetric.sv b/rtl/basic/hwpe_stream_fence_asymmetric.sv index d13142b..8a816ce 100644 --- a/rtl/basic/hwpe_stream_fence_asymmetric.sv +++ b/rtl/basic/hwpe_stream_fence_asymmetric.sv @@ -59,13 +59,13 @@ module hwpe_stream_fence_asymmetric #( hwpe_stream_intf_stream.source pop_1_o ); - logic [NB_STREAMS-1:0] in_valid; - logic out_valid; - logic [NB_STREAMS-1:0] fence_state_q, fence_state_d; - logic [DATA_WIDTH_0-1:0] data_0_q; - logic [DATA_WIDTH_1-1:0] data_1_q; - logic [DATA_WIDTH_0/ELEM_WIDTH_0-1:0] strb_0_q; - logic [DATA_WIDTH_1/ELEM_WIDTH_1-1:0] strb_1_q; + logic [NB_STREAMS-1:0] in_valid; + logic out_valid; + logic [NB_STREAMS-1:0] fence_state_q, fence_state_d; + logic [DATA_WIDTH_0-1:0] data_0_d, data_0_q; + logic [DATA_WIDTH_1-1:0] data_1_d, data_1_q; + logic [DATA_WIDTH_0/ELEM_WIDTH_0-1:0] strb_0_d, strb_0_q; + logic [DATA_WIDTH_1/ELEM_WIDTH_1-1:0] strb_1_d, strb_1_q; logic [NB_STREAMS-1:0] in_strm_hs, out_strm_hs; @@ -78,17 +78,58 @@ module hwpe_stream_fence_asymmetric #( assign in_valid[0] = push_0_i.valid; assign in_valid[1] = push_1_i.valid; - assign push_0_i.ready = bypass_i ? fence_state_q[0] ? 1'b0 : pop_0_o.ready : pop_0_o.ready & (~fence_state_q[0] | out_valid); - assign push_1_i.ready = bypass_i ? fence_state_q[1] ? 
1'b0 : pop_1_o.ready : pop_1_o.ready & (~fence_state_q[1] | out_valid); - - assign pop_0_o.valid = bypass_i ? fence_state_q[0] ? 1'b1 : push_0_i.valid : out_valid; - assign pop_1_o.valid = bypass_i ? fence_state_q[1] ? 1'b1 : push_1_i.valid : out_valid; - - assign pop_0_o.data = bypass_i ? (fence_state_q[0] ? data_0_q : push_0_i.data) : fence_state_q[0] ? data_0_q : push_0_i.data; - assign pop_1_o.data = bypass_i ? (fence_state_q[1] ? data_1_q : push_1_i.data) : fence_state_q[1] ? data_1_q : push_1_i.data; - - assign pop_0_o.strb = bypass_i ? fence_state_q[0] ? strb_0_q : push_0_i.strb : fence_state_q[0] ? strb_0_q : push_0_i.strb; - assign pop_1_o.strb = bypass_i ? fence_state_q[1] ? strb_1_q : push_1_i.strb : fence_state_q[1] ? strb_1_q : push_1_i.strb; + // Can take element if there is nothing registered or if registered there is a handshake + assign push_0_i.ready = ~fence_state_q[0] || fence_state_q[0] & out_strm_hs[0]; + assign push_1_i.ready = ~fence_state_q[1] || fence_state_q[1] & out_strm_hs[1]; + + assign out_valid = &( fence_state_q | in_valid); + + assign pop_0_o.valid = bypass_i ? push_0_i.valid | fence_state_q[0] : out_valid; + assign pop_1_o.valid = bypass_i ? push_1_i.valid | fence_state_q[1] : out_valid; + + assign pop_0_o.data = fence_state_q[0] ? data_0_q: push_0_i.data; + assign pop_1_o.data = fence_state_q[1] ? data_1_q: push_1_i.data; + + assign pop_0_o.strb = fence_state_q[0] ? strb_0_q: push_0_i.data; + assign pop_1_o.strb = fence_state_q[1] ? strb_1_q: push_1_i.strb; + + always_comb begin + fence_state_d[0] = fence_state_q[0]; + data_0_d = data_0_q; + strb_0_d = strb_0_q; + if(in_strm_hs[0] & out_strm_hs[0]) begin + fence_state_d[0] = fence_state_q[0]; + data_0_d = fence_state_q[0] ? push_0_i.data : data_0_q; + strb_0_d = fence_state_q[0] ? 
push_0_i.strb : strb_0_q; + end else if (in_strm_hs[0] & ~out_strm_hs[0]) begin + fence_state_d[0] = 1'b1; + data_0_d = push_0_i.data; + strb_0_d = push_0_i.strb; + end else if (~in_strm_hs[0] & out_strm_hs[0]) begin + fence_state_d[0] = 1'b0; + data_0_d = '0; + strb_0_d = '0; + end + end + + always_comb begin + fence_state_d[1] = fence_state_q[1]; + data_1_d = data_1_q; + strb_1_d = strb_1_q; + if(in_strm_hs[1] & out_strm_hs[1]) begin + fence_state_d[1] = fence_state_q[1]; + data_1_d = fence_state_q[1] ? push_1_i.data : data_1_q; + strb_1_d = fence_state_q[1] ? push_1_i.strb : strb_1_q; + end else if (in_strm_hs[1] & ~out_strm_hs[1]) begin + fence_state_d[1] = 1'b1; + data_1_d = push_1_i.data; + strb_1_d = push_1_i.strb; + end else if (~in_strm_hs[1] & out_strm_hs[1]) begin + fence_state_d[1] = 1'b0; + data_1_d = '0; + strb_1_d = '0; + end + end always_ff @(posedge clk_i or negedge rst_ni) begin @@ -103,31 +144,14 @@ module hwpe_stream_fence_asymmetric #( strb_0_q <= '0; strb_1_q <= '0; end else begin - data_0_q <= bypass_i ? fence_state_q[0] ? data_0_q: '0 : (in_valid[0] && ~fence_state_q[0]) | (fence_state_q[0] & in_strm_hs[0]) ? push_0_i.data : data_0_q; - data_1_q <= bypass_i ? fence_state_q[1] ? data_1_q: '0 : (in_valid[1] && ~fence_state_q[1]) | (fence_state_q[1] & in_strm_hs[1]) ? push_1_i.data : data_1_q; - strb_0_q <= bypass_i ? fence_state_q[0] ? strb_0_q: '0 : (in_valid[0] && ~fence_state_q[0]) | (fence_state_q[0] & in_strm_hs[0]) ? push_0_i.strb : strb_0_q; - strb_1_q <= bypass_i ? fence_state_q[1] ? strb_1_q: '0 : (in_valid[1] && ~fence_state_q[1]) | (fence_state_q[1] & in_strm_hs[1]) ? push_1_i.strb : strb_1_q; + data_0_q <= data_0_d; + data_1_q <= data_1_d; + strb_0_q <= strb_0_d; + strb_1_q <= strb_1_d; end end - always_comb - begin - out_valid = 1'b0; - fence_state_d[0] = bypass_i ? (fence_state_q[0] & out_strm_hs[0] ? 1'b0 : fence_state_q[0]): in_strm_hs[0] ? 
fence_state_q[0] : out_strm_hs[0] ?'0 : fence_state_q[0]; - fence_state_d[1] = bypass_i ? (fence_state_q[1] & out_strm_hs[1] ? 1'b0 : fence_state_q[1]): in_strm_hs[1] ? fence_state_q[1] : out_strm_hs[1] ?'0 : fence_state_q[1]; - if(&(in_valid | fence_state_q)) - out_valid = bypass_i ? 1'b0 : 1'b1; - else begin - if(~bypass_i) begin - fence_state_d = fence_state_q | in_valid; - end else begin - fence_state_d[0] = fence_state_q[0] ? out_strm_hs[0] ? 1'b0 : fence_state_q[0] : 1'b0; - fence_state_d[1] = fence_state_q[1] ? out_strm_hs[1] ? 1'b0 : fence_state_q[1] : 1'b0; - end - end - end - always_ff @(posedge clk_i or negedge rst_ni) begin if(~rst_ni) From 40ab0fe1433dc080a49aa1926ce09df9ab3f5fb5 Mon Sep 17 00:00:00 2001 From: Arpan Suravi Prasad Date: Tue, 21 Oct 2025 15:21:22 +0200 Subject: [PATCH 4/5] [parametric] Element width is added as a parameter for multi precision support --- rtl/fifo/hwpe_stream_fifo.sv | 17 ++++++++++------- rtl/hwpe_stream_interfaces.sv | 8 +++++--- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/rtl/fifo/hwpe_stream_fifo.sv b/rtl/fifo/hwpe_stream_fifo.sv index 5e5a3e4..339b2bc 100644 --- a/rtl/fifo/hwpe_stream_fifo.sv +++ b/rtl/fifo/hwpe_stream_fifo.sv @@ -67,6 +67,7 @@ import hwpe_stream_package::*; module hwpe_stream_fifo #( parameter int unsigned DATA_WIDTH = 32, + parameter int unsigned ELEMENT_WIDTH = 8, parameter int unsigned FIFO_DEPTH = 8, parameter int unsigned LATCH_FIFO = 0, parameter int unsigned LATCH_FIFO_TEST_WRAP = 0 @@ -81,6 +82,8 @@ module hwpe_stream_fifo #( hwpe_stream_intf_stream.sink push_i, hwpe_stream_intf_stream.source pop_o ); + localparam int unsigned STRB_WIDTH = DATA_WIDTH / ELEMENT_WIDTH; + localparam int unsigned DATA_STRB_WIDTH = DATA_WIDTH + STRB_WIDTH; // Local Parameter localparam ADDR_DEPTH = (FIFO_DEPTH==1) ? 
1 : $clog2(FIFO_DEPTH); @@ -90,7 +93,7 @@ module hwpe_stream_fifo #( logic [ADDR_DEPTH-1:0] pop_pointer_q, pop_pointer_d; logic [ADDR_DEPTH-1:0] push_pointer_q, push_pointer_d; - logic [DATA_WIDTH+DATA_WIDTH/8-1:0] fifo_registers[FIFO_DEPTH-1:0]; + logic [DATA_STRB_WIDTH-1:0] fifo_registers[FIFO_DEPTH-1:0]; integer i; assign flags_o.empty = (cs == EMPTY) ? 1'b1 : 1'b0; @@ -232,8 +235,8 @@ module hwpe_stream_fifo #( endcase end - logic [DATA_WIDTH+DATA_WIDTH/8-1:0] data_out_int; - logic [DATA_WIDTH+DATA_WIDTH/8-1:0] data_in_int; + logic [DATA_STRB_WIDTH-1:0] data_out_int; + logic [DATA_STRB_WIDTH-1:0] data_in_int; generate if(LATCH_FIFO == 0) begin : fifo_ff_gen @@ -263,7 +266,7 @@ module hwpe_stream_fifo #( hwpe_stream_fifo_scm #( .ADDR_WIDTH ( ADDR_DEPTH ), - .DATA_WIDTH ( DATA_WIDTH + DATA_WIDTH/8 ) + .DATA_WIDTH ( DATA_STRB_WIDTH ) ) i_fifo_latch ( .clk ( clk_i ), .rst_n ( rst_ni ), @@ -282,7 +285,7 @@ module hwpe_stream_fifo #( hwpe_stream_fifo_scm_test_wrap #( .ADDR_WIDTH ( ADDR_DEPTH ), - .DATA_WIDTH ( DATA_WIDTH + DATA_WIDTH/8 ) + .DATA_WIDTH ( DATA_STRB_WIDTH ) ) i_fifo_latch ( .clk ( clk_i ), .rst_n ( rst_ni ), @@ -303,7 +306,7 @@ module hwpe_stream_fifo #( end endgenerate - assign pop_o.data = (pop_o.valid == 1'b1) ? data_out_int[DATA_WIDTH+DATA_WIDTH/8-1:DATA_WIDTH/8] : '0; - assign pop_o.strb = (pop_o.valid == 1'b1) ? data_out_int[DATA_WIDTH/8-1:0] : '0; + assign pop_o.data = (pop_o.valid == 1'b1) ? data_out_int[DATA_STRB_WIDTH-1:STRB_WIDTH] : '0; + assign pop_o.strb = (pop_o.valid == 1'b1) ? 
data_out_int[STRB_WIDTH-1:0] : '0; endmodule // hwpe_stream_fifo diff --git a/rtl/hwpe_stream_interfaces.sv b/rtl/hwpe_stream_interfaces.sv index fae0e2d..d4c550d 100644 --- a/rtl/hwpe_stream_interfaces.sv +++ b/rtl/hwpe_stream_interfaces.sv @@ -20,8 +20,9 @@ interface hwpe_stream_intf_tcdm ( input logic clk ); - parameter int unsigned DATA_WIDTH = 32; // used to default to -1 and always overridden --> not well supported by some tools - parameter int unsigned STRB_WIDTH = DATA_WIDTH/8; + parameter int unsigned DATA_WIDTH = 32; // used to default to -1 and always overridden --> not well supported by some tools + parameter int unsigned ELEMENT_WIDTH = 8; // by default a byte as the element width + parameter int unsigned STRB_WIDTH = DATA_WIDTH/ELEMENT_WIDTH; `ifndef SYNTHESIS // the TRVR assert is disabled by default, as it is only valid for zero-latency // accesses (e.g. using FIFO queues breaks this assumption) @@ -73,7 +74,8 @@ interface hwpe_stream_intf_stream input logic clk ); parameter int unsigned DATA_WIDTH = 32; // used to default to -1 and always overridden --> not well supported by some tools - parameter int unsigned STRB_WIDTH = DATA_WIDTH/8; + parameter int unsigned ELEMENT_WIDTH = 8; // by default a byte as the element width + parameter int unsigned STRB_WIDTH = DATA_WIDTH/ELEMENT_WIDTH; `ifndef SYNTHESIS parameter bit BYPASS_VCR_ASSERT = 1'b0; parameter bit BYPASS_VDR_ASSERT = 1'b0; From bf40ede093a9714ad618c63ca748bde41d9cdf4d Mon Sep 17 00:00:00 2001 From: Arpan Suravi Prasad Date: Fri, 5 Dec 2025 09:49:47 +0100 Subject: [PATCH 5/5] [new mode] when the stream size is not a multiple of the number of streams, MODE != 0 is used. Otherwise MODE = 0 selects the previously existing default behavior. 
--- rtl/basic/hwpe_stream_serialize.sv | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/rtl/basic/hwpe_stream_serialize.sv b/rtl/basic/hwpe_stream_serialize.sv index 877522b..89b8277 100644 --- a/rtl/basic/hwpe_stream_serialize.sv +++ b/rtl/basic/hwpe_stream_serialize.sv @@ -62,6 +62,7 @@ module hwpe_stream_serialize #( parameter int unsigned CONTIG_LIMIT = 1024, parameter int unsigned DATA_WIDTH = 32, parameter int unsigned ELEMENT_WIDTH = 8, + parameter int unsigned MODE = 0, localparam int unsigned NUM_ELEMENTS = DATA_WIDTH/ELEMENT_WIDTH, parameter logic SYNC_READY = 1'b0 ) @@ -90,7 +91,7 @@ module hwpe_stream_serialize #( assign push_data [ii] = push_i[ii].data; assign push_strb [ii] = push_i[ii].strb; - assign push_valid[ii] = push_i[ii].valid; + assign push_valid[ii] = push_i[ii].valid & stream_cnt_en; assign push_i[ii].ready = push_ready[ii]; end @@ -101,9 +102,16 @@ module hwpe_stream_serialize #( assign pop_o.valid = push_valid[stream_cnt_q]; assign pop_o.strb = push_strb [stream_cnt_q]; + logic last_stream; + + if(MODE == 0) + assign last_stream = 1'b0; + else + assign last_stream = (contig_cnt_q == ctrl_i.nb_contig_m1 - 1) ? stream_cnt_en & pop_o.ready : 1'b0; + if(SYNC_READY) begin : sync_ready_gen for(genvar ii=0; ii