From 6e6f61592f0c92100452a0be8a9bb90c2df60e96 Mon Sep 17 00:00:00 2001 From: Francesco Conti Date: Thu, 5 Mar 2026 12:33:56 +0100 Subject: [PATCH 01/25] Add some README for verification env + change for non-IIS machines --- README.md | 33 +++++++++++++++++++++++++++++++++ target/verif/verif.mk | 6 +++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 690bc53..6a67cfc 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,39 @@ The `hci` repository contains the definition of the Heterogeneous Cluster Interc - https://github.com/pulp-platform/neureka - https://github.com/pulp-platform/redmule +# Usage flow + +The typical full flow is: + +``` +make checkout # Fetch and check out dependencies via Bender +make config-verif # Generate Makefiles from JSON verification configs +make stim-verif # Generate simulation stimulus vectors (requires Python 3) +make compile-verif # Compile RTL and testbench with QuestaSim +make opt-verif # Optimize the compiled design with vopt +make run-verif # Run the simulation (batch mode by default) +``` + +To open the simulation in the QuestaSim GUI with waveforms, pass `GUI=1`: + +``` +make run-verif GUI=1 +``` + +Cleanup targets: + +| Target | Effect | +|----------------------|----------------------------------------------------| +| `clean-config-verif` | Remove generated configuration Makefiles | +| `clean-stim-verif` | Remove generated stimulus vectors | +| `clean-sim-verif` | Remove QuestaSim build artifacts (work lib, logs) | +| `clean-verif` | Run all three clean targets above | + +**Notes:** +- On IIS machines, defaults to QuestaSim (`questa-2022.3`) (can be overriden with `SIM_QUESTA=`). On non-IIS machines, defaults to QuestaSim available in `PATH`. +- Verification configuration is driven by JSON files under `target/verif/config/`. Edit those before running `config-verif` and `stim-verif`. 
+- `run-verif` depends on `opt-verif` and `stim-verif`, so after `checkout` and `config-verif` you can jump straight to it. + # Style guide These IPs use a slightly different style than other PULP IPs. Refer to `STYLE.md` for some indications. diff --git a/target/verif/verif.mk b/target/verif/verif.mk index dd656b0..bf324d3 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -17,7 +17,11 @@ include $(HCI_VERIF_DIR)/bender.mk # Tooling #NOTE: Only QuestaSim is currently supported by verification framework -SIM_QUESTA ?= questa-2022.3 +ifneq (,$(wildcard /etc/iis.version)) + SIM_QUESTA ?= questa-2022.3 +else + SIM_QUESTA ?= +endif SIM_VLIB ?= $(SIM_QUESTA) vlib SIM_VSIM ?= $(SIM_QUESTA) vsim SIM_VOPT ?= $(SIM_QUESTA) vopt From b00d0b151412741a707d03fd54b54768be221ef0 Mon Sep 17 00:00:00 2001 From: Francesco Conti Date: Thu, 5 Mar 2026 16:26:00 +0100 Subject: [PATCH 02/25] Add some (reasonable) waivers + minor cleanup --- README.md | 3 +- rtl/hci_interconnect.sv | 20 +++++++++- rtl/interco/hci_arbiter.sv | 68 +++++++++++++++++++-------------- rtl/interco/hci_arbiter_tree.sv | 21 +++++++++- rtl/interco/hci_router.sv | 35 +++++++++++------ target/verif/verif.mk | 6 ++- target/verif/vsim/tb_hci.tcl | 4 +- 7 files changed, 108 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 6a67cfc..fc10244 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,7 @@ The `hci` repository contains the definition of the Heterogeneous Cluster Interc - https://github.com/pulp-platform/neureka - https://github.com/pulp-platform/redmule -# Usage flow - +# Verification flow The typical full flow is: ``` diff --git a/rtl/hci_interconnect.sv b/rtl/hci_interconnect.sv index 9825891..1d6a771 100644 --- a/rtl/hci_interconnect.sv +++ b/rtl/hci_interconnect.sv @@ -142,8 +142,22 @@ module hci_interconnect EW: DEFAULT_EW, EHW: DEFAULT_EHW }; - `HCI_INTF_ARRAY(hwpe_mem_muxed, clk_i, 0:N_MEM-1); - + hci_core_intf #( + .DW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).DW ), + .AW ( 
`HCI_SIZE_PARAM(hwpe_mem_muxed).AW ), + .BW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).BW ), + .UW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).UW ), + .IW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).IW ), + .EW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).EW ), + .EHW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).EHW ) +`ifndef SYNTHESIS + , + .WAIVE_RQ3_ASSERT ( WAIVE_RQ3_ASSERT ), // hwpe_mem_muxed is an internal muxed signal, not a protocol-compliant port + .WAIVE_RQ4_ASSERT ( WAIVE_RQ4_ASSERT ) +`endif + ) hwpe_mem_muxed [0:N_MEM-1] ( + .clk ( clk_i ) + ); localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe_mem) = '{ DW: DEFAULT_DW, @@ -257,6 +271,8 @@ module hci_interconnect hci_arbiter_tree #( .NB_REQUESTS(N_HWPE), .NB_CHAN ( N_MEM ), + .WAIVE_RQ3_ASSERT ( WAIVE_RQ3_ASSERT ), + .WAIVE_RQ4_ASSERT ( WAIVE_RQ4_ASSERT ), .`HCI_SIZE_PARAM(out)(`HCI_SIZE_PARAM(hwpe_mem_muxed)) ) i_wide_port_arbiter_tree ( .clk_i ( clk_i ), diff --git a/rtl/interco/hci_arbiter.sv b/rtl/interco/hci_arbiter.sv index a291532..2447608 100644 --- a/rtl/interco/hci_arbiter.sv +++ b/rtl/interco/hci_arbiter.sv @@ -142,38 +142,48 @@ module hci_arbiter in_low [ii].gnt = '0; if(hs_pass_d[ii]) begin - out[ii].req = in_high[ii].req; - out[ii].add = in_high[ii].add; - out[ii].wen = in_high[ii].wen; - out[ii].be = in_high[ii].be; - out[ii].data = in_high[ii].data; - out[ii].id = in_high[ii].id; - out[ii].user = in_high[ii].user; - out[ii].ecc = in_high[ii].ecc; - in_high[ii].gnt = out[ii].gnt; - end + out[ii].req = in_high[ii].req; + out[ii].add = in_high[ii].add; + out[ii].wen = in_high[ii].wen; + out[ii].be = in_high[ii].be; + out[ii].data = in_high[ii].data; + out[ii].id = in_high[ii].id; + out[ii].user = in_high[ii].user; + out[ii].r_ready = in_high[ii].r_ready; + out[ii].ecc = in_high[ii].ecc; + out[ii].ereq = in_high[ii].ereq; + out[ii].r_eready = in_high[ii].r_eready; + in_high[ii].gnt = out[ii].gnt; + end else begin - out[ii].req = in_low[ii].req; - out[ii].add = in_low[ii].add; - out[ii].wen = in_low[ii].wen; - out[ii].be = in_low[ii].be; 
- out[ii].data = in_low[ii].data; - out[ii].id = in_low[ii].id; - out[ii].user = in_low[ii].user; - out[ii].ecc = in_low[ii].ecc; - in_low[ii].gnt = out[ii].gnt; + out[ii].req = in_low[ii].req; + out[ii].add = in_low[ii].add; + out[ii].wen = in_low[ii].wen; + out[ii].be = in_low[ii].be; + out[ii].data = in_low[ii].data; + out[ii].id = in_low[ii].id; + out[ii].user = in_low[ii].user; + out[ii].r_ready = in_low[ii].r_ready; + out[ii].ecc = in_low[ii].ecc; + out[ii].ereq = in_low[ii].ereq; + out[ii].r_eready = in_low[ii].r_eready; + in_low[ii].gnt = out[ii].gnt; end - in_high[ii].r_data = out[ii].r_data; - in_low [ii].r_data = out[ii].r_data; - in_high[ii].r_id = out[ii].r_id; - in_low [ii].r_id = out[ii].r_id; - in_high[ii].r_opc = out[ii].r_opc; - in_low [ii].r_opc = out[ii].r_opc; - in_high[ii].r_user = out[ii].r_user; - in_low [ii].r_user = out[ii].r_user; - in_high[ii].r_ecc = out[ii].r_ecc; - in_low [ii].r_ecc = out[ii].r_ecc; + in_high[ii].r_data = out[ii].r_data; + in_low [ii].r_data = out[ii].r_data; + in_high[ii].r_id = out[ii].r_id; + in_low [ii].r_id = out[ii].r_id; + in_high[ii].r_opc = out[ii].r_opc; + in_low [ii].r_opc = out[ii].r_opc; + in_high[ii].r_user = out[ii].r_user; + in_low [ii].r_user = out[ii].r_user; + in_high[ii].r_ecc = out[ii].r_ecc; + in_low [ii].r_ecc = out[ii].r_ecc; + in_high[ii].egnt = out[ii].egnt; + in_low [ii].egnt = out[ii].egnt; + in_high[ii].r_evalid = out[ii].r_evalid; + in_low [ii].r_evalid = out[ii].r_evalid; // r_valid signals are NOT propagated by the arbiter, they are generated at // routing stage. In previous HCI versions, we used a r_valid-less version // of the protocol here. 
diff --git a/rtl/interco/hci_arbiter_tree.sv b/rtl/interco/hci_arbiter_tree.sv index 2a10524..f304432 100644 --- a/rtl/interco/hci_arbiter_tree.sv +++ b/rtl/interco/hci_arbiter_tree.sv @@ -39,7 +39,9 @@ module hci_arbiter_tree #( parameter int unsigned NB_REQUESTS = 1, parameter int unsigned NB_CHAN = 16, - parameter hci_size_parameter_t `HCI_SIZE_PARAM(out) = '0 + parameter hci_size_parameter_t `HCI_SIZE_PARAM(out) = '0, + parameter bit WAIVE_RQ3_ASSERT = 1'b0, + parameter bit WAIVE_RQ4_ASSERT = 1'b0 ) ( input logic clk_i, @@ -74,7 +76,22 @@ module hci_arbiter_tree EHW: `HCI_SIZE_GET_EHW(out) }; - `HCI_INTF_ARRAY(arb_out, clk_i, 0:NB_LEVELS*MAX_ARBITERS_PER_LEVEL*NB_CHAN-1); + hci_core_intf #( + .DW ( `HCI_SIZE_PARAM(arb_out).DW ), + .AW ( `HCI_SIZE_PARAM(arb_out).AW ), + .BW ( `HCI_SIZE_PARAM(arb_out).BW ), + .UW ( `HCI_SIZE_PARAM(arb_out).UW ), + .IW ( `HCI_SIZE_PARAM(arb_out).IW ), + .EW ( `HCI_SIZE_PARAM(arb_out).EW ), + .EHW ( `HCI_SIZE_PARAM(arb_out).EHW ) +`ifndef SYNTHESIS + , + .WAIVE_RQ3_ASSERT ( WAIVE_RQ3_ASSERT ), // arb_out is an internal arbitration signal, not a protocol-compliant port + .WAIVE_RQ4_ASSERT ( WAIVE_RQ4_ASSERT ) +`endif + ) arb_out [0:NB_LEVELS*MAX_ARBITERS_PER_LEVEL*NB_CHAN-1] ( + .clk ( clk_i ) + ); generate for(genvar lvl=0; lvl Date: Fri, 6 Mar 2026 13:08:19 +0100 Subject: [PATCH 03/25] tb: Fix erroneous latency reporting --- target/verif/src/latency_monitor.sv | 8 ++++-- target/verif/src/simulation_report.sv | 38 +++++++++++++-------------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/target/verif/src/latency_monitor.sv b/target/verif/src/latency_monitor.sv index d89cdf5..b645ce1 100644 --- a/target/verif/src/latency_monitor.sv +++ b/target/verif/src/latency_monitor.sv @@ -74,8 +74,10 @@ module latency_monitor #( req_start_cycle_log <= '0; req_prev_log <= 1'b0; end else begin - if (hci_log_if[gi].req && !req_prev_log) begin + if (hci_log_if[gi].req && !req_prev_log && !hci_log_if[gi].gnt) begin 
req_start_cycle_log <= cycle_q; + end else if (hci_log_if[gi].gnt) begin + req_start_cycle_log <= cycle_q + 1; end if (hci_log_if[gi].req && hci_log_if[gi].gnt) begin @@ -130,8 +132,10 @@ module latency_monitor #( req_start_cycle_hwpe <= '0; req_prev_hwpe <= 1'b0; end else begin - if (hci_hwpe_if[gi].req && !req_prev_hwpe) begin + if (hci_hwpe_if[gi].req && !req_prev_hwpe && !hci_hwpe_if[gi].gnt) begin req_start_cycle_hwpe <= cycle_q; + end else if (hci_hwpe_if[gi].gnt) begin + req_start_cycle_hwpe <= cycle_q + 1; end if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt) begin diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index 4d4e2e4..c9e8423 100644 --- a/target/verif/src/simulation_report.sv +++ b/target/verif/src/simulation_report.sv @@ -205,11 +205,11 @@ module simulation_report $display("\n\\\\BANDWIDTH\\\\"); $display( - "Completion bandwidth (writes granted + reads completed): %0.1f bit/cycle", + "Completion bandwidth (writes granted + reads completed): %0.2f bit/cycle", throughput_complete_i ); - $display("Stimulus phase duration: %0.1f cycles", stim_latency_i); - $display("Completion phase duration: %0.1f cycles", tot_latency_i); + $display("Stimulus phase duration: %0.2f cycles", stim_latency_i); + $display("Completion phase duration: %0.2f cycles", tot_latency_i); $display( "Granted transactions: reads=%0d writes=%0d total=%0d", total_read_granted_transactions, @@ -222,28 +222,28 @@ module simulation_report ); $display("\n\\\\SIMULATION TIME\\\\"); - $display("Total simulation time: %0.1f cycles", tot_latency_i); + $display("Total simulation time: %0.2f cycles", tot_latency_i); for (int i = 0; i < N_CORE_REAL; i++) begin $display( - "Core%0d (master_log_%0d): %0.1f cycles", + "Core%0d (master_log_%0d): %0.2f cycles", i, i, latency_per_master_i[i] ); end for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin $display( - "DMA%0d (master_log_%0d): %0.1f cycles", + "DMA%0d (master_log_%0d): %0.2f cycles", i - N_CORE, 
i, latency_per_master_i[i] ); end for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin $display( - "EXT%0d (master_log_%0d): %0.1f cycles", + "EXT%0d (master_log_%0d): %0.2f cycles", i - (N_CORE + N_DMA), i, latency_per_master_i[i] ); end for (int i = N_MASTER - N_HWPE; i < N_MASTER - N_HWPE + N_HWPE_REAL; i++) begin $display( - "HWPE%0d (master_hwpe_%0d): %0.1f cycles", + "HWPE%0d (master_hwpe_%0d): %0.2f cycles", i - (N_MASTER - N_HWPE), i - (N_MASTER - N_HWPE), latency_per_master_i[i] @@ -340,7 +340,7 @@ module simulation_report $display("\n\\\\REQUEST-TO-GRANT LATENCY\\\\"); for (int i = 0; i < N_CORE_REAL; i++) begin $display( - "master_log_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_log_i[i] != 0) ? (sum_req_to_gnt_latency_log_i[i] / real'(n_gnt_transactions_log_i[i])) : @@ -350,7 +350,7 @@ module simulation_report end for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin $display( - "master_log_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_log_i[i] != 0) ? (sum_req_to_gnt_latency_log_i[i] / real'(n_gnt_transactions_log_i[i])) : @@ -360,7 +360,7 @@ module simulation_report end for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin $display( - "master_log_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_log_i[i] != 0) ? (sum_req_to_gnt_latency_log_i[i] / real'(n_gnt_transactions_log_i[i])) : @@ -370,7 +370,7 @@ module simulation_report end for (int i = 0; i < N_HWPE_REAL; i++) begin $display( - "master_hwpe_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_hwpe_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_hwpe_i[i] != 0) ? 
(sum_req_to_gnt_latency_hwpe_i[i] / real'(n_gnt_transactions_hwpe_i[i])) : @@ -380,32 +380,32 @@ module simulation_report end $display(""); $display( - "Total accumulated req->gnt latency: %0.1f cycles over %0d grants", + "Total accumulated req->gnt latency: %0d cycles over %0d grants", sum_req_to_gnt_latency_all, total_gnt_transactions_all ); $display( - "LOG avg req->gnt latency (weighted by grant count): %0.1f cycles", + "LOG avg req->gnt stall latency (weighted by grant count): %0.2f cycles", average_req_to_gnt_latency_log_weighted ); $display( - "LOG avg req->gnt latency (mean of per-master averages): %0.1f cycles", + "LOG avg req->gnt stall latency (mean of per-master averages): %0.2f cycles", average_req_to_gnt_latency_log_unweighted ); $display( - "HWPE avg req->gnt latency (weighted by grant count): %0.1f cycles", + "HWPE avg req->gnt stall latency (weighted by grant count): %0.2f cycles", average_req_to_gnt_latency_hwpe_weighted ); $display( - "HWPE avg req->gnt latency (mean of per-master averages): %0.1f cycles", + "HWPE avg req->gnt stall latency (mean of per-master averages): %0.2f cycles", average_req_to_gnt_latency_hwpe_unweighted ); $display( - "Global avg req->gnt latency (weighted by grant count): %0.1f cycles", + "Global avg req->gnt stall latency (weighted by grant count): %0.2f cycles", average_req_to_gnt_latency_weighted ); $display( - "Global avg req->gnt latency (mean of per-master averages): %0.1f cycles", + "Global avg req->gnt stall latency (mean of per-master averages): %0.2f cycles", average_req_to_gnt_latency_unweighted ); $display(""); From 6ea8a8817d4d7b0670594d8825d84e4470482236 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Fri, 6 Mar 2026 13:09:36 +0100 Subject: [PATCH 04/25] rtl: :bug: Set independent QoS config for HWPE branch (was same as LIC) --- rtl/hci_interconnect.sv | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/rtl/hci_interconnect.sv b/rtl/hci_interconnect.sv index 
1d6a771..53e10ad 100644 --- a/rtl/hci_interconnect.sv +++ b/rtl/hci_interconnect.sv @@ -268,6 +268,19 @@ module hci_interconnect end : hwpe_req2mem + // Set arbitration tree to be perfectly fair. It must not + // follow the max stall policy of the HWPE vs LIC arbiter. + // FIXME: it would be interesting to explore what happens + // with an unfair but configurable setting. Probably we need + // a generator to do that, I do not see a way to code it in + // pure SystemVerilog. + hci_interconnect_ctrl_t ctrl_arbiter_tree; + always_comb + begin + ctrl_arbiter_tree = ctrl_i; + ctrl_arbiter_tree.low_prio_max_stall = 1; + end + hci_arbiter_tree #( .NB_REQUESTS(N_HWPE), .NB_CHAN ( N_MEM ), @@ -278,7 +291,7 @@ module hci_interconnect .clk_i ( clk_i ), .rst_ni ( rst_ni ), .clear_i ( clear_i ), - .ctrl_i ( ctrl_i ), + .ctrl_i ( ctrl_arbiter_tree ), .in ( hwpe_mem ), .out ( hwpe_mem_muxed ) ); From 7c61e6c0211556f96cbfa53fe9a7736788fa4a0e Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Fri, 27 Feb 2026 16:05:40 +0100 Subject: [PATCH 05/25] [WIP] Implement LOG and MUX tb --- target/verif/bender.mk | 6 +- target/verif/config/generated/hardware.mk.tpl | 3 +- target/verif/config/generated/json_to_mk.py | 46 +++- .../verif/config/generated/testbench.mk.tpl | 1 - target/verif/config/hardware.json | 3 +- target/verif/config/testbench.json | 1 - .../verif/simvectors/hci_stimuli/processor.py | 10 +- target/verif/simvectors/main.py | 10 +- target/verif/src/application_driver.sv | 8 +- target/verif/src/simulation_report.sv | 68 +++--- target/verif/src/tb_hci.sv | 216 ++++++++++++++---- target/verif/src/tb_hci_pkg.sv | 78 ++++--- target/verif/src/throughput_monitor.sv | 4 +- target/verif/verif.mk | 5 +- target/verif/vsim/tb_hci.tcl | 95 +++++++- 15 files changed, 411 insertions(+), 143 deletions(-) diff --git a/target/verif/bender.mk b/target/verif/bender.mk index 2e1362e..8ccc663 100644 --- a/target/verif/bender.mk +++ b/target/verif/bender.mk @@ -8,7 +8,7 @@ VERIF_DEFS ?= VERIF_DEFS 
+= \ -D N_HWPE=$(N_HWPE) \ - -D HWPE_WIDTH=$(HWPE_WIDTH) \ + -D HWPE_WIDTH_FACT=$(HWPE_WIDTH_FACT) \ -D N_CORE=$(N_CORE) \ -D N_DMA=$(N_DMA) \ -D N_EXT=$(N_EXT) \ @@ -22,11 +22,11 @@ VERIF_DEFS += \ -D TRANSACTION_RATIO=$(TRANSACTION_RATIO) \ -D CLK_PERIOD=$(CLK_PERIOD) \ -D RST_CLK_CYCLES=$(RST_CLK_CYCLES) \ - -D MAX_CYCLES_BETWEEN_GNT_RVALID=$(MAX_CYCLES_BETWEEN_GNT_RVALID) \ -D RANDOM_GNT=$(RANDOM_GNT) \ + -D INTERCO_TYPE=$(INTERCO_TYPE) \ -D INVERT_PRIO=$(INVERT_PRIO) \ -D LOW_PRIO_MAX_STALL=$(LOW_PRIO_MAX_STALL) # Common targets for bender VERIF_TARGS ?= -VERIF_TARGS += -t hci_verif \ No newline at end of file +VERIF_TARGS += -t hci_verif diff --git a/target/verif/config/generated/hardware.mk.tpl b/target/verif/config/generated/hardware.mk.tpl index f06a13d..f6bdc14 100644 --- a/target/verif/config/generated/hardware.mk.tpl +++ b/target/verif/config/generated/hardware.mk.tpl @@ -6,10 +6,11 @@ # Hardware configuration parameters (from hardware.json) N_HWPE?=${N_HWPE} -HWPE_WIDTH?=${HWPE_WIDTH} +HWPE_WIDTH_FACT?=${HWPE_WIDTH_FACT} N_CORE?=${N_CORE} N_DMA?=${N_DMA} N_EXT?=${N_EXT} +INTERCO_TYPE?=${INTERCO_TYPE} TS_BIT?=${TS_BIT} EXPFIFO?=${EXPFIFO} SEL_LIC?=${SEL_LIC} diff --git a/target/verif/config/generated/json_to_mk.py b/target/verif/config/generated/json_to_mk.py index fcec54f..4a1c9ba 100755 --- a/target/verif/config/generated/json_to_mk.py +++ b/target/verif/config/generated/json_to_mk.py @@ -45,6 +45,34 @@ def flatten_dict(d, prefix=''): items.append((new_key, v)) return dict(items) + +def get_parameters(config): + """Return flattened parameters from config JSON.""" + params = config.get("parameters") + if not isinstance(params, dict): + return {} + return flatten_dict(params) + + +def load_all_parameters(config_dir): + """Load flattened parameters from all JSON files in config_dir.""" + merged = {} + for json_path in sorted(config_dir.glob("*.json")): + cfg = load_json_config(json_path) + merged.update(get_parameters(cfg)) + return merged + + +def 
template_variables(template_content): + """Extract Template variable names used by template content.""" + pattern = Template.pattern + vars_found = set() + for match in pattern.finditer(template_content): + name = match.group("named") or match.group("braced") + if name is not None: + vars_found.add(name) + return vars_found + def parse_args(argv=None): parser = argparse.ArgumentParser( description="Convert JSON configuration to Makefile fragment using templates." @@ -83,16 +111,22 @@ def main(): # Load template template_content = load_template(template_file) - # Flatten the parameters dict for template substitution - template_data = flatten_dict(config['parameters']) + # Build substitution dictionary: + # 1. all parameters from all configs (fallback) + # 2. parameters from selected config (override) + template_data = load_all_parameters(config_dir) + template_data.update(get_parameters(config)) # Apply template substitution template = Template(template_content) - try: - result = template.substitute(template_data) - except KeyError as e: - print(f"ERROR: Missing template variable: {e}", file=sys.stderr) + missing = sorted(v for v in template_variables(template_content) if v not in template_data) + if missing: + print( + f"ERROR: Missing template variable(s): {', '.join(missing)}", + file=sys.stderr, + ) sys.exit(1) + result = template.substitute(template_data) # Output to stdout print(result) diff --git a/target/verif/config/generated/testbench.mk.tpl b/target/verif/config/generated/testbench.mk.tpl index a1cf812..469736b 100644 --- a/target/verif/config/generated/testbench.mk.tpl +++ b/target/verif/config/generated/testbench.mk.tpl @@ -9,7 +9,6 @@ N_TRANSACTION_LOG?=${N_TRANSACTION_LOG} TRANSACTION_RATIO?=${TRANSACTION_RATIO} CLK_PERIOD?=${CLK_PERIOD} RST_CLK_CYCLES?=${RST_CLK_CYCLES} -MAX_CYCLES_BETWEEN_GNT_RVALID?=${MAX_CYCLES_BETWEEN_GNT_RVALID} RANDOM_GNT?=${RANDOM_GNT} INVERT_PRIO?=${INVERT_PRIO} LOW_PRIO_MAX_STALL?=${LOW_PRIO_MAX_STALL} diff --git 
a/target/verif/config/hardware.json b/target/verif/config/hardware.json index 678217b..028c714 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -2,10 +2,11 @@ "description": "Hardware configuration parameters for HCI interconnect", "parameters": { "N_HWPE": 2, - "HWPE_WIDTH": 8, + "HWPE_WIDTH_FACT": 8, "N_CORE": 8, "N_DMA": 1, "N_EXT": 1, + "INTERCO_TYPE": "LOG", "TS_BIT": 21, "EXPFIFO": 0, "SEL_LIC": 0, diff --git a/target/verif/config/testbench.json b/target/verif/config/testbench.json index a4daaf4..46005ab 100644 --- a/target/verif/config/testbench.json +++ b/target/verif/config/testbench.json @@ -5,7 +5,6 @@ "TRANSACTION_RATIO": 1, "CLK_PERIOD": 50, "RST_CLK_CYCLES": 10, - "MAX_CYCLES_BETWEEN_GNT_RVALID": 1, "RANDOM_GNT": 0, "INVERT_PRIO": 0, "LOW_PRIO_MAX_STALL": 10 diff --git a/target/verif/simvectors/hci_stimuli/processor.py b/target/verif/simvectors/hci_stimuli/processor.py index 91b4892..2f580be 100644 --- a/target/verif/simvectors/hci_stimuli/processor.py +++ b/target/verif/simvectors/hci_stimuli/processor.py @@ -6,7 +6,7 @@ # 1) ++UNFOLD++ the transactions in the .txt files into a cycle-level list. 
# -folder_path_raw --> String that specifies the path of the folder containing the raw txt files (where the cycle offset is still indicated) # -folder_path_processed --> String that specifies the path of the folder containing the new txt files created by this function -def unfold_raw_txt(folder_path_raw,folder_path_processed,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH): +def unfold_raw_txt(folder_path_raw,folder_path_processed,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH_FACT): file_names = [file for file in os.listdir(folder_path_raw) if file.endswith(".txt")] for file in file_names: filepath_read = os.path.join(folder_path_raw,file) @@ -27,18 +27,18 @@ def unfold_raw_txt(folder_path_raw,folder_path_processed,IW,DATA_WIDTH,ADD_WIDTH file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") else: for _ in range(int(cycle_offset)-1): - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") + file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH_FACT*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") file_write.write('1 ' + id + " " + wen + " " + data + " " + add + "\n") else: if "log" in file: file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") else: - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") + file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH_FACT*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") # 2) ++PAD++ txt files to have the same number of lines # -Folder_path --> path of the folder containing the txt files to be padded -def pad_txt_files(folder_path,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH): +def pad_txt_files(folder_path,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH_FACT): file_names = [file for file in os.listdir(folder_path) if file.endswith(".txt")] # List of the txt file names in the folder max_lines = 0 line_count = {} # Dictionary to store the number 
of lines in each txt file @@ -59,4 +59,4 @@ def pad_txt_files(folder_path,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH): f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") else: for _ in range(padding_needed): - f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") + f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH_FACT*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index 21c2e28..b164503 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -73,7 +73,7 @@ def main(argv=None): N_DMA = hw_params['N_DMA'] N_EXT = hw_params['N_EXT'] N_HWPE = hw_params['N_HWPE'] - HWPE_WIDTH = hw_params['HWPE_WIDTH'] + HWPE_WIDTH_FACT = hw_params['HWPE_WIDTH_FACT'] # Extract testbench parameters tb_params = testbench_config['parameters'] @@ -167,7 +167,7 @@ def _generate_master(filepath: Path, master_config: dict, *, is_hwpe: bool, mast - master_global_idx: global master id used by the generator """ nonlocal next_start_id - data_width = HWPE_WIDTH * DATA_WIDTH if is_hwpe else DATA_WIDTH + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH n_test = N_TEST_HWPE if is_hwpe else N_TEST_LOG cycle_offset = CYCLE_OFFSET_HWPE if is_hwpe else CYCLE_OFFSET_LOG @@ -196,7 +196,7 @@ def _generate_master(filepath: Path, master_config: dict, *, is_hwpe: bool, mast def _gen_hwpe_master(master_idx, master_config, global_idx): nonlocal next_start_id filepath = raw_dir / f"master_hwpe_{master_idx}.txt" - master = StimuliGenerator(IW, WIDTH_OF_MEMORY, N_BANKS, TOT_MEM_SIZE, HWPE_WIDTH * DATA_WIDTH, ADD_WIDTH, + master = StimuliGenerator(IW, WIDTH_OF_MEMORY, N_BANKS, TOT_MEM_SIZE, HWPE_WIDTH_FACT * DATA_WIDTH, ADD_WIDTH, str(filepath), N_TEST_HWPE, EXACT_OR_MAX_OFFSET, CYCLE_OFFSET_HWPE, global_idx) config = str(master_config.get('mem_access_type', '0')) start_address = 
str(master_config.get('start_address', '0')) @@ -250,10 +250,10 @@ def _gen_hwpe_master(master_idx, master_config, global_idx): # Process raw files simvector_raw_path = str(raw_dir) simvector_processed_path = str((raw_dir.parent / 'stimuli_processed').resolve()) - unfold_raw_txt(simvector_raw_path, simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH) + unfold_raw_txt(simvector_raw_path, simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT) print("STEP 1 COMPLETED: unfold txt files") - pad_txt_files(simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH) + pad_txt_files(simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT) print("STEP 2 COMPLETED: pad txt files") if args.golden: diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index e1651a5..c3b7ffa 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -21,14 +21,14 @@ module application_driver #( parameter int unsigned MASTER_NUMBER = 1, parameter int unsigned IS_HWPE = 1, parameter int unsigned DATA_WIDTH = 1, - parameter int unsigned ADD_WIDTH = 1, + parameter int unsigned ADDR_WIDTH = 1, parameter int unsigned APPL_DELAY = 2, // Delay on the input signals parameter int unsigned IW = 1, parameter string STIM_FILE = "" ) ( - hci_core_intf.initiator hci_if, - input logic rst_ni, input logic clk_i, + input logic rst_ni, + hci_core_intf.initiator hci_if, output logic end_stimuli_o, output logic end_latency_o, output int unsigned n_issued_transactions_o, @@ -42,7 +42,7 @@ module application_driver #( logic wen; logic req; logic [DATA_WIDTH-1:0] data; - logic [ADD_WIDTH-1:0] add; + logic [ADDR_WIDTH-1:0] add; int unsigned n_completed_read_transactions; logic pending_rsp_is_read[$]; diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index c9e8423..28f8b52 100644 --- a/target/verif/src/simulation_report.sv +++ 
b/target/verif/src/simulation_report.sv @@ -20,21 +20,21 @@ module simulation_report import tb_hci_pkg::*; ( - input logic [0:N_MASTER-1] end_stimuli_i, - input logic [0:N_MASTER-1] end_latency_i, + input logic [0:N_DRIVERS-1] end_stimuli_i, + input logic [0:N_DRIVERS-1] end_latency_i, input real throughput_complete_i, input real stim_latency_i, input real tot_latency_i, - input real latency_per_master_i[N_MASTER], - input real sum_req_to_gnt_latency_log_i[N_MASTER-N_HWPE], + input real latency_per_master_i[N_DRIVERS], + input real sum_req_to_gnt_latency_log_i[N_DRIVERS-N_HWPE], input real sum_req_to_gnt_latency_hwpe_i[N_HWPE], - input int unsigned n_gnt_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_gnt_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_gnt_transactions_hwpe_i[N_HWPE], - input int unsigned n_read_granted_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_read_granted_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_read_granted_transactions_hwpe_i[N_HWPE], - input int unsigned n_write_granted_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_write_granted_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_write_granted_transactions_hwpe_i[N_HWPE], - input int unsigned n_read_complete_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_read_complete_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_read_complete_transactions_hwpe_i[N_HWPE] ); @@ -86,7 +86,7 @@ module simulation_report wait (&end_latency_i); wait (throughput_complete_i >= 0); wait (tot_latency_i >= 0); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin total_read_granted_transactions += n_read_granted_transactions_log_i[i]; total_write_granted_transactions += n_write_granted_transactions_log_i[i]; total_read_complete_transactions += n_read_complete_transactions_log_i[i]; @@ -104,7 +104,7 @@ module simulation_report log_masters_with_grants++; end end - for (int i = N_CORE; i 
< N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin total_read_granted_transactions += n_read_granted_transactions_log_i[i]; total_write_granted_transactions += n_write_granted_transactions_log_i[i]; total_read_complete_transactions += n_read_complete_transactions_log_i[i]; @@ -122,7 +122,7 @@ module simulation_report log_masters_with_grants++; end end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin total_read_granted_transactions += n_read_granted_transactions_log_i[i]; total_write_granted_transactions += n_write_granted_transactions_log_i[i]; total_read_complete_transactions += n_read_complete_transactions_log_i[i]; @@ -140,7 +140,7 @@ module simulation_report log_masters_with_grants++; end end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin total_read_granted_transactions += n_read_granted_transactions_hwpe_i[i]; total_write_granted_transactions += n_write_granted_transactions_hwpe_i[i]; total_read_complete_transactions += n_read_complete_transactions_hwpe_i[i]; @@ -188,19 +188,19 @@ module simulation_report $display("\\\\HW CONFIG\\\\"); $display( "Masters: CORE=%0d DMA=%0d EXT=%0d HWPE=%0d (total=%0d)", - N_CORE_REAL, N_DMA_REAL, N_EXT_REAL, N_HWPE_REAL, N_MASTER_REAL + N_CORE, N_DMA, N_EXT, N_HWPE, N_DRIVERS ); $display( "Memory: banks=%0d total_size=%0d kB data_width=%0d bits hwpe_width=%0d lanes", - N_BANKS, TOT_MEM_SIZE, DATA_WIDTH, HWPE_WIDTH + N_BANKS, TOT_MEM_SIZE, DATA_WIDTH, HWPE_WIDTH_FACT ); $display( "Interconnect: SEL_LIC=%0d TS_BIT=%0d EXPFIFO=%0d", SEL_LIC, TS_BIT, EXPFIFO ); $display( - "ID/address: IW=%0d ADD_WIDTH=%0d AddrMemWidth=%0d", - IW, ADD_WIDTH, AddrMemWidth + "ID/address: IW=%0d ADDR_WIDTH=%0d ADDR_WIDTH_BANK=%0d", + IW, ADDR_WIDTH, ADDR_WIDTH_BANK ); $display("\n\\\\BANDWIDTH\\\\"); @@ -223,35 +223,35 @@ module simulation_report $display("\n\\\\SIMULATION 
TIME\\\\"); $display("Total simulation time: %0.2f cycles", tot_latency_i); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin $display( "Core%0d (master_log_%0d): %0.2f cycles", i, i, latency_per_master_i[i] ); end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin $display( "DMA%0d (master_log_%0d): %0.2f cycles", i - N_CORE, i, latency_per_master_i[i] ); end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin $display( "EXT%0d (master_log_%0d): %0.2f cycles", i - (N_CORE + N_DMA), i, latency_per_master_i[i] ); end - for (int i = N_MASTER - N_HWPE; i < N_MASTER - N_HWPE + N_HWPE_REAL; i++) begin + for (int i = N_DRIVERS - N_HWPE; i < N_DRIVERS - N_HWPE + N_HWPE; i++) begin $display( "HWPE%0d (master_hwpe_%0d): %0.2f cycles", - i - (N_MASTER - N_HWPE), - i - (N_MASTER - N_HWPE), + i - (N_DRIVERS - N_HWPE), + i - (N_DRIVERS - N_HWPE), latency_per_master_i[i] ); end $display("\n\\\\READ RESPONSE COVERAGE\\\\"); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin expected_reads = n_read_granted_transactions_log_i[i]; observed_reads = n_read_complete_transactions_log_i[i]; $display( @@ -262,7 +262,7 @@ module simulation_report missing_reads = 1'b1; end end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin expected_reads = n_read_granted_transactions_log_i[i]; observed_reads = n_read_complete_transactions_log_i[i]; $display( @@ -273,7 +273,7 @@ module simulation_report missing_reads = 1'b1; end end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin expected_reads = n_read_granted_transactions_log_i[i]; observed_reads = n_read_complete_transactions_log_i[i]; $display( @@ -284,7 +284,7 @@ module 
simulation_report missing_reads = 1'b1; end end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin expected_reads = n_read_granted_transactions_hwpe_i[i]; observed_reads = n_read_complete_transactions_hwpe_i[i]; $display( @@ -300,7 +300,7 @@ module simulation_report end $display("\n\\\\TRANSACTION COUNTS\\\\"); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin $display( "master_log_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -309,7 +309,7 @@ module simulation_report n_read_complete_transactions_log_i[i] ); end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin $display( "master_log_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -318,7 +318,7 @@ module simulation_report n_read_complete_transactions_log_i[i] ); end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin $display( "master_log_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -327,7 +327,7 @@ module simulation_report n_read_complete_transactions_log_i[i] ); end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin $display( "master_hwpe_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -338,7 +338,7 @@ module simulation_report end $display("\n\\\\REQUEST-TO-GRANT LATENCY\\\\"); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin $display( "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, @@ -348,7 +348,7 @@ module simulation_report n_gnt_transactions_log_i[i] ); end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin $display( "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, @@ -358,7 +358,7 @@ module simulation_report n_gnt_transactions_log_i[i] 
); end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin $display( "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, @@ -368,7 +368,7 @@ module simulation_report n_gnt_transactions_log_i[i] ); end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin $display( "master_hwpe_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 5fb3411..7ce7ea2 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -30,11 +30,6 @@ module tb_hci logic s_clear; hci_interconnect_ctrl_t s_hci_ctrl; - logic [0:N_MASTER-1] s_end_stimuli; - logic [0:N_MASTER-1] s_end_latency; - int unsigned s_issued_transactions[0:N_MASTER-1]; - int unsigned s_issued_read_transactions[0:N_MASTER-1]; - clk_rst_gen #( .ClkPeriod(CLK_PERIOD), .RstClkCycles(RST_CLK_CYCLES) @@ -59,7 +54,7 @@ module tb_hci }; localparam hci_size_parameter_t `HCI_SIZE_PARAM(mems) = '{ // Bank parameters DW: DEFAULT_DW, - AW: AddrMemWidth, + AW: ADDR_WIDTH_BANK, BW: DEFAULT_BW, UW: DEFAULT_UW, IW: IW, @@ -67,7 +62,7 @@ module tb_hci EHW: DEFAULT_EHW }; localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe) = '{ // HWPE parameters - DW: HWPE_WIDTH*DATA_WIDTH, + DW: HWPE_WIDTH_FACT*DATA_WIDTH, AW: DEFAULT_AW, BW: DEFAULT_BW, UW: DEFAULT_UW, @@ -76,6 +71,8 @@ module tb_hci EHW: DEFAULT_EHW }; + /* Application-driver-side interfaces */ + hci_core_intf #( .DW(HCI_SIZE_hwpe.DW), .AW(HCI_SIZE_hwpe.AW), @@ -96,10 +93,75 @@ module tb_hci .IW(HCI_SIZE_cores.IW), .EW(HCI_SIZE_cores.EW), .EHW(HCI_SIZE_cores.EHW) - ) hci_log_if [0:N_MASTER-N_HWPE-1] ( + ) hci_log_if [0:N_LOG_MASTERS-1] ( + .clk(clk) + ); + + /* Interconnect-side master interfaces */ + + hci_core_intf #( + .DW(HCI_SIZE_hwpe.DW), + .AW(HCI_SIZE_hwpe.AW), + .BW(HCI_SIZE_hwpe.BW), + .UW(HCI_SIZE_hwpe.UW), + 
.IW(HCI_SIZE_hwpe.IW), + .EW(HCI_SIZE_hwpe.EW), + .EHW(HCI_SIZE_hwpe.EHW) + ) hci_hwpe_wide_if [0:N_HWPE_MASTERS-1] ( + .clk(clk) + ); + + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_core_if [0:N_CORE+N_HWPE_LOG_MASTERS-1] ( + .clk(clk) + ); + + // LOG-only intermediate interfaces for HWPE split lanes. + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_hwpe_log_if [0:N_HWPE_LOG_MASTERS-1] ( + .clk(clk) + ); + + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_dma_if [0:N_DMA-1] ( + .clk(clk) + ); + + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_ext_if [0:N_EXT-1] ( .clk(clk) ); + /* Memory interface */ + hci_core_intf #( .DW(HCI_SIZE_mems.DW), .AW(HCI_SIZE_mems.AW), @@ -118,20 +180,87 @@ module tb_hci /* HCI instance */ + generate + for (genvar ii = 0; ii < N_CORE; ii++) begin : gen_core_to_log + hci_core_assign i_core_to_hci_assign ( + .tcdm_target (hci_log_if[ii]), + .tcdm_initiator (hci_core_if[ii]) + ); + end + for (genvar ii = 0; ii < N_DMA; ii++) begin : gen_dma_to_log + hci_core_assign i_dma_to_hci_assign ( + .tcdm_target (hci_log_if[N_CORE + ii]), + .tcdm_initiator (hci_dma_if[ii]) + ); + end + for (genvar ii = 0; ii < N_EXT; ii++) begin : gen_ext_to_log + hci_core_assign i_ext_to_hci_assign ( + .tcdm_target (hci_log_if[N_CORE + N_DMA + ii]), + .tcdm_initiator (hci_ext_if[ii]) + ); + end + + if (INTERCO_TYPE == HCI) begin : 
gen_hwpe_to_hci + for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_to_hci + hci_core_assign i_hwpe_to_hci_assign ( + .tcdm_target (hci_hwpe_if[ii]), + .tcdm_initiator (hci_hwpe_wide_if[ii]) + ); + end + end else if (INTERCO_TYPE == LOG) begin : gen_hwpe_to_log + for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_to_log + hci_core_split #( + .DW(HWPE_WIDTH_FACT * DATA_WIDTH), + .BW(DATA_WIDTH / 8), + .UW(1), + .NB_OUT_CHAN(HWPE_WIDTH_FACT), + .FIFO_DEPTH(2), + .`HCI_SIZE_PARAM(tcdm_target)(HCI_SIZE_hwpe) + ) i_hwpe_to_log_split ( + .clk_i(clk), + .rst_ni(rst_n), + .clear_i(s_clear), + .tcdm_target(hci_hwpe_if[ii]), + .tcdm_initiator(hci_hwpe_log_if[ii * HWPE_WIDTH_FACT : (ii + 1) * HWPE_WIDTH_FACT - 1]) + ); + + for (genvar f = 0; f < HWPE_WIDTH_FACT; f++) begin : gen_hwpe_log_rvalid_filter + localparam int unsigned IDX_LOG = ii * HWPE_WIDTH_FACT + f; + localparam int unsigned IDX_CORE = N_CORE + IDX_LOG; + hci_core_r_valid_filter #( + .`HCI_SIZE_PARAM(tcdm_target)(HCI_SIZE_cores) + ) i_hwpe_log_rvalid_filter ( + .clk_i(clk), + .rst_ni(rst_n), + .clear_i(s_clear), + .enable_i(1'b1), + .tcdm_target(hci_hwpe_log_if[IDX_LOG]), + .tcdm_initiator(hci_core_if[IDX_CORE]) + ); + end + end + end else begin + // Error: unsupported for now + $error("Unsupported INTERCO_TYPE"); + end + endgenerate + assign s_clear = 0; + assign s_hci_ctrl.arb_policy = ARBITER_MODE; assign s_hci_ctrl.invert_prio = INVERT_PRIO; assign s_hci_ctrl.low_prio_max_stall = LOW_PRIO_MAX_STALL; hci_interconnect #( - .N_HWPE(N_HWPE), // Number of HWPEs attached to the port - .N_CORE(N_CORE), // Number of Core ports - .N_DMA(N_DMA), // Number of DMA ports - .N_EXT(N_EXT), // Number of External ports - .N_MEM(N_BANKS), // Number of Memory banks - .TS_BIT(TS_BIT), // TEST_SET_BIT (for Log Interconnect) - .IW(IW), // ID Width - .EXPFIFO(EXPFIFO), // FIFO Depth for HWPE Interconnect - .SEL_LIC(SEL_LIC), // Log interconnect type selector + .N_HWPE(N_HWPE_MASTERS), // Number of HWPE ports + 
.N_CORE(N_CORE + N_HWPE_LOG_MASTERS), // Number of CORE ports + .N_DMA(N_DMA), // Number of DMA ports + .N_EXT(N_EXT), // Number of External ports + .N_MEM(N_BANKS), // Number of Memory banks + .TS_BIT(TS_BIT), // TEST_SET_BIT (for Log Interconnect) + .IW(IW), // ID Width + .EXPFIFO(EXPFIFO), // FIFO Depth for HWPE Interconnect + .SEL_LIC(SEL_LIC), // Log interconnect type selector + .FILTER_WRITE_R_VALID ( FILTER_WRITE_R_VALID ), .HCI_SIZE_cores(HCI_SIZE_cores), .HCI_SIZE_mems(HCI_SIZE_mems), .HCI_SIZE_hwpe(HCI_SIZE_hwpe), @@ -144,11 +273,11 @@ module tb_hci .rst_ni(rst_n), .clear_i(s_clear), .ctrl_i(s_hci_ctrl), - .cores(hci_log_if[0:N_CORE-1]), - .dma(hci_log_if[N_CORE:N_CORE+N_DMA-1]), - .ext(hci_log_if[N_CORE+N_DMA:N_CORE+N_DMA+N_EXT-1]), + .cores(hci_core_if), + .dma(hci_dma_if), + .ext(hci_ext_if), .mems(hci_mem_if), - .hwpe(hci_hwpe_if) + .hwpe(hci_hwpe_wide_if) ); ////////// @@ -159,7 +288,7 @@ module tb_hci .BankSize(N_WORDS), .NbBanks(N_BANKS), .DataWidth(DATA_WIDTH), - .AddrWidth(ADD_WIDTH), + .AddrWidth(ADDR_WIDTH), .BeWidth(DATA_WIDTH/8), .IdWidth(IW) ) i_tb_mem ( @@ -173,16 +302,21 @@ module tb_hci // Application drivers // ///////////////////////// + logic [0:N_DRIVERS-1] s_end_stimuli; + logic [0:N_DRIVERS-1] s_end_latency; + int unsigned s_issued_transactions[0:N_DRIVERS-1]; + int unsigned s_issued_read_transactions[0:N_DRIVERS-1]; + /* CORE + DMA + EXT */ generate - for (genvar ii = 0; ii < N_MASTER - N_HWPE; ii++) begin : gen_app_driver_log + for (genvar ii = 0; ii < N_CORE + N_DMA + N_EXT; ii++) begin : gen_app_driver_log localparam string STIM_FILE_LOG = $sformatf("../simvectors/generated/stimuli_processed/master_log_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), .IS_HWPE(0), .DATA_WIDTH(DATA_WIDTH), - .ADD_WIDTH(ADD_WIDTH), + .ADDR_WIDTH(ADDR_WIDTH), .APPL_DELAY(APPL_DELAY), // Delay on the input signals .IW(IW), .STIM_FILE(STIM_FILE_LOG) @@ -206,19 +340,19 @@ module tb_hci application_driver #( .MASTER_NUMBER(ii), .IS_HWPE(1), - 
.DATA_WIDTH(HWPE_WIDTH * DATA_WIDTH), - .ADD_WIDTH(ADD_WIDTH), + .DATA_WIDTH(HWPE_WIDTH_FACT * DATA_WIDTH), + .ADDR_WIDTH(ADDR_WIDTH), .APPL_DELAY(APPL_DELAY), // Delay on the input signals .IW(IW), .STIM_FILE(STIM_FILE_HWPE) ) i_app_driver_hwpe ( - .hci_if(hci_hwpe_if[ii]), - .rst_ni(rst_n), .clk_i(clk), - .end_stimuli_o(s_end_stimuli[N_MASTER-N_HWPE+ii]), - .end_latency_o(s_end_latency[N_MASTER-N_HWPE+ii]), - .n_issued_transactions_o(s_issued_transactions[N_MASTER-N_HWPE+ii]), - .n_issued_read_transactions_o(s_issued_read_transactions[N_MASTER-N_HWPE+ii]) + .rst_ni(rst_n), + .hci_if(hci_hwpe_if[ii]), + .end_stimuli_o(s_end_stimuli[N_CORE+N_DMA+N_EXT + ii]), + .end_latency_o(s_end_latency[N_CORE+N_DMA+N_EXT + ii]), + .n_issued_transactions_o(s_issued_transactions[N_CORE+N_DMA+N_EXT + ii]), + .n_issued_read_transactions_o(s_issued_read_transactions[N_CORE+N_DMA+N_EXT + ii]) ); end endgenerate @@ -227,30 +361,30 @@ module tb_hci // QoS // ///////// - real SUM_REQ_TO_GNT_LATENCY_LOG[N_MASTER-N_HWPE]; + real SUM_REQ_TO_GNT_LATENCY_LOG[N_CORE+N_DMA+N_EXT]; real SUM_REQ_TO_GNT_LATENCY_HWPE[N_HWPE]; - int unsigned N_GNT_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_GNT_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; int unsigned N_GNT_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_READ_GRANTED_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_READ_GRANTED_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; int unsigned N_READ_GRANTED_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_WRITE_GRANTED_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_WRITE_GRANTED_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; int unsigned N_WRITE_GRANTED_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_READ_COMPLETE_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_READ_COMPLETE_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; int unsigned N_READ_COMPLETE_TRANSACTIONS_HWPE[N_HWPE]; /* REAL THROUGHPUT AND SIMULATION TIME */ - real latency_per_master[N_MASTER]; + real latency_per_master[N_DRIVERS]; real throughput_completed; real 
stim_latency; real tot_latency; throughput_monitor #( - .N_MASTER(N_MASTER), + .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE), .CLK_PERIOD(CLK_PERIOD), .DATA_WIDTH(DATA_WIDTH), - .HWPE_WIDTH(HWPE_WIDTH) + .HWPE_WIDTH_FACT(HWPE_WIDTH_FACT) ) i_throughput_monitor ( .clk_i(clk), .rst_ni(rst_n), @@ -268,7 +402,7 @@ module tb_hci /* LATENCY MONITOR */ latency_monitor #( - .N_MASTER(N_MASTER), + .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE) ) i_latency_monitor ( .clk_i(clk), diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index cdaf055..d35ca6e 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -18,6 +18,12 @@ package tb_hci_pkg; + typedef enum logic [1:0] { + LOG = 2'd0, + MUX = 2'd1, + HCI = 2'd2 + } interco_e; + ////////////////////////// // Testbench parameters // ////////////////////////// @@ -37,7 +43,6 @@ package tb_hci_pkg; localparam int unsigned N_TRANSACTION_HWPE = int'(N_TRANSACTION_LOG * TRANSACTION_RATIO); // TCDM interface parameters - localparam int unsigned MAX_CYCLES_BETWEEN_GNT_RVALID = `ifdef MAX_CYCLES_BETWEEN_GNT_RVALID `MAX_CYCLES_BETWEEN_GNT_RVALID `else 1 `endif; localparam int unsigned RANDOM_GNT = `ifdef RANDOM_GNT `RANDOM_GNT `else 0 `endif; // Arbiter configuration @@ -52,19 +57,13 @@ package tb_hci_pkg; /* Config */ + localparam interco_e INTERCO_TYPE = `ifdef INTERCO_TYPE `INTERCO_TYPE `else HCI `endif; + // Master port counts - localparam int unsigned N_HWPE_REAL = `ifdef N_HWPE `N_HWPE `else 1 `endif; // Number of HWPEs attached to the port - localparam int unsigned N_CORE_REAL = `ifdef N_CORE `N_CORE `else 1 `endif; // Number of Core ports - localparam int unsigned N_DMA_REAL = `ifdef N_DMA `N_DMA `else 1 `endif; // Number of DMA ports - localparam int unsigned N_EXT_REAL = `ifdef N_EXT `N_EXT `else 1 `endif; // Number of External ports - - // Normalized master counts (minimum 1 for array sizing) - localparam int unsigned N_HWPE = (N_HWPE_REAL == 0) ? 
1 : N_HWPE_REAL; - localparam int unsigned N_CORE = (N_CORE_REAL == 0) ? 1 : N_CORE_REAL; - localparam int unsigned N_DMA = (N_DMA_REAL == 0) ? 1 : N_DMA_REAL; - localparam int unsigned N_EXT = (N_EXT_REAL == 0) ? 1 : N_EXT_REAL; - localparam int unsigned N_MASTER = N_HWPE + N_CORE + N_DMA + N_EXT; // Total number of masters - localparam int unsigned N_MASTER_REAL = N_HWPE_REAL + N_CORE_REAL + N_DMA_REAL + N_EXT_REAL; // Total number of masters (real) + localparam int unsigned N_HWPE = `ifdef N_HWPE `N_HWPE `else 1 `endif; // Number of HWPEs attached to the port + localparam int unsigned N_CORE = `ifdef N_CORE `N_CORE `else 1 `endif; // Number of Core ports + localparam int unsigned N_DMA = `ifdef N_DMA `N_DMA `else 1 `endif; // Number of DMA ports + localparam int unsigned N_EXT = `ifdef N_EXT `N_EXT `else 1 `endif; // Number of External ports // Interconnect configuration localparam int unsigned TS_BIT = `ifdef TS_BIT `TS_BIT `else 0 `endif; // TEST_SET_BIT (for Log Interconnect) @@ -72,23 +71,30 @@ package tb_hci_pkg; localparam int unsigned SEL_LIC = `ifdef SEL_LIC `SEL_LIC `else 0 `endif; // Log interconnect type selector // Data and memory parameters - localparam int unsigned DATA_WIDTH = `ifdef DATA_WIDTH `DATA_WIDTH `else 32 `endif; // Width of DATA in bits - localparam int unsigned HWPE_WIDTH = `ifdef HWPE_WIDTH `HWPE_WIDTH `else 4 `endif; // Width of an HWPE wide-word + localparam int unsigned DATA_WIDTH = `ifdef DATA_WIDTH `DATA_WIDTH `else 32 `endif; // Width of DATA in bits + localparam int unsigned HWPE_WIDTH_FACT = `ifdef HWPE_WIDTH_FACT `HWPE_WIDTH_FACT `else 4 `endif; // Width factor of an HWPE wide-word localparam int unsigned TOT_MEM_SIZE = `ifdef TOT_MEM_SIZE `TOT_MEM_SIZE `else 32 `endif; // Memory size (kB) - localparam int unsigned N_BANKS = `ifdef N_BANKS `N_BANKS `else 16 `endif; // Number of memory banks /* Derived parameters */ - localparam int unsigned ADD_WIDTH = $clog2(TOT_MEM_SIZE * 1024); // Width of ADDRESS in bits + localparam int 
unsigned N_DRIVERS = N_HWPE + N_CORE + N_DMA + N_EXT; // Total number of masters + localparam int unsigned N_BANKS = `ifdef N_BANKS `N_BANKS `else 16 `endif; // Number of memory banks + + localparam int unsigned N_LOG_MASTERS = N_CORE + N_DMA + N_EXT; + localparam int unsigned N_HWPE_LOG_MASTERS = (INTERCO_TYPE == LOG) ? (N_HWPE * HWPE_WIDTH_FACT) : 0; + localparam int unsigned N_HWPE_MASTERS = (INTERCO_TYPE == HCI) ? N_HWPE : ((INTERCO_TYPE == MUX) ? 1 : 0); + + localparam int unsigned ADDR_WIDTH = $clog2(TOT_MEM_SIZE * 1024); // Width of ADDRESS in bits localparam int unsigned WIDTH_OF_MEMORY = DATA_WIDTH; // Width of a memory bank (bits) localparam int unsigned WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8; // Width of a memory bank (bytes) localparam int unsigned BIT_BANK_INDEX = $clog2(N_BANKS); // Bits of the Bank index - localparam int unsigned AddrMemWidth = ADD_WIDTH - BIT_BANK_INDEX; // Number of address bits per TCDM bank + localparam int unsigned ADDR_WIDTH_BANK = ADDR_WIDTH - BIT_BANK_INDEX; // Number of address bits per TCDM bank localparam int unsigned N_WORDS = (TOT_MEM_SIZE * 1024 / N_BANKS) / WIDTH_OF_MEMORY_BYTE; // Number of words in a bank - localparam int unsigned FILTER_WRITE_R_VALID = '0; + localparam int unsigned FILTER_WRITE_R_VALID[0:N_HWPE_MASTERS-1] = '{default: 0}; // Enable filtering of only r_valid respons - localparam int unsigned IW = $clog2(N_TRANSACTION_LOG * (N_MASTER_REAL - N_HWPE_REAL) + N_TRANSACTION_HWPE * N_HWPE_REAL); // ID Width - localparam int unsigned TOT_CHECK = N_TRANSACTION_LOG * (N_CORE_REAL + N_DMA_REAL + N_EXT_REAL) + N_HWPE_REAL * N_TRANSACTION_HWPE * HWPE_WIDTH; + // Keep ID width consistent with stimuli generator. 
+ localparam int unsigned IW = $clog2(N_TRANSACTION_LOG * N_LOG_MASTERS + N_TRANSACTION_HWPE * N_HWPE); + localparam int unsigned TOT_CHECK = N_TRANSACTION_LOG * (N_CORE + N_DMA + N_EXT) + N_HWPE * N_TRANSACTION_HWPE * HWPE_WIDTH_FACT; /////////// // Types // @@ -97,17 +103,17 @@ package tb_hci_pkg; typedef struct packed { logic wen; logic [DATA_WIDTH-1:0] data; - logic [ADD_WIDTH-1:0] add; + logic [ADDR_WIDTH-1:0] add; } stimuli_t; typedef struct packed { - logic [DATA_WIDTH - 1 : 0] data; - logic [AddrMemWidth - 1 : 0] add; + logic [DATA_WIDTH-1:0] data; + logic [ADDR_WIDTH_BANK-1:0] add; } out_intc_to_mem_t; // Helper return type for HWPE address/data creation typedef struct { - logic [ADD_WIDTH-1:0] address; + logic [ADDR_WIDTH-1:0] address; logic [DATA_WIDTH-1:0] data; logic rolls_over; } hwpe_addr_data_t; @@ -118,8 +124,8 @@ package tb_hci_pkg; // Zero-time pure function returning address/data for an HWPE lane function automatic hwpe_addr_data_t create_address_and_data_hwpe( - input logic [ADD_WIDTH-1:0] address_before, - input logic [HWPE_WIDTH * DATA_WIDTH-1:0] data_before, + input logic [ADDR_WIDTH-1:0] address_before, + input logic [HWPE_WIDTH_FACT * DATA_WIDTH-1:0] data_before, input int index, input logic rolls_over_check_before ); @@ -135,7 +141,7 @@ package tb_hci_pkg; end ret.address = { - address_before[ADD_WIDTH-1:BIT_BANK_INDEX + 2] + ret.rolls_over, + address_before[ADDR_WIDTH-1:BIT_BANK_INDEX + 2] + ret.rolls_over, bank_index_after, address_before[1:0] }; @@ -145,7 +151,7 @@ package tb_hci_pkg; endfunction task calculate_bank_index( - input logic [ADD_WIDTH-1:0] address, + input logic [ADDR_WIDTH-1:0] address, output logic [BIT_BANK_INDEX-1:0] index ); index = address[BIT_BANK_INDEX-1 + 2:2]; @@ -160,8 +166,8 @@ package tb_hci_pkg; end else begin tot_time = N_TRANSACTION_LOG; end - tot_data = (N_TRANSACTION_LOG * DATA_WIDTH) * (N_MASTER_REAL - N_HWPE_REAL) + - (N_TRANSACTION_HWPE * HWPE_WIDTH * DATA_WIDTH) * N_HWPE_REAL; // bit + tot_data = 
(N_TRANSACTION_LOG * DATA_WIDTH) * (N_DRIVERS - N_HWPE) + + (N_TRANSACTION_HWPE * HWPE_WIDTH_FACT * DATA_WIDTH) * N_HWPE; // bit throughput_theo = tot_data / tot_time; // bit per cycle band_memory_limit = real'(N_BANKS * DATA_WIDTH); if (throughput_theo >= band_memory_limit) begin @@ -174,14 +180,14 @@ package tb_hci_pkg; /////////////// // Convert a full system address to per-bank local word address. - function int unsigned get_bank_local_address(input logic [ADD_WIDTH-1:0] addr_i); - logic [ADD_WIDTH-1:0] mapped_addr; - logic [ADD_WIDTH-BIT_BANK_INDEX-1:0] bank_local_addr; + function int unsigned get_bank_local_address(input logic [ADDR_WIDTH-1:0] addr_i); + logic [ADDR_WIDTH-1:0] mapped_addr; + logic [ADDR_WIDTH-BIT_BANK_INDEX-1:0] bank_local_addr; tb_hci_pkg::hwpe_addr_data_t hwpe_lane_addr_data; - hwpe_lane_addr_data = create_address_and_data_hwpe(addr_i, '0, HWPE_WIDTH, '0); + hwpe_lane_addr_data = create_address_and_data_hwpe(addr_i, '0, HWPE_WIDTH_FACT, '0); mapped_addr = hwpe_lane_addr_data.address; bank_local_addr = { - mapped_addr[ADD_WIDTH-1:BIT_BANK_INDEX + 2], + mapped_addr[ADDR_WIDTH-1:BIT_BANK_INDEX + 2], mapped_addr[1:0] }; return int'(bank_local_addr); diff --git a/target/verif/src/throughput_monitor.sv b/target/verif/src/throughput_monitor.sv index b25d16e..bc7294f 100644 --- a/target/verif/src/throughput_monitor.sv +++ b/target/verif/src/throughput_monitor.sv @@ -22,7 +22,7 @@ module throughput_monitor #( parameter int unsigned N_HWPE, parameter int unsigned CLK_PERIOD, parameter int unsigned DATA_WIDTH, - parameter int unsigned HWPE_WIDTH + parameter int unsigned HWPE_WIDTH_FACT ) ( input logic clk_i, input logic rst_ni, @@ -82,7 +82,7 @@ module throughput_monitor #( for (int i = 0; i < N_HWPE; i++) begin tot_data += real'( n_write_granted_hwpe_i[i] + n_read_complete_hwpe_i[i] - ) * real'(HWPE_WIDTH * DATA_WIDTH); + ) * real'(HWPE_WIDTH_FACT * DATA_WIDTH); end if (completion_time_cycles > 0.0) begin throughput_complete_o = tot_data / 
completion_time_cycles; // bits per cycle diff --git a/target/verif/verif.mk b/target/verif/verif.mk index 1dac0de..e9829ea 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -127,11 +127,12 @@ $(sim_vsim_lib)/$(sim_top_level)_optimized/.tb_opt_compiled: $(sim_vsim_lib)/.hw date > $@ .PHONY: run-verif -run-verif: $(sim_vsim_lib)/$(sim_top_level)_optimized/.tb_opt_compiled $(SIMVECTORS_GEN_DIR)/.stim_stamp +run-verif: $(HCI_VERIF_DIR)/vsim/$(sim_top_level).tcl $(sim_vsim_lib)/$(sim_top_level)_optimized/.tb_opt_compiled $(SIMVECTORS_GEN_DIR)/.stim_stamp cd $(HCI_VERIF_DIR)/vsim && \ $(SIM_VSIM) $(SIM_HCI_VSIM_ARGS) \ $(sim_top_level)_optimized \ - -do 'set GUI $(GUI); source $(HCI_VERIF_DIR)/vsim/$(sim_top_level).tcl' + -do 'set GUI $(GUI); source $<' + .PHONY: clean-verif clean-sim-verif: diff --git a/target/verif/vsim/tb_hci.tcl b/target/verif/vsim/tb_hci.tcl index 8935f49..feeefa0 100644 --- a/target/verif/vsim/tb_hci.tcl +++ b/target/verif/vsim/tb_hci.tcl @@ -1,7 +1,100 @@ # If GUI is 1, spawn waveforms if {$GUI == 1} { echo "GUI mode enabled" - add log -r /* + log -r /* + + set N_CORE [examine -radix dec /tb_hci_pkg/N_CORE] + set N_DMA [examine -radix dec /tb_hci_pkg/N_DMA] + set N_EXT [examine -radix dec /tb_hci_pkg/N_EXT] + set N_HWPE [examine -radix dec /tb_hci_pkg/N_HWPE] + set N_BANKS [examine -radix dec /tb_hci_pkg/N_BANKS] + + set INTERCO_TYPE [examine /tb_hci_pkg/INTERCO_TYPE] + set N_LOG_MASTERS [examine -radix dec /tb_hci_pkg/N_LOG_MASTERS] + set N_HWPE_LOG_MASTERS [examine -radix dec /tb_hci_pkg/N_HWPE_LOG_MASTERS] + set N_HWPE_MASTERS [examine -radix dec /tb_hci_pkg/N_HWPE_MASTERS] + + set HWPE_WIDTH_FACT [examine -radix dec /tb_hci_pkg/HWPE_WIDTH_FACT] + + add wave -noupdate /tb_hci/clk + add wave -noupdate /tb_hci/rst_n + + # Application driver interfaces + add wave -noupdate -group application_drivers -divider narrow_Cores + for {set i 0} {$i < $N_CORE} {incr i} { + add wave -noupdate -group application_drivers -group core_$i 
/tb_hci/hci_log_if[$i]/* + } + add wave -noupdate -group application_drivers -divider wide_HWPEs + for {set i 0} {$i < $N_HWPE} {incr i} { + add wave -noupdate -group application_drivers -group hwpe_$i /tb_hci/hci_hwpe_if[$i]/* + } + add wave -noupdate -group application_drivers -divider narrow_DMA + for {set i 0} {$i < $N_DMA} {incr i} { + set idx [expr {$i + $N_CORE}] + add wave -noupdate -group application_drivers -group dma_$i /tb_hci/hci_log_if[$idx]/* + } + add wave -noupdate -group application_drivers -divider narrow_External + for {set i 0} {$i < $N_EXT} {incr i} { + set idx [expr {$i + $N_CORE + $N_DMA}] + add wave -noupdate -group application_drivers -group ext_$i /tb_hci/hci_log_if[$idx]/* + } + # HCI-side interfaces + add wave -noupdate -group hci_interfaces -divider narrow_Cores + for {set i 0} {$i < $N_CORE} {incr i} { + add wave -noupdate -group hci_interfaces -group core_$i /tb_hci/hci_core_if[$i]/* + } + if {$INTERCO_TYPE == {LOG}} { + add wave -noupdate -group hci_interfaces -divider narrow_HWPEs_split + for {set i 0} {$i < $N_HWPE} {incr i} { + for {set f 0} {$f < $HWPE_WIDTH_FACT} {incr f} { + set idx [expr {$N_CORE + $i*$HWPE_WIDTH_FACT + $f}] + add wave -noupdate -group hci_interfaces -group hwpe_$i -group word_$f /tb_hci/hci_core_if[$idx]/* + } + } + } elseif {$INTERCO_TYPE == {HCI} || $INTERCO_TYPE == {MUX}} { + add wave -noupdate -group hci_interfaces -divider wide_HWPEs + for {set i 0} {$i < $N_HWPE_MASTERS} {incr i} { + add wave -noupdate -group hci_interfaces -group hwpe_$i /tb_hci/hci_hwpe_wide_if[$i]/* + } + } else { + echo "WARNING: unsupported INTERCO_TYPE value: $INTERCO_TYPE_SYM ($INTERCO_TYPE)" + } + add wave -noupdate -group hci_interfaces -divider narrow_DMA + for {set i 0} {$i < $N_DMA} {incr i} { + add wave -noupdate -group hci_interfaces -group dma_$i /tb_hci/hci_dma_if[$i]/* + } + add wave -noupdate -group hci_interfaces -divider narrow_External + for {set i 0} {$i < $N_EXT} {incr i} { + add wave -noupdate -group 
hci_interfaces -group ext_$i /tb_hci/hci_ext_if[$i]/* + } + # Memory slaves + add wave -noupdate -group memory_slaves + for {set i 0} {$i < $N_BANKS} {incr i} { + add wave -noupdate -group memory_slaves -group bank_$i /tb_hci/hci_mem_if[$i]/* + } + + add wave -noupdate -group metrics /tb_hci/s_end_latency + add wave -noupdate -group metrics /tb_hci/s_end_stimuli + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/s_issued_read_transactions[0]} {-format Analog-Step -height 84 -max 472.0} {/tb_hci/s_issued_read_transactions[1]} {-format Analog-Step -height 84 -max 493.0} {/tb_hci/s_issued_read_transactions[2]} {-format Analog-Step -height 84 -max 506.0} {/tb_hci/s_issued_read_transactions[3]} {-format Analog-Step -height 84 -max 481.0} {/tb_hci/s_issued_read_transactions[4]} {-format Analog-Step -height 84 -max 524.0} {/tb_hci/s_issued_read_transactions[5]} {-format Analog-Step -height 84 -max 512.0} {/tb_hci/s_issued_read_transactions[6]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/s_issued_read_transactions[7]} {-format Analog-Step -height 84 -max 518.0} {/tb_hci/s_issued_read_transactions[8]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/s_issued_read_transactions[9]} {-format Analog-Step -height 84 -max 503.0} {/tb_hci/s_issued_read_transactions[10]} {-format Analog-Step -height 84 -max 489.0} {/tb_hci/s_issued_read_transactions[11]} {-format Analog-Step -height 84 -max 493.0}} /tb_hci/s_issued_read_transactions + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/s_issued_transactions[0]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[1]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[2]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[3]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[4]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[5]} {-format Analog-Step -height 84 -max 1000.0} 
{/tb_hci/s_issued_transactions[6]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[7]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[8]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[9]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[10]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[11]} {-format Analog-Step -height 84 -max 1000.0}} /tb_hci/s_issued_transactions + add wave -noupdate -group metrics /tb_hci/stim_latency + add wave -noupdate -group metrics /tb_hci/tot_latency + add wave -noupdate -group metrics /tb_hci/latency_per_master + add wave -noupdate -group metrics /tb_hci/throughput_completed + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_GNT_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 1000.0}} /tb_hci/N_GNT_TRANSACTIONS_HWPE + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_GNT_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 1000.0}} /tb_hci/N_GNT_TRANSACTIONS_LOG + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE[0]} 
{-format Analog-Step -height 84 -max 489.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 493.0}} /tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 472.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 493.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 506.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 481.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 524.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 512.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 518.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 503.0}} /tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 489.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 493.0}} /tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 472.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 493.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 506.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 481.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 524.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[5]} {-format 
Analog-Step -height 84 -max 512.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 518.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 503.0}} /tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 511.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 507.0}} /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 528.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 507.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 494.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 519.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 475.99999999999994} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 488.00000000000006} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 508.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 482.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 508.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 497.0}} /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE[0]} {-format Analog-Step -height 84 -max 1327824.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE[1]} {-format Analog-Step -height 84 -max 1336050.0}} /tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE + 
add wave -noupdate -group metrics -subitemconfig {{/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[0]} {-format Analog-Step -height 84 -max 580398.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[1]} {-format Analog-Step -height 84 -max 580783.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[2]} {-format Analog-Step -height 84 -max 596101.00000000012} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[3]} {-format Analog-Step -height 84 -max 590598.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[4]} {-format Analog-Step -height 84 -max 603272.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[5]} {-format Analog-Step -height 84 -max 601733.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[6]} {-format Analog-Step -height 84 -max 596110.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[7]} {-format Analog-Step -height 84 -max 593715.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[8]} {-format Analog-Step -height 84 -max 602346.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[9]} {-format Analog-Step -height 84 -max 597075.0}} /tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG + add wave -noupdate /tb_hci/s_clear + add wave -noupdate /tb_hci/s_hci_ctrl + + configure wave -signalnamewidth 1 } else { run -a } From f79d065b85799b12ea0cdf0820c6918a35a87869 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Fri, 27 Feb 2026 16:06:11 +0100 Subject: [PATCH 06/25] [WIP] Change structure of tb --- .../verif/simvectors/hci_stimuli/generator.py | 33 +- target/verif/simvectors/main.py | 8 +- target/verif/src/application_driver.sv | 96 ++++-- target/verif/src/latency_monitor.sv | 36 +-- target/verif/src/tb_hci.sv | 303 +++++++++--------- target/verif/src/tb_hci_pkg.sv | 172 ++++++---- target/verif/src/tcdm_banks_wrap.sv | 2 +- target/verif/vsim/tb_hci.tcl | 99 +++--- 8 files changed, 411 insertions(+), 338 deletions(-) diff --git a/target/verif/simvectors/hci_stimuli/generator.py b/target/verif/simvectors/hci_stimuli/generator.py index ca5e2d0..af121d6 100644 --- a/target/verif/simvectors/hci_stimuli/generator.py +++ b/target/verif/simvectors/hci_stimuli/generator.py @@ -2,7 +2,7 @@ import random 
import os -class StimuliGenerator: +class StimuliGenerator: def __init__(self,IW,WIDTH_OF_MEMORY,N_BANKS,TOT_MEM_SIZE,DATA_WIDTH,ADD_WIDTH,filepath,N_TEST,EXACT_OR_MAX_OFFSET,CYCLE_OFFSET,MASTER_NUMBER_IDENTIFICATION): self.WIDTH_OF_MEMORY = WIDTH_OF_MEMORY self.WIDTH_OF_MEMORY_BYTE = int(WIDTH_OF_MEMORY/8) @@ -15,8 +15,11 @@ def __init__(self,IW,WIDTH_OF_MEMORY,N_BANKS,TOT_MEM_SIZE,DATA_WIDTH,ADD_WIDTH,f self.N_TEST = N_TEST self.EXACT_OR_MAX_OFFSET = EXACT_OR_MAX_OFFSET self.CYCLE_OFFSET = CYCLE_OFFSET - self.IW = IW - self.MASTER_NUMBER_IDENTIFICATION = MASTER_NUMBER_IDENTIFICATION + self.IW = IW + self.MASTER_NUMBER_IDENTIFICATION = MASTER_NUMBER_IDENTIFICATION + + def _format_id(self, id_value): + return bin(id_value % (1 << self.IW))[2:].zfill(self.IW) def random_data(self): data_decimal = random.randint(0, (2**(self.DATA_WIDTH))-1) # generate random data @@ -58,8 +61,8 @@ def random_gen(self,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) break - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 + file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") + id = id + 1 LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) return id @@ -88,8 +91,8 @@ def linear_gen(self,stride0,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_R LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) break - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 + file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") + id = id + 1 
LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) @@ -116,8 +119,8 @@ def gen_2d(self,stride0,len_d0,stride1,start_address,id_start,LIST_OF_FORBIDDEN_ if wen: if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 + file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") + id = id + 1 if id - id_start >= self.N_TEST : STOP = 1 break @@ -125,8 +128,8 @@ def gen_2d(self,stride0,len_d0,stride1,start_address,id_start,LIST_OF_FORBIDDEN_ if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 + file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") + id = id + 1 if id - id_start >= self.N_TEST : STOP = 1 break @@ -159,8 +162,8 @@ def gen_3d(self,stride0,len_d0,stride1,len_d1,stride2,start_address,id_start,LIS if wen: if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 + file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") + id = id + 1 if id - id_start >= self.N_TEST : STOP = 1 break @@ -168,8 +171,8 @@ def gen_3d(self,stride0,len_d0,stride1,len_d1,stride2,start_address,id_start,LIS if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - 
file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 + file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") + id = id + 1 if id - id_start >= self.N_TEST : STOP = 1 break diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index b164503..bbdd853 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -98,7 +98,13 @@ def main(argv=None): N_TEST_HWPE = int(N_TEST_LOG * TEST_RATIO) N_LOG = N_CORE + N_DMA + N_EXT N_MASTER = N_LOG + N_HWPE - IW = int(np.ceil(np.log2(N_TEST_LOG * N_LOG + N_TEST_HWPE * N_HWPE))) + # Match tb_hci_pkg interface ID widths (hci_system-style model). + IW_CORES = 8 + IW_HWPE = IW_CORES + if IW_CORES != IW_HWPE: + print("ERROR: stimuli generator currently requires equal log/HWPE interface ID widths") + sys.exit(1) + IW = IW_CORES CORE_ZERO_FLAG = False EXT_ZERO_FLAG = False DMA_ZERO_FLAG = False diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index c3b7ffa..fd69399 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -24,6 +24,8 @@ module application_driver #( parameter int unsigned ADDR_WIDTH = 1, parameter int unsigned APPL_DELAY = 2, // Delay on the input signals parameter int unsigned IW = 1, + parameter bit USE_STREAMER_FIFO = 1'b1, + parameter int unsigned STREAMER_FIFO_DEPTH = 2, parameter string STIM_FILE = "" ) ( input logic clk_i, @@ -45,6 +47,48 @@ module application_driver #( logic [ADDR_WIDTH-1:0] add; int unsigned n_completed_read_transactions; logic pending_rsp_is_read[$]; + localparam hci_package::hci_size_parameter_t HCI_SIZE_driver = '{ + DW: DATA_WIDTH, + AW: ADDR_WIDTH, + BW: hci_package::DEFAULT_BW, + UW: hci_package::DEFAULT_UW, + IW: IW, + EW: hci_package::DEFAULT_EW, + EHW: hci_package::DEFAULT_EHW + }; + + hci_core_intf #( + 
.DW(HCI_SIZE_driver.DW), + .AW(HCI_SIZE_driver.AW), + .BW(HCI_SIZE_driver.BW), + .UW(HCI_SIZE_driver.UW), + .IW(HCI_SIZE_driver.IW), + .EW(HCI_SIZE_driver.EW), + .EHW(HCI_SIZE_driver.EHW) + ) hci_drv_if ( + .clk(clk_i) + ); + + generate + if (USE_STREAMER_FIFO) begin : gen_driver_streamer_fifo + hci_core_fifo #( + .FIFO_DEPTH(STREAMER_FIFO_DEPTH), + .HCI_SIZE_tcdm_initiator(HCI_SIZE_driver) + ) i_driver_streamer_fifo ( + .clk_i(clk_i), + .rst_ni(rst_ni), + .clear_i(1'b0), + .flags_o(), + .tcdm_target(hci_drv_if), + .tcdm_initiator(hci_if) + ); + end else begin : gen_driver_direct + hci_core_assign i_driver_direct_assign ( + .tcdm_target(hci_drv_if), + .tcdm_initiator(hci_if) + ); + end + endgenerate always_ff @(posedge clk_i or negedge rst_ni) begin : proc_read_response_counter logic retired_is_read; @@ -52,10 +96,10 @@ module application_driver #( n_completed_read_transactions <= '0; pending_rsp_is_read.delete(); end else begin - if (hci_if.req && hci_if.gnt) begin - pending_rsp_is_read.push_back(hci_if.wen); + if (hci_drv_if.req && hci_drv_if.gnt) begin + pending_rsp_is_read.push_back(hci_drv_if.wen); end - if (hci_if.r_valid && hci_if.r_ready) begin + if (hci_drv_if.r_valid && hci_drv_if.r_ready) begin if (pending_rsp_is_read.size() != 0) begin retired_is_read = pending_rsp_is_read.pop_front(); if (retired_is_read) begin @@ -67,17 +111,17 @@ module application_driver #( end initial begin : proc_application_driver - hci_if.id = '0; - hci_if.add = '0; - hci_if.data = '0; - hci_if.req = 1'b0; - hci_if.wen = 1'b0; - hci_if.ecc = '0; - hci_if.ereq = '0; - hci_if.r_eready = '0; - hci_if.be = '1; - hci_if.r_ready = 1'b1; - hci_if.user = '0; + hci_drv_if.id = '0; + hci_drv_if.add = '0; + hci_drv_if.data = '0; + hci_drv_if.req = 1'b0; + hci_drv_if.wen = 1'b0; + hci_drv_if.ecc = '0; + hci_drv_if.ereq = '0; + hci_drv_if.r_eready = '0; + hci_drv_if.be = '1; + hci_drv_if.r_ready = 1'b1; + hci_drv_if.user = '0; end_stimuli_o = 1'b0; end_latency_o = 1'b0; 
n_issued_transactions_o = '0; @@ -113,25 +157,25 @@ module application_driver #( break; end #(APPL_DELAY); - hci_if.id = id; - hci_if.data = data; - hci_if.add = add; - hci_if.wen = wen; - hci_if.req = req; + hci_drv_if.id = id; + hci_drv_if.data = data; + hci_drv_if.add = add; + hci_drv_if.wen = wen; + hci_drv_if.req = req; if (req) begin - @(posedge clk_i iff hci_if.gnt); + @(posedge clk_i iff hci_drv_if.gnt); n_issued_transactions_o++; if (wen) begin n_issued_read_transactions_o++; end // Deassert in NBA region so monitors sampling this edge see the handshake. - hci_if.id <= '0; - hci_if.data <= '0; - hci_if.add <= '0; - hci_if.wen <= 1'b0; - hci_if.req <= 1'b0; - wait (hci_if.req == 1'b0); + hci_drv_if.id <= '0; + hci_drv_if.data <= '0; + hci_drv_if.add <= '0; + hci_drv_if.wen <= 1'b0; + hci_drv_if.req <= 1'b0; + wait (hci_drv_if.req == 1'b0); end else begin @(posedge clk_i); end diff --git a/target/verif/src/latency_monitor.sv b/target/verif/src/latency_monitor.sv index b645ce1..3aff305 100644 --- a/target/verif/src/latency_monitor.sv +++ b/target/verif/src/latency_monitor.sv @@ -24,8 +24,8 @@ module latency_monitor #( input logic clk_i, input logic rst_ni, // Monitored interfaces - hci_core_intf.monitor hci_log_if [0:N_MASTER-N_HWPE-1], - hci_core_intf.monitor hci_hwpe_if [0:N_HWPE-1], + hci_core_intf.monitor hci_driver_log_if [0:N_MASTER-N_HWPE-1], + hci_core_intf.monitor hci_driver_hwpe_if [0:N_HWPE-1], // Accumulated request-to-grant latency. 
output real sum_req_to_gnt_latency_log_o[N_MASTER-N_HWPE], output real sum_req_to_gnt_latency_hwpe_o[N_HWPE], @@ -74,32 +74,32 @@ module latency_monitor #( req_start_cycle_log <= '0; req_prev_log <= 1'b0; end else begin - if (hci_log_if[gi].req && !req_prev_log && !hci_log_if[gi].gnt) begin + if (hci_driver_log_if[gi].req && !req_prev_log && !hci_driver_log_if[gi].gnt) begin req_start_cycle_log <= cycle_q; - end else if (hci_log_if[gi].gnt) begin + end else if (hci_driver_log_if[gi].gnt) begin req_start_cycle_log <= cycle_q + 1; end - if (hci_log_if[gi].req && hci_log_if[gi].gnt) begin + if (hci_driver_log_if[gi].req && hci_driver_log_if[gi].gnt) begin if (req_prev_log) begin sum_req_to_gnt_latency_log_o[gi] <= sum_req_to_gnt_latency_log_o[gi] + real'(cycle_q - req_start_cycle_log); end n_gnt_transactions_log_o[gi] <= n_gnt_transactions_log_o[gi] + 1; - pending_rsp_is_read_log.push_back(hci_log_if[gi].wen); + pending_rsp_is_read_log.push_back(hci_driver_log_if[gi].wen); end - req_prev_log <= hci_log_if[gi].req; + req_prev_log <= hci_driver_log_if[gi].req; - if (hci_log_if[gi].req && hci_log_if[gi].gnt && hci_log_if[gi].wen) begin + if (hci_driver_log_if[gi].req && hci_driver_log_if[gi].gnt && hci_driver_log_if[gi].wen) begin n_read_granted_log_o[gi] <= n_read_granted_log_o[gi] + 1; end - if (hci_log_if[gi].req && hci_log_if[gi].gnt && !hci_log_if[gi].wen) begin + if (hci_driver_log_if[gi].req && hci_driver_log_if[gi].gnt && !hci_driver_log_if[gi].wen) begin n_write_granted_log_o[gi] <= n_write_granted_log_o[gi] + 1; end - if (hci_log_if[gi].r_valid && hci_log_if[gi].r_ready) begin + if (hci_driver_log_if[gi].r_valid && hci_driver_log_if[gi].r_ready) begin if (pending_rsp_is_read_log.size() != 0) begin retired_is_read_log = pending_rsp_is_read_log.pop_front(); if (retired_is_read_log) begin @@ -132,32 +132,32 @@ module latency_monitor #( req_start_cycle_hwpe <= '0; req_prev_hwpe <= 1'b0; end else begin - if (hci_hwpe_if[gi].req && !req_prev_hwpe && 
!hci_hwpe_if[gi].gnt) begin + if (hci_driver_hwpe_if[gi].req && !req_prev_hwpe && !hci_driver_hwpe_if[gi].gnt) begin req_start_cycle_hwpe <= cycle_q; - end else if (hci_hwpe_if[gi].gnt) begin + end else if (hci_driver_hwpe_if[gi].gnt) begin req_start_cycle_hwpe <= cycle_q + 1; end - if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt) begin + if (hci_driver_hwpe_if[gi].req && hci_driver_hwpe_if[gi].gnt) begin if (req_prev_hwpe) begin sum_req_to_gnt_latency_hwpe_o[gi] <= sum_req_to_gnt_latency_hwpe_o[gi] + real'(cycle_q - req_start_cycle_hwpe); end n_gnt_transactions_hwpe_o[gi] <= n_gnt_transactions_hwpe_o[gi] + 1; - pending_rsp_is_read_hwpe.push_back(hci_hwpe_if[gi].wen); + pending_rsp_is_read_hwpe.push_back(hci_driver_hwpe_if[gi].wen); end - req_prev_hwpe <= hci_hwpe_if[gi].req; + req_prev_hwpe <= hci_driver_hwpe_if[gi].req; - if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt && hci_hwpe_if[gi].wen) begin + if (hci_driver_hwpe_if[gi].req && hci_driver_hwpe_if[gi].gnt && hci_driver_hwpe_if[gi].wen) begin n_read_granted_hwpe_o[gi] <= n_read_granted_hwpe_o[gi] + 1; end - if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt && !hci_hwpe_if[gi].wen) begin + if (hci_driver_hwpe_if[gi].req && hci_driver_hwpe_if[gi].gnt && !hci_driver_hwpe_if[gi].wen) begin n_write_granted_hwpe_o[gi] <= n_write_granted_hwpe_o[gi] + 1; end - if (hci_hwpe_if[gi].r_valid && hci_hwpe_if[gi].r_ready) begin + if (hci_driver_hwpe_if[gi].r_valid && hci_driver_hwpe_if[gi].r_ready) begin if (pending_rsp_is_read_hwpe.size() != 0) begin retired_is_read_hwpe = pending_rsp_is_read_hwpe.pop_front(); if (retired_is_read_hwpe) begin diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 7ce7ea2..13cf7ec 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -42,49 +42,38 @@ module tb_hci // HCI // ///////// - /* HCI interfaces */ - localparam hci_size_parameter_t `HCI_SIZE_PARAM(cores) = '{ // CORE + DMA + EXT parameters - DW: DEFAULT_DW, - AW: DEFAULT_AW, - BW: DEFAULT_BW, - 
UW: DEFAULT_UW, - IW: IW, - EW: DEFAULT_EW, - EHW: DEFAULT_EHW + localparam hci_size_parameter_t `HCI_SIZE_PARAM(cores) = '{ + DW: DW_cores, + AW: AW_cores, + BW: BW_cores, + UW: UW_cores, + IW: IW_cores, + EW: EW_cores, + EHW: EHW_cores }; - localparam hci_size_parameter_t `HCI_SIZE_PARAM(mems) = '{ // Bank parameters - DW: DEFAULT_DW, - AW: ADDR_WIDTH_BANK, - BW: DEFAULT_BW, - UW: DEFAULT_UW, - IW: IW, - EW: DEFAULT_EW, - EHW: DEFAULT_EHW + + localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe) = '{ + DW: DW_hwpe, + AW: AW_hwpe, + BW: BW_hwpe, + UW: UW_hwpe, + IW: IW_hwpe, + EW: EW_hwpe, + EHW: EHW_hwpe }; - localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe) = '{ // HWPE parameters - DW: HWPE_WIDTH_FACT*DATA_WIDTH, - AW: DEFAULT_AW, - BW: DEFAULT_BW, - UW: DEFAULT_UW, - IW: IW, - EW: DEFAULT_EW, - EHW: DEFAULT_EHW + + localparam hci_size_parameter_t `HCI_SIZE_PARAM(mems) = '{ + DW: DW_mems, + AW: AW_mems, + BW: BW_mems, + UW: UW_mems, + IW: IW_mems, + EW: EW_mems, + EHW: EHW_mems }; /* Application-driver-side interfaces */ - hci_core_intf #( - .DW(HCI_SIZE_hwpe.DW), - .AW(HCI_SIZE_hwpe.AW), - .BW(HCI_SIZE_hwpe.BW), - .UW(HCI_SIZE_hwpe.UW), - .IW(HCI_SIZE_hwpe.IW), - .EW(HCI_SIZE_hwpe.EW), - .EHW(HCI_SIZE_hwpe.EHW) - ) hci_hwpe_if [0:N_HWPE-1] ( - .clk(clk) - ); - hci_core_intf #( .DW(HCI_SIZE_cores.DW), .AW(HCI_SIZE_cores.AW), @@ -93,12 +82,10 @@ module tb_hci .IW(HCI_SIZE_cores.IW), .EW(HCI_SIZE_cores.EW), .EHW(HCI_SIZE_cores.EHW) - ) hci_log_if [0:N_LOG_MASTERS-1] ( + ) hci_driver_log_if [0:N_LOG_MASTERS-1] ( .clk(clk) ); - /* Interconnect-side master interfaces */ - hci_core_intf #( .DW(HCI_SIZE_hwpe.DW), .AW(HCI_SIZE_hwpe.AW), @@ -107,10 +94,12 @@ module tb_hci .IW(HCI_SIZE_hwpe.IW), .EW(HCI_SIZE_hwpe.EW), .EHW(HCI_SIZE_hwpe.EHW) - ) hci_hwpe_wide_if [0:N_HWPE_MASTERS-1] ( + ) hci_driver_hwpe_if [0:N_HWPE-1] ( .clk(clk) ); + /* Interconnect-side interfaces (hci_system-style organization) */ + hci_core_intf #( .DW(HCI_SIZE_cores.DW), 
.AW(HCI_SIZE_cores.AW), @@ -119,20 +108,19 @@ module tb_hci .IW(HCI_SIZE_cores.IW), .EW(HCI_SIZE_cores.EW), .EHW(HCI_SIZE_cores.EHW) - ) hci_core_if [0:N_CORE+N_HWPE_LOG_MASTERS-1] ( + ) hci_initiator_narrow [0:N_NARROW_HCI-1] ( .clk(clk) ); - // LOG-only intermediate interfaces for HWPE split lanes. hci_core_intf #( - .DW(HCI_SIZE_cores.DW), - .AW(HCI_SIZE_cores.AW), - .BW(HCI_SIZE_cores.BW), - .UW(HCI_SIZE_cores.UW), - .IW(HCI_SIZE_cores.IW), - .EW(HCI_SIZE_cores.EW), - .EHW(HCI_SIZE_cores.EHW) - ) hci_hwpe_log_if [0:N_HWPE_LOG_MASTERS-1] ( + .DW(HCI_SIZE_hwpe.DW), + .AW(HCI_SIZE_hwpe.AW), + .BW(HCI_SIZE_hwpe.BW), + .UW(HCI_SIZE_hwpe.UW), + .IW(HCI_SIZE_hwpe.IW), + .EW(HCI_SIZE_hwpe.EW), + .EHW(HCI_SIZE_hwpe.EHW) + ) hci_initiator_wide [0:N_WIDE_HCI-1] ( .clk(clk) ); @@ -144,7 +132,7 @@ module tb_hci .IW(HCI_SIZE_cores.IW), .EW(HCI_SIZE_cores.EW), .EHW(HCI_SIZE_cores.EHW) - ) hci_dma_if [0:N_DMA-1] ( + ) hci_initiator_dma [0:N_DMA-1] ( .clk(clk) ); @@ -156,12 +144,10 @@ module tb_hci .IW(HCI_SIZE_cores.IW), .EW(HCI_SIZE_cores.EW), .EHW(HCI_SIZE_cores.EHW) - ) hci_ext_if [0:N_EXT-1] ( + ) hci_initiator_ext [0:N_EXT-1] ( .clk(clk) ); - /* Memory interface */ - hci_core_intf #( .DW(HCI_SIZE_mems.DW), .AW(HCI_SIZE_mems.AW), @@ -174,93 +160,109 @@ module tb_hci .WAIVE_RQ4_ASSERT(1'b1), .WAIVE_RSP3_ASSERT(1'b1), .WAIVE_RSP5_ASSERT(1'b1) - ) hci_mem_if [0:N_BANKS-1] ( + ) hci_target_mems [0:N_BANKS-1] ( .clk(clk) ); - /* HCI instance */ - generate - for (genvar ii = 0; ii < N_CORE; ii++) begin : gen_core_to_log - hci_core_assign i_core_to_hci_assign ( - .tcdm_target (hci_log_if[ii]), - .tcdm_initiator (hci_core_if[ii]) + for (genvar ii = 0; ii < N_CORE; ii++) begin : gen_core_to_narrow + hci_core_assign i_core_to_narrow_assign ( + .tcdm_target(hci_driver_log_if[ii]), + .tcdm_initiator(hci_initiator_narrow[ii]) ); end - for (genvar ii = 0; ii < N_DMA; ii++) begin : gen_dma_to_log + + for (genvar ii = 0; ii < N_DMA; ii++) begin : gen_dma_to_hci hci_core_assign 
i_dma_to_hci_assign ( - .tcdm_target (hci_log_if[N_CORE + ii]), - .tcdm_initiator (hci_dma_if[ii]) + .tcdm_target(hci_driver_log_if[N_CORE + ii]), + .tcdm_initiator(hci_initiator_dma[ii]) ); end - for (genvar ii = 0; ii < N_EXT; ii++) begin : gen_ext_to_log + + for (genvar ii = 0; ii < N_EXT; ii++) begin : gen_ext_to_hci hci_core_assign i_ext_to_hci_assign ( - .tcdm_target (hci_log_if[N_CORE + N_DMA + ii]), - .tcdm_initiator (hci_ext_if[ii]) + .tcdm_target(hci_driver_log_if[N_CORE + N_DMA + ii]), + .tcdm_initiator(hci_initiator_ext[ii]) ); end - if (INTERCO_TYPE == HCI) begin : gen_hwpe_to_hci - for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_to_hci - hci_core_assign i_hwpe_to_hci_assign ( - .tcdm_target (hci_hwpe_if[ii]), - .tcdm_initiator (hci_hwpe_wide_if[ii]) + if (INTERCO_TYPE == HCI) begin : gen_hwpe_hci + for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_hci_assign + hci_core_assign i_hwpe_hci_assign ( + .tcdm_target(hci_driver_hwpe_if[ii]), + .tcdm_initiator(hci_initiator_wide[ii]) ); end - end else if (INTERCO_TYPE == LOG) begin : gen_hwpe_to_log - for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_to_log - hci_core_split #( - .DW(HWPE_WIDTH_FACT * DATA_WIDTH), - .BW(DATA_WIDTH / 8), - .UW(1), - .NB_OUT_CHAN(HWPE_WIDTH_FACT), - .FIFO_DEPTH(2), - .`HCI_SIZE_PARAM(tcdm_target)(HCI_SIZE_hwpe) - ) i_hwpe_to_log_split ( - .clk_i(clk), - .rst_ni(rst_n), - .clear_i(s_clear), - .tcdm_target(hci_hwpe_if[ii]), - .tcdm_initiator(hci_hwpe_log_if[ii * HWPE_WIDTH_FACT : (ii + 1) * HWPE_WIDTH_FACT - 1]) - ); + end else if (INTERCO_TYPE == MUX) begin : gen_hwpe_mux + hci_core_mux_static #( + .NB_CHAN(N_HWPE), + .`HCI_SIZE_PARAM(in)(HCI_SIZE_hwpe) + ) i_hwpe_mux ( + .clk_i(clk), + .rst_ni(rst_n), + .clear_i(s_clear), + .sel_i('0), + .in(hci_driver_hwpe_if), + .out(hci_initiator_wide[0]) + ); + end else if (INTERCO_TYPE == LOG) begin : gen_hwpe_split + for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_split_per_master + for (genvar f = 0; f < 
HWPE_WIDTH_FACT; f++) begin : gen_hwpe_split_lane + localparam int IDX = N_CORE + ii * HWPE_WIDTH_FACT + f; + + assign hci_initiator_narrow[IDX].req = hci_driver_hwpe_if[ii].req; + assign hci_initiator_narrow[IDX].add = hci_driver_hwpe_if[ii].add + f * WORD_SIZE; + assign hci_initiator_narrow[IDX].wen = hci_driver_hwpe_if[ii].wen; + assign hci_initiator_narrow[IDX].data = hci_driver_hwpe_if[ii].data[(f + 1) * WORD_SIZE * 8 - 1 : f * WORD_SIZE * 8]; + assign hci_initiator_narrow[IDX].be = hci_driver_hwpe_if[ii].be[(f + 1) * WORD_SIZE - 1 : f * WORD_SIZE]; + assign hci_initiator_narrow[IDX].r_ready = hci_driver_hwpe_if[ii].r_ready; + assign hci_initiator_narrow[IDX].user = hci_driver_hwpe_if[ii].user; + assign hci_initiator_narrow[IDX].id = hci_driver_hwpe_if[ii].id; + assign hci_driver_hwpe_if[ii].r_data[(f + 1) * WORD_SIZE * 8 - 1 : f * WORD_SIZE * 8] = hci_initiator_narrow[IDX].r_data; + assign hci_initiator_narrow[IDX].ecc = hci_driver_hwpe_if[ii].ecc; + assign hci_initiator_narrow[IDX].ereq = hci_driver_hwpe_if[ii].ereq; + assign hci_initiator_narrow[IDX].r_eready = hci_driver_hwpe_if[ii].r_eready; + end - for (genvar f = 0; f < HWPE_WIDTH_FACT; f++) begin : gen_hwpe_log_rvalid_filter - localparam int unsigned IDX_LOG = ii * HWPE_WIDTH_FACT + f; - localparam int unsigned IDX_CORE = N_CORE + IDX_LOG; - hci_core_r_valid_filter #( - .`HCI_SIZE_PARAM(tcdm_target)(HCI_SIZE_cores) - ) i_hwpe_log_rvalid_filter ( - .clk_i(clk), - .rst_ni(rst_n), - .clear_i(s_clear), - .enable_i(1'b1), - .tcdm_target(hci_hwpe_log_if[IDX_LOG]), - .tcdm_initiator(hci_core_if[IDX_CORE]) - ); + assign hci_driver_hwpe_if[ii].r_user = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_user; + assign hci_driver_hwpe_if[ii].r_id = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_id; + assign hci_driver_hwpe_if[ii].r_opc = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_opc; + assign hci_driver_hwpe_if[ii].r_ecc = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_ecc; + + logic 
[HWPE_WIDTH_FACT-1:0] gnt_vec, rvalid_vec, egnt_vec, revalid_vec; + for (genvar f = 0; f < HWPE_WIDTH_FACT; f++) begin : gen_hwpe_split_red + localparam int IDX = N_CORE + ii * HWPE_WIDTH_FACT + f; + assign gnt_vec[f] = hci_initiator_narrow[IDX].gnt; + assign rvalid_vec[f] = hci_initiator_narrow[IDX].r_valid; + assign egnt_vec[f] = hci_initiator_narrow[IDX].egnt; + assign revalid_vec[f] = hci_initiator_narrow[IDX].r_evalid; end + assign hci_driver_hwpe_if[ii].gnt = &gnt_vec; + assign hci_driver_hwpe_if[ii].r_valid = &rvalid_vec; + assign hci_driver_hwpe_if[ii].egnt = &egnt_vec; + assign hci_driver_hwpe_if[ii].r_evalid = &revalid_vec; end - end else begin - // Error: unsupported for now - $error("Unsupported INTERCO_TYPE"); + end else begin : gen_unsupported_mode + initial $error("Unsupported INTERCO_TYPE"); end endgenerate - assign s_clear = 0; + assign s_clear = 1'b0; assign s_hci_ctrl.arb_policy = ARBITER_MODE; assign s_hci_ctrl.invert_prio = INVERT_PRIO; assign s_hci_ctrl.low_prio_max_stall = LOW_PRIO_MAX_STALL; hci_interconnect #( - .N_HWPE(N_HWPE_MASTERS), // Number of HWPE ports - .N_CORE(N_CORE + N_HWPE_LOG_MASTERS), // Number of CORE ports - .N_DMA(N_DMA), // Number of DMA ports - .N_EXT(N_EXT), // Number of External ports - .N_MEM(N_BANKS), // Number of Memory banks - .TS_BIT(TS_BIT), // TEST_SET_BIT (for Log Interconnect) - .IW(IW), // ID Width - .EXPFIFO(EXPFIFO), // FIFO Depth for HWPE Interconnect - .SEL_LIC(SEL_LIC), // Log interconnect type selector - .FILTER_WRITE_R_VALID ( FILTER_WRITE_R_VALID ), + .N_HWPE(N_WIDE_HCI), + .N_CORE(N_NARROW_HCI), + .N_DMA(N_DMA), + .N_EXT(N_EXT), + .N_MEM(N_BANKS), + .TS_BIT(TS_BIT), + .IW(IW), + .EXPFIFO(EXPFIFO), + .SEL_LIC(SEL_LIC), + .FILTER_WRITE_R_VALID(FILTER_WRITE_R_VALID), .HCI_SIZE_cores(HCI_SIZE_cores), .HCI_SIZE_mems(HCI_SIZE_mems), .HCI_SIZE_hwpe(HCI_SIZE_hwpe), @@ -273,11 +275,11 @@ module tb_hci .rst_ni(rst_n), .clear_i(s_clear), .ctrl_i(s_hci_ctrl), - .cores(hci_core_if), - .dma(hci_dma_if), - 
.ext(hci_ext_if), - .mems(hci_mem_if), - .hwpe(hci_hwpe_wide_if) + .cores(hci_initiator_narrow), + .dma(hci_initiator_dma), + .ext(hci_initiator_ext), + .mems(hci_target_mems), + .hwpe(hci_initiator_wide) ); ////////// @@ -289,13 +291,13 @@ module tb_hci .NbBanks(N_BANKS), .DataWidth(DATA_WIDTH), .AddrWidth(ADDR_WIDTH), - .BeWidth(DATA_WIDTH/8), + .BeWidth(DATA_WIDTH / 8), .IdWidth(IW) ) i_tb_mem ( .clk_i(clk), .rst_ni(rst_n), .test_mode_i(1'b0), - .tcdm_slave(hci_mem_if) + .tcdm_slave(hci_target_mems) ); ///////////////////////// @@ -307,9 +309,8 @@ module tb_hci int unsigned s_issued_transactions[0:N_DRIVERS-1]; int unsigned s_issued_read_transactions[0:N_DRIVERS-1]; - /* CORE + DMA + EXT */ generate - for (genvar ii = 0; ii < N_CORE + N_DMA + N_EXT; ii++) begin : gen_app_driver_log + for (genvar ii = 0; ii < N_LOG_MASTERS; ii++) begin : gen_app_driver_log localparam string STIM_FILE_LOG = $sformatf("../simvectors/generated/stimuli_processed/master_log_%0d.txt", ii); application_driver #( @@ -317,13 +318,13 @@ module tb_hci .IS_HWPE(0), .DATA_WIDTH(DATA_WIDTH), .ADDR_WIDTH(ADDR_WIDTH), - .APPL_DELAY(APPL_DELAY), // Delay on the input signals - .IW(IW), + .APPL_DELAY(APPL_DELAY), + .IW(IW_cores), .STIM_FILE(STIM_FILE_LOG) ) i_app_driver_log ( - .hci_if(hci_log_if[ii]), - .rst_ni(rst_n), .clk_i(clk), + .rst_ni(rst_n), + .hci_if(hci_driver_log_if[ii]), .end_stimuli_o(s_end_stimuli[ii]), .end_latency_o(s_end_latency[ii]), .n_issued_transactions_o(s_issued_transactions[ii]), @@ -332,7 +333,6 @@ module tb_hci end endgenerate - /* HWPE */ generate for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_app_driver_hwpe localparam string STIM_FILE_HWPE = @@ -342,17 +342,17 @@ module tb_hci .IS_HWPE(1), .DATA_WIDTH(HWPE_WIDTH_FACT * DATA_WIDTH), .ADDR_WIDTH(ADDR_WIDTH), - .APPL_DELAY(APPL_DELAY), // Delay on the input signals - .IW(IW), + .APPL_DELAY(APPL_DELAY), + .IW(IW_hwpe), .STIM_FILE(STIM_FILE_HWPE) ) i_app_driver_hwpe ( .clk_i(clk), .rst_ni(rst_n), - 
.hci_if(hci_hwpe_if[ii]), - .end_stimuli_o(s_end_stimuli[N_CORE+N_DMA+N_EXT + ii]), - .end_latency_o(s_end_latency[N_CORE+N_DMA+N_EXT + ii]), - .n_issued_transactions_o(s_issued_transactions[N_CORE+N_DMA+N_EXT + ii]), - .n_issued_read_transactions_o(s_issued_read_transactions[N_CORE+N_DMA+N_EXT + ii]) + .hci_if(hci_driver_hwpe_if[ii]), + .end_stimuli_o(s_end_stimuli[N_LOG_MASTERS + ii]), + .end_latency_o(s_end_latency[N_LOG_MASTERS + ii]), + .n_issued_transactions_o(s_issued_transactions[N_LOG_MASTERS + ii]), + .n_issued_read_transactions_o(s_issued_read_transactions[N_LOG_MASTERS + ii]) ); end endgenerate @@ -361,19 +361,17 @@ module tb_hci // QoS // ///////// - real SUM_REQ_TO_GNT_LATENCY_LOG[N_CORE+N_DMA+N_EXT]; + real SUM_REQ_TO_GNT_LATENCY_LOG[N_LOG_MASTERS]; real SUM_REQ_TO_GNT_LATENCY_HWPE[N_HWPE]; - int unsigned N_GNT_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; + int unsigned N_GNT_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_GNT_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_READ_GRANTED_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; + int unsigned N_READ_GRANTED_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_READ_GRANTED_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_WRITE_GRANTED_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; + int unsigned N_WRITE_GRANTED_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_WRITE_GRANTED_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_READ_COMPLETE_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; + int unsigned N_READ_COMPLETE_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_READ_COMPLETE_TRANSACTIONS_HWPE[N_HWPE]; - /* REAL THROUGHPUT AND SIMULATION TIME */ - real latency_per_master[N_DRIVERS]; real throughput_completed; real stim_latency; @@ -400,15 +398,14 @@ module tb_hci .latency_per_master_o(latency_per_master) ); - /* LATENCY MONITOR */ latency_monitor #( .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE) ) i_latency_monitor ( .clk_i(clk), .rst_ni(rst_n), - .hci_log_if(hci_log_if), - .hci_hwpe_if(hci_hwpe_if), + .hci_driver_log_if(hci_driver_log_if), + 
.hci_driver_hwpe_if(hci_driver_hwpe_if), .sum_req_to_gnt_latency_log_o(SUM_REQ_TO_GNT_LATENCY_LOG), .sum_req_to_gnt_latency_hwpe_o(SUM_REQ_TO_GNT_LATENCY_HWPE), .n_gnt_transactions_log_o(N_GNT_TRANSACTIONS_LOG), @@ -421,11 +418,6 @@ module tb_hci .n_read_complete_hwpe_o(N_READ_COMPLETE_TRANSACTIONS_HWPE) ); - /////////// - // Other // - /////////// - - /* SIMULATION REPORT */ simulation_report i_simulation_report ( .end_stimuli_i(s_end_stimuli), .end_latency_i(s_end_latency), @@ -445,7 +437,6 @@ module tb_hci .n_read_complete_transactions_hwpe_i(N_READ_COMPLETE_TRANSACTIONS_HWPE) ); - /* ASSERTIONS */ localparam int unsigned MAX_BANK_LOCAL_ADDR = TOT_MEM_SIZE * 1024 / N_BANKS - WIDTH_OF_MEMORY_BYTE; @@ -453,15 +444,15 @@ module tb_hci for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_assert_hwpe_address a_hwpe_addr_in_bounds: assert property ( @(posedge clk) - get_bank_local_address(hci_hwpe_if[ii].add) <= MAX_BANK_LOCAL_ADDR + get_bank_local_address(hci_driver_hwpe_if[ii].add) <= MAX_BANK_LOCAL_ADDR ) else begin $display("--------------------------------------------"); $display("Time %0t: Test stopped", $time); $error( "HWPE%0d generated an out-of-bounds address (raw=0x%0h, bank_local=0x%0h, max=0x%0h).", ii, - hci_hwpe_if[ii].add, - get_bank_local_address(hci_hwpe_if[ii].add), + hci_driver_hwpe_if[ii].add, + get_bank_local_address(hci_driver_hwpe_if[ii].add), MAX_BANK_LOCAL_ADDR ); $display("This workload is invalid; rerun with a different workload configuration."); diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index d35ca6e..cfcb3c6 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -19,9 +19,9 @@ package tb_hci_pkg; typedef enum logic [1:0] { - LOG = 2'd0, + LOG = 2'd0, MUX = 2'd1, - HCI = 2'd2 + HCI = 2'd2 } interco_e; ////////////////////////// @@ -29,72 +29,112 @@ package tb_hci_pkg; ////////////////////////// // from verif/config/testbench.mk - /* Timing parameters */ - - localparam time 
CLK_PERIOD = `ifdef CLK_PERIOD `CLK_PERIOD `else 6 `endif; - localparam time APPL_DELAY = 0; - localparam unsigned RST_CLK_CYCLES = `ifdef RST_CLK_CYCLES `RST_CLK_CYCLES `else 10 `endif; - - /* Simulation parameters */ + localparam time CLK_PERIOD = `ifdef CLK_PERIOD `CLK_PERIOD `else 6 `endif; + localparam time APPL_DELAY = 0; + localparam unsigned RST_CLK_CYCLES = `ifdef RST_CLK_CYCLES `RST_CLK_CYCLES `else 10 `endif; // Transaction counts - localparam int unsigned N_TRANSACTION_LOG = `ifdef N_TRANSACTION_LOG `N_TRANSACTION_LOG `else 10 `endif; - localparam int unsigned TRANSACTION_RATIO = `ifdef TRANSACTION_RATIO `TRANSACTION_RATIO `else 1 `endif; + localparam int unsigned N_TRANSACTION_LOG = `ifdef N_TRANSACTION_LOG `N_TRANSACTION_LOG `else 10 `endif; + localparam int unsigned TRANSACTION_RATIO = `ifdef TRANSACTION_RATIO `TRANSACTION_RATIO `else 1 `endif; localparam int unsigned N_TRANSACTION_HWPE = int'(N_TRANSACTION_LOG * TRANSACTION_RATIO); - // TCDM interface parameters - localparam int unsigned RANDOM_GNT = `ifdef RANDOM_GNT `RANDOM_GNT `else 0 `endif; - - // Arbiter configuration - localparam int unsigned ARBITER_MODE = 0; - localparam int unsigned INVERT_PRIO = `ifdef INVERT_PRIO `INVERT_PRIO `else 0 `endif; - localparam int unsigned LOW_PRIO_MAX_STALL = `ifdef LOW_PRIO_MAX_STALL `LOW_PRIO_MAX_STALL `else 3 `endif; + // TCDM and arbitration parameters + localparam int unsigned RANDOM_GNT = `ifdef RANDOM_GNT `RANDOM_GNT `else 0 `endif; + localparam int unsigned ARBITER_MODE = 0; + localparam int unsigned INVERT_PRIO = `ifdef INVERT_PRIO `INVERT_PRIO `else 0 `endif; + localparam int unsigned LOW_PRIO_MAX_STALL = `ifdef LOW_PRIO_MAX_STALL `LOW_PRIO_MAX_STALL `else 3 `endif; - ///////////////////////// - // Hardware parameters // - ///////////////////////// + ///////////////////////////// + // Configurable parameters // + ///////////////////////////// // from verif/config/hardware.mk - /* Config */ - + // Interconnect mode localparam interco_e INTERCO_TYPE = 
`ifdef INTERCO_TYPE `INTERCO_TYPE `else HCI `endif; - // Master port counts - localparam int unsigned N_HWPE = `ifdef N_HWPE `N_HWPE `else 1 `endif; // Number of HWPEs attached to the port - localparam int unsigned N_CORE = `ifdef N_CORE `N_CORE `else 1 `endif; // Number of Core ports - localparam int unsigned N_DMA = `ifdef N_DMA `N_DMA `else 1 `endif; // Number of DMA ports - localparam int unsigned N_EXT = `ifdef N_EXT `N_EXT `else 1 `endif; // Number of External ports + // Number of initiators + localparam int unsigned N_HWPE = `ifdef N_HWPE `N_HWPE `else 1 `endif; + localparam int unsigned N_CORE = `ifdef N_CORE `N_CORE `else 1 `endif; + localparam int unsigned N_DMA = `ifdef N_DMA `N_DMA `else 1 `endif; + localparam int unsigned N_EXT = `ifdef N_EXT `N_EXT `else 1 `endif; // Interconnect configuration - localparam int unsigned TS_BIT = `ifdef TS_BIT `TS_BIT `else 0 `endif; // TEST_SET_BIT (for Log Interconnect) - localparam int unsigned EXPFIFO = `ifdef EXPFIFO `EXPFIFO `else 0 `endif; // FIFO Depth for HWPE Interconnect - localparam int unsigned SEL_LIC = `ifdef SEL_LIC `SEL_LIC `else 0 `endif; // Log interconnect type selector + localparam int unsigned TS_BIT = `ifdef TS_BIT `TS_BIT `else 0 `endif; + localparam int unsigned EXPFIFO = `ifdef EXPFIFO `EXPFIFO `else 0 `endif; + localparam int unsigned SEL_LIC = `ifdef SEL_LIC `SEL_LIC `else 0 `endif; - // Data and memory parameters - localparam int unsigned DATA_WIDTH = `ifdef DATA_WIDTH `DATA_WIDTH `else 32 `endif; // Width of DATA in bits - localparam int unsigned HWPE_WIDTH_FACT = `ifdef HWPE_WIDTH_FACT `HWPE_WIDTH_FACT `else 4 `endif; // Width factor of an HWPE wide-word - localparam int unsigned TOT_MEM_SIZE = `ifdef TOT_MEM_SIZE `TOT_MEM_SIZE `else 32 `endif; // Memory size (kB) + // Memory system configuration + localparam int unsigned N_BANKS = `ifdef N_BANKS `N_BANKS `else 16 `endif; + localparam int unsigned TOT_MEM_SIZE = `ifdef TOT_MEM_SIZE `TOT_MEM_SIZE `else 32 `endif; + localparam int unsigned 
DATA_WIDTH = `ifdef DATA_WIDTH `DATA_WIDTH `else 32 `endif; + localparam int unsigned HWPE_WIDTH_FACT = `ifdef HWPE_WIDTH_FACT `HWPE_WIDTH_FACT `else 4 `endif; - /* Derived parameters */ + ////////////////////////// + // Hardcoded parameters // + ////////////////////////// - localparam int unsigned N_DRIVERS = N_HWPE + N_CORE + N_DMA + N_EXT; // Total number of masters - localparam int unsigned N_BANKS = `ifdef N_BANKS `N_BANKS `else 16 `endif; // Number of memory banks + localparam int unsigned WORD_SIZE = DATA_WIDTH / 8; + + ////////////////////////// + // Dependent parameters // + ////////////////////////// localparam int unsigned N_LOG_MASTERS = N_CORE + N_DMA + N_EXT; - localparam int unsigned N_HWPE_LOG_MASTERS = (INTERCO_TYPE == LOG) ? (N_HWPE * HWPE_WIDTH_FACT) : 0; - localparam int unsigned N_HWPE_MASTERS = (INTERCO_TYPE == HCI) ? N_HWPE : ((INTERCO_TYPE == MUX) ? 1 : 0); + localparam int unsigned N_DRIVERS = N_LOG_MASTERS + N_HWPE; + + // If fully log interconnect is used, instantiate HWPE_WIDTH_FACT narrow ports for each HWPE. + localparam int unsigned N_NARROW_HCI = + N_CORE + (N_HWPE * HWPE_WIDTH_FACT) * (INTERCO_TYPE == LOG); + + // If full HCI is used, instantiate one wide port per HWPE. + // If static MUX is used, instantiate a single shared wide port. + localparam int unsigned N_WIDE_HCI = + (INTERCO_TYPE == HCI) ? N_HWPE : ((INTERCO_TYPE == MUX) ? 
1 : 0); - localparam int unsigned ADDR_WIDTH = $clog2(TOT_MEM_SIZE * 1024); // Width of ADDRESS in bits - localparam int unsigned WIDTH_OF_MEMORY = DATA_WIDTH; // Width of a memory bank (bits) - localparam int unsigned WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8; // Width of a memory bank (bytes) - localparam int unsigned BIT_BANK_INDEX = $clog2(N_BANKS); // Bits of the Bank index - localparam int unsigned ADDR_WIDTH_BANK = ADDR_WIDTH - BIT_BANK_INDEX; // Number of address bits per TCDM bank - localparam int unsigned N_WORDS = (TOT_MEM_SIZE * 1024 / N_BANKS) / WIDTH_OF_MEMORY_BYTE; // Number of words in a bank - localparam int unsigned FILTER_WRITE_R_VALID[0:N_HWPE_MASTERS-1] = '{default: 0}; // Enable filtering of only r_valid respons + // One-hot response ID width used by the interconnect. + localparam int unsigned IW = N_NARROW_HCI + N_WIDE_HCI + N_DMA + N_EXT; - // Keep ID width consistent with stimuli generator. - localparam int unsigned IW = $clog2(N_TRANSACTION_LOG * N_LOG_MASTERS + N_TRANSACTION_HWPE * N_HWPE); - localparam int unsigned TOT_CHECK = N_TRANSACTION_LOG * (N_CORE + N_DMA + N_EXT) + N_HWPE * N_TRANSACTION_HWPE * HWPE_WIDTH_FACT; + localparam int unsigned FILTER_WRITE_R_VALID[0:N_WIDE_HCI-1] = '{default: 0}; + + localparam int unsigned ADDR_WIDTH = $clog2(TOT_MEM_SIZE * 1024); + localparam int unsigned WIDTH_OF_MEMORY = DATA_WIDTH; + localparam int unsigned WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8; + localparam int unsigned BIT_BANK_INDEX = $clog2(N_BANKS); + localparam int unsigned ADDR_WIDTH_BANK = ADDR_WIDTH - BIT_BANK_INDEX; + localparam int unsigned N_WORDS = + (TOT_MEM_SIZE * 1024 / N_BANKS) / WIDTH_OF_MEMORY_BYTE; + + localparam int unsigned TOT_CHECK = + N_TRANSACTION_LOG * N_LOG_MASTERS + + N_HWPE * N_TRANSACTION_HWPE * HWPE_WIDTH_FACT; + + /////////////// + // Bitwidths // + /////////////// + + localparam int unsigned DW_cores = DATA_WIDTH; + localparam int unsigned AW_cores = 32; + localparam int unsigned BW_cores = 8; + localparam 
int unsigned UW_cores = 1; + localparam int unsigned IW_cores = 8; + localparam int unsigned EW_cores = 1; + localparam int unsigned EHW_cores = 1; + + localparam int unsigned DW_hwpe = DW_cores * HWPE_WIDTH_FACT; + localparam int unsigned AW_hwpe = AW_cores; + localparam int unsigned BW_hwpe = BW_cores; + localparam int unsigned UW_hwpe = UW_cores; + localparam int unsigned IW_hwpe = IW_cores; + localparam int unsigned EW_hwpe = EW_cores; + localparam int unsigned EHW_hwpe = EHW_cores; + + localparam int unsigned DW_mems = DW_cores; + localparam int unsigned AW_mems = ADDR_WIDTH_BANK; + localparam int unsigned BW_mems = BW_cores; + localparam int unsigned UW_mems = UW_cores; + localparam int unsigned IW_mems = IW; + localparam int unsigned EW_mems = EW_cores; + localparam int unsigned EHW_mems = EHW_cores; /////////// // Types // @@ -107,35 +147,32 @@ package tb_hci_pkg; } stimuli_t; typedef struct packed { - logic [DATA_WIDTH-1:0] data; + logic [DATA_WIDTH-1:0] data; logic [ADDR_WIDTH_BANK-1:0] add; } out_intc_to_mem_t; - // Helper return type for HWPE address/data creation typedef struct { logic [ADDR_WIDTH-1:0] address; logic [DATA_WIDTH-1:0] data; - logic rolls_over; + logic rolls_over; } hwpe_addr_data_t; ///////////// // Helpers // ///////////// - // Zero-time pure function returning address/data for an HWPE lane function automatic hwpe_addr_data_t create_address_and_data_hwpe( - input logic [ADDR_WIDTH-1:0] address_before, + input logic [ADDR_WIDTH-1:0] address_before, input logic [HWPE_WIDTH_FACT * DATA_WIDTH-1:0] data_before, - input int index, - input logic rolls_over_check_before + input int index, + input logic rolls_over_check_before ); hwpe_addr_data_t ret; logic [BIT_BANK_INDEX-1:0] bank_index_before, bank_index_after; begin - bank_index_before = address_before[BIT_BANK_INDEX-1 + 2 : 2]; - // Legacy behavior: add full-width index and let truncation wrap - bank_index_after = index + bank_index_before; - ret.rolls_over = rolls_over_check_before; + 
bank_index_before = address_before[BIT_BANK_INDEX-1 + 2:2]; + bank_index_after = index + bank_index_before; + ret.rolls_over = rolls_over_check_before; if (bank_index_before > bank_index_after) begin ret.rolls_over = 1'b1; end @@ -157,8 +194,6 @@ package tb_hci_pkg; index = address[BIT_BANK_INDEX-1 + 2:2]; endtask - /* Metrics helpers */ - task calculate_theoretical_throughput(output real throughput_theo); real tot_data, band_memory_limit, tot_time; if (TRANSACTION_RATIO >= 1) begin @@ -166,20 +201,15 @@ package tb_hci_pkg; end else begin tot_time = N_TRANSACTION_LOG; end - tot_data = (N_TRANSACTION_LOG * DATA_WIDTH) * (N_DRIVERS - N_HWPE) + - (N_TRANSACTION_HWPE * HWPE_WIDTH_FACT * DATA_WIDTH) * N_HWPE; // bit - throughput_theo = tot_data / tot_time; // bit per cycle + tot_data = (N_TRANSACTION_LOG * DATA_WIDTH) * N_LOG_MASTERS + + (N_TRANSACTION_HWPE * HWPE_WIDTH_FACT * DATA_WIDTH) * N_HWPE; + throughput_theo = tot_data / tot_time; band_memory_limit = real'(N_BANKS * DATA_WIDTH); if (throughput_theo >= band_memory_limit) begin throughput_theo = band_memory_limit; end endtask - /////////////// - // Functions // - /////////////// - - // Convert a full system address to per-bank local word address. 
function int unsigned get_bank_local_address(input logic [ADDR_WIDTH-1:0] addr_i); logic [ADDR_WIDTH-1:0] mapped_addr; logic [ADDR_WIDTH-BIT_BANK_INDEX-1:0] bank_local_addr; diff --git a/target/verif/src/tcdm_banks_wrap.sv b/target/verif/src/tcdm_banks_wrap.sv index 46c251c..fc22a0b 100644 --- a/target/verif/src/tcdm_banks_wrap.sv +++ b/target/verif/src/tcdm_banks_wrap.sv @@ -92,9 +92,9 @@ module tcdm_banks_wrap #( tcdm_slave[i].r_valid <= 1'b1; end else begin tcdm_slave[i].r_valid <= 1'b0; + end end end - end end endmodule diff --git a/target/verif/vsim/tb_hci.tcl b/target/verif/vsim/tb_hci.tcl index feeefa0..33b1e72 100644 --- a/target/verif/vsim/tb_hci.tcl +++ b/target/verif/vsim/tb_hci.tcl @@ -9,88 +9,87 @@ if {$GUI == 1} { set N_HWPE [examine -radix dec /tb_hci_pkg/N_HWPE] set N_BANKS [examine -radix dec /tb_hci_pkg/N_BANKS] - set INTERCO_TYPE [examine /tb_hci_pkg/INTERCO_TYPE] set N_LOG_MASTERS [examine -radix dec /tb_hci_pkg/N_LOG_MASTERS] - set N_HWPE_LOG_MASTERS [examine -radix dec /tb_hci_pkg/N_HWPE_LOG_MASTERS] - set N_HWPE_MASTERS [examine -radix dec /tb_hci_pkg/N_HWPE_MASTERS] - + set N_NARROW_HCI [examine -radix dec /tb_hci_pkg/N_NARROW_HCI] + set N_WIDE_HCI [examine -radix dec /tb_hci_pkg/N_WIDE_HCI] set HWPE_WIDTH_FACT [examine -radix dec /tb_hci_pkg/HWPE_WIDTH_FACT] + set INTERCO_TYPE [examine /tb_hci_pkg/INTERCO_TYPE] add wave -noupdate /tb_hci/clk add wave -noupdate /tb_hci/rst_n - # Application driver interfaces - add wave -noupdate -group application_drivers -divider narrow_Cores - for {set i 0} {$i < $N_CORE} {incr i} { - add wave -noupdate -group application_drivers -group core_$i /tb_hci/hci_log_if[$i]/* + # Application-driver interfaces + add wave -noupdate -group application_drivers -divider log_masters + for {set i 0} {$i < $N_LOG_MASTERS} {incr i} { + add wave -noupdate -group application_drivers -group log_$i /tb_hci/hci_driver_log_if[$i]/* } - add wave -noupdate -group application_drivers -divider wide_HWPEs + + add wave -noupdate 
-group application_drivers -divider hwpe_masters for {set i 0} {$i < $N_HWPE} {incr i} { - add wave -noupdate -group application_drivers -group hwpe_$i /tb_hci/hci_hwpe_if[$i]/* - } - add wave -noupdate -group application_drivers -divider narrow_DMA - for {set i 0} {$i < $N_DMA} {incr i} { - set idx [expr {$i + $N_CORE}] - add wave -noupdate -group application_drivers -group dma_$i /tb_hci/hci_log_if[$idx]/* - } - add wave -noupdate -group application_drivers -divider narrow_External - for {set i 0} {$i < $N_EXT} {incr i} { - set idx [expr {$i + $N_CORE + $N_DMA}] - add wave -noupdate -group application_drivers -group ext_$i /tb_hci/hci_log_if[$idx]/* + add wave -noupdate -group application_drivers -group hwpe_$i /tb_hci/hci_driver_hwpe_if[$i]/* } - # HCI-side interfaces - add wave -noupdate -group hci_interfaces -divider narrow_Cores + + # Interconnect-side interfaces + add wave -noupdate -group hci_interfaces -divider narrow_cores for {set i 0} {$i < $N_CORE} {incr i} { - add wave -noupdate -group hci_interfaces -group core_$i /tb_hci/hci_core_if[$i]/* + add wave -noupdate -group hci_interfaces -group core_$i /tb_hci/hci_initiator_narrow[$i]/* } - if {$INTERCO_TYPE == {LOG}} { - add wave -noupdate -group hci_interfaces -divider narrow_HWPEs_split + + if {[string first "LOG" $INTERCO_TYPE] != -1} { + add wave -noupdate -group hci_interfaces -divider narrow_hwpe_split for {set i 0} {$i < $N_HWPE} {incr i} { for {set f 0} {$f < $HWPE_WIDTH_FACT} {incr f} { - set idx [expr {$N_CORE + $i*$HWPE_WIDTH_FACT + $f}] - add wave -noupdate -group hci_interfaces -group hwpe_$i -group word_$f /tb_hci/hci_core_if[$idx]/* + set idx [expr {$N_CORE + $i * $HWPE_WIDTH_FACT + $f}] + if {$idx < $N_NARROW_HCI} { + add wave -noupdate -group hci_interfaces -group hwpe_$i -group lane_$f /tb_hci/hci_initiator_narrow[$idx]/* + } } } - } elseif {$INTERCO_TYPE == {HCI} || $INTERCO_TYPE == {MUX}} { - add wave -noupdate -group hci_interfaces -divider wide_HWPEs - for {set i 0} {$i < 
$N_HWPE_MASTERS} {incr i} { - add wave -noupdate -group hci_interfaces -group hwpe_$i /tb_hci/hci_hwpe_wide_if[$i]/* + } + + if {$N_WIDE_HCI > 0} { + add wave -noupdate -group hci_interfaces -divider wide_hwpe + for {set i 0} {$i < $N_WIDE_HCI} {incr i} { + add wave -noupdate -group hci_interfaces -group wide_$i /tb_hci/hci_initiator_wide[$i]/* } - } else { - echo "WARNING: unsupported INTERCO_TYPE value: $INTERCO_TYPE_SYM ($INTERCO_TYPE)" } - add wave -noupdate -group hci_interfaces -divider narrow_DMA + + add wave -noupdate -group hci_interfaces -divider dma for {set i 0} {$i < $N_DMA} {incr i} { - add wave -noupdate -group hci_interfaces -group dma_$i /tb_hci/hci_dma_if[$i]/* + add wave -noupdate -group hci_interfaces -group dma_$i /tb_hci/hci_initiator_dma[$i]/* } - add wave -noupdate -group hci_interfaces -divider narrow_External + + add wave -noupdate -group hci_interfaces -divider ext for {set i 0} {$i < $N_EXT} {incr i} { - add wave -noupdate -group hci_interfaces -group ext_$i /tb_hci/hci_ext_if[$i]/* + add wave -noupdate -group hci_interfaces -group ext_$i /tb_hci/hci_initiator_ext[$i]/* } + # Memory slaves add wave -noupdate -group memory_slaves for {set i 0} {$i < $N_BANKS} {incr i} { - add wave -noupdate -group memory_slaves -group bank_$i /tb_hci/hci_mem_if[$i]/* + add wave -noupdate -group memory_slaves -group bank_$i /tb_hci/hci_target_mems[$i]/* } + # Metrics add wave -noupdate -group metrics /tb_hci/s_end_latency add wave -noupdate -group metrics /tb_hci/s_end_stimuli - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/s_issued_read_transactions[0]} {-format Analog-Step -height 84 -max 472.0} {/tb_hci/s_issued_read_transactions[1]} {-format Analog-Step -height 84 -max 493.0} {/tb_hci/s_issued_read_transactions[2]} {-format Analog-Step -height 84 -max 506.0} {/tb_hci/s_issued_read_transactions[3]} {-format Analog-Step -height 84 -max 481.0} {/tb_hci/s_issued_read_transactions[4]} {-format Analog-Step -height 84 -max 524.0} 
{/tb_hci/s_issued_read_transactions[5]} {-format Analog-Step -height 84 -max 512.0} {/tb_hci/s_issued_read_transactions[6]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/s_issued_read_transactions[7]} {-format Analog-Step -height 84 -max 518.0} {/tb_hci/s_issued_read_transactions[8]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/s_issued_read_transactions[9]} {-format Analog-Step -height 84 -max 503.0} {/tb_hci/s_issued_read_transactions[10]} {-format Analog-Step -height 84 -max 489.0} {/tb_hci/s_issued_read_transactions[11]} {-format Analog-Step -height 84 -max 493.0}} /tb_hci/s_issued_read_transactions - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/s_issued_transactions[0]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[1]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[2]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[3]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[4]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[5]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[6]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[7]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[8]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[9]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[10]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[11]} {-format Analog-Step -height 84 -max 1000.0}} /tb_hci/s_issued_transactions + add wave -noupdate -group metrics /tb_hci/s_issued_transactions + add wave -noupdate -group metrics /tb_hci/s_issued_read_transactions add wave -noupdate -group metrics /tb_hci/stim_latency add wave -noupdate -group metrics /tb_hci/tot_latency add wave -noupdate -group metrics /tb_hci/latency_per_master add wave -noupdate 
-group metrics /tb_hci/throughput_completed - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_GNT_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 1000.0}} /tb_hci/N_GNT_TRANSACTIONS_HWPE - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_GNT_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 1000.0}} /tb_hci/N_GNT_TRANSACTIONS_LOG - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 489.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 493.0}} /tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 472.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 493.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 506.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 481.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 524.0} 
{/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 512.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 518.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 503.0}} /tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 489.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 493.0}} /tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 472.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 493.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 506.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 481.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 524.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 512.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 518.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 503.0}} /tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 511.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 507.0}} 
/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 528.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 507.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 494.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 519.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 475.99999999999994} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 488.00000000000006} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 508.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 482.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 508.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 497.0}} /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE[0]} {-format Analog-Step -height 84 -max 1327824.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE[1]} {-format Analog-Step -height 84 -max 1336050.0}} /tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[0]} {-format Analog-Step -height 84 -max 580398.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[1]} {-format Analog-Step -height 84 -max 580783.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[2]} {-format Analog-Step -height 84 -max 596101.00000000012} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[3]} {-format Analog-Step -height 84 -max 590598.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[4]} {-format Analog-Step -height 84 -max 603272.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[5]} {-format Analog-Step -height 84 -max 601733.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[6]} {-format Analog-Step -height 84 -max 596110.0} 
{/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[7]} {-format Analog-Step -height 84 -max 593715.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[8]} {-format Analog-Step -height 84 -max 602346.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[9]} {-format Analog-Step -height 84 -max 597075.0}} /tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG + add wave -noupdate -group metrics /tb_hci/N_GNT_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_GNT_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG + add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE + add wave -noupdate /tb_hci/s_clear add wave -noupdate /tb_hci/s_hci_ctrl From e5357a0ca9fc697076d2dc619cc38ac3062dae9f Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Thu, 5 Mar 2026 17:54:56 +0100 Subject: [PATCH 07/25] tb: Remove fifo from app_driver, remove r_valid gen from tcdm, reintroduce hci_core_split - Revert application_driver to initial state, which does not employ FIFOs at interface - Remove r_valid generation from TCDM wrapper (r_valid is generated in the LIC, generating it in TCDM wrapper creates HCI violation asserts) - Use hci_core_split instead of manual splitting of wide HWPE interfaces (higher perf thanks to FIFOs --- target/verif/src/application_driver.sv | 96 +++++++------------------- target/verif/src/tb_hci.sv | 48 ++++--------- target/verif/src/tcdm_banks_wrap.sv | 22 +++--- 3 files changed, 51 insertions(+), 115 deletions(-) diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv 
index fd69399..c3b7ffa 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -24,8 +24,6 @@ module application_driver #( parameter int unsigned ADDR_WIDTH = 1, parameter int unsigned APPL_DELAY = 2, // Delay on the input signals parameter int unsigned IW = 1, - parameter bit USE_STREAMER_FIFO = 1'b1, - parameter int unsigned STREAMER_FIFO_DEPTH = 2, parameter string STIM_FILE = "" ) ( input logic clk_i, @@ -47,48 +45,6 @@ module application_driver #( logic [ADDR_WIDTH-1:0] add; int unsigned n_completed_read_transactions; logic pending_rsp_is_read[$]; - localparam hci_package::hci_size_parameter_t HCI_SIZE_driver = '{ - DW: DATA_WIDTH, - AW: ADDR_WIDTH, - BW: hci_package::DEFAULT_BW, - UW: hci_package::DEFAULT_UW, - IW: IW, - EW: hci_package::DEFAULT_EW, - EHW: hci_package::DEFAULT_EHW - }; - - hci_core_intf #( - .DW(HCI_SIZE_driver.DW), - .AW(HCI_SIZE_driver.AW), - .BW(HCI_SIZE_driver.BW), - .UW(HCI_SIZE_driver.UW), - .IW(HCI_SIZE_driver.IW), - .EW(HCI_SIZE_driver.EW), - .EHW(HCI_SIZE_driver.EHW) - ) hci_drv_if ( - .clk(clk_i) - ); - - generate - if (USE_STREAMER_FIFO) begin : gen_driver_streamer_fifo - hci_core_fifo #( - .FIFO_DEPTH(STREAMER_FIFO_DEPTH), - .HCI_SIZE_tcdm_initiator(HCI_SIZE_driver) - ) i_driver_streamer_fifo ( - .clk_i(clk_i), - .rst_ni(rst_ni), - .clear_i(1'b0), - .flags_o(), - .tcdm_target(hci_drv_if), - .tcdm_initiator(hci_if) - ); - end else begin : gen_driver_direct - hci_core_assign i_driver_direct_assign ( - .tcdm_target(hci_drv_if), - .tcdm_initiator(hci_if) - ); - end - endgenerate always_ff @(posedge clk_i or negedge rst_ni) begin : proc_read_response_counter logic retired_is_read; @@ -96,10 +52,10 @@ module application_driver #( n_completed_read_transactions <= '0; pending_rsp_is_read.delete(); end else begin - if (hci_drv_if.req && hci_drv_if.gnt) begin - pending_rsp_is_read.push_back(hci_drv_if.wen); + if (hci_if.req && hci_if.gnt) begin + pending_rsp_is_read.push_back(hci_if.wen); end - if 
(hci_drv_if.r_valid && hci_drv_if.r_ready) begin + if (hci_if.r_valid && hci_if.r_ready) begin if (pending_rsp_is_read.size() != 0) begin retired_is_read = pending_rsp_is_read.pop_front(); if (retired_is_read) begin @@ -111,17 +67,17 @@ module application_driver #( end initial begin : proc_application_driver - hci_drv_if.id = '0; - hci_drv_if.add = '0; - hci_drv_if.data = '0; - hci_drv_if.req = 1'b0; - hci_drv_if.wen = 1'b0; - hci_drv_if.ecc = '0; - hci_drv_if.ereq = '0; - hci_drv_if.r_eready = '0; - hci_drv_if.be = '1; - hci_drv_if.r_ready = 1'b1; - hci_drv_if.user = '0; + hci_if.id = '0; + hci_if.add = '0; + hci_if.data = '0; + hci_if.req = 1'b0; + hci_if.wen = 1'b0; + hci_if.ecc = '0; + hci_if.ereq = '0; + hci_if.r_eready = '0; + hci_if.be = '1; + hci_if.r_ready = 1'b1; + hci_if.user = '0; end_stimuli_o = 1'b0; end_latency_o = 1'b0; n_issued_transactions_o = '0; @@ -157,25 +113,25 @@ module application_driver #( break; end #(APPL_DELAY); - hci_drv_if.id = id; - hci_drv_if.data = data; - hci_drv_if.add = add; - hci_drv_if.wen = wen; - hci_drv_if.req = req; + hci_if.id = id; + hci_if.data = data; + hci_if.add = add; + hci_if.wen = wen; + hci_if.req = req; if (req) begin - @(posedge clk_i iff hci_drv_if.gnt); + @(posedge clk_i iff hci_if.gnt); n_issued_transactions_o++; if (wen) begin n_issued_read_transactions_o++; end // Deassert in NBA region so monitors sampling this edge see the handshake. 
- hci_drv_if.id <= '0; - hci_drv_if.data <= '0; - hci_drv_if.add <= '0; - hci_drv_if.wen <= 1'b0; - hci_drv_if.req <= 1'b0; - wait (hci_drv_if.req == 1'b0); + hci_if.id <= '0; + hci_if.data <= '0; + hci_if.add <= '0; + hci_if.wen <= 1'b0; + hci_if.req <= 1'b0; + wait (hci_if.req == 1'b0); end else begin @(posedge clk_i); end diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 13cf7ec..0e51a48 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -207,40 +207,20 @@ module tb_hci ); end else if (INTERCO_TYPE == LOG) begin : gen_hwpe_split for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_split_per_master - for (genvar f = 0; f < HWPE_WIDTH_FACT; f++) begin : gen_hwpe_split_lane - localparam int IDX = N_CORE + ii * HWPE_WIDTH_FACT + f; - - assign hci_initiator_narrow[IDX].req = hci_driver_hwpe_if[ii].req; - assign hci_initiator_narrow[IDX].add = hci_driver_hwpe_if[ii].add + f * WORD_SIZE; - assign hci_initiator_narrow[IDX].wen = hci_driver_hwpe_if[ii].wen; - assign hci_initiator_narrow[IDX].data = hci_driver_hwpe_if[ii].data[(f + 1) * WORD_SIZE * 8 - 1 : f * WORD_SIZE * 8]; - assign hci_initiator_narrow[IDX].be = hci_driver_hwpe_if[ii].be[(f + 1) * WORD_SIZE - 1 : f * WORD_SIZE]; - assign hci_initiator_narrow[IDX].r_ready = hci_driver_hwpe_if[ii].r_ready; - assign hci_initiator_narrow[IDX].user = hci_driver_hwpe_if[ii].user; - assign hci_initiator_narrow[IDX].id = hci_driver_hwpe_if[ii].id; - assign hci_driver_hwpe_if[ii].r_data[(f + 1) * WORD_SIZE * 8 - 1 : f * WORD_SIZE * 8] = hci_initiator_narrow[IDX].r_data; - assign hci_initiator_narrow[IDX].ecc = hci_driver_hwpe_if[ii].ecc; - assign hci_initiator_narrow[IDX].ereq = hci_driver_hwpe_if[ii].ereq; - assign hci_initiator_narrow[IDX].r_eready = hci_driver_hwpe_if[ii].r_eready; - end - - assign hci_driver_hwpe_if[ii].r_user = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_user; - assign hci_driver_hwpe_if[ii].r_id = hci_initiator_narrow[N_CORE + ii * 
HWPE_WIDTH_FACT].r_id; - assign hci_driver_hwpe_if[ii].r_opc = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_opc; - assign hci_driver_hwpe_if[ii].r_ecc = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_ecc; - - logic [HWPE_WIDTH_FACT-1:0] gnt_vec, rvalid_vec, egnt_vec, revalid_vec; - for (genvar f = 0; f < HWPE_WIDTH_FACT; f++) begin : gen_hwpe_split_red - localparam int IDX = N_CORE + ii * HWPE_WIDTH_FACT + f; - assign gnt_vec[f] = hci_initiator_narrow[IDX].gnt; - assign rvalid_vec[f] = hci_initiator_narrow[IDX].r_valid; - assign egnt_vec[f] = hci_initiator_narrow[IDX].egnt; - assign revalid_vec[f] = hci_initiator_narrow[IDX].r_evalid; - end - assign hci_driver_hwpe_if[ii].gnt = &gnt_vec; - assign hci_driver_hwpe_if[ii].r_valid = &rvalid_vec; - assign hci_driver_hwpe_if[ii].egnt = &egnt_vec; - assign hci_driver_hwpe_if[ii].r_evalid = &revalid_vec; + hci_core_split #( + .DW(HWPE_WIDTH_FACT * DATA_WIDTH), + .BW(DATA_WIDTH / 8), + .UW(1), + .NB_OUT_CHAN(HWPE_WIDTH_FACT), + .FIFO_DEPTH(2), + .`HCI_SIZE_PARAM(tcdm_target)(HCI_SIZE_hwpe) + ) i_hwpe_to_log_split ( + .clk_i(clk), + .rst_ni(rst_n), + .clear_i(s_clear), + .tcdm_target(hci_driver_hwpe_if[ii]), + .tcdm_initiator(hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT : N_CORE + (ii + 1) * HWPE_WIDTH_FACT - 1]) + ); end end else begin : gen_unsupported_mode initial $error("Unsupported INTERCO_TYPE"); diff --git a/target/verif/src/tcdm_banks_wrap.sv b/target/verif/src/tcdm_banks_wrap.sv index fc22a0b..dd9e223 100644 --- a/target/verif/src/tcdm_banks_wrap.sv +++ b/target/verif/src/tcdm_banks_wrap.sv @@ -84,17 +84,17 @@ module tcdm_banks_wrap #( ); //r_valid - always_ff @(posedge clk_i or negedge rst_ni) begin : rvalid_gen - if(~rst_ni) begin - tcdm_slave[i].r_valid <= 1'b0; - end else begin - if(tcdm_slave[i].req && tcdm_slave[i].gnt && tcdm_slave[i].wen) begin - tcdm_slave[i].r_valid <= 1'b1; - end else begin - tcdm_slave[i].r_valid <= 1'b0; - end - end - end + // always_ff @(posedge clk_i or negedge 
rst_ni) begin : rvalid_gen + // if(~rst_ni) begin + // tcdm_slave[i].r_valid <= 1'b0; + // end else begin + // if(tcdm_slave[i].req && tcdm_slave[i].gnt && tcdm_slave[i].wen) begin + // tcdm_slave[i].r_valid <= 1'b1; + // end else begin + // tcdm_slave[i].r_valid <= 1'b0; + // end + // end + // end end endmodule From cfcaf1ade84d18e1545507d2960a6f4ce161af66 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Thu, 5 Mar 2026 17:55:28 +0100 Subject: [PATCH 08/25] rtl: :bug: Fix r_ready masking in hci_core_split --- rtl/core/hci_core_split.sv | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rtl/core/hci_core_split.sv b/rtl/core/hci_core_split.sv index c0c96fb..88e2521 100644 --- a/rtl/core/hci_core_split.sv +++ b/rtl/core/hci_core_split.sv @@ -224,7 +224,11 @@ module hci_core_split end // r_ready masking - assign tcdm_initiator_lrdy_masked_d = cs_rvalid==NO_RVALID ? tcdm_initiator_lrdy_masked_q | tcdm_initiator_r_valid | ~tcdm_initiator_req : tcdm_initiator_r_valid | ~tcdm_initiator_req; + // Track lanes that have produced a response for the current split transaction. + // Using "~req" here can mark a lane as completed before r_valid is observed. + assign tcdm_initiator_lrdy_masked_d = + cs_rvalid==NO_RVALID ? 
tcdm_initiator_lrdy_masked_q | tcdm_initiator_r_valid + : tcdm_initiator_r_valid; always_ff @(posedge clk_i or negedge rst_ni) begin if(~rst_ni) begin From e5bd4f44aedb6f131f07806efce068700fea2a0e Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Fri, 6 Mar 2026 18:51:04 +0100 Subject: [PATCH 09/25] src,config: Rewrite app_drivers with FSMs + improve json config make --- target/verif/bender.mk | 6 +- target/verif/config/hardware.json | 10 +- target/verif/src/application_driver.sv | 361 +++++++++++++++++++------ target/verif/src/tb_hci.sv | 24 +- target/verif/verif.mk | 26 +- 5 files changed, 310 insertions(+), 117 deletions(-) diff --git a/target/verif/bender.mk b/target/verif/bender.mk index 8ccc663..6b1fa59 100644 --- a/target/verif/bender.mk +++ b/target/verif/bender.mk @@ -12,12 +12,12 @@ VERIF_DEFS += \ -D N_CORE=$(N_CORE) \ -D N_DMA=$(N_DMA) \ -D N_EXT=$(N_EXT) \ - -D TS_BIT=$(TS_BIT) \ - -D EXPFIFO=$(EXPFIFO) \ - -D SEL_LIC=$(SEL_LIC) \ -D DATA_WIDTH=$(DATA_WIDTH) \ -D TOT_MEM_SIZE=$(TOT_MEM_SIZE) \ -D N_BANKS=$(N_BANKS) \ + -D TS_BIT=$(TS_BIT) \ + -D EXPFIFO=$(EXPFIFO) \ + -D SEL_LIC=$(SEL_LIC) \ -D N_TRANSACTION_LOG=$(N_TRANSACTION_LOG) \ -D TRANSACTION_RATIO=$(TRANSACTION_RATIO) \ -D CLK_PERIOD=$(CLK_PERIOD) \ diff --git a/target/verif/config/hardware.json b/target/verif/config/hardware.json index 028c714..b6c52a5 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -4,14 +4,14 @@ "N_HWPE": 2, "HWPE_WIDTH_FACT": 8, "N_CORE": 8, - "N_DMA": 1, + "N_DMA": 0, "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 128, + "N_BANKS": 32, "INTERCO_TYPE": "LOG", "TS_BIT": 21, "EXPFIFO": 0, - "SEL_LIC": 0, - "DATA_WIDTH": 32, - "TOT_MEM_SIZE": 32, - "N_BANKS": 64 + "SEL_LIC": 0 } } diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index c3b7ffa..d921563 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -19,54 +19,112 @@ module 
application_driver #( parameter int unsigned MASTER_NUMBER = 1, - parameter int unsigned IS_HWPE = 1, parameter int unsigned DATA_WIDTH = 1, parameter int unsigned ADDR_WIDTH = 1, - parameter int unsigned APPL_DELAY = 2, // Delay on the input signals parameter int unsigned IW = 1, parameter string STIM_FILE = "" ) ( input logic clk_i, input logic rst_ni, + input logic clear_i, // used to gate the driver or to reset it hci_core_intf.initiator hci_if, - output logic end_stimuli_o, - output logic end_latency_o, - output int unsigned n_issued_transactions_o, - output int unsigned n_issued_read_transactions_o + output logic end_req_o, + output logic end_resp_o, + output int unsigned n_issued_tr_o, + output int unsigned n_issued_rd_tr_o, + output int unsigned n_retired_rd_tr_o ); - logic [IW-1:0] id; - string file_path; - int stim; - int scan_status; - logic wen; - logic req; - logic [DATA_WIDTH-1:0] data; - logic [ADDR_WIDTH-1:0] add; - int unsigned n_completed_read_transactions; - logic pending_rsp_is_read[$]; - - always_ff @(posedge clk_i or negedge rst_ni) begin : proc_read_response_counter - logic retired_is_read; - if (!rst_ni) begin - n_completed_read_transactions <= '0; - pending_rsp_is_read.delete(); + int unsigned n_req_issued_q, n_req_issued_d; // total number of issued requests + int unsigned n_rd_req_issued_q, n_rd_req_issued_d; // total number of issued read requests + int unsigned n_rd_resp_retired_q, n_rd_resp_retired_d; // total number of retired read responses + + // Transaction queue from file + typedef struct { + logic req; + logic [IW-1:0] id; + logic wen; + logic [DATA_WIDTH-1:0] data; + logic [ADDR_WIDTH-1:0] add; + } transaction_t; + transaction_t transactions[$]; + + // Fill up the queue by reading the stimuli file until the end + initial begin + string file_path; + int stim; + int scan_status; + + if (STIM_FILE != "") begin + file_path = STIM_FILE; end else begin - if (hci_if.req && hci_if.gnt) begin - pending_rsp_is_read.push_back(hci_if.wen); 
- end - if (hci_if.r_valid && hci_if.r_ready) begin - if (pending_rsp_is_read.size() != 0) begin - retired_is_read = pending_rsp_is_read.pop_front(); - if (retired_is_read) begin - n_completed_read_transactions <= n_completed_read_transactions + 1; - end + $fatal("ERROR: Specify STIM_FILE path"); + end + // Open file + stim = $fopen(file_path, "r"); + if (stim == 0) begin + $fatal("ERROR: Could not open stimuli file"); + end + // Read every line + while (!$feof(stim)) begin + transaction_t transaction; + scan_status = $fscanf(stim, "%b %b %b %b %b\n", transaction.req, transaction.id, transaction.wen, transaction.data, transaction.add); + if (scan_status != 5) begin + if (!$feof(stim)) begin + $fatal(1, "ERROR: malformed stimuli line in %s", file_path); end + break; end + // First-in, first-out queue + transactions.push_back(transaction); + end + $fclose(stim); + end + + ////////////////// + // Requests FSM // + ////////////////// + + typedef enum logic [1:0] { + REQ_IDLE, + WAIT_GNT, + REQ_DONE, + RSP_DONE + } req_state_t; + + req_state_t req_state_q, req_state_d; + int unsigned tr_idx_q, tr_idx_d; // transaction ID + int unsigned last_op_issued_q, last_op_issued_d; // ID of the last issued operation (read or write) + + assign n_issued_tr_o = n_req_issued_q; + assign n_issued_rd_tr_o = n_rd_req_issued_q; + + // Sequential logic + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni || clear_i) begin + req_state_q <= REQ_IDLE; + tr_idx_q <= '0; + n_req_issued_q <= '0; + n_rd_req_issued_q <= '0; + last_op_issued_q <= '0; + end else begin + req_state_q <= req_state_d; + tr_idx_q <= tr_idx_d; + n_req_issued_q <= n_req_issued_d; + n_rd_req_issued_q <= n_rd_req_issued_d; + last_op_issued_q <= last_op_issued_d; end end - initial begin : proc_application_driver + // Combinational state logic + always_comb begin + // Defaults (FSM) + req_state_d = req_state_q; + tr_idx_d = tr_idx_q; + n_req_issued_d = n_req_issued_q; + n_rd_req_issued_d = n_rd_req_issued_q; + 
last_op_issued_d = last_op_issued_q; + // Defaults (HCI outputs) hci_if.id = '0; hci_if.add = '0; hci_if.data = '0; @@ -78,68 +136,197 @@ module application_driver #( hci_if.be = '1; hci_if.r_ready = 1'b1; hci_if.user = '0; - end_stimuli_o = 1'b0; - end_latency_o = 1'b0; - n_issued_transactions_o = '0; - n_issued_read_transactions_o = '0; + // Defaults (outputs) + end_req_o = 1'b0; + end_resp_o = 1'b0; + // inputs: gnt, r_data, r_valid, r_user, r_id - wait (rst_ni); - if (STIM_FILE != "") begin - file_path = STIM_FILE; - end else begin - if (IS_HWPE) begin - file_path = $sformatf( - "../simvectors/generated/stimuli_processed/master_hwpe_%0d.txt", - MASTER_NUMBER - ); - end else begin - file_path = $sformatf( - "../simvectors/generated/stimuli_processed/master_log_%0d.txt", - MASTER_NUMBER - ); + case (req_state_q) + REQ_IDLE: begin + // Check if there are still transactions to issue + if (tr_idx_q < transactions.size()) begin + // If so, increase transaction index for next iteration + tr_idx_d = tr_idx_q + 1; + // If this transaction is a request + if (transactions[tr_idx_q].req) begin + hci_if.req = 1'b1; + hci_if.id = transactions[tr_idx_q].id; + hci_if.wen = transactions[tr_idx_q].wen; + hci_if.data = transactions[tr_idx_q].data; + hci_if.add = transactions[tr_idx_q].add; + // Update counters + n_req_issued_d = n_req_issued_q + 1; + if (transactions[tr_idx_q].wen) begin + n_rd_req_issued_d = n_rd_req_issued_q + 1; + end + last_op_issued_d = tr_idx_q; + // If granted already, stay in REQ_IDLE, otherwise go to WAIT_GNT + if (hci_if.gnt) begin + req_state_d = REQ_IDLE; + end else begin + req_state_d = WAIT_GNT; + end + end + end else begin + // If no more transactions to issue + if (n_rd_req_issued_q > n_rd_resp_retired_q) begin + // If there are still read responses to retire, wait for them before finishing + req_state_d = REQ_DONE; + end else begin + req_state_d = RSP_DONE; + end + end + // Synchronously clear + if (clear_i) begin + req_state_d = REQ_IDLE; + end 
end - end - stim = $fopen(file_path, "r"); - if (stim == 0) begin - $fatal("ERROR: Could not open stimuli file!"); - end - @(posedge clk_i); - while (!$feof(stim)) begin - scan_status = $fscanf(stim, "%b %b %b %b %b\n", req, id, wen, data, add); - if (scan_status != 5) begin - if (!$feof(stim)) begin - $fatal(1, "ERROR: malformed stimuli line in %s", file_path); + WAIT_GNT: begin + hci_if.req = 1'b1; + hci_if.id = transactions[last_op_issued_q].id; + hci_if.wen = transactions[last_op_issued_q].wen; + hci_if.data = transactions[last_op_issued_q].data; + hci_if.add = transactions[last_op_issued_q].add; + // Check if there are still transactions to issue + if (tr_idx_q < transactions.size()) begin + // Check whether to stall or not transaction fetching, in case this is a idle transaction that can hide mem latency + if (!transactions[tr_idx_q].req) begin + tr_idx_d = tr_idx_q + 1; + end else begin + tr_idx_d = tr_idx_q; + end + // If grant received, go back to REQ_IDLE and pop another transaction, otherwise stay in WAIT_GNT + if (hci_if.gnt) begin + req_state_d = REQ_IDLE; + end else begin + req_state_d = WAIT_GNT; + end + end else begin + // If there are no more transactions, wait for last grant and then finish + if (hci_if.gnt) begin + if (transactions[last_op_issued_q].req && transactions[last_op_issued_q].wen) begin + // If the last transaction was a read, we need to wait for its response before finishing + req_state_d = REQ_DONE; + end else begin + // If the last transaction was a write, we can finish right away + req_state_d = RSP_DONE; + end + end else begin + req_state_d = WAIT_GNT; + end + end + // Synchronously clear + if (clear_i) begin + req_state_d = REQ_IDLE; end - break; end - #(APPL_DELAY); - hci_if.id = id; - hci_if.data = data; - hci_if.add = add; - hci_if.wen = wen; - hci_if.req = req; - - if (req) begin - @(posedge clk_i iff hci_if.gnt); - n_issued_transactions_o++; - if (wen) begin - n_issued_read_transactions_o++; + REQ_DONE: begin + end_req_o = 
1'b1; + if (n_rd_resp_retired_q >= n_rd_req_issued_q) begin + // All read responses have been retired + req_state_d = RSP_DONE; + end else begin + req_state_d = REQ_DONE; + end + // Synchronously clear + if (clear_i) begin + req_state_d = REQ_IDLE; + end + end + RSP_DONE: begin + end_req_o = 1'b1; + end_resp_o = 1'b1; + // Synchronously clear + if (clear_i) begin + req_state_d = REQ_IDLE; end - // Deassert in NBA region so monitors sampling this edge see the handshake. - hci_if.id <= '0; - hci_if.data <= '0; - hci_if.add <= '0; - hci_if.wen <= 1'b0; - hci_if.req <= 1'b0; - wait (hci_if.req == 1'b0); - end else begin - @(posedge clk_i); end + default: begin + req_state_d = REQ_IDLE; + end + endcase + end + + /////////////////////// + // Read response FSM // + /////////////////////// + + // We only consider read responses as write responses are not mandatory in HCI + + typedef enum logic { + RESP_IDLE, + RESP_WAIT_RVALID + } resp_state_t; + + resp_state_t resp_state_q, resp_state_d; + int unsigned n_rd_in_flight_q, n_rd_in_flight_d; // number of reads granted but not yet responded + + assign n_retired_rd_tr_o = n_rd_resp_retired_q; + + // Sequential logic + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni || clear_i) begin + resp_state_q <= RESP_IDLE; + n_rd_resp_retired_q <= '0; + n_rd_in_flight_q <= '0; + end else begin + resp_state_q <= resp_state_d; + n_rd_resp_retired_q <= n_rd_resp_retired_d; + n_rd_in_flight_q <= n_rd_in_flight_d; end + end - $fclose(stim); - end_stimuli_o = 1'b1; - wait (n_completed_read_transactions >= n_issued_read_transactions_o); - end_latency_o = 1'b1; + // Combinational state logic + always_comb begin + // Defaults (FSM) + resp_state_d = resp_state_q; + n_rd_resp_retired_d = n_rd_resp_retired_q; + n_rd_in_flight_d = n_rd_in_flight_q; + + case (resp_state_q) + RESP_IDLE: begin + // If a read request is granted, increment in-flight counter and go to RESP_WAIT_RVALID + if (hci_if.req && hci_if.wen && hci_if.gnt) begin + 
n_rd_in_flight_d = n_rd_in_flight_q + 1; + resp_state_d = RESP_WAIT_RVALID; + end + // Synchronously clear + if (clear_i) begin + resp_state_d = RESP_IDLE; + n_rd_in_flight_d = '0; + end + end + RESP_WAIT_RVALID: begin + // Track newly granted reads while waiting + if (hci_if.req && hci_if.wen && hci_if.gnt) begin + n_rd_in_flight_d = n_rd_in_flight_q + 1; + end + // When read response handshake happens, retire one response + if (hci_if.r_valid && hci_if.r_ready) begin + n_rd_resp_retired_d = n_rd_resp_retired_q + 1; + // Adjust in-flight: account for simultaneous grant (already added above) + if (hci_if.req && hci_if.wen && hci_if.gnt) begin + // net in-flight = in_flight + 1 (new grant) - 1 (retired) = in_flight + n_rd_in_flight_d = n_rd_in_flight_q; // undo the +1 above, net unchanged + end else begin + n_rd_in_flight_d = n_rd_in_flight_q - 1; + end + // If no more in-flight reads (after this retirement), go back to RESP_IDLE + if (n_rd_in_flight_q == 1 && !(hci_if.req && hci_if.wen && hci_if.gnt)) begin + resp_state_d = RESP_IDLE; + end + end + // Synchronously clear + if (clear_i) begin + resp_state_d = RESP_IDLE; + n_rd_in_flight_d = '0; + end + end + default: begin + resp_state_d = RESP_IDLE; + n_rd_in_flight_d = '0; + end + endcase end + endmodule diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 0e51a48..2413371 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -295,20 +295,20 @@ module tb_hci $sformatf("../simvectors/generated/stimuli_processed/master_log_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), - .IS_HWPE(0), .DATA_WIDTH(DATA_WIDTH), .ADDR_WIDTH(ADDR_WIDTH), - .APPL_DELAY(APPL_DELAY), .IW(IW_cores), .STIM_FILE(STIM_FILE_LOG) ) i_app_driver_log ( .clk_i(clk), .rst_ni(rst_n), + .clear_i(1'b0), .hci_if(hci_driver_log_if[ii]), - .end_stimuli_o(s_end_stimuli[ii]), - .end_latency_o(s_end_latency[ii]), - .n_issued_transactions_o(s_issued_transactions[ii]), - 
.n_issued_read_transactions_o(s_issued_read_transactions[ii]) + .end_req_o(s_end_stimuli[ii]), + .end_resp_o(s_end_latency[ii]), + .n_issued_tr_o(s_issued_transactions[ii]), + .n_issued_rd_tr_o(s_issued_read_transactions[ii]), + .n_retired_rd_tr_o() ); end endgenerate @@ -319,20 +319,20 @@ module tb_hci $sformatf("../simvectors/generated/stimuli_processed/master_hwpe_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), - .IS_HWPE(1), .DATA_WIDTH(HWPE_WIDTH_FACT * DATA_WIDTH), .ADDR_WIDTH(ADDR_WIDTH), - .APPL_DELAY(APPL_DELAY), .IW(IW_hwpe), .STIM_FILE(STIM_FILE_HWPE) ) i_app_driver_hwpe ( .clk_i(clk), .rst_ni(rst_n), + .clear_i(1'b0), .hci_if(hci_driver_hwpe_if[ii]), - .end_stimuli_o(s_end_stimuli[N_LOG_MASTERS + ii]), - .end_latency_o(s_end_latency[N_LOG_MASTERS + ii]), - .n_issued_transactions_o(s_issued_transactions[N_LOG_MASTERS + ii]), - .n_issued_read_transactions_o(s_issued_read_transactions[N_LOG_MASTERS + ii]) + .end_req_o(s_end_stimuli[N_LOG_MASTERS + ii]), + .end_resp_o(s_end_latency[N_LOG_MASTERS + ii]), + .n_issued_tr_o(s_issued_transactions[N_LOG_MASTERS + ii]), + .n_issued_rd_tr_o(s_issued_read_transactions[N_LOG_MASTERS + ii]), + .n_retired_rd_tr_o() ); end endgenerate diff --git a/target/verif/verif.mk b/target/verif/verif.mk index e9829ea..155fecb 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -32,10 +32,13 @@ PYTHON ?= python3 # Config gen # ############## -# Source-of-truth JSON configs -VERIF_CFG_JSON := $(HCI_VERIF_CFG_DIR)/hardware.json \ - $(HCI_VERIF_CFG_DIR)/testbench.json \ - $(HCI_VERIF_CFG_DIR)/workload.json +# JSON configs are configurable from env var (default to config/) +HARDWARE_JSON ?= $(HCI_VERIF_CFG_DIR)/hardware.json +TESTBENCH_JSON ?= $(HCI_VERIF_CFG_DIR)/testbench.json +WORKLOAD_JSON ?= $(HCI_VERIF_CFG_DIR)/workload.json + +# Source-of-truth JSON configs (used as stim-verif dependencies) +VERIF_CFG_JSON := $(HARDWARE_JSON) $(TESTBENCH_JSON) $(WORKLOAD_JSON) # Makefiles to generate from JSON configs 
VERIF_CFG_MK := $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk \ @@ -43,9 +46,12 @@ VERIF_CFG_MK := $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk \ .PHONY: config-verif config-verif: $(VERIF_CFG_MK) -# Generate Makefiles from JSON configs -$(HCI_VERIF_CFG_GEN_DIR)/%.mk: $(HCI_VERIF_CFG_DIR)/%.json $(HCI_VERIF_CFG_GEN_DIR)/%.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) - $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py $* $(HCI_VERIF_CFG_DIR) $(HCI_VERIF_CFG_GEN_DIR) > $@ + +$(HCI_VERIF_CFG_GEN_DIR)/hardware.mk: $(HARDWARE_JSON) $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) + $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py hardware $(dir $(HARDWARE_JSON)) $(HCI_VERIF_CFG_GEN_DIR) > $@ + +$(HCI_VERIF_CFG_GEN_DIR)/testbench.mk: $(TESTBENCH_JSON) $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) + $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py testbench $(dir $(TESTBENCH_JSON)) $(HCI_VERIF_CFG_GEN_DIR) > $@ $(HCI_VERIF_CFG_GEN_DIR): mkdir -p $@ @@ -67,9 +73,9 @@ stim-verif: $(SIMVECTORS_GEN_DIR)/.stim_stamp $(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(STIM_SRC_FILES) mkdir -p $(SIMVECTORS_GEN_DIR) $(PYTHON) $(GEN_STIM_SCRIPT) \ - --workload_config $(HCI_VERIF_CFG_DIR)/workload.json \ - --testbench_config $(HCI_VERIF_CFG_DIR)/testbench.json \ - --hardware_config $(HCI_VERIF_CFG_DIR)/hardware.json + --workload_config $(WORKLOAD_JSON) \ + --testbench_config $(TESTBENCH_JSON) \ + --hardware_config $(HARDWARE_JSON) date > $@ .PHONY: clean-stim-verif From 22705903d25f31e19236c24e896b48a25bb0cb85 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Fri, 6 Mar 2026 23:28:51 +0100 Subject: [PATCH 10/25] verif: Implement advanced verification - Stimuli generation also outputs a memory map that is gonna be accessed - HWPE dataflow can be flexibly configured from workload.json (see workload.schema.json) - Added a traffic opion to the access 
patterns to be generated - Added support for HWPE custom dataflow inside testbench (supporta all mode: HCI, LOG, MUX) - Removed old parameters substituted by more advanced functionalities - Added possibility to select custom ranges for specific access patterns in vector generation --- target/verif/README.md | 179 +++++ target/verif/bender.mk | 4 +- .../verif/config/generated/testbench.mk.tpl | 2 - target/verif/config/hardware.json | 4 +- target/verif/config/testbench.json | 2 - target/verif/config/workload.json | 178 ++--- target/verif/config/workload.schema.json | 239 +++++++ target/verif/simvectors/README.md | 23 - .../verif/simvectors/hci_stimuli/__init__.py | 5 +- .../verif/simvectors/hci_stimuli/generator.py | 259 +++----- .../verif/simvectors/hci_stimuli/patterns.py | 441 +++++++++++++ .../verif/simvectors/hci_stimuli/processor.py | 68 +- target/verif/simvectors/main.py | 614 ++++++++++++++---- target/verif/src/application_driver.sv | 49 +- target/verif/src/simulation_report.sv | 54 +- target/verif/src/tb_hci.sv | 65 +- target/verif/src/tb_hci_pkg.sv | 32 +- target/verif/src/throughput_monitor.sv | 10 +- target/verif/verif.mk | 59 +- 19 files changed, 1632 insertions(+), 655 deletions(-) create mode 100644 target/verif/README.md create mode 100644 target/verif/config/workload.schema.json delete mode 100644 target/verif/simvectors/README.md create mode 100644 target/verif/simvectors/hci_stimuli/patterns.py diff --git a/target/verif/README.md b/target/verif/README.md new file mode 100644 index 0000000..20db9b2 --- /dev/null +++ b/target/verif/README.md @@ -0,0 +1,179 @@ +# HCI Verification Framework + +## Overview + +The verification framework drives configurable memory traffic from multiple masters through the HCI interconnect and measures throughput and latency. It is fully driven by three JSON configuration files: + +| File | Purpose | +|------|---------| +| `config/hardware.json` | Interconnect topology (number of masters, banks, data widths, ...) 
| +| `config/testbench.json` | Simulation parameters (clock period, arbitration policy, ...) | +| `config/workload.json` | Per-master traffic patterns, transaction counts, and dataflow dependencies | + +--- + +## Configuration files + +### `hardware.json` + +Controls the hardware topology instantiated in the testbench. Generates `config/generated/hardware.mk` which is included by the build system and passed as Verilog defines. + +### `testbench.json` + +Controls simulation-level knobs (clock period, reset cycles, arbitration parameters). Generates `config/generated/testbench.mk`. + +### `workload.json` + +The most user-facing configuration. Describes what each master does and in what order. Structure: + +```json +{ + "description": "...", + "log_masters": [ ... ], + "hwpe_masters": [ ... ] +} +``` + +#### Per-master fields + +Each entry in `log_masters` or `hwpe_masters` supports: + +| Field | Required | Description | +|-------|----------|-------------| +| `id` | recommended | Index within the master array. Must match the positional index (0-based). Used for documentation; mapping is always positional. | +| `description` | no | Human-readable label. | +| `mem_access_type` | yes | Traffic pattern. See [Access patterns](#access-patterns). | +| `n_transactions` | yes* | Number of transactions to issue. Can be derived from geometry for structured patterns — see below. | +| `phase` | no | Phase name (string). Masters in the same phase can run concurrently. Default: `"default"`. | +| `wait_for` | no | List of phase names that must complete (`end_req_o` asserted on all masters of those phases) before this master starts. Default: `[]` (start immediately). | +| `start_delay_cycles` | no | Number of idle cycles prepended to this master's stimuli file (static start delay within a phase). Default: `0`. | +| `region_base_address` | no | Base byte address of the memory region for this master (int, hex `0x...`, or decimal string). Default: evenly partitioned. 
| +| `region_size_bytes` | no | Size in bytes of the memory region for this master. Default: evenly partitioned. | +| `start_address` | no | Starting address for `linear`/`2d`/`3d` patterns (int, hex `0x...`, or decimal string). Default: `"0"`. | +| `stride0`, `len_d0` | no | Inner dimension stride (in words) and length for `linear`/`2d`/`3d`. | +| `stride1`, `len_d1` | no | Middle dimension stride (in words) and length for `2d`/`3d`. | +| `stride2` | no | Outer dimension stride (in words) for `3d`. | +| `traffic_pct` | no | [`random`, `linear`] Bus utilization percentage (1–100). After each transaction emits `floor((100-pct)/pct)` idle cycles. Default: `100` (back-to-back). | +| `traffic_read_pct` | no | [`random`, `linear`] Percentage of accesses that are reads. If omitted, read/write is random per transaction. When set, all reads are issued first, then all writes. | +| `idle_cycles_between_phases` | no | [`2d`, `3d`, `matmul_phased`] Idle cycles inserted between phases (between outer rows for `2d`/`3d`, between read-A/read-B/write-C for `matmul_phased`). Models compute time. Default: `0`. | +| `matmul_ratio_a/b/c` | no | [`matmul_phased`] Phase ratio for read-A : read-B : write-C transaction split. Default: `1:1:1`. | +| `region_base_address_a/b/c` | no | [`matmul_phased`] Explicit per-phase base addresses, overriding the auto-split of the combined region. | +| `region_size_bytes_a/b/c` | no | [`matmul_phased`] Explicit per-phase region sizes (paired with `region_base_address_a/b/c`). | + +\* `n_transactions` can be omitted for structured patterns if geometry fields are provided — `main.py` derives it automatically and reports it. For `random` it is always required. 
+ +#### Deriving `n_transactions` from geometry + +| Pattern | Geometry fields | Derived count | +|---------|----------------|---------------| +| `linear` | `length` | `length` | +| `2d` | `len_d0`, `len_d1` | `len_d0 × len_d1` | +| `3d` | `len_d0`, `len_d1`, `len_d2` | `len_d0 × len_d1 × len_d2` | +| `matmul_phased` | `matrix_m`, `matrix_n`, `matrix_k` | `m×k + k×n + m×n` | +| `random` | — | must be set explicitly | +| `idle` | — | ignored | + +--- + +## Access patterns + +| `mem_access_type` | Description | +|-------------------|-------------| +| `random` | Uniformly random addresses within the assigned region. | +| `linear` | Strided 1-D sequential scan. | +| `2d` | Strided 2-D scan (inner `stride0`/`len_d0`, outer `stride1`). | +| `3d` | Strided 3-D scan (inner `stride0`/`len_d0`, mid `stride1`/`len_d1`, outer `stride2`). | +| `matmul_phased` | Deterministic phased traffic modelling matrix multiply: read-A phase, read-B phase, write-C phase. The assigned region is split into three equal sub-regions A, B, C. | +| `idle` | No transactions (driver idles). | + +--- + +## Dataflow: phases and dependencies + +### Concept + +Masters can be assigned to named **phases** and can declare **dependencies** on other phases. This allows modeling realistic dataflow graphs, e.g.: + +- HWPE A and HWPE B run in parallel (same phase, no dependencies) +- HWPE C starts only after HWPE B finishes (C `wait_for: ["phaseB"]`) + +```json +"hwpe_masters": [ + { "id": 0, "phase": "phaseA", "wait_for": [], ... }, + { "id": 1, "phase": "phaseB", "wait_for": [], ... }, + { "id": 2, "phase": "phaseC", "wait_for": ["phaseB"], ... } +] +``` + +### Mechanism + +`main.py` reads the `phase`/`wait_for` fields and computes a **wait mask** per driver: a bitmask of width `N_DRIVERS` where bit `j` is set if this driver must wait for driver `j`'s `end_req_o`. 
The masks are encoded as a SV unpacked array literal in `config/generated/wait_masks.mk` and passed to the simulator as the `WAIT_MASKS_PARAM` define. + +In `tb_hci_pkg.sv`, `WAIT_MASKS[i]` holds driver `i`'s mask. In `tb_hci.sv`, each driver's `clear_i` is combinationally held high until all drivers in its mask have asserted `end_req_o`: + +``` +clear_i[i] = (eff_mask[i] != 0) && ((s_end_req & eff_mask[i]) != eff_mask[i]) +``` + +A driver with an empty mask starts as soon as reset is released. + +The dependency is on `end_req_o` (all transactions issued), not `end_resp_o` (all read responses retired). This is intentional: a dependent master can begin issuing as soon as the dependency master has issued all its requests, which is the correct model for pipelined HWPE dataflow. + +### MUX mode (INTERCO_TYPE=MUX) + +`hci_core_mux_static` forwards exactly one HWPE to the interconnect at a time (selected by `sel_i`). Therefore, in MUX mode **at most one HWPE can be active at any given time**, regardless of what `wait_for` says in the workload. + +**The user-defined `wait_for` dependencies are ignored for HWPE drivers in MUX mode.** Instead, the testbench automatically enforces a strict sequential chain: + +- HWPE 0 starts first (no dependency) +- HWPE 1 waits for HWPE 0 +- HWPE 2 waits for HWPE 1 +- ... +- HWPE k waits for HWPE k-1 + +**Ordering is by index**, which equals the positional order in the `hwpe_masters` array in `workload.json` (0-based). To control execution order in MUX mode, reorder the entries in `hwpe_masters`. + +LOG master `wait_for` dependencies are always respected regardless of interconnect mode. + +`sel_i` is driven combinationally to the index of the currently active HWPE (the lowest-indexed HWPE whose `clear_i` is 0). + +This means **the same workload JSON can be used across all three interconnect modes** (HCI, MUX, LOG). In HCI/LOG mode the declared `wait_for` dependencies are honored; in MUX mode they are overridden by the sequential chain. 
+ +--- + +## Memory layout + +The memory uses a **bank-interleaved addressing scheme**: consecutive word addresses map to consecutive banks before wrapping. + +``` +byte addr 0x00 → bank 0 +byte addr 0x04 → bank 1 (assuming DATA_WIDTH=32, 4 bytes/word) +... +byte addr 0x7C → bank 31 (for N_BANKS=32) +byte addr 0x80 → bank 0 (wrap) +``` + +In general, bank index = `(byte_addr / (DATA_WIDTH/8)) % N_BANKS`. + +`main.py` reports the memory map for each master after stimuli generation, showing: +- First and last byte addresses accessed +- Total transfer size in bytes +- Number of distinct banks touched +- For matmul_phased: sub-region boundaries for A, B, C matrices + +This allows verifying that regions are correctly partitioned and identifying any unintended overlaps between masters. + +--- + +## Build targets + +| Target | Action | +|--------|--------| +| `make config-verif` | Generate `hardware.mk`, `testbench.mk` from JSON | +| `make stim-verif` | Generate stimuli and `wait_masks.mk` from workload/hardware/testbench JSON | +| `make compile-verif` | Compile RTL and testbench with QuestaSim | +| `make opt-verif` | Optimize compiled design | +| `make run-verif` | Run simulation | +| `make clean-verif` | Remove all generated artifacts | + +Pass `WORKLOAD_JSON=config/workload_.json` to `make stim-verif` / `make run-verif` to select an alternative workload. 
diff --git a/target/verif/bender.mk b/target/verif/bender.mk index 6b1fa59..4aac401 100644 --- a/target/verif/bender.mk +++ b/target/verif/bender.mk @@ -18,14 +18,12 @@ VERIF_DEFS += \ -D TS_BIT=$(TS_BIT) \ -D EXPFIFO=$(EXPFIFO) \ -D SEL_LIC=$(SEL_LIC) \ - -D N_TRANSACTION_LOG=$(N_TRANSACTION_LOG) \ - -D TRANSACTION_RATIO=$(TRANSACTION_RATIO) \ -D CLK_PERIOD=$(CLK_PERIOD) \ -D RST_CLK_CYCLES=$(RST_CLK_CYCLES) \ -D RANDOM_GNT=$(RANDOM_GNT) \ -D INTERCO_TYPE=$(INTERCO_TYPE) \ -D INVERT_PRIO=$(INVERT_PRIO) \ - -D LOW_PRIO_MAX_STALL=$(LOW_PRIO_MAX_STALL) + -D LOW_PRIO_MAX_STALL=$(LOW_PRIO_MAX_STALL) \ # Common targets for bender VERIF_TARGS ?= diff --git a/target/verif/config/generated/testbench.mk.tpl b/target/verif/config/generated/testbench.mk.tpl index 469736b..a65ee53 100644 --- a/target/verif/config/generated/testbench.mk.tpl +++ b/target/verif/config/generated/testbench.mk.tpl @@ -5,8 +5,6 @@ # This file is auto-generated from testbench.json - DO NOT EDIT MANUALLY # Testbench parameters (from testbench.json) -N_TRANSACTION_LOG?=${N_TRANSACTION_LOG} -TRANSACTION_RATIO?=${TRANSACTION_RATIO} CLK_PERIOD?=${CLK_PERIOD} RST_CLK_CYCLES?=${RST_CLK_CYCLES} RANDOM_GNT?=${RANDOM_GNT} diff --git a/target/verif/config/hardware.json b/target/verif/config/hardware.json index b6c52a5..c8db224 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -1,7 +1,7 @@ { "description": "Hardware configuration parameters for HCI interconnect", "parameters": { - "N_HWPE": 2, + "N_HWPE": 3, "HWPE_WIDTH_FACT": 8, "N_CORE": 8, "N_DMA": 0, @@ -9,7 +9,7 @@ "DATA_WIDTH": 32, "TOT_MEM_SIZE": 128, "N_BANKS": 32, - "INTERCO_TYPE": "LOG", + "INTERCO_TYPE": "HCI", "TS_BIT": 21, "EXPFIFO": 0, "SEL_LIC": 0 diff --git a/target/verif/config/testbench.json b/target/verif/config/testbench.json index 46005ab..9ce418f 100644 --- a/target/verif/config/testbench.json +++ b/target/verif/config/testbench.json @@ -1,8 +1,6 @@ { "description": "Testbench configuration 
parameters", "parameters": { - "N_TRANSACTION_LOG": 1000, - "TRANSACTION_RATIO": 1, "CLK_PERIOD": 50, "RST_CLK_CYCLES": 10, "RANDOM_GNT": 0, diff --git a/target/verif/config/workload.json b/target/verif/config/workload.json index 7da0838..e094564 100644 --- a/target/verif/config/workload.json +++ b/target/verif/config/workload.json @@ -1,144 +1,58 @@ { - "description": "Workload configuration for stimuli generation", - "simulation_parameters": { - "EXACT_OR_MAX_OFFSET": 0, - "CYCLE_OFFSET_LOG": 1, - "CYCLE_OFFSET_HWPE": 1 - }, + "description": "8 cores at 50% traffic in dedicated regions + 3 HWPEs: DMA imports tile B, GEMM reads tile B and writes output, Transpose reads GEMM output and writes transposed result. DMA and GEMM run concurrently (wave0), Transpose waits for wave0.", + "log_masters": [ + { "id": 0, "description": "Core 0", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00000000", "region_base_address": "0x00000000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 1, "description": "Core 1", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00001000", "region_base_address": "0x00001000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 2, "description": "Core 2", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00002000", "region_base_address": "0x00002000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 3, "description": "Core 3", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00003000", "region_base_address": "0x00003000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 4, "description": "Core 4", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00004000", "region_base_address": "0x00004000", "region_size_bytes": 4096, 
"traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 5, "description": "Core 5", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00005000", "region_base_address": "0x00005000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 6, "description": "Core 6", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00006000", "region_base_address": "0x00006000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 7, "description": "Core 7", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00007000", "region_base_address": "0x00007000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ { "id": 0, - "description": "Core 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "DMA — wide tile import into buffer B (0x08000–0x0FFFF)", + "mem_access_type": "random", + "n_transactions": 1000, + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00008000", + "region_size_bytes": 32768, + "phase": "hwpe_wave0" }, { "id": 1, - "description": "Core 1", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "GEMM — reads tile B from DMA buffer (0x08000–0x0FFFF), writes output to buffer C (0x10000–0x17FFF)", + "mem_access_type": "matmul_phased", + "n_transactions": 1000, + "region_base_address_a": "0x00008000", + "region_size_bytes_a": 32768, + "matmul_ratio_b": 0, + "region_base_address_c": "0x00010000", + "region_size_bytes_c": 32768, + "matmul_ratio_a": 1, + "matmul_ratio_c": 1, + "phase": "hwpe_wave0" }, { "id": 2, - "description": "Core 2", - "mem_access_type": 0, - "start_address": 0, - 
"stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 3, - "description": "Core 3", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 4, - "description": "Core 4", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 5, - "description": "Core 5", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 6, - "description": "Core 6", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 7, - "description": "Core 7", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 8, - "description": "DMA 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 9, - "description": "External 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - } - ], - "hwpe_masters": [ - { - "id": 0, - "description": "HWPE 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 1, - "description": "HWPE 1", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Transpose engine — reads GEMM output (0x10000–0x17FFF), writes transposed result (0x18000–0x1FFFF)", + "mem_access_type": "matmul_phased", + "n_transactions": 1000, + "region_base_address_a": "0x00010000", + "region_size_bytes_a": 32768, + "matmul_ratio_b": 0, + "region_base_address_c": "0x00018000", + "region_size_bytes_c": 32768, + 
"matmul_ratio_a": 1, + "matmul_ratio_c": 1, + "phase": "hwpe_wave1", + "wait_for": ["hwpe_wave0"] } ] -} \ No newline at end of file +} diff --git a/target/verif/config/workload.schema.json b/target/verif/config/workload.schema.json new file mode 100644 index 0000000..3b4c653 --- /dev/null +++ b/target/verif/config/workload.schema.json @@ -0,0 +1,239 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "workload.schema.json", + "title": "HCI Verification Workload Configuration", + "description": "Schema for workload.json consumed by target/verif/simvectors/main.py", + "type": "object", + "required": ["log_masters", "hwpe_masters"], + "additionalProperties": false, + "properties": { + "description": { + "type": "string", + "description": "Human-readable description of this workload." + }, + "log_masters": { + "type": "array", + "description": "Ordered list of LOG masters (CORE, then DMA, then EXT). Length must equal N_CORE + N_DMA + N_EXT in hardware.json.", + "items": { "$ref": "#/$defs/master" } + }, + "hwpe_masters": { + "type": "array", + "description": "Ordered list of HWPE masters. Length must equal N_HWPE in hardware.json.", + "items": { "$ref": "#/$defs/master" } + } + }, + + "$defs": { + "master": { + "type": "object", + "description": "Per-master stimulus configuration. The active pattern is selected by mem_access_type.", + "required": ["mem_access_type"], + "unevaluatedProperties": false, + "properties": { + + "id": { + "type": "integer", + "description": "Optional consistency label. Mapping is always positional; a mismatch triggers a warning." + }, + "description": { + "type": "string", + "description": "Human-readable label shown in the memory-map report." + }, + "mem_access_type": { + "type": "string", + "enum": ["idle", "random", "linear", "2d", "3d", "matmul_phased", "matmul"], + "description": "Access pattern selector. 'matmul' is an alias for 'matmul_phased'. 
Use traffic_pct/traffic_read_pct on 'random'/'linear' to model bus utilization." + }, + "phase": { + "type": "string", + "default": "default", + "description": "Phase group this master belongs to, referenced by other masters' wait_for." + }, + "wait_for": { + "type": "array", + "items": { "type": "string" }, + "default": [], + "description": "Phase names this master waits for before its WAIT_MASK clears in the testbench." + }, + "start_delay_cycles": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "Number of idle cycles prepended to the processed stimuli file (static start delay)." + }, + + "n_transactions": { + "type": "integer", + "minimum": 0, + "description": "Number of real memory accesses (idles are NOT counted). Mandatory for 'random'. For other patterns can be derived from geometry fields (see per-pattern rules)." + }, + + "region_base_address": { + "oneOf": [ + { "type": "integer", "minimum": 0 }, + { "type": "string", "description": "Decimal, hex (0x...) or binary string." } + ], + "description": "[random, matmul_phased] Base byte address of the memory region. Default: master_local_idx * (total_mem / n_peers_of_kind), aligned to access_bytes." + }, + "region_size_bytes": { + "oneOf": [ + { "type": "integer", "minimum": 0 }, + { "type": "string", "description": "Decimal, hex (0x...) or binary string." } + ], + "description": "[random, matmul_phased] Size in bytes of the memory region. Default: total_mem / n_peers_of_kind. Clamped so region does not exceed total memory." + }, + + "start_address": { + "type": "string", + "default": "0", + "description": "[linear, 2d, 3d] Start byte address as decimal, hex (0x...) or binary string." + }, + "stride0": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[linear, 2d, 3d] Innermost stride in words." + }, + "stride1": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[2d, 3d] Middle stride in words." 
+ }, + "stride2": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[3d] Outermost stride in words." + }, + "length": { + "type": "integer", + "minimum": 0, + "description": "[linear] Alias for n_transactions." + }, + "len_d0": { + "type": "integer", + "minimum": 1, + "description": "[2d, 3d] Innermost dimension length. Can replace n_transactions together with len_d1 (and len_d2 for 3d)." + }, + "len_d1": { + "type": "integer", + "minimum": 1, + "description": "[2d, 3d] Middle dimension length." + }, + "len_d2": { + "type": "integer", + "minimum": 1, + "description": "[3d] Outermost dimension length." + }, + + "matrix_m": { + "type": "integer", + "minimum": 1, + "description": "[matmul_phased] Rows of A and C. Can replace n_transactions together with matrix_n and matrix_k (total = m*k + k*n + m*n)." + }, + "matrix_n": { + "type": "integer", + "minimum": 1, + "description": "[matmul_phased] Columns of B and C." + }, + "matrix_k": { + "type": "integer", + "minimum": 1, + "description": "[matmul_phased] Columns of A / rows of B." + }, + "region_base_address_a": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Base address of the read-A region. If set together with region_size_bytes_a, overrides the auto-split for phase A." + }, + "region_size_bytes_a": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Size in bytes of the read-A region." + }, + "region_base_address_b": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Base address of the read-B region." + }, + "region_size_bytes_b": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Size in bytes of the read-B region." 
+ }, + "region_base_address_c": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Base address of the write-C region." + }, + "region_size_bytes_c": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Size in bytes of the write-C region." + }, + + "matmul_ratio_a": { + "type": "integer", + "minimum": 0, + "default": 1, + "description": "[matmul_phased] Relative weight of the read-A phase in the transaction split." + }, + "matmul_ratio_b": { + "type": "integer", + "minimum": 0, + "default": 1, + "description": "[matmul_phased] Relative weight of the read-B phase." + }, + "matmul_ratio_c": { + "type": "integer", + "minimum": 0, + "default": 1, + "description": "[matmul_phased] Relative weight of the write-C phase." + }, + + "traffic_pct": { + "type": "integer", + "minimum": 1, + "maximum": 100, + "default": 100, + "description": "[random, linear] Bus utilization percentage. After each transaction emit floor((100-pct)/pct) idle cycles. E.g. 50 => 1 idle per transaction (req, idle, req, idle, ...). Default 100 = back-to-back." + }, + "traffic_read_pct": { + "type": "integer", + "minimum": 0, + "maximum": 100, + "description": "[random, linear] Percentage of accesses that are reads (wen=1). If omitted, wen is random per transaction. When set, all reads are emitted first, then all writes." + }, + "idle_cycles_between_phases": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[2d, 3d, matmul_phased] Number of req=0 cycles inserted at each phase boundary (between outer rows for 2d/3d, between read-A/read-B/write-C for matmul_phased). Models compute time between memory bursts." 
+ } + }, + + "allOf": [ + { + "if": { "properties": { "mem_access_type": { "const": "random" } }, "required": ["mem_access_type"] }, + "then": { "required": ["n_transactions"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "linear" } }, "required": ["mem_access_type"] }, + "then": { "anyOf": [{ "required": ["n_transactions"] }, { "required": ["length"] }] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "2d" } }, "required": ["mem_access_type"] }, + "then": { "anyOf": [{ "required": ["n_transactions"] }, { "required": ["len_d0", "len_d1"] }] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "3d" } }, "required": ["mem_access_type"] }, + "then": { "anyOf": [{ "required": ["n_transactions"] }, { "required": ["len_d0", "len_d1", "len_d2"] }] } + }, + { + "if": { + "properties": { "mem_access_type": { "enum": ["matmul_phased", "matmul"] } }, + "required": ["mem_access_type"] + }, + "then": { + "anyOf": [{ "required": ["n_transactions"] }, { "required": ["matrix_m", "matrix_n", "matrix_k"] }] + } + } + ] + } + } +} diff --git a/target/verif/simvectors/README.md b/target/verif/simvectors/README.md deleted file mode 100644 index 97be798..0000000 --- a/target/verif/simvectors/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# HCI Stimuli Generator - -Python package for generating test stimuli for the HCI verification environment. 
- -## Usage - -### Generate Stimuli - -```bash -python target/verif/simvectors/main.py --workload_config --testbench_config --hardware_config -``` - -Or use the Makefile: -```bash -make stimuli -``` - -## Configuration - -Configuration files are located in `target/verif/config/`: -- `hardware.json` - HCI hardware parameters (auto-generates `generated/hardware.mk`) -- `testbench.json` - Testbench parameters (auto-generates `generated/testbench.mk`) -- `workload.json` - Workload configuration with simulation parameters and master-specific settings diff --git a/target/verif/simvectors/hci_stimuli/__init__.py b/target/verif/simvectors/hci_stimuli/__init__.py index 3f4cd89..6381011 100644 --- a/target/verif/simvectors/hci_stimuli/__init__.py +++ b/target/verif/simvectors/hci_stimuli/__init__.py @@ -6,7 +6,6 @@ """ from .generator import StimuliGenerator -from .processor import unfold_raw_txt, pad_txt_files - -__all__ = ['StimuliGenerator', 'unfold_raw_txt', 'pad_txt_files'] +from .processor import pad_txt_files +__all__ = ['StimuliGenerator', 'pad_txt_files'] diff --git a/target/verif/simvectors/hci_stimuli/generator.py b/target/verif/simvectors/hci_stimuli/generator.py index af121d6..aac6927 100644 --- a/target/verif/simvectors/hci_stimuli/generator.py +++ b/target/verif/simvectors/hci_stimuli/generator.py @@ -1,189 +1,80 @@ -"""Stimuli Generator class and access-pattern generators.""" -import random -import os - -class StimuliGenerator: - def __init__(self,IW,WIDTH_OF_MEMORY,N_BANKS,TOT_MEM_SIZE,DATA_WIDTH,ADD_WIDTH,filepath,N_TEST,EXACT_OR_MAX_OFFSET,CYCLE_OFFSET,MASTER_NUMBER_IDENTIFICATION): - self.WIDTH_OF_MEMORY = WIDTH_OF_MEMORY - self.WIDTH_OF_MEMORY_BYTE = int(WIDTH_OF_MEMORY/8) - self.N_BANKS = N_BANKS - self.TOT_MEM_SIZE = TOT_MEM_SIZE - self.DATA_WIDTH = DATA_WIDTH - self.ADD_WIDTH = int(ADD_WIDTH) - self.filepath = filepath - os.makedirs(os.path.dirname(filepath),exist_ok=True) - self.N_TEST = N_TEST - self.EXACT_OR_MAX_OFFSET = EXACT_OR_MAX_OFFSET - 
self.CYCLE_OFFSET = CYCLE_OFFSET +"""StimuliGenerator: infrastructure for writing cycle-accurate stimuli files. + +Each output file has one line per simulation cycle in the format: + req(1b) id(IWb) wen(1b) data(Nb) add(Ab) + +req=0 means no transaction that cycle (id/wen/data/add are don't-cares). +req=1 means an active transaction. +""" + +import os +import random + +from .patterns import PatternsMixin + + +class StimuliGenerator(PatternsMixin): + def __init__( + self, + IW, + WIDTH_OF_MEMORY, + N_BANKS, + TOT_MEM_SIZE, + DATA_WIDTH, + ADD_WIDTH, + filepath, + N_TEST, + MASTER_NUMBER_IDENTIFICATION, + ): + self.WIDTH_OF_MEMORY = WIDTH_OF_MEMORY + self.WIDTH_OF_MEMORY_BYTE = int(WIDTH_OF_MEMORY / 8) + self.N_BANKS = N_BANKS + self.TOT_MEM_SIZE = TOT_MEM_SIZE + self.DATA_WIDTH = DATA_WIDTH + self.ADD_WIDTH = int(ADD_WIDTH) + self.filepath = filepath + os.makedirs(os.path.dirname(filepath), exist_ok=True) + self.N_TEST = N_TEST self.IW = IW self.MASTER_NUMBER_IDENTIFICATION = MASTER_NUMBER_IDENTIFICATION def _format_id(self, id_value): return bin(id_value % (1 << self.IW))[2:].zfill(self.IW) - - def random_data(self): - data_decimal = random.randint(0, (2**(self.DATA_WIDTH))-1) # generate random data - data = bin(data_decimal)[2:].zfill(self.DATA_WIDTH) - return data - - def data_wen_offset(self): - wen = random.randint(0,1) # write enable signal (1 = read, 0 = write) - if (self.EXACT_OR_MAX_OFFSET): - cycle_offset = random.randint(1,self.CYCLE_OFFSET) - else: - cycle_offset = self.CYCLE_OFFSET - if wen: - data = "0" * self.DATA_WIDTH - else: - data = self.random_data() - return data, wen, cycle_offset - - - def random_gen(self,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - for test in range(self.N_TEST): - data, wen, 
cycle_offset = self.data_wen_offset() - while True: - add_decimal = int((random.randint(0, int((self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE)/self.WIDTH_OF_MEMORY_BYTE)))*(self.WIDTH_OF_MEMORY_BYTE)) # generate a random word-aligned memory address. - if add_decimal > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - add_decimal = add_decimal - self.TOT_MEM_SIZE*1024 #rolls over - add = bin(add_decimal)[2:].zfill(self.ADD_WIDTH) - - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - break - file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - return id - - def linear_gen(self,stride0,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - next_address = int(start_address,2) - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - for test in range(self.N_TEST): - data, wen, cycle_offset = self.data_wen_offset() - - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - next_address += (self.WIDTH_OF_MEMORY_BYTE)*stride0 #word-aligned memory address - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - 
LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - break - file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - - return id - - def gen_2d(self,stride0,len_d0,stride1,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - start_address = int(start_address,2) - next_address = start_address - j = 0 - STOP = 0 - while True: - for i in range(len_d0): - data, wen, cycle_offset = self.data_wen_offset() - next_address = start_address + i*(self.WIDTH_OF_MEMORY_BYTE)*stride0 + j*(self.WIDTH_OF_MEMORY_BYTE)*stride1 #word-aligned memory address - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= 
self.N_TEST : - STOP = 1 - break - if STOP: - break - j = j + 1 - - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - return id - - def gen_3d(self,stride0,len_d0,stride1,len_d1,stride2,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - start_address = int(start_address,2) - next_address = start_address - k = 0 - STOP = 0 - while True: - for j in range(len_d1): - for i in range(len_d0): - data, wen, cycle_offset = self.data_wen_offset() - next_address = start_address + i*(self.WIDTH_OF_MEMORY_BYTE)*stride0 + j*(self.WIDTH_OF_MEMORY_BYTE)*stride1 + k*(self.WIDTH_OF_MEMORY_BYTE)*stride2 #word-aligned memory address - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - if STOP: - break - if STOP: - break - k = k + 1 - - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - 
LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - return id - - + + def random_data(self): + data_decimal = random.randint(0, (2 ** self.DATA_WIDTH) - 1) + return bin(data_decimal)[2:].zfill(self.DATA_WIDTH) + + def _write_req(self, file_obj, id_value, wen, data, add): + """Write one active-request line (req=1).""" + file_obj.write( + "1 " + + self._format_id(id_value) + + " " + + str(wen) + + " " + + data + + " " + + add + + "\n" + ) + + def _write_idle(self, file_obj): + """Write one idle line (req=0).""" + file_obj.write( + "0 " + + "0" * self.IW + + " 0 " + + "0" * self.DATA_WIDTH + + " " + + "0" * self.ADD_WIDTH + + "\n" + ) + + def data_wen(self): + wen = random.randint(0, 1) # 1=read, 0=write + if wen: + data = "0" * self.DATA_WIDTH + else: + data = self.random_data() + return data, wen diff --git a/target/verif/simvectors/hci_stimuli/patterns.py b/target/verif/simvectors/hci_stimuli/patterns.py new file mode 100644 index 0000000..987e526 --- /dev/null +++ b/target/verif/simvectors/hci_stimuli/patterns.py @@ -0,0 +1,441 @@ +"""Access-pattern generators for StimuliGenerator. + +Each method writes a cycle-accurate stimuli file directly (one line per cycle): + req(1b) id(IWb) wen(1b) data(Nb) add(Ab) + +req=0 lines are idle cycles. req=1 lines are active transactions. +There is no intermediate raw format or cycle_offset expansion step. + +Patterns: + - random: uniformly random addresses, random read/write mix. + Optional: traffic_pct / traffic_read_pct for uniform idle interleaving. + - linear: strided 1-D scan. + Optional: traffic_pct / traffic_read_pct for uniform idle interleaving. + - 2d: strided 2-D scan (inner stride0, outer stride1). + Optional: idle_cycles_between_phases inserted after each outer row. + - 3d: strided 3-D scan (stride0, stride1, stride2). + Optional: idle_cycles_between_phases inserted after each mid-level block. + - idle: no transactions (single req=0 line). 
+  - matmul_phased: phased traffic modelling matrix multiply
+    (read-A phase, read-B phase, write-C phase).
+    Optional: idle_cycles_between_phases inserted between phases.
+
+traffic_pct semantics (random, linear):
+  After every transaction, emit round((100 - traffic_pct) / traffic_pct) idle cycles.
+  traffic_pct=50 => 1 idle per transaction (req, idle, req, idle, ...)
+  traffic_pct=100 => no idles (back-to-back)
+
+traffic_read_pct semantics (random, linear):
+  Fraction of transactions that are reads (wen=1); remainder are writes (wen=0).
+  Only meaningful when traffic_pct < 100 or as override of the default random mix.
+  When set, wen is no longer random: the first n_reads transactions are reads,
+  the rest are writes (pattern: all reads first, then all writes).
+
+idle_cycles_between_phases semantics (2d, 3d, matmul_phased):
+  Number of req=0 cycles inserted at each phase boundary (between outer rows for
+  2d/3d, between read-A/read-B/write-C for matmul_phased). Models compute time
+  between bursts of memory accesses.
+ +All patterns respect the cross-master forbidden address lists: + - forbidden_write: addresses already read -> no later master may write here + - forbidden_read: addresses already written -> no later master may read/write here +""" + +import random + + +class PatternsMixin: + """Mixin providing memory access pattern generators.""" + + # ------------------------------------------------------------------ # + # Shared helpers # + # ------------------------------------------------------------------ # + + @staticmethod + def _parse_address(addr_str): + """Parse a binary, hex (0x...), or decimal string to int.""" + s = str(addr_str) + if s.startswith('0x') or s.startswith('0X'): + return int(s, 16) + if set(s) <= {'0', '1'}: + return int(s, 2) + return int(s, 0) + + @staticmethod + def _align_down(value, alignment): + if alignment <= 0: + return value + return (value // alignment) * alignment + + @staticmethod + def _phase_counts(total_ops, ratio_a, ratio_b, ratio_c): + """Split total_ops into three phases according to ratios.""" + ra = max(0, int(ratio_a)) + rb = max(0, int(ratio_b)) + rc = max(0, int(ratio_c)) + if ra == 0 and rb == 0 and rc == 0: + ra, rb, rc = 1, 1, 1 + rsum = ra + rb + rc + cnt_a = (total_ops * ra) // rsum + cnt_b = (total_ops * rb) // rsum + cnt_c = total_ops - cnt_a - cnt_b + + if total_ops >= 3: + if ra > 0 and cnt_a == 0: + cnt_a = 1 + cnt_c = max(0, cnt_c - 1) + if rb > 0 and cnt_b == 0: + cnt_b = 1 + cnt_c = max(0, cnt_c - 1) + if rc > 0 and cnt_c == 0: + if cnt_a > 1: + cnt_a -= 1 + cnt_c = 1 + elif cnt_b > 1: + cnt_b -= 1 + cnt_c = 1 + return cnt_a, cnt_b, cnt_c + + @staticmethod + def _idles_per_req(traffic_pct): + """Number of idle cycles to emit after each transaction for a given traffic_pct.""" + traffic_pct = max(1, min(100, int(traffic_pct))) + if traffic_pct >= 100: + return 0 + return int(round((100 - traffic_pct) / traffic_pct)) + + def _is_allowed(self, add, wen, forbidden_read, forbidden_write): + """Return True if the address 
is not in the relevant forbidden list."""
+        return add not in (forbidden_read if wen else forbidden_write)
+
+    def _record_access(self, add, wen, forbidden_read_new, forbidden_write_new):
+        """Mark add as used by this master."""
+        forbidden_write_new.append(add)
+        if not wen:
+            forbidden_read_new.append(add)
+
+    # ------------------------------------------------------------------ #
+    # Access patterns                                                    #
+    # ------------------------------------------------------------------ #
+
+    def random_gen(
+        self,
+        id_start,
+        forbidden_read,
+        forbidden_write,
+        region_base=0,
+        region_size=None,
+        traffic_pct=100,
+        traffic_read_pct=None,
+    ):
+        """Uniformly random addresses within [region_base, region_base + region_size).
+
+        traffic_pct: bus utilisation percentage. After each transaction,
+        round((100 - traffic_pct) / traffic_pct) idle cycles are emitted.
+        Default 100 = back-to-back (no idles).
+
+        traffic_read_pct: if set, the first n_reads transactions are reads and
+        the rest are writes. If not set, wen is random per transaction.
+ """ + total_mem_bytes = int(self.TOT_MEM_SIZE * 1024) + if region_size is None: + region_size = total_mem_bytes + region_size = min(region_size, total_mem_bytes - region_base) + n_words = max(1, region_size // self.WIDTH_OF_MEMORY_BYTE) + n_idles = self._idles_per_req(traffic_pct) + + # Build wen sequence + if traffic_read_pct is not None: + rpct = max(0, min(100, int(traffic_read_pct))) + n_reads = (self.N_TEST * rpct) // 100 + wen_seq = [1] * n_reads + [0] * (self.N_TEST - n_reads) + else: + wen_seq = None + + id_value = id_start + forbidden_read_new = [] + forbidden_write_new = [] + with open(self.filepath, "w", encoding="ascii") as file: + for i in range(self.N_TEST): + if wen_seq is not None: + wen = wen_seq[i] + data = "0" * self.DATA_WIDTH if wen else self.random_data() + else: + data, wen = self.data_wen() + while True: + add_decimal = region_base + random.randint(0, int(n_words) - 1) * self.WIDTH_OF_MEMORY_BYTE + add = bin(int(add_decimal))[2:].zfill(self.ADD_WIDTH) + if self._is_allowed(add, wen, forbidden_read, forbidden_write): + self._record_access(add, wen, forbidden_read_new, forbidden_write_new) + break + self._write_req(file, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(file) + forbidden_read.extend(forbidden_read_new) + forbidden_write.extend(forbidden_write_new) + return id_value + + def linear_gen( + self, + stride0, + start_address, + id_start, + forbidden_read, + forbidden_write, + traffic_pct=100, + traffic_read_pct=None, + ): + """Strided 1-D linear scan. Forbidden addresses are skipped (no idle inserted). + + traffic_pct / traffic_read_pct: see random_gen. 
+ """ + n_idles = self._idles_per_req(traffic_pct) + + if traffic_read_pct is not None: + rpct = max(0, min(100, int(traffic_read_pct))) + n_reads = (self.N_TEST * rpct) // 100 + wen_seq = [1] * n_reads + [0] * (self.N_TEST - n_reads) + else: + wen_seq = None + + id_value = id_start + forbidden_read_new = [] + forbidden_write_new = [] + seq_idx = 0 + with open(self.filepath, "w", encoding="ascii") as file: + next_address = self._parse_address(start_address) + if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: + next_address -= self.TOT_MEM_SIZE * 1024 + for i in range(self.N_TEST): + if wen_seq is not None: + wen = wen_seq[i] + data = "0" * self.DATA_WIDTH if wen else self.random_data() + else: + data, wen = self.data_wen() + add = bin(next_address)[2:].zfill(self.ADD_WIDTH) + next_address += self.WIDTH_OF_MEMORY_BYTE * stride0 + if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: + next_address -= self.TOT_MEM_SIZE * 1024 + if not self._is_allowed(add, wen, forbidden_read, forbidden_write): + continue + self._record_access(add, wen, forbidden_read_new, forbidden_write_new) + self._write_req(file, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(file) + forbidden_read.extend(forbidden_read_new) + forbidden_write.extend(forbidden_write_new) + return id_value + + def gen_2d( + self, + stride0, + len_d0, + stride1, + start_address, + id_start, + forbidden_read, + forbidden_write, + idle_cycles_between_phases=0, + ): + """Strided 2-D scan: inner loop stride0/len_d0, outer loop stride1. + + idle_cycles_between_phases: req=0 cycles inserted after each completed inner row. 
+ """ + id_value = id_start + forbidden_read_new = [] + forbidden_write_new = [] + with open(self.filepath, "w", encoding="ascii") as file: + start_address_int = self._parse_address(start_address) + j = 0 + while id_value - id_start < self.N_TEST: + for i in range(len_d0): + data, wen = self.data_wen() + next_address = ( + start_address_int + + i * self.WIDTH_OF_MEMORY_BYTE * stride0 + + j * self.WIDTH_OF_MEMORY_BYTE * stride1 + ) + if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: + next_address -= self.TOT_MEM_SIZE * 1024 + add = bin(next_address)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, forbidden_read, forbidden_write): + continue + self._record_access(add, wen, forbidden_read_new, forbidden_write_new) + self._write_req(file, id_value, wen, data, add) + id_value += 1 + if id_value - id_start >= self.N_TEST: + break + for _ in range(idle_cycles_between_phases): + self._write_idle(file) + j += 1 + forbidden_read.extend(forbidden_read_new) + forbidden_write.extend(forbidden_write_new) + return id_value + + def gen_3d( + self, + stride0, + len_d0, + stride1, + len_d1, + stride2, + start_address, + id_start, + forbidden_read, + forbidden_write, + idle_cycles_between_phases=0, + ): + """Strided 3-D scan: inner stride0/len_d0, mid stride1/len_d1, outer stride2. + + idle_cycles_between_phases: req=0 cycles inserted after each completed mid-level block. 
+ """ + id_value = id_start + forbidden_read_new = [] + forbidden_write_new = [] + with open(self.filepath, "w", encoding="ascii") as file: + start_address_int = self._parse_address(start_address) + k = 0 + while id_value - id_start < self.N_TEST: + for j in range(len_d1): + for i in range(len_d0): + data, wen = self.data_wen() + next_address = ( + start_address_int + + i * self.WIDTH_OF_MEMORY_BYTE * stride0 + + j * self.WIDTH_OF_MEMORY_BYTE * stride1 + + k * self.WIDTH_OF_MEMORY_BYTE * stride2 + ) + if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: + next_address -= self.TOT_MEM_SIZE * 1024 + add = bin(next_address)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, forbidden_read, forbidden_write): + continue + self._record_access(add, wen, forbidden_read_new, forbidden_write_new) + self._write_req(file, id_value, wen, data, add) + id_value += 1 + if id_value - id_start >= self.N_TEST: + break + if id_value - id_start >= self.N_TEST: + break + for _ in range(idle_cycles_between_phases): + self._write_idle(file) + k += 1 + forbidden_read.extend(forbidden_read_new) + forbidden_write.extend(forbidden_write_new) + return id_value + + def idle_gen(self, id_start): + """Emit a single req=0 idle line (master never issues transactions).""" + with open(self.filepath, "w", encoding="ascii") as file: + self._write_idle(file) + return id_start + + def matmul_phased_gen( + self, + id_start, + forbidden_read, + forbidden_write, + region_base_address, + region_size_bytes, + matmul_ratio_a=1, + matmul_ratio_b=1, + matmul_ratio_c=1, + idle_cycles_between_phases=0, + region_base_address_a=None, + region_size_bytes_a=None, + region_base_address_b=None, + region_size_bytes_b=None, + region_base_address_c=None, + region_size_bytes_c=None, + ): + """Phased traffic modelling matrix multiply: read-A, read-B, write-C. + + Each phase reads/writes its own sub-region. 
If region_base_address_a/b/c + and region_size_bytes_a/b/c are provided, they are used directly for each + phase. Otherwise the combined region [region_base_address, +region_size_bytes) + is split into equal thirds automatically. + + idle_cycles_between_phases: req=0 cycles inserted between each pair of + active phases (after read-A and after read-B, if those phases are non-empty). + Models computation time between memory bursts. + """ + id_value = id_start + forbidden_read_new = [] + forbidden_write_new = [] + access_bytes = max(1, self.DATA_WIDTH // 8) + total_mem_bytes = int(self.TOT_MEM_SIZE * 1024) + + def _resolve_region(base_override, size_override, fallback_base, fallback_size): + if base_override is not None and size_override is not None: + b = self._align_down(int(base_override), access_bytes) + s = self._align_down(int(size_override), access_bytes) + else: + b = self._align_down(int(fallback_base), access_bytes) + s = self._align_down(int(fallback_size), access_bytes) + if b + s > total_mem_bytes: + s = self._align_down(total_mem_bytes - b, access_bytes) + return b, s + + # If per-phase regions are explicitly provided, use them directly + if region_base_address_a is not None and region_size_bytes_a is not None: + a_base, a_size = _resolve_region(region_base_address_a, region_size_bytes_a, 0, 0) + b_base, b_size = _resolve_region(region_base_address_b, region_size_bytes_b, a_base, a_size) + c_base, c_size = _resolve_region(region_base_address_c, region_size_bytes_c, a_base, a_size) + else: + # Auto-split combined region into thirds + base = self._align_down(int(region_base_address), access_bytes) + size = self._align_down(int(region_size_bytes), access_bytes) + if base + size > total_mem_bytes: + size = self._align_down(total_mem_bytes - base, access_bytes) + region_words = size // access_bytes + if region_words < 3: + with open(self.filepath, "w", encoding="ascii") as file: + self._write_idle(file) + return id_value + a_words = max(1, region_words // 3) 
+ b_words = max(1, region_words // 3) + c_words = region_words - a_words - b_words + a_base = base; a_size = a_words * access_bytes + b_base = a_base + a_size; b_size = b_words * access_bytes + c_base = b_base + b_size; c_size = c_words * access_bytes + + a_end = a_base + a_size + b_end = b_base + b_size + c_end = c_base + c_size + + cnt_a, cnt_b, cnt_c = self._phase_counts( + self.N_TEST, + int(matmul_ratio_a), + int(matmul_ratio_b), + int(matmul_ratio_c), + ) + + def _emit_phase(file_obj, count, wen, phase_base, phase_end): + nonlocal id_value + addr = phase_base + for _ in range(count): + data = "0" * self.DATA_WIDTH if wen else self.random_data() + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + self._write_req(file_obj, id_value, wen, data, add) + self._record_access(add, wen, forbidden_read_new, forbidden_write_new) + id_value += 1 + addr += access_bytes + if addr >= phase_end: + addr = phase_base + + with open(self.filepath, "w", encoding="ascii") as file: + _emit_phase(file, cnt_a, 1, a_base, a_end) # read A + if cnt_a > 0 and (cnt_b > 0 or cnt_c > 0): + for _ in range(idle_cycles_between_phases): + self._write_idle(file) + _emit_phase(file, cnt_b, 1, b_base, b_end) # read B + if cnt_b > 0 and cnt_c > 0: + for _ in range(idle_cycles_between_phases): + self._write_idle(file) + _emit_phase(file, cnt_c, 0, c_base, c_end) # write C + + forbidden_read.extend(forbidden_read_new) + forbidden_write.extend(forbidden_write_new) + return id_value diff --git a/target/verif/simvectors/hci_stimuli/processor.py b/target/verif/simvectors/hci_stimuli/processor.py index 2f580be..fa8b51b 100644 --- a/target/verif/simvectors/hci_stimuli/processor.py +++ b/target/verif/simvectors/hci_stimuli/processor.py @@ -1,62 +1,12 @@ -"""Processor helpers: unfold and pad stimuli text files.""" +"""Processor helpers: pad stimuli files to equal length.""" -import os +def pad_txt_files(folder_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT): + """No-op: padding stimuli files to a common length 
is not needed. -# 1) ++UNFOLD++ the transactions in the .txt files into a cycle-level list. -# -folder_path_raw --> String that specifies the path of the folder containing the raw txt files (where the cycle offset is still indicated) -# -folder_path_processed --> String that specifies the path of the folder containing the new txt files created by this function -def unfold_raw_txt(folder_path_raw,folder_path_processed,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH_FACT): - file_names = [file for file in os.listdir(folder_path_raw) if file.endswith(".txt")] - for file in file_names: - filepath_read = os.path.join(folder_path_raw,file) - filepath_write = os.path.join(folder_path_processed, file) - os.makedirs(os.path.dirname(filepath_write),exist_ok=True) - with open(filepath_read, 'r', encoding = "ascii") as file_read: - with open(filepath_write, 'w', encoding="ascii") as file_write: - for line in file_read: - if line != 'zero': - values = line.split() - id = values[0] - cycle_offset = values[1] - wen = values[2] - data = values[3] - add = values[4] - if "log" in file: - for _ in range(int(cycle_offset)-1): - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - else: - for _ in range(int(cycle_offset)-1): - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH_FACT*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - file_write.write('1 ' + id + " " + wen + " " + data + " " + add + "\n") - else: - if "log" in file: - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - else: - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH_FACT*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - - -# 2) ++PAD++ txt files to have the same number of lines -# -Folder_path --> path of the folder containing the txt files to be padded -def pad_txt_files(folder_path,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH_FACT): - file_names = [file for file in os.listdir(folder_path) if 
file.endswith(".txt")] # List of the txt file names in the folder - max_lines = 0 - line_count = {} # Dictionary to store the number of lines in each txt file - # Determining the maximum number of lines among the txt files - for file in file_names: - file_path = os.path.join(folder_path,file) - with open(file_path,'r', encoding = 'ascii') as f: - line_count[file] = sum(1 for _ in f) - max_lines = max(max_lines, line_count[file]) - # Pad files - for file in file_names: - padding_needed = max_lines - line_count[file] - if padding_needed > 0: - file_path = os.path.join(folder_path,file) - with open(file_path, 'a', encoding = 'ascii') as f: - if "log" in file: - for _ in range(padding_needed): - f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - else: - for _ in range(padding_needed): - f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH_FACT*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") + Each application_driver self-terminates when its transaction queue is + exhausted, and later-phase drivers are held in reset via clear_i until + their dependencies assert end_req_o. Cross-file length alignment would + only add unnecessary idle cycles at the end of early-phase files. + """ + pass diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index bbdd853..54ff02b 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -1,29 +1,26 @@ """Stimuli generator (reads JSON configs in `verif/config`). This script is invoked by the top-level Makefile and expects three -JSON config files: workload, testbench and hardware. It produces raw -and processed stimuli in `verif/simvectors/generated`. +JSON config files: workload, testbench and hardware. It produces +cycle-accurate stimuli in `verif/simvectors/generated/stimuli`. 
+ +Each stimuli file has one line per simulation cycle: + req(1b) id(IWb) wen(1b) data(Nb) add(Ab) """ -### LIBRARIES AND DEPENDENCIES ### import json +import math import sys from pathlib import Path import argparse -import numpy as np code_directory = Path(__file__).resolve().parent - -# Try to import the local package `hci_stimuli`. If the running -# environment doesn't include the `simvectors` directory on `sys.path` -# (for example when invoked from a different working directory), add -# `code_directory` to `sys.path` as a minimal fallback. try: - from hci_stimuli import StimuliGenerator, unfold_raw_txt, pad_txt_files + from hci_stimuli import StimuliGenerator, pad_txt_files except Exception: sys.path.insert(0, str(code_directory)) - from hci_stimuli import StimuliGenerator, unfold_raw_txt, pad_txt_files + from hci_stimuli import StimuliGenerator, pad_txt_files def parse_args(argv=None): @@ -31,6 +28,8 @@ def parse_args(argv=None): parser.add_argument('--workload_config', required=True, help="Path to JSON workload configuration file") parser.add_argument('--testbench_config', required=True, help="Path to JSON testbench configuration file") parser.add_argument('--hardware_config', required=True, help="Path to JSON hardware configuration file") + parser.add_argument('--emit_phases_mk', default=None, metavar='PATH', + help="Also write the phases.mk Makefile fragment to PATH") parser.add_argument( '--golden', action='store_true', @@ -52,19 +51,17 @@ def load_config(filename, description): except json.JSONDecodeError as e: print(f"ERROR: Invalid JSON in {description} file: {e}") sys.exit(1) + + ### MAIN ENTRYPOINT ### def main(argv=None): - # parse CLI args args = parse_args(argv) - # load configs hardware_config = load_config(args.hardware_config, "Hardware configuration") testbench_config = load_config(args.testbench_config, "Testbench configuration") workload_config = load_config(args.workload_config, "Workload configuration") - # helpers imported at module-level 
(with a small sys.path fallback) - - # Extract hardware parameters + # Hardware parameters hw_params = hardware_config['parameters'] N_BANKS = hw_params['N_BANKS'] TOT_MEM_SIZE = hw_params['TOT_MEM_SIZE'] @@ -75,136 +72,287 @@ def main(argv=None): N_HWPE = hw_params['N_HWPE'] HWPE_WIDTH_FACT = hw_params['HWPE_WIDTH_FACT'] - # Extract testbench parameters - tb_params = testbench_config['parameters'] - TEST_RATIO = tb_params['TRANSACTION_RATIO'] - N_TEST_LOG = tb_params['N_TRANSACTION_LOG'] - - # Extract workload simulation parameters - workload_sim_params = workload_config['simulation_parameters'] - CYCLE_OFFSET_LOG = workload_sim_params['CYCLE_OFFSET_LOG'] - CYCLE_OFFSET_HWPE = workload_sim_params['CYCLE_OFFSET_HWPE'] - EXACT_OR_MAX_OFFSET = workload_sim_params['EXACT_OR_MAX_OFFSET'] - - # Extract workload master parameters log_masters = workload_config['log_masters'] hwpe_masters = workload_config['hwpe_masters'] # Derived parameters - WIDTH_OF_MEMORY = DATA_WIDTH - WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8 - N_WORDS = (TOT_MEM_SIZE * 1000 / N_BANKS) / WIDTH_OF_MEMORY_BYTE - ADD_WIDTH = int(np.ceil(np.log2(TOT_MEM_SIZE * 1000))) - N_TEST_HWPE = int(N_TEST_LOG * TEST_RATIO) + ADD_WIDTH = math.ceil(math.log2(TOT_MEM_SIZE * 1000)) N_LOG = N_CORE + N_DMA + N_EXT - N_MASTER = N_LOG + N_HWPE - # Match tb_hci_pkg interface ID widths (hci_system-style model). 
- IW_CORES = 8 - IW_HWPE = IW_CORES - if IW_CORES != IW_HWPE: - print("ERROR: stimuli generator currently requires equal log/HWPE interface ID widths") - sys.exit(1) - IW = IW_CORES - CORE_ZERO_FLAG = False - EXT_ZERO_FLAG = False - DMA_ZERO_FLAG = False - HWPE_ZERO_FLAG = False + IW = 8 # Validations if len(log_masters) != N_LOG: print(f"ERROR: Number of log masters in workload config ({len(log_masters)}) doesn't match hardware config N_LOG ({N_LOG})") sys.exit(1) - if len(hwpe_masters) != N_HWPE: print(f"ERROR: Number of HWPE masters in workload config ({len(hwpe_masters)}) doesn't match hardware config N_HWPE ({N_HWPE})") sys.exit(1) - - if (not N_WORDS.is_integer()): - print("ERROR: the number of words is not an integer value") - sys.exit(1) - if (N_MASTER < 1): + if N_LOG + N_HWPE < 1: print("ERROR: the number of masters must be > 0") sys.exit(1) + n_words = (TOT_MEM_SIZE * 1000 / N_BANKS) / (DATA_WIDTH / 8) + if not n_words.is_integer(): + print("ERROR: the number of words is not an integer value") + sys.exit(1) # Prepare output dirs simvectors_dir = code_directory.resolve() generated_dir = (simvectors_dir / 'generated').resolve() - raw_dir = (generated_dir / 'stimuli_raw').resolve() - processed_dir = (generated_dir / 'stimuli_processed').resolve() + stimuli_dir = (generated_dir / 'stimuli').resolve() generated_dir.mkdir(parents=True, exist_ok=True) - raw_dir.mkdir(parents=True, exist_ok=True) - processed_dir.mkdir(parents=True, exist_ok=True) + stimuli_dir.mkdir(parents=True, exist_ok=True) - # Create zero files when a class of masters is absent. We keep the - # original behaviour of creating a single 'zero' file per missing - # class to preserve downstream expectations. 
- def _create_zero_file(path: Path): + def _create_idle_file(path: Path, data_width: int): + """Write a single idle line for a master that is not present in hardware.""" path.parent.mkdir(parents=True, exist_ok=True) - path.write_text('zero', encoding='ascii') + path.write_text( + "0 " + "0" * IW + " 0 " + "0" * data_width + " " + "0" * ADD_WIDTH + "\n", + encoding='ascii', + ) + + CORE_ZERO_FLAG = False + DMA_ZERO_FLAG = False + EXT_ZERO_FLAG = False + HWPE_ZERO_FLAG = False if N_CORE <= 0: CORE_ZERO_FLAG = True N_CORE = 1 - _create_zero_file(raw_dir / 'master_log_0.txt') + _create_idle_file(stimuli_dir / 'master_log_0.txt', DATA_WIDTH) if N_DMA <= 0: DMA_ZERO_FLAG = True N_DMA = 1 - _create_zero_file(raw_dir / f'master_log_{N_CORE}.txt') + _create_idle_file(stimuli_dir / f'master_log_{N_CORE}.txt', DATA_WIDTH) if N_EXT <= 0: EXT_ZERO_FLAG = True N_EXT = 1 - _create_zero_file(raw_dir / f'master_log_{N_CORE + N_DMA}.txt') + _create_idle_file(stimuli_dir / f'master_log_{N_CORE + N_DMA}.txt', DATA_WIDTH) if N_HWPE <= 0: HWPE_ZERO_FLAG = True N_HWPE = 1 - _create_zero_file(raw_dir / 'master_hwpe_0.txt') + _create_idle_file(stimuli_dir / 'master_hwpe_0.txt', HWPE_WIDTH_FACT * DATA_WIDTH) next_start_id = 0 LIST_OF_FORBIDDEN_ADDRESSES_WRITE = [] LIST_OF_FORBIDDEN_ADDRESSES_READ = [] - def _generate_master(filepath: Path, master_config: dict, *, is_hwpe: bool, master_global_idx: int): - """Create StimuliGenerator and run the configured generation method. 
+ # Memory map entries collected during generation, printed at the end + memory_map_entries = [] + + def _bank_of(byte_addr): + return (byte_addr // (DATA_WIDTH // 8)) % N_BANKS + + def _record_memory_map(kind, local_idx, description, config, n_test, data_width, access_bytes, + region_base, region_size, start_address, stride0, len_d0, + stride1, len_d1, stride2, master_config, total_mem_bytes): + label = f"{kind}_{local_idx}" + (f" ({description})" if description else "") + if config == 'idle' or n_test == 0: + memory_map_entries.append({'label': label, 'pattern': config, 'n': 0, + 'info': 'idle - no memory accesses'}) + return + + first_addr = last_addr = None + detail = {} + + if config == 'random': + first_addr = region_base + last_addr = region_base + region_size - access_bytes + detail['region'] = f"0x{region_base:08x} - 0x{region_base + region_size - 1:08x} ({region_size} B)" + tpct = master_config.get('traffic_pct') + if tpct is not None: + rpct = master_config.get('traffic_read_pct', 50) + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + detail['read_pct'] = f"{rpct}%" + elif config == 'matmul_phased': + ra = _parse_maybe_bin_int(master_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(master_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(master_config.get('region_base_address_b'), None) + sb = _parse_maybe_bin_int(master_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(master_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(master_config.get('region_size_bytes_c'), None) + if ra is not None and sa is not None: + # Explicit per-phase regions + a_base, a_size = ra, sa + b_base, b_size = (rb, sb) if rb is not None and sb is not None else (ra, sa) + c_base, c_size = (rc, sc) if rc is not None and sc is not None else (ra, sa) + detail['matrix_A 
(read)'] = f"0x{a_base:08x} - 0x{a_base + a_size - access_bytes:08x} ({a_size} B)" + if int(master_config.get('matmul_ratio_b', 1)) > 0: + detail['matrix_B (read)'] = f"0x{b_base:08x} - 0x{b_base + b_size - access_bytes:08x} ({b_size} B)" + detail['matrix_C (write)'] = f"0x{c_base:08x} - 0x{c_base + c_size - access_bytes:08x} ({c_size} B)" + first_addr = a_base + last_addr = c_base + c_size - access_bytes + else: + # Auto-split combined region into thirds + a_words = max(1, (region_size // access_bytes) // 3) + b_words = max(1, (region_size // access_bytes) // 3) + c_words = (region_size // access_bytes) - a_words - b_words + a_base = region_base + b_base = a_base + a_words * access_bytes + c_base = b_base + b_words * access_bytes + detail['region'] = f"0x{region_base:08x} - 0x{region_base + region_size - 1:08x} ({region_size} B) [auto-split]" + detail['matrix_A (read)'] = f"0x{a_base:08x} - 0x{b_base - access_bytes:08x} ({a_words * access_bytes} B)" + detail['matrix_B (read)'] = f"0x{b_base:08x} - 0x{c_base - access_bytes:08x} ({b_words * access_bytes} B)" + detail['matrix_C (write)'] = f"0x{c_base:08x} - 0x{c_base + c_words * access_bytes - access_bytes:08x} ({c_words * access_bytes} B)" + first_addr = a_base + last_addr = c_base + c_words * access_bytes - access_bytes + if all(k in master_config for k in ('matrix_m', 'matrix_n', 'matrix_k')): + m, n, k = int(master_config['matrix_m']), int(master_config['matrix_n']), int(master_config['matrix_k']) + detail['matrix_dims'] = f"M={m} N={n} K={k} (A: {m}x{k}, B: {k}x{n}, C: {m}x{n})" + idle_between = master_config.get('idle_cycles_between_phases', 0) + if idle_between: + detail['idle_between_phases'] = f"{idle_between} cycles" + elif config == 'linear': + base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) + first_addr = base + last_addr = base + (n_test - 1) * stride0 * access_bytes + last_addr = last_addr % total_mem_bytes + detail['start'] = f"0x{base:08x}" + 
detail['stride'] = f"{stride0} words ({stride0 * access_bytes} B)" + tpct = master_config.get('traffic_pct') + if tpct is not None: + rpct = master_config.get('traffic_read_pct', 50) + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + detail['read_pct'] = f"{rpct}%" + elif config == '2d': + base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) + first_addr = base + last_addr = (base + (len_d0 - 1) * stride0 * access_bytes + + (n_test // max(len_d0, 1) - 1) * stride1 * access_bytes) % total_mem_bytes + detail['dims'] = f"{len_d0} x (n_rows) stride0={stride0} stride1={stride1}" + idle_between = master_config.get('idle_cycles_between_phases', 0) + if idle_between: + detail['idle_between_phases'] = f"{idle_between} cycles" + elif config == '3d': + base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) + first_addr = base + detail['dims'] = f"{len_d0} x {len_d1} x (n_outer) stride0={stride0} stride1={stride1} stride2={stride2}" + last_addr = base # approximate for 3d + idle_between = master_config.get('idle_cycles_between_phases', 0) + if idle_between: + detail['idle_between_phases'] = f"{idle_between} cycles" + + if first_addr is not None: + detail['first_addr'] = f"0x{first_addr:08x} (bank {_bank_of(first_addr)})" + detail['last_addr'] = f"0x{last_addr:08x} (bank {_bank_of(last_addr)})" + detail['transfer'] = f"{n_test} transactions x {data_width // 8} B = {n_test * data_width // 8} B" + + memory_map_entries.append({'label': label, 'pattern': config, 'n': n_test, + 'detail': detail}) + + # (file_path, n_idle_cycles) -- populated during generation, + # prepended as idle lines before padding for static start delays. 
+ pending_start_delays = [] + + def _parse_maybe_bin_int(raw_value, default_value): + """Parse an int or binary/hex/decimal string; return default on failure.""" + if raw_value is None: + return default_value + if isinstance(raw_value, int): + return raw_value + if isinstance(raw_value, str): + v = raw_value.strip() + if not v: + return default_value + if set(v) <= {"0", "1"}: + return int(v, 2) + try: + return int(v, 0) + except ValueError: + return default_value + return default_value + + def _normalize_mem_access_type(raw_value, master_name): + allowed = {"random", "linear", "2d", "3d", "idle", "matmul_phased"} + aliases = {"matmul": "matmul_phased"} + + if not isinstance(raw_value, str): + print( + f"ERROR: {master_name} has invalid mem_access_type={raw_value} " + f"(type={type(raw_value).__name__}). Allowed: {', '.join(sorted(allowed))}" + ) + sys.exit(1) + + key = aliases.get(raw_value.strip().lower(), raw_value.strip().lower()) + if key not in allowed: + print( + f"ERROR: {master_name} has invalid mem_access_type='{raw_value}'. " + f"Allowed: {', '.join(sorted(allowed))}" + ) + sys.exit(1) + return key + + def _warn_if_id_mismatch(master_cfg, expected_idx, master_name): + raw_id = master_cfg.get("id", expected_idx) + try: + cfg_id = int(raw_id) + except (TypeError, ValueError): + print(f"WARNING: {master_name} has non-integer id={raw_id}; positional index {expected_idx} is used.") + return + if cfg_id != expected_idx: + print( + f"WARNING: {master_name} has id={cfg_id} but positional index is {expected_idx}; " + "stimuli-to-driver mapping is positional." 
+ ) + + def _resolve_n_transactions(master_config: dict, mem_access_type: str, data_width: int, kind: str, local_idx: int) -> int: + """Resolve n_transactions from explicit field or geometry, depending on pattern.""" + if 'n_transactions' in master_config: + return int(master_config['n_transactions']) + if mem_access_type == 'linear': + length = master_config.get('length') + if length is not None: + return int(length) + elif mem_access_type == '2d': + len_d0 = master_config.get('len_d0') + len_d1 = master_config.get('len_d1') + if len_d0 is not None and len_d1 is not None: + return int(len_d0) * int(len_d1) + elif mem_access_type == '3d': + len_d0 = master_config.get('len_d0') + len_d1 = master_config.get('len_d1') + len_d2 = master_config.get('len_d2') + if len_d0 is not None and len_d1 is not None and len_d2 is not None: + return int(len_d0) * int(len_d1) * int(len_d2) + elif mem_access_type == 'matmul_phased': + m = master_config.get('matrix_m') + n = master_config.get('matrix_n') + k = master_config.get('matrix_k') + if m is not None and n is not None and k is not None: + return int(m) * int(k) + int(k) * int(n) + int(m) * int(n) + elif mem_access_type == 'idle': + return 0 + print(f"ERROR: {kind}_{local_idx} has mem_access_type='{mem_access_type}' but no " + f"'n_transactions' and no geometry fields to derive it from.") + sys.exit(1) - Parameters: - - filepath: output raw txt file path - - master_config: dict from workload.json for this master - - is_hwpe: whether this is an HWPE master (affects data width and counts) - - master_global_idx: global master id used by the generator - """ + def _generate_master( + filepath: Path, + master_config: dict, + *, + is_hwpe: bool, + master_global_idx: int, + master_local_idx: int, + n_peers_of_kind: int, + ): nonlocal next_start_id data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH - n_test = N_TEST_HWPE if is_hwpe else N_TEST_LOG - cycle_offset = CYCLE_OFFSET_HWPE if is_hwpe else CYCLE_OFFSET_LOG + 
kind = 'master_hwpe' if is_hwpe else 'master_log' master = StimuliGenerator( - IW, WIDTH_OF_MEMORY, N_BANKS, TOT_MEM_SIZE, data_width, ADD_WIDTH, - str(filepath), n_test, EXACT_OR_MAX_OFFSET, cycle_offset, master_global_idx + IW, DATA_WIDTH, N_BANKS, TOT_MEM_SIZE, data_width, ADD_WIDTH, + str(filepath), 0, master_global_idx ) - config = str(master_config.get('mem_access_type', '0')) - start_address = str(master_config.get('start_address', '0')) - stride0 = int(master_config.get('stride0', 0)) - len_d0 = int(master_config.get('len_d0', 0)) - stride1 = int(master_config.get('stride1', 0)) - len_d1 = int(master_config.get('len_d1', 0)) - stride2 = int(master_config.get('stride2', 0)) - - if config == '0': - next_start_id = master.random_gen(next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '1': - next_start_id = master.linear_gen(stride0, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '2': - next_start_id = master.gen_2d(stride0, len_d0, stride1, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '3': - next_start_id = master.gen_3d(stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) + if 'mem_access_type' not in master_config: + print(f"ERROR: {kind}_{master_local_idx} is missing mem_access_type.") + sys.exit(1) - def _gen_hwpe_master(master_idx, master_config, global_idx): - nonlocal next_start_id - filepath = raw_dir / f"master_hwpe_{master_idx}.txt" - master = StimuliGenerator(IW, WIDTH_OF_MEMORY, N_BANKS, TOT_MEM_SIZE, HWPE_WIDTH_FACT * DATA_WIDTH, ADD_WIDTH, - str(filepath), N_TEST_HWPE, EXACT_OR_MAX_OFFSET, CYCLE_OFFSET_HWPE, global_idx) - config = str(master_config.get('mem_access_type', '0')) + config = _normalize_mem_access_type( + master_config['mem_access_type'], + 
f"{kind}_{master_local_idx}", + ) start_address = str(master_config.get('start_address', '0')) stride0 = int(master_config.get('stride0', 0)) len_d0 = int(master_config.get('len_d0', 0)) @@ -212,19 +360,112 @@ def _gen_hwpe_master(master_idx, master_config, global_idx): len_d1 = int(master_config.get('len_d1', 0)) stride2 = int(master_config.get('stride2', 0)) - if config == '0': - next_start_id = master.random_gen(next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '1': - next_start_id = master.linear_gen(stride0, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '2': - next_start_id = master.gen_2d(stride0, len_d0, stride1, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '3': - next_start_id = master.gen_3d(stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) + # Region parameters + total_mem_bytes = int(TOT_MEM_SIZE * 1024) + access_bytes = max(1, int(data_width // 8)) + default_region_size = total_mem_bytes // max(1, n_peers_of_kind) + default_region_base = master_local_idx * default_region_size + + region_base = _parse_maybe_bin_int(master_config.get('region_base_address'), default_region_base) + region_size = _parse_maybe_bin_int(master_config.get('region_size_bytes'), default_region_size) + + region_base = (region_base // access_bytes) * access_bytes + if region_base >= total_mem_bytes: + region_base = region_base % total_mem_bytes + region_size = (max(0, region_size) // access_bytes) * access_bytes + if region_size <= 0: + region_size = (default_region_size // access_bytes) * access_bytes + if region_base + region_size > total_mem_bytes: + region_size = ((total_mem_bytes - region_base) // access_bytes) * access_bytes + + n_test = _resolve_n_transactions(master_config, config, data_width, 
kind, master_local_idx) + master.N_TEST = n_test + + # Start delay: prepend idle lines directly to the stimuli file after generation + start_delay = int(master_config.get('start_delay_cycles', 0)) + if start_delay > 0: + pending_start_delays.append((filepath, start_delay, data_width)) + + if config == 'random': + tpct = master_config.get('traffic_pct') + next_start_id = master.random_gen( + next_start_id, + LIST_OF_FORBIDDEN_ADDRESSES_READ, + LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + region_base=region_base, + region_size=region_size, + traffic_pct=int(tpct) if tpct is not None else 100, + traffic_read_pct=master_config.get('traffic_read_pct'), + ) + elif config == 'linear': + tpct = master_config.get('traffic_pct') + next_start_id = master.linear_gen( + stride0, start_address, next_start_id, + LIST_OF_FORBIDDEN_ADDRESSES_READ, + LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + traffic_pct=int(tpct) if tpct is not None else 100, + traffic_read_pct=master_config.get('traffic_read_pct'), + ) + elif config == '2d': + next_start_id = master.gen_2d( + stride0, len_d0, stride1, start_address, next_start_id, + LIST_OF_FORBIDDEN_ADDRESSES_READ, + LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + idle_cycles_between_phases=int(master_config.get('idle_cycles_between_phases', 0)), + ) + elif config == '3d': + next_start_id = master.gen_3d( + stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, + LIST_OF_FORBIDDEN_ADDRESSES_READ, + LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + idle_cycles_between_phases=int(master_config.get('idle_cycles_between_phases', 0)), + ) + elif config == 'idle': + next_start_id = master.idle_gen(next_start_id) + elif config == 'matmul_phased': + if not is_hwpe: + print( + f"WARNING: mem_access_type='matmul_phased' is typically used for HWPE masters; " + f"{kind}_{master_local_idx} will still use requested phased behavior." 
+ ) + min_region_size = 3 * access_bytes + if region_size < min_region_size: + print( + f"ERROR: {kind}_{master_local_idx} region_size_bytes=" + f"{region_size} is too small for matmul_phased (minimum {min_region_size})." + ) + sys.exit(1) + next_start_id = master.matmul_phased_gen( + next_start_id, + LIST_OF_FORBIDDEN_ADDRESSES_READ, + LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + region_base, + region_size, + int(master_config.get('matmul_ratio_a', 1)), + int(master_config.get('matmul_ratio_b', 1)), + int(master_config.get('matmul_ratio_c', 1)), + idle_cycles_between_phases=int(master_config.get('idle_cycles_between_phases', 0)), + region_base_address_a=_parse_maybe_bin_int(master_config.get('region_base_address_a'), None), + region_size_bytes_a=_parse_maybe_bin_int(master_config.get('region_size_bytes_a'), None), + region_base_address_b=_parse_maybe_bin_int(master_config.get('region_base_address_b'), None), + region_size_bytes_b=_parse_maybe_bin_int(master_config.get('region_size_bytes_b'), None), + region_base_address_c=_parse_maybe_bin_int(master_config.get('region_base_address_c'), None), + region_size_bytes_c=_parse_maybe_bin_int(master_config.get('region_size_bytes_c'), None), + ) + + _record_memory_map( + kind, master_local_idx, + master_config.get('description', ''), + config, n_test, data_width, access_bytes, + region_base, region_size, + start_address, stride0, len_d0, stride1, len_d1, stride2, + master_config, total_mem_bytes, + ) global_idx = 0 - # Generate logarithmic masters (CORE, DMA, EXT) in order + + # Generate LOG masters (CORE, DMA, EXT) in order for i in range(N_LOG): - # determine class of this master (core/dma/ext) if i < N_CORE: if CORE_ZERO_FLAG: global_idx += 1 @@ -239,66 +480,175 @@ def _gen_hwpe_master(master_idx, master_config, global_idx): continue master_cfg = log_masters[i] - _generate_master(raw_dir / f"master_log_{i}.txt", master_cfg, is_hwpe=False, master_global_idx=global_idx) + _warn_if_id_mismatch(master_cfg, i, f"master_log_{i}") + 
_generate_master( + stimuli_dir / f"master_log_{i}.txt", + master_cfg, + is_hwpe=False, + master_global_idx=global_idx, + master_local_idx=i, + n_peers_of_kind=max(1, N_LOG), + ) global_idx += 1 - # Generate HWPE masters; their global index follows the previous masters + # Generate HWPE masters for hw_idx in range(N_HWPE): if HWPE_ZERO_FLAG: global_idx += 1 continue master_cfg = hwpe_masters[hw_idx] - _generate_master(raw_dir / f"master_hwpe_{hw_idx}.txt", master_cfg, is_hwpe=True, master_global_idx=global_idx) + _warn_if_id_mismatch(master_cfg, hw_idx, f"master_hwpe_{hw_idx}") + _generate_master( + stimuli_dir / f"master_hwpe_{hw_idx}.txt", + master_cfg, + is_hwpe=True, + master_global_idx=global_idx, + master_local_idx=hw_idx, + n_peers_of_kind=max(1, N_HWPE), + ) global_idx += 1 - print("STEP 0 COMPLETED: create raw txt files") - - # Process raw files - simvector_raw_path = str(raw_dir) - simvector_processed_path = str((raw_dir.parent / 'stimuli_processed').resolve()) - unfold_raw_txt(simvector_raw_path, simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT) - print("STEP 1 COMPLETED: unfold txt files") - - pad_txt_files(simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT) - print("STEP 2 COMPLETED: pad txt files") - + print("STEP 0 COMPLETED: generate stimuli files") + + # ----------------------------------------------------------------------- + # Compute WAIT_MASKS and emit phases.mk + # ----------------------------------------------------------------------- + N_DRIVERS = N_LOG + N_HWPE + + phase_to_drivers: dict[str, list[int]] = {} + driver_phases: list[str] = [] + + for i, m in enumerate(log_masters): + phase = str(m.get('phase', 'default')) + driver_phases.append(phase) + phase_to_drivers.setdefault(phase, []).append(i) + + for i, m in enumerate(hwpe_masters): + gidx = N_LOG + i + phase = str(m.get('phase', 'default')) + driver_phases.append(phase) + phase_to_drivers.setdefault(phase, []).append(gidx) + + wait_masks: 
list[int] = [0] * N_DRIVERS + + for i, m in enumerate(log_masters): + mask = 0 + for dep_phase in m.get('wait_for', []): + for dep_drv in phase_to_drivers.get(str(dep_phase), []): + mask |= (1 << dep_drv) + wait_masks[i] = mask + + for i, m in enumerate(hwpe_masters): + gidx = N_LOG + i + mask = 0 + for dep_phase in m.get('wait_for', []): + for dep_drv in phase_to_drivers.get(str(dep_phase), []): + mask |= (1 << dep_drv) + wait_masks[gidx] = mask + + hex_width = max(1, (N_DRIVERS + 3) // 4) + mask_literals = [f"{N_DRIVERS}'h{wait_masks[i]:0{hex_width}x}" for i in range(N_DRIVERS)] + wait_masks_param = "'{" + ", ".join(mask_literals) + "}" + + if args.emit_phases_mk: + phases_mk_path = Path(args.emit_phases_mk) + phases_mk_path.parent.mkdir(parents=True, exist_ok=True) + phases_mk_path.write_text( + "# Auto-generated by main.py - DO NOT EDIT MANUALLY\n" + "# Per-driver dependency masks for tb_hci.sv (WAIT_MASKS_PARAM).\n" + f"# Drivers 0..{N_LOG-1} = log masters, {N_LOG}..{N_DRIVERS-1} = HWPE masters.\n" + f"WAIT_MASKS_PARAM := {wait_masks_param}\n", + encoding='utf-8', + ) + print(f"PHASES.MK written: {phases_mk_path}") + + # ----------------------------------------------------------------------- + # Build and emit memory map report + # ----------------------------------------------------------------------- + word_bytes = DATA_WIDTH // 8 + bank_stride_bytes = N_BANKS * word_bytes + lines = [] + lines.append("=" * 72) + lines.append("MEMORY MAP REPORT") + lines.append(f" Total memory : {TOT_MEM_SIZE} KiB ({TOT_MEM_SIZE * 1024} B)") + lines.append(f" Banks : {N_BANKS} x {word_bytes} B/word (interleaved, stride {bank_stride_bytes} B)") + lines.append(f" Data width : {DATA_WIDTH} b LOG / {HWPE_WIDTH_FACT * DATA_WIDTH} b HWPE") + lines.append("=" * 72) + for entry in memory_map_entries: + lines.append(f"\n [{entry['label']}] pattern={entry['pattern']} n_transactions={entry['n']}") + if 'info' in entry: + lines.append(f" {entry['info']}") + for k, v in 
entry.get('detail', {}).items(): + lines.append(f" {k:<14}: {v}") + lines.append("") + lines.append(" Phase / dependency map:") + for phase, drivers in sorted(phase_to_drivers.items()): + driver_names = [f"log_{d}" if d < N_LOG else f"hwpe_{d - N_LOG}" for d in drivers] + lines.append(f" phase '{phase}': {', '.join(driver_names)}") + for i in range(N_DRIVERS): + if wait_masks[i]: + name = f"log_{i}" if i < N_LOG else f"hwpe_{i - N_LOG}" + deps = [f"log_{j}" if j < N_LOG else f"hwpe_{j - N_LOG}" + for j in range(N_DRIVERS) if wait_masks[i] & (1 << j)] + lines.append(f" {name} waits for: {', '.join(deps)}") + lines.append("=" * 72) + + report_text = "\n".join(lines) + "\n" + print("\n" + report_text) + memory_map_path = generated_dir / 'memory_map.txt' + memory_map_path.write_text(report_text, encoding='utf-8') + print(f"Memory map written: {memory_map_path}") + + # ----------------------------------------------------------------------- + # Apply per-master start delays + # ----------------------------------------------------------------------- + for fpath, delay, dw in pending_start_delays: + if fpath.exists(): + idle_line = "0 " + "0" * IW + " 0 " + "0" * dw + " " + "0" * ADD_WIDTH + "\n" + original = fpath.read_text(encoding='ascii') + fpath.write_text(idle_line * delay + original, encoding='ascii') + + # ----------------------------------------------------------------------- + # Pad all stimuli files to equal length + # ----------------------------------------------------------------------- + pad_txt_files(str(stimuli_dir), IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT) + print("STEP 1 COMPLETED: pad stimuli files") + + # ----------------------------------------------------------------------- + # Golden vectors + # ----------------------------------------------------------------------- if args.golden: golden_dir = (generated_dir / 'golden').resolve() golden_dir.mkdir(parents=True, exist_ok=True) - for stim_path in sorted(processed_dir.glob('master_*.txt')): + for 
stim_path in sorted(stimuli_dir.glob('master_*.txt')): try: text = stim_path.read_text(encoding='ascii') except OSError: continue - if text.strip() == 'zero': - continue - mem = {} out_lines = [] for raw_line in text.splitlines(): line = raw_line.strip() if not line: continue - parts = line.split() if len(parts) != 5: continue - req_s, id_s, wen_s, data_s, add_s = parts if req_s != '1': continue - if wen_s == '0': mem[add_s] = data_s continue - exp_s = mem.get(add_s, '1' * len(data_s)) out_lines.append(f"{id_s} {add_s} {exp_s}") - (golden_dir / f"golden_{stim_path.name}").write_text("\n".join(out_lines) + ("\n" if out_lines else ""), encoding='ascii') - print("STEP 3 COMPLETED: golden vectors") + (golden_dir / f"golden_{stim_path.name}").write_text( + "\n".join(out_lines) + ("\n" if out_lines else ""), encoding='ascii' + ) + print("STEP 2 COMPLETED: golden vectors") if __name__ == '__main__': diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index d921563..d4697e9 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -85,7 +85,8 @@ module application_driver #( // Requests FSM // ////////////////// - typedef enum logic [1:0] { + typedef enum logic [2:0] { + REQ_RESET, REQ_IDLE, WAIT_GNT, REQ_DONE, @@ -102,7 +103,7 @@ module application_driver #( // Sequential logic always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni || clear_i) begin - req_state_q <= REQ_IDLE; + req_state_q <= REQ_RESET; tr_idx_q <= '0; n_req_issued_q <= '0; n_rd_req_issued_q <= '0; @@ -142,6 +143,11 @@ module application_driver #( // inputs: gnt, r_data, r_valid, r_user, r_id case (req_state_q) + REQ_RESET: begin + if (!clear_i) begin + req_state_d = REQ_IDLE; + end + end REQ_IDLE: begin // Check if there are still transactions to issue if (tr_idx_q < transactions.size()) begin @@ -176,10 +182,6 @@ module application_driver #( req_state_d = RSP_DONE; end end - // Synchronously clear - if (clear_i) 
begin - req_state_d = REQ_IDLE; - end end WAIT_GNT: begin hci_if.req = 1'b1; @@ -215,10 +217,6 @@ module application_driver #( req_state_d = WAIT_GNT; end end - // Synchronously clear - if (clear_i) begin - req_state_d = REQ_IDLE; - end end REQ_DONE: begin end_req_o = 1'b1; @@ -228,21 +226,13 @@ module application_driver #( end else begin req_state_d = REQ_DONE; end - // Synchronously clear - if (clear_i) begin - req_state_d = REQ_IDLE; - end end RSP_DONE: begin end_req_o = 1'b1; end_resp_o = 1'b1; - // Synchronously clear - if (clear_i) begin - req_state_d = REQ_IDLE; - end end default: begin - req_state_d = REQ_IDLE; + req_state_d = REQ_RESET; end endcase end @@ -253,7 +243,8 @@ module application_driver #( // We only consider read responses as write responses are not mandatory in HCI - typedef enum logic { + typedef enum logic [1:0] { + RESP_RESET, RESP_IDLE, RESP_WAIT_RVALID } resp_state_t; @@ -284,17 +275,17 @@ module application_driver #( n_rd_in_flight_d = n_rd_in_flight_q; case (resp_state_q) + RESP_RESET: begin + if (!clear_i) begin + resp_state_d = RESP_IDLE; + end + end RESP_IDLE: begin // If a read request is granted, increment in-flight counter and go to RESP_WAIT_RVALID if (hci_if.req && hci_if.wen && hci_if.gnt) begin n_rd_in_flight_d = n_rd_in_flight_q + 1; resp_state_d = RESP_WAIT_RVALID; end - // Synchronously clear - if (clear_i) begin - resp_state_d = RESP_IDLE; - n_rd_in_flight_d = '0; - end end RESP_WAIT_RVALID: begin // Track newly granted reads while waiting @@ -316,15 +307,9 @@ module application_driver #( resp_state_d = RESP_IDLE; end end - // Synchronously clear - if (clear_i) begin - resp_state_d = RESP_IDLE; - n_rd_in_flight_d = '0; - end end default: begin - resp_state_d = RESP_IDLE; - n_rd_in_flight_d = '0; + resp_state_d = RESP_RESET; end endcase end diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index 28f8b52..e8b02bd 100644 --- a/target/verif/src/simulation_report.sv +++ 
b/target/verif/src/simulation_report.sv @@ -20,11 +20,11 @@ module simulation_report import tb_hci_pkg::*; ( - input logic [0:N_DRIVERS-1] end_stimuli_i, - input logic [0:N_DRIVERS-1] end_latency_i, - input real throughput_complete_i, - input real stim_latency_i, - input real tot_latency_i, + input logic [N_DRIVERS-1:0] end_req_i, + input logic [N_DRIVERS-1:0] end_resp_i, + input real throughput_complete_i, + input real stim_latency_i, + input real tot_latency_i, input real latency_per_master_i[N_DRIVERS], input real sum_req_to_gnt_latency_log_i[N_DRIVERS-N_HWPE], input real sum_req_to_gnt_latency_hwpe_i[N_HWPE], @@ -60,6 +60,13 @@ module simulation_report int unsigned log_masters_with_grants; int unsigned hwpe_masters_with_grants; logic missing_reads; + // Ideal bandwidth: maximum data the memory system can serve per cycle. + // Memory side: N_BANKS narrow ports, each DATA_WIDTH bits wide. + real ideal_bw_mem_side_bpc; // bits per cycle (memory-side ceiling) + // Master side: sum of all master bandwidths at 100% traffic. + real ideal_bw_master_side_bpc; // bits per cycle (master-side ceiling) + real ideal_bw_bpc; // min(mem, master) = bottleneck ideal BW + real actual_bw_utilization; // throughput_complete / ideal_bw sum_req_to_gnt_latency_all = 0.0; average_req_to_gnt_latency_weighted = 0.0; @@ -81,9 +88,9 @@ module simulation_report hwpe_masters_with_grants = '0; missing_reads = 1'b0; - wait (&end_stimuli_i); + wait (&end_req_i); wait (stim_latency_i >= 0); - wait (&end_latency_i); + wait (&end_resp_i); wait (throughput_complete_i >= 0); wait (tot_latency_i >= 0); for (int i = 0; i < N_CORE; i++) begin @@ -184,6 +191,19 @@ module simulation_report average_req_to_gnt_latency_hwpe_unweighted / real'(hwpe_masters_with_grants); end + // Ideal bandwidth computation. + // Memory side: each of the N_BANKS banks can serve one DATA_WIDTH word per cycle. 
+ ideal_bw_mem_side_bpc = real'(N_BANKS) * real'(DATA_WIDTH); + // Master side: N_LOG_MASTERS narrow ports + N_HWPE wide ports, all at 100% traffic. + ideal_bw_master_side_bpc = real'(N_LOG_MASTERS) * real'(DATA_WIDTH) + + real'(N_HWPE) * real'(HWPE_WIDTH_FACT * DATA_WIDTH); + // Bottleneck = minimum of the two sides. + ideal_bw_bpc = (ideal_bw_mem_side_bpc < ideal_bw_master_side_bpc) + ? ideal_bw_mem_side_bpc : ideal_bw_master_side_bpc; + // Utilization = actual / ideal. + actual_bw_utilization = (ideal_bw_bpc > 0.0) + ? (throughput_complete_i / ideal_bw_bpc * 100.0) : 0.0; + $display("------ Simulation Summary ------"); $display("\\\\HW CONFIG\\\\"); $display( @@ -205,10 +225,24 @@ module simulation_report $display("\n\\\\BANDWIDTH\\\\"); $display( - "Completion bandwidth (writes granted + reads completed): %0.2f bit/cycle", - throughput_complete_i + "Ideal BW (memory side): %0.0f bit/cycle [%0d banks x %0d bits]", + ideal_bw_mem_side_bpc, N_BANKS, DATA_WIDTH + ); + $display( + "Ideal BW (master side): %0.0f bit/cycle [%0d log x %0d bits + %0d hwpe x %0d bits]", + ideal_bw_master_side_bpc, + N_LOG_MASTERS, DATA_WIDTH, + N_HWPE, HWPE_WIDTH_FACT * DATA_WIDTH + ); + $display( + "Ideal BW (bottleneck): %0.0f bit/cycle", + ideal_bw_bpc + ); + $display( + "Actual BW (completion): %0.2f bit/cycle [utilization: %0.1f%%]", + throughput_complete_i, actual_bw_utilization ); - $display("Stimulus phase duration: %0.2f cycles", stim_latency_i); + $display("Stimulus phase duration: %0.2f cycles", stim_latency_i); $display("Completion phase duration: %0.2f cycles", tot_latency_i); $display( "Granted transactions: reads=%0d writes=%0d total=%0d", diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 2413371..840c452 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -27,7 +27,8 @@ module tb_hci (); logic clk, rst_n; - logic s_clear; + logic [N_DRIVERS-1:0] s_end_req; // end_req_o from all drivers, [N_DRIVERS-1:0] ordering + logic 
[N_DRIVERS-1:0] s_clear_drv; // per-driver clear_i (held 1 until dependencies done) hci_interconnect_ctrl_t s_hci_ctrl; clk_rst_gen #( @@ -194,6 +195,16 @@ module tb_hci ); end end else if (INTERCO_TYPE == MUX) begin : gen_hwpe_mux + // In MUX mode, sel_i must point at the currently active HWPE (the one not gated by + // clear_i). Priority: lowest index wins if multiple are somehow simultaneously active. + logic [$clog2(N_HWPE > 1 ? N_HWPE-1 : 1):0] s_mux_sel; + always_comb begin + s_mux_sel = '0; + for (int i = N_HWPE-1; i >= 0; i--) begin + if (!s_clear_drv[N_LOG_MASTERS + i]) + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); + end + end hci_core_mux_static #( .NB_CHAN(N_HWPE), .`HCI_SIZE_PARAM(in)(HCI_SIZE_hwpe) @@ -201,7 +212,7 @@ module tb_hci .clk_i(clk), .rst_ni(rst_n), .clear_i(s_clear), - .sel_i('0), + .sel_i(s_mux_sel), .in(hci_driver_hwpe_if), .out(hci_initiator_wide[0]) ); @@ -227,11 +238,34 @@ module tb_hci end endgenerate + logic s_clear; assign s_clear = 1'b0; + assign s_hci_ctrl.arb_policy = ARBITER_MODE; assign s_hci_ctrl.invert_prio = INVERT_PRIO; assign s_hci_ctrl.low_prio_max_stall = LOW_PRIO_MAX_STALL; + // Driver clear logic: driver i is held in reset until all drivers j in its effective wait + // mask have asserted end_req_o. If the effective mask is zero, the driver starts immediately. + // + // In MUX mode the user-defined WAIT_MASKS are ignored for HWPE drivers: instead a strict + // sequential chain is enforced (HWPE i waits for HWPE i-1), because hci_core_mux_static + // only forwards one HWPE at a time. HWPE ordering follows the index = position in + // hwpe_masters[] in workload.json. LOG master masks are always taken from WAIT_MASKS. 
+ generate + for (genvar ii = 0; ii < N_DRIVERS; ii++) begin : gen_driver_clear + logic [N_DRIVERS-1:0] eff_mask; + if (INTERCO_TYPE == MUX && ii >= N_LOG_MASTERS) begin : gen_mux_mask + // HWPE 0 (ii == N_LOG_MASTERS): no wait; HWPE k waits for HWPE k-1 + assign eff_mask = (ii == N_LOG_MASTERS) ? '0 : (N_DRIVERS'(1) << (ii - 1)); + end else begin : gen_default_mask + assign eff_mask = WAIT_MASKS[ii]; + end + assign s_clear_drv[ii] = (eff_mask != '0) && + ((s_end_req & eff_mask) != eff_mask); + end + endgenerate + hci_interconnect #( .N_HWPE(N_WIDE_HCI), .N_CORE(N_NARROW_HCI), @@ -284,15 +318,14 @@ module tb_hci // Application drivers // ///////////////////////// - logic [0:N_DRIVERS-1] s_end_stimuli; - logic [0:N_DRIVERS-1] s_end_latency; + logic [N_DRIVERS-1:0] s_end_resp; int unsigned s_issued_transactions[0:N_DRIVERS-1]; int unsigned s_issued_read_transactions[0:N_DRIVERS-1]; generate for (genvar ii = 0; ii < N_LOG_MASTERS; ii++) begin : gen_app_driver_log localparam string STIM_FILE_LOG = - $sformatf("../simvectors/generated/stimuli_processed/master_log_%0d.txt", ii); + $sformatf("../simvectors/generated/stimuli/master_log_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), .DATA_WIDTH(DATA_WIDTH), @@ -302,10 +335,10 @@ module tb_hci ) i_app_driver_log ( .clk_i(clk), .rst_ni(rst_n), - .clear_i(1'b0), + .clear_i(s_clear_drv[ii]), .hci_if(hci_driver_log_if[ii]), - .end_req_o(s_end_stimuli[ii]), - .end_resp_o(s_end_latency[ii]), + .end_req_o(s_end_req[ii]), + .end_resp_o(s_end_resp[ii]), .n_issued_tr_o(s_issued_transactions[ii]), .n_issued_rd_tr_o(s_issued_read_transactions[ii]), .n_retired_rd_tr_o() @@ -316,7 +349,7 @@ module tb_hci generate for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_app_driver_hwpe localparam string STIM_FILE_HWPE = - $sformatf("../simvectors/generated/stimuli_processed/master_hwpe_%0d.txt", ii); + $sformatf("../simvectors/generated/stimuli/master_hwpe_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), 
.DATA_WIDTH(HWPE_WIDTH_FACT * DATA_WIDTH), @@ -326,10 +359,10 @@ module tb_hci ) i_app_driver_hwpe ( .clk_i(clk), .rst_ni(rst_n), - .clear_i(1'b0), + .clear_i(s_clear_drv[N_LOG_MASTERS + ii]), .hci_if(hci_driver_hwpe_if[ii]), - .end_req_o(s_end_stimuli[N_LOG_MASTERS + ii]), - .end_resp_o(s_end_latency[N_LOG_MASTERS + ii]), + .end_req_o(s_end_req[N_LOG_MASTERS + ii]), + .end_resp_o(s_end_resp[N_LOG_MASTERS + ii]), .n_issued_tr_o(s_issued_transactions[N_LOG_MASTERS + ii]), .n_issued_rd_tr_o(s_issued_read_transactions[N_LOG_MASTERS + ii]), .n_retired_rd_tr_o() @@ -366,8 +399,8 @@ module tb_hci ) i_throughput_monitor ( .clk_i(clk), .rst_ni(rst_n), - .end_stimuli_i(s_end_stimuli), - .end_latency_i(s_end_latency), + .end_req_i(s_end_req), + .end_resp_i(s_end_resp), .n_read_complete_log_i(N_READ_COMPLETE_TRANSACTIONS_LOG), .n_read_complete_hwpe_i(N_READ_COMPLETE_TRANSACTIONS_HWPE), .n_write_granted_log_i(N_WRITE_GRANTED_TRANSACTIONS_LOG), @@ -399,8 +432,8 @@ module tb_hci ); simulation_report i_simulation_report ( - .end_stimuli_i(s_end_stimuli), - .end_latency_i(s_end_latency), + .end_req_i(s_end_req), + .end_resp_i(s_end_resp), .throughput_complete_i(throughput_completed), .stim_latency_i(stim_latency), .tot_latency_i(tot_latency), diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index cfcb3c6..f8812fd 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -33,11 +33,6 @@ package tb_hci_pkg; localparam time APPL_DELAY = 0; localparam unsigned RST_CLK_CYCLES = `ifdef RST_CLK_CYCLES `RST_CLK_CYCLES `else 10 `endif; - // Transaction counts - localparam int unsigned N_TRANSACTION_LOG = `ifdef N_TRANSACTION_LOG `N_TRANSACTION_LOG `else 10 `endif; - localparam int unsigned TRANSACTION_RATIO = `ifdef TRANSACTION_RATIO `TRANSACTION_RATIO `else 1 `endif; - localparam int unsigned N_TRANSACTION_HWPE = int'(N_TRANSACTION_LOG * TRANSACTION_RATIO); - // TCDM and arbitration parameters localparam int unsigned RANDOM_GNT = 
`ifdef RANDOM_GNT `RANDOM_GNT `else 0 `endif; localparam int unsigned ARBITER_MODE = 0; @@ -82,6 +77,13 @@ package tb_hci_pkg; localparam int unsigned N_LOG_MASTERS = N_CORE + N_DMA + N_EXT; localparam int unsigned N_DRIVERS = N_LOG_MASTERS + N_HWPE; + // Per-driver dependency masks: WAIT_MASKS[i][j]=1 means driver i must wait for driver j's + // end_req_o before starting. Generated by main.py, passed via WAIT_MASKS_PARAM define. + // NOTE: in MUX mode these masks are ignored for HWPE drivers and replaced by a sequential + // chain in tb_hci.sv. + localparam logic [N_DRIVERS-1:0] WAIT_MASKS [N_DRIVERS] = + `ifdef WAIT_MASKS_PARAM `WAIT_MASKS_PARAM `else '{default: '0} `endif; + // If fully log interconnect is used, instantiate HWPE_WIDTH_FACT narrow ports for each HWPE. localparam int unsigned N_NARROW_HCI = N_CORE + (N_HWPE * HWPE_WIDTH_FACT) * (INTERCO_TYPE == LOG); @@ -104,10 +106,6 @@ package tb_hci_pkg; localparam int unsigned N_WORDS = (TOT_MEM_SIZE * 1024 / N_BANKS) / WIDTH_OF_MEMORY_BYTE; - localparam int unsigned TOT_CHECK = - N_TRANSACTION_LOG * N_LOG_MASTERS + - N_HWPE * N_TRANSACTION_HWPE * HWPE_WIDTH_FACT; - /////////////// // Bitwidths // /////////////// @@ -194,22 +192,6 @@ package tb_hci_pkg; index = address[BIT_BANK_INDEX-1 + 2:2]; endtask - task calculate_theoretical_throughput(output real throughput_theo); - real tot_data, band_memory_limit, tot_time; - if (TRANSACTION_RATIO >= 1) begin - tot_time = N_TRANSACTION_HWPE; - end else begin - tot_time = N_TRANSACTION_LOG; - end - tot_data = (N_TRANSACTION_LOG * DATA_WIDTH) * N_LOG_MASTERS + - (N_TRANSACTION_HWPE * HWPE_WIDTH_FACT * DATA_WIDTH) * N_HWPE; - throughput_theo = tot_data / tot_time; - band_memory_limit = real'(N_BANKS * DATA_WIDTH); - if (throughput_theo >= band_memory_limit) begin - throughput_theo = band_memory_limit; - end - endtask - function int unsigned get_bank_local_address(input logic [ADDR_WIDTH-1:0] addr_i); logic [ADDR_WIDTH-1:0] mapped_addr; logic 
[ADDR_WIDTH-BIT_BANK_INDEX-1:0] bank_local_addr; diff --git a/target/verif/src/throughput_monitor.sv b/target/verif/src/throughput_monitor.sv index bc7294f..9a7e51b 100644 --- a/target/verif/src/throughput_monitor.sv +++ b/target/verif/src/throughput_monitor.sv @@ -26,8 +26,8 @@ module throughput_monitor #( ) ( input logic clk_i, input logic rst_ni, - input logic [0:N_MASTER-1] end_stimuli_i, - input logic [0:N_MASTER-1] end_latency_i, + input logic [N_MASTER-1:0] end_req_i, + input logic [N_MASTER-1:0] end_resp_i, // Read transactions number input int unsigned n_read_complete_log_i[N_MASTER-N_HWPE], input int unsigned n_read_complete_hwpe_i[N_HWPE], @@ -52,7 +52,7 @@ module throughput_monitor #( #(CLK_PERIOD/100); @(posedge clk_i); start_time = $time; - wait (&end_stimuli_i); + wait (&end_req_i); end_time = $time; stim_time_cycles = real'(end_time - start_time) / real'(CLK_PERIOD); // cycles stim_latency_o = stim_time_cycles; @@ -69,7 +69,7 @@ module throughput_monitor #( #(CLK_PERIOD/100); @(posedge clk_i); start_time = $time; - wait (&end_latency_i); + wait (&end_resp_i); end_time = $time; completion_time_cycles = real'(end_time - start_time) / real'(CLK_PERIOD); // cycles tot_latency_o = completion_time_cycles; @@ -101,7 +101,7 @@ module throughput_monitor #( #(CLK_PERIOD/100); @(posedge clk_i); start_time = $time; - wait (end_latency_i[ii]); + wait (end_resp_i[ii] == 1'b1); end_time = $time; latency_per_master_o[ii] = real'(end_time - start_time) / real'(CLK_PERIOD); end diff --git a/target/verif/verif.mk b/target/verif/verif.mk index 155fecb..3e1ceaa 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -11,6 +11,9 @@ HCI_VERIF_CFG_GEN_DIR = $(HCI_VERIF_CFG_DIR)/generated # Include generated Makefiles include $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk include $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk +ifeq (,$(filter clean%,$(MAKECMDGOALS))) +-include $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk +endif # Bender targets and defines include 
$(HCI_VERIF_DIR)/bender.mk @@ -28,6 +31,30 @@ SIM_VOPT ?= $(SIM_QUESTA) vopt PYTHON ?= python3 +################## +# Simvectors gen # +################## + +GEN_STIM_SCRIPT := $(HCI_VERIF_DIR)/simvectors/main.py +STIM_SRC_FILES := $(shell find $(HCI_VERIF_DIR)/config -type f -not -path '$(HCI_VERIF_CFG_GEN_DIR)/*') \ + $(shell find $(HCI_VERIF_DIR)/simvectors -type f -not -path '$(HCI_VERIF_DIR)/simvectors/generated/*') +SIMVECTORS_GEN_DIR := $(HCI_VERIF_DIR)/simvectors/generated + +.PHONY: stim-verif +stim-verif: $(SIMVECTORS_GEN_DIR)/.stim_stamp +$(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(STIM_SRC_FILES) $(GEN_STIM_SCRIPT) | $(HCI_VERIF_CFG_GEN_DIR) + mkdir -p $(SIMVECTORS_GEN_DIR) + $(PYTHON) $(GEN_STIM_SCRIPT) \ + --workload_config $(WORKLOAD_JSON) \ + --testbench_config $(TESTBENCH_JSON) \ + --hardware_config $(HARDWARE_JSON) \ + --emit_phases_mk $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk + date > $@ + +.PHONY: clean-stim-verif +clean-stim-verif: + rm -rf $(SIMVECTORS_GEN_DIR) + ############## # Config gen # ############## @@ -58,29 +85,7 @@ $(HCI_VERIF_CFG_GEN_DIR): .PHONY: clean-config-verif clean-config-verif: - rm -f $(VERIF_CFG_MK) - -################## -# Simvectors gen # -################## - -GEN_STIM_SCRIPT := $(HCI_VERIF_DIR)/simvectors/main.py -STIM_SRC_FILES := $(shell find {$(HCI_VERIF_DIR)/config,$(HCI_VERIF_DIR)/simvectors} -type f) -SIMVECTORS_GEN_DIR := $(HCI_VERIF_DIR)/simvectors/generated - -.PHONY: stim-verif -stim-verif: $(SIMVECTORS_GEN_DIR)/.stim_stamp -$(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(STIM_SRC_FILES) - mkdir -p $(SIMVECTORS_GEN_DIR) - $(PYTHON) $(GEN_STIM_SCRIPT) \ - --workload_config $(WORKLOAD_JSON) \ - --testbench_config $(TESTBENCH_JSON) \ - --hardware_config $(HARDWARE_JSON) - date > $@ - -.PHONY: clean-stim-verif -clean-stim-verif: - rm -rf $(SIMVECTORS_GEN_DIR) + rm -f $(VERIF_CFG_MK) $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk ############## # Simulation # @@ -113,9 +118,13 @@ ifeq ($(GUI),0) 
SIM_HCI_VSIM_ARGS += -c endif -$(HCI_VERIF_DIR)/vsim/compile.tcl: $(HCI_ROOT)/Bender.lock $(HCI_ROOT)/Bender.yml $(HCI_ROOT)/bender.mk $(HCI_VERIF_DIR)/bender.mk $(SIM_SRC_FILES) $(VERIF_CFG_MK) +WAIT_MASKS_MK := $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk +WAIT_MASKS_PARAM = $(shell grep '^WAIT_MASKS_PARAM' $(WAIT_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) + +$(HCI_VERIF_DIR)/vsim/compile.tcl: $(HCI_ROOT)/Bender.lock $(HCI_ROOT)/Bender.yml $(HCI_ROOT)/bender.mk $(HCI_VERIF_DIR)/bender.mk $(SIM_SRC_FILES) $(VERIF_CFG_MK) $(SIMVECTORS_GEN_DIR)/.stim_stamp mkdir -p $(HCI_VERIF_DIR)/vsim - $(BENDER) script vsim $(COMMON_DEFS) $(VERIF_DEFS) $(COMMON_TARGS) $(VERIF_TARGS) --vlog-arg="$(SIM_HCI_VLOG_ARGS)" > $@ + $(BENDER) script vsim $(COMMON_DEFS) $(VERIF_DEFS) $(COMMON_TARGS) $(VERIF_TARGS) \ + --vlog-arg="$(SIM_HCI_VLOG_ARGS) \"+define+WAIT_MASKS_PARAM=$(WAIT_MASKS_PARAM)\"" > $@ .PHONY: compile-verif compile-verif: $(sim_vsim_lib)/.hw_compiled From 02503990f1f9fcd81dc5c6910840f8f9343e4128 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 9 Mar 2026 13:28:21 +0100 Subject: [PATCH 11/25] tb: Fix error in testing of static mux config --- target/verif/src/tb_hci.sv | 23 +++++++++++++++-------- target/verif/verif.mk | 4 +++- target/verif/vsim/tb_hci.tcl | 5 +++-- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 840c452..63b720d 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -28,6 +28,7 @@ module tb_hci logic clk, rst_n; logic [N_DRIVERS-1:0] s_end_req; // end_req_o from all drivers, [N_DRIVERS-1:0] ordering + logic [N_DRIVERS-1:0] s_end_resp; // end_resp_o from all drivers, [N_DRIVERS-1:0] ordering logic [N_DRIVERS-1:0] s_clear_drv; // per-driver clear_i (held 1 until dependencies done) hci_interconnect_ctrl_t s_hci_ctrl; @@ -195,13 +196,15 @@ module tb_hci ); end end else if (INTERCO_TYPE == MUX) begin : gen_hwpe_mux - // In MUX mode, sel_i must point at the 
currently active HWPE (the one not gated by - // clear_i). Priority: lowest index wins if multiple are somehow simultaneously active. + // In MUX mode, sel_i points at the lowest-indexed HWPE that has not yet finished + // (i.e. whose end_resp_o has not fired). Once HWPE k asserts end_resp_o, sel_i + // advances to k+1. We cannot use s_clear_drv here because HWPE 0 always has + // eff_mask='0 so its clear_drv is permanently 0 even after it finishes. logic [$clog2(N_HWPE > 1 ? N_HWPE-1 : 1):0] s_mux_sel; always_comb begin - s_mux_sel = '0; + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(N_HWPE - 1); for (int i = N_HWPE-1; i >= 0; i--) begin - if (!s_clear_drv[N_LOG_MASTERS + i]) + if (!s_end_resp[N_LOG_MASTERS + i]) s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); end end @@ -246,23 +249,28 @@ module tb_hci assign s_hci_ctrl.low_prio_max_stall = LOW_PRIO_MAX_STALL; // Driver clear logic: driver i is held in reset until all drivers j in its effective wait - // mask have asserted end_req_o. If the effective mask is zero, the driver starts immediately. + // mask have asserted end_resp_o. If the effective mask is zero, the driver starts immediately. // // In MUX mode the user-defined WAIT_MASKS are ignored for HWPE drivers: instead a strict // sequential chain is enforced (HWPE i waits for HWPE i-1), because hci_core_mux_static // only forwards one HWPE at a time. HWPE ordering follows the index = position in // hwpe_masters[] in workload.json. LOG master masks are always taken from WAIT_MASKS. + // + // All drivers use end_resp_o (s_end_resp) as the handoff condition, guaranteeing that all + // in-flight reads from the predecessor have been fully retired before the successor starts. + // This is required for correctness in MUX mode (hci_core_mux_static gates r_valid to the + // non-selected channel), and is conservatively safe for all other modes. 
generate for (genvar ii = 0; ii < N_DRIVERS; ii++) begin : gen_driver_clear logic [N_DRIVERS-1:0] eff_mask; if (INTERCO_TYPE == MUX && ii >= N_LOG_MASTERS) begin : gen_mux_mask - // HWPE 0 (ii == N_LOG_MASTERS): no wait; HWPE k waits for HWPE k-1 + // HWPE 0 (ii == N_LOG_MASTERS): no wait; HWPE k waits for HWPE k-1's end_resp_o assign eff_mask = (ii == N_LOG_MASTERS) ? '0 : (N_DRIVERS'(1) << (ii - 1)); end else begin : gen_default_mask assign eff_mask = WAIT_MASKS[ii]; end assign s_clear_drv[ii] = (eff_mask != '0) && - ((s_end_req & eff_mask) != eff_mask); + ((s_end_resp & eff_mask) != eff_mask); end endgenerate @@ -318,7 +326,6 @@ module tb_hci // Application drivers // ///////////////////////// - logic [N_DRIVERS-1:0] s_end_resp; int unsigned s_issued_transactions[0:N_DRIVERS-1]; int unsigned s_issued_read_transactions[0:N_DRIVERS-1]; diff --git a/target/verif/verif.mk b/target/verif/verif.mk index 3e1ceaa..5ef479b 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -4,6 +4,8 @@ # # Sergio Mazzola +.DELETE_ON_ERROR: + HCI_VERIF_DIR = $(HCI_ROOT)/target/verif HCI_VERIF_CFG_DIR = $(HCI_VERIF_DIR)/config HCI_VERIF_CFG_GEN_DIR = $(HCI_VERIF_CFG_DIR)/generated @@ -42,7 +44,7 @@ SIMVECTORS_GEN_DIR := $(HCI_VERIF_DIR)/simvectors/generated .PHONY: stim-verif stim-verif: $(SIMVECTORS_GEN_DIR)/.stim_stamp -$(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(STIM_SRC_FILES) $(GEN_STIM_SCRIPT) | $(HCI_VERIF_CFG_GEN_DIR) +$(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(VERIF_CFG_MK) $(STIM_SRC_FILES) $(GEN_STIM_SCRIPT) | $(HCI_VERIF_CFG_GEN_DIR) mkdir -p $(SIMVECTORS_GEN_DIR) $(PYTHON) $(GEN_STIM_SCRIPT) \ --workload_config $(WORKLOAD_JSON) \ diff --git a/target/verif/vsim/tb_hci.tcl b/target/verif/vsim/tb_hci.tcl index 33b1e72..72d5007 100644 --- a/target/verif/vsim/tb_hci.tcl +++ b/target/verif/vsim/tb_hci.tcl @@ -71,8 +71,8 @@ if {$GUI == 1} { } # Metrics - add wave -noupdate -group metrics /tb_hci/s_end_latency - add wave -noupdate -group metrics 
/tb_hci/s_end_stimuli + add wave -noupdate -group metrics /tb_hci/s_end_req + add wave -noupdate -group metrics /tb_hci/s_end_resp + add wave -noupdate -group metrics /tb_hci/s_issued_transactions + add wave -noupdate -group metrics /tb_hci/s_issued_read_transactions + add wave -noupdate -group metrics /tb_hci/stim_latency @@ -91,6 +91,7 @@ if {$GUI == 1} { add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE add wave -noupdate /tb_hci/s_clear + add wave -noupdate /tb_hci/s_clear_drv add wave -noupdate /tb_hci/s_hci_ctrl configure wave -signalnamewidth 1 From 2719f316f5a264967be571243588ecb8d83386b6 Mon Sep 17 00:00:00 2001 From: Francesco Conti Date: Fri, 6 Mar 2026 23:41:02 +0100 Subject: [PATCH 12/25] Add support for fractional priority in arbiter Previously, HCI supported QoS by switching to the low-priority channel for 1 stall-cycle out of N, where N-1 was specified as the low_prio_max_stall configuration from a CSR in the system. This commit changes the mechanism into a more flexible fractional priority. The counter has two configuration params (8-bit), both from CSRs: priority_cnt_numerator and priority_cnt_denominator. The former is the same as low_prio_max_stall, i.e., the number of stall-cycles the arbiter spends prioritizing the high-priority side. The latter is the total number of stall cycles after which the counter is cleared, meaning that the arbiter will spend priority_cnt_denominator-priority_cnt_numerator stall-cycles prioritizing the low-priority side. In other words, the arbiter will dedicate a fraction numerator/denominator of the available bandwidth (on average) to the high-priority side. Hardware-wise the change is minimal, touching only how the counter works and the CSRs. 
--- rtl/common/hci_package.sv | 3 +- rtl/hci_interconnect.sv | 3 +- rtl/interco/hci_arbiter.sv | 36 ++++++++++--------- target/verif/bender.mk | 3 +- .../verif/config/generated/testbench.mk.tpl | 3 +- target/verif/config/testbench.json | 3 +- target/verif/src/tb_hci.sv | 3 +- target/verif/src/tb_hci_pkg.sv | 3 +- 8 files changed, 34 insertions(+), 23 deletions(-) diff --git a/rtl/common/hci_package.sv b/rtl/common/hci_package.sv index 1e3d1e1..15a36b8 100644 --- a/rtl/common/hci_package.sv +++ b/rtl/common/hci_package.sv @@ -51,7 +51,8 @@ package hci_package; typedef struct packed { logic [1:0] arb_policy; // used only in some systems logic invert_prio; - logic [7:0] low_prio_max_stall; + logic [7:0] priority_cnt_numerator; + logic [7:0] priority_cnt_denominator; } hci_interconnect_ctrl_t; typedef struct packed { diff --git a/rtl/hci_interconnect.sv b/rtl/hci_interconnect.sv index 53e10ad..fee4918 100644 --- a/rtl/hci_interconnect.sv +++ b/rtl/hci_interconnect.sv @@ -278,7 +278,8 @@ module hci_interconnect always_comb begin ctrl_arbiter_tree = ctrl_i; - ctrl_arbiter_tree.low_prio_max_stall = 1; + ctrl_arbiter_tree.priority_cnt_numerator = 1; + ctrl_arbiter_tree.priority_cnt_denominator = 2; end hci_arbiter_tree #( diff --git a/rtl/interco/hci_arbiter.sv b/rtl/interco/hci_arbiter.sv index 2447608..da75e51 100644 --- a/rtl/interco/hci_arbiter.sv +++ b/rtl/interco/hci_arbiter.sv @@ -44,13 +44,15 @@ * .. _hci_arbiter_ctrl: * .. table:: **hci_arbiter** input control signals. * - * +----------------------+------------------------+---------------------------------------------------------------+ - * | **Name** | **Type** | **Description** | - * +----------------------+------------------------+---------------------------------------------------------------+ - * | *invert_prio* | `logic` | When 1, invert priorities between `in_high` and `in_low`. 
| - * +----------------------+------------------------+---------------------------------------------------------------+ - * | *low_prio_max_stall* | `logic[7:0]` | Maximum number of consecutive stalls on low-priority channel. | - * +----------------------+------------------------+---------------------------------------------------------------+ + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | **Name** | **Type** | **Description** | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | *invert_prio* | `logic` | When 1, invert priorities between `in_high` and `in_low`. | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | *priority_cnt_numerator* | `logic[7:0]` | Maximum number of consecutive stalls on low-priority channel. | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | *priority_cnt_denominator* | `logic[7:0]` | Clear condition of priority counter (max low-prio stalls + high-prio stalls). | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ * */ @@ -75,8 +77,9 @@ module hci_arbiter logic [NB_CHAN-1:0] hs_pass_d; logic hs_req_d; logic ls_req_d; + logic hs_req_masked_d; logic switch_channels_d; - logic unsigned [7:0] ls_stall_ctr_d; + logic unsigned [7:0] priority_cnt_q; // priority_req is the OR of all requests coming out of the log interconnect. 
// it should be simplified to simply an OR of all requests coming *into* the @@ -85,10 +88,11 @@ module hci_arbiter begin hs_req_d = |hs_req_in; ls_req_d = |ls_req_in; - if (ctrl_i.low_prio_max_stall > 0) //Set to 0 to disable this functionality + hs_req_masked_d = hs_req_d; + if (ctrl_i.priority_cnt_numerator > 0) //Set to 0 to disable this functionality begin - if (ls_stall_ctr_d >= ctrl_i.low_prio_max_stall) - hs_req_d = 0; //Let low side through for once + if (priority_cnt_q >= ctrl_i.priority_cnt_numerator && priority_cnt_q < ctrl_i.priority_cnt_denominator) + hs_req_masked_d = 0; //Let low side through for once end end @@ -96,11 +100,11 @@ module hci_arbiter always_ff @(posedge clk_i or negedge rst_ni) begin if (~rst_ni) - ls_stall_ctr_d <= 0; + priority_cnt_q <= 0; + else if(priority_cnt_q == ctrl_i.priority_cnt_denominator-1) + priority_cnt_q <= 0; else if (hs_req_d & ls_req_d) - ls_stall_ctr_d <= ls_stall_ctr_d + 1; - else - ls_stall_ctr_d <= 0; + priority_cnt_q <= priority_cnt_q + 1; end assign switch_channels_d = ctrl_i.invert_prio; @@ -129,7 +133,7 @@ module hci_arbiter // Side select generate for(genvar ii=0; ii Date: Sat, 7 Mar 2026 22:03:05 +0100 Subject: [PATCH 13/25] add regression test (no CI yet) --- regr/basic.yml | 31 +++ regr/bwruntests.py | 393 ++++++++++++++++++++++++++++++++++ regr/full_regression.sh | 37 ++++ regr/hardware/hci.json | 17 ++ regr/hardware/log.json | 17 ++ regr/testbench/fair.json | 11 + regr/testbench/hwpe_prio.json | 11 + regr/testbench/log_prio.json | 11 + regr/testbench/testbench.json | 11 + 9 files changed, 539 insertions(+) create mode 100644 regr/basic.yml create mode 100755 regr/bwruntests.py create mode 100755 regr/full_regression.sh create mode 100644 regr/hardware/hci.json create mode 100644 regr/hardware/log.json create mode 100644 regr/testbench/fair.json create mode 100644 regr/testbench/hwpe_prio.json create mode 100644 regr/testbench/log_prio.json create mode 100644 regr/testbench/testbench.json diff --git 
a/regr/basic.yml b/regr/basic.yml new file mode 100644 index 0000000..ab21052 --- /dev/null +++ b/regr/basic.yml @@ -0,0 +1,31 @@ +# Copyright (C) 2024 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Francesco Conti (f.conti@unibo.it) +# + +hci_tests: + log_fair: + path: . + command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/fair.json HARDWARE_JSON=regr/hardware/log.json + hci_fair: + path: . + command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/fair.json HARDWARE_JSON=regr/hardware/hci.json + hci_hwpe_prio: + path: . + command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/hwpe_prio.json HARDWARE_JSON=regr/hardware/hci.json + hci_log_prio: + path: . 
+ command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/log_prio.json HARDWARE_JSON=regr/hardware/hci.json diff --git a/regr/bwruntests.py b/regr/bwruntests.py new file mode 100755 index 0000000..555a1d4 --- /dev/null +++ b/regr/bwruntests.py @@ -0,0 +1,393 @@ +#!/usr/bin/env python3 + +# Copyright 2020 ETH Zurich and University of Bologna +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# Run shell commands listed in a file separated by newlines in a parallel +# fashion. If requested the results (tuples consisting of command, stdout, +# stderr and returncode) will be gathered in a junit.xml file. There a few +# knobs to tune the number of spawned processes and the junit.xml formatting. 
+ +# Author: Robert Balas (balasr@iis.ee.ethz.ch) + +import argparse +import re +from subprocess import (Popen, TimeoutExpired, + CalledProcessError, PIPE) +from threading import Lock +import shlex +import sys +import signal +import os +import multiprocessing +import errno +import pprint +import time +import random +from collections import OrderedDict +import json + +runtest = argparse.ArgumentParser( + prog='bwruntests', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="""Run PULP tests in parallel""", + epilog=""" +Test_file needs to be either a .yaml file (set the --yaml switch) +which looks like this: + +mytests.yml +[...] +parallel_bare_tests: # name of the test set + parMatrixMul8: # name of the test + path: ./parallel_bare_tests/parMatrixMul8 # path to the test's folder + command: make clean all run # command to run in the test's folder +[...] + +or + +Test_file needs to be a list of commands to be executed. Each line corresponds +to a single command and a test + +commands.f +[...] +make -C ./ml_tests/mlGrad clean all run +make -C ./ml_tests/mlDct clean all run +[...] + +Example: +bwruntests.py --proc-verbose -v \\ + --report_junit -t 3600 --yaml \\ + -o simplified-runtime.xml runtime-tests.yaml + +This Runs a set of tests defined in runtime-tests.yaml and dumps the +resulting junit.xml into simplified-runtime.xml. The --proc-verbose +scripts makes sure to print the stdout of each process to the shell. To +prevent a broken process from running forever, a maximum timeout of 3600 +seconds was set. 
For debugging purposes we enabled -v (--verbose) which +shows the full set of commands being run.""") + +runtest.version = '0.2' + +runtest.add_argument('test_file', type=str, + help='file defining tests to be run') +runtest.add_argument('--version', action='version', + version='%(prog)s ' + runtest.version) +runtest.add_argument('-p', '--max_procs', type=int, + default=multiprocessing.cpu_count(), + help="""Number of parallel + processes used to run test. + Default is number of cpu cores.""") +runtest.add_argument('-t', '--timeout', type=float, + default=None, + help="""Timeout for all processes in seconds""") +runtest.add_argument('-v', '--verbose', action='store_true', + help="""Enable verbose output""") +runtest.add_argument('-s', '--proc_verbose', action='store_true', + help="""Write processes' stdout and stderr to shell stdout + after they terminate""") +runtest.add_argument('--report_junit', action='store_true', + help="""Generate a junit report""") +runtest.add_argument('--disable_junit_pp', action='store_true', + help="""Disable pretty print of junit report""") +runtest.add_argument('--disable_results_pp', action='store_true', + help="""Disable printing test results""") +runtest.add_argument('-y,', '--yaml', action='store_true', + help="""Read tests from yaml file instead of executing + from a list of commands""") +runtest.add_argument('-o,', '--output', type=str, + help="""Write junit.xml to file instead of stdout""") +runtest.add_argument('-P,', '--perf', type=str, default=None, + help="""Write performance results to JSON file""") +stdout_lock = Lock() + +shared_total = 0 +len_total = 0 + +class FinishedProcess(object): + """A process that has finished running. 
+ """ + def __init__(self, name, cwd, runargs, returncode, + stdout=None, stderr=None, time=None): + self.name = name + self.cwd = cwd + self.runargs = runargs + self.returncode = returncode + self.stdout = stdout + self.stderr = stderr + self.time = time + exec_time = 0 + throughput = 0 + workload = 0 + if returncode == 0: + matches = re.findall("# hwpe cycles =\s+(\d+)", stdout) + if matches: + exec_time = int(matches[0]) + self.exec_time = exec_time + + + def __repr__(self): + runargs = ['name={!r}'.format(self.name)] + runargs += ['cwd={!r}'.format(self.cwd)] + runargs += ['args={!r}'.format(self.runargs), + 'returncode={!r}'.format(self.returncode)] + if self.stdout is not None: + runargs.append('stdout={!r}'.format(self.stdout)) + if self.stderr is not None: + runargs.append('stderr={!r}'.format(self.stderr)) + if self.time is not None: + runargs.append('time={!r}'.format(self.time)) + return "{}({})".format(type(self).__name__, ', '.join(runargs)) + +def fork(name, cwd, *popenargs, check=False, shell=True, + **kwargs): + """Run subprocess and return process args, error code, stdout and stderr + """ + + def proc_out(cwd, stdout, stderr): + print('cwd={}'.format(cwd)) + print('stdout=') + print(stdout.decode('utf-8')) + print('stderr=') + print(stderr.decode('utf-8')) + + kwargs['stdout'] = PIPE + kwargs['stderr'] = PIPE + + with Popen(*popenargs, preexec_fn=os.setpgrp, cwd=cwd, + **kwargs) as process: + try: + # Child and parent are racing for setting/using the pgid so we have + # to set it in both processes. See glib manual. + try: + os.setpgid(process.pid, process.pid) + except OSError as e: + if e.errno != errno.EACCES: + raise + # measure runtime + start = time.time() + stdout, stderr = process.communicate(input, timeout=args.timeout) + except TimeoutExpired: + pgid = os.getpgid(process.pid) + os.killpg(pgid, signal.SIGKILL) + # process.kill() will only kill the immediate child but not its + # forks. 
This won't work since our commands will create a few forks + # (make -> vsim -> etc). We need to make a process group and kill + # that + stdout, stderr = process.communicate() + timeoutmsg = 'TIMEOUT after {:f}s'.format(args.timeout) + + if args.proc_verbose: + stdout_lock.acquire() + print(name) + print(timeoutmsg) + proc_out(cwd, stdout, stderr) + stdout_lock.release() + + return FinishedProcess(name, cwd, process.args, 1, + stdout.decode('utf-8'), + timeoutmsg + '\n' + + stderr.decode('utf-8'), + time.time() - start) + # Including KeyboardInterrupt, communicate handled that. + except: # noqa: E722 + pgid = os.getpgid(process.pid) + os.killpg(pgid, signal.SIGKILL) + # We don't call process.wait() as .__exit__ does that for us. + raise + retcode = process.poll() + if check and retcode: + raise CalledProcessError(retcode, process.args, + output=stdout, stderr=stderr) + if args.proc_verbose: + stdout_lock.acquire() + print(name) + proc_out(cwd, stdout, stderr) + stdout_lock.release() + + with lock: + shared_total.value += 1 + print("[%s][%d/%d] %s" % ("\033[1;32m OK \033[0m" if retcode == 0 else "\033[1;31mFAIL\033[0m", shared_total.value, len_total.value, name)) + + return FinishedProcess(name, cwd, process.args, retcode, + stdout.decode('utf-8'), + stderr.decode('utf-8'), + time.time() - start) + +def poolInit(s, t, l): + global shared_total + global len_total + global lock + shared_total = s + len_total = t + lock = l + +if __name__ == '__main__': + args = runtest.parse_args() + pp = pprint.PrettyPrinter(indent=4) + + # lazy importing so that we can work without junit_xml + if args.report_junit: + try: + from junit_xml import TestSuite, TestCase + except ImportError: + print("""Error: The --report_junit option requires +the junit_xml library which is not installed.""", + file=sys.stderr) + exit(1) + + # lazy import PrettyTable for displaying results + if not(args.disable_results_pp): + try: + from prettytable import PrettyTable + except ImportError: + 
print("""Warning: Displaying results requires the PrettyTable +library which is not installed""") + + tests = [] # list of tuple (testname, working dir, command) + + # load tests (yaml or command list) + if args.yaml: + try: + import yaml + except ImportError: + print("""Error: The --yaml option requires +the pyyaml library which is not installed.""", + file=sys.stderr) + exit(1) + with open(args.test_file) as f: + testyaml = yaml.load(f, Loader=yaml.Loader) + for testsetname, testv in testyaml.items(): + for testname, insn in testv.items(): + cmd = shlex.split(insn['command']) + cwd = insn['path'] + tests.append((testsetname + ':' + testname, cwd, cmd)) + if args.verbose: + pp.pprint(tests) + else: # (command list) + with open(args.test_file) as f: + testnames = list(map(str.rstrip, f)) + shellcmds = [shlex.split(e) for e in testnames] + cwds = ['./' for e in testnames] + tests = list(zip(testnames, cwds, shellcmds)) + if args.verbose: + print('Tests which we are running:') + pp.pprint(tests) + pp.pprint(shellcmds) + + # Spawning process pool + # Disable signals to prevent race. Child processes inherit SIGINT handler + original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN) + lock = multiprocessing.Lock() + shared_total = multiprocessing.Value('i', 0) + len_total = multiprocessing.Value('i', len(tests)) + pool = multiprocessing.Pool(processes=args.max_procs, initializer=poolInit, initargs=(shared_total, len_total, lock )) + # Restore SIGINT handler + signal.signal(signal.SIGINT, original_sigint_handler) + # Shuffle tests + random.shuffle(tests) + try: + procresults = pool.starmap(fork, tests) + except KeyboardInterrupt: + print("\nTerminating bwruntest.py") + pool.terminate() + pool.join() + exit(1) + + # pp.pprint(procresults) + pool.close() + pool.join() + + # Generate junit.xml file. Junit.xml differentiates between failure and + # errors but we treat everything as errors. 
+ if args.report_junit: + testcases = [] + for p in procresults: + # we can either expect p.name = testsetname:testname + # or p.name = testname + testcase = TestCase(p.name, + classname=((p.name).split(':'))[0], + stdout=p.stdout, + stderr=p.stderr, + elapsed_sec=p.time) + if p.returncode != 0: + testcase.add_failure_info(p.stderr) + testcases.append(testcase) + + testsuite = TestSuite('bwruntests', testcases) + if args.output: + with open(args.output, 'w') as f: + TestSuite.to_file(f, [testsuite], + prettyprint=not(args.disable_junit_pp)) + else: + print(TestSuite.to_xml_string([testsuite], + prettyprint=(args.disable_junit_pp))) + + # # print JSON for performance regression + # if args.perf is not None: + # # if file does not exist, create new dictionary: + # if not os.path.isfile(args.perf): + # d = OrderedDict([]) + # # else, load the existing dictionary + # else: + # with open(args.perf) as f: + # d = json.load(f, object_pairs_hook=OrderedDict) + # # save the new execution times + # for p in procresults: + # if p.returncode == 0: + # d[p.name] = p.exec_time + # with open(args.perf, 'w', encoding='utf-8') as f: + # json.dump(d, f, ensure_ascii=False, indent=4) + + # print JSON for performance regression + if args.perf is not None: + # if file does not exist, create new dictionary: + if not os.path.isfile(args.perf): + d = list([]) + # else, load the existing dictionary + else: + with open(args.perf) as f: + d = json.load(f) + # save the new execution times + for p in procresults: + if p.returncode == 0: + d.append({ 'name': p.name, 'value': p.exec_time, 'unit': 'cycles'}) + with open(args.perf, 'w', encoding='utf-8') as f: + json.dump(d, f, ensure_ascii=False, indent=4) + + # print summary of test results + if not(args.disable_results_pp): + testcount = sum(1 for x in tests) + testfailcount = sum(1 for p in procresults if p.returncode != 0) + testpassedcount = testcount - testfailcount + resulttable = PrettyTable(['test', 'cycles', 'time', 'passed/total']) + 
resulttable.align['test'] = "l" + for p in procresults: + testpassed = 1 if p.returncode == 0 else 0 + testname = p.name + resulttable.add_row([testname, + p.exec_time, + '{0:.2f}s'.format(p.time), + '{0:d}/{1:d}'.format(testpassed, 1)]) + resulttable.add_row(['total', '', '', '{0:d}/{1:d}'. + format(testpassedcount, testcount)]) + print(resulttable) + if testpassedcount != testcount: + import sys; sys.exit(1) + diff --git a/regr/full_regression.sh b/regr/full_regression.sh new file mode 100755 index 0000000..1e10f4e --- /dev/null +++ b/regr/full_regression.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright (C) 2020-2024 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Francesco Conti (f.conti@unibo.it) +# + +export N_PROC=1 +export P_STALL=0.04 +TIMEOUT=400 + +# Declare a string array with type +declare -a test_list=( + "regr/basic.yml" +) + +# Read the list values with space +for val in "${test_list[@]}"; do + nice -n10 regr/bwruntests.py --report_junit -t ${TIMEOUT} --yaml -o regr/hci_tests.xml -p${N_PROC} $val + if test $? 
-ne 0; then + echo "Error in test $val" + exit 1 + fi +done +unset P_STALL diff --git a/regr/hardware/hci.json b/regr/hardware/hci.json new file mode 100644 index 0000000..c8db224 --- /dev/null +++ b/regr/hardware/hci.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 3, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 128, + "N_BANKS": 32, + "INTERCO_TYPE": "HCI", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/regr/hardware/log.json b/regr/hardware/log.json new file mode 100644 index 0000000..6a6a3bd --- /dev/null +++ b/regr/hardware/log.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 3, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 128, + "N_BANKS": 32, + "INTERCO_TYPE": "LOG", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/regr/testbench/fair.json b/regr/testbench/fair.json new file mode 100644 index 0000000..011e20b --- /dev/null +++ b/regr/testbench/fair.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 0, + "PRIORITY_CNT_NUMERATOR": 1, + "PRIORITY_CNT_DENOMINATOR": 2 + } +} diff --git a/regr/testbench/hwpe_prio.json b/regr/testbench/hwpe_prio.json new file mode 100644 index 0000000..4eefeb8 --- /dev/null +++ b/regr/testbench/hwpe_prio.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 1, + "PRIORITY_CNT_NUMERATOR": 9, + "PRIORITY_CNT_DENOMINATOR": 10 + } +} diff --git a/regr/testbench/log_prio.json b/regr/testbench/log_prio.json new file mode 100644 index 0000000..473e3fd --- /dev/null +++ 
b/regr/testbench/log_prio.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 0, + "PRIORITY_CNT_NUMERATOR": 9, + "PRIORITY_CNT_DENOMINATOR": 10 + } +} diff --git a/regr/testbench/testbench.json b/regr/testbench/testbench.json new file mode 100644 index 0000000..011e20b --- /dev/null +++ b/regr/testbench/testbench.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 0, + "PRIORITY_CNT_NUMERATOR": 1, + "PRIORITY_CNT_DENOMINATOR": 2 + } +} From 2674f77b791588f39fa521707b2bd20238522558 Mon Sep 17 00:00:00 2001 From: Francesco Conti Date: Sat, 7 Mar 2026 22:08:13 +0100 Subject: [PATCH 14/25] Add basic github/gitlab CI workflow --- .github/workflows/gitlab-ci.yml | 29 ++++++++++++ .gitlab-ci.yml | 44 +++++++++++++++++++ regr/basic.yml | 8 ++-- regr/hardware/{hci.json => hci/hardware.json} | 0 regr/hardware/{log.json => log/hardware.json} | 0 regr/testbench/fair.json | 11 ----- regr/testbench/{ => fair}/testbench.json | 0 .../testbench.json} | 0 .../testbench.json} | 0 9 files changed, 77 insertions(+), 15 deletions(-) create mode 100644 .github/workflows/gitlab-ci.yml create mode 100644 .gitlab-ci.yml rename regr/hardware/{hci.json => hci/hardware.json} (100%) rename regr/hardware/{log.json => log/hardware.json} (100%) delete mode 100644 regr/testbench/fair.json rename regr/testbench/{ => fair}/testbench.json (100%) rename regr/testbench/{hwpe_prio.json => hwpe_prio/testbench.json} (100%) rename regr/testbench/{log_prio.json => log_prio/testbench.json} (100%) diff --git a/.github/workflows/gitlab-ci.yml b/.github/workflows/gitlab-ci.yml new file mode 100644 index 0000000..e89ce0d --- /dev/null +++ b/.github/workflows/gitlab-ci.yml @@ -0,0 +1,29 @@ +# Copyright 2022 ETH Zurich and University of Bologna. 
+# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Paul Scheffler + +name: gitlab-ci + +on: [ push, pull_request, workflow_dispatch ] + +permissions: + # deployments permission to deploy GitHub pages website + deployments: write + # contents permission to update benchmark contents in gh-pages branch + contents: write + +jobs: + gitlab-ci: + runs-on: ubuntu-latest + steps: + - name: Check Gitlab CI + uses: pulp-platform/pulp-actions/gitlab-ci@v1 + # Skip on forks or pull requests from forks due to missing secrets. + if: github.repository == 'pulp-platform/hci' && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) + with: + domain: iis-git.ee.ethz.ch + repo: github-mirror/hci + token: ${{ secrets.GITLAB_TOKEN }} + diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..a3d926f --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,44 @@ +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Paul Scheffler + +# Check out dependencies in the build stage, then run the verification regression in the test stage + +variables: + GIT_SUBMODULE_STRATEGY: recursive + # NOTE(review): no toolchain variables are needed for this regression + +before_script: + - python -V # Print out python version for debugging + - python -m pip install --user virtualenv + +.base: + artifacts: + when: always + expire_in: 1 week + +stages: + - build + - test + +build: + stage: build + script: + - make checkout + artifacts: + when: always + expire_in: 3 hours + paths: [ .bender ] + +testset: + extends: .base + needs: [ build ] + stage: test + script: + - regr/full_regression.sh + artifacts: + when: always + expire_in: 1 year + paths: [ regr/hci_tests.xml ] diff --git a/regr/basic.yml b/regr/basic.yml index ab21052..9f43cfa 100644 --- a/regr/basic.yml +++ b/regr/basic.yml @@ -19,13 +19,13 @@ hci_tests: log_fair: path: . - command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/fair.json HARDWARE_JSON=regr/hardware/log.json + command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/fair/testbench.json HARDWARE_JSON=regr/hardware/log/hardware.json hci_fair: path: . 
- command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/hwpe_prio.json HARDWARE_JSON=regr/hardware/hci.json + command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/hwpe_prio/testbench.json HARDWARE_JSON=regr/hardware/hci/hardware.json hci_log_prio: path: . - command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/log_prio.json HARDWARE_JSON=regr/hardware/hci.json + command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/log_prio/testbench.json HARDWARE_JSON=regr/hardware/hci/hardware.json diff --git a/regr/hardware/hci.json b/regr/hardware/hci/hardware.json similarity index 100% rename from regr/hardware/hci.json rename to regr/hardware/hci/hardware.json diff --git a/regr/hardware/log.json b/regr/hardware/log/hardware.json similarity index 100% rename from regr/hardware/log.json rename to regr/hardware/log/hardware.json diff --git a/regr/testbench/fair.json b/regr/testbench/fair.json deleted file mode 100644 index 011e20b..0000000 --- a/regr/testbench/fair.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "description": "Testbench configuration parameters", - "parameters": { - "CLK_PERIOD": 50, - "RST_CLK_CYCLES": 10, - "RANDOM_GNT": 0, - "INVERT_PRIO": 0, - "PRIORITY_CNT_NUMERATOR": 1, - "PRIORITY_CNT_DENOMINATOR": 2 - } -} diff --git a/regr/testbench/testbench.json b/regr/testbench/fair/testbench.json similarity index 100% rename from regr/testbench/testbench.json rename to regr/testbench/fair/testbench.json diff --git a/regr/testbench/hwpe_prio.json b/regr/testbench/hwpe_prio/testbench.json similarity index 100% rename from regr/testbench/hwpe_prio.json rename to regr/testbench/hwpe_prio/testbench.json diff --git a/regr/testbench/log_prio.json b/regr/testbench/log_prio/testbench.json similarity index 100% rename from 
regr/testbench/log_prio.json rename to regr/testbench/log_prio/testbench.json From 13aa12a70d745780fd948daccbb1413352ec18c3 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 9 Mar 2026 13:39:52 +0100 Subject: [PATCH 15/25] verif: Fix GUI makefile param --- target/verif/verif.mk | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/target/verif/verif.mk b/target/verif/verif.mk index 5ef479b..2285537 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -94,11 +94,7 @@ clean-config-verif: ############## # Parameters -ifdef gui - GUI ?= $(gui) -else - GUI ?= 0 -endif +GUI ?= $(if $(gui),$(gui),0) # Top-level to simulate sim_top_level ?= tb_hci sim_vsim_lib ?= $(HCI_VERIF_DIR)/vsim/work From 5a93c8c4c237a5b32e2bde4bb63c3376f640b738 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 9 Mar 2026 13:47:24 +0100 Subject: [PATCH 16/25] tb: Remove end_req signal (only track complete time to response) --- target/verif/src/application_driver.sv | 4 ---- target/verif/src/simulation_report.sv | 5 ----- target/verif/src/tb_hci.sv | 8 -------- target/verif/src/tb_hci_pkg.sv | 2 +- target/verif/src/throughput_monitor.sv | 18 ------------------ 5 files changed, 1 insertion(+), 36 deletions(-) diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index d4697e9..c2a920e 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -28,7 +28,6 @@ module application_driver #( input logic rst_ni, input logic clear_i, // used to gate the driver or to reset it hci_core_intf.initiator hci_if, - output logic end_req_o, output logic end_resp_o, output int unsigned n_issued_tr_o, output int unsigned n_issued_rd_tr_o, @@ -138,7 +137,6 @@ module application_driver #( hci_if.r_ready = 1'b1; hci_if.user = '0; // Defaults (outputs) - end_req_o = 1'b0; end_resp_o = 1'b0; // inputs: gnt, r_data, r_valid, r_user, r_id @@ -219,7 +217,6 @@ module application_driver #( end end REQ_DONE: 
begin - end_req_o = 1'b1; if (n_rd_resp_retired_q >= n_rd_req_issued_q) begin // All read responses have been retired req_state_d = RSP_DONE; @@ -228,7 +225,6 @@ module application_driver #( end end RSP_DONE: begin - end_req_o = 1'b1; end_resp_o = 1'b1; end default: begin diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index e8b02bd..420f015 100644 --- a/target/verif/src/simulation_report.sv +++ b/target/verif/src/simulation_report.sv @@ -20,10 +20,8 @@ module simulation_report import tb_hci_pkg::*; ( - input logic [N_DRIVERS-1:0] end_req_i, input logic [N_DRIVERS-1:0] end_resp_i, input real throughput_complete_i, - input real stim_latency_i, input real tot_latency_i, input real latency_per_master_i[N_DRIVERS], input real sum_req_to_gnt_latency_log_i[N_DRIVERS-N_HWPE], @@ -88,8 +86,6 @@ module simulation_report hwpe_masters_with_grants = '0; missing_reads = 1'b0; - wait (&end_req_i); - wait (stim_latency_i >= 0); wait (&end_resp_i); wait (throughput_complete_i >= 0); wait (tot_latency_i >= 0); @@ -242,7 +238,6 @@ module simulation_report "Actual BW (completion): %0.2f bit/cycle [utilization: %0.1f%%]", throughput_complete_i, actual_bw_utilization ); - $display("Stimulus phase duration: %0.2f cycles", stim_latency_i); $display("Completion phase duration: %0.2f cycles", tot_latency_i); $display( "Granted transactions: reads=%0d writes=%0d total=%0d", diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index f67eb46..80966b5 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -27,7 +27,6 @@ module tb_hci (); logic clk, rst_n; - logic [N_DRIVERS-1:0] s_end_req; // end_req_o from all drivers, [N_DRIVERS-1:0] ordering logic [N_DRIVERS-1:0] s_end_resp; // end_resp_o from all drivers, [N_DRIVERS-1:0] ordering logic [N_DRIVERS-1:0] s_clear_drv; // per-driver clear_i (held 1 until dependencies done) hci_interconnect_ctrl_t s_hci_ctrl; @@ -345,7 +344,6 @@ module tb_hci .rst_ni(rst_n), 
.clear_i(s_clear_drv[ii]), .hci_if(hci_driver_log_if[ii]), - .end_req_o(s_end_req[ii]), .end_resp_o(s_end_resp[ii]), .n_issued_tr_o(s_issued_transactions[ii]), .n_issued_rd_tr_o(s_issued_read_transactions[ii]), @@ -369,7 +367,6 @@ module tb_hci .rst_ni(rst_n), .clear_i(s_clear_drv[N_LOG_MASTERS + ii]), .hci_if(hci_driver_hwpe_if[ii]), - .end_req_o(s_end_req[N_LOG_MASTERS + ii]), .end_resp_o(s_end_resp[N_LOG_MASTERS + ii]), .n_issued_tr_o(s_issued_transactions[N_LOG_MASTERS + ii]), .n_issued_rd_tr_o(s_issued_read_transactions[N_LOG_MASTERS + ii]), @@ -395,7 +392,6 @@ module tb_hci real latency_per_master[N_DRIVERS]; real throughput_completed; - real stim_latency; real tot_latency; throughput_monitor #( @@ -407,14 +403,12 @@ module tb_hci ) i_throughput_monitor ( .clk_i(clk), .rst_ni(rst_n), - .end_req_i(s_end_req), .end_resp_i(s_end_resp), .n_read_complete_log_i(N_READ_COMPLETE_TRANSACTIONS_LOG), .n_read_complete_hwpe_i(N_READ_COMPLETE_TRANSACTIONS_HWPE), .n_write_granted_log_i(N_WRITE_GRANTED_TRANSACTIONS_LOG), .n_write_granted_hwpe_i(N_WRITE_GRANTED_TRANSACTIONS_HWPE), .throughput_complete_o(throughput_completed), - .stim_latency_o(stim_latency), .tot_latency_o(tot_latency), .latency_per_master_o(latency_per_master) ); @@ -440,10 +434,8 @@ module tb_hci ); simulation_report i_simulation_report ( - .end_req_i(s_end_req), .end_resp_i(s_end_resp), .throughput_complete_i(throughput_completed), - .stim_latency_i(stim_latency), .tot_latency_i(tot_latency), .latency_per_master_i(latency_per_master), .sum_req_to_gnt_latency_log_i(SUM_REQ_TO_GNT_LATENCY_LOG), diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index 0d16b40..0addc97 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -79,7 +79,7 @@ package tb_hci_pkg; localparam int unsigned N_DRIVERS = N_LOG_MASTERS + N_HWPE; // Per-driver dependency masks: WAIT_MASKS[i][j]=1 means driver i must wait for driver j's - // end_req_o before starting. 
Generated by main.py, passed via WAIT_MASKS_PARAM define. + // end_resp_o before starting. Generated by main.py, passed via WAIT_MASKS_PARAM define. // NOTE: in MUX mode these masks are ignored for HWPE drivers and replaced by a sequential // chain in tb_hci.sv. localparam logic [N_DRIVERS-1:0] WAIT_MASKS [N_DRIVERS] = diff --git a/target/verif/src/throughput_monitor.sv b/target/verif/src/throughput_monitor.sv index 9a7e51b..9298577 100644 --- a/target/verif/src/throughput_monitor.sv +++ b/target/verif/src/throughput_monitor.sv @@ -26,7 +26,6 @@ module throughput_monitor #( ) ( input logic clk_i, input logic rst_ni, - input logic [N_MASTER-1:0] end_req_i, input logic [N_MASTER-1:0] end_resp_i, // Read transactions number input int unsigned n_read_complete_log_i[N_MASTER-N_HWPE], @@ -36,28 +35,11 @@ module throughput_monitor #( input int unsigned n_write_granted_hwpe_i[N_HWPE], // Completion-side throughput: accepted writes + completed reads per elapsed completion cycle. output real throughput_complete_o, - // Elapsed cycles from reset release to end_stimuli. - output real stim_latency_o, // Total simulation time (cycles) and simulation time per master (cycles) output real tot_latency_o, output real latency_per_master_o[N_MASTER] ); - // Stimulus duration at stimulus completion. - initial begin - time start_time, end_time; - real stim_time_cycles; - stim_latency_o = -1; - wait (rst_ni); - #(CLK_PERIOD/100); - @(posedge clk_i); - start_time = $time; - wait (&end_req_i); - end_time = $time; - stim_time_cycles = real'(end_time - start_time) / real'(CLK_PERIOD); // cycles - stim_latency_o = stim_time_cycles; - end - // Completion-side throughput at full completion. 
initial begin time start_time, end_time; From d6d653792c6aa69687f62624294bde924f84ff3c Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 9 Mar 2026 13:58:12 +0100 Subject: [PATCH 17/25] verif: Rename tb monitors and fix license headers --- Bender.yml | 7 ++++--- bender.mk | 2 +- target/verif/bender.mk | 2 +- target/verif/config/generated/hardware.mk.tpl | 2 +- target/verif/config/generated/testbench.mk.tpl | 2 +- target/verif/src/application_driver.sv | 4 +++- .../{throughput_monitor.sv => bandwidth_monitor.sv} | 13 ++++++++----- .../src/{latency_monitor.sv => req_gnt_monitor.sv} | 13 ++++++++----- target/verif/src/simulation_report.sv | 5 ++++- target/verif/src/tb_hci.sv | 11 +++++------ target/verif/src/tb_hci_pkg.sv | 3 +-- target/verif/verif.mk | 2 +- 12 files changed, 38 insertions(+), 28 deletions(-) rename target/verif/src/{throughput_monitor.sv => bandwidth_monitor.sv} (91%) rename target/verif/src/{latency_monitor.sv => req_gnt_monitor.sv} (95%) diff --git a/Bender.yml b/Bender.yml index 516fe27..37aa7af 100644 --- a/Bender.yml +++ b/Bender.yml @@ -2,10 +2,11 @@ package: name: hci authors: - "Francesco Conti " - - "Gianna Paulin " - "Tobias Riedener " - "Luigi Ghionda " - "Arpan Suravi Prasad " + - "Sergio Mazzola " dependencies: hwpe-stream: { git: "https://github.com/pulp-platform/hwpe-stream.git", version: 1.9.0 } @@ -70,8 +71,8 @@ sources: # Level 1 - target/verif/src/application_driver.sv - target/verif/src/tcdm_banks_wrap.sv - - target/verif/src/latency_monitor.sv - - target/verif/src/throughput_monitor.sv + - target/verif/src/req_gnt_monitor.sv + - target/verif/src/bandwidth_monitor.sv # Level 2 - target/verif/src/simulation_report.sv # Level 3 diff --git a/bender.mk b/bender.mk index 5f78e57..788c13a 100644 --- a/bender.mk +++ b/bender.mk @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. 
# SPDX-License-Identifier: SHL-0.51 # diff --git a/target/verif/bender.mk b/target/verif/bender.mk index d4dadc3..02e640b 100644 --- a/target/verif/bender.mk +++ b/target/verif/bender.mk @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. # SPDX-License-Identifier: SHL-0.51 # diff --git a/target/verif/config/generated/hardware.mk.tpl b/target/verif/config/generated/hardware.mk.tpl index f6bdc14..031d817 100644 --- a/target/verif/config/generated/hardware.mk.tpl +++ b/target/verif/config/generated/hardware.mk.tpl @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. # SPDX-License-Identifier: SHL-0.51 # diff --git a/target/verif/config/generated/testbench.mk.tpl b/target/verif/config/generated/testbench.mk.tpl index 66b617d..c36dad7 100644 --- a/target/verif/config/generated/testbench.mk.tpl +++ b/target/verif/config/generated/testbench.mk.tpl @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. # SPDX-License-Identifier: SHL-0.51 # diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index c2a920e..35c66c4 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -1,7 +1,9 @@ /* * application_driver.sv * - * Copyright (C) 2019-2020 ETH Zurich, University of Bologna + * Sergio Mazzola + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. 
You may obtain a copy of the License at diff --git a/target/verif/src/throughput_monitor.sv b/target/verif/src/bandwidth_monitor.sv similarity index 91% rename from target/verif/src/throughput_monitor.sv rename to target/verif/src/bandwidth_monitor.sv index 9298577..524e1ff 100644 --- a/target/verif/src/throughput_monitor.sv +++ b/target/verif/src/bandwidth_monitor.sv @@ -1,7 +1,10 @@ /* - * throughput_monitor.sv + * bandwidth_monitor.sv * - * Copyright (C) 2019-2020 ETH Zurich, University of Bologna + * Sergio Mazzola + * Luca Codeluppi + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at @@ -13,11 +16,11 @@ */ /** - * Throughput monitor - * Measures actual throughput and simulation time for each master + * Bandwidth monitor + * Measures actual bandwidth and completion time for each master */ -module throughput_monitor #( +module bandwidth_monitor #( parameter int unsigned N_MASTER, parameter int unsigned N_HWPE, parameter int unsigned CLK_PERIOD, diff --git a/target/verif/src/latency_monitor.sv b/target/verif/src/req_gnt_monitor.sv similarity index 95% rename from target/verif/src/latency_monitor.sv rename to target/verif/src/req_gnt_monitor.sv index 3aff305..66fa4cb 100644 --- a/target/verif/src/latency_monitor.sv +++ b/target/verif/src/req_gnt_monitor.sv @@ -1,7 +1,10 @@ /* - * latency_monitor.sv + * req_gnt_monitor.sv * - * Copyright (C) 2019-2020 ETH Zurich, University of Bologna + * Sergio Mazzola + * Luca Codeluppi + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. 
You may obtain a copy of the License at @@ -13,11 +16,11 @@ */ /** - * Latency monitor - * Tracks request-to-grant latency and transaction counters for all masters + * Request-to-grant monitor + * Tracks request-to-grant stall latency and transaction counters for all masters */ -module latency_monitor #( +module req_gnt_monitor #( parameter int unsigned N_MASTER = 4, parameter int unsigned N_HWPE = 1 ) ( diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index 420f015..78be03d 100644 --- a/target/verif/src/simulation_report.sv +++ b/target/verif/src/simulation_report.sv @@ -1,7 +1,10 @@ /* * simulation_report.sv * - * Copyright (C) 2026 ETH Zurich, University of Bologna + * Sergio Mazzola + * Luca Codeluppi + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 80966b5..d95ce3b 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -4,8 +4,7 @@ * Sergio Mazzola * Luca Codeluppi * - * - * Copyright (C) 2019-2025 ETH Zurich, University of Bologna + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. 
You may obtain a copy of the License at @@ -394,13 +393,13 @@ module tb_hci real throughput_completed; real tot_latency; - throughput_monitor #( + bandwidth_monitor #( .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE), .CLK_PERIOD(CLK_PERIOD), .DATA_WIDTH(DATA_WIDTH), .HWPE_WIDTH_FACT(HWPE_WIDTH_FACT) - ) i_throughput_monitor ( + ) i_bandwidth_monitor ( .clk_i(clk), .rst_ni(rst_n), .end_resp_i(s_end_resp), @@ -413,10 +412,10 @@ module tb_hci .latency_per_master_o(latency_per_master) ); - latency_monitor #( + req_gnt_monitor #( .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE) - ) i_latency_monitor ( + ) i_req_gnt_monitor ( .clk_i(clk), .rst_ni(rst_n), .hci_driver_log_if(hci_driver_log_if), diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index 0addc97..47c5d7b 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -4,8 +4,7 @@ * Sergio Mazzola * Luca Codeluppi * - * - * Copyright (C) 2019-2025 ETH Zurich, University of Bologna + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at diff --git a/target/verif/verif.mk b/target/verif/verif.mk index 2285537..b94a3b4 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. 
# SPDX-License-Identifier: SHL-0.51 # From b4d59303f5f2d6bcac934b7f5223f0172b11f81d Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 9 Mar 2026 19:47:38 +0100 Subject: [PATCH 18/25] tb: Add support for multiple patterns per master, in different phases --- target/verif/README.md | 4 +- target/verif/config/hardware.json | 6 +- target/verif/config/workload.json | 538 ++++++++++++++++-- target/verif/config/workload.schema.json | 77 ++- .../verif/simvectors/hci_stimuli/generator.py | 4 + .../verif/simvectors/hci_stimuli/patterns.py | 512 +++++------------ target/verif/simvectors/main.py | 301 +++++++--- target/verif/src/application_driver.sv | 296 +++++----- target/verif/src/tb_hci.sv | 137 +++-- target/verif/src/tb_hci_pkg.sv | 23 +- target/verif/verif.mk | 14 +- target/verif/vsim/tb_hci.tcl | 116 +++- 12 files changed, 1315 insertions(+), 713 deletions(-) diff --git a/target/verif/README.md b/target/verif/README.md index 20db9b2..cd7c03f 100644 --- a/target/verif/README.md +++ b/target/verif/README.md @@ -107,7 +107,7 @@ Masters can be assigned to named **phases** and can declare **dependencies** on ### Mechanism -`main.py` reads the `phase`/`wait_for` fields and computes a **wait mask** per driver: a bitmask of width `N_DRIVERS` where bit `j` is set if this driver must wait for driver `j`'s `end_req_o`. The masks are encoded as a SV unpacked array literal in `config/generated/wait_masks.mk` and passed to the simulator as the `WAIT_MASKS_PARAM` define. +`main.py` reads the `phase`/`wait_for` fields and computes a **wait mask** per driver: a bitmask of width `N_DRIVERS` where bit `j` is set if this driver must wait for driver `j`'s `end_req_o`. The masks are encoded as a SV unpacked array literal in `config/generated/fence_masks.mk` and passed to the simulator as the `WAIT_MASKS_PARAM` define. In `tb_hci_pkg.sv`, `WAIT_MASKS[i]` holds driver `i`'s mask. 
In `tb_hci.sv`, each driver's `clear_i` is combinationally held high until all drivers in its mask have asserted `end_req_o`: @@ -170,7 +170,7 @@ This allows verifying that regions are correctly partitioned and identifying any | Target | Action | |--------|--------| | `make config-verif` | Generate `hardware.mk`, `testbench.mk` from JSON | -| `make stim-verif` | Generate stimuli and `wait_masks.mk` from workload/hardware/testbench JSON | +| `make stim-verif` | Generate stimuli and `fence_masks.mk` from workload/hardware/testbench JSON | | `make compile-verif` | Compile RTL and testbench with QuestaSim | | `make opt-verif` | Optimize compiled design | | `make run-verif` | Run simulation | diff --git a/target/verif/config/hardware.json b/target/verif/config/hardware.json index c8db224..b349a65 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -1,14 +1,14 @@ { "description": "Hardware configuration parameters for HCI interconnect", "parameters": { - "N_HWPE": 3, + "N_HWPE": 2, "HWPE_WIDTH_FACT": 8, "N_CORE": 8, "N_DMA": 0, "N_EXT": 1, "DATA_WIDTH": 32, - "TOT_MEM_SIZE": 128, - "N_BANKS": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, "INTERCO_TYPE": "HCI", "TS_BIT": 21, "EXPFIFO": 0, diff --git a/target/verif/config/workload.json b/target/verif/config/workload.json index e094564..6c0d227 100644 --- a/target/verif/config/workload.json +++ b/target/verif/config/workload.json @@ -1,58 +1,508 @@ { - "description": "8 cores at 50% traffic in dedicated regions + 3 HWPEs: DMA imports tile B, GEMM reads tile B and writes output, Transpose reads GEMM output and writes transposed result. DMA and GEMM run concurrently (wave0), Transpose waits for wave0.", + "description": "MobileViT-inspired materialized attention kernel. d_model=192, num_heads=4, d_head=48, N=256 tokens, B=128 (2 tiles). One head at a time. Static resident: K^T_head (0x00000, 16kB), V_head (0x04000, 16kB). 
Ping-pong workspaces: Buf0={Q0 (0x08000,8kB), A0/P0 (0x0A000,64kB), O0 (0x1A000,8kB)}, Buf1={Q1 (0x1C000,8kB), A1/P1 (0x1E000,64kB), O1 (0x2E000,8kB)}. Pipeline: DMA preloads all tensors (dma_setup); GEMM computes Q*K^T->A (gemm_qk_t0); 8 cores softmax A->P in-place (softmax_t0); GEMM computes P*V->O (gemm_pv_t0); 8 cores norm O in-place (norm_t0); repeat for tile 1. Single GEMM engine serializes all 4 GEMM ops.", "log_masters": [ - { "id": 0, "description": "Core 0", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00000000", "region_base_address": "0x00000000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 1, "description": "Core 1", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00001000", "region_base_address": "0x00001000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 2, "description": "Core 2", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00002000", "region_base_address": "0x00002000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 3, "description": "Core 3", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00003000", "region_base_address": "0x00003000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 4, "description": "Core 4", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00004000", "region_base_address": "0x00004000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 5, "description": "Core 5", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00005000", "region_base_address": "0x00005000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 6, "description": "Core 6", "mem_access_type": "linear", "n_transactions": 2000, 
"stride0": 1, "start_address": "0x00006000", "region_base_address": "0x00006000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 7, "description": "Core 7", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00007000", "region_base_address": "0x00007000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } - ], - - "hwpe_masters": [ { "id": 0, - "description": "DMA — wide tile import into buffer B (0x08000–0x0FFFF)", - "mem_access_type": "random", - "n_transactions": 1000, - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x00008000", - "region_size_bytes": 32768, - "phase": "hwpe_wave0" + "description": "Core 0", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] }, { 
"id": 1, - "description": "GEMM — reads tile B from DMA buffer (0x08000–0x0FFFF), writes output to buffer C (0x10000–0x17FFF)", - "mem_access_type": "matmul_phased", - "n_transactions": 1000, - "region_base_address_a": "0x00008000", - "region_size_bytes_a": 32768, - "matmul_ratio_b": 0, - "region_base_address_c": "0x00010000", - "region_size_bytes_c": 32768, - "matmul_ratio_a": 1, - "matmul_ratio_c": 1, - "phase": "hwpe_wave0" + "description": "Core 1", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] }, { "id": 2, - "description": "Transpose engine — reads GEMM output (0x10000–0x17FFF), writes transposed result (0x18000–0x1FFFF)", - "mem_access_type": "matmul_phased", - "n_transactions": 1000, - "region_base_address_a": "0x00010000", - "region_size_bytes_a": 32768, - "matmul_ratio_b": 0, - "region_base_address_c": "0x00018000", - "region_size_bytes_c": 
32768, - "matmul_ratio_a": 1, - "matmul_ratio_c": 1, - "phase": "hwpe_wave1", - "wait_for": ["hwpe_wave0"] + "description": "Core 2", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 3, + "description": "Core 3", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: 
read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 4, + "description": "Core 4", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 5, + "description": "Core 5", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + 
"n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 6, + "description": "Core 6", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 
in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 7, + "description": "Core 7", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 8, + "description": "External 0 (idle)", + "mem_access_type": "idle" + } + ], + + "hwpe_masters": [ + { + "id": 0, + "description": "DMA: preload K^T_head, V_head, Q0, Q1 (all writes, one-shot)", + "patterns": [ + { + "description": "Preload K^T_head (0x00000, 12kB active / 16kB padded), V_head (0x04000), Q0 (0x08000), Q1 (0x1C000)", + "mem_access_type": "random", + "phase": "dma_setup", + "n_transactions": 9216, + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + 
"region_size_bytes": 122880 + } + ] + }, + { + "id": 1, + "description": "GEMM engine: 4 sequential ops (QK_t0, PV_t0, QK_t1, PV_t1)", + "patterns": [ + { + "description": "QK tile 0: read Q0 [128x48 int8], write A0 [128x256 int16]", + "mem_access_type": "matmul_phased", + "phase": "gemm_qk_t0", + "wait_for": ["dma_setup"], + "matrix_m": 128, + "matrix_n": 256, + "matrix_k": 48, + "region_base_address_a": "0x08000", + "region_size_bytes_a": 8192, + "matmul_ratio_a": 1, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 16384, + "matmul_ratio_b": 0, + "region_base_address_c": "0x0A000", + "region_size_bytes_c": 65536, + "matmul_ratio_c": 4 + }, + { + "description": "PV tile 0: read P0 [128x256 int16], write O0 [128x48 int8]", + "mem_access_type": "matmul_phased", + "phase": "gemm_pv_t0", + "wait_for": ["softmax_t0"], + "matrix_m": 128, + "matrix_n": 48, + "matrix_k": 256, + "region_base_address_a": "0x0A000", + "region_size_bytes_a": 65536, + "matmul_ratio_a": 4, + "region_base_address_b": "0x04000", + "region_size_bytes_b": 16384, + "matmul_ratio_b": 0, + "region_base_address_c": "0x1A000", + "region_size_bytes_c": 8192, + "matmul_ratio_c": 1 + }, + { + "description": "QK tile 1: read Q1 [128x48 int8], write A1 [128x256 int16]", + "mem_access_type": "matmul_phased", + "phase": "gemm_qk_t1", + "wait_for": ["norm_t0"], + "matrix_m": 128, + "matrix_n": 256, + "matrix_k": 48, + "region_base_address_a": "0x1C000", + "region_size_bytes_a": 8192, + "matmul_ratio_a": 1, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 16384, + "matmul_ratio_b": 0, + "region_base_address_c": "0x1E000", + "region_size_bytes_c": 65536, + "matmul_ratio_c": 4 + }, + { + "description": "PV tile 1: read P1 [128x256 int16], write O1 [128x48 int8]", + "mem_access_type": "matmul_phased", + "phase": "gemm_pv_t1", + "wait_for": ["softmax_t1"], + "matrix_m": 128, + "matrix_n": 48, + "matrix_k": 256, + "region_base_address_a": "0x1E000", + "region_size_bytes_a": 65536, + 
"matmul_ratio_a": 4, + "region_base_address_b": "0x04000", + "region_size_bytes_b": 16384, + "matmul_ratio_b": 0, + "region_base_address_c": "0x2E000", + "region_size_bytes_c": 8192, + "matmul_ratio_c": 1 + } + ] } ] } diff --git a/target/verif/config/workload.schema.json b/target/verif/config/workload.schema.json index 3b4c653..2140e37 100644 --- a/target/verif/config/workload.schema.json +++ b/target/verif/config/workload.schema.json @@ -24,48 +24,39 @@ }, "$defs": { - "master": { + + "pattern": { "type": "object", - "description": "Per-master stimulus configuration. The active pattern is selected by mem_access_type.", + "description": "A single traffic pattern segment. Multiple patterns on the same master are executed sequentially, separated by PAUSE fence tokens in the stimulus file.", "required": ["mem_access_type"], "unevaluatedProperties": false, "properties": { - "id": { - "type": "integer", - "description": "Optional consistency label. Mapping is always positional; a mismatch triggers a warning." - }, "description": { "type": "string", - "description": "Human-readable label shown in the memory-map report." + "description": "Human-readable label for this pattern segment, shown in the memory-map report." }, "mem_access_type": { "type": "string", "enum": ["idle", "random", "linear", "2d", "3d", "matmul_phased", "matmul"], - "description": "Access pattern selector. 'matmul' is an alias for 'matmul_phased'. Use traffic_pct/traffic_read_pct on 'random'/'linear' to model bus utilization." + "description": "Access pattern selector. 'matmul' is an alias for 'matmul_phased'." }, "phase": { "type": "string", "default": "default", - "description": "Phase group this master belongs to, referenced by other masters' wait_for." + "description": "Phase name for this pattern segment. Used by other patterns' wait_for to reference this segment. The master's phase identity is determined by its first pattern's phase." 
}, "wait_for": { "type": "array", "items": { "type": "string" }, "default": [], - "description": "Phase names this master waits for before its WAIT_MASK clears in the testbench." - }, - "start_delay_cycles": { - "type": "integer", - "minimum": 0, - "default": 0, - "description": "Number of idle cycles prepended to the processed stimuli file (static start delay)." + "description": "Phase names this pattern waits for before starting. Generates a PAUSE fence before this pattern segment and holds the driver until all referenced phases have advanced past the same fence level." }, "n_transactions": { "type": "integer", "minimum": 0, - "description": "Number of real memory accesses (idles are NOT counted). Mandatory for 'random'. For other patterns can be derived from geometry fields (see per-pattern rules)." + "description": "Number of real memory accesses (idles are NOT counted). Mandatory for 'random'. For other patterns can be derived from geometry fields." }, "region_base_address": { @@ -73,20 +64,20 @@ { "type": "integer", "minimum": 0 }, { "type": "string", "description": "Decimal, hex (0x...) or binary string." } ], - "description": "[random, matmul_phased] Base byte address of the memory region. Default: master_local_idx * (total_mem / n_peers_of_kind), aligned to access_bytes." + "description": "[random, matmul_phased] Base byte address of the memory region." }, "region_size_bytes": { "oneOf": [ { "type": "integer", "minimum": 0 }, { "type": "string", "description": "Decimal, hex (0x...) or binary string." } ], - "description": "[random, matmul_phased] Size in bytes of the memory region. Default: total_mem / n_peers_of_kind. Clamped so region does not exceed total memory." + "description": "[random, matmul_phased] Size in bytes of the memory region." }, "start_address": { "type": "string", "default": "0", - "description": "[linear, 2d, 3d] Start byte address as decimal, hex (0x...) or binary string." + "description": "[linear, 2d, 3d] Start byte address." 
}, "stride0": { "type": "integer", @@ -114,7 +105,7 @@ "len_d0": { "type": "integer", "minimum": 1, - "description": "[2d, 3d] Innermost dimension length. Can replace n_transactions together with len_d1 (and len_d2 for 3d)." + "description": "[2d, 3d] Innermost dimension length." }, "len_d1": { "type": "integer", @@ -130,7 +121,7 @@ "matrix_m": { "type": "integer", "minimum": 1, - "description": "[matmul_phased] Rows of A and C. Can replace n_transactions together with matrix_n and matrix_k (total = m*k + k*n + m*n)." + "description": "[matmul_phased] Rows of A and C." }, "matrix_n": { "type": "integer", @@ -144,7 +135,7 @@ }, "region_base_address_a": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], - "description": "[matmul_phased] Base address of the read-A region. If set together with region_size_bytes_a, overrides the auto-split for phase A." + "description": "[matmul_phased] Base address of the read-A region." }, "region_size_bytes_a": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], @@ -166,12 +157,11 @@ "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], "description": "[matmul_phased] Size in bytes of the write-C region." }, - "matmul_ratio_a": { "type": "integer", "minimum": 0, "default": 1, - "description": "[matmul_phased] Relative weight of the read-A phase in the transaction split." + "description": "[matmul_phased] Relative weight of the read-A phase." }, "matmul_ratio_b": { "type": "integer", @@ -191,19 +181,19 @@ "minimum": 1, "maximum": 100, "default": 100, - "description": "[random, linear] Bus utilization percentage. After each transaction emit floor((100-pct)/pct) idle cycles. E.g. 50 => 1 idle per transaction (req, idle, req, idle, ...). Default 100 = back-to-back." + "description": "[random, linear] Bus utilization percentage." }, "traffic_read_pct": { "type": "integer", "minimum": 0, "maximum": 100, - "description": "[random, linear] Percentage of accesses that are reads (wen=1). 
If omitted, wen is random per transaction. When set, all reads are emitted first, then all writes." + "description": "[random, linear] Percentage of accesses that are reads (wen=1)." }, "idle_cycles_between_phases": { "type": "integer", "minimum": 0, "default": 0, - "description": "[2d, 3d, matmul_phased] Number of req=0 cycles inserted at each phase boundary (between outer rows for 2d/3d, between read-A/read-B/write-C for matmul_phased). Models compute time between memory bursts." + "description": "[2d, 3d, matmul_phased] Idle cycles inserted at each inner phase boundary." } }, @@ -234,6 +224,37 @@ } } ] + }, + + "master": { + "type": "object", + "description": "Per-master configuration. A master either has a flat single-pattern config (legacy, backward compatible) or a 'patterns' list for multi-phase execution.", + "unevaluatedProperties": false, + "properties": { + "id": { + "type": "integer", + "description": "Optional consistency label. Mapping is always positional." + }, + "description": { + "type": "string", + "description": "Human-readable master label." + }, + "start_delay_cycles": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "Number of idle cycles prepended to the stimulus file before the first pattern." + }, + "patterns": { + "type": "array", + "minItems": 1, + "description": "Ordered list of pattern segments. Executed sequentially; a PAUSE fence token is inserted between each pair of consecutive patterns. The wait_for of pattern f defines which phases must have advanced past fence f before this master resumes. 
If omitted, the master config itself is treated as a single flat pattern (legacy format).", + "items": { "$ref": "#/$defs/pattern" } + } + }, + "allOf": [ + { "$ref": "#/$defs/pattern" } + ] } } } diff --git a/target/verif/simvectors/hci_stimuli/generator.py b/target/verif/simvectors/hci_stimuli/generator.py index aac6927..52993f5 100644 --- a/target/verif/simvectors/hci_stimuli/generator.py +++ b/target/verif/simvectors/hci_stimuli/generator.py @@ -71,6 +71,10 @@ def _write_idle(self, file_obj): + "\n" ) + def _write_pause(self, file_obj): + """Write a PAUSE fence token line.""" + file_obj.write("PAUSE\n") + def data_wen(self): wen = random.randint(0, 1) # 1=read, 0=write if wen: diff --git a/target/verif/simvectors/hci_stimuli/patterns.py b/target/verif/simvectors/hci_stimuli/patterns.py index 987e526..063c008 100644 --- a/target/verif/simvectors/hci_stimuli/patterns.py +++ b/target/verif/simvectors/hci_stimuli/patterns.py @@ -4,438 +4,240 @@ req(1b) id(IWb) wen(1b) data(Nb) add(Ab) req=0 lines are idle cycles. req=1 lines are active transactions. -There is no intermediate raw format or cycle_offset expansion step. -Patterns: - - random: uniformly random addresses, random read/write mix. - Optional: traffic_pct / traffic_read_pct for uniform idle interleaving. - - linear: strided 1-D scan. - Optional: traffic_pct / traffic_read_pct for uniform idle interleaving. - - 2d: strided 2-D scan (inner stride0, outer stride1). - Optional: idle_cycles_between_phases inserted after each outer row. - - 3d: strided 3-D scan (stride0, stride1, stride2). - Optional: idle_cycles_between_phases inserted after each mid-level block. - - idle: no transactions (single req=0 line). - - matmul_phased: phased traffic modelling matrix multiply - (read-A phase, read-B phase, write-C phase). - Optional: idle_cycles_between_phases inserted between phases. +Fence semantics (one trailing PAUSE per pattern): + Each pattern ends with a PAUSE. 
fence_idx[i] increments when resume_i fires while + fence_reached_o is high (i.e. while the driver is sitting at the PAUSE). + fence_idx[i] >= k means driver i has been granted to leave fence k-1, + i.e. pattern k-1 is complete and driver i is about to start pattern k. -traffic_pct semantics (random, linear): - After every transaction, emit floor((100 - traffic_pct) / traffic_pct) idle cycles. - traffic_pct=50 => 1 idle per transaction (req, idle, req, idle, ...) - traffic_pct=100 => no idles (back-to-back) + resume_i fires when the dependencies of the NEXT pattern are satisfied. + So resume_i = "your next job's inputs are ready, proceed". -traffic_read_pct semantics (random, linear): - Fraction of transactions that are reads (wen=1); remainder are writes (wen=0). - Only meaningful when traffic_pct < 100 or as override of the default random mix. - When set, wen is no longer random: the first n_reads transactions are reads, - the rest are writes (pattern: all reads first, then all writes). + Trailing PAUSE of the last pattern has mask=0 → resume_i fires in one cycle + → fence_idx advances to N_patterns, signalling final completion to dependents. -idle_cycles_between_phases semantics (2d, 3d, matmul_phased): - Number of req=0 cycles inserted at each phase boundary (between outer rows for - 2d/3d, between read-A/read-B/write-C for matmul_phased). Models compute time - between bursts of memory accesses. - -All patterns respect the cross-master forbidden address lists: - - forbidden_write: addresses already read -> no later master may write here - - forbidden_read: addresses already written -> no later master may read/write here +All generators accept append=True to open the file in append mode. 
""" import random class PatternsMixin: - """Mixin providing memory access pattern generators.""" - - # ------------------------------------------------------------------ # - # Shared helpers # - # ------------------------------------------------------------------ # @staticmethod def _parse_address(addr_str): - """Parse a binary, hex (0x...), or decimal string to int.""" s = str(addr_str) - if s.startswith('0x') or s.startswith('0X'): - return int(s, 16) - if set(s) <= {'0', '1'}: - return int(s, 2) + if s.startswith('0x') or s.startswith('0X'): return int(s, 16) + if set(s) <= {'0', '1'}: return int(s, 2) return int(s, 0) @staticmethod def _align_down(value, alignment): - if alignment <= 0: - return value + if alignment <= 0: return value return (value // alignment) * alignment @staticmethod def _phase_counts(total_ops, ratio_a, ratio_b, ratio_c): - """Split total_ops into three phases according to ratios.""" - ra = max(0, int(ratio_a)) - rb = max(0, int(ratio_b)) - rc = max(0, int(ratio_c)) - if ra == 0 and rb == 0 and rc == 0: - ra, rb, rc = 1, 1, 1 - rsum = ra + rb + rc - cnt_a = (total_ops * ra) // rsum - cnt_b = (total_ops * rb) // rsum - cnt_c = total_ops - cnt_a - cnt_b - + ra = max(0, int(ratio_a)); rb = max(0, int(ratio_b)); rc = max(0, int(ratio_c)) + if ra == 0 and rb == 0 and rc == 0: ra, rb, rc = 1, 1, 1 + s = ra + rb + rc + ca = (total_ops * ra) // s; cb = (total_ops * rb) // s; cc = total_ops - ca - cb if total_ops >= 3: - if ra > 0 and cnt_a == 0: - cnt_a = 1 - cnt_c = max(0, cnt_c - 1) - if rb > 0 and cnt_b == 0: - cnt_b = 1 - cnt_c = max(0, cnt_c - 1) - if rc > 0 and cnt_c == 0: - if cnt_a > 1: - cnt_a -= 1 - cnt_c = 1 - elif cnt_b > 1: - cnt_b -= 1 - cnt_c = 1 - return cnt_a, cnt_b, cnt_c + if ra > 0 and ca == 0: ca = 1; cc = max(0, cc-1) + if rb > 0 and cb == 0: cb = 1; cc = max(0, cc-1) + if rc > 0 and cc == 0: + if ca > 1: ca -= 1; cc = 1 + elif cb > 1: cb -= 1; cc = 1 + return ca, cb, cc @staticmethod def _idles_per_req(traffic_pct): - 
"""Number of idle cycles to emit after each transaction for a given traffic_pct.""" traffic_pct = max(1, min(100, int(traffic_pct))) - if traffic_pct >= 100: - return 0 - return int(round((100 - traffic_pct) / traffic_pct)) + return 0 if traffic_pct >= 100 else int(round((100 - traffic_pct) / traffic_pct)) + + def _is_allowed(self, add, wen, fr, fw): + return add not in (fr if wen else fw) - def _is_allowed(self, add, wen, forbidden_read, forbidden_write): - """Return True if the address is not in the relevant forbidden list.""" - return add not in (forbidden_read if wen else forbidden_write) + def _record_access(self, add, wen, fr_new, fw_new): + fw_new.append(add) + if not wen: fr_new.append(add) - def _record_access(self, add, wen, forbidden_read_new, forbidden_write_new): - """Mark add as used by this master.""" - forbidden_write_new.append(add) - if not wen: - forbidden_read_new.append(add) + def _open(self, append): + return open(self.filepath, "a" if append else "w", encoding="ascii") # ------------------------------------------------------------------ # - # Access patterns # + # Access patterns — each writes: transactions | PAUSE # # ------------------------------------------------------------------ # - def random_gen( - self, - id_start, - forbidden_read, - forbidden_write, - region_base=0, - region_size=None, - traffic_pct=100, - traffic_read_pct=None, - ): - """Uniformly random addresses within [region_base, region_base + region_size). - - traffic_pct: bus utilisation percentage. After each transaction, - floor((100 - traffic_pct) / traffic_pct) idle cycles are emitted. - Default 100 = back-to-back (no idles). - - traffic_read_pct: if set, the first n_reads transactions are reads and - the rest are writes. If not set, wen is random per transaction. 
- """ - total_mem_bytes = int(self.TOT_MEM_SIZE * 1024) - if region_size is None: - region_size = total_mem_bytes - region_size = min(region_size, total_mem_bytes - region_base) + def random_gen(self, id_start, forbidden_read, forbidden_write, + region_base=0, region_size=None, traffic_pct=100, + traffic_read_pct=None, append=False): + total = int(self.TOT_MEM_SIZE * 1024) + if region_size is None: region_size = total + region_size = min(region_size, total - region_base) n_words = max(1, region_size // self.WIDTH_OF_MEMORY_BYTE) n_idles = self._idles_per_req(traffic_pct) - - # Build wen sequence if traffic_read_pct is not None: rpct = max(0, min(100, int(traffic_read_pct))) n_reads = (self.N_TEST * rpct) // 100 - wen_seq = [1] * n_reads + [0] * (self.N_TEST - n_reads) + wen_seq = [1]*n_reads + [0]*(self.N_TEST-n_reads) else: wen_seq = None - - id_value = id_start - forbidden_read_new = [] - forbidden_write_new = [] - with open(self.filepath, "w", encoding="ascii") as file: + id_value = id_start; fr_new, fw_new = [], [] + with self._open(append) as f: for i in range(self.N_TEST): - if wen_seq is not None: - wen = wen_seq[i] - data = "0" * self.DATA_WIDTH if wen else self.random_data() - else: - data, wen = self.data_wen() + wen = wen_seq[i] if wen_seq is not None else None + if wen is None: data, wen = self.data_wen() + else: data = "0"*self.DATA_WIDTH if wen else self.random_data() while True: - add_decimal = region_base + random.randint(0, int(n_words) - 1) * self.WIDTH_OF_MEMORY_BYTE - add = bin(int(add_decimal))[2:].zfill(self.ADD_WIDTH) + ad = region_base + random.randint(0, int(n_words)-1)*self.WIDTH_OF_MEMORY_BYTE + add = bin(int(ad))[2:].zfill(self.ADD_WIDTH) if self._is_allowed(add, wen, forbidden_read, forbidden_write): - self._record_access(add, wen, forbidden_read_new, forbidden_write_new) - break - self._write_req(file, id_value, wen, data, add) - id_value += 1 - for _ in range(n_idles): - self._write_idle(file) - 
forbidden_read.extend(forbidden_read_new) - forbidden_write.extend(forbidden_write_new) + self._record_access(add, wen, fr_new, fw_new); break + self._write_req(f, id_value, wen, data, add); id_value += 1 + for _ in range(n_idles): self._write_idle(f) + self._write_pause(f) + forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) return id_value - def linear_gen( - self, - stride0, - start_address, - id_start, - forbidden_read, - forbidden_write, - traffic_pct=100, - traffic_read_pct=None, - ): - """Strided 1-D linear scan. Forbidden addresses are skipped (no idle inserted). - - traffic_pct / traffic_read_pct: see random_gen. - """ + def linear_gen(self, stride0, start_address, id_start, forbidden_read, forbidden_write, + traffic_pct=100, traffic_read_pct=None, append=False): n_idles = self._idles_per_req(traffic_pct) - if traffic_read_pct is not None: rpct = max(0, min(100, int(traffic_read_pct))) n_reads = (self.N_TEST * rpct) // 100 - wen_seq = [1] * n_reads + [0] * (self.N_TEST - n_reads) + wen_seq = [1]*n_reads + [0]*(self.N_TEST-n_reads) else: wen_seq = None - - id_value = id_start - forbidden_read_new = [] - forbidden_write_new = [] - seq_idx = 0 - with open(self.filepath, "w", encoding="ascii") as file: - next_address = self._parse_address(start_address) - if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: - next_address -= self.TOT_MEM_SIZE * 1024 + id_value = id_start; fr_new, fw_new = [], [] + with self._open(append) as f: + addr = self._parse_address(start_address) + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 for i in range(self.N_TEST): - if wen_seq is not None: - wen = wen_seq[i] - data = "0" * self.DATA_WIDTH if wen else self.random_data() - else: - data, wen = self.data_wen() - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - next_address += self.WIDTH_OF_MEMORY_BYTE * stride0 - if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: - next_address -= 
self.TOT_MEM_SIZE * 1024 - if not self._is_allowed(add, wen, forbidden_read, forbidden_write): - continue - self._record_access(add, wen, forbidden_read_new, forbidden_write_new) - self._write_req(file, id_value, wen, data, add) - id_value += 1 - for _ in range(n_idles): - self._write_idle(file) - forbidden_read.extend(forbidden_read_new) - forbidden_write.extend(forbidden_write_new) + wen = wen_seq[i] if wen_seq is not None else None + if wen is None: data, wen = self.data_wen() + else: data = "0"*self.DATA_WIDTH if wen else self.random_data() + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + addr += self.WIDTH_OF_MEMORY_BYTE * stride0 + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + if not self._is_allowed(add, wen, forbidden_read, forbidden_write): continue + self._record_access(add, wen, fr_new, fw_new) + self._write_req(f, id_value, wen, data, add); id_value += 1 + for _ in range(n_idles): self._write_idle(f) + self._write_pause(f) + forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) return id_value - def gen_2d( - self, - stride0, - len_d0, - stride1, - start_address, - id_start, - forbidden_read, - forbidden_write, - idle_cycles_between_phases=0, - ): - """Strided 2-D scan: inner loop stride0/len_d0, outer loop stride1. - - idle_cycles_between_phases: req=0 cycles inserted after each completed inner row. 
- """ - id_value = id_start - forbidden_read_new = [] - forbidden_write_new = [] - with open(self.filepath, "w", encoding="ascii") as file: - start_address_int = self._parse_address(start_address) - j = 0 + def gen_2d(self, stride0, len_d0, stride1, start_address, id_start, + forbidden_read, forbidden_write, idle_cycles_between_phases=0, append=False): + id_value = id_start; fr_new, fw_new = [], [] + with self._open(append) as f: + base = self._parse_address(start_address); j = 0 while id_value - id_start < self.N_TEST: for i in range(len_d0): data, wen = self.data_wen() - next_address = ( - start_address_int - + i * self.WIDTH_OF_MEMORY_BYTE * stride0 - + j * self.WIDTH_OF_MEMORY_BYTE * stride1 - ) - if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: - next_address -= self.TOT_MEM_SIZE * 1024 - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - if not self._is_allowed(add, wen, forbidden_read, forbidden_write): - continue - self._record_access(add, wen, forbidden_read_new, forbidden_write_new) - self._write_req(file, id_value, wen, data, add) - id_value += 1 - if id_value - id_start >= self.N_TEST: - break - for _ in range(idle_cycles_between_phases): - self._write_idle(file) + addr = base + i*self.WIDTH_OF_MEMORY_BYTE*stride0 + j*self.WIDTH_OF_MEMORY_BYTE*stride1 + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, forbidden_read, forbidden_write): continue + self._record_access(add, wen, fr_new, fw_new) + self._write_req(f, id_value, wen, data, add); id_value += 1 + if id_value - id_start >= self.N_TEST: break + for _ in range(idle_cycles_between_phases): self._write_idle(f) j += 1 - forbidden_read.extend(forbidden_read_new) - forbidden_write.extend(forbidden_write_new) + self._write_pause(f) + forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) return id_value - def gen_3d( - self, - stride0, - len_d0, - 
stride1, - len_d1, - stride2, - start_address, - id_start, - forbidden_read, - forbidden_write, - idle_cycles_between_phases=0, - ): - """Strided 3-D scan: inner stride0/len_d0, mid stride1/len_d1, outer stride2. - - idle_cycles_between_phases: req=0 cycles inserted after each completed mid-level block. - """ - id_value = id_start - forbidden_read_new = [] - forbidden_write_new = [] - with open(self.filepath, "w", encoding="ascii") as file: - start_address_int = self._parse_address(start_address) - k = 0 + def gen_3d(self, stride0, len_d0, stride1, len_d1, stride2, start_address, id_start, + forbidden_read, forbidden_write, idle_cycles_between_phases=0, append=False): + id_value = id_start; fr_new, fw_new = [], [] + with self._open(append) as f: + base = self._parse_address(start_address); k = 0 while id_value - id_start < self.N_TEST: for j in range(len_d1): for i in range(len_d0): data, wen = self.data_wen() - next_address = ( - start_address_int - + i * self.WIDTH_OF_MEMORY_BYTE * stride0 - + j * self.WIDTH_OF_MEMORY_BYTE * stride1 - + k * self.WIDTH_OF_MEMORY_BYTE * stride2 - ) - if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: - next_address -= self.TOT_MEM_SIZE * 1024 - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - if not self._is_allowed(add, wen, forbidden_read, forbidden_write): - continue - self._record_access(add, wen, forbidden_read_new, forbidden_write_new) - self._write_req(file, id_value, wen, data, add) - id_value += 1 - if id_value - id_start >= self.N_TEST: - break - if id_value - id_start >= self.N_TEST: - break - for _ in range(idle_cycles_between_phases): - self._write_idle(file) + addr = base + i*self.WIDTH_OF_MEMORY_BYTE*stride0 + j*self.WIDTH_OF_MEMORY_BYTE*stride1 + k*self.WIDTH_OF_MEMORY_BYTE*stride2 + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, forbidden_read, forbidden_write): 
continue + self._record_access(add, wen, fr_new, fw_new) + self._write_req(f, id_value, wen, data, add); id_value += 1 + if id_value - id_start >= self.N_TEST: break + if id_value - id_start >= self.N_TEST: break + for _ in range(idle_cycles_between_phases): self._write_idle(f) k += 1 - forbidden_read.extend(forbidden_read_new) - forbidden_write.extend(forbidden_write_new) + self._write_pause(f) + forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) return id_value - def idle_gen(self, id_start): - """Emit a single req=0 idle line (master never issues transactions).""" - with open(self.filepath, "w", encoding="ascii") as file: - self._write_idle(file) + def idle_gen(self, id_start, append=False): + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) return id_start - def matmul_phased_gen( - self, - id_start, - forbidden_read, - forbidden_write, - region_base_address, - region_size_bytes, - matmul_ratio_a=1, - matmul_ratio_b=1, - matmul_ratio_c=1, - idle_cycles_between_phases=0, - region_base_address_a=None, - region_size_bytes_a=None, - region_base_address_b=None, - region_size_bytes_b=None, - region_base_address_c=None, - region_size_bytes_c=None, - ): - """Phased traffic modelling matrix multiply: read-A, read-B, write-C. - - Each phase reads/writes its own sub-region. If region_base_address_a/b/c - and region_size_bytes_a/b/c are provided, they are used directly for each - phase. Otherwise the combined region [region_base_address, +region_size_bytes) - is split into equal thirds automatically. - - idle_cycles_between_phases: req=0 cycles inserted between each pair of - active phases (after read-A and after read-B, if those phases are non-empty). - Models computation time between memory bursts. 
- """ - id_value = id_start - forbidden_read_new = [] - forbidden_write_new = [] - access_bytes = max(1, self.DATA_WIDTH // 8) - total_mem_bytes = int(self.TOT_MEM_SIZE * 1024) - - def _resolve_region(base_override, size_override, fallback_base, fallback_size): - if base_override is not None and size_override is not None: - b = self._align_down(int(base_override), access_bytes) - s = self._align_down(int(size_override), access_bytes) - else: - b = self._align_down(int(fallback_base), access_bytes) - s = self._align_down(int(fallback_size), access_bytes) - if b + s > total_mem_bytes: - s = self._align_down(total_mem_bytes - b, access_bytes) + def matmul_phased_gen(self, id_start, forbidden_read, forbidden_write, + region_base_address, region_size_bytes, + matmul_ratio_a=1, matmul_ratio_b=1, matmul_ratio_c=1, + idle_cycles_between_phases=0, + region_base_address_a=None, region_size_bytes_a=None, + region_base_address_b=None, region_size_bytes_b=None, + region_base_address_c=None, region_size_bytes_c=None, + append=False): + id_value = id_start; fr_new, fw_new = [], [] + ab = max(1, self.DATA_WIDTH // 8); tm = int(self.TOT_MEM_SIZE * 1024) + + def _res(bo, so, fb, fs): + b = self._align_down(int(bo if bo is not None else fb), ab) + s = self._align_down(int(so if so is not None else fs), ab) + if b+s > tm: s = self._align_down(tm-b, ab) return b, s - # If per-phase regions are explicitly provided, use them directly if region_base_address_a is not None and region_size_bytes_a is not None: - a_base, a_size = _resolve_region(region_base_address_a, region_size_bytes_a, 0, 0) - b_base, b_size = _resolve_region(region_base_address_b, region_size_bytes_b, a_base, a_size) - c_base, c_size = _resolve_region(region_base_address_c, region_size_bytes_c, a_base, a_size) + a_base, a_size = _res(region_base_address_a, region_size_bytes_a, 0, 0) + b_base, b_size = _res(region_base_address_b, region_size_bytes_b, a_base, a_size) + c_base, c_size = _res(region_base_address_c, 
region_size_bytes_c, a_base, a_size) else: - # Auto-split combined region into thirds - base = self._align_down(int(region_base_address), access_bytes) - size = self._align_down(int(region_size_bytes), access_bytes) - if base + size > total_mem_bytes: - size = self._align_down(total_mem_bytes - base, access_bytes) - region_words = size // access_bytes - if region_words < 3: - with open(self.filepath, "w", encoding="ascii") as file: - self._write_idle(file) + base = self._align_down(int(region_base_address), ab) + size = self._align_down(int(region_size_bytes), ab) + if base+size > tm: size = self._align_down(tm-base, ab) + rw = size // ab + if rw < 3: + with self._open(append) as f: self._write_idle(f); self._write_pause(f) return id_value - a_words = max(1, region_words // 3) - b_words = max(1, region_words // 3) - c_words = region_words - a_words - b_words - a_base = base; a_size = a_words * access_bytes - b_base = a_base + a_size; b_size = b_words * access_bytes - c_base = b_base + b_size; c_size = c_words * access_bytes + aw=max(1,rw//3); bw=max(1,rw//3); cw=rw-aw-bw + a_base=base; a_size=aw*ab; b_base=a_base+a_size; b_size=bw*ab + c_base=b_base+b_size; c_size=cw*ab - a_end = a_base + a_size - b_end = b_base + b_size - c_end = c_base + c_size + ca, cb, cc = self._phase_counts(self.N_TEST, matmul_ratio_a, matmul_ratio_b, matmul_ratio_c) - cnt_a, cnt_b, cnt_c = self._phase_counts( - self.N_TEST, - int(matmul_ratio_a), - int(matmul_ratio_b), - int(matmul_ratio_c), - ) - - def _emit_phase(file_obj, count, wen, phase_base, phase_end): + def _emit(fobj, count, wen, pb, pe): nonlocal id_value - addr = phase_base + addr = pb for _ in range(count): - data = "0" * self.DATA_WIDTH if wen else self.random_data() + data = "0"*self.DATA_WIDTH if wen else self.random_data() add = bin(addr)[2:].zfill(self.ADD_WIDTH) - self._write_req(file_obj, id_value, wen, data, add) - self._record_access(add, wen, forbidden_read_new, forbidden_write_new) - id_value += 1 - addr += 
access_bytes - if addr >= phase_end: - addr = phase_base - - with open(self.filepath, "w", encoding="ascii") as file: - _emit_phase(file, cnt_a, 1, a_base, a_end) # read A - if cnt_a > 0 and (cnt_b > 0 or cnt_c > 0): - for _ in range(idle_cycles_between_phases): - self._write_idle(file) - _emit_phase(file, cnt_b, 1, b_base, b_end) # read B - if cnt_b > 0 and cnt_c > 0: - for _ in range(idle_cycles_between_phases): - self._write_idle(file) - _emit_phase(file, cnt_c, 0, c_base, c_end) # write C - - forbidden_read.extend(forbidden_read_new) - forbidden_write.extend(forbidden_write_new) + self._write_req(fobj, id_value, wen, data, add) + self._record_access(add, wen, fr_new, fw_new) + id_value += 1; addr += ab + if addr >= pe: addr = pb + + with self._open(append) as f: + _emit(f, ca, 1, a_base, a_base+a_size) + if ca > 0 and (cb > 0 or cc > 0): + for _ in range(idle_cycles_between_phases): self._write_idle(f) + _emit(f, cb, 1, b_base, b_base+b_size) + if cb > 0 and cc > 0: + for _ in range(idle_cycles_between_phases): self._write_idle(f) + _emit(f, cc, 0, c_base, c_base+c_size) + self._write_pause(f) + + forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) return id_value diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index 54ff02b..c6a0eae 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -327,15 +327,18 @@ def _resolve_n_transactions(master_config: dict, mem_access_type: str, data_widt f"'n_transactions' and no geometry fields to derive it from.") sys.exit(1) - def _generate_master( + def _generate_pattern( filepath: Path, - master_config: dict, + pattern_config: dict, *, is_hwpe: bool, master_global_idx: int, master_local_idx: int, n_peers_of_kind: int, + append: bool, ): + """Generate one pattern segment. append=True opens file in append mode. 
+ Every pattern always writes a trailing PAUSE (handled by the generator).""" nonlocal next_start_id data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH kind = 'master_hwpe' if is_hwpe else 'master_log' @@ -345,29 +348,28 @@ def _generate_master( str(filepath), 0, master_global_idx ) - if 'mem_access_type' not in master_config: - print(f"ERROR: {kind}_{master_local_idx} is missing mem_access_type.") + if 'mem_access_type' not in pattern_config: + print(f"ERROR: {kind}_{master_local_idx} pattern is missing mem_access_type.") sys.exit(1) config = _normalize_mem_access_type( - master_config['mem_access_type'], + pattern_config['mem_access_type'], f"{kind}_{master_local_idx}", ) - start_address = str(master_config.get('start_address', '0')) - stride0 = int(master_config.get('stride0', 0)) - len_d0 = int(master_config.get('len_d0', 0)) - stride1 = int(master_config.get('stride1', 0)) - len_d1 = int(master_config.get('len_d1', 0)) - stride2 = int(master_config.get('stride2', 0)) - - # Region parameters + start_address = str(pattern_config.get('start_address', '0')) + stride0 = int(pattern_config.get('stride0', 0)) + len_d0 = int(pattern_config.get('len_d0', 0)) + stride1 = int(pattern_config.get('stride1', 0)) + len_d1 = int(pattern_config.get('len_d1', 0)) + stride2 = int(pattern_config.get('stride2', 0)) + total_mem_bytes = int(TOT_MEM_SIZE * 1024) access_bytes = max(1, int(data_width // 8)) default_region_size = total_mem_bytes // max(1, n_peers_of_kind) default_region_base = master_local_idx * default_region_size - region_base = _parse_maybe_bin_int(master_config.get('region_base_address'), default_region_base) - region_size = _parse_maybe_bin_int(master_config.get('region_size_bytes'), default_region_size) + region_base = _parse_maybe_bin_int(pattern_config.get('region_base_address'), default_region_base) + region_size = _parse_maybe_bin_int(pattern_config.get('region_size_bytes'), default_region_size) region_base = (region_base // access_bytes) * 
access_bytes if region_base >= total_mem_bytes: @@ -378,16 +380,11 @@ def _generate_master( if region_base + region_size > total_mem_bytes: region_size = ((total_mem_bytes - region_base) // access_bytes) * access_bytes - n_test = _resolve_n_transactions(master_config, config, data_width, kind, master_local_idx) + n_test = _resolve_n_transactions(pattern_config, config, data_width, kind, master_local_idx) master.N_TEST = n_test - # Start delay: prepend idle lines directly to the stimuli file after generation - start_delay = int(master_config.get('start_delay_cycles', 0)) - if start_delay > 0: - pending_start_delays.append((filepath, start_delay, data_width)) - if config == 'random': - tpct = master_config.get('traffic_pct') + tpct = pattern_config.get('traffic_pct') next_start_id = master.random_gen( next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, @@ -395,33 +392,37 @@ def _generate_master( region_base=region_base, region_size=region_size, traffic_pct=int(tpct) if tpct is not None else 100, - traffic_read_pct=master_config.get('traffic_read_pct'), + traffic_read_pct=pattern_config.get('traffic_read_pct'), + append=append, ) elif config == 'linear': - tpct = master_config.get('traffic_pct') + tpct = pattern_config.get('traffic_pct') next_start_id = master.linear_gen( stride0, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE, traffic_pct=int(tpct) if tpct is not None else 100, - traffic_read_pct=master_config.get('traffic_read_pct'), + traffic_read_pct=pattern_config.get('traffic_read_pct'), + append=append, ) elif config == '2d': next_start_id = master.gen_2d( stride0, len_d0, stride1, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE, - idle_cycles_between_phases=int(master_config.get('idle_cycles_between_phases', 0)), + idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), + append=append, ) elif config == '3d': next_start_id = 
master.gen_3d( stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE, - idle_cycles_between_phases=int(master_config.get('idle_cycles_between_phases', 0)), + idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), + append=append, ) elif config == 'idle': - next_start_id = master.idle_gen(next_start_id) + next_start_id = master.idle_gen(next_start_id, append=append) elif config == 'matmul_phased': if not is_hwpe: print( @@ -441,27 +442,82 @@ def _generate_master( LIST_OF_FORBIDDEN_ADDRESSES_WRITE, region_base, region_size, - int(master_config.get('matmul_ratio_a', 1)), - int(master_config.get('matmul_ratio_b', 1)), - int(master_config.get('matmul_ratio_c', 1)), - idle_cycles_between_phases=int(master_config.get('idle_cycles_between_phases', 0)), - region_base_address_a=_parse_maybe_bin_int(master_config.get('region_base_address_a'), None), - region_size_bytes_a=_parse_maybe_bin_int(master_config.get('region_size_bytes_a'), None), - region_base_address_b=_parse_maybe_bin_int(master_config.get('region_base_address_b'), None), - region_size_bytes_b=_parse_maybe_bin_int(master_config.get('region_size_bytes_b'), None), - region_base_address_c=_parse_maybe_bin_int(master_config.get('region_base_address_c'), None), - region_size_bytes_c=_parse_maybe_bin_int(master_config.get('region_size_bytes_c'), None), + int(pattern_config.get('matmul_ratio_a', 1)), + int(pattern_config.get('matmul_ratio_b', 1)), + int(pattern_config.get('matmul_ratio_c', 1)), + idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), + region_base_address_a=_parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None), + region_size_bytes_a=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None), + region_base_address_b=_parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None), + 
region_size_bytes_b=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None), + region_base_address_c=_parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None), + region_size_bytes_c=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None), + append=append, ) _record_memory_map( kind, master_local_idx, - master_config.get('description', ''), + pattern_config.get('description', ''), config, n_test, data_width, access_bytes, region_base, region_size, start_address, stride0, len_d0, stride1, len_d1, stride2, - master_config, total_mem_bytes, + pattern_config, total_mem_bytes, ) + def _generate_master( + filepath: Path, + master_config: dict, + *, + is_hwpe: bool, + master_global_idx: int, + master_local_idx: int, + n_peers_of_kind: int, + ): + """Generate stimulus for a master, supporting single flat pattern or patterns list.""" + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + + # Resolve pattern list: either explicit 'patterns' list or a single flat pattern + if 'patterns' in master_config: + patterns = master_config['patterns'] + if not patterns: + kind = 'master_hwpe' if is_hwpe else 'master_log' + print(f"ERROR: {kind}_{master_local_idx} has empty patterns list.") + sys.exit(1) + else: + # Legacy flat format: treat the master config itself as a single pattern + patterns = [master_config] + + # Start delay applies to the whole master (prepended before first pattern) + start_delay = int(master_config.get('start_delay_cycles', 0)) + if start_delay > 0: + pending_start_delays.append((filepath, start_delay, data_width)) + + # For each pattern with wait_for, prepend a synthetic idle+PAUSE that acts as + # the blocking fence. The pattern's own trailing PAUSE is always mask=0 (free + # pass), so fence_idx advances immediately after the real work is done. 
+ # This separates "I am done" (trailing PAUSE, free) from "I may start" (idle + # gate, blocking), giving resume_i a single clean meaning: start your next job. + dw = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + first_written = False + for p_idx, pattern_config in enumerate(patterns): + if pattern_config.get('wait_for'): + # Synthetic idle+PAUSE gates this pattern + _idle = StimuliGenerator(IW, DATA_WIDTH, N_BANKS, TOT_MEM_SIZE, + dw, ADD_WIDTH, str(filepath), 0, master_global_idx) + _idle.N_TEST = 0 + _idle.idle_gen(next_start_id, append=first_written) + first_written = True + _generate_pattern( + filepath, + pattern_config, + is_hwpe=is_hwpe, + master_global_idx=master_global_idx, + master_local_idx=master_local_idx, + n_peers_of_kind=n_peers_of_kind, + append=first_written, + ) + first_written = True + global_idx = 0 # Generate LOG masters (CORE, DMA, EXT) in order @@ -511,56 +567,148 @@ def _generate_master( print("STEP 0 COMPLETED: generate stimuli files") # ----------------------------------------------------------------------- - # Compute WAIT_MASKS and emit phases.mk + # Compute FENCE_MASKS and emit fence_masks.mk + # + # Fence slot f corresponds to the PAUSE before pattern f in the stimulus + # file (i.e. between pattern f-1 and pattern f). The mask at slot f holds + # the set of drivers that must have passed fence f before this driver can + # resume from that PAUSE. + # + # For a master with N patterns, there are N fence slots (slot 0 = before + # pattern 0, slot f = before pattern f). The wait_for of pattern f defines + # the mask at fence slot f. + # + # Legacy flat masters (no 'patterns' key) are treated as single-pattern + # masters: one fence slot (slot 0) from the top-level wait_for field. 
# ----------------------------------------------------------------------- N_DRIVERS = N_LOG + N_HWPE - phase_to_drivers: dict[str, list[int]] = {} - driver_phases: list[str] = [] - - for i, m in enumerate(log_masters): - phase = str(m.get('phase', 'default')) - driver_phases.append(phase) - phase_to_drivers.setdefault(phase, []).append(i) - - for i, m in enumerate(hwpe_masters): - gidx = N_LOG + i - phase = str(m.get('phase', 'default')) - driver_phases.append(phase) - phase_to_drivers.setdefault(phase, []).append(gidx) - - wait_masks: list[int] = [0] * N_DRIVERS + def _patterns_of(master_config): + """Return the list of pattern configs for a master.""" + if 'patterns' in master_config: + return master_config['patterns'] + return [master_config] - for i, m in enumerate(log_masters): + # Build phase->driver map: every pattern of every driver registers its phase. + # This allows wait_for to reference any phase, not just first patterns. + # A phase may be associated with multiple drivers (e.g. 8 cores all in softmax_t0). + phase_to_drivers: dict[str, list[int]] = {} + all_masters = [(m, False) for m in log_masters] + [(m, True) for m in hwpe_masters] + for i, (m, _) in enumerate(all_masters): + for pat in _patterns_of(m): + phase = str(pat.get('phase', 'default')) + if i not in phase_to_drivers.get(phase, []): + phase_to_drivers.setdefault(phase, []).append(i) + + # Build phase->pattern_index map: for each phase, which pattern index within + # each driver corresponds to that phase. Used to compute FENCE_REQ_LEVELS. 
+ # phase_pattern_idx[phase][driver] = pattern index of that phase in that driver + phase_pattern_idx: dict[str, dict[int, int]] = {} + for i, (m, _) in enumerate(all_masters): + for p_idx, pat in enumerate(_patterns_of(m)): + phase = str(pat.get('phase', 'default')) + phase_pattern_idx.setdefault(phase, {})[i] = p_idx + + def _resolve_wait_mask(wait_for_list): mask = 0 - for dep_phase in m.get('wait_for', []): + for dep_phase in wait_for_list: for dep_drv in phase_to_drivers.get(str(dep_phase), []): mask |= (1 << dep_drv) - wait_masks[i] = mask - - for i, m in enumerate(hwpe_masters): - gidx = N_LOG + i - mask = 0 - for dep_phase in m.get('wait_for', []): + return mask + + # Precompute per-driver fence_idx value after finishing pattern p: + # = number of fences (synthetic idle gates + trailing PAUSEs) passed up to and + # including the trailing PAUSE of pattern p. + def _fence_idx_after_pattern(drv_idx, pat_idx): + pats = _patterns_of(all_masters[drv_idx][0]) + # Count synthetic idle gates for patterns 0..pat_idx (those with wait_for) + n_gates = sum(1 for k in range(pat_idx + 1) if pats[k].get('wait_for')) + # Plus trailing PAUSEs for patterns 0..pat_idx + n_trailing = pat_idx + 1 + return n_gates + n_trailing + + def _resolve_req_levels(wait_for_list): + """Required fence_idx[j] = fence_idx value of j after finishing pattern p_j.""" + levels = [0] * N_DRIVERS + for dep_phase in wait_for_list: for dep_drv in phase_to_drivers.get(str(dep_phase), []): - mask |= (1 << dep_drv) - wait_masks[gidx] = mask + p_idx = phase_pattern_idx.get(str(dep_phase), {}).get(dep_drv, 0) + levels[dep_drv] = _fence_idx_after_pattern(dep_drv, p_idx) + return levels + + # Build per-driver fence mask and req_level lists. + # Each pattern with wait_for gets a synthetic idle gate (mask = wait_for) before it. + # Trailing PAUSEs always have mask=0 (free pass — just advance fence_idx). 
+ # Fences are enumerated in file order: for each pattern p: + # if p has wait_for: synthetic idle fence (mask = wait_for of p) + # trailing PAUSE fence (mask = 0) + fence_masks: list[list[int]] = [] + req_levels: list[list[list[int]]] = [] + for i, (m, _) in enumerate(all_masters): + patterns = _patterns_of(m) + per_masks = [] + per_levels = [] + for pat in patterns: + if pat.get('wait_for'): + # Synthetic idle gate: blocking fence + per_masks.append(_resolve_wait_mask(pat['wait_for'])) + per_levels.append(_resolve_req_levels(pat['wait_for'])) + # Trailing PAUSE: free pass, just signals completion + per_masks.append(0) + per_levels.append([0] * N_DRIVERS) + fence_masks.append(per_masks) + req_levels.append(per_levels) + + max_fences = max((len(fm) for fm in fence_masks), default=1) + + # Pad to max_fences + for i in range(N_DRIVERS): + while len(fence_masks[i]) < max_fences: + fence_masks[i].append(0) + while len(req_levels[i]) < max_fences: + req_levels[i].append([0] * N_DRIVERS) + # Emit SV literals hex_width = max(1, (N_DRIVERS + 3) // 4) - mask_literals = [f"{N_DRIVERS}'h{wait_masks[i]:0{hex_width}x}" for i in range(N_DRIVERS)] - wait_masks_param = "'{" + ", ".join(mask_literals) + "}" + per_driver_literals = [] + for i in range(N_DRIVERS): + slot_literals = [f"{N_DRIVERS}'h{fence_masks[i][f]:0{hex_width}x}" for f in range(max_fences)] + per_driver_literals.append("'{" + ", ".join(slot_literals) + "}") + fence_masks_param = "'{" + ", ".join(per_driver_literals) + "}" + + # FENCE_REQ_LEVELS[N_DRIVERS][MAX_FENCES][N_DRIVERS] — int unsigned + # Pack FENCE_REQ_LEVELS as FENCE_REQ_LEVELS_PACKED[i][f] = N_DRIVERS*4-bit vector. + # Bits [j*4+3:j*4] = required fence_idx[j] (4 bits, supports 0..15). 
+ LEVEL_BITS = 4 + packed_width = N_DRIVERS * LEVEL_BITS + packed_hex_digits = (packed_width + 3) // 4 + req_driver_literals = [] + for i in range(N_DRIVERS): + fence_literals = [] + for f in range(max_fences): + val = 0 + for j in range(N_DRIVERS): + val |= (req_levels[i][f][j] & 0xF) << (j * LEVEL_BITS) + fence_literals.append(f"{packed_width}'h{val:0{packed_hex_digits}x}") + req_driver_literals.append("'{" + ", ".join(fence_literals) + "}") + fence_req_levels_packed_param = "'{" + ", ".join(req_driver_literals) + "}" if args.emit_phases_mk: phases_mk_path = Path(args.emit_phases_mk) phases_mk_path.parent.mkdir(parents=True, exist_ok=True) phases_mk_path.write_text( "# Auto-generated by main.py - DO NOT EDIT MANUALLY\n" - "# Per-driver dependency masks for tb_hci.sv (WAIT_MASKS_PARAM).\n" + "# Per-driver per-fence dependency data for tb_hci.sv.\n" f"# Drivers 0..{N_LOG-1} = log masters, {N_LOG}..{N_DRIVERS-1} = HWPE masters.\n" - f"WAIT_MASKS_PARAM := {wait_masks_param}\n", + f"# fence f = PAUSE after pattern f; fence_idx[i]==k means i completed k patterns.\n" + f"# FENCE_MASKS[i][f][j]=1: j is a dependency of i at fence f.\n" + f"# FENCE_REQ_LEVELS_PACKED[i][f]: packed {N_DRIVERS*4}-bit vector, bits [j*4+3:j*4] = min fence_idx[j].\n" + f"MAX_FENCES_PARAM := {max_fences}\n" + f"FENCE_MASKS_PARAM := {fence_masks_param}\n" + f"FENCE_REQ_LEVELS_PACKED_PARAM := {fence_req_levels_packed_param}\n", encoding='utf-8', ) - print(f"PHASES.MK written: {phases_mk_path}") + print(f"FENCE_MASKS.MK written: {phases_mk_path}") # ----------------------------------------------------------------------- # Build and emit memory map report @@ -586,11 +734,12 @@ def _generate_master( driver_names = [f"log_{d}" if d < N_LOG else f"hwpe_{d - N_LOG}" for d in drivers] lines.append(f" phase '{phase}': {', '.join(driver_names)}") for i in range(N_DRIVERS): - if wait_masks[i]: - name = f"log_{i}" if i < N_LOG else f"hwpe_{i - N_LOG}" - deps = [f"log_{j}" if j < N_LOG else f"hwpe_{j - N_LOG}" 
- for j in range(N_DRIVERS) if wait_masks[i] & (1 << j)] - lines.append(f" {name} waits for: {', '.join(deps)}") + name = f"log_{i}" if i < N_LOG else f"hwpe_{i - N_LOG}" + for f, mask in enumerate(fence_masks[i]): + if mask: + deps = [f"log_{j}" if j < N_LOG else f"hwpe_{j - N_LOG}" + for j in range(N_DRIVERS) if mask & (1 << j)] + lines.append(f" {name} after pattern[{f}] (fence {f}) waits for: {', '.join(deps)}") lines.append("=" * 72) report_text = "\n".join(lines) + "\n" diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index 35c66c4..6ed4787 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -16,7 +16,16 @@ /** * Application driver module - * Reads stimuli from file and drives transactions on HCI interface + * Reads stimuli from file and drives transactions on HCI interface. + * + * Stimulus file format (one line per cycle): + * req(1b) id(IWb) wen(1b) data(Nb) add(Ab) -- active transaction + * PAUSE -- fence synchronization point + * + * When a PAUSE token is encountered the driver drains all in-flight reads + * (waits in DRAIN_FOR_PAUSE), then enters PAUSED and holds fence_reached_o=1 + * until resume_i is asserted. This allows multi-phase execution on a single + * driver without resetting counters between phases. 
*/ module application_driver #( @@ -28,56 +37,75 @@ module application_driver #( ) ( input logic clk_i, input logic rst_ni, - input logic clear_i, // used to gate the driver or to reset it + input logic resume_i, // asserted by tb_hci when fence dependencies are met hci_core_intf.initiator hci_if, - output logic end_resp_o, + output logic fence_reached_o, // held HIGH while driver is paused at a fence + output logic end_resp_o, // held HIGH after all transactions and responses done output int unsigned n_issued_tr_o, output int unsigned n_issued_rd_tr_o, output int unsigned n_retired_rd_tr_o ); - int unsigned n_req_issued_q, n_req_issued_d; // total number of issued requests - int unsigned n_rd_req_issued_q, n_rd_req_issued_d; // total number of issued read requests - int unsigned n_rd_resp_retired_q, n_rd_resp_retired_d; // total number of retired read responses + int unsigned n_req_issued_q, n_req_issued_d; + int unsigned n_rd_req_issued_q, n_rd_req_issued_d; + int unsigned n_rd_resp_retired_q, n_rd_resp_retired_d; - // Transaction queue from file + // Transaction queue from file. is_pause=1 entries are fence tokens, not real transactions. typedef struct { - logic req; - logic [IW-1:0] id; - logic wen; + logic is_pause; + logic req; + logic [IW-1:0] id; + logic wen; logic [DATA_WIDTH-1:0] data; logic [ADDR_WIDTH-1:0] add; } transaction_t; transaction_t transactions[$]; - // Fill up the queue by reading the stimuli file until the end + // Fill up the queue by reading the stimuli file until the end. + // PAUSE lines are read as fence tokens with is_pause=1. 
initial begin string file_path; - int stim; - int scan_status; + int stim; + string line; if (STIM_FILE != "") begin file_path = STIM_FILE; end else begin $fatal("ERROR: Specify STIM_FILE path"); end - // Open file stim = $fopen(file_path, "r"); if (stim == 0) begin - $fatal("ERROR: Could not open stimuli file"); + $fatal("ERROR: Could not open stimuli file: %s", file_path); end - // Read every line while (!$feof(stim)) begin - transaction_t transaction; - scan_status = $fscanf(stim, "%b %b %b %b %b\n", transaction.req, transaction.id, transaction.wen, transaction.data, transaction.add); - if (scan_status != 5) begin - if (!$feof(stim)) begin - $fatal(1, "ERROR: malformed stimuli line in %s", file_path); + transaction_t t; + int scan_status; + void'($fgets(line, stim)); + // Strip trailing newline/CR for comparison + if (line.len() > 0 && (line[line.len()-1] == "\n" || line[line.len()-1] == "\r")) + line = line.substr(0, line.len()-2); + if (line.len() > 1 && line[line.len()-1] == "\r") + line = line.substr(0, line.len()-2); + if (line == "PAUSE") begin + t.is_pause = 1'b1; + t.req = 1'b0; + t.id = '0; + t.wen = 1'b0; + t.data = '0; + t.add = '0; + transactions.push_back(t); + end else if (line.len() > 0) begin + t.is_pause = 1'b0; + scan_status = $sscanf(line, "%b %b %b %b %b", + t.req, t.id, t.wen, t.data, t.add); + if (scan_status != 5) begin + if (!$feof(stim)) begin + $fatal(1, "ERROR: malformed stimuli line in %s: '%s'", file_path, line); + end + break; end - break; + transactions.push_back(t); end - // First-in, first-out queue - transactions.push_back(transaction); end $fclose(stim); end @@ -87,150 +115,155 @@ module application_driver #( ////////////////// typedef enum logic [2:0] { - REQ_RESET, REQ_IDLE, WAIT_GNT, REQ_DONE, + DRAIN_FOR_PAUSE, // drain in-flight reads before asserting fence_reached_o + PAUSED, // fence synchronization: hold fence_reached_o until resume_i RSP_DONE } req_state_t; - req_state_t req_state_q, req_state_d; - int unsigned 
tr_idx_q, tr_idx_d; // transaction ID - int unsigned last_op_issued_q, last_op_issued_d; // ID of the last issued operation (read or write) + req_state_t req_state_q, req_state_d; + int unsigned tr_idx_q, tr_idx_d; + int unsigned last_op_issued_q, last_op_issued_d; - assign n_issued_tr_o = n_req_issued_q; + assign n_issued_tr_o = n_req_issued_q; assign n_issued_rd_tr_o = n_rd_req_issued_q; - // Sequential logic always_ff @(posedge clk_i or negedge rst_ni) begin - if (!rst_ni || clear_i) begin - req_state_q <= REQ_RESET; - tr_idx_q <= '0; - n_req_issued_q <= '0; - n_rd_req_issued_q <= '0; - last_op_issued_q <= '0; + if (!rst_ni) begin + req_state_q <= REQ_IDLE; + tr_idx_q <= '0; + n_req_issued_q <= '0; + n_rd_req_issued_q <= '0; + last_op_issued_q <= '0; end else begin - req_state_q <= req_state_d; - tr_idx_q <= tr_idx_d; - n_req_issued_q <= n_req_issued_d; - n_rd_req_issued_q <= n_rd_req_issued_d; - last_op_issued_q <= last_op_issued_d; + req_state_q <= req_state_d; + tr_idx_q <= tr_idx_d; + n_req_issued_q <= n_req_issued_d; + n_rd_req_issued_q <= n_rd_req_issued_d; + last_op_issued_q <= last_op_issued_d; end end - // Combinational state logic always_comb begin - // Defaults (FSM) - req_state_d = req_state_q; - tr_idx_d = tr_idx_q; - n_req_issued_d = n_req_issued_q; + // FSM defaults + req_state_d = req_state_q; + tr_idx_d = tr_idx_q; + n_req_issued_d = n_req_issued_q; n_rd_req_issued_d = n_rd_req_issued_q; last_op_issued_d = last_op_issued_q; - // Defaults (HCI outputs) - hci_if.id = '0; - hci_if.add = '0; - hci_if.data = '0; - hci_if.req = 1'b0; - hci_if.wen = 1'b0; - hci_if.ecc = '0; - hci_if.ereq = '0; + // HCI output defaults + hci_if.id = '0; + hci_if.add = '0; + hci_if.data = '0; + hci_if.req = 1'b0; + hci_if.wen = 1'b0; + hci_if.ecc = '0; + hci_if.ereq = '0; hci_if.r_eready = '0; - hci_if.be = '1; - hci_if.r_ready = 1'b1; - hci_if.user = '0; - // Defaults (outputs) - end_resp_o = 1'b0; - // inputs: gnt, r_data, r_valid, r_user, r_id + hci_if.be = '1; + 
hci_if.r_ready = 1'b1; + hci_if.user = '0; + // Output defaults + fence_reached_o = 1'b0; + end_resp_o = 1'b0; case (req_state_q) - REQ_RESET: begin - if (!clear_i) begin - req_state_d = REQ_IDLE; - end - end REQ_IDLE: begin - // Check if there are still transactions to issue if (tr_idx_q < transactions.size()) begin - // If so, increase transaction index for next iteration - tr_idx_d = tr_idx_q + 1; - // If this transaction is a request - if (transactions[tr_idx_q].req) begin - hci_if.req = 1'b1; - hci_if.id = transactions[tr_idx_q].id; - hci_if.wen = transactions[tr_idx_q].wen; - hci_if.data = transactions[tr_idx_q].data; - hci_if.add = transactions[tr_idx_q].add; - // Update counters - n_req_issued_d = n_req_issued_q + 1; - if (transactions[tr_idx_q].wen) begin - n_rd_req_issued_d = n_rd_req_issued_q + 1; - end - last_op_issued_d = tr_idx_q; - // If granted already, stay in REQ_IDLE, otherwise go to WAIT_GNT - if (hci_if.gnt) begin - req_state_d = REQ_IDLE; + if (transactions[tr_idx_q].is_pause) begin + // Consume the PAUSE token and drain any in-flight reads before pausing + tr_idx_d = tr_idx_q + 1; + if (n_rd_req_issued_q > n_rd_resp_retired_q) begin + req_state_d = DRAIN_FOR_PAUSE; end else begin - req_state_d = WAIT_GNT; + req_state_d = PAUSED; + end + end else begin + tr_idx_d = tr_idx_q + 1; + if (transactions[tr_idx_q].req) begin + hci_if.req = 1'b1; + hci_if.id = transactions[tr_idx_q].id; + hci_if.wen = transactions[tr_idx_q].wen; + hci_if.data = transactions[tr_idx_q].data; + hci_if.add = transactions[tr_idx_q].add; + n_req_issued_d = n_req_issued_q + 1; + if (transactions[tr_idx_q].wen) begin + n_rd_req_issued_d = n_rd_req_issued_q + 1; + end + last_op_issued_d = tr_idx_q; + req_state_d = hci_if.gnt ? 
REQ_IDLE : WAIT_GNT; end end end else begin - // If no more transactions to issue + // No more transactions if (n_rd_req_issued_q > n_rd_resp_retired_q) begin - // If there are still read responses to retire, wait for them before finishing req_state_d = REQ_DONE; end else begin req_state_d = RSP_DONE; end end end + WAIT_GNT: begin - hci_if.req = 1'b1; - hci_if.id = transactions[last_op_issued_q].id; - hci_if.wen = transactions[last_op_issued_q].wen; + hci_if.req = 1'b1; + hci_if.id = transactions[last_op_issued_q].id; + hci_if.wen = transactions[last_op_issued_q].wen; hci_if.data = transactions[last_op_issued_q].data; - hci_if.add = transactions[last_op_issued_q].add; - // Check if there are still transactions to issue + hci_if.add = transactions[last_op_issued_q].add; if (tr_idx_q < transactions.size()) begin - // Check whether to stall or not transaction fetching, in case this is a idle transaction that can hide mem latency - if (!transactions[tr_idx_q].req) begin + // Advance over any idle (req=0, not-pause) entries to hide memory latency + if (!transactions[tr_idx_q].req && !transactions[tr_idx_q].is_pause) begin tr_idx_d = tr_idx_q + 1; - end else begin - tr_idx_d = tr_idx_q; - end - // If grant received, go back to REQ_IDLE and pop another transaction, otherwise stay in WAIT_GNT - if (hci_if.gnt) begin - req_state_d = REQ_IDLE; - end else begin - req_state_d = WAIT_GNT; end + req_state_d = hci_if.gnt ? 
REQ_IDLE : WAIT_GNT; end else begin - // If there are no more transactions, wait for last grant and then finish if (hci_if.gnt) begin if (transactions[last_op_issued_q].req && transactions[last_op_issued_q].wen) begin - // If the last transaction was a read, we need to wait for its response before finishing req_state_d = REQ_DONE; end else begin - // If the last transaction was a write, we can finish right away req_state_d = RSP_DONE; end - end else begin - req_state_d = WAIT_GNT; end end end + REQ_DONE: begin if (n_rd_resp_retired_q >= n_rd_req_issued_q) begin - // All read responses have been retired req_state_d = RSP_DONE; - end else begin - req_state_d = REQ_DONE; end end + + DRAIN_FOR_PAUSE: begin + // Wait for all in-flight reads to retire before asserting the fence + if (n_rd_resp_retired_q >= n_rd_req_issued_q) begin + req_state_d = PAUSED; + end + end + + PAUSED: begin + // Hold fence_reached_o HIGH until tb_hci asserts resume_i. + // If the next token is also a PAUSE (e.g. trailing free-pass followed by + // a blocking synthetic idle), consume it immediately and stay in PAUSED + // to avoid a spurious one-cycle REQ_IDLE bounce between consecutive fences. 
+ fence_reached_o = 1'b1; + if (resume_i) begin + if (tr_idx_q < transactions.size() && transactions[tr_idx_q].is_pause) begin + tr_idx_d = tr_idx_q + 1; + req_state_d = PAUSED; + end else begin + req_state_d = REQ_IDLE; + end + end + end + RSP_DONE: begin end_resp_o = 1'b1; end + default: begin - req_state_d = REQ_RESET; + req_state_d = REQ_IDLE; end endcase end @@ -239,75 +272,60 @@ module application_driver #( // Read response FSM // /////////////////////// - // We only consider read responses as write responses are not mandatory in HCI - typedef enum logic [1:0] { - RESP_RESET, RESP_IDLE, RESP_WAIT_RVALID } resp_state_t; resp_state_t resp_state_q, resp_state_d; - int unsigned n_rd_in_flight_q, n_rd_in_flight_d; // number of reads granted but not yet responded + int unsigned n_rd_in_flight_q, n_rd_in_flight_d; assign n_retired_rd_tr_o = n_rd_resp_retired_q; - // Sequential logic always_ff @(posedge clk_i or negedge rst_ni) begin - if (!rst_ni || clear_i) begin - resp_state_q <= RESP_IDLE; + if (!rst_ni) begin + resp_state_q <= RESP_IDLE; n_rd_resp_retired_q <= '0; - n_rd_in_flight_q <= '0; + n_rd_in_flight_q <= '0; end else begin - resp_state_q <= resp_state_d; + resp_state_q <= resp_state_d; n_rd_resp_retired_q <= n_rd_resp_retired_d; - n_rd_in_flight_q <= n_rd_in_flight_d; + n_rd_in_flight_q <= n_rd_in_flight_d; end end - // Combinational state logic always_comb begin - // Defaults (FSM) - resp_state_d = resp_state_q; + resp_state_d = resp_state_q; n_rd_resp_retired_d = n_rd_resp_retired_q; - n_rd_in_flight_d = n_rd_in_flight_q; + n_rd_in_flight_d = n_rd_in_flight_q; case (resp_state_q) - RESP_RESET: begin - if (!clear_i) begin - resp_state_d = RESP_IDLE; - end - end RESP_IDLE: begin - // If a read request is granted, increment in-flight counter and go to RESP_WAIT_RVALID if (hci_if.req && hci_if.wen && hci_if.gnt) begin n_rd_in_flight_d = n_rd_in_flight_q + 1; - resp_state_d = RESP_WAIT_RVALID; + resp_state_d = RESP_WAIT_RVALID; end end + RESP_WAIT_RVALID: 
begin - // Track newly granted reads while waiting if (hci_if.req && hci_if.wen && hci_if.gnt) begin n_rd_in_flight_d = n_rd_in_flight_q + 1; end - // When read response handshake happens, retire one response if (hci_if.r_valid && hci_if.r_ready) begin n_rd_resp_retired_d = n_rd_resp_retired_q + 1; - // Adjust in-flight: account for simultaneous grant (already added above) if (hci_if.req && hci_if.wen && hci_if.gnt) begin - // net in-flight = in_flight + 1 (new grant) - 1 (retired) = in_flight - n_rd_in_flight_d = n_rd_in_flight_q; // undo the +1 above, net unchanged + n_rd_in_flight_d = n_rd_in_flight_q; // +1 grant -1 retire = net 0 end else begin n_rd_in_flight_d = n_rd_in_flight_q - 1; end - // If no more in-flight reads (after this retirement), go back to RESP_IDLE if (n_rd_in_flight_q == 1 && !(hci_if.req && hci_if.wen && hci_if.gnt)) begin resp_state_d = RESP_IDLE; end end end + default: begin - resp_state_d = RESP_RESET; + resp_state_d = RESP_IDLE; end endcase end diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index d95ce3b..0f743e2 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -26,8 +26,10 @@ module tb_hci (); logic clk, rst_n; - logic [N_DRIVERS-1:0] s_end_resp; // end_resp_o from all drivers, [N_DRIVERS-1:0] ordering - logic [N_DRIVERS-1:0] s_clear_drv; // per-driver clear_i (held 1 until dependencies done) + logic [N_DRIVERS-1:0] s_end_resp; // end_resp_o from all drivers + logic [N_DRIVERS-1:0] s_fence_reached; // fence_reached_o from all drivers (level, HIGH while PAUSED) + logic [N_DRIVERS-1:0] s_resume; // resume_i to each driver (asserted when fence deps are met) + int unsigned fence_idx [N_DRIVERS]; // number of fences each driver has passed so far hci_interconnect_ctrl_t s_hci_ctrl; clk_rst_gen #( @@ -38,9 +40,9 @@ module tb_hci .rst_no(rst_n) ); - ///////// - // HCI // - ///////// + //////////////////// + // HCI interfaces // + //////////////////// localparam hci_size_parameter_t 
`HCI_SIZE_PARAM(cores) = '{ DW: DW_cores, @@ -164,6 +166,12 @@ module tb_hci .clk(clk) ); + /////////////////////////// + // Interface assignments // + /////////////////////////// + + /* Assignments of narrow initiators to LOG branch of HCI */ + generate for (genvar ii = 0; ii < N_CORE; ii++) begin : gen_core_to_narrow hci_core_assign i_core_to_narrow_assign ( @@ -185,7 +193,11 @@ module tb_hci .tcdm_initiator(hci_initiator_ext[ii]) ); end + endgenerate + /* Assignments of wide initiators to HCI (either LOG branch, HCI branch, or static MUX) */ + + generate if (INTERCO_TYPE == HCI) begin : gen_hwpe_hci for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_hci_assign hci_core_assign i_hwpe_hci_assign ( @@ -194,16 +206,36 @@ module tb_hci ); end end else if (INTERCO_TYPE == MUX) begin : gen_hwpe_mux - // In MUX mode, sel_i points at the lowest-indexed HWPE that has not yet finished - // (i.e. whose end_resp_o has not fired). Once HWPE k asserts end_resp_o, sel_i - // advances to k+1. We cannot use s_clear_drv here because HWPE 0 always has - // eff_mask='0 so its clear_drv is permanently 0 even after it finishes. + // Phase-ordered MUX arbitration: + // + // The mux is held by whichever HWPE is currently running (not paused, not done). + // In-flight reads are drained before any PAUSE (DRAIN_FOR_PAUSE state in the + // driver FSM), so sel_i is safe to switch as soon as fence_reached_o goes high. + // + // When no HWPE is running (all are either paused or done), the mux is granted + // to the lowest-indexed HWPE that is paused AND whose fence dependencies are + // satisfied (s_resume high). This serializes same-phase jobs by master ID and + // respects cross-phase data dependencies. logic [$clog2(N_HWPE > 1 ? 
N_HWPE-1 : 1):0] s_mux_sel; always_comb begin + automatic logic any_running; + any_running = 1'b0; + for (int i = 0; i < N_HWPE; i++) begin + if (!s_end_resp[N_LOG_MASTERS + i] && !s_fence_reached[N_LOG_MASTERS + i]) + any_running = 1'b1; + end s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(N_HWPE - 1); for (int i = N_HWPE-1; i >= 0; i--) begin - if (!s_end_resp[N_LOG_MASTERS + i]) - s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); + if (any_running) begin + // Active HWPE holds the mux; lowest index wins (descending loop) + if (!s_end_resp[N_LOG_MASTERS + i] && !s_fence_reached[N_LOG_MASTERS + i]) + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); + end else begin + // No HWPE running: grant to lowest-indexed paused+ready HWPE + if (!s_end_resp[N_LOG_MASTERS + i] && s_fence_reached[N_LOG_MASTERS + i] + && s_resume[N_LOG_MASTERS + i]) + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); + end end end hci_core_mux_static #( @@ -239,6 +271,10 @@ module tb_hci end endgenerate + ///////////////// + // Fence logic // + ///////////////// + logic s_clear; assign s_clear = 1'b0; @@ -247,31 +283,54 @@ module tb_hci assign s_hci_ctrl.priority_cnt_numerator = PRIORITY_CNT_NUMERATOR; assign s_hci_ctrl.priority_cnt_denominator = PRIORITY_CNT_DENOMINATOR; - // Driver clear logic: driver i is held in reset until all drivers j in its effective wait - // mask have asserted end_resp_o. If the effective mask is zero, the driver starts immediately. + // fence_idx[i]: number of fences driver i has fully passed (exited PAUSED state). + // Increments when the handshake fires: resume_i asserted while fence_reached_o is high. + // This is the same cycle the driver transitions out of PAUSED, so dependents see the + // updated count immediately on the next cycle (after the registered flip-flop updates). 
+ always_ff @(posedge clk or negedge rst_n) begin + if (!rst_n) begin + for (int i = 0; i < N_DRIVERS; i++) fence_idx[i] <= '0; + end else begin + for (int i = 0; i < N_DRIVERS; i++) begin + if (s_resume[i] && s_fence_reached[i]) + fence_idx[i] <= fence_idx[i] + 1; + end + end + end + + // s_resume[i]: fire when for every set bit j in FENCE_MASKS[i][fence_idx[i]], + // fence_idx[j] >= FENCE_REQ_LEVELS[i][fence_idx[i]][j]. // - // In MUX mode the user-defined WAIT_MASKS are ignored for HWPE drivers: instead a strict - // sequential chain is enforced (HWPE i waits for HWPE i-1), because hci_core_mux_static - // only forwards one HWPE at a time. HWPE ordering follows the index = position in - // hwpe_masters[] in workload.json. LOG master masks are always taken from WAIT_MASKS. + // fence_idx[j] >= p_j+1 means j has exited the trailing PAUSE after pattern p_j, + // i.e. j has completed pattern p_j. No s_fence_reached shortcut needed — the + // trailing PAUSE of a zero-mask fence is exited in one cycle, so fence_idx + // advances promptly and the check is unambiguous. // - // All drivers use end_resp_o (s_end_resp) as the handoff condition, guaranteeing that all - // in-flight reads from the predecessor have been fully retired before the successor starts. - // This is required for correctness in MUX mode (hci_core_mux_static gates r_valid to the - // non-selected channel), and is conservatively safe for all other modes. - generate - for (genvar ii = 0; ii < N_DRIVERS; ii++) begin : gen_driver_clear - logic [N_DRIVERS-1:0] eff_mask; - if (INTERCO_TYPE == MUX && ii >= N_LOG_MASTERS) begin : gen_mux_mask - // HWPE 0 (ii == N_LOG_MASTERS): no wait; HWPE k waits for HWPE k-1's end_resp_o - assign eff_mask = (ii == N_LOG_MASTERS) ? '0 : (N_DRIVERS'(1) << (ii - 1)); - end else begin : gen_default_mask - assign eff_mask = WAIT_MASKS[ii]; + // In MUX mode bus serialization is handled by s_mux_sel independently. 
+ always_comb begin + for (int i = 0; i < N_DRIVERS; i++) begin + automatic logic [N_DRIVERS-1:0] cur_mask; + automatic logic all_satisfied; + cur_mask = (fence_idx[i] < MAX_FENCES) ? FENCE_MASKS[i][fence_idx[i]] : '0; + all_satisfied = 1'b1; + for (int j = 0; j < N_DRIVERS; j++) begin + if (cur_mask[j]) begin + automatic logic [3:0] req; + req = (fence_idx[i] < MAX_FENCES) ? + FENCE_REQ_LEVELS_PACKED[i][fence_idx[i]][j*4+3 -: 4] : 4'h0; + if (fence_idx[j] < req) + all_satisfied = 1'b0; + end end - assign s_clear_drv[ii] = (eff_mask != '0) && - ((s_end_resp & eff_mask) != eff_mask); + // Only assert resume_i while the driver is actually in PAUSED state. + // Gating with fence_reached_o makes the signal a clean pulse. + s_resume[i] = all_satisfied && s_fence_reached[i]; end - endgenerate + end + + ///////// + // HCI // + ///////// hci_interconnect #( .N_HWPE(N_WIDE_HCI), @@ -341,8 +400,9 @@ module tb_hci ) i_app_driver_log ( .clk_i(clk), .rst_ni(rst_n), - .clear_i(s_clear_drv[ii]), + .resume_i(s_resume[ii]), .hci_if(hci_driver_log_if[ii]), + .fence_reached_o(s_fence_reached[ii]), .end_resp_o(s_end_resp[ii]), .n_issued_tr_o(s_issued_transactions[ii]), .n_issued_rd_tr_o(s_issued_read_transactions[ii]), @@ -364,8 +424,9 @@ module tb_hci ) i_app_driver_hwpe ( .clk_i(clk), .rst_ni(rst_n), - .clear_i(s_clear_drv[N_LOG_MASTERS + ii]), + .resume_i(s_resume[N_LOG_MASTERS + ii]), .hci_if(hci_driver_hwpe_if[ii]), + .fence_reached_o(s_fence_reached[N_LOG_MASTERS + ii]), .end_resp_o(s_end_resp[N_LOG_MASTERS + ii]), .n_issued_tr_o(s_issued_transactions[N_LOG_MASTERS + ii]), .n_issued_rd_tr_o(s_issued_read_transactions[N_LOG_MASTERS + ii]), @@ -432,6 +493,10 @@ module tb_hci .n_read_complete_hwpe_o(N_READ_COMPLETE_TRANSACTIONS_HWPE) ); + /////////////// + // Reporting // + /////////////// + simulation_report i_simulation_report ( .end_resp_i(s_end_resp), .throughput_complete_i(throughput_completed), @@ -449,6 +514,10 @@ module tb_hci 
.n_read_complete_transactions_hwpe_i(N_READ_COMPLETE_TRANSACTIONS_HWPE) ); + //////////////// + // Assertions // + //////////////// + localparam int unsigned MAX_BANK_LOCAL_ADDR = TOT_MEM_SIZE * 1024 / N_BANKS - WIDTH_OF_MEMORY_BYTE; diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index 47c5d7b..dc77085 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -77,12 +77,23 @@ package tb_hci_pkg; localparam int unsigned N_LOG_MASTERS = N_CORE + N_DMA + N_EXT; localparam int unsigned N_DRIVERS = N_LOG_MASTERS + N_HWPE; - // Per-driver dependency masks: WAIT_MASKS[i][j]=1 means driver i must wait for driver j's - // end_resp_o before starting. Generated by main.py, passed via WAIT_MASKS_PARAM define. - // NOTE: in MUX mode these masks are ignored for HWPE drivers and replaced by a sequential - // chain in tb_hci.sv. - localparam logic [N_DRIVERS-1:0] WAIT_MASKS [N_DRIVERS] = - `ifdef WAIT_MASKS_PARAM `WAIT_MASKS_PARAM `else '{default: '0} `endif; + // Fence parameters. Every pattern ends with a PAUSE; fence f = PAUSE after pattern f. + // fence_idx[i] == k means driver i has completed k patterns. + // + // FENCE_MASKS[i][f][j]=1: driver j is a dependency for driver i at fence f. + // FENCE_REQ_LEVELS[i][f][j]: minimum fence_idx[j] required before driver i can pass + // fence f. Equals the pattern index of the referenced phase within driver j plus 1 + // (i.e., j must have completed that specific pattern). + // Both are generated by main.py and passed via defines. + localparam int unsigned MAX_FENCES = + `ifdef MAX_FENCES_PARAM `MAX_FENCES_PARAM `else 1 `endif; + localparam logic [N_DRIVERS-1:0] FENCE_MASKS [N_DRIVERS][MAX_FENCES] = + `ifdef FENCE_MASKS_PARAM `FENCE_MASKS_PARAM `else '{default: '{default: '0}} `endif; + // FENCE_REQ_LEVELS_PACKED[i][f] is a packed vector of N_DRIVERS × 4 bits. + // Bits [j*4+3:j*4] hold the required fence_idx[j] before driver i can pass fence f. 
+ // 4 bits supports up to 15 (sufficient for up to 7 patterns × 2 fences/pattern). + localparam logic [N_DRIVERS*4-1:0] FENCE_REQ_LEVELS_PACKED [N_DRIVERS][MAX_FENCES] = + `ifdef FENCE_REQ_LEVELS_PARAM `FENCE_REQ_LEVELS_PARAM `else '{default: '{default: '0}} `endif; // If fully log interconnect is used, instantiate HWPE_WIDTH_FACT narrow ports for each HWPE. localparam int unsigned N_NARROW_HCI = diff --git a/target/verif/verif.mk b/target/verif/verif.mk index b94a3b4..93d8282 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -14,7 +14,7 @@ HCI_VERIF_CFG_GEN_DIR = $(HCI_VERIF_CFG_DIR)/generated include $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk include $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk ifeq (,$(filter clean%,$(MAKECMDGOALS))) --include $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk +-include $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk endif # Bender targets and defines @@ -50,7 +50,7 @@ $(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(VERIF_CFG_MK) $(STIM_SRC_ --workload_config $(WORKLOAD_JSON) \ --testbench_config $(TESTBENCH_JSON) \ --hardware_config $(HARDWARE_JSON) \ - --emit_phases_mk $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk + --emit_phases_mk $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk date > $@ .PHONY: clean-stim-verif @@ -87,7 +87,7 @@ $(HCI_VERIF_CFG_GEN_DIR): .PHONY: clean-config-verif clean-config-verif: - rm -f $(VERIF_CFG_MK) $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk + rm -f $(VERIF_CFG_MK) $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk ############## # Simulation # @@ -116,13 +116,15 @@ ifeq ($(GUI),0) SIM_HCI_VSIM_ARGS += -c endif -WAIT_MASKS_MK := $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk -WAIT_MASKS_PARAM = $(shell grep '^WAIT_MASKS_PARAM' $(WAIT_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) +FENCE_MASKS_MK := $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk +MAX_FENCES_PARAM = $(shell grep '^MAX_FENCES_PARAM' $(FENCE_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) +FENCE_MASKS_PARAM = $(shell grep '^FENCE_MASKS_PARAM' $(FENCE_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) 
+FENCE_REQ_LEVELS_PARAM = $(shell grep '^FENCE_REQ_LEVELS_PACKED_PARAM' $(FENCE_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) $(HCI_VERIF_DIR)/vsim/compile.tcl: $(HCI_ROOT)/Bender.lock $(HCI_ROOT)/Bender.yml $(HCI_ROOT)/bender.mk $(HCI_VERIF_DIR)/bender.mk $(SIM_SRC_FILES) $(VERIF_CFG_MK) $(SIMVECTORS_GEN_DIR)/.stim_stamp mkdir -p $(HCI_VERIF_DIR)/vsim $(BENDER) script vsim $(COMMON_DEFS) $(VERIF_DEFS) $(COMMON_TARGS) $(VERIF_TARGS) \ - --vlog-arg="$(SIM_HCI_VLOG_ARGS) \"+define+WAIT_MASKS_PARAM=$(WAIT_MASKS_PARAM)\"" > $@ + --vlog-arg="$(SIM_HCI_VLOG_ARGS) \"+define+MAX_FENCES_PARAM=$(MAX_FENCES_PARAM) +define+FENCE_MASKS_PARAM=$(FENCE_MASKS_PARAM) +define+FENCE_REQ_LEVELS_PARAM=$(FENCE_REQ_LEVELS_PARAM)\"" > $@ .PHONY: compile-verif compile-verif: $(sim_vsim_lib)/.hw_compiled diff --git a/target/verif/vsim/tb_hci.tcl b/target/verif/vsim/tb_hci.tcl index 72d5007..e0de90a 100644 --- a/target/verif/vsim/tb_hci.tcl +++ b/target/verif/vsim/tb_hci.tcl @@ -10,72 +10,146 @@ if {$GUI == 1} { set N_BANKS [examine -radix dec /tb_hci_pkg/N_BANKS] set N_LOG_MASTERS [examine -radix dec /tb_hci_pkg/N_LOG_MASTERS] + set N_DRIVERS [examine -radix dec /tb_hci_pkg/N_DRIVERS] set N_NARROW_HCI [examine -radix dec /tb_hci_pkg/N_NARROW_HCI] set N_WIDE_HCI [examine -radix dec /tb_hci_pkg/N_WIDE_HCI] set HWPE_WIDTH_FACT [examine -radix dec /tb_hci_pkg/HWPE_WIDTH_FACT] set INTERCO_TYPE [examine /tb_hci_pkg/INTERCO_TYPE] + set MAX_FENCES [examine -radix dec /tb_hci_pkg/MAX_FENCES] add wave -noupdate /tb_hci/clk add wave -noupdate /tb_hci/rst_n + add wave -noupdate -divider Interfaces + # ------------------------------------------------------------------------- # Application-driver interfaces - add wave -noupdate -group application_drivers -divider log_masters + # ------------------------------------------------------------------------- + add wave -noupdate -group driver_side -divider narrow_masters for {set i 0} {$i < $N_LOG_MASTERS} {incr i} { - add wave -noupdate -group application_drivers 
-group log_$i /tb_hci/hci_driver_log_if[$i]/* + add wave -noupdate -group driver_side -group log_$i /tb_hci/hci_driver_log_if[$i]/* } - add wave -noupdate -group application_drivers -divider hwpe_masters + add wave -noupdate -group driver_side -divider hwpe_masters for {set i 0} {$i < $N_HWPE} {incr i} { - add wave -noupdate -group application_drivers -group hwpe_$i /tb_hci/hci_driver_hwpe_if[$i]/* + add wave -noupdate -group driver_side -group hwpe_$i /tb_hci/hci_driver_hwpe_if[$i]/* } + # ------------------------------------------------------------------------- # Interconnect-side interfaces - add wave -noupdate -group hci_interfaces -divider narrow_cores + # ------------------------------------------------------------------------- + add wave -noupdate -group hci_initiator_side -divider narrow_cores for {set i 0} {$i < $N_CORE} {incr i} { - add wave -noupdate -group hci_interfaces -group core_$i /tb_hci/hci_initiator_narrow[$i]/* + add wave -noupdate -group hci_initiator_side -group core_$i /tb_hci/hci_initiator_narrow[$i]/* } if {[string first "LOG" $INTERCO_TYPE] != -1} { - add wave -noupdate -group hci_interfaces -divider narrow_hwpe_split + add wave -noupdate -group hci_initiator_side -divider narrow_hwpe_split for {set i 0} {$i < $N_HWPE} {incr i} { for {set f 0} {$f < $HWPE_WIDTH_FACT} {incr f} { set idx [expr {$N_CORE + $i * $HWPE_WIDTH_FACT + $f}] if {$idx < $N_NARROW_HCI} { - add wave -noupdate -group hci_interfaces -group hwpe_$i -group lane_$f /tb_hci/hci_initiator_narrow[$idx]/* + add wave -noupdate -group hci_initiator_side -group hwpe_$i -group lane_$f /tb_hci/hci_initiator_narrow[$idx]/* } } } } if {$N_WIDE_HCI > 0} { - add wave -noupdate -group hci_interfaces -divider wide_hwpe + add wave -noupdate -group hci_initiator_side -divider wide_hwpe for {set i 0} {$i < $N_WIDE_HCI} {incr i} { - add wave -noupdate -group hci_interfaces -group wide_$i /tb_hci/hci_initiator_wide[$i]/* + add wave -noupdate -group hci_initiator_side -group wide_$i 
/tb_hci/hci_initiator_wide[$i]/* } } - add wave -noupdate -group hci_interfaces -divider dma + add wave -noupdate -group hci_initiator_side -divider dma for {set i 0} {$i < $N_DMA} {incr i} { - add wave -noupdate -group hci_interfaces -group dma_$i /tb_hci/hci_initiator_dma[$i]/* + add wave -noupdate -group hci_initiator_side -group dma_$i /tb_hci/hci_initiator_dma[$i]/* } - add wave -noupdate -group hci_interfaces -divider ext + add wave -noupdate -group hci_initiator_side -divider ext for {set i 0} {$i < $N_EXT} {incr i} { - add wave -noupdate -group hci_interfaces -group ext_$i /tb_hci/hci_initiator_ext[$i]/* + add wave -noupdate -group hci_initiator_side -group ext_$i /tb_hci/hci_initiator_ext[$i]/* } + # ------------------------------------------------------------------------- # Memory slaves - add wave -noupdate -group memory_slaves + # ------------------------------------------------------------------------- + add wave -noupdate -group memory_targets for {set i 0} {$i < $N_BANKS} {incr i} { - add wave -noupdate -group memory_slaves -group bank_$i /tb_hci/hci_target_mems[$i]/* + add wave -noupdate -group memory_targets -group bank_$i /tb_hci/hci_target_mems[$i]/* } + add wave -noupdate -divider "Application drivers" + # ------------------------------------------------------------------------- + # Per-driver driver internals (req/resp FSM states, counters) + # ------------------------------------------------------------------------- + add wave -noupdate -group driver_internals -divider log_masters + for {set i 0} {$i < $N_LOG_MASTERS} {incr i} { + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/req_state_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/resp_state_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/tr_idx_q + add wave -noupdate -group driver_internals -group 
log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/n_req_issued_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/n_rd_req_issued_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/n_rd_resp_retired_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/fence_reached_o + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/end_resp_o + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/resume_i + } + + add wave -noupdate -group driver_internals -divider hwpe_masters + for {set i 0} {$i < $N_HWPE} {incr i} { + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/req_state_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/resp_state_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/tr_idx_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/n_req_issued_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/n_rd_req_issued_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/n_rd_resp_retired_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/fence_reached_o + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/end_resp_o + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/resume_i + } + + add wave -noupdate 
-divider Testbench + # ------------------------------------------------------------------------- + # Fence / synchronization signals + # ------------------------------------------------------------------------- + add wave -noupdate -group fence_sync /tb_hci/s_end_resp + add wave -noupdate -group fence_sync /tb_hci/s_fence_reached + add wave -noupdate -group fence_sync /tb_hci/s_resume + + for {set i 0} {$i < $N_DRIVERS} {incr i} { + add wave -noupdate -group fence_sync -group fence_idx /tb_hci/fence_idx[$i] + } + + # MUX sel (only present when INTERCO_TYPE == MUX) + if {[string first "MUX" $INTERCO_TYPE] != -1} { + add wave -noupdate -group fence_sync /tb_hci/gen_hwpe_mux/s_mux_sel + } + + + # ------------------------------------------------------------------------- # Metrics - add wave -noupdate -group metrics /tb_hci/s_end_req - add wave -noupdate -group metrics /tb_hci/s_end_resp + # ------------------------------------------------------------------------- add wave -noupdate -group metrics /tb_hci/s_issued_transactions add wave -noupdate -group metrics /tb_hci/s_issued_read_transactions - add wave -noupdate -group metrics /tb_hci/stim_latency add wave -noupdate -group metrics /tb_hci/tot_latency add wave -noupdate -group metrics /tb_hci/latency_per_master add wave -noupdate -group metrics /tb_hci/throughput_completed @@ -90,8 +164,10 @@ if {$GUI == 1} { add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE + # ------------------------------------------------------------------------- + # HCI control + # ------------------------------------------------------------------------- add wave -noupdate /tb_hci/s_clear - add wave -noupdate /tb_hci/s_clear_drv add wave -noupdate /tb_hci/s_hci_ctrl configure wave -signalnamewidth 1 From 414bfb5dddefb77e90fd92ef733da35bdbe9fed6 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 9 Mar 2026 20:31:19 +0100 Subject: [PATCH 19/25] ci: Adapt 
tests to new hw config in workloads --- regr/hardware/hci/hardware.json | 6 +++--- regr/hardware/log/hardware.json | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/regr/hardware/hci/hardware.json b/regr/hardware/hci/hardware.json index c8db224..b349a65 100644 --- a/regr/hardware/hci/hardware.json +++ b/regr/hardware/hci/hardware.json @@ -1,14 +1,14 @@ { "description": "Hardware configuration parameters for HCI interconnect", "parameters": { - "N_HWPE": 3, + "N_HWPE": 2, "HWPE_WIDTH_FACT": 8, "N_CORE": 8, "N_DMA": 0, "N_EXT": 1, "DATA_WIDTH": 32, - "TOT_MEM_SIZE": 128, - "N_BANKS": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, "INTERCO_TYPE": "HCI", "TS_BIT": 21, "EXPFIFO": 0, diff --git a/regr/hardware/log/hardware.json b/regr/hardware/log/hardware.json index 6a6a3bd..a505fa1 100644 --- a/regr/hardware/log/hardware.json +++ b/regr/hardware/log/hardware.json @@ -1,14 +1,14 @@ { "description": "Hardware configuration parameters for HCI interconnect", "parameters": { - "N_HWPE": 3, + "N_HWPE": 2, "HWPE_WIDTH_FACT": 8, "N_CORE": 8, "N_DMA": 0, "N_EXT": 1, "DATA_WIDTH": 32, - "TOT_MEM_SIZE": 128, - "N_BANKS": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, "INTERCO_TYPE": "LOG", "TS_BIT": 21, "EXPFIFO": 0, From afa2bfcb8b877aa2f436efcad23c8de0f99b7d4e Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Tue, 10 Mar 2026 16:10:41 +0100 Subject: [PATCH 20/25] verif: Generate HTML dataflow from stimvectors --- target/verif/config/hardware.json | 2 +- target/verif/config/workload.json | 574 +++-------- target/verif/config/workload.schema.json | 34 +- .../verif/simvectors/hci_stimuli/patterns.py | 3 + target/verif/simvectors/main.py | 945 ++++++++++++++++-- 5 files changed, 1038 insertions(+), 520 deletions(-) diff --git a/target/verif/config/hardware.json b/target/verif/config/hardware.json index b349a65..84c47a1 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -9,7 +9,7 @@ "DATA_WIDTH": 32, "TOT_MEM_SIZE": 256, 
"N_BANKS": 64, - "INTERCO_TYPE": "HCI", + "INTERCO_TYPE": "MUX", "TS_BIT": 21, "EXPFIFO": 0, "SEL_LIC": 0 diff --git a/target/verif/config/workload.json b/target/verif/config/workload.json index 6c0d227..683a709 100644 --- a/target/verif/config/workload.json +++ b/target/verif/config/workload.json @@ -1,506 +1,178 @@ { - "description": "MobileViT-inspired materialized attention kernel. d_model=192, num_heads=4, d_head=48, N=256 tokens, B=128 (2 tiles). One head at a time. Static resident: K^T_head (0x00000, 16kB), V_head (0x04000, 16kB). Ping-pong workspaces: Buf0={Q0 (0x08000,8kB), A0/P0 (0x0A000,64kB), O0 (0x1A000,8kB)}, Buf1={Q1 (0x1C000,8kB), A1/P1 (0x1E000,64kB), O1 (0x2E000,8kB)}. Pipeline: DMA preloads all tensors (dma_setup); GEMM computes Q*K^T->A (gemm_qk_t0); 8 cores softmax A->P in-place (softmax_t0); GEMM computes P*V->O (gemm_pv_t0); 8 cores norm O in-place (norm_t0); repeat for tile 1. Single GEMM engine serializes all 4 GEMM ops.", - + "description": "Simple 4-tile double-buffer GEMM. 
DMA uses linear transfers sized only by region_size_bytes.", "log_masters": [ - { - "id": 0, - "description": "Core 0", - "patterns": [ - { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", - "region_size_bytes": 8192 - }, - { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", - "region_size_bytes": 8192 - } - ] - }, + { "id": 0, "description": "Core 0 (idle)", "mem_access_type": "idle" }, + { "id": 1, "description": "Core 1 (idle)", "mem_access_type": "idle" }, + { "id": 2, "description": "Core 2 (idle)", "mem_access_type": "idle" }, + { "id": 3, "description": "Core 3 (idle)", "mem_access_type": "idle" }, + { "id": 4, "description": "Core 4 (idle)", "mem_access_type": "idle" }, + { "id": 5, "description": "Core 5 (idle)", "mem_access_type": "idle" }, + { "id": 6, "description": "Core 6 (idle)", "mem_access_type": "idle" }, + { "id": 7, "description": "Core 7 (idle)", "mem_access_type": "idle" }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ { 
"id": 1, - "description": "Core 1", - "patterns": [ - { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", - "region_size_bytes": 8192 - }, - { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", - "region_size_bytes": 8192 - } - ] - }, - { - "id": 2, - "description": "Core 2", - "patterns": [ - { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", - "region_size_bytes": 8192 - }, - { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": 
["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", - "region_size_bytes": 8192 - } - ] - }, - { - "id": 3, - "description": "Core 3", + "description": "DMA engine (linear setup + ping/pong prefetch)", "patterns": [ { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", - "region_size_bytes": 8192 - }, - { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", "region_size_bytes": 8192 - } - ] - }, - { - "id": 4, - "description": "Core 4", - "patterns": [ - { - 
"description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 }, { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", "region_size_bytes": 8192 }, { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", + "description": "Setup preload: tile A1 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A1_buf1", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", "region_size_bytes": 8192 - } - ] - }, - { - "id": 5, - "description": "Core 5", - "patterns": [ - { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 }, { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": 
"random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", + "description": "Prefetch tile A2 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A2_buf0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", "region_size_bytes": 8192 }, { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", + "description": "Read back C0 after GEMM tile A0", + "mem_access_type": "linear", + "job": "dma_store_C0_buf2", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", "region_size_bytes": 8192 - } - ] - }, - { - "id": 6, - "description": "Core 6", - "patterns": [ - { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 }, { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", + "description": "Prefetch tile A3 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A3_buf1", + "wait_for_jobs": ["gemm_A1"], 
+ "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", "region_size_bytes": 8192 }, { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", + "description": "Read back C1 after GEMM tile A1", + "mem_access_type": "linear", + "job": "dma_store_C1_buf3", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", "region_size_bytes": 8192 - } - ] - }, - { - "id": 7, - "description": "Core 7", - "patterns": [ - { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 }, { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", + "description": "Read back C0 after GEMM tile A2", + "mem_access_type": "linear", + "job": "dma_store_C2_buf2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", "region_size_bytes": 8192 }, { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - 
"traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", + "description": "Read back C1 after GEMM tile A3", + "mem_access_type": "linear", + "job": "dma_store_C3_buf3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", "region_size_bytes": 8192 } ] }, - { - "id": 8, - "description": "External 0 (idle)", - "mem_access_type": "idle" - } - ], - - "hwpe_masters": [ { "id": 0, - "description": "DMA: preload K^T_head, V_head, Q0, Q1 (all writes, one-shot)", + "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", "patterns": [ { - "description": "Preload K^T_head (0x00000, 12kB active / 16kB padded), V_head (0x04000), Q0 (0x08000), Q1 (0x1C000)", - "mem_access_type": "random", - "phase": "dma_setup", - "n_transactions": 9216, - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x00000", - "region_size_bytes": 122880 - } - ] - }, - { - "id": 1, - "description": "GEMM engine: 4 sequential ops (QK_t0, PV_t0, QK_t1, PV_t1)", - "patterns": [ - { - "description": "QK tile 0: read Q0 [128x48 int8], write A0 [128x256 int16]", + "description": "GEMM tile A0: A0 x B -> C0", "mem_access_type": "matmul_phased", - "phase": "gemm_qk_t0", - "wait_for": ["dma_setup"], - "matrix_m": 128, - "matrix_n": 256, - "matrix_k": 48, - "region_base_address_a": "0x08000", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", "region_size_bytes_a": 8192, - "matmul_ratio_a": 1, "region_base_address_b": "0x00000", - "region_size_bytes_b": 16384, - "matmul_ratio_b": 0, - 
"region_base_address_c": "0x0A000", - "region_size_bytes_c": 65536, - "matmul_ratio_c": 4 + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 }, { - "description": "PV tile 0: read P0 [128x256 int16], write O0 [128x48 int8]", + "description": "GEMM tile A1: A1 x B -> C1", "mem_access_type": "matmul_phased", - "phase": "gemm_pv_t0", - "wait_for": ["softmax_t0"], - "matrix_m": 128, - "matrix_n": 48, - "matrix_k": 256, - "region_base_address_a": "0x0A000", - "region_size_bytes_a": 65536, - "matmul_ratio_a": 4, - "region_base_address_b": "0x04000", - "region_size_bytes_b": 16384, - "matmul_ratio_b": 0, - "region_base_address_c": "0x1A000", - "region_size_bytes_c": 8192, - "matmul_ratio_c": 1 + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 }, { - "description": "QK tile 1: read Q1 [128x48 int8], write A1 [128x256 int16]", + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", "mem_access_type": "matmul_phased", - "phase": "gemm_qk_t1", - "wait_for": ["norm_t0"], - "matrix_m": 128, - "matrix_n": 256, - "matrix_k": 48, - "region_base_address_a": "0x1C000", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", "region_size_bytes_a": 8192, - "matmul_ratio_a": 1, "region_base_address_b": "0x00000", - "region_size_bytes_b": 16384, - "matmul_ratio_b": 0, - "region_base_address_c": "0x1E000", - "region_size_bytes_c": 65536, - "matmul_ratio_c": 4 + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 }, { - 
"description": "PV tile 1: read P1 [128x256 int16], write O1 [128x48 int8]", + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", "mem_access_type": "matmul_phased", - "phase": "gemm_pv_t1", - "wait_for": ["softmax_t1"], - "matrix_m": 128, - "matrix_n": 48, - "matrix_k": 256, - "region_base_address_a": "0x1E000", - "region_size_bytes_a": 65536, - "matmul_ratio_a": 4, - "region_base_address_b": "0x04000", - "region_size_bytes_b": 16384, - "matmul_ratio_b": 0, - "region_base_address_c": "0x2E000", - "region_size_bytes_c": 8192, - "matmul_ratio_c": 1 + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 } ] } diff --git a/target/verif/config/workload.schema.json b/target/verif/config/workload.schema.json index 2140e37..1037a0f 100644 --- a/target/verif/config/workload.schema.json +++ b/target/verif/config/workload.schema.json @@ -41,22 +41,22 @@ "enum": ["idle", "random", "linear", "2d", "3d", "matmul_phased", "matmul"], "description": "Access pattern selector. 'matmul' is an alias for 'matmul_phased'." }, - "phase": { + "job": { "type": "string", "default": "default", - "description": "Phase name for this pattern segment. Used by other patterns' wait_for to reference this segment. The master's phase identity is determined by its first pattern's phase." + "description": "Job name for this pattern segment. Used by other patterns' wait_for_jobs to reference this segment." }, - "wait_for": { + "wait_for_jobs": { "type": "array", "items": { "type": "string" }, "default": [], - "description": "Phase names this pattern waits for before starting. 
Generates a PAUSE fence before this pattern segment and holds the driver until all referenced phases have advanced past the same fence level." + "description": "Job names this pattern waits for before starting. Generates a PAUSE fence before this pattern segment and holds the driver until all referenced jobs have advanced past the same fence level." }, "n_transactions": { "type": "integer", "minimum": 0, - "description": "Number of real memory accesses (idles are NOT counted). Mandatory for 'random'. For other patterns can be derived from geometry fields." + "description": "Number of real memory accesses (idles are NOT counted). Mandatory for 'random'. For other patterns can be derived from geometry or from region_size_bytes (transaction-width based)." }, "region_base_address": { @@ -64,14 +64,14 @@ { "type": "integer", "minimum": 0 }, { "type": "string", "description": "Decimal, hex (0x...) or binary string." } ], - "description": "[random, matmul_phased] Base byte address of the memory region." + "description": "[random, linear, matmul_phased] Base byte address of the memory region." }, "region_size_bytes": { "oneOf": [ { "type": "integer", "minimum": 0 }, { "type": "string", "description": "Decimal, hex (0x...) or binary string." } ], - "description": "[random, matmul_phased] Size in bytes of the memory region." + "description": "[random, linear, matmul_phased] Size in bytes of the memory region. For linear/matmul_phased, if n_transactions is omitted this can be used to derive it from transaction width." }, "start_address": { @@ -181,7 +181,7 @@ "minimum": 1, "maximum": 100, "default": 100, - "description": "[random, linear] Bus utilization percentage." + "description": "[random, linear, matmul_phased] Modeled request utilization percentage (adds req=0 idles after each request when <100)." 
}, "traffic_read_pct": { "type": "integer", @@ -204,7 +204,13 @@ }, { "if": { "properties": { "mem_access_type": { "const": "linear" } }, "required": ["mem_access_type"] }, - "then": { "anyOf": [{ "required": ["n_transactions"] }, { "required": ["length"] }] } + "then": { + "anyOf": [ + { "required": ["n_transactions"] }, + { "required": ["length"] }, + { "required": ["region_size_bytes"] } + ] + } }, { "if": { "properties": { "mem_access_type": { "const": "2d" } }, "required": ["mem_access_type"] }, @@ -220,7 +226,11 @@ "required": ["mem_access_type"] }, "then": { - "anyOf": [{ "required": ["n_transactions"] }, { "required": ["matrix_m", "matrix_n", "matrix_k"] }] + "anyOf": [ + { "required": ["n_transactions"] }, + { "required": ["region_size_bytes"] }, + { "required": ["matrix_m", "matrix_n", "matrix_k"] } + ] } } ] @@ -228,7 +238,7 @@ "master": { "type": "object", - "description": "Per-master configuration. A master either has a flat single-pattern config (legacy, backward compatible) or a 'patterns' list for multi-phase execution.", + "description": "Per-master configuration. A master either has a flat single-pattern config or a 'patterns' list for multi-job execution.", "unevaluatedProperties": false, "properties": { "id": { @@ -248,7 +258,7 @@ "patterns": { "type": "array", "minItems": 1, - "description": "Ordered list of pattern segments. Executed sequentially; a PAUSE fence token is inserted between each pair of consecutive patterns. The wait_for of pattern f defines which phases must have advanced past fence f before this master resumes. If omitted, the master config itself is treated as a single flat pattern (legacy format).", + "description": "Ordered list of pattern segments. Executed sequentially; a PAUSE fence token is inserted between each pair of consecutive patterns. The wait_for_jobs of pattern f defines which jobs must have advanced past fence f before this master resumes. 
If omitted, the master config itself is treated as a single flat pattern.", "items": { "$ref": "#/$defs/pattern" } } }, diff --git a/target/verif/simvectors/hci_stimuli/patterns.py b/target/verif/simvectors/hci_stimuli/patterns.py index 063c008..afc6633 100644 --- a/target/verif/simvectors/hci_stimuli/patterns.py +++ b/target/verif/simvectors/hci_stimuli/patterns.py @@ -186,6 +186,7 @@ def idle_gen(self, id_start, append=False): def matmul_phased_gen(self, id_start, forbidden_read, forbidden_write, region_base_address, region_size_bytes, matmul_ratio_a=1, matmul_ratio_b=1, matmul_ratio_c=1, + traffic_pct=100, idle_cycles_between_phases=0, region_base_address_a=None, region_size_bytes_a=None, region_base_address_b=None, region_size_bytes_b=None, @@ -193,6 +194,7 @@ def matmul_phased_gen(self, id_start, forbidden_read, forbidden_write, append=False): id_value = id_start; fr_new, fw_new = [], [] ab = max(1, self.DATA_WIDTH // 8); tm = int(self.TOT_MEM_SIZE * 1024) + n_idles = self._idles_per_req(traffic_pct) def _res(bo, so, fb, fs): b = self._align_down(int(bo if bo is not None else fb), ab) @@ -228,6 +230,7 @@ def _emit(fobj, count, wen, pb, pe): self._record_access(add, wen, fr_new, fw_new) id_value += 1; addr += ab if addr >= pe: addr = pb + for _ in range(n_idles): self._write_idle(fobj) with self._open(append) as f: _emit(f, ca, 1, a_base, a_base+a_size) diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index c6a0eae..0cd3c12 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -11,6 +11,7 @@ import json import math import sys +import html from pathlib import Path import argparse @@ -71,6 +72,10 @@ def main(argv=None): N_EXT = hw_params['N_EXT'] N_HWPE = hw_params['N_HWPE'] HWPE_WIDTH_FACT = hw_params['HWPE_WIDTH_FACT'] + N_CORE_CFG = N_CORE + N_DMA_CFG = N_DMA + N_EXT_CFG = N_EXT + N_HWPE_CFG = N_HWPE log_masters = workload_config['log_masters'] hwpe_masters = workload_config['hwpe_masters'] @@ -78,8 
+83,21 @@ def main(argv=None): # Derived parameters ADD_WIDTH = math.ceil(math.log2(TOT_MEM_SIZE * 1000)) N_LOG = N_CORE + N_DMA + N_EXT + N_LOG_CFG = N_LOG IW = 8 + def _narrow_driver_name(local_idx: int) -> str: + idx = int(local_idx) + if idx < N_CORE_CFG: + return f"core_{idx}" + idx -= N_CORE_CFG + if idx < N_DMA_CFG: + return f"dma_{idx}" + idx -= N_DMA_CFG + if idx < N_EXT_CFG: + return f"ext_{idx}" + return f"narrow_{local_idx}" + # Validations if len(log_masters) != N_LOG: print(f"ERROR: Number of log masters in workload config ({len(log_masters)}) doesn't match hardware config N_LOG ({N_LOG})") @@ -145,7 +163,13 @@ def _bank_of(byte_addr): def _record_memory_map(kind, local_idx, description, config, n_test, data_width, access_bytes, region_base, region_size, start_address, stride0, len_d0, stride1, len_d1, stride2, master_config, total_mem_bytes): - label = f"{kind}_{local_idx}" + (f" ({description})" if description else "") + if kind == 'master_log': + label_prefix = _narrow_driver_name(local_idx) + elif kind == 'master_hwpe': + label_prefix = f"hwpe_{local_idx}" + else: + label_prefix = f"{kind}_{local_idx}" + label = label_prefix + (f" ({description})" if description else "") if config == 'idle' or n_test == 0: memory_map_entries.append({'label': label, 'pattern': config, 'n': 0, 'info': 'idle - no memory accesses'}) @@ -199,6 +223,10 @@ def _record_memory_map(kind, local_idx, description, config, n_test, data_width, if all(k in master_config for k in ('matrix_m', 'matrix_n', 'matrix_k')): m, n, k = int(master_config['matrix_m']), int(master_config['matrix_n']), int(master_config['matrix_k']) detail['matrix_dims'] = f"M={m} N={n} K={k} (A: {m}x{k}, B: {k}x{n}, C: {m}x{n})" + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" idle_between = 
master_config.get('idle_cycles_between_phases', 0) if idle_between: detail['idle_between_phases'] = f"{idle_between} cycles" @@ -283,6 +311,19 @@ def _normalize_mem_access_type(raw_value, master_name): sys.exit(1) return key + def _pattern_job_name(pattern_config: dict) -> str: + """Resolve job name from the mandatory 'job' key.""" + return str(pattern_config.get('job', 'default')) + + def _pattern_wait_for_jobs(pattern_config): + """Resolve dependency list from the mandatory 'wait_for_jobs' key.""" + raw = pattern_config.get('wait_for_jobs', []) + if raw is None: + return [] + if isinstance(raw, list): + return [str(x) for x in raw] + return [str(raw)] + def _warn_if_id_mismatch(master_cfg, expected_idx, master_name): raw_id = master_cfg.get("id", expected_idx) try: @@ -300,10 +341,16 @@ def _resolve_n_transactions(master_config: dict, mem_access_type: str, data_widt """Resolve n_transactions from explicit field or geometry, depending on pattern.""" if 'n_transactions' in master_config: return int(master_config['n_transactions']) + access_bytes = max(1, int(data_width // 8)) if mem_access_type == 'linear': length = master_config.get('length') if length is not None: return int(length) + raw_region_size = master_config.get('region_size_bytes') + if raw_region_size is not None: + region_size = _parse_maybe_bin_int(raw_region_size, None) + if region_size is not None: + return max(0, int(region_size) // access_bytes) elif mem_access_type == '2d': len_d0 = master_config.get('len_d0') len_d1 = master_config.get('len_d1') @@ -316,6 +363,13 @@ def _resolve_n_transactions(master_config: dict, mem_access_type: str, data_widt if len_d0 is not None and len_d1 is not None and len_d2 is not None: return int(len_d0) * int(len_d1) * int(len_d2) elif mem_access_type == 'matmul_phased': + # For non-random region-based traffic, allow deriving transactions + # from region size and transaction width. 
+ raw_region_size = master_config.get('region_size_bytes') + if raw_region_size is not None: + region_size = _parse_maybe_bin_int(raw_region_size, None) + if region_size is not None: + return max(0, int(region_size) // access_bytes) m = master_config.get('matrix_m') n = master_config.get('matrix_n') k = master_config.get('matrix_k') @@ -356,8 +410,19 @@ def _generate_pattern( pattern_config['mem_access_type'], f"{kind}_{master_local_idx}", ) - start_address = str(pattern_config.get('start_address', '0')) - stride0 = int(pattern_config.get('stride0', 0)) + if 'start_address' in pattern_config: + start_address = str(pattern_config['start_address']) + elif config == 'linear' and 'region_base_address' in pattern_config: + start_address = str(pattern_config['region_base_address']) + else: + start_address = '0' + + if 'stride0' in pattern_config: + stride0 = int(pattern_config['stride0']) + elif config == 'linear' and 'region_size_bytes' in pattern_config: + stride0 = 1 + else: + stride0 = 0 len_d0 = int(pattern_config.get('len_d0', 0)) stride1 = int(pattern_config.get('stride1', 0)) len_d1 = int(pattern_config.get('len_d1', 0)) @@ -369,7 +434,19 @@ def _generate_pattern( default_region_base = master_local_idx * default_region_size region_base = _parse_maybe_bin_int(pattern_config.get('region_base_address'), default_region_base) - region_size = _parse_maybe_bin_int(pattern_config.get('region_size_bytes'), default_region_size) + + # For non-random region-based patterns, if n_transactions is provided but + # region_size_bytes is omitted, span a full non-wrapping region that can + # hold all transactions once at the current transaction width. 
+ region_size_input = pattern_config.get('region_size_bytes') + if ( + config in {'linear', 'matmul_phased'} + and region_size_input is None + and 'n_transactions' in pattern_config + ): + region_size_input = int(pattern_config['n_transactions']) * access_bytes + + region_size = _parse_maybe_bin_int(region_size_input, default_region_size) region_base = (region_base // access_bytes) * access_bytes if region_base >= total_mem_bytes: @@ -382,13 +459,18 @@ def _generate_pattern( n_test = _resolve_n_transactions(pattern_config, config, data_width, kind, master_local_idx) master.N_TEST = n_test + # Keep forbidden-address filtering local to this pattern invocation. + # This allows intended buffer reuse across phases (e.g. double buffering) + # while still avoiding duplicates inside a single pattern generator call. + forbidden_read_local = list(LIST_OF_FORBIDDEN_ADDRESSES_READ) + forbidden_write_local = list(LIST_OF_FORBIDDEN_ADDRESSES_WRITE) if config == 'random': tpct = pattern_config.get('traffic_pct') next_start_id = master.random_gen( next_start_id, - LIST_OF_FORBIDDEN_ADDRESSES_READ, - LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + forbidden_read_local, + forbidden_write_local, region_base=region_base, region_size=region_size, traffic_pct=int(tpct) if tpct is not None else 100, @@ -399,8 +481,8 @@ def _generate_pattern( tpct = pattern_config.get('traffic_pct') next_start_id = master.linear_gen( stride0, start_address, next_start_id, - LIST_OF_FORBIDDEN_ADDRESSES_READ, - LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + forbidden_read_local, + forbidden_write_local, traffic_pct=int(tpct) if tpct is not None else 100, traffic_read_pct=pattern_config.get('traffic_read_pct'), append=append, @@ -408,16 +490,16 @@ def _generate_pattern( elif config == '2d': next_start_id = master.gen_2d( stride0, len_d0, stride1, start_address, next_start_id, - LIST_OF_FORBIDDEN_ADDRESSES_READ, - LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + forbidden_read_local, + forbidden_write_local, 
idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), append=append, ) elif config == '3d': next_start_id = master.gen_3d( stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, - LIST_OF_FORBIDDEN_ADDRESSES_READ, - LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + forbidden_read_local, + forbidden_write_local, idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), append=append, ) @@ -438,13 +520,14 @@ def _generate_pattern( sys.exit(1) next_start_id = master.matmul_phased_gen( next_start_id, - LIST_OF_FORBIDDEN_ADDRESSES_READ, - LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + forbidden_read_local, + forbidden_write_local, region_base, region_size, int(pattern_config.get('matmul_ratio_a', 1)), int(pattern_config.get('matmul_ratio_b', 1)), int(pattern_config.get('matmul_ratio_c', 1)), + traffic_pct=int(pattern_config.get('traffic_pct', 100)), idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), region_base_address_a=_parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None), region_size_bytes_a=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None), @@ -492,7 +575,7 @@ def _generate_master( if start_delay > 0: pending_start_delays.append((filepath, start_delay, data_width)) - # For each pattern with wait_for, prepend a synthetic idle+PAUSE that acts as + # For each pattern with wait_for_jobs, prepend a synthetic idle+PAUSE that acts as # the blocking fence. The pattern's own trailing PAUSE is always mask=0 (free # pass), so fence_idx advances immediately after the real work is done. 
# This separates "I am done" (trailing PAUSE, free) from "I may start" (idle @@ -500,7 +583,7 @@ def _generate_master( dw = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH first_written = False for p_idx, pattern_config in enumerate(patterns): - if pattern_config.get('wait_for'): + if _pattern_wait_for_jobs(pattern_config): # Synthetic idle+PAUSE gates this pattern _idle = StimuliGenerator(IW, DATA_WIDTH, N_BANKS, TOT_MEM_SIZE, dw, ADD_WIDTH, str(filepath), 0, master_global_idx) @@ -575,11 +658,11 @@ def _generate_master( # resume from that PAUSE. # # For a master with N patterns, there are N fence slots (slot 0 = before - # pattern 0, slot f = before pattern f). The wait_for of pattern f defines + # pattern 0, slot f = before pattern f). The wait_for_jobs of pattern f defines # the mask at fence slot f. # # Legacy flat masters (no 'patterns' key) are treated as single-pattern - # masters: one fence slot (slot 0) from the top-level wait_for field. + # masters: one fence slot (slot 0) from the top-level wait_for_jobs field. # ----------------------------------------------------------------------- N_DRIVERS = N_LOG + N_HWPE @@ -589,30 +672,30 @@ def _patterns_of(master_config): return master_config['patterns'] return [master_config] - # Build phase->driver map: every pattern of every driver registers its phase. - # This allows wait_for to reference any phase, not just first patterns. - # A phase may be associated with multiple drivers (e.g. 8 cores all in softmax_t0). - phase_to_drivers: dict[str, list[int]] = {} + # Build job->driver map: every pattern of every driver registers its job. + # This allows wait_for_jobs to reference any job, not just first patterns. + # A job may be associated with multiple drivers (e.g. 8 cores all in softmax_t0). 
+ job_to_drivers = {} all_masters = [(m, False) for m in log_masters] + [(m, True) for m in hwpe_masters] for i, (m, _) in enumerate(all_masters): for pat in _patterns_of(m): - phase = str(pat.get('phase', 'default')) - if i not in phase_to_drivers.get(phase, []): - phase_to_drivers.setdefault(phase, []).append(i) - - # Build phase->pattern_index map: for each phase, which pattern index within - # each driver corresponds to that phase. Used to compute FENCE_REQ_LEVELS. - # phase_pattern_idx[phase][driver] = pattern index of that phase in that driver - phase_pattern_idx: dict[str, dict[int, int]] = {} + job = _pattern_job_name(pat) + if i not in job_to_drivers.get(job, []): + job_to_drivers.setdefault(job, []).append(i) + + # Build job->pattern_index map: for each job, which pattern index within + # each driver corresponds to that job. Used to compute FENCE_REQ_LEVELS. + # job_pattern_idx[job][driver] = pattern index of that job in that driver + job_pattern_idx = {} for i, (m, _) in enumerate(all_masters): for p_idx, pat in enumerate(_patterns_of(m)): - phase = str(pat.get('phase', 'default')) - phase_pattern_idx.setdefault(phase, {})[i] = p_idx + job = _pattern_job_name(pat) + job_pattern_idx.setdefault(job, {})[i] = p_idx - def _resolve_wait_mask(wait_for_list): + def _resolve_wait_mask(wait_for_jobs_list): mask = 0 - for dep_phase in wait_for_list: - for dep_drv in phase_to_drivers.get(str(dep_phase), []): + for dep_job in wait_for_jobs_list: + for dep_drv in job_to_drivers.get(str(dep_job), []): mask |= (1 << dep_drv) return mask @@ -621,38 +704,39 @@ def _resolve_wait_mask(wait_for_list): # including the trailing PAUSE of pattern p. 
def _fence_idx_after_pattern(drv_idx, pat_idx): pats = _patterns_of(all_masters[drv_idx][0]) - # Count synthetic idle gates for patterns 0..pat_idx (those with wait_for) - n_gates = sum(1 for k in range(pat_idx + 1) if pats[k].get('wait_for')) + # Count synthetic idle gates for patterns 0..pat_idx (those with wait_for_jobs) + n_gates = sum(1 for k in range(pat_idx + 1) if _pattern_wait_for_jobs(pats[k])) # Plus trailing PAUSEs for patterns 0..pat_idx n_trailing = pat_idx + 1 return n_gates + n_trailing - def _resolve_req_levels(wait_for_list): + def _resolve_req_levels(wait_for_jobs_list): """Required fence_idx[j] = fence_idx value of j after finishing pattern p_j.""" levels = [0] * N_DRIVERS - for dep_phase in wait_for_list: - for dep_drv in phase_to_drivers.get(str(dep_phase), []): - p_idx = phase_pattern_idx.get(str(dep_phase), {}).get(dep_drv, 0) + for dep_job in wait_for_jobs_list: + for dep_drv in job_to_drivers.get(str(dep_job), []): + p_idx = job_pattern_idx.get(str(dep_job), {}).get(dep_drv, 0) levels[dep_drv] = _fence_idx_after_pattern(dep_drv, p_idx) return levels # Build per-driver fence mask and req_level lists. - # Each pattern with wait_for gets a synthetic idle gate (mask = wait_for) before it. + # Each pattern with wait_for_jobs gets a synthetic idle gate (mask = wait_for_jobs) before it. # Trailing PAUSEs always have mask=0 (free pass — just advance fence_idx). 
# Fences are enumerated in file order: for each pattern p: - # if p has wait_for: synthetic idle fence (mask = wait_for of p) + # if p has wait_for_jobs: synthetic idle fence (mask = wait_for_jobs of p) # trailing PAUSE fence (mask = 0) - fence_masks: list[list[int]] = [] - req_levels: list[list[list[int]]] = [] + fence_masks = [] + req_levels = [] for i, (m, _) in enumerate(all_masters): patterns = _patterns_of(m) per_masks = [] per_levels = [] for pat in patterns: - if pat.get('wait_for'): + wait_for_jobs = _pattern_wait_for_jobs(pat) + if wait_for_jobs: # Synthetic idle gate: blocking fence - per_masks.append(_resolve_wait_mask(pat['wait_for'])) - per_levels.append(_resolve_req_levels(pat['wait_for'])) + per_masks.append(_resolve_wait_mask(wait_for_jobs)) + per_levels.append(_resolve_req_levels(wait_for_jobs)) # Trailing PAUSE: free pass, just signals completion per_masks.append(0) per_levels.append([0] * N_DRIVERS) @@ -699,7 +783,7 @@ def _resolve_req_levels(wait_for_list): phases_mk_path.write_text( "# Auto-generated by main.py - DO NOT EDIT MANUALLY\n" "# Per-driver per-fence dependency data for tb_hci.sv.\n" - f"# Drivers 0..{N_LOG-1} = log masters, {N_LOG}..{N_DRIVERS-1} = HWPE masters.\n" + f"# Drivers 0..{N_LOG-1} = narrow masters (core/dma/ext), {N_LOG}..{N_DRIVERS-1} = HWPE masters.\n" f"# fence f = PAUSE after pattern f; fence_idx[i]==k means i completed k patterns.\n" f"# FENCE_MASKS[i][f][j]=1: j is a dependency of i at fence f.\n" f"# FENCE_REQ_LEVELS_PACKED[i][f]: packed {N_DRIVERS*4}-bit vector, bits [j*4+3:j*4] = min fence_idx[j].\n" @@ -713,6 +797,337 @@ def _resolve_req_levels(wait_for_list): # ----------------------------------------------------------------------- # Build and emit memory map report # ----------------------------------------------------------------------- + INTERCO_TYPE = str(hw_params.get('INTERCO_TYPE', 'HCI')).strip().upper() + if INTERCO_TYPE not in {"LOG", "MUX", "HCI"}: + INTERCO_TYPE = "HCI" + DW_NARROW = int(DATA_WIDTH) 
+ DW_WIDE = int(HWPE_WIDTH_FACT * DATA_WIDTH) + N_NARROW_HCI_CFG = int( + N_CORE_CFG + N_DMA_CFG + N_EXT_CFG + + (N_HWPE_CFG * HWPE_WIDTH_FACT if INTERCO_TYPE == "LOG" else 0) + ) + N_WIDE_HCI_CFG = int(N_HWPE_CFG if INTERCO_TYPE == "HCI" else (1 if INTERCO_TYPE == "MUX" else 0)) + N_MASTER_PORTS_CFG = int(N_NARROW_HCI_CFG + N_WIDE_HCI_CFG) + + def _driver_name(driver_idx): + if driver_idx < N_LOG: + return _narrow_driver_name(driver_idx) + return f"hwpe_{driver_idx - N_LOG}" + + def _resolve_regions(pattern_config, mem_access_type, is_hwpe, local_idx, n_peers): + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + access_bytes = max(1, int(data_width // 8)) + total_mem_bytes = int(TOT_MEM_SIZE * 1024) + default_region_size = total_mem_bytes // max(1, n_peers) + default_region_base = local_idx * default_region_size + + region_base = _parse_maybe_bin_int(pattern_config.get('region_base_address'), default_region_base) + region_size = _parse_maybe_bin_int(pattern_config.get('region_size_bytes'), default_region_size) + + region_base = (region_base // access_bytes) * access_bytes + if region_base >= total_mem_bytes: + region_base = region_base % total_mem_bytes + region_size = (max(0, region_size) // access_bytes) * access_bytes + if region_size <= 0: + region_size = (default_region_size // access_bytes) * access_bytes + if region_base + region_size > total_mem_bytes: + region_size = ((total_mem_bytes - region_base) // access_bytes) * access_bytes + + if region_size <= 0: + return [] + + if mem_access_type == 'idle': + return [] + + if mem_access_type != 'matmul_phased': + return [{ + 'label': 'region', + 'base': region_base, + 'size': region_size, + 'end': region_base + region_size - 1, + }] + + ra = _parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None) + sb = 
_parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None) + + regions = [] + if ra is not None and sa is not None: + sub_defs = [ + ('A(read)', ra, sa), + ('B(read)', rb if rb is not None else ra, sb if sb is not None else sa), + ('C(write)', rc if rc is not None else ra, sc if sc is not None else sa), + ] + for label, base_raw, size_raw in sub_defs: + base = (int(base_raw) // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, int(size_raw)) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size > 0: + regions.append({ + 'label': label, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + + n_words = region_size // access_bytes + if n_words < 3: + return [{ + 'label': 'region', + 'base': region_base, + 'size': region_size, + 'end': region_base + region_size - 1, + }] + a_words = max(1, n_words // 3) + b_words = max(1, n_words // 3) + c_words = n_words - a_words - b_words + sub_regions = [ + ('A(read)', region_base, a_words * access_bytes), + ('B(read)', region_base + a_words * access_bytes, b_words * access_bytes), + ('C(write)', region_base + (a_words + b_words) * access_bytes, c_words * access_bytes), + ] + for label, base, size in sub_regions: + if size <= 0: + continue + regions.append({ + 'label': label, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + + def _estimate_pattern_cycles(pattern_config, mem_access_type, n_test): + # Intentionally simple temporal model: + # - one time unit per transaction, plus optional req=0 idles from traffic shaping + # - no interconnect stall/conflict modeling + # - explicit start_delay_cycles is still honored separately + base = max(0, int(n_test)) + tpct = 
pattern_config.get('traffic_pct') + n_idles_per_req = 0 + if tpct is not None: + tp = max(1, min(100, int(tpct))) + n_idles_per_req = 0 if tp >= 100 else int(round((100 - tp) / tp)) + cycles = base * (1 + n_idles_per_req) + # Keep explicit matmul phase-boundary idle modeling. + if mem_access_type == 'matmul_phased': + idle_between = int(pattern_config.get('idle_cycles_between_phases', 0)) + if idle_between > 0: + ra = max(0, int(pattern_config.get('matmul_ratio_a', 1))) + rb = max(0, int(pattern_config.get('matmul_ratio_b', 1))) + rc = max(0, int(pattern_config.get('matmul_ratio_c', 1))) + if ra == 0 and rb == 0 and rc == 0: + ra, rb, rc = 1, 1, 1 + s = ra + rb + rc + ca = (base * ra) // s + cb = (base * rb) // s + cc = base - ca - cb + phase_gaps = 0 + if ca > 0 and (cb > 0 or cc > 0): + phase_gaps += 1 + if cb > 0 and cc > 0: + phase_gaps += 1 + cycles += idle_between * phase_gaps + return int(cycles) + + pattern_nodes = [] + node_idx_by_driver_pattern = {} + job_to_nodes = {} + driver_last_node = {} + + for drv_idx, (master_cfg, is_hwpe) in enumerate(all_masters): + patterns = _patterns_of(master_cfg) + local_idx = drv_idx - N_LOG if is_hwpe else drv_idx + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + kind = 'master_hwpe' if is_hwpe else 'master_log' + n_peers = max(1, N_HWPE if is_hwpe else N_LOG) + start_delay = int(master_cfg.get('start_delay_cycles', 0)) + for p_idx, pat in enumerate(patterns): + raw_type = pat.get('mem_access_type', 'idle') + mem_access_type = _normalize_mem_access_type(raw_type, f"{kind}_{local_idx}") + n_test = _resolve_n_transactions(pat, mem_access_type, data_width, kind, local_idx) + declared_wait_for_jobs = _pattern_wait_for_jobs(pat) + # Timeline view follows declared dependencies from workload.json. 
+ effective_wait_for_jobs = declared_wait_for_jobs + node = { + 'node_idx': len(pattern_nodes), + 'driver_idx': drv_idx, + 'driver_name': _driver_name(drv_idx), + 'is_hwpe': is_hwpe, + 'local_idx': local_idx, + 'pattern_idx': p_idx, + 'description': str(pat.get('description', '')).strip(), + 'job': _pattern_job_name(pat), + 'wait_for_jobs_declared': declared_wait_for_jobs, + 'wait_for_jobs_effective': effective_wait_for_jobs, + 'n_transactions': int(n_test), + 'cycles': int(_estimate_pattern_cycles(pat, mem_access_type, n_test)), + 'mem_access_type': mem_access_type, + 'traffic_read_pct': pat.get('traffic_read_pct'), + 'txn_bytes': int(data_width // 8), + 'start_delay': start_delay if p_idx == 0 else 0, + 'regions': _resolve_regions(pat, mem_access_type, is_hwpe, local_idx, n_peers), + } + pattern_nodes.append(node) + node_idx_by_driver_pattern[(drv_idx, p_idx)] = node['node_idx'] + job_to_nodes.setdefault(node['job'], []).append(node['node_idx']) + driver_last_node[drv_idx] = node['node_idx'] + + n_nodes = len(pattern_nodes) + preds = [set() for _ in range(n_nodes)] + succs = [set() for _ in range(n_nodes)] + mux_serialization_applied = False + mux_phase_order = [] + + def _add_edge(src, dst): + if src == dst or src < 0 or dst < 0: + return + if src not in preds[dst]: + preds[dst].add(src) + succs[src].add(dst) + + for node in pattern_nodes: + n_idx = node['node_idx'] + drv_idx = node['driver_idx'] + p_idx = node['pattern_idx'] + if p_idx > 0: + _add_edge(node_idx_by_driver_pattern[(drv_idx, p_idx - 1)], n_idx) + for dep_job in node['wait_for_jobs_effective']: + for dep_idx in job_to_nodes.get(dep_job, []): + _add_edge(dep_idx, n_idx) + + if INTERCO_TYPE == "MUX": + # Match tb_hci MUX semantics in the temporal model: + # serialize HWPE execution by job order, and by HWPE ID within a job. 
+ hwpe_nodes = [n for n in pattern_nodes if n['is_hwpe']] + if hwpe_nodes: + job_first_seen = {} + for n in sorted(hwpe_nodes, key=lambda x: (x['pattern_idx'], x['local_idx'], x['node_idx'])): + job_first_seen.setdefault(n['job'], len(job_first_seen)) + + job_preds = {jb: set() for jb in job_first_seen} + job_succs = {jb: set() for jb in job_first_seen} + for n in hwpe_nodes: + cur = n['job'] + for dep_job in n['wait_for_jobs_effective']: + dep = str(dep_job) + if dep in job_first_seen and dep != cur: + job_preds[cur].add(dep) + job_succs[dep].add(cur) + + phase_indeg = {jb: len(job_preds[jb]) for jb in job_first_seen} + phase_ready = sorted([jb for jb, deg in phase_indeg.items() if deg == 0], + key=lambda jb: job_first_seen[jb]) + mux_phase_order = [] + while phase_ready: + cur = phase_ready.pop(0) + mux_phase_order.append(cur) + for nxt in sorted(job_succs[cur], key=lambda jb: job_first_seen[jb]): + phase_indeg[nxt] -= 1 + if phase_indeg[nxt] == 0: + phase_ready.append(nxt) + phase_ready.sort(key=lambda jb: job_first_seen[jb]) + if len(mux_phase_order) != len(job_first_seen): + mux_phase_order = sorted(job_first_seen.keys(), key=lambda jb: job_first_seen[jb]) + + phase_rank = {ph: i for i, ph in enumerate(mux_phase_order)} + hwpe_sorted = sorted( + hwpe_nodes, + key=lambda n: ( + phase_rank.get(n['job'], 10 ** 9), + n['local_idx'], + n['pattern_idx'], + n['node_idx'], + ), + ) + for i in range(1, len(hwpe_sorted)): + _add_edge(hwpe_sorted[i - 1]['node_idx'], hwpe_sorted[i]['node_idx']) + mux_serialization_applied = True + + indeg = [len(preds[i]) for i in range(n_nodes)] + ready = [i for i, d in enumerate(indeg) if d == 0] + ready.sort(key=lambda i: (pattern_nodes[i]['driver_idx'], pattern_nodes[i]['pattern_idx'])) + topo_order = [] + while ready: + cur = ready.pop(0) + topo_order.append(cur) + for nxt in sorted(succs[cur]): + indeg[nxt] -= 1 + if indeg[nxt] == 0: + ready.append(nxt) + ready.sort(key=lambda i: (pattern_nodes[i]['driver_idx'], 
pattern_nodes[i]['pattern_idx'])) + + schedule_has_cycle = len(topo_order) != n_nodes + if schedule_has_cycle: + topo_order = list(range(n_nodes)) + + node_start = [0 for _ in range(n_nodes)] + node_end = [0 for _ in range(n_nodes)] + for _ in range(max(1, n_nodes + 1)): + changed = False + for n_idx in topo_order: + dep_end = max((node_end[p] for p in preds[n_idx]), default=0) + start_time = max(int(pattern_nodes[n_idx]['start_delay']), dep_end) + end_time = start_time + max(0, int(pattern_nodes[n_idx]['cycles'])) + if start_time != node_start[n_idx] or end_time != node_end[n_idx]: + node_start[n_idx] = start_time + node_end[n_idx] = end_time + changed = True + if not changed: + break + + for n_idx, node in enumerate(pattern_nodes): + node['start_cycle'] = int(node_start[n_idx]) + node['end_cycle'] = int(node_end[n_idx]) + + total_cycles = max((n['end_cycle'] for n in pattern_nodes), default=0) + + driver_windows = {} + for node in pattern_nodes: + w = driver_windows.setdefault(node['driver_idx'], { + 'driver_idx': node['driver_idx'], + 'name': node['driver_name'], + 'is_hwpe': node['is_hwpe'], + 'start': node['start_cycle'], + 'end': node['end_cycle'], + }) + w['start'] = min(w['start'], node['start_cycle']) + w['end'] = max(w['end'], node['end_cycle']) + + regions_timeline = {} + for node in pattern_nodes: + for reg in node['regions']: + reg_key = (reg['base'], reg['size'], reg['label']) + entry = regions_timeline.setdefault(reg_key, { + 'base': reg['base'], + 'size': reg['size'], + 'end': reg['end'], + 'label': reg['label'], + 'accesses': [], + }) + entry['accesses'].append({ + 'driver_idx': node['driver_idx'], + 'driver_name': node['driver_name'], + 'job': node['job'], + 'start': node['start_cycle'], + 'end': node['end_cycle'], + 'pattern_idx': node['pattern_idx'], + 'description': node['description'], + }) + for reg in regions_timeline.values(): + reg['lifetime_start'] = min((a['start'] for a in reg['accesses']), default=0) + reg['lifetime_end'] = 
max((a['end'] for a in reg['accesses']), default=0) + + # ----------------------------------------------------------------------- + # Build memory_map.txt + # ----------------------------------------------------------------------- word_bytes = DATA_WIDTH // 8 bank_stride_bytes = N_BANKS * word_bytes lines = [] @@ -721,6 +1136,20 @@ def _resolve_req_levels(wait_for_list): lines.append(f" Total memory : {TOT_MEM_SIZE} KiB ({TOT_MEM_SIZE * 1024} B)") lines.append(f" Banks : {N_BANKS} x {word_bytes} B/word (interleaved, stride {bank_stride_bytes} B)") lines.append(f" Data width : {DATA_WIDTH} b LOG / {HWPE_WIDTH_FACT * DATA_WIDTH} b HWPE") + lines.append( + f" Drivers({DW_NARROW} bit) : " + f"CORE={N_CORE_CFG}, DMA={N_DMA_CFG}, EXT={N_EXT_CFG} (LOG total={N_LOG_CFG})" + ) + lines.append( + f" Drivers({DW_WIDE} bit) : " + f"HWPE={N_HWPE_CFG}" + ) + lines.append( + f" Interconnect type : {INTERCO_TYPE} | " + f"Narrow master ports ({DW_NARROW} bit)={N_NARROW_HCI_CFG} | " + f"Wide master ports ({DW_WIDE} bit)={N_WIDE_HCI_CFG} | " + f"Slave ports (banks)={N_BANKS}" + ) lines.append("=" * 72) for entry in memory_map_entries: lines.append(f"\n [{entry['label']}] pattern={entry['pattern']} n_transactions={entry['n']}") @@ -729,17 +1158,52 @@ def _resolve_req_levels(wait_for_list): for k, v in entry.get('detail', {}).items(): lines.append(f" {k:<14}: {v}") lines.append("") - lines.append(" Phase / dependency map:") - for phase, drivers in sorted(phase_to_drivers.items()): - driver_names = [f"log_{d}" if d < N_LOG else f"hwpe_{d - N_LOG}" for d in drivers] - lines.append(f" phase '{phase}': {', '.join(driver_names)}") + lines.append(" Job / dependency map:") + for job, drivers in sorted(job_to_drivers.items()): + driver_names = [_driver_name(d) for d in drivers] + lines.append(f" job '{job}': {', '.join(driver_names)}") for i in range(N_DRIVERS): - name = f"log_{i}" if i < N_LOG else f"hwpe_{i - N_LOG}" + name = _driver_name(i) for f, mask in enumerate(fence_masks[i]): if mask: 
- deps = [f"log_{j}" if j < N_LOG else f"hwpe_{j - N_LOG}" - for j in range(N_DRIVERS) if mask & (1 << j)] + deps = [_driver_name(j) for j in range(N_DRIVERS) if mask & (1 << j)] lines.append(f" {name} after pattern[{f}] (fence {f}) waits for: {', '.join(deps)}") + lines.append("") + lines.append(" Temporal schedule (transaction-count model):") + lines.append(f" Total modeled time: {total_cycles} units (1 unit = 1 transaction)") + lines.append(" Note: Declared wait_for_jobs dependencies are used for scheduling.") + lines.append(" Note: Per-driver list order is also enforced (pattern p[i] -> p[i+1]).") + lines.append(" Note: No interconnect contention/stall timing is modeled.") + if mux_serialization_applied: + lines.append(" Note: MUX mode serializes HWPE execution by job order, then HWPE ID (tb_hci-like).") + lines.append(f" MUX job order: {', '.join(mux_phase_order)}") + if schedule_has_cycle: + lines.append(" WARNING: dependency cycle detected while scheduling; using fallback order.") + for d in range(N_DRIVERS): + if d not in driver_windows: + continue + w = driver_windows[d] + lines.append(f" {w['name']:<8}: [{w['start']:>6}, {w['end']:>6}) dur={w['end'] - w['start']:>6}") + for node in [n for n in pattern_nodes if n['driver_idx'] == d]: + reg_tokens = [] + for reg in node['regions']: + reg_tokens.append(f"{reg['label']}@0x{reg['base']:08x}+{reg['size']}B") + reg_text = ", ".join(reg_tokens) if reg_tokens else "no regions" + lines.append( + f" p{node['pattern_idx']} job={node['job']} " + f"[{node['start_cycle']},{node['end_cycle']}) " + f"type={node['mem_access_type']} n={node['n_transactions']} {reg_text}" + ) + lines.append("") + lines.append(" Memory region lifetimes:") + for key in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2])): + reg = regions_timeline[key] + users = sorted({a['driver_name'] for a in reg['accesses']}) + lines.append( + f" {reg['label']:<8} 0x{reg['base']:08x}-0x{reg['end']:08x} " + f"({reg['size']:>6} B) 
lifetime=[{reg['lifetime_start']},{reg['lifetime_end']}) " + f"users={', '.join(users)}" + ) lines.append("=" * 72) report_text = "\n".join(lines) + "\n" @@ -748,6 +1212,375 @@ def _resolve_req_levels(wait_for_list): memory_map_path.write_text(report_text, encoding='utf-8') print(f"Memory map written: {memory_map_path}") + # ----------------------------------------------------------------------- + # Build memory_lifetime.html (simple SVG timeline view) + # ----------------------------------------------------------------------- + palette = [ + "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#17becf", + "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#9467bd", + "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", + "#e6ab02", "#a6761d", "#666666", + ] + + def _color_for_driver(drv_idx): + return palette[drv_idx % len(palette)] + + def _tick_step(total): + if total <= 10: + return 1 + raw = max(1, total // 10) + mag = 10 ** int(math.log10(raw)) + return int(math.ceil(raw / mag) * mag) + + total_for_plot = max(1, total_cycles) + chart_width = 1280 + x_left = 220 + x_right = 24 + plot_w = chart_width - x_left - x_right + row_h = 80 + y_top = 36 + exec_rows = [driver_windows[d] for d in sorted(driver_windows.keys())] + exec_h = y_top + row_h * len(exec_rows) + 72 + tick = _tick_step(total_for_plot) + ticks = list(range(0, total_for_plot + 1, tick)) + if ticks[-1] != total_for_plot: + ticks.append(total_for_plot) + + exec_svg = [] + exec_svg.append(f'') + exec_svg.append('') + exec_svg.append(f'Execution Timeline (transaction-count model)') + for t in ticks: + x = x_left + (t / total_for_plot) * plot_w + exec_svg.append(f'') + exec_svg.append(f'{t}') + exec_svg.append( + f'Transaction number' + ) + exec_svg.append( + f'' + 'Issued memory transactions (r/w) and computation cycles (i.e., req = 0) are both modeled here.' 
+ '' + ) + + def _rw_mix_text(node): + rpct = node.get('traffic_read_pct') + if rpct is None: + read_bytes = 0 + write_bytes = 0 + for reg in node.get('regions', []): + label_l = str(reg.get('label', '')).lower() + size_b = max(0, int(reg.get('size', 0))) + if 'read' in label_l and 'write' not in label_l: + read_bytes += size_b + elif 'write' in label_l and 'read' not in label_l: + write_bytes += size_b + if (read_bytes + write_bytes) > 0: + rpct = int(round(100.0 * read_bytes / (read_bytes + write_bytes))) + else: + rpct = 50 + rpct = max(0, min(100, int(rpct))) + if rpct == 0: + return "100% writes" + if rpct == 100: + return "100% reads" + return f"{rpct}% reads / {100 - rpct}% writes" + + def _fmt_kib(bytes_v): + return f"{(max(0, int(bytes_v)) / 1024.0):.1f} KiB" + + def _outside_detail_lines(node, base_addr, size_kib): + regs = list(node.get('regions', [])) + label_map = {str(r.get('label', '')): r for r in regs} + has_matmul_regs = any(k in label_map for k in ('A(read)', 'B(read)', 'C(write)')) + is_matmul = node.get('mem_access_type') in {'matmul', 'matmul_phased'} or has_matmul_regs + + if is_matmul: + mat_lines = [] + for lbl, short in [('A(read)', 'A'), ('B(read)', 'B'), ('C(write)', 'C')]: + if lbl in label_map: + r = label_map[lbl] + mat_lines.append(f"{short}@0x{int(r['base']):08x} ({_fmt_kib(r['size'])})") + if not mat_lines: + return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] + return mat_lines + + if regs: + r0 = regs[0] + return [f"@0x{int(r0['base']):08x}", f"{_fmt_kib(r0['size'])}"] + return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] + + for row_idx, row in enumerate(exec_rows): + y = y_top + row_idx * row_h + name = html.escape(row['name']) + label_color = "#111111" if row['is_hwpe'] else "#555555" + exec_svg.append(f'{name}') + exec_svg.append(f'') + nodes = [n for n in pattern_nodes if n['driver_idx'] == row['driver_idx']] + for node in nodes: + if node['end_cycle'] <= node['start_cycle']: + continue + x = x_left + 
(node['start_cycle'] / total_for_plot) * plot_w + w = max(1.0, ((node['end_cycle'] - node['start_cycle']) / total_for_plot) * plot_w) + color = _color_for_driver(node['driver_idx']) + base_addr = min((int(r.get('base', 0)) for r in node.get('regions', [])), default=0) + size_kib = (int(node['n_transactions']) * int(node.get('txn_bytes', 1))) / 1024.0 + rw_mix = _rw_mix_text(node) + line1_full = f"{node['job']}" + line2_full = f"{node['mem_access_type']}" + line3_full = rw_mix + outside_lines = _outside_detail_lines(node, base_addr, size_kib) + title = ( + f"{node['driver_name']} p{node['pattern_idx']} job={node['job']} " + f"[{node['start_cycle']}, {node['end_cycle']}) " + f"{node['mem_access_type']} n={node['n_transactions']} " + f"base=0x{base_addr:08x} {size_kib:.1f}KiB {rw_mix} " + f"{' | '.join(outside_lines)}" + ) + exec_svg.append('') + exec_svg.append(f'{html.escape(title)}') + exec_svg.append( + f'' + ) + line1 = html.escape(line1_full) + exec_svg.append( + f'{line1}' + ) + line2 = html.escape(line2_full) + exec_svg.append( + f'{line2}' + ) + line3 = html.escape(line3_full) + exec_svg.append( + f'{line3}' + ) + for ext_idx, ext in enumerate(outside_lines): + y_ext = y + 49 + (ext_idx * 9) + ext_txt = html.escape(ext) + exec_svg.append( + f'{ext_txt}' + ) + exec_svg.append('') + exec_svg.append('') + + region_rows = [regions_timeline[k] for k in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2]))] + overlap_rows = [] + overlaps_by_region = {i: [] for i in range(len(region_rows))} + for i in range(len(region_rows)): + a = region_rows[i] + for j in range(i + 1, len(region_rows)): + b = region_rows[j] + ov_base = max(a['base'], b['base']) + ov_end = min(a['end'], b['end']) + if ov_base <= ov_end: + ov_size = ov_end - ov_base + 1 + overlap_rows.append({ + 'a_idx': i, + 'b_idx': j, + 'ov_base': ov_base, + 'ov_end': ov_end, + 'ov_size': ov_size, + }) + overlaps_by_region[i].append((j, ov_base, ov_end, ov_size)) + overlaps_by_region[j].append((i, 
ov_base, ov_end, ov_size)) + + used_min = min((reg['base'] for reg in region_rows), default=0) + used_max = max((reg['end'] for reg in region_rows), default=0) + if used_max < used_min: + used_max = used_min + used_span = max(1, used_max - used_min + 1) + + map_left = 170 + map_right = 24 + map_plot_w = chart_width - map_left - map_right + map_h = 124 + bar_y = 56 + bar_h = 24 + region_map_svg = [] + region_map_svg.append(f'') + region_map_svg.append('') + region_map_svg.append( + f'' + f"Memory Region Blocks (used range 0x{used_min:08x} - 0x{used_max:08x})" + f"" + ) + region_map_svg.append( + f'' + ) + for pct in [0, 25, 50, 75, 100]: + x = map_left + (pct / 100.0) * map_plot_w + addr = used_min + int(((used_span - 1) * pct) / 100.0) + region_map_svg.append(f'') + region_map_svg.append( + f'0x{addr:08x}' + ) + for reg in region_rows: + x = map_left + ((reg['base'] - used_min) / used_span) * map_plot_w + w = max(1.0, (reg['size'] / used_span) * map_plot_w) + color = _color_for_driver(reg['accesses'][0]['driver_idx']) if reg['accesses'] else "#888888" + title = ( + f"{reg['label']} 0x{reg['base']:08x}-0x{reg['end']:08x} " + f"size={reg['size']}B accesses={len(reg['accesses'])}" + ) + region_map_svg.append( + f'' + f'{html.escape(title)}' + ) + if w >= 92: + region_map_svg.append( + f'{html.escape(reg["label"])}' + ) + region_map_svg.append('') + + legend_items = [] + for d in sorted(driver_windows.keys()): + n = _driver_name(d) + c = _color_for_driver(d) + legend_items.append( + f'' + f'' + f'{html.escape(n)}' + ) + + region_cards = [] + for idx, reg in enumerate(region_rows): + access_rows = [] + accesses = sorted(reg['accesses'], key=lambda a: (a['driver_idx'], a['pattern_idx'], a['start'])) + for acc in accesses: + desc = acc['description'] if acc['description'] else "-" + access_rows.append( + "" + f"{html.escape(acc['driver_name'])}" + f"p{acc['pattern_idx']}" + f"{html.escape(acc['job'])}" + f"{html.escape(desc)}" + f"[{acc['start']}, {acc['end']})" + "" + ) 
+ overlap_refs = overlaps_by_region.get(idx, []) + if overlap_refs: + ov_txt = ", ".join( + [ + f"{html.escape(region_rows[j]['label'])} " + f"(0x{ovb:08x}-0x{ove:08x}, {ovs} B)" + for (j, ovb, ove, ovs) in overlap_refs + ] + ) + else: + ov_txt = "none" + region_cards.append( + "
" + f"
{html.escape(reg['label'])} | base=0x{reg['base']:08x} | end=0x{reg['end']:08x} | size={reg['size']} B
" + f"
Overlaps: {ov_txt}
" + "" + "" + "" + f"{''.join(access_rows)}" + "
Driver/HWPEPatternJobDescriptionModeled interval
" + "
" + ) + + overlap_table_rows = [] + for ov in overlap_rows: + a = region_rows[ov['a_idx']] + b = region_rows[ov['b_idx']] + overlap_table_rows.append( + "" + f"{html.escape(a['label'])} (0x{a['base']:08x}-0x{a['end']:08x})" + f"{html.escape(b['label'])} (0x{b['base']:08x}-0x{b['end']:08x})" + f"0x{ov['ov_base']:08x}" + f"0x{ov['ov_end']:08x}" + f"{ov['ov_size']}" + "" + ) + + note = "Timeline follows declared wait_for_jobs dependencies from workload.json." + note_2 = ( + "Time axis is transaction-count based only " + "(no interconnect conflict/stall/arbitration modeling)." + ) + note_2b = "Per-driver list order is still enforced by stimulus/fence sequencing." + note_3 = "" + if mux_serialization_applied: + note_3 = ( + "MUX mode: HWPE execution is serialized by job order, " + "with lower HWPE ID first inside each job (tb_hci-like)." + ) + if mux_phase_order: + note_3 += f" Job order: {', '.join(mux_phase_order)}." + note_3_html = f"
{html.escape(note_3)}
" if note_3 else "" + cycle_warning_html = ( + "

Warning: dependency cycle detected; fallback scheduling order used.

" + if schedule_has_cycle else "" + ) + overlap_html = ( + "" + f"{''.join(overlap_table_rows)}" + "
Region ARegion BOverlap BaseOverlap EndOverlap Size (B)
" + if overlap_table_rows else + "
No overlaps detected among used regions.
" + ) + + html_doc = ( + "" + "Memory Access Region View (Transaction-Count Model)" + "" + "

Memory Access Region View

" + f"
Drivers ({DW_NARROW} bit): " + f"CORE={N_CORE_CFG}, DMA={N_DMA_CFG}, EXT={N_EXT_CFG}
" + f"
Drivers ({DW_WIDE} bit): HWPE={N_HWPE_CFG}
" + f"
Interconnect type: {html.escape(INTERCO_TYPE)} | " + f"Narrow master ports ({DW_NARROW} bit): {N_NARROW_HCI_CFG} | " + f"Wide master ports ({DW_WIDE} bit): {N_WIDE_HCI_CFG} | " + f"Slave ports (banks): {N_BANKS} | " + f"Total modeled time: {total_cycles} units
" + f"
{html.escape(note)}
" + f"
{html.escape(note_2)}
" + f"
{html.escape(note_2b)}
" + f"{note_3_html}" + f"{cycle_warning_html}" + "

Legend

" + f"{''.join(legend_items)}
" + "
" + f"{''.join(exec_svg)}" + "
" + "
" + f"{''.join(region_map_svg)}" + "
" + "

Region Usage Blocks

" + f"{''.join(region_cards)}" + "

Overlapping Regions

" + f"{overlap_html}" + "" + ) + + memory_lifetime_path = generated_dir / 'memory_lifetime.html' + memory_lifetime_path.write_text(html_doc, encoding='utf-8') + print(f"Memory lifetime plot written: {memory_lifetime_path}") + # ----------------------------------------------------------------------- # Apply per-master start delays # ----------------------------------------------------------------------- From 23362d398de9e219e95b867406cf3d38032c355c Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Tue, 10 Mar 2026 17:49:10 +0100 Subject: [PATCH 21/25] verif: Add framework for benchmarking sweep across different hw configs --- .gitignore | 1 + target/verif/config/generated/json_to_mk.py | 11 +- target/verif/config/hardware.json | 2 +- .../hardware_hci_2hwpe_8fact.json | 17 + .../hardware_log_2hwpe_8fact.json | 17 + .../hardware_mux_2hwpe_8fact.json | 17 + target/verif/config/workload.json | 4 +- .../config/workloads/workload_dma_gemm.json | 180 ++++++ .../workloads/workload_dma_gemm_cores.json | 540 ++++++++++++++++++ .../workloads/workload_dma_gemm_ideal.json | 121 ++++ .../results/hardware_hci_2hwpe_8fact.json | 431 ++++++++++++++ target/verif/scripts/parse_vsim.py | 359 ++++++++++++ target/verif/scripts/plot_sweep_results.py | 389 +++++++++++++ target/verif/scripts/run_sweep.sh | 27 + target/verif/src/simulation_report.sv | 45 +- target/verif/verif.mk | 11 +- 16 files changed, 2148 insertions(+), 24 deletions(-) create mode 100644 target/verif/config/sweep_hardware/hardware_hci_2hwpe_8fact.json create mode 100644 target/verif/config/sweep_hardware/hardware_log_2hwpe_8fact.json create mode 100644 target/verif/config/sweep_hardware/hardware_mux_2hwpe_8fact.json create mode 100644 target/verif/config/workloads/workload_dma_gemm.json create mode 100644 target/verif/config/workloads/workload_dma_gemm_cores.json create mode 100644 target/verif/config/workloads/workload_dma_gemm_ideal.json create mode 100644 target/verif/results/hardware_hci_2hwpe_8fact.json create 
mode 100644 target/verif/scripts/parse_vsim.py create mode 100644 target/verif/scripts/plot_sweep_results.py create mode 100644 target/verif/scripts/run_sweep.sh diff --git a/.gitignore b/.gitignore index ada3be7..0fbe9ba 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ target/verif/vsim/compile.tcl target/verif/vsim/modelsim.ini target/verif/vsim/transcript target/verif/vsim/vsim.wlf +target/verif/results diff --git a/target/verif/config/generated/json_to_mk.py b/target/verif/config/generated/json_to_mk.py index 4a1c9ba..1d6ad27 100755 --- a/target/verif/config/generated/json_to_mk.py +++ b/target/verif/config/generated/json_to_mk.py @@ -6,8 +6,8 @@ Templates are automatically discovered based on the config type argument. """ -import json import argparse +import json import sys from pathlib import Path from string import Template @@ -83,9 +83,9 @@ def parse_args(argv=None): help="Configuration type to generate.", ) parser.add_argument( - "config_dir", + "config_json", type=Path, - help="Directory containing source-of-truth JSON files.", + help="Path to selected source-of-truth JSON file.", ) parser.add_argument( "generated_dir", @@ -98,11 +98,10 @@ def parse_args(argv=None): def main(): args = parse_args() config_type = args.config_type - config_dir = args.config_dir.resolve() + json_file = args.config_json.resolve() + config_dir = json_file.parent generated_dir = args.generated_dir.resolve() - # Construct file paths based on config_type argument - json_file = config_dir / f"{config_type}.json" template_file = generated_dir / f"{config_type}.mk.tpl" # Load JSON config diff --git a/target/verif/config/hardware.json b/target/verif/config/hardware.json index 84c47a1..b349a65 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -9,7 +9,7 @@ "DATA_WIDTH": 32, "TOT_MEM_SIZE": 256, "N_BANKS": 64, - "INTERCO_TYPE": "MUX", + "INTERCO_TYPE": "HCI", "TS_BIT": 21, "EXPFIFO": 0, "SEL_LIC": 0 diff --git 
a/target/verif/config/sweep_hardware/hardware_hci_2hwpe_8fact.json b/target/verif/config/sweep_hardware/hardware_hci_2hwpe_8fact.json new file mode 100644 index 0000000..b349a65 --- /dev/null +++ b/target/verif/config/sweep_hardware/hardware_hci_2hwpe_8fact.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "HCI", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/target/verif/config/sweep_hardware/hardware_log_2hwpe_8fact.json b/target/verif/config/sweep_hardware/hardware_log_2hwpe_8fact.json new file mode 100644 index 0000000..a505fa1 --- /dev/null +++ b/target/verif/config/sweep_hardware/hardware_log_2hwpe_8fact.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "LOG", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/target/verif/config/sweep_hardware/hardware_mux_2hwpe_8fact.json b/target/verif/config/sweep_hardware/hardware_mux_2hwpe_8fact.json new file mode 100644 index 0000000..84c47a1 --- /dev/null +++ b/target/verif/config/sweep_hardware/hardware_mux_2hwpe_8fact.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "MUX", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/target/verif/config/workload.json b/target/verif/config/workload.json index 683a709..8299bf9 100644 --- a/target/verif/config/workload.json +++ 
b/target/verif/config/workload.json @@ -14,7 +14,7 @@ "hwpe_masters": [ { - "id": 1, + "id": 0, "description": "DMA engine (linear setup + ping/pong prefetch)", "patterns": [ { @@ -107,7 +107,7 @@ ] }, { - "id": 0, + "id": 1, "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", "patterns": [ { diff --git a/target/verif/config/workloads/workload_dma_gemm.json b/target/verif/config/workloads/workload_dma_gemm.json new file mode 100644 index 0000000..8299bf9 --- /dev/null +++ b/target/verif/config/workloads/workload_dma_gemm.json @@ -0,0 +1,180 @@ +{ + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes.", + "log_masters": [ + { "id": 0, "description": "Core 0 (idle)", "mem_access_type": "idle" }, + { "id": 1, "description": "Core 1 (idle)", "mem_access_type": "idle" }, + { "id": 2, "description": "Core 2 (idle)", "mem_access_type": "idle" }, + { "id": 3, "description": "Core 3 (idle)", "mem_access_type": "idle" }, + { "id": 4, "description": "Core 4 (idle)", "mem_access_type": "idle" }, + { "id": 5, "description": "Core 5 (idle)", "mem_access_type": "idle" }, + { "id": 6, "description": "Core 6 (idle)", "mem_access_type": "idle" }, + { "id": 7, "description": "Core 7 (idle)", "mem_access_type": "idle" }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ + { + "id": 0, + "description": "DMA engine (linear setup + ping/pong prefetch)", + "patterns": [ + { + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": 
"Setup preload: tile A1 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A1_buf1", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A2 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A2_buf0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A0", + "mem_access_type": "linear", + "job": "dma_store_C0_buf2", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A3 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A3_buf1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A1", + "mem_access_type": "linear", + "job": "dma_store_C1_buf3", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A2", + "mem_access_type": "linear", + "job": "dma_store_C2_buf2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A3", + "mem_access_type": "linear", + "job": "dma_store_C3_buf3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 1, + "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", + "patterns": [ + { + "description": "GEMM tile A0: A0 x B 
-> C0", + "mem_access_type": "matmul_phased", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A1: A1 x B -> C1", + "mem_access_type": "matmul_phased", + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", + "mem_access_type": "matmul_phased", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", + "mem_access_type": "matmul_phased", + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + } + ] + } + ] +} diff --git a/target/verif/config/workloads/workload_dma_gemm_cores.json b/target/verif/config/workloads/workload_dma_gemm_cores.json new 
file mode 100644 index 0000000..aa2da43 --- /dev/null +++ b/target/verif/config/workloads/workload_dma_gemm_cores.json @@ -0,0 +1,540 @@ +{ + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes. Cores generate linear 50% traffic after each GEMM on dedicated non-overlapping regions.", + "log_masters": [ + { + "id": 0, + "description": "Core 0 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 0 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core0_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core0_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core0_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core0_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 1, + "description": "Core 1 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 1 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core1_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core1_after_gemm_A1", + "wait_for_jobs": 
["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core1_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core1_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 2, + "description": "Core 2 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 2 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core2_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core2_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core2_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core2_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 3, + "description": "Core 3 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 3 traffic after gemm_A0", + "mem_access_type": "linear", + "job": 
"core3_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core3_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core3_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core3_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 4, + "description": "Core 4 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 4 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core4_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core4_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core4_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core4_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + 
"traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 5, + "description": "Core 5 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 5 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core5_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core5_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core5_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core5_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 6, + "description": "Core 6 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 6 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core6_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core6_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A2", + "mem_access_type": "linear", + "job": 
"core6_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core6_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 7, + "description": "Core 7 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 7 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core7_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core7_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core7_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core7_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + } + ] + }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ + { + "id": 0, + "description": "DMA engine (linear setup + ping/pong prefetch)", + "patterns": [ + { + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + 
"region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A1 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A1_buf1", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A2 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A2_buf0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A0", + "mem_access_type": "linear", + "job": "dma_store_C0_buf2", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A3 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A3_buf1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A1", + "mem_access_type": "linear", + "job": "dma_store_C1_buf3", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A2", + "mem_access_type": "linear", + "job": "dma_store_C2_buf2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A3", + "mem_access_type": "linear", + "job": "dma_store_C3_buf3", + 
"wait_for_jobs": ["gemm_A3"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 1, + "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", + "patterns": [ + { + "description": "GEMM tile A0: A0 x B -> C0", + "mem_access_type": "matmul_phased", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A1: A1 x B -> C1", + "mem_access_type": "matmul_phased", + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", + "mem_access_type": "matmul_phased", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", + "mem_access_type": "matmul_phased", + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + 
"region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + } + ] + } + ] +} diff --git a/target/verif/config/workloads/workload_dma_gemm_ideal.json b/target/verif/config/workloads/workload_dma_gemm_ideal.json new file mode 100644 index 0000000..b166c78 --- /dev/null +++ b/target/verif/config/workloads/workload_dma_gemm_ideal.json @@ -0,0 +1,121 @@ +{ + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes.", + "log_masters": [ + { "id": 0, "description": "Core 0 (idle)", "mem_access_type": "idle" }, + { "id": 1, "description": "Core 1 (idle)", "mem_access_type": "idle" }, + { "id": 2, "description": "Core 2 (idle)", "mem_access_type": "idle" }, + { "id": 3, "description": "Core 3 (idle)", "mem_access_type": "idle" }, + { "id": 4, "description": "Core 4 (idle)", "mem_access_type": "idle" }, + { "id": 5, "description": "Core 5 (idle)", "mem_access_type": "idle" }, + { "id": 6, "description": "Core 6 (idle)", "mem_access_type": "idle" }, + { "id": 7, "description": "Core 7 (idle)", "mem_access_type": "idle" }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ + { + "id": 0, + "description": "DMA engine (linear setup + ping/pong prefetch)", + "patterns": [ + { + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A3", + "mem_access_type": "linear", + "job": "dma_store_C3_buf3", + "wait_for_jobs": 
["gemm_A3"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 1, + "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", + "patterns": [ + { + "description": "GEMM tile A0: A0 x B -> C0", + "mem_access_type": "matmul_phased", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A1: A1 x B -> C1", + "mem_access_type": "matmul_phased", + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", + "mem_access_type": "matmul_phased", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", + "mem_access_type": "matmul_phased", + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 
8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + } + ] + } + ] +} diff --git a/target/verif/results/hardware_hci_2hwpe_8fact.json b/target/verif/results/hardware_hci_2hwpe_8fact.json new file mode 100644 index 0000000..1fd0fd4 --- /dev/null +++ b/target/verif/results/hardware_hci_2hwpe_8fact.json @@ -0,0 +1,431 @@ +{ + "hw_config": { + "masters": { + "core": 8, + "dma": 0, + "ext": 1, + "hwpe": 2, + "total": 11 + }, + "memory": { + "banks": 64, + "total_size_kb": 256, + "data_width_bits": 32, + "hwpe_width_lanes": 8 + }, + "interconnect": { + "sel_lic": 0, + "ts_bit": 21, + "expfifo": 0 + }, + "interconnect_side": { + "type": "HCI", + "n_narrow_hci": 8, + "n_wide_hci": 2, + "n_dma": 0, + "n_ext": 1, + "narrow_total_ports": 9, + "total_initiator_ports": 11 + }, + "id_address": { + "iw": 11, + "addr_width": 18, + "addr_width_bank": 12 + } + }, + "bandwidth": { + "ideal_memory_side_bit_per_cycle": 2048.0, + "ideal_interconnect_side_bit_per_cycle": 800.0, + "ideal_bottleneck_bit_per_cycle": 800.0, + "actual_completion_bit_per_cycle": 306.65, + "actual_completion_utilization_pct": 38.3, + "completion_phase_duration_cycles": 4488.0, + "granted_transactions": { + "reads": 3072, + "writes": 2304, + "total": 5376 + }, + "read_complete_responses": 3072 + }, + "simulation_time": { + "per_master": [ + { + "master_name": "master_hwpe_0", + "role_name": "HWPE0", + "sim_time_cycles": 4488.0 + }, + { + "master_name": "master_hwpe_1", + "role_name": "HWPE1", + "sim_time_cycles": 4228.0 + }, + { + "master_name": "master_log_0", + "role_name": "Core0", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_1", + "role_name": "Core1", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_2", + "role_name": "Core2", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_3", + "role_name": "Core3", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_4", + 
"role_name": "Core4", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_5", + "role_name": "Core5", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_6", + "role_name": "Core6", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_7", + "role_name": "Core7", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_8", + "role_name": "EXT0", + "sim_time_cycles": 1.0 + } + ], + "total_cycles": 4488.0 + }, + "read_response_coverage": { + "master_log_0": { + "observed": 0, + "expected": 0 + }, + "master_log_1": { + "observed": 0, + "expected": 0 + }, + "master_log_2": { + "observed": 0, + "expected": 0 + }, + "master_log_3": { + "observed": 0, + "expected": 0 + }, + "master_log_4": { + "observed": 0, + "expected": 0 + }, + "master_log_5": { + "observed": 0, + "expected": 0 + }, + "master_log_6": { + "observed": 0, + "expected": 0 + }, + "master_log_7": { + "observed": 0, + "expected": 0 + }, + "master_log_8": { + "observed": 0, + "expected": 0 + }, + "master_hwpe_0": { + "observed": 1024, + "expected": 1024 + }, + "master_hwpe_1": { + "observed": 2048, + "expected": 2048 + } + }, + "transaction_counts": { + "master_log_0": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_1": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_2": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_3": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_4": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_5": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_6": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_7": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_8": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_hwpe_0": 
{ + "granted_reads": 1024, + "granted_writes": 1280, + "read_complete": 1024 + }, + "master_hwpe_1": { + "granted_reads": 2048, + "granted_writes": 1024, + "read_complete": 2048 + } + }, + "request_to_grant_latency": { + "per_master": [ + { + "master_name": "master_hwpe_0", + "avg_req_to_gnt_stall_latency_cycles": 0.63, + "req_to_gnt_grants": 2304 + }, + { + "master_name": "master_hwpe_1", + "avg_req_to_gnt_stall_latency_cycles": 0.18, + "req_to_gnt_grants": 3072 + }, + { + "master_name": "master_log_0", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_1", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_2", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_3", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_4", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_5", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_6", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_7", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_8", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + } + ], + "accumulated": { + "cycles": 1987.0, + "grants": 5376 + }, + "averages": { + "log": { + "weighted_cycles": 0.0, + "unweighted_cycles": 0.0 + }, + "hwpe": { + "weighted_cycles": 0.37, + "unweighted_cycles": 0.4 + }, + "global": { + "weighted_cycles": 0.37, + "unweighted_cycles": 0.4 + } + } + }, + "finish": { + "source": "/scratch/smazzola/projects/hci/hci/target/verif/src/simulation_report.sv", + "line": 464, + "time_ps": 224950, + "iteration": 4, + "instance": "/tb_hci/i_simulation_report" + }, + 
"masters": [ + { + "master_name": "master_hwpe_0", + "role_name": "HWPE0", + "sim_time_cycles": 4488.0, + "read_observed": 1024, + "read_expected": 1024, + "granted_reads": 1024, + "granted_writes": 1280, + "read_complete": 1024, + "avg_req_to_gnt_stall_latency_cycles": 0.63, + "req_to_gnt_grants": 2304 + }, + { + "master_name": "master_hwpe_1", + "role_name": "HWPE1", + "sim_time_cycles": 4228.0, + "read_observed": 2048, + "read_expected": 2048, + "granted_reads": 2048, + "granted_writes": 1024, + "read_complete": 2048, + "avg_req_to_gnt_stall_latency_cycles": 0.18, + "req_to_gnt_grants": 3072 + }, + { + "master_name": "master_log_0", + "role_name": "Core0", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_1", + "role_name": "Core1", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_2", + "role_name": "Core2", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_3", + "role_name": "Core3", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_4", + "role_name": "Core4", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": 
"master_log_5", + "role_name": "Core5", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_6", + "role_name": "Core6", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_7", + "role_name": "Core7", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_8", + "role_name": "EXT0", + "sim_time_cycles": 1.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + } + ] +} diff --git a/target/verif/scripts/parse_vsim.py b/target/verif/scripts/parse_vsim.py new file mode 100644 index 0000000..9c0f0ad --- /dev/null +++ b/target/verif/scripts/parse_vsim.py @@ -0,0 +1,359 @@ +#!/usr/bin/env python3 +"""Parse the final 'Simulation Summary' section from one transcript file.""" + +import argparse +import json +import re +import sys +from pathlib import Path +from typing import Dict, List + + +SUMMARY_MARKER = "------ Simulation Summary ------" + + +class ParseError(RuntimeError): + """Raised when the transcript summary cannot be parsed.""" + + +def _as_float(value: str) -> float: + return float(value) + + +def _as_int(value: str) -> int: + return int(value) + + +def _clean_line(raw: str) -> str: + line = raw.strip() + if line.startswith("#"): + line = line[1:].strip() + return line + + +def _summary_lines(transcript_text: str) -> List[str]: + idx = 
transcript_text.rfind(SUMMARY_MARKER) + if idx < 0: + raise ParseError(f"Summary marker '{SUMMARY_MARKER}' not found.") + return [_clean_line(line) for line in transcript_text[idx:].splitlines()] + + +def _ensure_master(masters: Dict[str, Dict[str, object]], master_name: str) -> Dict[str, object]: + entry = masters.get(master_name) + if entry is None: + entry = {"master_name": master_name} + masters[master_name] = entry + return entry + + +def parse_summary(transcript_text: str) -> Dict[str, object]: + lines = _summary_lines(transcript_text) + + result: Dict[str, object] = { + "hw_config": {}, + "bandwidth": {}, + "simulation_time": {"per_master": []}, + "read_response_coverage": {}, + "transaction_counts": {}, + "request_to_grant_latency": { + "per_master": [], + "accumulated": {}, + "averages": {}, + }, + "finish": {}, + } + + masters: Dict[str, Dict[str, object]] = {} + + patterns = { + "masters": re.compile(r"^Masters:\s*CORE=(\d+)\s*DMA=(\d+)\s*EXT=(\d+)\s*HWPE=(\d+)\s*\(total=(\d+)\)$"), + "memory": re.compile( + r"^Memory:\s*banks=(\d+)\s*total_size=(\d+)\s*kB\s*data_width=(\d+)\s*bits\s*hwpe_width=(\d+)\s*lanes$" + ), + "interconnect": re.compile(r"^Interconnect:\s*SEL_LIC=(\d+)\s*TS_BIT=(\d+)\s*EXPFIFO=(\d+)$"), + "interconnect_side": re.compile( + r"^Interconnect-side:\s*TYPE=(LOG|HCI|MUX|UNKNOWN)\s*N_NARROW_HCI=(\d+)\s*N_WIDE_HCI=(\d+)\s*N_DMA=(\d+)\s*N_EXT=(\d+)$" + ), + "id_addr": re.compile(r"^ID/address:\s*IW=(\d+)\s*ADDR_WIDTH=(\d+)\s*ADDR_WIDTH_BANK=(\d+)$"), + "ideal_mem_bw": re.compile(r"^Ideal BW \(memory side\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "ideal_interco_bw": re.compile(r"^Ideal BW \(interco side\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "ideal_master_bw_legacy": re.compile(r"^Ideal BW \(master side\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "ideal_bottleneck_bw": re.compile(r"^Ideal BW \(bottleneck\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "actual_bw": re.compile( + r"^Actual BW 
\(completion\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle\s*\[utilization:\s*([0-9]+(?:\.[0-9]+)?)%\]$" + ), + "completion_bw_legacy": re.compile(r"^Completion bandwidth .*:\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle$"), + "completion_cycles": re.compile(r"^Completion phase duration:\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$"), + "granted": re.compile(r"^Granted transactions:\s*reads=(\d+)\s*writes=(\d+)\s*total=(\d+)$"), + "read_complete": re.compile(r"^Read-complete responses:\s*(\d+)$"), + "total_sim_cycles": re.compile(r"^Total simulation time:\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$"), + "per_master_sim_time": re.compile(r"^([A-Za-z0-9_]+)\s*\((master_[^)]+)\):\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$"), + "coverage": re.compile(r"^(master_[^:]+):\s*observed\s*(\d+)\s*/\s*expected\s*(\d+)$"), + "tx_counts": re.compile(r"^(master_[^:]+):\s*granted reads=(\d+)\s*writes=(\d+),\s*read-complete=(\d+)$"), + "req_gnt": re.compile( + r"^(master_[^:]+):\s*avg req->gnt stall latency\s*([0-9]+(?:\.[0-9]+)?)\s*cycles over\s*(\d+)\s*grants$" + ), + "total_accum": re.compile( + r"^Total accumulated req->gnt latency:\s*([0-9]+(?:\.[0-9]+)?)\s*cycles over\s*(\d+)\s*grants$" + ), + "class_avg": re.compile( + r"^(LOG|HWPE|Global) avg req->gnt stall latency " + r"\((weighted by grant count|mean of per-master averages)\):\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$" + ), + "finish_note": re.compile(r"^\*\* Note: \$finish\s*:\s*(.+)\((\d+)\)$"), + "finish_time": re.compile(r"^Time:\s*([0-9]+)\s*ps\s*Iteration:\s*(\d+)\s*Instance:\s*(.+)$"), + } + + for line in lines: + if not line or line == SUMMARY_MARKER: + continue + + match = patterns["masters"].match(line) + if match: + result["hw_config"]["masters"] = { + "core": _as_int(match.group(1)), + "dma": _as_int(match.group(2)), + "ext": _as_int(match.group(3)), + "hwpe": _as_int(match.group(4)), + "total": _as_int(match.group(5)), + } + continue + + match = patterns["memory"].match(line) + if match: + result["hw_config"]["memory"] = { + "banks": _as_int(match.group(1)), + 
"total_size_kb": _as_int(match.group(2)), + "data_width_bits": _as_int(match.group(3)), + "hwpe_width_lanes": _as_int(match.group(4)), + } + continue + + match = patterns["interconnect"].match(line) + if match: + result["hw_config"]["interconnect"] = { + "sel_lic": _as_int(match.group(1)), + "ts_bit": _as_int(match.group(2)), + "expfifo": _as_int(match.group(3)), + } + continue + + match = patterns["interconnect_side"].match(line) + if match: + narrow_hci = _as_int(match.group(2)) + wide_hci = _as_int(match.group(3)) + n_dma = _as_int(match.group(4)) + n_ext = _as_int(match.group(5)) + result["hw_config"]["interconnect_side"] = { + "type": match.group(1), + "n_narrow_hci": narrow_hci, + "n_wide_hci": wide_hci, + "n_dma": n_dma, + "n_ext": n_ext, + "narrow_total_ports": narrow_hci + n_dma + n_ext, + "total_initiator_ports": narrow_hci + wide_hci + n_dma + n_ext, + } + continue + + match = patterns["id_addr"].match(line) + if match: + result["hw_config"]["id_address"] = { + "iw": _as_int(match.group(1)), + "addr_width": _as_int(match.group(2)), + "addr_width_bank": _as_int(match.group(3)), + } + continue + + match = patterns["ideal_mem_bw"].match(line) + if match: + result["bandwidth"]["ideal_memory_side_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["ideal_interco_bw"].match(line) + if match: + result["bandwidth"]["ideal_interconnect_side_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["ideal_master_bw_legacy"].match(line) + if match: + result["bandwidth"]["ideal_interconnect_side_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["ideal_bottleneck_bw"].match(line) + if match: + result["bandwidth"]["ideal_bottleneck_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["actual_bw"].match(line) + if match: + result["bandwidth"]["actual_completion_bit_per_cycle"] = _as_float(match.group(1)) + result["bandwidth"]["actual_completion_utilization_pct"] = 
_as_float(match.group(2)) + continue + + match = patterns["completion_bw_legacy"].match(line) + if match: + result["bandwidth"]["actual_completion_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["completion_cycles"].match(line) + if match: + result["bandwidth"]["completion_phase_duration_cycles"] = _as_float(match.group(1)) + continue + + match = patterns["granted"].match(line) + if match: + result["bandwidth"]["granted_transactions"] = { + "reads": _as_int(match.group(1)), + "writes": _as_int(match.group(2)), + "total": _as_int(match.group(3)), + } + continue + + match = patterns["read_complete"].match(line) + if match: + result["bandwidth"]["read_complete_responses"] = _as_int(match.group(1)) + continue + + match = patterns["total_sim_cycles"].match(line) + if match: + result["simulation_time"]["total_cycles"] = _as_float(match.group(1)) + continue + + match = patterns["per_master_sim_time"].match(line) + if match: + role_name = match.group(1) + master_name = match.group(2) + sim_cycles = _as_float(match.group(3)) + entry = _ensure_master(masters, master_name) + entry["role_name"] = role_name + entry["sim_time_cycles"] = sim_cycles + continue + + match = patterns["coverage"].match(line) + if match: + master_name = match.group(1) + observed = _as_int(match.group(2)) + expected = _as_int(match.group(3)) + entry = _ensure_master(masters, master_name) + entry["read_observed"] = observed + entry["read_expected"] = expected + result["read_response_coverage"][master_name] = { + "observed": observed, + "expected": expected, + } + continue + + match = patterns["tx_counts"].match(line) + if match: + master_name = match.group(1) + reads = _as_int(match.group(2)) + writes = _as_int(match.group(3)) + read_complete = _as_int(match.group(4)) + entry = _ensure_master(masters, master_name) + entry["granted_reads"] = reads + entry["granted_writes"] = writes + entry["read_complete"] = read_complete + result["transaction_counts"][master_name] = { + 
"granted_reads": reads, + "granted_writes": writes, + "read_complete": read_complete, + } + continue + + match = patterns["req_gnt"].match(line) + if match: + master_name = match.group(1) + avg_cycles = _as_float(match.group(2)) + grants = _as_int(match.group(3)) + entry = _ensure_master(masters, master_name) + entry["avg_req_to_gnt_stall_latency_cycles"] = avg_cycles + entry["req_to_gnt_grants"] = grants + continue + + match = patterns["total_accum"].match(line) + if match: + result["request_to_grant_latency"]["accumulated"] = { + "cycles": _as_float(match.group(1)), + "grants": _as_int(match.group(2)), + } + continue + + match = patterns["class_avg"].match(line) + if match: + group = match.group(1).lower() + avg_type = match.group(2) + value = _as_float(match.group(3)) + key = "weighted_cycles" if "weighted by grant count" in avg_type else "unweighted_cycles" + averages = result["request_to_grant_latency"]["averages"] + group_entry = averages.get(group, {}) + group_entry[key] = value + averages[group] = group_entry + continue + + match = patterns["finish_note"].match(line) + if match: + result["finish"]["source"] = match.group(1).strip() + result["finish"]["line"] = _as_int(match.group(2)) + continue + + match = patterns["finish_time"].match(line) + if match: + result["finish"]["time_ps"] = _as_int(match.group(1)) + result["finish"]["iteration"] = _as_int(match.group(2)) + result["finish"]["instance"] = match.group(3).strip() + continue + + sorted_masters = [masters[name] for name in sorted(masters.keys())] + result["simulation_time"]["per_master"] = [ + { + "master_name": row["master_name"], + "role_name": row.get("role_name"), + "sim_time_cycles": row.get("sim_time_cycles"), + } + for row in sorted_masters + ] + result["request_to_grant_latency"]["per_master"] = [ + { + "master_name": row["master_name"], + "avg_req_to_gnt_stall_latency_cycles": row.get("avg_req_to_gnt_stall_latency_cycles"), + "req_to_gnt_grants": row.get("req_to_gnt_grants"), + } + for row in 
sorted_masters + ] + result["masters"] = sorted_masters + + return result + + +def _cli_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Parse final Simulation Summary lines from one transcript.") + parser.add_argument("--transcript", required=True, help="Path to transcript file") + parser.add_argument("--out", default="", help="Optional output JSON file path") + return parser.parse_args() + + +def main() -> int: + args = _cli_args() + transcript_path = Path(args.transcript) + if not transcript_path.exists(): + raise ParseError(f"Transcript not found: {transcript_path}") + + text = transcript_path.read_text(encoding="utf-8", errors="replace") + parsed = parse_summary(text) + + output = json.dumps(parsed, indent=2, sort_keys=False) + if args.out: + out_path = Path(args.out) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(output + "\n", encoding="ascii") + else: + print(output) + return 0 + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except ParseError as exc: + print(f"ERROR: {exc}", file=sys.stderr) + raise SystemExit(2) diff --git a/target/verif/scripts/plot_sweep_results.py b/target/verif/scripts/plot_sweep_results.py new file mode 100644 index 0000000..cd24686 --- /dev/null +++ b/target/verif/scripts/plot_sweep_results.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python3 +"""Plot sweep metrics from parsed transcript JSON files.""" + +import argparse +import json +import math +import re +from pathlib import Path +from typing import Dict, List, Tuple + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.colors import ListedColormap +from matplotlib.lines import Line2D +from matplotlib.patches import Patch + +# Cycles of ideal workload runtime +IDEAL_WORKLOAD_DURATION = 3866.0 + +INTERCO_ORDER = {"LOG": 0, "MUX": 1, "HCI": 2} +INTERCO_COLORS = {"LOG": "#1f77b4", "MUX": "#9467bd", "HCI": "#ff7f0e"} +IDEAL_COLOR = "#7f7f7f" + + +def 
_to_int(value: object, default: int = 0) -> int: + try: + return int(value) + except Exception: + return default + + +def _to_float(value: object, default: float = float("nan")) -> float: + try: + return float(value) + except Exception: + return default + + +def _master_sort_key(master: str) -> Tuple[int, int]: + if master.startswith("master_log_"): + return (0, int(master.rsplit("_", 1)[1])) + if master.startswith("master_hwpe_"): + return (1, int(master.rsplit("_", 1)[1])) + return (9, 0) + + +def _parse_cfg_from_filename(path: Path) -> Tuple[str, int, int]: + match = re.match(r"^hardware_([a-zA-Z]+)_([0-9]+)hwpe_([0-9]+)fact\.json$", path.name) + if not match: + return ("UNK", 0, 0) + return (match.group(1).upper(), int(match.group(2)), int(match.group(3))) + + +def _derive_interco_side(hw_cfg: Dict[str, object]) -> Dict[str, int]: + masters = hw_cfg.get("masters", {}) if isinstance(hw_cfg, dict) else {} + memory = hw_cfg.get("memory", {}) if isinstance(hw_cfg, dict) else {} + interco_side = hw_cfg.get("interconnect_side", {}) if isinstance(hw_cfg, dict) else {} + + if isinstance(interco_side, dict) and "narrow_total_ports" in interco_side: + return { + "n_narrow_hci": _to_int(interco_side.get("n_narrow_hci")), + "n_wide_hci": _to_int(interco_side.get("n_wide_hci")), + "n_dma": _to_int(interco_side.get("n_dma")), + "n_ext": _to_int(interco_side.get("n_ext")), + "narrow_total_ports": _to_int(interco_side.get("narrow_total_ports")), + "total_initiator_ports": _to_int(interco_side.get("total_initiator_ports")), + } + + interco_type = str(interco_side.get("type", "UNK")).upper() + if interco_type == "UNK": + interco_type = str(hw_cfg.get("interco_type", "UNK")).upper() + + n_core = _to_int(masters.get("core")) + n_dma = _to_int(masters.get("dma")) + n_ext = _to_int(masters.get("ext")) + n_hwpe = _to_int(masters.get("hwpe")) + hwpe_width = _to_int(memory.get("hwpe_width_lanes"), 1) + + if interco_type == "LOG": + n_narrow_hci = n_core + n_hwpe * hwpe_width + 
n_wide_hci = 0 + elif interco_type == "MUX": + n_narrow_hci = n_core + n_wide_hci = 1 if n_hwpe > 0 else 0 + else: + n_narrow_hci = n_core + n_wide_hci = n_hwpe + + narrow_total = n_narrow_hci + n_dma + n_ext + return { + "n_narrow_hci": n_narrow_hci, + "n_wide_hci": n_wide_hci, + "n_dma": n_dma, + "n_ext": n_ext, + "narrow_total_ports": narrow_total, + "total_initiator_ports": narrow_total + n_wide_hci, + } + + +def _load_results(results_dir: Path) -> List[Dict[str, object]]: + entries: List[Dict[str, object]] = [] + for path in sorted(results_dir.glob("hardware_*.json")): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + continue + + hw_cfg = data.get("hw_config", {}) + masters = hw_cfg.get("masters", {}) if isinstance(hw_cfg, dict) else {} + memory = hw_cfg.get("memory", {}) if isinstance(hw_cfg, dict) else {} + bw = data.get("bandwidth", {}) + + interco_from_name, n_hwpe_name, wf_name = _parse_cfg_from_filename(path) + interco_type = str(hw_cfg.get("interconnect_side", {}).get("type", interco_from_name)).upper() + if interco_type not in INTERCO_ORDER: + interco_type = interco_from_name + + n_hwpe = _to_int(masters.get("hwpe"), n_hwpe_name) + hwpe_wf = _to_int(memory.get("hwpe_width_lanes"), wf_name) + cfg_label = f"{interco_type}_{n_hwpe}x{hwpe_wf}" + + interco_side = _derive_interco_side(hw_cfg if isinstance(hw_cfg, dict) else {}) + banks = _to_int(memory.get("banks")) + data_width = _to_int(memory.get("data_width_bits")) + ideal_mem = float(banks * data_width) + ideal_interco = float( + interco_side["narrow_total_ports"] * data_width + + interco_side["n_wide_hci"] * hwpe_wf * data_width + ) + ideal_bottleneck = min(ideal_mem, ideal_interco) + + actual_bw = _to_float(bw.get("actual_completion_bit_per_cycle")) + util_pct = (actual_bw / ideal_bottleneck * 100.0) if ideal_bottleneck > 0 and not math.isnan(actual_bw) else float("nan") + + entries.append( + { + "path": path, + "label": cfg_label, + "interco_type": interco_type, + 
"n_hwpe": n_hwpe, + "hwpe_width_fact": hwpe_wf, + "json": data, + "total_sim_cycles": _to_float(data.get("simulation_time", {}).get("total_cycles")), + "avg_req_to_gnt_per_master": data.get("request_to_grant_latency", {}).get("per_master", []), + "ideal_mem_bw": ideal_mem, + "ideal_interco_bw": ideal_interco, + "ideal_bottleneck_bw": ideal_bottleneck, + "actual_bw": actual_bw, + "utilization_pct": util_pct, + } + ) + + entries.sort(key=lambda e: (e["n_hwpe"], e["hwpe_width_fact"], INTERCO_ORDER.get(e["interco_type"], 9))) + return entries + + +def _plot_total_sim_time(entries: List[Dict[str, object]], out_path: Path) -> None: + labels = [e["label"] for e in entries] + values = [e["total_sim_cycles"] for e in entries] + colors = [INTERCO_COLORS.get(e["interco_type"], "#333333") for e in entries] + x = np.arange(len(entries), dtype=float) + + fig, ax = plt.subplots(figsize=(max(8, 1.2 * len(entries)), 5.6)) + bars = ax.bar(x, values, color=colors, width=0.68) + ax.set_title("Total simulation time vs ideal workload runtime") + ax.set_ylabel("cycles") + ax.set_xlabel("Configuration") + ax.set_xticks(x) + ax.set_xticklabels(labels, rotation=20, ha="right") + ax.set_axisbelow(True) + ax.grid(axis="y", alpha=0.25) + + for bar, val in zip(bars, values): + if math.isnan(val): + continue + mult_of_ideal = (val / IDEAL_WORKLOAD_DURATION) if val > 0 and IDEAL_WORKLOAD_DURATION > 0 else float("nan") + pct_txt = "n/a" if math.isnan(mult_of_ideal) else f"{mult_of_ideal:.2f}X of ideal runtime" + ax.text( + bar.get_x() + bar.get_width() / 2.0, + val, + f"{val:.0f}\n({pct_txt})", + ha="center", + va="bottom", + fontsize=8, + ) + + ax.axhline( + y=IDEAL_WORKLOAD_DURATION, + color="red", + linestyle="--", + linewidth=1.6, + label=f"Ideal workload runtime ({IDEAL_WORKLOAD_DURATION:.0f} cycles)", + ) + + legend = [Patch(facecolor=INTERCO_COLORS[k], label=k) for k in ("LOG", "MUX", "HCI")] + legend.append(Line2D([0], [0], color="red", linestyle="--", linewidth=1.6, label="Ideal workload 
runtime")) + ax.legend(handles=legend, loc="lower left") + ax.margins(y=0.24) + fig.tight_layout(rect=(0.0, 0.02, 1.0, 0.98)) + fig.savefig(out_path, dpi=150) + plt.close(fig) + + +def _plot_per_master_avg_req_to_gnt(entries: List[Dict[str, object]], out_path: Path) -> None: + labels = [e["label"] for e in entries] + masters = sorted( + { + row.get("master_name", "") + for e in entries + for row in e.get("avg_req_to_gnt_per_master", []) + if isinstance(row, dict) and row.get("master_name", "") + }, + key=_master_sort_key, + ) + + if not masters: + fig, ax = plt.subplots(figsize=(8, 3)) + ax.text(0.5, 0.5, "No per-master req->gnt data", ha="center", va="center") + ax.axis("off") + fig.tight_layout() + fig.savefig(out_path, dpi=150) + plt.close(fig) + return + + matrix = np.full((len(masters), len(entries)), np.nan, dtype=float) + master_idx = {m: i for i, m in enumerate(masters)} + for j, entry in enumerate(entries): + for row in entry.get("avg_req_to_gnt_per_master", []): + if not isinstance(row, dict): + continue + m = row.get("master_name", "") + if m not in master_idx: + continue + matrix[master_idx[m], j] = _to_float(row.get("avg_req_to_gnt_stall_latency_cycles")) + + cmap = ListedColormap(plt.cm.get_cmap("viridis")(np.linspace(0.0, 1.0, 256))) + cmap.set_bad(color="white") + + fig, ax = plt.subplots(figsize=(max(10, 1.2 * len(entries)), max(4, 0.35 * len(masters)))) + im = ax.imshow(np.ma.masked_invalid(matrix), aspect="auto", cmap=cmap, interpolation="nearest") + ax.set_title("Avg req->gnt stall latency per master") + ax.set_xlabel("Configuration") + ax.set_ylabel("Master") + ax.set_xticks(np.arange(len(entries))) + ax.set_xticklabels(labels, rotation=20, ha="right") + ax.set_yticks(np.arange(len(masters))) + ax.set_yticklabels(masters) + + vmax = np.nanmax(matrix) if np.any(~np.isnan(matrix)) else 0.0 + thresh = 0.6 * vmax if vmax > 0 else 0.0 + for r in range(matrix.shape[0]): + for c in range(matrix.shape[1]): + val = matrix[r, c] + if math.isnan(val): + 
continue + color = "white" if val < thresh else "black" + ax.text(c, r, f"{val:.2f}", ha="center", va="center", fontsize=6, color=color) + + fig.colorbar(im, ax=ax, label="cycles") + fig.tight_layout() + fig.savefig(out_path, dpi=150) + plt.close(fig) + + +def _plot_bandwidth(entries: List[Dict[str, object]], out_path: Path) -> None: + labels = [e["label"] for e in entries] + ideal_vals = [e["ideal_bottleneck_bw"] for e in entries] + actual_vals = [e["actual_bw"] for e in entries] + util_vals = [e["utilization_pct"] for e in entries] + sim_cycles_vals = [e["total_sim_cycles"] for e in entries] + ideal_app_vals = [] + for actual_bw, sim_cycles in zip(actual_vals, sim_cycles_vals): + if math.isnan(actual_bw) or math.isnan(sim_cycles) or IDEAL_WORKLOAD_DURATION <= 0.0: + ideal_app_vals.append(float("nan")) + else: + ideal_app_vals.append(actual_bw * sim_cycles / IDEAL_WORKLOAD_DURATION) + actual_colors = [INTERCO_COLORS.get(e["interco_type"], "#333333") for e in entries] + x = np.arange(len(entries), dtype=float) + width = 0.34 + + fig, ax = plt.subplots(figsize=(max(9, 1.25 * len(entries)), 5.0)) + ideal_bars = ax.bar( + x - width / 2.0, + ideal_vals, + width=width, + color=IDEAL_COLOR, + label="Max interco bandwidth", + ) + actual_bars = ax.bar(x + width / 2.0, actual_vals, width=width, color=actual_colors, label="Actual BW (completion)") + ax.set_title("Bandwidth: interconnect-side ideal vs actual") + ax.set_ylabel("bit/cycle") + ax.set_xlabel("Configuration") + ax.set_xticks(x) + ax.set_xticklabels(labels, rotation=20, ha="right") + ax.set_axisbelow(True) + ax.grid(axis="y", alpha=0.25) + + for bar, val in zip(ideal_bars, ideal_vals): + ax.text( + bar.get_x() + bar.get_width() / 2.0, + val, + f"{val:.0f}", + ha="center", + va="bottom", + fontsize=8, + zorder=8, + ) + for bar, val, util in zip(actual_bars, actual_vals, util_vals): + if math.isnan(val): + continue + util_txt = "n/a" if math.isnan(util) else f"{util:.1f}% interco util" + ax.text( + bar.get_x() + 
bar.get_width() / 2.0, + val, + f"{val:.1f}\n({util_txt})", + ha="center", + va="bottom", + fontsize=8, + zorder=8, + ) + + # Ideal app BW computed from moved data and ideal application duration: + # ideal_app_bw = effective_bw * total_real_sim_time / IDEAL_WORKLOAD_DURATION + valid_ideal_app_vals = [v for v in ideal_app_vals if not math.isnan(v)] + if valid_ideal_app_vals: + ideal_workload_bw = sum(valid_ideal_app_vals) / len(valid_ideal_app_vals) + ax.axhline( + y=ideal_workload_bw, + color="red", + linestyle="--", + linewidth=1.8, + label="Ideal workload bandwidth", + zorder=4, + ) + ax.text( + x[-1] + 0.35, + ideal_workload_bw, + f"{ideal_workload_bw:.1f}", + color="red", + fontsize=9, + ha="right", + va="bottom", + zorder=8, + ) + + interco_legend = [Patch(facecolor=INTERCO_COLORS[k], label=f"Actual {k}") for k in ("LOG", "MUX", "HCI")] + base_legend = [Patch(facecolor=IDEAL_COLOR, label="Max interco bandwidth")] + extra_legend = [Line2D([0], [0], color="red", linestyle="--", linewidth=1.8, label="Ideal workload bandwidth")] + ax.legend(handles=base_legend + interco_legend + extra_legend, loc="best") + fig.tight_layout() + fig.savefig(out_path, dpi=150) + plt.close(fig) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Plot sweep results from parsed transcript JSON files.") + parser.add_argument( + "--results-dir", + default="target/verif/results", + help="Directory containing parsed sweep JSON files (hardware_*.json).", + ) + parser.add_argument( + "--out-dir", + default="target/verif/results/plots", + help="Output directory for generated plots.", + ) + args = parser.parse_args() + + results_dir = Path(args.results_dir) + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + entries = _load_results(results_dir) + if not entries: + raise SystemExit(f"No sweep JSON files found in: {results_dir}") + + _plot_total_sim_time(entries, out_dir / "total_simulation_time.png") + _plot_per_master_avg_req_to_gnt(entries, 
out_dir / "avg_req_to_gnt_per_master.png") + _plot_bandwidth(entries, out_dir / "bandwidth_ideal_vs_actual.png") + + print(f"Plots written to: {out_dir}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/target/verif/scripts/run_sweep.sh b/target/verif/scripts/run_sweep.sh new file mode 100644 index 0000000..17f7404 --- /dev/null +++ b/target/verif/scripts/run_sweep.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +set -e + +# Make sure we are in hci root +if ! git_root=$(git rev-parse --show-toplevel 2>/dev/null) || [ "$(basename "$git_root")" != "hci" ]; then + echo "Error: This script must be run from within the hci project git repository." >&2 + exit 1 +fi +cd "$git_root" + +VERIF_DIR=./target/verif +RESULTS_DIR=$VERIF_DIR/results +PLOTS_DIR=$RESULTS_DIR/plots + +mkdir -p "$RESULTS_DIR" +rm -f "$RESULTS_DIR"/hardware_*.json + +# For each hardware*.json in target/verif/config/sweep_hardware, run simulation and parse transcript +for hardware_config in $VERIF_DIR/config/sweep_hardware/hardware_*.json; do + make clean-verif + echo "Running simulation with hardware config: $hardware_config" + HARDWARE_JSON="$hardware_config" GUI=0 make run-verif + python3 $VERIF_DIR/scripts/parse_vsim.py --transcript $VERIF_DIR/vsim/transcript --out "$RESULTS_DIR"/$(basename "$hardware_config" .json).json +done + +python3 $VERIF_DIR/scripts/plot_sweep_results.py --results-dir "$RESULTS_DIR" --out-dir "$PLOTS_DIR" diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index 78be03d..7f27050 100644 --- a/target/verif/src/simulation_report.sv +++ b/target/verif/src/simulation_report.sv @@ -63,11 +63,14 @@ module simulation_report logic missing_reads; // Ideal bandwidth: maximum data the memory system can serve per cycle. // Memory side: N_BANKS narrow ports, each DATA_WIDTH bits wide. - real ideal_bw_mem_side_bpc; // bits per cycle (memory-side ceiling) - // Master side: sum of all master bandwidths at 100% traffic. 
- real ideal_bw_master_side_bpc; // bits per cycle (master-side ceiling) - real ideal_bw_bpc; // min(mem, master) = bottleneck ideal BW + real ideal_bw_mem_side_bpc; // bits per cycle (memory-side ceiling) + // Interconnect side: HCI-facing narrow/wide initiator interfaces. + real ideal_bw_interco_side_bpc; // bits per cycle (interconnect-side ceiling) + real ideal_bw_bpc; // min(mem, interco) = bottleneck ideal BW real actual_bw_utilization; // throughput_complete / ideal_bw + int unsigned n_narrow_if_total; + int unsigned n_wide_if_total; + string interco_type_str; sum_req_to_gnt_latency_all = 0.0; average_req_to_gnt_latency_weighted = 0.0; @@ -193,15 +196,27 @@ module simulation_report // Ideal bandwidth computation. // Memory side: each of the N_BANKS banks can serve one DATA_WIDTH word per cycle. ideal_bw_mem_side_bpc = real'(N_BANKS) * real'(DATA_WIDTH); - // Master side: N_LOG_MASTERS narrow ports + N_HWPE wide ports, all at 100% traffic. - ideal_bw_master_side_bpc = real'(N_LOG_MASTERS) * real'(DATA_WIDTH) - + real'(N_HWPE) * real'(HWPE_WIDTH_FACT * DATA_WIDTH); + // Interconnect side: HCI interface ports (narrow + wide). + // NOTE: for MUX mode N_WIDE_HCI=1, i.e. one shared wide initiator. + n_narrow_if_total = N_NARROW_HCI + N_DMA + N_EXT; + n_wide_if_total = N_WIDE_HCI; + ideal_bw_interco_side_bpc = real'(n_narrow_if_total) * real'(DATA_WIDTH) + + real'(n_wide_if_total) * real'(HWPE_WIDTH_FACT * DATA_WIDTH); // Bottleneck = minimum of the two sides. - ideal_bw_bpc = (ideal_bw_mem_side_bpc < ideal_bw_master_side_bpc) - ? ideal_bw_mem_side_bpc : ideal_bw_master_side_bpc; + ideal_bw_bpc = (ideal_bw_mem_side_bpc < ideal_bw_interco_side_bpc) + ? ideal_bw_mem_side_bpc : ideal_bw_interco_side_bpc; // Utilization = actual / ideal. actual_bw_utilization = (ideal_bw_bpc > 0.0) ? 
(throughput_complete_i / ideal_bw_bpc * 100.0) : 0.0; + if (INTERCO_TYPE == LOG) begin + interco_type_str = "LOG"; + end else if (INTERCO_TYPE == HCI) begin + interco_type_str = "HCI"; + end else if (INTERCO_TYPE == MUX) begin + interco_type_str = "MUX"; + end else begin + interco_type_str = "UNKNOWN"; + end $display("------ Simulation Summary ------"); $display("\\\\HW CONFIG\\\\"); @@ -217,6 +232,10 @@ module simulation_report "Interconnect: SEL_LIC=%0d TS_BIT=%0d EXPFIFO=%0d", SEL_LIC, TS_BIT, EXPFIFO ); + $display( + "Interconnect-side: TYPE=%s N_NARROW_HCI=%0d N_WIDE_HCI=%0d N_DMA=%0d N_EXT=%0d", + interco_type_str, N_NARROW_HCI, N_WIDE_HCI, N_DMA, N_EXT + ); $display( "ID/address: IW=%0d ADDR_WIDTH=%0d ADDR_WIDTH_BANK=%0d", IW, ADDR_WIDTH, ADDR_WIDTH_BANK @@ -228,10 +247,10 @@ module simulation_report ideal_bw_mem_side_bpc, N_BANKS, DATA_WIDTH ); $display( - "Ideal BW (master side): %0.0f bit/cycle [%0d log x %0d bits + %0d hwpe x %0d bits]", - ideal_bw_master_side_bpc, - N_LOG_MASTERS, DATA_WIDTH, - N_HWPE, HWPE_WIDTH_FACT * DATA_WIDTH + "Ideal BW (interco side): %0.0f bit/cycle [%0d narrow-if x %0d bits + %0d wide-if x %0d bits]", + ideal_bw_interco_side_bpc, + n_narrow_if_total, DATA_WIDTH, + n_wide_if_total, HWPE_WIDTH_FACT * DATA_WIDTH ); $display( "Ideal BW (bottleneck): %0.0f bit/cycle", diff --git a/target/verif/verif.mk b/target/verif/verif.mk index 93d8282..e9f93e7 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -77,10 +77,10 @@ VERIF_CFG_MK := $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk \ config-verif: $(VERIF_CFG_MK) $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk: $(HARDWARE_JSON) $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) - $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py hardware $(dir $(HARDWARE_JSON)) $(HCI_VERIF_CFG_GEN_DIR) > $@ + $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py hardware $(HARDWARE_JSON) $(HCI_VERIF_CFG_GEN_DIR) > $@ $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk: 
$(TESTBENCH_JSON) $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) - $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py testbench $(dir $(TESTBENCH_JSON)) $(HCI_VERIF_CFG_GEN_DIR) > $@ + $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py testbench $(TESTBENCH_JSON) $(HCI_VERIF_CFG_GEN_DIR) > $@ $(HCI_VERIF_CFG_GEN_DIR): mkdir -p $@ @@ -157,6 +157,13 @@ clean-sim-verif: rm -f $(HCI_VERIF_DIR)/vsim/transcript rm -f $(HCI_VERIF_DIR)/vsim/vsim.wlf +################ +# Benchmarking # +################ + +benchmarking-sweep: + . $(HCI_VERIF_DIR)/scripts/run_sweep.sh + ########### # Helpers # ########### From f38178eab059d307933bb4a94da59062f8956f47 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Wed, 11 Mar 2026 16:21:57 +0100 Subject: [PATCH 22/25] verif/simvectors: Extract html and memory reporting --- .../verif/simvectors/hci_stimuli/__init__.py | 3 +- .../verif/simvectors/hci_stimuli/processor.py | 12 - target/verif/simvectors/html_report.py | 394 ++++++++++++++ target/verif/simvectors/main.py | 508 ++---------------- target/verif/simvectors/memory_report.py | 119 ++++ 5 files changed, 571 insertions(+), 465 deletions(-) delete mode 100644 target/verif/simvectors/hci_stimuli/processor.py create mode 100644 target/verif/simvectors/html_report.py create mode 100644 target/verif/simvectors/memory_report.py diff --git a/target/verif/simvectors/hci_stimuli/__init__.py b/target/verif/simvectors/hci_stimuli/__init__.py index 6381011..64b9b07 100644 --- a/target/verif/simvectors/hci_stimuli/__init__.py +++ b/target/verif/simvectors/hci_stimuli/__init__.py @@ -6,6 +6,5 @@ """ from .generator import StimuliGenerator -from .processor import pad_txt_files -__all__ = ['StimuliGenerator', 'pad_txt_files'] +__all__ = ['StimuliGenerator'] diff --git a/target/verif/simvectors/hci_stimuli/processor.py b/target/verif/simvectors/hci_stimuli/processor.py deleted file mode 100644 index fa8b51b..0000000 --- 
a/target/verif/simvectors/hci_stimuli/processor.py +++ /dev/null @@ -1,12 +0,0 @@ -"""Processor helpers: pad stimuli files to equal length.""" - - -def pad_txt_files(folder_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT): - """No-op: padding stimuli files to a common length is not needed. - - Each application_driver self-terminates when its transaction queue is - exhausted, and later-phase drivers are held in reset via clear_i until - their dependencies assert end_req_o. Cross-file length alignment would - only add unnecessary idle cycles at the end of early-phase files. - """ - pass diff --git a/target/verif/simvectors/html_report.py b/target/verif/simvectors/html_report.py new file mode 100644 index 0000000..98a2b1a --- /dev/null +++ b/target/verif/simvectors/html_report.py @@ -0,0 +1,394 @@ +"""HTML report generation for memory lifetime visualization.""" + +from pathlib import Path +import html +import math + + +def build_memory_lifetime_html( + *, + pattern_nodes, + driver_windows, + regions_timeline, + total_cycles, + mux_serialization_applied, + mux_phase_order, + schedule_has_cycle, + driver_name_fn, + interco_type, + n_core_cfg, + n_dma_cfg, + n_ext_cfg, + n_hwpe_cfg, + dw_narrow, + dw_wide, + n_narrow_hci_cfg, + n_wide_hci_cfg, + n_banks, +): + palette = [ + "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#17becf", + "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#9467bd", + "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", + "#e6ab02", "#a6761d", "#666666", + ] + + def _color_for_driver(drv_idx): + return palette[drv_idx % len(palette)] + + def _tick_step(total): + if total <= 10: + return 1 + raw = max(1, total // 10) + mag = 10 ** int(math.log10(raw)) + return int(math.ceil(raw / mag) * mag) + + total_for_plot = max(1, total_cycles) + chart_width = 1280 + x_left = 220 + x_right = 24 + plot_w = chart_width - x_left - x_right + row_h = 80 + y_top = 36 + exec_rows = [driver_windows[d] for d in sorted(driver_windows.keys())] + exec_h = y_top + row_h * 
len(exec_rows) + 72 + tick = _tick_step(total_for_plot) + ticks = list(range(0, total_for_plot + 1, tick)) + if ticks[-1] != total_for_plot: + ticks.append(total_for_plot) + + exec_svg = [] + exec_svg.append(f'') + exec_svg.append('') + exec_svg.append(f'Execution Timeline (transaction-count model)') + for t in ticks: + x = x_left + (t / total_for_plot) * plot_w + exec_svg.append(f'') + exec_svg.append(f'{t}') + exec_svg.append( + f'Transaction number' + ) + exec_svg.append( + f'' + 'Issued memory transactions (r/w) and computation cycles (i.e., req = 0) are both modeled here.' + '' + ) + + def _rw_mix_text(node): + rpct = node.get('traffic_read_pct') + if rpct is None: + read_bytes = 0 + write_bytes = 0 + for reg in node.get('regions', []): + label_l = str(reg.get('label', '')).lower() + size_b = max(0, int(reg.get('size', 0))) + if 'read' in label_l and 'write' not in label_l: + read_bytes += size_b + elif 'write' in label_l and 'read' not in label_l: + write_bytes += size_b + if (read_bytes + write_bytes) > 0: + rpct = int(round(100.0 * read_bytes / (read_bytes + write_bytes))) + else: + rpct = 50 + rpct = max(0, min(100, int(rpct))) + if rpct == 0: + return "100% writes" + if rpct == 100: + return "100% reads" + return f"{rpct}% reads / {100 - rpct}% writes" + + def _fmt_kib(bytes_v): + return f"{(max(0, int(bytes_v)) / 1024.0):.1f} KiB" + + def _outside_detail_lines(node, base_addr, size_kib): + regs = list(node.get('regions', [])) + label_map = {str(r.get('label', '')): r for r in regs} + has_matmul_regs = any(k in label_map for k in ('A(read)', 'B(read)', 'C(write)')) + is_matmul = node.get('mem_access_type') in {'matmul', 'matmul_phased'} or has_matmul_regs + + if is_matmul: + mat_lines = [] + for lbl, short in [('A(read)', 'A'), ('B(read)', 'B'), ('C(write)', 'C')]: + if lbl in label_map: + r = label_map[lbl] + mat_lines.append(f"{short}@0x{int(r['base']):08x} ({_fmt_kib(r['size'])})") + if not mat_lines: + return [f"@0x{base_addr:08x}", f"{size_kib:.1f} 
KiB"] + return mat_lines + + if regs: + r0 = regs[0] + return [f"@0x{int(r0['base']):08x}", f"{_fmt_kib(r0['size'])}"] + return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] + + for row_idx, row in enumerate(exec_rows): + y = y_top + row_idx * row_h + name = html.escape(row['name']) + label_color = "#111111" if row['is_hwpe'] else "#555555" + exec_svg.append(f'{name}') + exec_svg.append(f'') + nodes = [n for n in pattern_nodes if n['driver_idx'] == row['driver_idx']] + for node in nodes: + if node['end_cycle'] <= node['start_cycle']: + continue + x = x_left + (node['start_cycle'] / total_for_plot) * plot_w + w = max(1.0, ((node['end_cycle'] - node['start_cycle']) / total_for_plot) * plot_w) + color = _color_for_driver(node['driver_idx']) + base_addr = min((int(r.get('base', 0)) for r in node.get('regions', [])), default=0) + size_kib = (int(node['n_transactions']) * int(node.get('txn_bytes', 1))) / 1024.0 + rw_mix = _rw_mix_text(node) + line1_full = f"{node['job']}" + line2_full = f"{node['mem_access_type']}" + line3_full = rw_mix + outside_lines = _outside_detail_lines(node, base_addr, size_kib) + title = ( + f"{node['driver_name']} p{node['pattern_idx']} job={node['job']} " + f"[{node['start_cycle']}, {node['end_cycle']}) " + f"{node['mem_access_type']} n={node['n_transactions']} " + f"base=0x{base_addr:08x} {size_kib:.1f}KiB {rw_mix} " + f"{' | '.join(outside_lines)}" + ) + exec_svg.append('') + exec_svg.append(f'{html.escape(title)}') + exec_svg.append( + f'' + ) + line1 = html.escape(line1_full) + exec_svg.append( + f'{line1}' + ) + line2 = html.escape(line2_full) + exec_svg.append( + f'{line2}' + ) + line3 = html.escape(line3_full) + exec_svg.append( + f'{line3}' + ) + for ext_idx, ext in enumerate(outside_lines): + y_ext = y + 49 + (ext_idx * 9) + ext_txt = html.escape(ext) + exec_svg.append( + f'{ext_txt}' + ) + exec_svg.append('') + exec_svg.append('') + + region_rows = [regions_timeline[k] for k in sorted(regions_timeline.keys(), key=lambda k: (k[0], 
k[1], k[2]))] + overlap_rows = [] + overlaps_by_region = {i: [] for i in range(len(region_rows))} + for i in range(len(region_rows)): + a = region_rows[i] + for j in range(i + 1, len(region_rows)): + b = region_rows[j] + ov_base = max(a['base'], b['base']) + ov_end = min(a['end'], b['end']) + if ov_base <= ov_end: + ov_size = ov_end - ov_base + 1 + overlap_rows.append({ + 'a_idx': i, + 'b_idx': j, + 'ov_base': ov_base, + 'ov_end': ov_end, + 'ov_size': ov_size, + }) + overlaps_by_region[i].append((j, ov_base, ov_end, ov_size)) + overlaps_by_region[j].append((i, ov_base, ov_end, ov_size)) + + used_min = min((reg['base'] for reg in region_rows), default=0) + used_max = max((reg['end'] for reg in region_rows), default=0) + if used_max < used_min: + used_max = used_min + used_span = max(1, used_max - used_min + 1) + + map_left = 170 + map_right = 24 + map_plot_w = chart_width - map_left - map_right + map_h = 124 + bar_y = 56 + bar_h = 24 + region_map_svg = [] + region_map_svg.append(f'') + region_map_svg.append('') + region_map_svg.append( + f'' + f"Memory Region Blocks (used range 0x{used_min:08x} - 0x{used_max:08x})" + f"" + ) + region_map_svg.append( + f'' + ) + for pct in [0, 25, 50, 75, 100]: + x = map_left + (pct / 100.0) * map_plot_w + addr = used_min + int(((used_span - 1) * pct) / 100.0) + region_map_svg.append(f'') + region_map_svg.append( + f'0x{addr:08x}' + ) + for reg in region_rows: + x = map_left + ((reg['base'] - used_min) / used_span) * map_plot_w + w = max(1.0, (reg['size'] / used_span) * map_plot_w) + color = _color_for_driver(reg['accesses'][0]['driver_idx']) if reg['accesses'] else "#888888" + title = ( + f"{reg['label']} 0x{reg['base']:08x}-0x{reg['end']:08x} " + f"size={reg['size']}B accesses={len(reg['accesses'])}" + ) + region_map_svg.append( + f'' + f'{html.escape(title)}' + ) + if w >= 92: + region_map_svg.append( + f'{html.escape(reg["label"])}' + ) + region_map_svg.append('') + + legend_items = [] + for d in sorted(driver_windows.keys()): + 
n = driver_name_fn(d) + c = _color_for_driver(d) + legend_items.append( + f'' + f'' + f'{html.escape(n)}' + ) + + region_cards = [] + for idx, reg in enumerate(region_rows): + access_rows = [] + accesses = sorted(reg['accesses'], key=lambda a: (a['driver_idx'], a['pattern_idx'], a['start'])) + for acc in accesses: + desc = acc['description'] if acc['description'] else "-" + access_rows.append( + "" + f"{html.escape(acc['driver_name'])}" + f"p{acc['pattern_idx']}" + f"{html.escape(acc['job'])}" + f"{html.escape(desc)}" + f"[{acc['start']}, {acc['end']})" + "" + ) + overlap_refs = overlaps_by_region.get(idx, []) + if overlap_refs: + ov_txt = ", ".join( + [ + f"{html.escape(region_rows[j]['label'])} " + f"(0x{ovb:08x}-0x{ove:08x}, {ovs} B)" + for (j, ovb, ove, ovs) in overlap_refs + ] + ) + else: + ov_txt = "none" + region_cards.append( + "
" + f"
{html.escape(reg['label'])} | base=0x{reg['base']:08x} | end=0x{reg['end']:08x} | size={reg['size']} B
" + f"
Overlaps: {ov_txt}
" + "" + "" + "" + f"{''.join(access_rows)}" + "
Driver/HWPEPatternJobDescriptionModeled interval
" + "
" + ) + + overlap_table_rows = [] + for ov in overlap_rows: + a = region_rows[ov['a_idx']] + b = region_rows[ov['b_idx']] + overlap_table_rows.append( + "" + f"{html.escape(a['label'])} (0x{a['base']:08x}-0x{a['end']:08x})" + f"{html.escape(b['label'])} (0x{b['base']:08x}-0x{b['end']:08x})" + f"0x{ov['ov_base']:08x}" + f"0x{ov['ov_end']:08x}" + f"{ov['ov_size']}" + "" + ) + + note = "Timeline follows declared wait_for_jobs dependencies from workload.json." + note_2 = ( + "Time axis is transaction-count based only " + "(no interconnect conflict/stall/arbitration modeling)." + ) + note_2b = "Per-driver list order is still enforced by stimulus/fence sequencing." + note_3 = "" + if mux_serialization_applied: + note_3 = ( + "MUX mode: HWPE execution is serialized by job order, " + "with lower HWPE ID first inside each job (tb_hci-like)." + ) + if mux_phase_order: + note_3 += f" Job order: {', '.join(mux_phase_order)}." + note_3_html = f"
{html.escape(note_3)}
" if note_3 else "" + cycle_warning_html = ( + "

Warning: dependency cycle detected; fallback scheduling order used.

" + if schedule_has_cycle else "" + ) + overlap_html = ( + "" + f"{''.join(overlap_table_rows)}" + "
Region ARegion BOverlap BaseOverlap EndOverlap Size (B)
" + if overlap_table_rows else + "
No overlaps detected among used regions.
" + ) + + return ( + "" + "Memory Access Region View (Transaction-Count Model)" + "" + "

Memory Access Region View

" + f"
Drivers ({dw_narrow} bit): " + f"CORE={n_core_cfg}, DMA={n_dma_cfg}, EXT={n_ext_cfg}
" + f"
Drivers ({dw_wide} bit): HWPE={n_hwpe_cfg}
" + f"
Interconnect type: {html.escape(interco_type)} | " + f"Narrow master ports ({dw_narrow} bit): {n_narrow_hci_cfg} | " + f"Wide master ports ({dw_wide} bit): {n_wide_hci_cfg} | " + f"Slave ports (banks): {n_banks} | " + f"Total modeled time: {total_cycles} units
" + f"
{html.escape(note)}
" + f"
{html.escape(note_2)}
" + f"
{html.escape(note_2b)}
" + f"{note_3_html}" + f"{cycle_warning_html}" + "

Legend

" + f"{''.join(legend_items)}
" + "
" + f"{''.join(exec_svg)}" + "
" + "
" + f"{''.join(region_map_svg)}" + "
" + "

Region Usage Blocks

" + f"{''.join(region_cards)}" + "

Overlapping Regions

" + f"{overlap_html}" + "" + ) + + +def write_memory_lifetime_html(memory_lifetime_path: Path, **kwargs): + html_doc = build_memory_lifetime_html(**kwargs) + memory_lifetime_path.write_text(html_doc, encoding='utf-8') diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index 0cd3c12..3ff2f30 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -11,17 +11,20 @@ import json import math import sys -import html from pathlib import Path import argparse code_directory = Path(__file__).resolve().parent try: - from hci_stimuli import StimuliGenerator, pad_txt_files + from hci_stimuli import StimuliGenerator + from memory_report import write_memory_map_txt + from html_report import write_memory_lifetime_html except Exception: sys.path.insert(0, str(code_directory)) - from hci_stimuli import StimuliGenerator, pad_txt_files + from hci_stimuli import StimuliGenerator + from memory_report import write_memory_map_txt + from html_report import write_memory_lifetime_html def parse_args(argv=None): @@ -1128,457 +1131,64 @@ def _add_edge(src, dst): # ----------------------------------------------------------------------- # Build memory_map.txt # ----------------------------------------------------------------------- - word_bytes = DATA_WIDTH // 8 - bank_stride_bytes = N_BANKS * word_bytes - lines = [] - lines.append("=" * 72) - lines.append("MEMORY MAP REPORT") - lines.append(f" Total memory : {TOT_MEM_SIZE} KiB ({TOT_MEM_SIZE * 1024} B)") - lines.append(f" Banks : {N_BANKS} x {word_bytes} B/word (interleaved, stride {bank_stride_bytes} B)") - lines.append(f" Data width : {DATA_WIDTH} b LOG / {HWPE_WIDTH_FACT * DATA_WIDTH} b HWPE") - lines.append( - f" Drivers({DW_NARROW} bit) : " - f"CORE={N_CORE_CFG}, DMA={N_DMA_CFG}, EXT={N_EXT_CFG} (LOG total={N_LOG_CFG})" - ) - lines.append( - f" Drivers({DW_WIDE} bit) : " - f"HWPE={N_HWPE_CFG}" - ) - lines.append( - f" Interconnect type : {INTERCO_TYPE} | " - f"Narrow master ports 
({DW_NARROW} bit)={N_NARROW_HCI_CFG} | " - f"Wide master ports ({DW_WIDE} bit)={N_WIDE_HCI_CFG} | " - f"Slave ports (banks)={N_BANKS}" + memory_map_path = generated_dir / 'memory_map.txt' + report_text = write_memory_map_txt( + memory_map_path=memory_map_path, + total_mem_size_kib=TOT_MEM_SIZE, + n_banks=N_BANKS, + data_width=DATA_WIDTH, + hwpe_data_width=HWPE_WIDTH_FACT * DATA_WIDTH, + n_core_cfg=N_CORE_CFG, + n_dma_cfg=N_DMA_CFG, + n_ext_cfg=N_EXT_CFG, + n_log_cfg=N_LOG_CFG, + n_hwpe_cfg=N_HWPE_CFG, + interco_type=INTERCO_TYPE, + dw_narrow=DW_NARROW, + dw_wide=DW_WIDE, + n_narrow_hci_cfg=N_NARROW_HCI_CFG, + n_wide_hci_cfg=N_WIDE_HCI_CFG, + memory_map_entries=memory_map_entries, + job_to_drivers=job_to_drivers, + driver_name_fn=_driver_name, + n_drivers=N_DRIVERS, + fence_masks=fence_masks, + total_cycles=total_cycles, + mux_serialization_applied=mux_serialization_applied, + mux_phase_order=mux_phase_order, + schedule_has_cycle=schedule_has_cycle, + driver_windows=driver_windows, + pattern_nodes=pattern_nodes, + regions_timeline=regions_timeline, ) - lines.append("=" * 72) - for entry in memory_map_entries: - lines.append(f"\n [{entry['label']}] pattern={entry['pattern']} n_transactions={entry['n']}") - if 'info' in entry: - lines.append(f" {entry['info']}") - for k, v in entry.get('detail', {}).items(): - lines.append(f" {k:<14}: {v}") - lines.append("") - lines.append(" Job / dependency map:") - for job, drivers in sorted(job_to_drivers.items()): - driver_names = [_driver_name(d) for d in drivers] - lines.append(f" job '{job}': {', '.join(driver_names)}") - for i in range(N_DRIVERS): - name = _driver_name(i) - for f, mask in enumerate(fence_masks[i]): - if mask: - deps = [_driver_name(j) for j in range(N_DRIVERS) if mask & (1 << j)] - lines.append(f" {name} after pattern[{f}] (fence {f}) waits for: {', '.join(deps)}") - lines.append("") - lines.append(" Temporal schedule (transaction-count model):") - lines.append(f" Total modeled time: {total_cycles} units (1 
unit = 1 transaction)") - lines.append(" Note: Declared wait_for_jobs dependencies are used for scheduling.") - lines.append(" Note: Per-driver list order is also enforced (pattern p[i] -> p[i+1]).") - lines.append(" Note: No interconnect contention/stall timing is modeled.") - if mux_serialization_applied: - lines.append(" Note: MUX mode serializes HWPE execution by job order, then HWPE ID (tb_hci-like).") - lines.append(f" MUX job order: {', '.join(mux_phase_order)}") - if schedule_has_cycle: - lines.append(" WARNING: dependency cycle detected while scheduling; using fallback order.") - for d in range(N_DRIVERS): - if d not in driver_windows: - continue - w = driver_windows[d] - lines.append(f" {w['name']:<8}: [{w['start']:>6}, {w['end']:>6}) dur={w['end'] - w['start']:>6}") - for node in [n for n in pattern_nodes if n['driver_idx'] == d]: - reg_tokens = [] - for reg in node['regions']: - reg_tokens.append(f"{reg['label']}@0x{reg['base']:08x}+{reg['size']}B") - reg_text = ", ".join(reg_tokens) if reg_tokens else "no regions" - lines.append( - f" p{node['pattern_idx']} job={node['job']} " - f"[{node['start_cycle']},{node['end_cycle']}) " - f"type={node['mem_access_type']} n={node['n_transactions']} {reg_text}" - ) - lines.append("") - lines.append(" Memory region lifetimes:") - for key in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2])): - reg = regions_timeline[key] - users = sorted({a['driver_name'] for a in reg['accesses']}) - lines.append( - f" {reg['label']:<8} 0x{reg['base']:08x}-0x{reg['end']:08x} " - f"({reg['size']:>6} B) lifetime=[{reg['lifetime_start']},{reg['lifetime_end']}) " - f"users={', '.join(users)}" - ) - lines.append("=" * 72) - - report_text = "\n".join(lines) + "\n" print("\n" + report_text) - memory_map_path = generated_dir / 'memory_map.txt' - memory_map_path.write_text(report_text, encoding='utf-8') print(f"Memory map written: {memory_map_path}") # ----------------------------------------------------------------------- # 
Build memory_lifetime.html (simple SVG timeline view) # ----------------------------------------------------------------------- - palette = [ - "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#17becf", - "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#9467bd", - "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", - "#e6ab02", "#a6761d", "#666666", - ] - - def _color_for_driver(drv_idx): - return palette[drv_idx % len(palette)] - - def _tick_step(total): - if total <= 10: - return 1 - raw = max(1, total // 10) - mag = 10 ** int(math.log10(raw)) - return int(math.ceil(raw / mag) * mag) - - total_for_plot = max(1, total_cycles) - chart_width = 1280 - x_left = 220 - x_right = 24 - plot_w = chart_width - x_left - x_right - row_h = 80 - y_top = 36 - exec_rows = [driver_windows[d] for d in sorted(driver_windows.keys())] - exec_h = y_top + row_h * len(exec_rows) + 72 - tick = _tick_step(total_for_plot) - ticks = list(range(0, total_for_plot + 1, tick)) - if ticks[-1] != total_for_plot: - ticks.append(total_for_plot) - - exec_svg = [] - exec_svg.append(f'') - exec_svg.append('') - exec_svg.append(f'Execution Timeline (transaction-count model)') - for t in ticks: - x = x_left + (t / total_for_plot) * plot_w - exec_svg.append(f'') - exec_svg.append(f'{t}') - exec_svg.append( - f'Transaction number' - ) - exec_svg.append( - f'' - 'Issued memory transactions (r/w) and computation cycles (i.e., req = 0) are both modeled here.' 
- '' - ) - - def _rw_mix_text(node): - rpct = node.get('traffic_read_pct') - if rpct is None: - read_bytes = 0 - write_bytes = 0 - for reg in node.get('regions', []): - label_l = str(reg.get('label', '')).lower() - size_b = max(0, int(reg.get('size', 0))) - if 'read' in label_l and 'write' not in label_l: - read_bytes += size_b - elif 'write' in label_l and 'read' not in label_l: - write_bytes += size_b - if (read_bytes + write_bytes) > 0: - rpct = int(round(100.0 * read_bytes / (read_bytes + write_bytes))) - else: - rpct = 50 - rpct = max(0, min(100, int(rpct))) - if rpct == 0: - return "100% writes" - if rpct == 100: - return "100% reads" - return f"{rpct}% reads / {100 - rpct}% writes" - - def _fmt_kib(bytes_v): - return f"{(max(0, int(bytes_v)) / 1024.0):.1f} KiB" - - def _outside_detail_lines(node, base_addr, size_kib): - regs = list(node.get('regions', [])) - label_map = {str(r.get('label', '')): r for r in regs} - has_matmul_regs = any(k in label_map for k in ('A(read)', 'B(read)', 'C(write)')) - is_matmul = node.get('mem_access_type') in {'matmul', 'matmul_phased'} or has_matmul_regs - - if is_matmul: - mat_lines = [] - for lbl, short in [('A(read)', 'A'), ('B(read)', 'B'), ('C(write)', 'C')]: - if lbl in label_map: - r = label_map[lbl] - mat_lines.append(f"{short}@0x{int(r['base']):08x} ({_fmt_kib(r['size'])})") - if not mat_lines: - return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] - return mat_lines - - if regs: - r0 = regs[0] - return [f"@0x{int(r0['base']):08x}", f"{_fmt_kib(r0['size'])}"] - return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] - - for row_idx, row in enumerate(exec_rows): - y = y_top + row_idx * row_h - name = html.escape(row['name']) - label_color = "#111111" if row['is_hwpe'] else "#555555" - exec_svg.append(f'{name}') - exec_svg.append(f'') - nodes = [n for n in pattern_nodes if n['driver_idx'] == row['driver_idx']] - for node in nodes: - if node['end_cycle'] <= node['start_cycle']: - continue - x = x_left + 
(node['start_cycle'] / total_for_plot) * plot_w - w = max(1.0, ((node['end_cycle'] - node['start_cycle']) / total_for_plot) * plot_w) - color = _color_for_driver(node['driver_idx']) - base_addr = min((int(r.get('base', 0)) for r in node.get('regions', [])), default=0) - size_kib = (int(node['n_transactions']) * int(node.get('txn_bytes', 1))) / 1024.0 - rw_mix = _rw_mix_text(node) - line1_full = f"{node['job']}" - line2_full = f"{node['mem_access_type']}" - line3_full = rw_mix - outside_lines = _outside_detail_lines(node, base_addr, size_kib) - title = ( - f"{node['driver_name']} p{node['pattern_idx']} job={node['job']} " - f"[{node['start_cycle']}, {node['end_cycle']}) " - f"{node['mem_access_type']} n={node['n_transactions']} " - f"base=0x{base_addr:08x} {size_kib:.1f}KiB {rw_mix} " - f"{' | '.join(outside_lines)}" - ) - exec_svg.append('') - exec_svg.append(f'{html.escape(title)}') - exec_svg.append( - f'' - ) - line1 = html.escape(line1_full) - exec_svg.append( - f'{line1}' - ) - line2 = html.escape(line2_full) - exec_svg.append( - f'{line2}' - ) - line3 = html.escape(line3_full) - exec_svg.append( - f'{line3}' - ) - for ext_idx, ext in enumerate(outside_lines): - y_ext = y + 49 + (ext_idx * 9) - ext_txt = html.escape(ext) - exec_svg.append( - f'{ext_txt}' - ) - exec_svg.append('') - exec_svg.append('') - - region_rows = [regions_timeline[k] for k in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2]))] - overlap_rows = [] - overlaps_by_region = {i: [] for i in range(len(region_rows))} - for i in range(len(region_rows)): - a = region_rows[i] - for j in range(i + 1, len(region_rows)): - b = region_rows[j] - ov_base = max(a['base'], b['base']) - ov_end = min(a['end'], b['end']) - if ov_base <= ov_end: - ov_size = ov_end - ov_base + 1 - overlap_rows.append({ - 'a_idx': i, - 'b_idx': j, - 'ov_base': ov_base, - 'ov_end': ov_end, - 'ov_size': ov_size, - }) - overlaps_by_region[i].append((j, ov_base, ov_end, ov_size)) - overlaps_by_region[j].append((i, 
ov_base, ov_end, ov_size)) - - used_min = min((reg['base'] for reg in region_rows), default=0) - used_max = max((reg['end'] for reg in region_rows), default=0) - if used_max < used_min: - used_max = used_min - used_span = max(1, used_max - used_min + 1) - - map_left = 170 - map_right = 24 - map_plot_w = chart_width - map_left - map_right - map_h = 124 - bar_y = 56 - bar_h = 24 - region_map_svg = [] - region_map_svg.append(f'') - region_map_svg.append('') - region_map_svg.append( - f'' - f"Memory Region Blocks (used range 0x{used_min:08x} - 0x{used_max:08x})" - f"" - ) - region_map_svg.append( - f'' - ) - for pct in [0, 25, 50, 75, 100]: - x = map_left + (pct / 100.0) * map_plot_w - addr = used_min + int(((used_span - 1) * pct) / 100.0) - region_map_svg.append(f'') - region_map_svg.append( - f'0x{addr:08x}' - ) - for reg in region_rows: - x = map_left + ((reg['base'] - used_min) / used_span) * map_plot_w - w = max(1.0, (reg['size'] / used_span) * map_plot_w) - color = _color_for_driver(reg['accesses'][0]['driver_idx']) if reg['accesses'] else "#888888" - title = ( - f"{reg['label']} 0x{reg['base']:08x}-0x{reg['end']:08x} " - f"size={reg['size']}B accesses={len(reg['accesses'])}" - ) - region_map_svg.append( - f'' - f'{html.escape(title)}' - ) - if w >= 92: - region_map_svg.append( - f'{html.escape(reg["label"])}' - ) - region_map_svg.append('') - - legend_items = [] - for d in sorted(driver_windows.keys()): - n = _driver_name(d) - c = _color_for_driver(d) - legend_items.append( - f'' - f'' - f'{html.escape(n)}' - ) - - region_cards = [] - for idx, reg in enumerate(region_rows): - access_rows = [] - accesses = sorted(reg['accesses'], key=lambda a: (a['driver_idx'], a['pattern_idx'], a['start'])) - for acc in accesses: - desc = acc['description'] if acc['description'] else "-" - access_rows.append( - "" - f"{html.escape(acc['driver_name'])}" - f"p{acc['pattern_idx']}" - f"{html.escape(acc['job'])}" - f"{html.escape(desc)}" - f"[{acc['start']}, {acc['end']})" - "" - ) 
- overlap_refs = overlaps_by_region.get(idx, []) - if overlap_refs: - ov_txt = ", ".join( - [ - f"{html.escape(region_rows[j]['label'])} " - f"(0x{ovb:08x}-0x{ove:08x}, {ovs} B)" - for (j, ovb, ove, ovs) in overlap_refs - ] - ) - else: - ov_txt = "none" - region_cards.append( - "
" - f"
{html.escape(reg['label'])} | base=0x{reg['base']:08x} | end=0x{reg['end']:08x} | size={reg['size']} B
" - f"
Overlaps: {ov_txt}
" - "" - "" - "" - f"{''.join(access_rows)}" - "
Driver/HWPEPatternJobDescriptionModeled interval
" - "
" - ) - - overlap_table_rows = [] - for ov in overlap_rows: - a = region_rows[ov['a_idx']] - b = region_rows[ov['b_idx']] - overlap_table_rows.append( - "" - f"{html.escape(a['label'])} (0x{a['base']:08x}-0x{a['end']:08x})" - f"{html.escape(b['label'])} (0x{b['base']:08x}-0x{b['end']:08x})" - f"0x{ov['ov_base']:08x}" - f"0x{ov['ov_end']:08x}" - f"{ov['ov_size']}" - "" - ) - - note = "Timeline follows declared wait_for_jobs dependencies from workload.json." - note_2 = ( - "Time axis is transaction-count based only " - "(no interconnect conflict/stall/arbitration modeling)." - ) - note_2b = "Per-driver list order is still enforced by stimulus/fence sequencing." - note_3 = "" - if mux_serialization_applied: - note_3 = ( - "MUX mode: HWPE execution is serialized by job order, " - "with lower HWPE ID first inside each job (tb_hci-like)." - ) - if mux_phase_order: - note_3 += f" Job order: {', '.join(mux_phase_order)}." - note_3_html = f"
{html.escape(note_3)}
" if note_3 else "" - cycle_warning_html = ( - "

Warning: dependency cycle detected; fallback scheduling order used.

" - if schedule_has_cycle else "" - ) - overlap_html = ( - "" - f"{''.join(overlap_table_rows)}" - "
Region ARegion BOverlap BaseOverlap EndOverlap Size (B)
" - if overlap_table_rows else - "
No overlaps detected among used regions.
" - ) - - html_doc = ( - "" - "Memory Access Region View (Transaction-Count Model)" - "" - "

Memory Access Region View

" - f"
Drivers ({DW_NARROW} bit): " - f"CORE={N_CORE_CFG}, DMA={N_DMA_CFG}, EXT={N_EXT_CFG}
" - f"
Drivers ({DW_WIDE} bit): HWPE={N_HWPE_CFG}
" - f"
Interconnect type: {html.escape(INTERCO_TYPE)} | " - f"Narrow master ports ({DW_NARROW} bit): {N_NARROW_HCI_CFG} | " - f"Wide master ports ({DW_WIDE} bit): {N_WIDE_HCI_CFG} | " - f"Slave ports (banks): {N_BANKS} | " - f"Total modeled time: {total_cycles} units
" - f"
{html.escape(note)}
" - f"
{html.escape(note_2)}
" - f"
{html.escape(note_2b)}
" - f"{note_3_html}" - f"{cycle_warning_html}" - "

Legend

" - f"{''.join(legend_items)}
" - "
" - f"{''.join(exec_svg)}" - "
" - "
" - f"{''.join(region_map_svg)}" - "
" - "

Region Usage Blocks

" - f"{''.join(region_cards)}" - "

Overlapping Regions

" - f"{overlap_html}" - "" - ) - memory_lifetime_path = generated_dir / 'memory_lifetime.html' - memory_lifetime_path.write_text(html_doc, encoding='utf-8') + write_memory_lifetime_html( + memory_lifetime_path=memory_lifetime_path, + pattern_nodes=pattern_nodes, + driver_windows=driver_windows, + regions_timeline=regions_timeline, + total_cycles=total_cycles, + mux_serialization_applied=mux_serialization_applied, + mux_phase_order=mux_phase_order, + schedule_has_cycle=schedule_has_cycle, + driver_name_fn=_driver_name, + interco_type=INTERCO_TYPE, + n_core_cfg=N_CORE_CFG, + n_dma_cfg=N_DMA_CFG, + n_ext_cfg=N_EXT_CFG, + n_hwpe_cfg=N_HWPE_CFG, + dw_narrow=DW_NARROW, + dw_wide=DW_WIDE, + n_narrow_hci_cfg=N_NARROW_HCI_CFG, + n_wide_hci_cfg=N_WIDE_HCI_CFG, + n_banks=N_BANKS, + ) print(f"Memory lifetime plot written: {memory_lifetime_path}") # ----------------------------------------------------------------------- @@ -1590,11 +1200,7 @@ def _outside_detail_lines(node, base_addr, size_kib): original = fpath.read_text(encoding='ascii') fpath.write_text(idle_line * delay + original, encoding='ascii') - # ----------------------------------------------------------------------- - # Pad all stimuli files to equal length - # ----------------------------------------------------------------------- - pad_txt_files(str(stimuli_dir), IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT) - print("STEP 1 COMPLETED: pad stimuli files") + print("STEP 1 COMPLETED: generate documents and apply start delays to stimuli") # ----------------------------------------------------------------------- # Golden vectors diff --git a/target/verif/simvectors/memory_report.py b/target/verif/simvectors/memory_report.py new file mode 100644 index 0000000..2da9cfb --- /dev/null +++ b/target/verif/simvectors/memory_report.py @@ -0,0 +1,119 @@ +"""Text memory map report generation.""" + +from pathlib import Path + + +def build_memory_map_text( + *, + total_mem_size_kib, + n_banks, + data_width, + hwpe_data_width, + 
n_core_cfg, + n_dma_cfg, + n_ext_cfg, + n_log_cfg, + n_hwpe_cfg, + interco_type, + dw_narrow, + dw_wide, + n_narrow_hci_cfg, + n_wide_hci_cfg, + memory_map_entries, + job_to_drivers, + driver_name_fn, + n_drivers, + fence_masks, + total_cycles, + mux_serialization_applied, + mux_phase_order, + schedule_has_cycle, + driver_windows, + pattern_nodes, + regions_timeline, +): + word_bytes = data_width // 8 + bank_stride_bytes = n_banks * word_bytes + lines = [] + lines.append("=" * 72) + lines.append("MEMORY MAP REPORT") + lines.append(f" Total memory : {total_mem_size_kib} KiB ({total_mem_size_kib * 1024} B)") + lines.append(f" Banks : {n_banks} x {word_bytes} B/word (interleaved, stride {bank_stride_bytes} B)") + lines.append(f" Data width : {data_width} b LOG / {hwpe_data_width} b HWPE") + lines.append( + f" Drivers({dw_narrow} bit) : " + f"CORE={n_core_cfg}, DMA={n_dma_cfg}, EXT={n_ext_cfg} (LOG total={n_log_cfg})" + ) + lines.append( + f" Drivers({dw_wide} bit) : " + f"HWPE={n_hwpe_cfg}" + ) + lines.append( + f" Interconnect type : {interco_type} | " + f"Narrow master ports ({dw_narrow} bit)={n_narrow_hci_cfg} | " + f"Wide master ports ({dw_wide} bit)={n_wide_hci_cfg} | " + f"Slave ports (banks)={n_banks}" + ) + lines.append("=" * 72) + for entry in memory_map_entries: + lines.append(f"\n [{entry['label']}] pattern={entry['pattern']} n_transactions={entry['n']}") + if 'info' in entry: + lines.append(f" {entry['info']}") + for k, v in entry.get('detail', {}).items(): + lines.append(f" {k:<14}: {v}") + lines.append("") + lines.append(" Job / dependency map:") + for job, drivers in sorted(job_to_drivers.items()): + driver_names = [driver_name_fn(d) for d in drivers] + lines.append(f" job '{job}': {', '.join(driver_names)}") + for i in range(n_drivers): + name = driver_name_fn(i) + for f, mask in enumerate(fence_masks[i]): + if mask: + deps = [driver_name_fn(j) for j in range(n_drivers) if mask & (1 << j)] + lines.append(f" {name} after pattern[{f}] (fence {f}) waits 
for: {', '.join(deps)}") + lines.append("") + lines.append(" Temporal schedule (transaction-count model):") + lines.append(f" Total modeled time: {total_cycles} units (1 unit = 1 transaction)") + lines.append(" Note: Declared wait_for_jobs dependencies are used for scheduling.") + lines.append(" Note: Per-driver list order is also enforced (pattern p[i] -> p[i+1]).") + lines.append(" Note: No interconnect contention/stall timing is modeled.") + if mux_serialization_applied: + lines.append(" Note: MUX mode serializes HWPE execution by job order, then HWPE ID (tb_hci-like).") + lines.append(f" MUX job order: {', '.join(mux_phase_order)}") + if schedule_has_cycle: + lines.append(" WARNING: dependency cycle detected while scheduling; using fallback order.") + for d in range(n_drivers): + if d not in driver_windows: + continue + w = driver_windows[d] + lines.append(f" {w['name']:<8}: [{w['start']:>6}, {w['end']:>6}) dur={w['end'] - w['start']:>6}") + for node in [n for n in pattern_nodes if n['driver_idx'] == d]: + reg_tokens = [] + for reg in node['regions']: + reg_tokens.append(f"{reg['label']}@0x{reg['base']:08x}+{reg['size']}B") + reg_text = ", ".join(reg_tokens) if reg_tokens else "no regions" + lines.append( + f" p{node['pattern_idx']} job={node['job']} " + f"[{node['start_cycle']},{node['end_cycle']}) " + f"type={node['mem_access_type']} n={node['n_transactions']} {reg_text}" + ) + lines.append("") + lines.append(" Memory region lifetimes:") + for key in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2])): + reg = regions_timeline[key] + users = sorted({a['driver_name'] for a in reg['accesses']}) + lines.append( + f" {reg['label']:<8} 0x{reg['base']:08x}-0x{reg['end']:08x} " + f"({reg['size']:>6} B) lifetime=[{reg['lifetime_start']},{reg['lifetime_end']}) " + f"users={', '.join(users)}" + ) + lines.append("=" * 72) + + return "\n".join(lines) + "\n" + + +def write_memory_map_txt(memory_map_path: Path, **kwargs): + report_text = 
build_memory_map_text(**kwargs) + memory_map_path.write_text(report_text, encoding='utf-8') + return report_text From 9fed2aa447ffc4d36e30d7cab7eb428a4cbf3a59 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Wed, 11 Mar 2026 16:22:41 +0100 Subject: [PATCH 23/25] verif/exploration: Move exploration stuff to subdir --- .gitignore | 2 +- target/verif/config/workload.json | 378 ++++++++++++++- .../config/workloads/workload_dma_gemm.json | 180 -------- .../hardware}/hardware_hci_2hwpe_8fact.json | 0 .../hardware}/hardware_log_2hwpe_8fact.json | 0 .../hardware}/hardware_mux_2hwpe_8fact.json | 0 .../workloads/workload_dma_gemm_cores.json | 0 .../workload_dma_gemm_cores_ideal.json} | 43 +- target/verif/exploration/exploration.mk | 17 + .../{ => exploration}/scripts/parse_vsim.py | 0 .../scripts/plot_sweep_results.py | 14 +- target/verif/exploration/scripts/run_sweep.sh | 46 ++ .../results/hardware_hci_2hwpe_8fact.json | 431 ------------------ target/verif/scripts/run_sweep.sh | 27 -- target/verif/simvectors/main.py | 11 +- target/verif/verif.mk | 10 +- 16 files changed, 472 insertions(+), 687 deletions(-) delete mode 100644 target/verif/config/workloads/workload_dma_gemm.json rename target/verif/{config/sweep_hardware => exploration/config/hardware}/hardware_hci_2hwpe_8fact.json (100%) rename target/verif/{config/sweep_hardware => exploration/config/hardware}/hardware_log_2hwpe_8fact.json (100%) rename target/verif/{config/sweep_hardware => exploration/config/hardware}/hardware_mux_2hwpe_8fact.json (100%) rename target/verif/{ => exploration}/config/workloads/workload_dma_gemm_cores.json (100%) rename target/verif/{config/workloads/workload_dma_gemm_ideal.json => exploration/config/workloads/workload_dma_gemm_cores_ideal.json} (75%) create mode 100644 target/verif/exploration/exploration.mk rename target/verif/{ => exploration}/scripts/parse_vsim.py (100%) rename target/verif/{ => exploration}/scripts/plot_sweep_results.py (97%) create mode 100644 
target/verif/exploration/scripts/run_sweep.sh delete mode 100644 target/verif/results/hardware_hci_2hwpe_8fact.json delete mode 100644 target/verif/scripts/run_sweep.sh diff --git a/.gitignore b/.gitignore index 0fbe9ba..6212410 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,4 @@ target/verif/vsim/compile.tcl target/verif/vsim/modelsim.ini target/verif/vsim/transcript target/verif/vsim/vsim.wlf -target/verif/results +target/verif/exploration/results diff --git a/target/verif/config/workload.json b/target/verif/config/workload.json index 8299bf9..aa2da43 100644 --- a/target/verif/config/workload.json +++ b/target/verif/config/workload.json @@ -1,14 +1,374 @@ { - "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes.", + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes. Cores generate linear 50% traffic after each GEMM on dedicated non-overlapping regions.", "log_masters": [ - { "id": 0, "description": "Core 0 (idle)", "mem_access_type": "idle" }, - { "id": 1, "description": "Core 1 (idle)", "mem_access_type": "idle" }, - { "id": 2, "description": "Core 2 (idle)", "mem_access_type": "idle" }, - { "id": 3, "description": "Core 3 (idle)", "mem_access_type": "idle" }, - { "id": 4, "description": "Core 4 (idle)", "mem_access_type": "idle" }, - { "id": 5, "description": "Core 5 (idle)", "mem_access_type": "idle" }, - { "id": 6, "description": "Core 6 (idle)", "mem_access_type": "idle" }, - { "id": 7, "description": "Core 7 (idle)", "mem_access_type": "idle" }, + { + "id": 0, + "description": "Core 0 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 0 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core0_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A1", 
+ "mem_access_type": "linear", + "job": "core0_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core0_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core0_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 1, + "description": "Core 1 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 1 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core1_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core1_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core1_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core1_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 2, + "description": "Core 2 post-GEMM background traffic", + "patterns": [ + { + 
"description": "Core 2 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core2_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core2_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core2_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core2_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 3, + "description": "Core 3 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 3 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core3_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core3_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core3_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A3", + 
"mem_access_type": "linear", + "job": "core3_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 4, + "description": "Core 4 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 4 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core4_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core4_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core4_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core4_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 5, + "description": "Core 5 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 5 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core5_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core5_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + 
"description": "Core 5 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core5_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core5_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 6, + "description": "Core 6 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 6 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core6_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core6_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core6_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core6_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 7, + "description": "Core 7 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 7 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core7_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", 
+ "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core7_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core7_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core7_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + } + ] + }, { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } ], diff --git a/target/verif/config/workloads/workload_dma_gemm.json b/target/verif/config/workloads/workload_dma_gemm.json deleted file mode 100644 index 8299bf9..0000000 --- a/target/verif/config/workloads/workload_dma_gemm.json +++ /dev/null @@ -1,180 +0,0 @@ -{ - "description": "Simple 4-tile double-buffer GEMM. 
DMA uses linear transfers sized only by region_size_bytes.", - "log_masters": [ - { "id": 0, "description": "Core 0 (idle)", "mem_access_type": "idle" }, - { "id": 1, "description": "Core 1 (idle)", "mem_access_type": "idle" }, - { "id": 2, "description": "Core 2 (idle)", "mem_access_type": "idle" }, - { "id": 3, "description": "Core 3 (idle)", "mem_access_type": "idle" }, - { "id": 4, "description": "Core 4 (idle)", "mem_access_type": "idle" }, - { "id": 5, "description": "Core 5 (idle)", "mem_access_type": "idle" }, - { "id": 6, "description": "Core 6 (idle)", "mem_access_type": "idle" }, - { "id": 7, "description": "Core 7 (idle)", "mem_access_type": "idle" }, - { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } - ], - - "hwpe_masters": [ - { - "id": 0, - "description": "DMA engine (linear setup + ping/pong prefetch)", - "patterns": [ - { - "description": "Setup preload: B matrix (8 KiB)", - "mem_access_type": "linear", - "job": "dma_load_B", - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x00000", - "region_size_bytes": 8192 - }, - { - "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", - "mem_access_type": "linear", - "job": "dma_load_A0_buf0", - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x02000", - "region_size_bytes": 8192 - }, - { - "description": "Setup preload: tile A1 into buffer #1 (8 KiB)", - "mem_access_type": "linear", - "job": "dma_load_A1_buf1", - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x04000", - "region_size_bytes": 8192 - }, - { - "description": "Prefetch tile A2 into buffer #0 (8 KiB)", - "mem_access_type": "linear", - "job": "dma_load_A2_buf0", - "wait_for_jobs": ["gemm_A0"], - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x02000", - "region_size_bytes": 8192 - }, - { - "description": "Read back C0 after GEMM tile A0", - "mem_access_type": "linear", - "job": "dma_store_C0_buf2", - "wait_for_jobs": 
["gemm_A0"], - "traffic_pct": 100, - "traffic_read_pct": 100, - "region_base_address": "0x06000", - "region_size_bytes": 8192 - }, - { - "description": "Prefetch tile A3 into buffer #1 (8 KiB)", - "mem_access_type": "linear", - "job": "dma_load_A3_buf1", - "wait_for_jobs": ["gemm_A1"], - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x04000", - "region_size_bytes": 8192 - }, - { - "description": "Read back C1 after GEMM tile A1", - "mem_access_type": "linear", - "job": "dma_store_C1_buf3", - "wait_for_jobs": ["gemm_A1"], - "traffic_pct": 100, - "traffic_read_pct": 100, - "region_base_address": "0x08000", - "region_size_bytes": 8192 - }, - { - "description": "Read back C0 after GEMM tile A2", - "mem_access_type": "linear", - "job": "dma_store_C2_buf2", - "wait_for_jobs": ["gemm_A2"], - "traffic_pct": 100, - "traffic_read_pct": 100, - "region_base_address": "0x06000", - "region_size_bytes": 8192 - }, - { - "description": "Read back C1 after GEMM tile A3", - "mem_access_type": "linear", - "job": "dma_store_C3_buf3", - "wait_for_jobs": ["gemm_A3"], - "traffic_pct": 100, - "traffic_read_pct": 100, - "region_base_address": "0x08000", - "region_size_bytes": 8192 - } - ] - }, - { - "id": 1, - "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", - "patterns": [ - { - "description": "GEMM tile A0: A0 x B -> C0", - "mem_access_type": "matmul_phased", - "job": "gemm_A0", - "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], - "traffic_pct": 90, - "matrix_m": 16, - "matrix_n": 16, - "matrix_k": 16, - "region_base_address_a": "0x02000", - "region_size_bytes_a": 8192, - "region_base_address_b": "0x00000", - "region_size_bytes_b": 8192, - "region_base_address_c": "0x06000", - "region_size_bytes_c": 8192 - }, - { - "description": "GEMM tile A1: A1 x B -> C1", - "mem_access_type": "matmul_phased", - "job": "gemm_A1", - "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], - "traffic_pct": 90, - "matrix_m": 16, - "matrix_n": 16, - 
"matrix_k": 16, - "region_base_address_a": "0x04000", - "region_size_bytes_a": 8192, - "region_base_address_b": "0x00000", - "region_size_bytes_b": 8192, - "region_base_address_c": "0x08000", - "region_size_bytes_c": 8192 - }, - { - "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", - "mem_access_type": "matmul_phased", - "job": "gemm_A2", - "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], - "traffic_pct": 90, - "matrix_m": 16, - "matrix_n": 16, - "matrix_k": 16, - "region_base_address_a": "0x02000", - "region_size_bytes_a": 8192, - "region_base_address_b": "0x00000", - "region_size_bytes_b": 8192, - "region_base_address_c": "0x06000", - "region_size_bytes_c": 8192 - }, - { - "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", - "mem_access_type": "matmul_phased", - "job": "gemm_A3", - "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], - "traffic_pct": 90, - "matrix_m": 16, - "matrix_n": 16, - "matrix_k": 16, - "region_base_address_a": "0x04000", - "region_size_bytes_a": 8192, - "region_base_address_b": "0x00000", - "region_size_bytes_b": 8192, - "region_base_address_c": "0x08000", - "region_size_bytes_c": 8192 - } - ] - } - ] -} diff --git a/target/verif/config/sweep_hardware/hardware_hci_2hwpe_8fact.json b/target/verif/exploration/config/hardware/hardware_hci_2hwpe_8fact.json similarity index 100% rename from target/verif/config/sweep_hardware/hardware_hci_2hwpe_8fact.json rename to target/verif/exploration/config/hardware/hardware_hci_2hwpe_8fact.json diff --git a/target/verif/config/sweep_hardware/hardware_log_2hwpe_8fact.json b/target/verif/exploration/config/hardware/hardware_log_2hwpe_8fact.json similarity index 100% rename from target/verif/config/sweep_hardware/hardware_log_2hwpe_8fact.json rename to target/verif/exploration/config/hardware/hardware_log_2hwpe_8fact.json diff --git a/target/verif/config/sweep_hardware/hardware_mux_2hwpe_8fact.json 
b/target/verif/exploration/config/hardware/hardware_mux_2hwpe_8fact.json similarity index 100% rename from target/verif/config/sweep_hardware/hardware_mux_2hwpe_8fact.json rename to target/verif/exploration/config/hardware/hardware_mux_2hwpe_8fact.json diff --git a/target/verif/config/workloads/workload_dma_gemm_cores.json b/target/verif/exploration/config/workloads/workload_dma_gemm_cores.json similarity index 100% rename from target/verif/config/workloads/workload_dma_gemm_cores.json rename to target/verif/exploration/config/workloads/workload_dma_gemm_cores.json diff --git a/target/verif/config/workloads/workload_dma_gemm_ideal.json b/target/verif/exploration/config/workloads/workload_dma_gemm_cores_ideal.json similarity index 75% rename from target/verif/config/workloads/workload_dma_gemm_ideal.json rename to target/verif/exploration/config/workloads/workload_dma_gemm_cores_ideal.json index b166c78..c8ab84c 100644 --- a/target/verif/config/workloads/workload_dma_gemm_ideal.json +++ b/target/verif/exploration/config/workloads/workload_dma_gemm_cores_ideal.json @@ -1,14 +1,29 @@ { - "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes.", + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes. 
Cores generate linear 50% traffic after each GEMM on dedicated non-overlapping regions.", "log_masters": [ - { "id": 0, "description": "Core 0 (idle)", "mem_access_type": "idle" }, - { "id": 1, "description": "Core 1 (idle)", "mem_access_type": "idle" }, - { "id": 2, "description": "Core 2 (idle)", "mem_access_type": "idle" }, - { "id": 3, "description": "Core 3 (idle)", "mem_access_type": "idle" }, - { "id": 4, "description": "Core 4 (idle)", "mem_access_type": "idle" }, - { "id": 5, "description": "Core 5 (idle)", "mem_access_type": "idle" }, - { "id": 6, "description": "Core 6 (idle)", "mem_access_type": "idle" }, - { "id": 7, "description": "Core 7 (idle)", "mem_access_type": "idle" }, + {"id": 0, "description": "Core 0 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 1, "description": "Core 1 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 2, "description": "Core 2 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 3, "description": "Core 3 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 4, "description": "Core 4 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 5, "description": "Core 5 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 6, "description": "Core 6 post-GEMM background traffic", "mem_access_type": "idle" }, + { + "id": 7, + "description": "Core 7 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 7 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core7_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + } + ] + }, { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } ], @@ -34,16 +49,6 @@ "traffic_read_pct": 0, "region_base_address": "0x02000", "region_size_bytes": 8192 - }, - { - "description": "Read back C1 after GEMM tile A3", - "mem_access_type": "linear", - 
"job": "dma_store_C3_buf3", - "wait_for_jobs": ["gemm_A3"], - "traffic_pct": 100, - "traffic_read_pct": 100, - "region_base_address": "0x08000", - "region_size_bytes": 8192 } ] }, diff --git a/target/verif/exploration/exploration.mk b/target/verif/exploration/exploration.mk new file mode 100644 index 0000000..1a93e7b --- /dev/null +++ b/target/verif/exploration/exploration.mk @@ -0,0 +1,17 @@ +# Copyright 2026 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. +# SPDX-License-Identifier: SHL-0.51 +# +# Sergio Mazzola + +HCI_VERIF_EXPL_DIR = $(HCI_ROOT)/target/verif/exploration + +################ +# Benchmarking # +################ + +# Modify this script to configure parameters (e.g., workload to run) +BENCHMARK_SCRIPT := $(HCI_VERIF_EXPL_DIR)/scripts/run_sweep.sh + +benchmarking-sweep: + . $(BENCHMARK_SCRIPT) \ No newline at end of file diff --git a/target/verif/scripts/parse_vsim.py b/target/verif/exploration/scripts/parse_vsim.py similarity index 100% rename from target/verif/scripts/parse_vsim.py rename to target/verif/exploration/scripts/parse_vsim.py diff --git a/target/verif/scripts/plot_sweep_results.py b/target/verif/exploration/scripts/plot_sweep_results.py similarity index 97% rename from target/verif/scripts/plot_sweep_results.py rename to target/verif/exploration/scripts/plot_sweep_results.py index cd24686..786561e 100644 --- a/target/verif/scripts/plot_sweep_results.py +++ b/target/verif/exploration/scripts/plot_sweep_results.py @@ -18,7 +18,7 @@ from matplotlib.patches import Patch # Cycles of ideal workload runtime -IDEAL_WORKLOAD_DURATION = 3866.0 +IDEAL_WORKLOAD_RUNTIME = 4121.0 INTERCO_ORDER = {"LOG": 0, "MUX": 1, "HCI": 2} INTERCO_COLORS = {"LOG": "#1f77b4", "MUX": "#9467bd", "HCI": "#ff7f0e"} @@ -176,7 +176,7 @@ def _plot_total_sim_time(entries: List[Dict[str, object]], out_path: Path) -> No for bar, val in zip(bars, values): if math.isnan(val): continue - mult_of_ideal = (val 
/ IDEAL_WORKLOAD_DURATION) if val > 0 and IDEAL_WORKLOAD_DURATION > 0 else float("nan") + mult_of_ideal = (val / IDEAL_WORKLOAD_RUNTIME) if val > 0 and IDEAL_WORKLOAD_RUNTIME > 0 else float("nan") pct_txt = "n/a" if math.isnan(mult_of_ideal) else f"{mult_of_ideal:.2f}X of ideal runtime" ax.text( bar.get_x() + bar.get_width() / 2.0, @@ -188,11 +188,11 @@ def _plot_total_sim_time(entries: List[Dict[str, object]], out_path: Path) -> No ) ax.axhline( - y=IDEAL_WORKLOAD_DURATION, + y=IDEAL_WORKLOAD_RUNTIME, color="red", linestyle="--", linewidth=1.6, - label=f"Ideal workload runtime ({IDEAL_WORKLOAD_DURATION:.0f} cycles)", + label=f"Ideal workload runtime ({IDEAL_WORKLOAD_RUNTIME:.0f} cycles)", ) legend = [Patch(facecolor=INTERCO_COLORS[k], label=k) for k in ("LOG", "MUX", "HCI")] @@ -273,10 +273,10 @@ def _plot_bandwidth(entries: List[Dict[str, object]], out_path: Path) -> None: sim_cycles_vals = [e["total_sim_cycles"] for e in entries] ideal_app_vals = [] for actual_bw, sim_cycles in zip(actual_vals, sim_cycles_vals): - if math.isnan(actual_bw) or math.isnan(sim_cycles) or IDEAL_WORKLOAD_DURATION <= 0.0: + if math.isnan(actual_bw) or math.isnan(sim_cycles) or IDEAL_WORKLOAD_RUNTIME <= 0.0: ideal_app_vals.append(float("nan")) else: - ideal_app_vals.append(actual_bw * sim_cycles / IDEAL_WORKLOAD_DURATION) + ideal_app_vals.append(actual_bw * sim_cycles / IDEAL_WORKLOAD_RUNTIME) actual_colors = [INTERCO_COLORS.get(e["interco_type"], "#333333") for e in entries] x = np.arange(len(entries), dtype=float) width = 0.34 @@ -323,7 +323,7 @@ def _plot_bandwidth(entries: List[Dict[str, object]], out_path: Path) -> None: ) # Ideal app BW computed from moved data and ideal application duration: - # ideal_app_bw = effective_bw * total_real_sim_time / IDEAL_WORKLOAD_DURATION + # ideal_app_bw = effective_bw * total_real_sim_time / IDEAL_WORKLOAD_RUNTIME valid_ideal_app_vals = [v for v in ideal_app_vals if not math.isnan(v)] if valid_ideal_app_vals: ideal_workload_bw = 
sum(valid_ideal_app_vals) / len(valid_ideal_app_vals)
diff --git a/target/verif/exploration/scripts/run_sweep.sh b/target/verif/exploration/scripts/run_sweep.sh
new file mode 100644
index 0000000..8759a9f
--- /dev/null
+++ b/target/verif/exploration/scripts/run_sweep.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+set -e
+
+# Make sure we are in hci root
+if ! git_root=$(git rev-parse --show-toplevel 2>/dev/null) || [ "$(basename "$git_root")" != "hci" ]; then
+  echo "Error: This script must be run from within the hci project git repository." >&2
+  exit 1
+fi
+cd "$git_root"
+
+# Directories
+VERIF_DIR=./target/verif
+VERIF_EXPL_DIR=$VERIF_DIR/exploration
+RESULTS_DIR=$VERIF_EXPL_DIR/results
+PLOTS_DIR=$RESULTS_DIR/plots
+
+RUN_NAME=gemm_cores_double_buffer
+
+# Makefile settings (verif.mk)
+export GUI=0
+export WORKLOAD_JSON=$VERIF_EXPL_DIR/config/workloads/workload_dma_gemm_cores.json
+export TESTBENCH_JSON=$VERIF_DIR/config/testbench.json
+# HARDWARE_JSON is swept in the loop below
+
+mkdir -p "$RESULTS_DIR/$RUN_NAME"
+
+# For each hardware*.json, run simulation and parse transcript
+for hardware_config in $VERIF_EXPL_DIR/config/hardware/hardware_*.json; do
+  make clean-verif
+  echo -e "\033[32;1mRunning simulation with hardware config: $hardware_config\033[0m"
+  export HARDWARE_JSON="$hardware_config"
+  make run-verif
+  python3 $VERIF_EXPL_DIR/scripts/parse_vsim.py --transcript $VERIF_DIR/vsim/transcript --out $RESULTS_DIR/$RUN_NAME/$(basename "$hardware_config" .json).json
+  # Copy html report
+  cp $VERIF_DIR/simvectors/generated/dataflow.html $RESULTS_DIR/$RUN_NAME/$(basename "$hardware_config" .json).html
+done
+
+python3 $VERIF_EXPL_DIR/scripts/plot_sweep_results.py --results-dir $RESULTS_DIR/$RUN_NAME --out-dir $RESULTS_DIR/$RUN_NAME
+
+# Ideal run (manual step: first rerun the sweep with the ideal workload config, then parse that transcript here)
+# do not forget to change IDEAL_WORKLOAD_RUNTIME in plot_sweep_results.py
+python3 $VERIF_EXPL_DIR/scripts/parse_vsim.py --transcript $VERIF_DIR/vsim/transcript --out $RESULTS_DIR/$RUN_NAME/ideal.json
+cp 
$VERIF_DIR/simvectors/generated/dataflow.html $RESULTS_DIR/$RUN_NAME/ideal.html +# Regenerate plots with correct IDEAL_WORKLOAD_RUNTIME +python3 $VERIF_EXPL_DIR/scripts/plot_sweep_results.py --results-dir $RESULTS_DIR/$RUN_NAME --out-dir $RESULTS_DIR/$RUN_NAME diff --git a/target/verif/results/hardware_hci_2hwpe_8fact.json b/target/verif/results/hardware_hci_2hwpe_8fact.json deleted file mode 100644 index 1fd0fd4..0000000 --- a/target/verif/results/hardware_hci_2hwpe_8fact.json +++ /dev/null @@ -1,431 +0,0 @@ -{ - "hw_config": { - "masters": { - "core": 8, - "dma": 0, - "ext": 1, - "hwpe": 2, - "total": 11 - }, - "memory": { - "banks": 64, - "total_size_kb": 256, - "data_width_bits": 32, - "hwpe_width_lanes": 8 - }, - "interconnect": { - "sel_lic": 0, - "ts_bit": 21, - "expfifo": 0 - }, - "interconnect_side": { - "type": "HCI", - "n_narrow_hci": 8, - "n_wide_hci": 2, - "n_dma": 0, - "n_ext": 1, - "narrow_total_ports": 9, - "total_initiator_ports": 11 - }, - "id_address": { - "iw": 11, - "addr_width": 18, - "addr_width_bank": 12 - } - }, - "bandwidth": { - "ideal_memory_side_bit_per_cycle": 2048.0, - "ideal_interconnect_side_bit_per_cycle": 800.0, - "ideal_bottleneck_bit_per_cycle": 800.0, - "actual_completion_bit_per_cycle": 306.65, - "actual_completion_utilization_pct": 38.3, - "completion_phase_duration_cycles": 4488.0, - "granted_transactions": { - "reads": 3072, - "writes": 2304, - "total": 5376 - }, - "read_complete_responses": 3072 - }, - "simulation_time": { - "per_master": [ - { - "master_name": "master_hwpe_0", - "role_name": "HWPE0", - "sim_time_cycles": 4488.0 - }, - { - "master_name": "master_hwpe_1", - "role_name": "HWPE1", - "sim_time_cycles": 4228.0 - }, - { - "master_name": "master_log_0", - "role_name": "Core0", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_1", - "role_name": "Core1", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_2", - "role_name": "Core2", - "sim_time_cycles": 3.0 - }, - { - "master_name": 
"master_log_3", - "role_name": "Core3", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_4", - "role_name": "Core4", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_5", - "role_name": "Core5", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_6", - "role_name": "Core6", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_7", - "role_name": "Core7", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_8", - "role_name": "EXT0", - "sim_time_cycles": 1.0 - } - ], - "total_cycles": 4488.0 - }, - "read_response_coverage": { - "master_log_0": { - "observed": 0, - "expected": 0 - }, - "master_log_1": { - "observed": 0, - "expected": 0 - }, - "master_log_2": { - "observed": 0, - "expected": 0 - }, - "master_log_3": { - "observed": 0, - "expected": 0 - }, - "master_log_4": { - "observed": 0, - "expected": 0 - }, - "master_log_5": { - "observed": 0, - "expected": 0 - }, - "master_log_6": { - "observed": 0, - "expected": 0 - }, - "master_log_7": { - "observed": 0, - "expected": 0 - }, - "master_log_8": { - "observed": 0, - "expected": 0 - }, - "master_hwpe_0": { - "observed": 1024, - "expected": 1024 - }, - "master_hwpe_1": { - "observed": 2048, - "expected": 2048 - } - }, - "transaction_counts": { - "master_log_0": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_1": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_2": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_3": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_4": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_5": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_6": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_7": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - 
"master_log_8": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_hwpe_0": { - "granted_reads": 1024, - "granted_writes": 1280, - "read_complete": 1024 - }, - "master_hwpe_1": { - "granted_reads": 2048, - "granted_writes": 1024, - "read_complete": 2048 - } - }, - "request_to_grant_latency": { - "per_master": [ - { - "master_name": "master_hwpe_0", - "avg_req_to_gnt_stall_latency_cycles": 0.63, - "req_to_gnt_grants": 2304 - }, - { - "master_name": "master_hwpe_1", - "avg_req_to_gnt_stall_latency_cycles": 0.18, - "req_to_gnt_grants": 3072 - }, - { - "master_name": "master_log_0", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_1", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_2", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_3", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_4", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_5", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_6", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_7", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_8", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - } - ], - "accumulated": { - "cycles": 1987.0, - "grants": 5376 - }, - "averages": { - "log": { - "weighted_cycles": 0.0, - "unweighted_cycles": 0.0 - }, - "hwpe": { - "weighted_cycles": 0.37, - "unweighted_cycles": 0.4 - }, - "global": { - "weighted_cycles": 0.37, - "unweighted_cycles": 0.4 - } - } - }, - "finish": { - "source": 
"/scratch/smazzola/projects/hci/hci/target/verif/src/simulation_report.sv", - "line": 464, - "time_ps": 224950, - "iteration": 4, - "instance": "/tb_hci/i_simulation_report" - }, - "masters": [ - { - "master_name": "master_hwpe_0", - "role_name": "HWPE0", - "sim_time_cycles": 4488.0, - "read_observed": 1024, - "read_expected": 1024, - "granted_reads": 1024, - "granted_writes": 1280, - "read_complete": 1024, - "avg_req_to_gnt_stall_latency_cycles": 0.63, - "req_to_gnt_grants": 2304 - }, - { - "master_name": "master_hwpe_1", - "role_name": "HWPE1", - "sim_time_cycles": 4228.0, - "read_observed": 2048, - "read_expected": 2048, - "granted_reads": 2048, - "granted_writes": 1024, - "read_complete": 2048, - "avg_req_to_gnt_stall_latency_cycles": 0.18, - "req_to_gnt_grants": 3072 - }, - { - "master_name": "master_log_0", - "role_name": "Core0", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_1", - "role_name": "Core1", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_2", - "role_name": "Core2", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_3", - "role_name": "Core3", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_4", - "role_name": "Core4", - "sim_time_cycles": 3.0, - "read_observed": 0, - 
"read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_5", - "role_name": "Core5", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_6", - "role_name": "Core6", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_7", - "role_name": "Core7", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_8", - "role_name": "EXT0", - "sim_time_cycles": 1.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - } - ] -} diff --git a/target/verif/scripts/run_sweep.sh b/target/verif/scripts/run_sweep.sh deleted file mode 100644 index 17f7404..0000000 --- a/target/verif/scripts/run_sweep.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -set -e - -# Make sure we are in hci root -if ! git_root=$(git rev-parse --show-toplevel 2>/dev/null) || [ "$(basename "$git_root")" != "hci" ]; then - echo "Error: This script must be run from within the hci project git repository." 
>&2 - exit 1 -fi -cd "$git_root" - -VERIF_DIR=./target/verif -RESULTS_DIR=$VERIF_DIR/results -PLOTS_DIR=$RESULTS_DIR/plots - -mkdir -p "$RESULTS_DIR" -rm -f "$RESULTS_DIR"/hardware_*.json - -# For each hardware*.json in target/verif/config/sweep_hardware, run simulation and parse transcript -for hardware_config in $VERIF_DIR/config/sweep_hardware/hardware_*.json; do - make clean-verif - echo "Running simulation with hardware config: $hardware_config" - HARDWARE_JSON="$hardware_config" GUI=0 make run-verif - python3 $VERIF_DIR/scripts/parse_vsim.py --transcript $VERIF_DIR/vsim/transcript --out "$RESULTS_DIR"/$(basename "$hardware_config" .json).json -done - -python3 $VERIF_DIR/scripts/plot_sweep_results.py --results-dir "$RESULTS_DIR" --out-dir "$PLOTS_DIR" diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index 3ff2f30..e2dd321 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -1132,7 +1132,7 @@ def _add_edge(src, dst): # Build memory_map.txt # ----------------------------------------------------------------------- memory_map_path = generated_dir / 'memory_map.txt' - report_text = write_memory_map_txt( + write_memory_map_txt( memory_map_path=memory_map_path, total_mem_size_kib=TOT_MEM_SIZE, n_banks=N_BANKS, @@ -1161,15 +1161,14 @@ def _add_edge(src, dst): pattern_nodes=pattern_nodes, regions_timeline=regions_timeline, ) - print("\n" + report_text) print(f"Memory map written: {memory_map_path}") # ----------------------------------------------------------------------- - # Build memory_lifetime.html (simple SVG timeline view) + # Build dataflow.html (simple SVG timeline view) # ----------------------------------------------------------------------- - memory_lifetime_path = generated_dir / 'memory_lifetime.html' + dataflow_path = generated_dir / 'dataflow.html' write_memory_lifetime_html( - memory_lifetime_path=memory_lifetime_path, + memory_lifetime_path=dataflow_path, pattern_nodes=pattern_nodes, 
driver_windows=driver_windows, regions_timeline=regions_timeline, @@ -1189,7 +1188,7 @@ def _add_edge(src, dst): n_wide_hci_cfg=N_WIDE_HCI_CFG, n_banks=N_BANKS, ) - print(f"Memory lifetime plot written: {memory_lifetime_path}") + print(f"Dataflow plot written: {dataflow_path}") # ----------------------------------------------------------------------- # Apply per-master start delays diff --git a/target/verif/verif.mk b/target/verif/verif.mk index e9f93e7..5d08eba 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -10,6 +10,9 @@ HCI_VERIF_DIR = $(HCI_ROOT)/target/verif HCI_VERIF_CFG_DIR = $(HCI_VERIF_DIR)/config HCI_VERIF_CFG_GEN_DIR = $(HCI_VERIF_CFG_DIR)/generated +# Other Makefiles +include $(HCI_VERIF_DIR)/exploration/exploration.mk + # Include generated Makefiles include $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk include $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk @@ -157,13 +160,6 @@ clean-sim-verif: rm -f $(HCI_VERIF_DIR)/vsim/transcript rm -f $(HCI_VERIF_DIR)/vsim/vsim.wlf -################ -# Benchmarking # -################ - -benchmarking-sweep: - . 
$(HCI_VERIF_DIR)/scripts/run_sweep.sh - ########### # Helpers # ########### From bf63bf558e7df718fdfd0aa6ece4429e25e613a7 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Wed, 11 Mar 2026 18:08:24 +0100 Subject: [PATCH 24/25] simvectors: Add new mem access patterns, fine-tune idle cycles gen, fix RAW deps model --- target/verif/README.md | 2 + target/verif/config/workload.schema.json | 209 +++++- target/verif/simvectors/README.md | 293 ++++++++ .../verif/simvectors/hci_stimuli/patterns.py | 666 +++++++++++++++++- target/verif/simvectors/html_report.py | 2 +- target/verif/simvectors/main.py | 564 +++++++++++++-- 6 files changed, 1647 insertions(+), 89 deletions(-) create mode 100644 target/verif/simvectors/README.md diff --git a/target/verif/README.md b/target/verif/README.md index cd7c03f..756efbe 100644 --- a/target/verif/README.md +++ b/target/verif/README.md @@ -4,6 +4,8 @@ The verification framework drives configurable memory traffic from multiple masters through the HCI interconnect and measures throughput and latency. It is fully driven by three JSON configuration files: +For the up-to-date stimuli-generator pattern catalog and output format, see `target/verif/simvectors/README.md`. + | File | Purpose | |------|---------| | `config/hardware.json` | Interconnect topology (number of masters, banks, data widths, ...) | diff --git a/target/verif/config/workload.schema.json b/target/verif/config/workload.schema.json index 1037a0f..fde1870 100644 --- a/target/verif/config/workload.schema.json +++ b/target/verif/config/workload.schema.json @@ -38,8 +38,23 @@ }, "mem_access_type": { "type": "string", - "enum": ["idle", "random", "linear", "2d", "3d", "matmul_phased", "matmul"], - "description": "Access pattern selector. 'matmul' is an alias for 'matmul_phased'." 
+ "enum": [ + "idle", + "random", + "linear", + "2d", + "3d", + "matmul_phased", + "matmul", + "multi_linear", + "bank_group_linear", + "rw_rowwise", + "gather_scatter", + "matmul_tiled_interleave", + "matmul_tiled", + "hotspot_random" + ], + "description": "Access pattern selector. Aliases: 'matmul' -> 'matmul_phased', 'matmul_tiled' -> 'matmul_tiled_interleave'." }, "job": { "type": "string", @@ -157,6 +172,160 @@ "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], "description": "[matmul_phased] Size in bytes of the write-C region." }, + "regions": { + "type": "array", + "description": "[multi_linear] Subregions to stream in schedule order.", + "items": { + "type": "object", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "stride_words": { "type": "integer", "minimum": 1, "default": 1 }, + "read_pct": { "type": "integer", "minimum": 0, "maximum": 100 } + } + } + }, + "schedule": { + "type": "string", + "description": "[multi_linear, gather_scatter] Access schedule selector (e.g. round_robin, 4read_1write)." + }, + "burst_len": { + "type": "integer", + "minimum": 1, + "default": 1, + "description": "[multi_linear] Number of consecutive accesses per selected region before switching." + }, + "start_bank": { + "type": "integer", + "minimum": 0, + "description": "[bank_group_linear] Starting bank index." + }, + "bank_group_span": { + "type": "integer", + "minimum": 1, + "description": "[bank_group_linear] Number of banks in the active group." + }, + "stride_beats": { + "type": "integer", + "minimum": 1, + "default": 1, + "description": "[bank_group_linear] Stride in beats through the bank-group phase." 
+ }, + "bank_group_hop": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[bank_group_linear] Optional phase hop applied when advancing to the next group window." + }, + "wen": { + "type": "integer", + "enum": [0, 1], + "description": "[bank_group_linear] Fixed direction: 1=read, 0=write. If omitted, reads/writes are mixed." + }, + "row_base_address": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[rw_rowwise] Base address of row 0." + }, + "row_size_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[rw_rowwise] Bytes touched inside each row." + }, + "n_rows": { + "type": "integer", + "minimum": 1, + "description": "[rw_rowwise] Number of rows." + }, + "row_stride_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[rw_rowwise] Byte stride between consecutive row bases." + }, + "reads_per_row": { + "type": "integer", + "minimum": 0, + "description": "[rw_rowwise] Number of reads emitted per row." + }, + "writes_per_row": { + "type": "integer", + "minimum": 0, + "description": "[rw_rowwise] Number of writes emitted per row." + }, + "idle_cycles_between_rows": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[rw_rowwise] Idle cycles inserted between rows." 
+ }, + "read_regions": { + "type": "array", + "description": "[gather_scatter] Source regions for gather reads.", + "items": { + "type": "object", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }] } + } + } + }, + "write_region": { + "type": "object", + "description": "[gather_scatter] Destination region for scatter writes.", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }] } + } + }, + "chunk_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[gather_scatter] Address increment size in bytes for gather/scatter stepping." + }, + "tile_a_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[matmul_tiled_interleave] Bytes read from A per tile schedule step." + }, + "tile_b_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[matmul_tiled_interleave] Bytes read from B per tile schedule step." + }, + "tile_c_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[matmul_tiled_interleave] Bytes written to C per tile schedule step." + }, + "tiles": { + "type": "integer", + "minimum": 1, + "default": 1, + "description": "[matmul_tiled_interleave] Number of tile iterations before the pattern repeats." + }, + "ab_c_schedule": { + "type": "string", + "description": "[matmul_tiled_interleave] Tile phase order string, e.g. A_B_B_C." 
+ }, + "idle_cycles_between_tiles": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[matmul_tiled_interleave] Idle cycles inserted between tile iterations." + }, + "hot_regions": { + "type": "array", + "description": "[hotspot_random] Weighted hot regions used for random accesses.", + "items": { + "type": "object", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }] }, + "weight": { "type": "integer", "minimum": 1, "default": 1 } + } + } + }, "matmul_ratio_a": { "type": "integer", "minimum": 0, @@ -181,13 +350,13 @@ "minimum": 1, "maximum": 100, "default": 100, - "description": "[random, linear, matmul_phased] Modeled request utilization percentage (adds req=0 idles after each request when <100)." + "description": "[random, linear, matmul_phased, multi_linear, bank_group_linear, rw_rowwise, gather_scatter, matmul_tiled_interleave, hotspot_random] Modeled request utilization percentage (adds req=0 idles after each request when <100)." }, "traffic_read_pct": { "type": "integer", "minimum": 0, "maximum": 100, - "description": "[random, linear] Percentage of accesses that are reads (wen=1)." + "description": "[random, linear, hotspot_random] Percentage of accesses that are reads (wen=1)." 
}, "idle_cycles_between_phases": { "type": "integer", @@ -232,6 +401,38 @@ { "required": ["matrix_m", "matrix_n", "matrix_k"] } ] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "multi_linear" } }, "required": ["mem_access_type"] }, + "then": { "required": ["regions"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "bank_group_linear" } }, "required": ["mem_access_type"] }, + "then": { "required": ["start_bank", "bank_group_span", "n_transactions"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "rw_rowwise" } }, "required": ["mem_access_type"] }, + "then": { "required": ["row_base_address", "row_size_bytes", "n_rows", "row_stride_bytes", "reads_per_row", "writes_per_row"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "gather_scatter" } }, "required": ["mem_access_type"] }, + "then": { "required": ["read_regions", "write_region"] } + }, + { + "if": { + "properties": { "mem_access_type": { "enum": ["matmul_tiled_interleave", "matmul_tiled"] } }, + "required": ["mem_access_type"] + }, + "then": { + "anyOf": [ + { "required": ["n_transactions"] }, + { "required": ["region_base_address_a", "region_size_bytes_a", "region_base_address_b", "region_size_bytes_b", "region_base_address_c", "region_size_bytes_c"] } + ] + } + }, + { + "if": { "properties": { "mem_access_type": { "const": "hotspot_random" } }, "required": ["mem_access_type"] }, + "then": { "required": ["hot_regions"] } } ] }, diff --git a/target/verif/simvectors/README.md b/target/verif/simvectors/README.md new file mode 100644 index 0000000..472323b --- /dev/null +++ b/target/verif/simvectors/README.md @@ -0,0 +1,293 @@ +# Simvectors Stimuli Generator + +## Scope +`main.py` generates per-master stimuli vectors from: +- `workload.json` +- `hardware.json` +- `testbench.json` + +This README summarizes: +- available `mem_access_type` patterns and JSON parameters +- every source of `req=0` cycles +- read/write blocked-set behavior +- 
generated outputs and formats + +## JSON Structure (workload) +Top-level: +- `log_masters`: list of narrow masters +- `hwpe_masters`: list of wide masters + +### Field Format Conventions +| Kind | Accepted JSON format | Unit / meaning | +|---|---|---| +| Address fields (`*_address`, `base`) | integer or string (`"1234"`, `"0x4000"`, `"101010"`) | byte address | +| Size fields (`*_size_bytes`, `chunk_bytes`, `tile_*_bytes`) | integer or numeric string | bytes | +| Strides (`stride0/1/2`) | integer | words (word = `DATA_WIDTH/8` for that master) | +| `row_stride_bytes` | integer or numeric string | bytes | +| `stride_beats` | integer | beats (beat = transaction width in bytes for that master) | +| Counters (`n_transactions`, `len_*`, `tiles`, `reads_per_row`, `writes_per_row`) | integer | count | +| Percentages (`traffic_pct`, `traffic_read_pct`, `read_pct`) | integer | percent | +| `wen` | `0` or `1` | `0`=write, `1`=read | + +### Master-Level Fields +| Field | Required | Default | Notes | +|---|---|---|---| +| `id` | no | positional index | Informational/consistency warning only. | +| `description` | no | empty | Human-readable label. | +| `start_delay_cycles` | no | `0` | Prepended `req=0` cycles before first pattern. | +| `patterns` | no | absent | If present, this list drives generation. | + +Precedence/exclusivity: +- Master format is effectively either: + - flat single-pattern master (no `patterns`) + - or `patterns` list. +- If `patterns` exists, flat pattern fields at master level are ignored for traffic generation (except master-level fields like `start_delay_cycles`). + +### Common Pattern Fields (apply to every pattern type) +| Field | Required | Default | Notes | +|---|---|---|---| +| `mem_access_type` | yes | none | Pattern selector. | +| `description` | no | empty | Label used in reports. | +| `job` | no | `"default"` | Dependency graph node name. | +| `wait_for_jobs` | no | `[]` | Inserts dependency gate before pattern. 
| +| `n_transactions` | conditional | derivable for many patterns | If omitted, derived when supported. | +| `traffic_pct` | no | `100` | Adds per-request idle shaping (`req=0`) on patterns that implement traffic shaping. | + +## Pattern Catalog +The tables below list pattern-specific fields. +Complete field set for a pattern = **common fields above + pattern-specific fields below**. +`Required = conditional` means: required unless the documented derivation path is present. + +### `idle` +No memory transaction. Emits idle and trailing `PAUSE`. + +Pattern-specific fields: none. + +### `random` +Uniform random over a region. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | yes | none | int | Not derivable for this pattern. | +| `region_base_address` | no | evenly partitioned per master | address (bytes) | | +| `region_size_bytes` | no | evenly partitioned per master | bytes | | +| `traffic_read_pct` | no | random R/W mix | % | If set, deterministic read/write split. | + +### `linear` +1D strided stream. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from `length` or `region_size_bytes`. | +| `length` | no | none | int | Alias source for derived `n_transactions`. | +| `start_address` | no | `"0"` | address (bytes) | If absent and `region_base_address` exists, uses `region_base_address`. | +| `stride0` | no | `0` (or `1` if `region_size_bytes` set) | words | | +| `region_base_address` | no | evenly partitioned per master | address (bytes) | Used for region context and start fallback. | +| `region_size_bytes` | no | evenly partitioned per master | bytes | Can derive `n_transactions`. | +| `traffic_read_pct` | no | random R/W mix | % | | + +### `2d` +2D walk. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from `len_d0 * len_d1`. 
| +| `start_address` | no | `"0"` | address (bytes) | | +| `stride0` | no | `0` | words | | +| `len_d0` | conditional | none | int | | +| `stride1` | no | `0` | words | | +| `len_d1` | conditional | none | int | | +| `idle_cycles_between_phases` | no | `0` | cycles | Inserts explicit boundary idles. | + +### `3d` +3D walk. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from `len_d0 * len_d1 * len_d2`. | +| `start_address` | no | `"0"` | address (bytes) | | +| `stride0` | no | `0` | words | | +| `len_d0` | conditional | none | int | | +| `stride1` | no | `0` | words | | +| `len_d1` | conditional | none | int | | +| `stride2` | no | `0` | words | | +| `len_d2` | conditional | none | int | | +| `idle_cycles_between_phases` | no | `0` | cycles | Inserts explicit boundary idles. | + +### `matmul_phased` (alias: `matmul`) +Phased A-read / B-read / C-write traffic. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from region size or matrix dims. | +| `region_base_address`, `region_size_bytes` | conditional | evenly partitioned | bytes | Combined region (auto A/B/C split). | +| `matrix_m`, `matrix_n`, `matrix_k` | no | none | int | Alternative source for derived `n_transactions`. | +| `region_base_address_a/b/c`, `region_size_bytes_a/b/c` | no | none | bytes | Explicit per-phase regions. | +| `matmul_ratio_a/b/c` | no | `1/1/1` | relative weights | | +| `idle_cycles_between_phases` | no | `0` | cycles | Inserts explicit phase-boundary idles. | + +Mutual exclusivity / precedence: +- If explicit `*_a/b/c` regions are provided, they take precedence over combined-region auto-split. + +### `multi_linear` +Multiple subregions, schedule-driven interleave. 
+ +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `regions` | yes | none | array | Each entry has `base`, `size_bytes`, optional `stride_words`, `read_pct`. | +| `schedule` | no | `round_robin` | string | | +| `burst_len` | no | `1` | int | | +| `n_transactions` | conditional | derived | int | Derivable from sum of region sizes. | + +### `bank_group_linear` +Linear stream constrained by bank group phase controls. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `start_bank` | yes | none | int | 0-based bank index. | +| `bank_group_span` | yes | none | int | Number of banks in active group. | +| `stride_beats` | no | `1` | beats | | +| `bank_group_hop` | no | `0` | int | Group-phase hop per wrap. | +| `wen` | no | mixed R/W | `0` or `1` | Fixed direction if set. | +| `n_transactions` | yes | none | int | Required for this pattern. | + +### `rw_rowwise` +Per-row read phase then write phase. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `row_base_address` | yes | none | address (bytes) | | +| `row_size_bytes` | yes | none | bytes | | +| `n_rows` | yes | none | int | | +| `row_stride_bytes` | yes | none | bytes | | +| `reads_per_row` | yes | none | int | | +| `writes_per_row` | yes | none | int | | +| `idle_cycles_between_rows` | no | `0` | cycles | Inserts explicit row-boundary idles. | +| `n_transactions` | conditional | derived | int | Derivable as `n_rows * (reads_per_row + writes_per_row)`. | + +### `gather_scatter` +Gather from multiple read regions, scatter to write region. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `read_regions` | yes | none | array | Each entry: `base`, `size_bytes`. | +| `write_region` | yes | none | object | `base`, `size_bytes`. | +| `chunk_bytes` | no | transaction width | bytes | Address increment granularity. 
| +| `schedule` | no | `4read_1write` | string | | +| `n_transactions` | conditional | derived | int | Derivable from region sizes and chunk. | + +### `matmul_tiled_interleave` (alias: `matmul_tiled`) +Tile-like interleaving among A/B/C streams. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `region_base_address`, `region_size_bytes` | no | evenly partitioned | bytes | Used as fallback context for auto split when explicit A/B/C are absent. | +| `region_base_address_a/b/c`, `region_size_bytes_a/b/c` | conditional | fallback split | bytes | Preferred explicit mode. | +| `tile_a_bytes`, `tile_b_bytes`, `tile_c_bytes` | no | transaction width | bytes | Tile step payloads per stream. | +| `tiles` | no | `1` | int | | +| `ab_c_schedule` | no | `A_B_C` | string | | +| `idle_cycles_between_tiles` | no | `0` | cycles | Inserts explicit tile-boundary idles. | +| `n_transactions` | conditional | derived | int | Can be derived from tile parameters/schedule. | + +Mutual exclusivity / precedence: +- Preferred: explicit A/B/C regions. +- If missing, generator falls back to splitting combined region context. + +### `hotspot_random` +Weighted random traffic across hot regions. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `hot_regions` | yes | none | array | Each entry: `base`, `size_bytes`, optional `weight` (default `1`). | +| `n_transactions` | no | weak fallback | int | Prefer setting explicitly. | +| `traffic_read_pct` | no | random R/W mix | % | | + +## All Sources of `req=0` Cycles +`req=0` can be generated by: + +1. `traffic_pct` shaping +- For every emitted request, inserts `idles_per_req = round((100-pct)/pct)` idle lines. +- Applies in random/linear and all new patterns that support `traffic_pct`. + +2. 
Boundary idle knobs (JSON) +- `idle_cycles_between_phases` in `2d`, `3d`, `matmul_phased` +- `idle_cycles_between_rows` in `rw_rowwise` +- `idle_cycles_between_tiles` in `matmul_tiled_interleave` + +3. `start_delay_cycles` (per master) +- Prepends idle lines before first pattern. + +4. Dependency gate for `wait_for_jobs` +- For each dependent pattern, generator inserts a synthetic idle+`PAUSE` gate before real traffic. + +5. `idle` pattern +- Explicitly emits idle and `PAUSE`. + +## Read/Write Blocked-Set Functionality +Address filtering is implemented inside pattern generators via: +- `_is_allowed(add, wen, read_blocked_set, write_blocked_set)` +- `_record_access(add, wen, read_blocked_set, write_blocked_set)` + +Behavior (within one pattern invocation): +- Read checks (`wen=1`) consult `read_blocked_set`. +- Write checks (`wen=0`) consult `write_blocked_set`. +- On every emitted access, the address is added to `write_blocked_set`. +- On emitted writes only, the address is also added to `read_blocked_set`. + +Effective policy: +- read after read: allowed +- write after read: blocked +- read after write: blocked +- write after write: blocked + +Notes: +- Blocking state is pattern-local (it does not persist across patterns). +- Generators are strict about transaction count: each non-idle pattern must emit exactly `n_transactions` (`N_TEST`). +- If blocking rules make the requested count unreachable for a pattern, generation fails with an explicit error instead of silently under-emitting. + +## Outputs + +### 1. 
Stimuli vectors +Path: +- `target/verif/simvectors/generated/stimuli/master_log_.txt` +- `target/verif/simvectors/generated/stimuli/master_hwpe_.txt` + +Per-cycle vector line format: +- `req id wen data add` +- `req`: `1` active request, `0` idle +- `id`: request ID (`IW` bits) +- `wen`: `1` read, `0` write +- `data`: payload (`DATA_WIDTH` bits for narrow, `HWPE_WIDTH_FACT*DATA_WIDTH` for wide) +- `add`: byte address (`ADD_WIDTH` bits) + +Fence token: +- A standalone line `PAUSE` is emitted at end of each pattern segment. + +### 2. Memory map report +Path: +- `target/verif/simvectors/generated/memory_map.txt` + +Contains: +- per-pattern region and traffic summary +- dependency/fence map +- temporal schedule summary +- region lifetimes and overlaps context + +### 3. Dataflow visualization +Path: +- `target/verif/simvectors/generated/dataflow.html` + +Contains: +- execution timeline (SVG) +- region-map blocks +- per-region usage cards +- overlap table + +### 4. Optional outputs +- `--golden`: emits expected read-data vectors under `generated/golden/` +- `--emit_phases_mk `: emits fence/dependency Makefile fragment + +## Recommended Extra Documentation +- one minimal JSON example per pattern +- exact dependency semantics for `job` / `wait_for_jobs` with 2-3 pattern chain examples +- known caveat: timeline model in report is simplified and may differ from full RTL contention timing diff --git a/target/verif/simvectors/hci_stimuli/patterns.py b/target/verif/simvectors/hci_stimuli/patterns.py index afc6633..cf080fe 100644 --- a/target/verif/simvectors/hci_stimuli/patterns.py +++ b/target/verif/simvectors/hci_stimuli/patterns.py @@ -22,7 +22,6 @@ import random - class PatternsMixin: @staticmethod @@ -56,21 +55,128 @@ def _idles_per_req(traffic_pct): traffic_pct = max(1, min(100, int(traffic_pct))) return 0 if traffic_pct >= 100 else int(round((100 - traffic_pct) / traffic_pct)) - def _is_allowed(self, add, wen, fr, fw): - return add not in (fr if wen else fw) + def 
_is_allowed(self, add, wen, read_blocked_set, write_blocked_set): + return add not in (read_blocked_set if wen else write_blocked_set) + + def _record_access(self, add, wen, read_blocked_set, write_blocked_set): + write_blocked_set.add(add) + if not wen: + read_blocked_set.add(add) + + @staticmethod + def _init_blocked_sets(read_blocked, write_blocked): + return set(read_blocked or []), set(write_blocked or []) + + @staticmethod + def _extend_unique_sorted(target, values): + if not isinstance(target, list): + return + known = set(target) + for v in sorted(values): + if v in known: + continue + target.append(v) + known.add(v) - def _record_access(self, add, wen, fr_new, fw_new): - fw_new.append(add) - if not wen: fr_new.append(add) + def _commit_blocked_sets(self, read_blocked, write_blocked, read_blocked_set, write_blocked_set): + self._extend_unique_sorted(read_blocked, read_blocked_set) + self._extend_unique_sorted(write_blocked, write_blocked_set) + + def _require_exact_emits(self, pattern_name, id_start, id_value): + emitted = int(id_value - id_start) + expected = int(self.N_TEST) + if emitted == expected: + return + raise RuntimeError( + f"{pattern_name}: emitted {emitted} transaction(s), expected {expected}. " + "Adjust region/shape/traffic to satisfy the read/write blocked policy." 
+ ) def _open(self, append): return open(self.filepath, "a" if append else "w", encoding="ascii") + def _total_mem_bytes(self): + return int(self.TOT_MEM_SIZE * 1024) + + def _normalize_addr(self, addr): + total = self._total_mem_bytes() + if total <= self.WIDTH_OF_MEMORY_BYTE: + return 0 + max_addr = total - self.WIDTH_OF_MEMORY_BYTE + a = int(addr) % total + if a > max_addr: + a = max_addr + return a + + @staticmethod + def _parse_read_write_schedule(schedule, default="4read_1write"): + raw = str(schedule if schedule is not None else default).strip().lower() + if not raw: + raw = default + tokens = [] + for chunk in raw.replace("-", "_").split("_"): + c = chunk.strip() + if not c: + continue + count = 1 + word = c + if c[0].isdigit(): + i = 0 + while i < len(c) and c[i].isdigit(): + i += 1 + count = max(1, int(c[:i])) + word = c[i:] + elif c[-1].isdigit(): + i = len(c) - 1 + while i >= 0 and c[i].isdigit(): + i -= 1 + count = max(1, int(c[i + 1:])) + word = c[:i + 1] + word = word.strip() + if word in {"read", "r"}: + tokens.extend(["R"] * count) + elif word in {"write", "w"}: + tokens.extend(["W"] * count) + if not tokens: + return ["R", "R", "R", "R", "W"] + return tokens + + @staticmethod + def _parse_abc_schedule(schedule, default="A_B_C"): + raw = str(schedule if schedule is not None else default).strip().upper() + if not raw: + raw = default + tokens = [] + for chunk in raw.replace("-", "_").split("_"): + c = chunk.strip() + if not c: + continue + count = 1 + letter = c + if c[0].isdigit(): + i = 0 + while i < len(c) and c[i].isdigit(): + i += 1 + count = max(1, int(c[:i])) + letter = c[i:] + elif c[-1].isdigit(): + i = len(c) - 1 + while i >= 0 and c[i].isdigit(): + i -= 1 + count = max(1, int(c[i + 1:])) + letter = c[:i + 1] + letter = letter.strip().upper() + if letter in {"A", "B", "C"}: + tokens.extend([letter] * count) + if not tokens: + return ["A", "B", "C"] + return tokens + # ------------------------------------------------------------------ # # 
Access patterns — each writes: transactions | PAUSE # # ------------------------------------------------------------------ # - def random_gen(self, id_start, forbidden_read, forbidden_write, + def random_gen(self, id_start, read_blocked, write_blocked, region_base=0, region_size=None, traffic_pct=100, traffic_read_pct=None, append=False): total = int(self.TOT_MEM_SIZE * 1024) @@ -84,24 +190,32 @@ def random_gen(self, id_start, forbidden_read, forbidden_write, wen_seq = [1]*n_reads + [0]*(self.N_TEST-n_reads) else: wen_seq = None - id_value = id_start; fr_new, fw_new = [], [] + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + max_attempts = max(1, n_words * 4) with self._open(append) as f: for i in range(self.N_TEST): wen = wen_seq[i] if wen_seq is not None else None if wen is None: data, wen = self.data_wen() else: data = "0"*self.DATA_WIDTH if wen else self.random_data() - while True: + placed = False + for _ in range(max_attempts): ad = region_base + random.randint(0, int(n_words)-1)*self.WIDTH_OF_MEMORY_BYTE add = bin(int(ad))[2:].zfill(self.ADD_WIDTH) - if self._is_allowed(add, wen, forbidden_read, forbidden_write): - self._record_access(add, wen, fr_new, fw_new); break + if self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + self._record_access(add, wen, read_blocked_set, write_blocked_set) + placed = True + break + if not placed: + continue self._write_req(f, id_value, wen, data, add); id_value += 1 for _ in range(n_idles): self._write_idle(f) self._write_pause(f) - forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("random", id_start, id_value) return id_value - def linear_gen(self, stride0, start_address, id_start, forbidden_read, forbidden_write, + def linear_gen(self, stride0, start_address, id_start, read_blocked, write_blocked, 
traffic_pct=100, traffic_read_pct=None, append=False): n_idles = self._idles_per_req(traffic_pct) if traffic_read_pct is not None: @@ -110,7 +224,8 @@ def linear_gen(self, stride0, start_address, id_start, forbidden_read, forbidden wen_seq = [1]*n_reads + [0]*(self.N_TEST-n_reads) else: wen_seq = None - id_value = id_start; fr_new, fw_new = [], [] + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) with self._open(append) as f: addr = self._parse_address(start_address) if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: @@ -123,42 +238,50 @@ def linear_gen(self, stride0, start_address, id_start, forbidden_read, forbidden addr += self.WIDTH_OF_MEMORY_BYTE * stride0 if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: addr -= self.TOT_MEM_SIZE*1024 - if not self._is_allowed(add, wen, forbidden_read, forbidden_write): continue - self._record_access(add, wen, fr_new, fw_new) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) self._write_req(f, id_value, wen, data, add); id_value += 1 for _ in range(n_idles): self._write_idle(f) self._write_pause(f) - forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("linear", id_start, id_value) return id_value def gen_2d(self, stride0, len_d0, stride1, start_address, id_start, - forbidden_read, forbidden_write, idle_cycles_between_phases=0, append=False): - id_value = id_start; fr_new, fw_new = [], [] + read_blocked, write_blocked, idle_cycles_between_phases=0, append=False): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) with self._open(append) as f: base = self._parse_address(start_address); j = 0 while id_value - id_start < self.N_TEST: + emitted_before 
= id_value for i in range(len_d0): data, wen = self.data_wen() addr = base + i*self.WIDTH_OF_MEMORY_BYTE*stride0 + j*self.WIDTH_OF_MEMORY_BYTE*stride1 if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: addr -= self.TOT_MEM_SIZE*1024 add = bin(addr)[2:].zfill(self.ADD_WIDTH) - if not self._is_allowed(add, wen, forbidden_read, forbidden_write): continue - self._record_access(add, wen, fr_new, fw_new) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) self._write_req(f, id_value, wen, data, add); id_value += 1 if id_value - id_start >= self.N_TEST: break for _ in range(idle_cycles_between_phases): self._write_idle(f) + if id_value == emitted_before: + break j += 1 self._write_pause(f) - forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("2d", id_start, id_value) return id_value def gen_3d(self, stride0, len_d0, stride1, len_d1, stride2, start_address, id_start, - forbidden_read, forbidden_write, idle_cycles_between_phases=0, append=False): - id_value = id_start; fr_new, fw_new = [], [] + read_blocked, write_blocked, idle_cycles_between_phases=0, append=False): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) with self._open(append) as f: base = self._parse_address(start_address); k = 0 while id_value - id_start < self.N_TEST: + emitted_before = id_value for j in range(len_d1): for i in range(len_d0): data, wen = self.data_wen() @@ -166,15 +289,18 @@ def gen_3d(self, stride0, len_d0, stride1, len_d1, stride2, start_address, id_st if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: addr -= self.TOT_MEM_SIZE*1024 add = bin(addr)[2:].zfill(self.ADD_WIDTH) - if not self._is_allowed(add, wen, forbidden_read, forbidden_write): continue - 
self._record_access(add, wen, fr_new, fw_new) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) self._write_req(f, id_value, wen, data, add); id_value += 1 if id_value - id_start >= self.N_TEST: break if id_value - id_start >= self.N_TEST: break for _ in range(idle_cycles_between_phases): self._write_idle(f) + if id_value == emitted_before: + break k += 1 self._write_pause(f) - forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("3d", id_start, id_value) return id_value def idle_gen(self, id_start, append=False): @@ -183,7 +309,7 @@ def idle_gen(self, id_start, append=False): self._write_pause(f) return id_start - def matmul_phased_gen(self, id_start, forbidden_read, forbidden_write, + def matmul_phased_gen(self, id_start, read_blocked, write_blocked, region_base_address, region_size_bytes, matmul_ratio_a=1, matmul_ratio_b=1, matmul_ratio_c=1, traffic_pct=100, @@ -192,7 +318,8 @@ def matmul_phased_gen(self, id_start, forbidden_read, forbidden_write, region_base_address_b=None, region_size_bytes_b=None, region_base_address_c=None, region_size_bytes_c=None, append=False): - id_value = id_start; fr_new, fw_new = [], [] + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) ab = max(1, self.DATA_WIDTH // 8); tm = int(self.TOT_MEM_SIZE * 1024) n_idles = self._idles_per_req(traffic_pct) @@ -213,6 +340,7 @@ def _res(bo, so, fb, fs): rw = size // ab if rw < 3: with self._open(append) as f: self._write_idle(f); self._write_pause(f) + self._require_exact_emits("matmul_phased", id_start, id_value) return id_value aw=max(1,rw//3); bw=max(1,rw//3); cw=rw-aw-bw a_base=base; a_size=aw*ab; b_base=a_base+a_size; b_size=bw*ab @@ -226,8 +354,13 @@ def _emit(fobj, count, wen, pb, pe): for _ in 
range(count): data = "0"*self.DATA_WIDTH if wen else self.random_data() add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + addr += ab + if addr >= pe: + addr = pb + continue self._write_req(fobj, id_value, wen, data, add) - self._record_access(add, wen, fr_new, fw_new) + self._record_access(add, wen, read_blocked_set, write_blocked_set) id_value += 1; addr += ab if addr >= pe: addr = pb for _ in range(n_idles): self._write_idle(fobj) @@ -242,5 +375,476 @@ def _emit(fobj, count, wen, pb, pe): _emit(f, cc, 0, c_base, c_base+c_size) self._write_pause(f) - forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("matmul_phased", id_start, id_value) + return id_value + + def multi_linear_gen( + self, + id_start, + read_blocked, + write_blocked, + regions, + schedule="round_robin", + burst_len=1, + traffic_pct=100, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + burst = max(1, int(burst_len)) + + norm_regions = [] + for reg in regions or []: + base = self._align_down(int(reg.get("base", 0)), ab) + size = self._align_down(int(reg.get("size_bytes", 0)), ab) + if size <= 0: + continue + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + if size <= 0: + continue + stride_words = max(1, int(reg.get("stride_words", 1))) + read_pct = reg.get("read_pct") + if read_pct is not None: + read_pct = max(0, min(100, int(read_pct))) + norm_regions.append({ + "base": base, + "size": size, + "stride_words": stride_words, + "read_pct": read_pct, + "offset": 0, + }) + + if not norm_regions: + with self._open(append) as f: + self._write_idle(f) + 
self._write_pause(f) + self._require_exact_emits("multi_linear", id_start, id_value) + return id_value + + rr = 0 + stalled_rounds = 0 + with self._open(append) as f: + while id_value - id_start < self.N_TEST: + emitted_before = id_value + reg = norm_regions[rr % len(norm_regions)] + rr += 1 + chunk = burst if str(schedule).strip().lower() == "round_robin" else max(1, self.N_TEST) + for _ in range(chunk): + if id_value - id_start >= self.N_TEST: + break + addr = reg["base"] + reg["offset"] + add = bin(self._normalize_addr(addr))[2:].zfill(self.ADD_WIDTH) + if reg["read_pct"] is None: + data, wen = self.data_wen() + else: + wen = 1 if random.randint(1, 100) <= reg["read_pct"] else 0 + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + step = reg["stride_words"] * ab + reg["offset"] = (reg["offset"] + step) % reg["size"] + if id_value == emitted_before: + stalled_rounds += 1 + if stalled_rounds >= len(norm_regions): + break + else: + stalled_rounds = 0 + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("multi_linear", id_start, id_value) + return id_value + + def bank_group_linear_gen( + self, + id_start, + read_blocked, + write_blocked, + start_bank, + bank_group_span, + stride_beats=1, + bank_group_hop=0, + wen=None, + traffic_pct=100, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + span = max(1, min(int(bank_group_span), int(self.N_BANKS))) + start_bank = int(start_bank) % max(1, int(self.N_BANKS)) + stride = max(1, 
int(stride_beats)) + hop = max(0, int(bank_group_hop)) + + with self._open(append) as f: + for tx in range(self.N_TEST): + phase = tx * stride + group_idx = phase // span + bank_base = (start_bank + group_idx * hop * span) % self.N_BANKS + bank = (bank_base + (phase % span)) % self.N_BANKS + row = group_idx + word_idx = row * self.N_BANKS + bank + addr = self._normalize_addr(word_idx * ab) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if wen is None: + data, wen_cur = self.data_wen() + else: + wen_cur = 1 if int(wen) else 0 + data = "0" * self.DATA_WIDTH if wen_cur else self.random_data() + if not self._is_allowed(add, wen_cur, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen_cur, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen_cur, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("bank_group_linear", id_start, id_value) + return id_value + + def rw_rowwise_gen( + self, + id_start, + read_blocked, + write_blocked, + row_base_address, + row_size_bytes, + n_rows, + row_stride_bytes, + reads_per_row, + writes_per_row, + traffic_pct=100, + idle_cycles_between_rows=0, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + n_idles = self._idles_per_req(traffic_pct) + base = self._align_down(int(row_base_address), ab) + row_size = max(ab, self._align_down(int(row_size_bytes), ab)) + row_stride = max(ab, self._align_down(int(row_stride_bytes), ab)) + n_rows = max(0, int(n_rows)) + reads_per_row = max(0, int(reads_per_row)) + writes_per_row = max(0, int(writes_per_row)) + + with self._open(append) as f: + for r in range(n_rows): + if id_value - id_start >= self.N_TEST: + break + row_base = self._normalize_addr(base + r * row_stride) + 
for i in range(reads_per_row): + if id_value - id_start >= self.N_TEST: + break + addr = self._normalize_addr(row_base + (i * ab) % row_size) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + wen = 1 + data = "0" * self.DATA_WIDTH + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + for i in range(writes_per_row): + if id_value - id_start >= self.N_TEST: + break + addr = self._normalize_addr(row_base + (i * ab) % row_size) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + wen = 0 + data = self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + if r < n_rows - 1: + for _ in range(max(0, int(idle_cycles_between_rows))): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("rw_rowwise", id_start, id_value) + return id_value + + def gather_scatter_gen( + self, + id_start, + read_blocked, + write_blocked, + read_regions, + write_region, + chunk_bytes=0, + schedule="4read_1write", + traffic_pct=100, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + chunk_val = ab if chunk_bytes is None else int(chunk_bytes) + step = max(ab, self._align_down(chunk_val if chunk_val > 0 else ab, ab)) + tokens = self._parse_read_write_schedule(schedule) + + reads = [] + for reg in read_regions or []: + base = self._align_down(int(reg.get("base", 0)), ab) 
+ size = self._align_down(int(reg.get("size_bytes", 0)), ab) + if size <= 0: + continue + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + if size <= 0: + continue + reads.append({"base": base, "size": size, "offset": 0}) + + wb = self._align_down(int((write_region or {}).get("base", 0)), ab) + ws = self._align_down(int((write_region or {}).get("size_bytes", 0)), ab) + if wb >= tm: + wb %= tm + if wb + ws > tm: + ws = self._align_down(tm - wb, ab) + + if not reads and ws <= 0: + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + self._require_exact_emits("gather_scatter", id_start, id_value) + return id_value + + read_rr = 0 + token_idx = 0 + write_offset = 0 + max_no_progress = max(32, len(tokens) * max(1, len(reads) + (1 if ws > 0 else 0))) + no_progress_iters = 0 + with self._open(append) as f: + while id_value - id_start < self.N_TEST: + token = tokens[token_idx % len(tokens)] + token_idx += 1 + wen = 1 if token == "R" else 0 + if token == "R" and reads: + reg = reads[read_rr % len(reads)] + read_rr += 1 + addr = self._normalize_addr(reg["base"] + reg["offset"]) + reg["offset"] = (reg["offset"] + step) % reg["size"] + elif ws > 0: + addr = self._normalize_addr(wb + write_offset) + write_offset = (write_offset + step) % ws + elif reads: + reg = reads[read_rr % len(reads)] + read_rr += 1 + wen = 1 + addr = self._normalize_addr(reg["base"] + reg["offset"]) + reg["offset"] = (reg["offset"] + step) % reg["size"] + else: + break + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + no_progress_iters += 1 + if no_progress_iters >= max_no_progress: + break + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + no_progress_iters = 0 + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + 
self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("gather_scatter", id_start, id_value) + return id_value + + def matmul_tiled_interleave_gen( + self, + id_start, + read_blocked, + write_blocked, + region_base_address_a, + region_size_bytes_a, + region_base_address_b, + region_size_bytes_b, + region_base_address_c, + region_size_bytes_c, + tile_a_bytes=0, + tile_b_bytes=0, + tile_c_bytes=0, + tiles=1, + ab_c_schedule="A_B_C", + traffic_pct=100, + idle_cycles_between_tiles=0, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + tile_idle = max(0, int(idle_cycles_between_tiles)) + tokens = self._parse_abc_schedule(ab_c_schedule) + + def _res(base_raw, size_raw): + base = self._align_down(int(base_raw), ab) + size = self._align_down(int(size_raw), ab) + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + return base, size + + a_base, a_size = _res(region_base_address_a, region_size_bytes_a) + b_base, b_size = _res(region_base_address_b, region_size_bytes_b) + c_base, c_size = _res(region_base_address_c, region_size_bytes_c) + if a_size <= 0 or b_size <= 0 or c_size <= 0: + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + self._require_exact_emits("matmul_tiled_interleave", id_start, id_value) + return id_value + + ta = ab if tile_a_bytes is None else int(tile_a_bytes) + tb = ab if tile_b_bytes is None else int(tile_b_bytes) + tc = ab if tile_c_bytes is None else int(tile_c_bytes) + cnt_a = max(1, self._align_down(ta if ta > 0 else ab, ab) // ab) + cnt_b = max(1, self._align_down(tb if tb > 0 else ab, ab) // ab) + cnt_c = max(1, self._align_down(tc if tc > 0 else ab, ab) // ab) + counts = {"A": cnt_a, "B": cnt_b, 
"C": cnt_c} + ptr = {"A": 0, "B": 0, "C": 0} + base = {"A": a_base, "B": b_base, "C": c_base} + size = {"A": a_size, "B": b_size, "C": c_size} + max_tiles = max(1, int(tiles)) + + with self._open(append) as f: + tile_idx = 0 + stalled_tiles = 0 + while id_value - id_start < self.N_TEST: + emitted_before = id_value + for tok in tokens: + for _ in range(counts[tok]): + if id_value - id_start >= self.N_TEST: + break + addr = self._normalize_addr(base[tok] + ptr[tok]) + ptr[tok] = (ptr[tok] + ab) % size[tok] + wen = 0 if tok == "C" else 1 + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + tile_idx += 1 + if tile_idle > 0 and id_value - id_start < self.N_TEST: + for _ in range(tile_idle): + self._write_idle(f) + if tile_idx >= max_tiles: + tile_idx = 0 + if id_value == emitted_before: + stalled_tiles += 1 + if stalled_tiles >= max_tiles: + break + else: + stalled_tiles = 0 + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("matmul_tiled_interleave", id_start, id_value) + return id_value + + def hotspot_random_gen( + self, + id_start, + read_blocked, + write_blocked, + hot_regions, + traffic_pct=100, + traffic_read_pct=None, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + + regions = [] + weights = [] + for reg in hot_regions or []: + base = self._align_down(int(reg.get("base", 0)), ab) + size = self._align_down(int(reg.get("size_bytes", 0)), ab) + weight = max(1, 
int(reg.get("weight", 1))) + if size <= 0: + continue + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + if size <= 0: + continue + regions.append({"base": base, "size": size}) + weights.append(weight) + + if not regions: + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + self._require_exact_emits("hotspot_random", id_start, id_value) + return id_value + + if traffic_read_pct is not None: + rpct = max(0, min(100, int(traffic_read_pct))) + n_reads = (self.N_TEST * rpct) // 100 + wen_seq = [1] * n_reads + [0] * (self.N_TEST - n_reads) + else: + wen_seq = None + + with self._open(append) as f: + for i in range(self.N_TEST): + reg = random.choices(regions, weights=weights, k=1)[0] + n_words = max(1, reg["size"] // ab) + ad = reg["base"] + random.randint(0, n_words - 1) * ab + addr = self._normalize_addr(ad) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + wen = wen_seq[i] if wen_seq is not None else None + if wen is None: + data, wen = self.data_wen() + else: + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("hotspot_random", id_start, id_value) return id_value diff --git a/target/verif/simvectors/html_report.py b/target/verif/simvectors/html_report.py index 98a2b1a..5784714 100644 --- a/target/verif/simvectors/html_report.py +++ b/target/verif/simvectors/html_report.py @@ -106,7 +106,7 @@ def _outside_detail_lines(node, base_addr, size_kib): regs = list(node.get('regions', [])) label_map = {str(r.get('label', '')): r for r in regs} has_matmul_regs = any(k in label_map for k in 
('A(read)', 'B(read)', 'C(write)')) - is_matmul = node.get('mem_access_type') in {'matmul', 'matmul_phased'} or has_matmul_regs + is_matmul = node.get('mem_access_type') in {'matmul', 'matmul_phased', 'matmul_tiled_interleave'} or has_matmul_regs if is_matmul: mat_lines = [] diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index e2dd321..190d0bd 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -84,7 +84,7 @@ def main(argv=None): hwpe_masters = workload_config['hwpe_masters'] # Derived parameters - ADD_WIDTH = math.ceil(math.log2(TOT_MEM_SIZE * 1000)) + ADD_WIDTH = math.ceil(math.log2(TOT_MEM_SIZE * 1024)) N_LOG = N_CORE + N_DMA + N_EXT N_LOG_CFG = N_LOG IW = 8 @@ -111,7 +111,7 @@ def _narrow_driver_name(local_idx: int) -> str: if N_LOG + N_HWPE < 1: print("ERROR: the number of masters must be > 0") sys.exit(1) - n_words = (TOT_MEM_SIZE * 1000 / N_BANKS) / (DATA_WIDTH / 8) + n_words = (TOT_MEM_SIZE * 1024 / N_BANKS) / (DATA_WIDTH / 8) if not n_words.is_integer(): print("ERROR: the number of words is not an integer value") sys.exit(1) @@ -154,8 +154,6 @@ def _create_idle_file(path: Path, data_width: int): _create_idle_file(stimuli_dir / 'master_hwpe_0.txt', HWPE_WIDTH_FACT * DATA_WIDTH) next_start_id = 0 - LIST_OF_FORBIDDEN_ADDRESSES_WRITE = [] - LIST_OF_FORBIDDEN_ADDRESSES_READ = [] # Memory map entries collected during generation, printed at the end memory_map_entries = [] @@ -233,6 +231,135 @@ def _record_memory_map(kind, local_idx, description, config, n_test, data_width, idle_between = master_config.get('idle_cycles_between_phases', 0) if idle_between: detail['idle_between_phases'] = f"{idle_between} cycles" + elif config == 'multi_linear': + regs = master_config.get('regions', []) or [] + detail['schedule'] = str(master_config.get('schedule', 'round_robin')) + detail['burst_len'] = int(master_config.get('burst_len', 1)) + for idx, reg in enumerate(regs): + base = 
_parse_maybe_bin_int(reg.get('base'), 0) + size = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + stride_w = int(reg.get('stride_words', 1)) + rpct = reg.get('read_pct') + rpct_txt = f", read={int(rpct)}%" if rpct is not None else "" + detail[f"region_{idx}"] = ( + f"0x{base:08x} - 0x{base + max(0, size) - 1:08x} " + f"({size} B, stride={stride_w} words{rpct_txt})" + ) + if regs: + first_addr = _parse_maybe_bin_int(regs[0].get('base'), 0) + last_reg = regs[-1] + lb = _parse_maybe_bin_int(last_reg.get('base'), 0) + ls = _parse_maybe_bin_int(last_reg.get('size_bytes'), 0) + last_addr = lb + max(0, ls) - access_bytes + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'bank_group_linear': + span = max(1, int(master_config.get('bank_group_span', 1))) + start_bank = int(master_config.get('start_bank', 0)) % max(1, int(N_BANKS)) + stride_beats = max(1, int(master_config.get('stride_beats', 1))) + first_addr = start_bank * access_bytes + phase = max(0, n_test - 1) * stride_beats + group_idx = phase // span + bank = (start_bank + (phase % span)) % max(1, int(N_BANKS)) + last_addr = (group_idx * N_BANKS + bank) * access_bytes + last_addr = last_addr % total_mem_bytes + detail['start_bank'] = start_bank + detail['bank_group_span'] = span + detail['stride_beats'] = stride_beats + if 'bank_group_hop' in master_config: + detail['bank_group_hop'] = int(master_config.get('bank_group_hop', 0)) + if 'wen' in master_config: + detail['wen'] = int(master_config.get('wen', 1)) + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'rw_rowwise': + row_base = 
_parse_maybe_bin_int(master_config.get('row_base_address'), region_base) + row_size = _parse_maybe_bin_int(master_config.get('row_size_bytes'), access_bytes) + n_rows = max(0, int(master_config.get('n_rows', 0))) + row_stride = _parse_maybe_bin_int(master_config.get('row_stride_bytes'), row_size) + rpr = max(0, int(master_config.get('reads_per_row', 0))) + wpr = max(0, int(master_config.get('writes_per_row', 0))) + first_addr = row_base + last_addr = row_base + max(0, n_rows - 1) * row_stride + max(0, row_size - access_bytes) + last_addr = last_addr % total_mem_bytes + detail['rows'] = f"n_rows={n_rows}, row_size={row_size} B, row_stride={row_stride} B" + detail['per_row'] = f"reads={rpr}, writes={wpr}" + idle_between = int(master_config.get('idle_cycles_between_rows', 0)) + if idle_between: + detail['idle_between_rows'] = f"{idle_between} cycles" + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'gather_scatter': + rr = master_config.get('read_regions', []) or [] + wr = master_config.get('write_region', {}) or {} + for idx, reg in enumerate(rr): + b = _parse_maybe_bin_int(reg.get('base'), 0) + s = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + detail[f"read_region_{idx}"] = f"0x{b:08x} - 0x{b + max(0, s) - 1:08x} ({s} B)" + wb = _parse_maybe_bin_int(wr.get('base'), 0) + ws = _parse_maybe_bin_int(wr.get('size_bytes'), 0) + detail['write_region'] = f"0x{wb:08x} - 0x{wb + max(0, ws) - 1:08x} ({ws} B)" + detail['schedule'] = str(master_config.get('schedule', '4read_1write')) + detail['chunk_bytes'] = int(_parse_maybe_bin_int(master_config.get('chunk_bytes'), access_bytes)) + if rr: + first_addr = _parse_maybe_bin_int(rr[0].get('base'), 0) + else: + first_addr = wb + last_addr = wb + max(0, ws) - access_bytes if ws > 0 else first_addr + tpct = 
master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'matmul_tiled_interleave': + ra = _parse_maybe_bin_int(master_config.get('region_base_address_a'), region_base) + sa = _parse_maybe_bin_int(master_config.get('region_size_bytes_a'), region_size // 3) + rb = _parse_maybe_bin_int(master_config.get('region_base_address_b'), ra + sa) + sb = _parse_maybe_bin_int(master_config.get('region_size_bytes_b'), region_size // 3) + rc = _parse_maybe_bin_int(master_config.get('region_base_address_c'), rb + sb) + sc = _parse_maybe_bin_int(master_config.get('region_size_bytes_c'), region_size - max(0, sa) - max(0, sb)) + detail['matrix_A (read)'] = f"0x{ra:08x} - 0x{ra + max(0, sa) - access_bytes:08x} ({sa} B)" + detail['matrix_B (read)'] = f"0x{rb:08x} - 0x{rb + max(0, sb) - access_bytes:08x} ({sb} B)" + detail['matrix_C (write)'] = f"0x{rc:08x} - 0x{rc + max(0, sc) - access_bytes:08x} ({sc} B)" + detail['tile_bytes'] = ( + f"A={int(_parse_maybe_bin_int(master_config.get('tile_a_bytes'), access_bytes))}, " + f"B={int(_parse_maybe_bin_int(master_config.get('tile_b_bytes'), access_bytes))}, " + f"C={int(_parse_maybe_bin_int(master_config.get('tile_c_bytes'), access_bytes))}" + ) + detail['tiles'] = int(master_config.get('tiles', 1)) + detail['ab_c_schedule'] = str(master_config.get('ab_c_schedule', 'A_B_C')) + idle_tiles = int(master_config.get('idle_cycles_between_tiles', 0)) + if idle_tiles: + detail['idle_between_tiles'] = f"{idle_tiles} cycles" + first_addr = ra + last_addr = rc + max(0, sc) - access_bytes + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'hotspot_random': + 
hrs = master_config.get('hot_regions', []) or [] + for idx, reg in enumerate(hrs): + b = _parse_maybe_bin_int(reg.get('base'), 0) + s = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + w = int(reg.get('weight', 1)) + detail[f"hot_region_{idx}"] = f"0x{b:08x} - 0x{b + max(0, s) - 1:08x} ({s} B, weight={w})" + if hrs: + first_addr = _parse_maybe_bin_int(hrs[0].get('base'), 0) + lb = _parse_maybe_bin_int(hrs[-1].get('base'), 0) + ls = _parse_maybe_bin_int(hrs[-1].get('size_bytes'), 0) + last_addr = lb + max(0, ls) - access_bytes + tpct = master_config.get('traffic_pct') + if tpct is not None: + rpct = master_config.get('traffic_read_pct', 50) + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + detail['read_pct'] = f"{rpct}%" elif config == 'linear': base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) first_addr = base @@ -295,8 +422,24 @@ def _parse_maybe_bin_int(raw_value, default_value): return default_value def _normalize_mem_access_type(raw_value, master_name): - allowed = {"random", "linear", "2d", "3d", "idle", "matmul_phased"} - aliases = {"matmul": "matmul_phased"} + allowed = { + "random", + "linear", + "2d", + "3d", + "idle", + "matmul_phased", + "multi_linear", + "bank_group_linear", + "rw_rowwise", + "gather_scatter", + "matmul_tiled_interleave", + "hotspot_random", + } + aliases = { + "matmul": "matmul_phased", + "matmul_tiled": "matmul_tiled_interleave", + } if not isinstance(raw_value, str): print( @@ -378,6 +521,60 @@ def _resolve_n_transactions(master_config: dict, mem_access_type: str, data_widt k = master_config.get('matrix_k') if m is not None and n is not None and k is not None: return int(m) * int(k) + int(k) * int(n) + int(m) * int(n) + elif mem_access_type == 'multi_linear': + total = 0 + for reg in master_config.get('regions', []) or []: + size_v = 
_parse_maybe_bin_int(reg.get('size_bytes'), 0) + total += max(0, int(size_v)) // access_bytes + if total > 0: + return total + elif mem_access_type == 'bank_group_linear': + print( + f"ERROR: {kind}_{local_idx} mem_access_type='bank_group_linear' " + "requires explicit 'n_transactions'." + ) + sys.exit(1) + elif mem_access_type == 'rw_rowwise': + n_rows = master_config.get('n_rows') + rpr = master_config.get('reads_per_row') + wpr = master_config.get('writes_per_row') + if n_rows is not None and rpr is not None and wpr is not None: + return max(0, int(n_rows)) * (max(0, int(rpr)) + max(0, int(wpr))) + elif mem_access_type == 'gather_scatter': + chunk = _parse_maybe_bin_int(master_config.get('chunk_bytes'), access_bytes) + step = max(access_bytes, int(chunk) if chunk is not None else access_bytes) + total = 0 + for reg in master_config.get('read_regions', []) or []: + total += max(0, int(_parse_maybe_bin_int(reg.get('size_bytes'), 0))) // step + wr = master_config.get('write_region', {}) or {} + total += max(0, int(_parse_maybe_bin_int(wr.get('size_bytes'), 0))) // step + if total > 0: + return total + elif mem_access_type == 'matmul_tiled_interleave': + tiles = max(1, int(master_config.get('tiles', 1))) + sched = str(master_config.get('ab_c_schedule', 'A_B_C')).upper().replace('-', '_') + toks = [t for t in sched.split('_') if t] + if not toks: + toks = ['A', 'B', 'C'] + cnt_a = max(1, int(_parse_maybe_bin_int(master_config.get('tile_a_bytes'), access_bytes)) // access_bytes) + cnt_b = max(1, int(_parse_maybe_bin_int(master_config.get('tile_b_bytes'), access_bytes)) // access_bytes) + cnt_c = max(1, int(_parse_maybe_bin_int(master_config.get('tile_c_bytes'), access_bytes)) // access_bytes) + per_tile = 0 + for t in toks: + if t == 'A': + per_tile += cnt_a + elif t == 'B': + per_tile += cnt_b + elif t == 'C': + per_tile += cnt_c + if per_tile > 0: + return tiles * per_tile + elif mem_access_type == 'hotspot_random': + total = 0 + for reg in 
master_config.get('hot_regions', []) or []: + total += max(0, int(_parse_maybe_bin_int(reg.get('size_bytes'), 0))) // access_bytes + if total > 0: + return total elif mem_access_type == 'idle': return 0 print(f"ERROR: {kind}_{local_idx} has mem_access_type='{mem_access_type}' but no " @@ -443,7 +640,7 @@ def _generate_pattern( # hold all transactions once at the current transaction width. region_size_input = pattern_config.get('region_size_bytes') if ( - config in {'linear', 'matmul_phased'} + config in {'linear', 'matmul_phased', 'matmul_tiled_interleave'} and region_size_input is None and 'n_transactions' in pattern_config ): @@ -462,47 +659,74 @@ def _generate_pattern( n_test = _resolve_n_transactions(pattern_config, config, data_width, kind, master_local_idx) master.N_TEST = n_test - # Keep forbidden-address filtering local to this pattern invocation. - # This allows intended buffer reuse across phases (e.g. double buffering) - # while still avoiding duplicates inside a single pattern generator call. - forbidden_read_local = list(LIST_OF_FORBIDDEN_ADDRESSES_READ) - forbidden_write_local = list(LIST_OF_FORBIDDEN_ADDRESSES_WRITE) + # Read/write blocked filtering is pattern-local only. 
+ read_blocked_local = [] + write_blocked_local = [] + tpct_raw = pattern_config.get('traffic_pct', 100) + tpct = 100 if tpct_raw is None else int(tpct_raw) + + multi_regions_cfg = [] + for reg in pattern_config.get('regions', []) or []: + multi_regions_cfg.append({ + 'base': _parse_maybe_bin_int(reg.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(reg.get('size_bytes'), 0), + 'stride_words': int(reg.get('stride_words', 1)), + 'read_pct': reg.get('read_pct'), + }) + + read_regions_cfg = [] + for reg in pattern_config.get('read_regions', []) or []: + read_regions_cfg.append({ + 'base': _parse_maybe_bin_int(reg.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(reg.get('size_bytes'), 0), + }) + wr_cfg_raw = pattern_config.get('write_region', {}) or {} + write_region_cfg = { + 'base': _parse_maybe_bin_int(wr_cfg_raw.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(wr_cfg_raw.get('size_bytes'), 0), + } + + hot_regions_cfg = [] + for reg in pattern_config.get('hot_regions', []) or []: + hot_regions_cfg.append({ + 'base': _parse_maybe_bin_int(reg.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(reg.get('size_bytes'), 0), + 'weight': int(reg.get('weight', 1)), + }) if config == 'random': - tpct = pattern_config.get('traffic_pct') next_start_id = master.random_gen( next_start_id, - forbidden_read_local, - forbidden_write_local, + read_blocked_local, + write_blocked_local, region_base=region_base, region_size=region_size, - traffic_pct=int(tpct) if tpct is not None else 100, + traffic_pct=tpct, traffic_read_pct=pattern_config.get('traffic_read_pct'), append=append, ) elif config == 'linear': - tpct = pattern_config.get('traffic_pct') next_start_id = master.linear_gen( stride0, start_address, next_start_id, - forbidden_read_local, - forbidden_write_local, - traffic_pct=int(tpct) if tpct is not None else 100, + read_blocked_local, + write_blocked_local, + traffic_pct=tpct, traffic_read_pct=pattern_config.get('traffic_read_pct'), append=append, ) elif config 
== '2d': next_start_id = master.gen_2d( stride0, len_d0, stride1, start_address, next_start_id, - forbidden_read_local, - forbidden_write_local, + read_blocked_local, + write_blocked_local, idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), append=append, ) elif config == '3d': next_start_id = master.gen_3d( stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, - forbidden_read_local, - forbidden_write_local, + read_blocked_local, + write_blocked_local, idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), append=append, ) @@ -523,8 +747,8 @@ def _generate_pattern( sys.exit(1) next_start_id = master.matmul_phased_gen( next_start_id, - forbidden_read_local, - forbidden_write_local, + read_blocked_local, + write_blocked_local, region_base, region_size, int(pattern_config.get('matmul_ratio_a', 1)), @@ -540,6 +764,105 @@ def _generate_pattern( region_size_bytes_c=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None), append=append, ) + elif config == 'multi_linear': + next_start_id = master.multi_linear_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + regions=multi_regions_cfg, + schedule=pattern_config.get('schedule', 'round_robin'), + burst_len=int(pattern_config.get('burst_len', 1)), + traffic_pct=tpct, + append=append, + ) + elif config == 'bank_group_linear': + next_start_id = master.bank_group_linear_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + start_bank=int(pattern_config.get('start_bank', 0)), + bank_group_span=int(pattern_config.get('bank_group_span', 1)), + stride_beats=int(pattern_config.get('stride_beats', 1)), + bank_group_hop=int(pattern_config.get('bank_group_hop', 0)), + wen=pattern_config.get('wen'), + traffic_pct=tpct, + append=append, + ) + elif config == 'rw_rowwise': + next_start_id = master.rw_rowwise_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + 
row_base_address=_parse_maybe_bin_int(pattern_config.get('row_base_address'), region_base), + row_size_bytes=_parse_maybe_bin_int(pattern_config.get('row_size_bytes'), access_bytes), + n_rows=int(pattern_config.get('n_rows', 1)), + row_stride_bytes=_parse_maybe_bin_int(pattern_config.get('row_stride_bytes'), access_bytes), + reads_per_row=int(pattern_config.get('reads_per_row', 0)), + writes_per_row=int(pattern_config.get('writes_per_row', 0)), + traffic_pct=tpct, + idle_cycles_between_rows=int(pattern_config.get('idle_cycles_between_rows', 0)), + append=append, + ) + elif config == 'gather_scatter': + next_start_id = master.gather_scatter_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + read_regions=read_regions_cfg, + write_region=write_region_cfg, + chunk_bytes=_parse_maybe_bin_int(pattern_config.get('chunk_bytes'), access_bytes), + schedule=pattern_config.get('schedule', '4read_1write'), + traffic_pct=tpct, + append=append, + ) + elif config == 'matmul_tiled_interleave': + ra = _parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None) + sb = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None) + if ra is None or sa is None or rb is None or sb is None or rc is None or sc is None: + # Fallback to split the combined region into A/B/C thirds. 
+ n_words = max(3, region_size // access_bytes) + a_words = max(1, n_words // 3) + b_words = max(1, n_words // 3) + c_words = max(1, n_words - a_words - b_words) + ra = region_base + sa = a_words * access_bytes + rb = ra + sa + sb = b_words * access_bytes + rc = rb + sb + sc = c_words * access_bytes + next_start_id = master.matmul_tiled_interleave_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + region_base_address_a=ra, + region_size_bytes_a=sa, + region_base_address_b=rb, + region_size_bytes_b=sb, + region_base_address_c=rc, + region_size_bytes_c=sc, + tile_a_bytes=_parse_maybe_bin_int(pattern_config.get('tile_a_bytes'), access_bytes), + tile_b_bytes=_parse_maybe_bin_int(pattern_config.get('tile_b_bytes'), access_bytes), + tile_c_bytes=_parse_maybe_bin_int(pattern_config.get('tile_c_bytes'), access_bytes), + tiles=int(pattern_config.get('tiles', 1)), + ab_c_schedule=pattern_config.get('ab_c_schedule', 'A_B_C'), + traffic_pct=tpct, + idle_cycles_between_tiles=int(pattern_config.get('idle_cycles_between_tiles', 0)), + append=append, + ) + elif config == 'hotspot_random': + next_start_id = master.hotspot_random_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + hot_regions=hot_regions_cfg, + traffic_pct=tpct, + traffic_read_pct=pattern_config.get('traffic_read_pct'), + append=append, + ) _record_memory_map( kind, master_local_idx, @@ -841,6 +1164,162 @@ def _resolve_regions(pattern_config, mem_access_type, is_hwpe, local_idx, n_peer if mem_access_type == 'idle': return [] + if mem_access_type == 'multi_linear': + regions = [] + for idx, reg in enumerate(pattern_config.get('regions', []) or []): + base = _parse_maybe_bin_int(reg.get('base'), region_base) + size = _parse_maybe_bin_int(reg.get('size_bytes'), region_size) + base = (base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, size) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = 
((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + continue + rpct = reg.get('read_pct') + if rpct is None: + lbl = f"R{idx}" + else: + lbl = f"R{idx}({'read' if int(rpct) >= 50 else 'write'})" + regions.append({ + 'label': lbl, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + if mem_access_type == 'bank_group_linear': + span = max(1, int(pattern_config.get('bank_group_span', 1))) + start_bank = int(pattern_config.get('start_bank', 0)) % max(1, int(N_BANKS)) + n_tx = _parse_maybe_bin_int(pattern_config.get('n_transactions'), 1) + n_tx = max(1, int(n_tx)) + rows = max(1, math.ceil(n_tx / span)) + size = min(total_mem_bytes, rows * span * access_bytes) + base = (start_bank * access_bytes) % max(1, total_mem_bytes) + if base + size > total_mem_bytes: + size = max(access_bytes, total_mem_bytes - base) + return [{ + 'label': 'bank_group', + 'base': base, + 'size': size, + 'end': base + size - 1, + }] + if mem_access_type == 'rw_rowwise': + row_base = _parse_maybe_bin_int(pattern_config.get('row_base_address'), region_base) + row_size = _parse_maybe_bin_int(pattern_config.get('row_size_bytes'), access_bytes) + n_rows = max(1, int(pattern_config.get('n_rows', 1))) + row_stride = _parse_maybe_bin_int(pattern_config.get('row_stride_bytes'), row_size) + base = (row_base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = ((max(0, row_stride) * max(0, n_rows - 1)) + max(0, row_size)) + size = (size // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + size = access_bytes + return [{ + 'label': 'rowwise', + 'base': base, + 'size': size, + 'end': base + size - 1, + }] + if mem_access_type == 'gather_scatter': + regions = [] + for idx, reg in enumerate(pattern_config.get('read_regions', []) or []): + base = _parse_maybe_bin_int(reg.get('base'), region_base) + size = 
_parse_maybe_bin_int(reg.get('size_bytes'), 0) + base = (base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, size) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + continue + regions.append({ + 'label': f"gather_{idx}(read)", + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + wr = pattern_config.get('write_region', {}) or {} + wb = _parse_maybe_bin_int(wr.get('base'), region_base) + ws = _parse_maybe_bin_int(wr.get('size_bytes'), 0) + wb = (wb // access_bytes) * access_bytes + if wb >= total_mem_bytes: + wb = wb % total_mem_bytes + ws = (max(0, ws) // access_bytes) * access_bytes + if wb + ws > total_mem_bytes: + ws = ((total_mem_bytes - wb) // access_bytes) * access_bytes + if ws > 0: + regions.append({ + 'label': 'scatter(write)', + 'base': wb, + 'size': ws, + 'end': wb + ws - 1, + }) + return regions + if mem_access_type == 'hotspot_random': + regions = [] + for idx, reg in enumerate(pattern_config.get('hot_regions', []) or []): + base = _parse_maybe_bin_int(reg.get('base'), region_base) + size = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + base = (base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, size) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + continue + regions.append({ + 'label': f"hot_{idx}", + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + if mem_access_type == 'matmul_tiled_interleave': + ra = _parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None) + sb = 
_parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None) + regions = [] + if ra is not None and sa is not None and rb is not None and sb is not None and rc is not None and sc is not None: + sub_defs = [ + ('A(read)', ra, sa), + ('B(read)', rb, sb), + ('C(write)', rc, sc), + ] + else: + n_words = max(3, region_size // access_bytes) + a_words = max(1, n_words // 3) + b_words = max(1, n_words // 3) + c_words = max(1, n_words - a_words - b_words) + sub_defs = [ + ('A(read)', region_base, a_words * access_bytes), + ('B(read)', region_base + a_words * access_bytes, b_words * access_bytes), + ('C(write)', region_base + (a_words + b_words) * access_bytes, c_words * access_bytes), + ] + for label, base_raw, size_raw in sub_defs: + base = (int(base_raw) // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, int(size_raw)) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size > 0: + regions.append({ + 'label': label, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions if mem_access_type != 'matmul_phased': return [{ @@ -907,38 +1386,17 @@ def _resolve_regions(pattern_config, mem_access_type, is_hwpe, local_idx, n_peer }) return regions - def _estimate_pattern_cycles(pattern_config, mem_access_type, n_test): - # Intentionally simple temporal model: - # - one time unit per transaction, plus optional req=0 idles from traffic shaping - # - no interconnect stall/conflict modeling - # - explicit start_delay_cycles is still honored separately + def _estimate_pattern_cycles(pattern_config, _mem_access_type, n_test, _txn_bytes): + # Temporal model intentionally follows emitted traffic only: + # one unit per transaction plus req=0 idles from 
traffic_pct shaping. + # No absolute/phase/tile/row cycle estimation is applied here. base = max(0, int(n_test)) tpct = pattern_config.get('traffic_pct') n_idles_per_req = 0 if tpct is not None: tp = max(1, min(100, int(tpct))) n_idles_per_req = 0 if tp >= 100 else int(round((100 - tp) / tp)) - cycles = base * (1 + n_idles_per_req) - # Keep explicit matmul phase-boundary idle modeling. - if mem_access_type == 'matmul_phased': - idle_between = int(pattern_config.get('idle_cycles_between_phases', 0)) - if idle_between > 0: - ra = max(0, int(pattern_config.get('matmul_ratio_a', 1))) - rb = max(0, int(pattern_config.get('matmul_ratio_b', 1))) - rc = max(0, int(pattern_config.get('matmul_ratio_c', 1))) - if ra == 0 and rb == 0 and rc == 0: - ra, rb, rc = 1, 1, 1 - s = ra + rb + rc - ca = (base * ra) // s - cb = (base * rb) // s - cc = base - ca - cb - phase_gaps = 0 - if ca > 0 and (cb > 0 or cc > 0): - phase_gaps += 1 - if cb > 0 and cc > 0: - phase_gaps += 1 - cycles += idle_between * phase_gaps - return int(cycles) + return int(base * (1 + n_idles_per_req)) pattern_nodes = [] node_idx_by_driver_pattern = {} @@ -971,7 +1429,7 @@ def _estimate_pattern_cycles(pattern_config, mem_access_type, n_test): 'wait_for_jobs_declared': declared_wait_for_jobs, 'wait_for_jobs_effective': effective_wait_for_jobs, 'n_transactions': int(n_test), - 'cycles': int(_estimate_pattern_cycles(pat, mem_access_type, n_test)), + 'cycles': int(_estimate_pattern_cycles(pat, mem_access_type, n_test, int(data_width // 8))), 'mem_access_type': mem_access_type, 'traffic_read_pct': pat.get('traffic_read_pct'), 'txn_bytes': int(data_width // 8), From a3e4008b271cd2fe808265bd6a88d6ac9fb486c4 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Wed, 11 Mar 2026 19:04:36 +0100 Subject: [PATCH 25/25] verif: Improve documentation --- target/verif/simvectors/main.py | 22 ++++++++++++++- target/verif/src/application_driver.sv | 14 ++++++++-- target/verif/src/tb_hci.sv | 37 +++++++++++++++++--------- 
target/verif/src/tb_hci_pkg.sv | 29 ++++++++++++++------ target/verif/src/tcdm_banks_wrap.sv | 2 ++ 5 files changed, 81 insertions(+), 23 deletions(-) diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index 190d0bd..da740bb 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -4,7 +4,14 @@ JSON config files: workload, testbench and hardware. It produces cycle-accurate stimuli in `verif/simvectors/generated/stimuli`. -Each stimuli file has one line per simulation cycle: +Each stimuli file encodes an offered per-cycle request stream plus PAUSE fence tokens. + +Idle lines (req=0) represent intended issue gaps in the absence of backpressure. +The application driver may consume some idle entries while stalled on an earlier +request grant, and hide some memory/interconnect latency. So the file is not a +strict wall-clock replay under contention. + +Stimuli line format: req(1b) id(IWb) wen(1b) data(Nb) add(Ab) """ @@ -1089,6 +1096,19 @@ def _resolve_req_levels(wait_for_jobs_list): # FENCE_REQ_LEVELS[N_DRIVERS][MAX_FENCES][N_DRIVERS] — int unsigned # Pack FENCE_REQ_LEVELS as FENCE_REQ_LEVELS_PACKED[i][f] = N_DRIVERS*4-bit vector. # Bits [j*4+3:j*4] = required fence_idx[j] (4 bits, supports 0..15). + max_req_level = 0 + for i in range(N_DRIVERS): + for f in range(max_fences): + for j in range(N_DRIVERS): + max_req_level = max(max_req_level, int(req_levels[i][f][j])) + if max_req_level > 15: + print( + "ERROR: Fence dependency level overflow: " + f"required fence_idx={max_req_level}, but packed format supports only 0..15. " + "Reduce the number of fence crossings per dependent job or widen LEVEL_BITS." 
+ ) + sys.exit(1) + LEVEL_BITS = 4 packed_width = N_DRIVERS * LEVEL_BITS packed_hex_digits = (packed_width + 3) // 4 diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index 6ed4787..a6b7aa5 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -20,12 +20,20 @@ * * Stimulus file format (one line per cycle): * req(1b) id(IWb) wen(1b) data(Nb) add(Ab) -- active transaction - * PAUSE -- fence synchronization point + * PAUSE -- fence synchronization point + * + * Idle entries (req=0) are consumed as issue gaps when the driver is free to + * advance. While stalled waiting for a grant, the driver may advance over later + * idle entries, so the stimuli file represents offered traffic order and fence + * structure rather than an exact wall-clock replay under backpressure. * * When a PAUSE token is encountered the driver drains all in-flight reads * (waits in DRAIN_FOR_PAUSE), then enters PAUSED and holds fence_reached_o=1 * until resume_i is asserted. This allows multi-phase execution on a single * driver without resetting counters between phases. + * Multiple consecutive PAUSE tokens are legal and represent multiple fence slots + * with no intervening traffic (e.g. free-pass completion fence followed by a + * synthetic blocking fence for the next pattern). */ module application_driver #( @@ -213,7 +221,9 @@ module application_driver #( hci_if.data = transactions[last_op_issued_q].data; hci_if.add = transactions[last_op_issued_q].add; if (tr_idx_q < transactions.size()) begin - // Advance over any idle (req=0, not-pause) entries to hide memory latency + // Consume later idle entries while stalled so the driver can hide memory + // latency/backpressure when the workload permits it. This makes req=0 tokens + // issue-gap hints, not strict simulation-time no-op cycles. 
if (!transactions[tr_idx_q].req && !transactions[tr_idx_q].is_pause) begin tr_idx_d = tr_idx_q + 1; end diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 0f743e2..5ada60b 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -283,10 +283,12 @@ module tb_hci assign s_hci_ctrl.priority_cnt_numerator = PRIORITY_CNT_NUMERATOR; assign s_hci_ctrl.priority_cnt_denominator = PRIORITY_CNT_DENOMINATOR; - // fence_idx[i]: number of fences driver i has fully passed (exited PAUSED state). - // Increments when the handshake fires: resume_i asserted while fence_reached_o is high. - // This is the same cycle the driver transitions out of PAUSED, so dependents see the - // updated count immediately on the next cycle (after the registered flip-flop updates). + // fence_idx[i] = number of PAUSE tokens driver i has passed. + // This counts all fences in file order, including synthetic blocking fences + // and trailing completion fences. + // + // fence_idx increments when resume_i is asserted while fence_reached_o is high, + // i.e. when the PAUSED-state handshake completes and the driver leaves that fence. always_ff @(posedge clk or negedge rst_n) begin if (!rst_n) begin for (int i = 0; i < N_DRIVERS; i++) fence_idx[i] <= '0; @@ -298,15 +300,13 @@ module tb_hci end end - // s_resume[i]: fire when for every set bit j in FENCE_MASKS[i][fence_idx[i]], - // fence_idx[j] >= FENCE_REQ_LEVELS[i][fence_idx[i]][j]. - // - // fence_idx[j] >= p_j+1 means j has exited the trailing PAUSE after pattern p_j, - // i.e. j has completed pattern p_j. No s_fence_reached shortcut needed — the - // trailing PAUSE of a zero-mask fence is exited in one cycle, so fence_idx - // advances promptly and the check is unambiguous. + // s_resume[i] is asserted only while driver i is paused at its current fence. 
+ // Driver i may pass that fence when, for every dependency bit j set in the + // current FENCE_MASKS entry, fence_idx[j] is at least the required level + // encoded in FENCE_REQ_LEVELS_PACKED. // - // In MUX mode bus serialization is handled by s_mux_sel independently. + // In other words: blocking fences wait for explicit dependency completion; + // trailing zero-mask fences are free passes. always_comb begin for (int i = 0; i < N_DRIVERS; i++) begin automatic logic [N_DRIVERS-1:0] cur_mask; @@ -542,4 +542,17 @@ module tb_hci end endgenerate + // Advisory check only. The hard overflow guard is in Python generation + // before packing FENCE_REQ_LEVELS into 4-bit fields. + // In case of failure due to this asser, modify tb_hci_pkg.sv and generation of fence_masks.mk + initial begin + if (MAX_FENCES > 16) begin + $warning( + "MAX_FENCES=%0d exceeds the nominal 4-bit fence-level range; " + "ensure no dependency requires a level > 15.", + MAX_FENCES + ); + end + end + endmodule diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index dc77085..507565c 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -77,14 +77,27 @@ package tb_hci_pkg; localparam int unsigned N_LOG_MASTERS = N_CORE + N_DMA + N_EXT; localparam int unsigned N_DRIVERS = N_LOG_MASTERS + N_HWPE; - // Fence parameters. Every pattern ends with a PAUSE; fence f = PAUSE after pattern f. - // fence_idx[i] == k means driver i has completed k patterns. - // - // FENCE_MASKS[i][f][j]=1: driver j is a dependency for driver i at fence f. - // FENCE_REQ_LEVELS[i][f][j]: minimum fence_idx[j] required before driver i can pass - // fence f. Equals the pattern index of the referenced phase within driver j plus 1 - // (i.e., j must have completed that specific pattern). - // Both are generated by main.py and passed via defines. +// Fence parameters. +// +// Fence slots enumerate every PAUSE token in the stimulus file, in file order. 
+// This includes: +// (1) synthetic blocking PAUSEs inserted before patterns that have wait_for_jobs, +// (2) trailing PAUSEs emitted after every pattern. +// +// fence_idx[i] counts how many PAUSE tokens driver i has passed so far. +// It is therefore a "passed-fence count", not a "completed-pattern count". +// +// FENCE_MASKS[i][f][j] = 1: +// at fence slot f of driver i, driver j is a dependency. +// +// FENCE_REQ_LEVELS_PACKED[i][f][j]: +// minimum fence_idx[j] required before driver i may pass fence slot f. +// +// For a synthetic pre-pattern fence, the required level corresponds to the +// dependency driver's fence count after the referenced job has completed. +// For a trailing pattern fence, the mask is zero and the fence is a free pass. +// +// Both arrays are generated by main.py and passed via defines. localparam int unsigned MAX_FENCES = `ifdef MAX_FENCES_PARAM `MAX_FENCES_PARAM `else 1 `endif; localparam logic [N_DRIVERS-1:0] FENCE_MASKS [N_DRIVERS][MAX_FENCES] = diff --git a/target/verif/src/tcdm_banks_wrap.sv b/target/verif/src/tcdm_banks_wrap.sv index dd9e223..7aad9a8 100644 --- a/target/verif/src/tcdm_banks_wrap.sv +++ b/target/verif/src/tcdm_banks_wrap.sv @@ -83,6 +83,8 @@ module tcdm_banks_wrap #( .rdata_o(tcdm_slave[i].r_data ) // read data ); + //NOTE: Commented out. r_valid response is handled by interconnect + //r_valid // always_ff @(posedge clk_i or negedge rst_ni) begin : rvalid_gen // if(~rst_ni) begin