From 6e6f61592f0c92100452a0be8a9bb90c2df60e96 Mon Sep 17 00:00:00 2001 From: Francesco Conti Date: Thu, 5 Mar 2026 12:33:56 +0100 Subject: [PATCH 01/25] Add some README for verification env + change for non-IIS machines --- README.md | 33 +++++++++++++++++++++++++++++++++ target/verif/verif.mk | 6 +++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 690bc53..6a67cfc 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,39 @@ The `hci` repository contains the definition of the Heterogeneous Cluster Interc - https://github.com/pulp-platform/neureka - https://github.com/pulp-platform/redmule +# Usage flow + +The typical full flow is: + +``` +make checkout # Fetch and check out dependencies via Bender +make config-verif # Generate Makefiles from JSON verification configs +make stim-verif # Generate simulation stimulus vectors (requires Python 3) +make compile-verif # Compile RTL and testbench with QuestaSim +make opt-verif # Optimize the compiled design with vopt +make run-verif # Run the simulation (batch mode by default) +``` + +To open the simulation in the QuestaSim GUI with waveforms, pass `GUI=1`: + +``` +make run-verif GUI=1 +``` + +Cleanup targets: + +| Target | Effect | +|----------------------|----------------------------------------------------| +| `clean-config-verif` | Remove generated configuration Makefiles | +| `clean-stim-verif` | Remove generated stimulus vectors | +| `clean-sim-verif` | Remove QuestaSim build artifacts (work lib, logs) | +| `clean-verif` | Run all three clean targets above | + +**Notes:** +- On IIS machines, defaults to QuestaSim (`questa-2022.3`) (can be overriden with `SIM_QUESTA=`). On non-IIS machines, defaults to QuestaSim available in `PATH`. +- Verification configuration is driven by JSON files under `target/verif/config/`. Edit those before running `config-verif` and `stim-verif`. 
+- `run-verif` depends on `opt-verif` and `stim-verif`, so after `checkout` and `config-verif` you can jump straight to it. + # Style guide These IPs use a slightly different style than other PULP IPs. Refer to `STYLE.md` for some indications. diff --git a/target/verif/verif.mk b/target/verif/verif.mk index dd656b0..bf324d3 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -17,7 +17,11 @@ include $(HCI_VERIF_DIR)/bender.mk # Tooling #NOTE: Only QuestaSim is currently supported by verification framework -SIM_QUESTA ?= questa-2022.3 +ifneq (,$(wildcard /etc/iis.version)) + SIM_QUESTA ?= questa-2022.3 +else + SIM_QUESTA ?= +endif SIM_VLIB ?= $(SIM_QUESTA) vlib SIM_VSIM ?= $(SIM_QUESTA) vsim SIM_VOPT ?= $(SIM_QUESTA) vopt From b00d0b151412741a707d03fd54b54768be221ef0 Mon Sep 17 00:00:00 2001 From: Francesco Conti Date: Thu, 5 Mar 2026 16:26:00 +0100 Subject: [PATCH 02/25] Add some (reasonable) waivers + minor cleanup --- README.md | 3 +- rtl/hci_interconnect.sv | 20 +++++++++- rtl/interco/hci_arbiter.sv | 68 +++++++++++++++++++-------------- rtl/interco/hci_arbiter_tree.sv | 21 +++++++++- rtl/interco/hci_router.sv | 35 +++++++++++------ target/verif/verif.mk | 6 ++- target/verif/vsim/tb_hci.tcl | 4 +- 7 files changed, 108 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 6a67cfc..fc10244 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,7 @@ The `hci` repository contains the definition of the Heterogeneous Cluster Interc - https://github.com/pulp-platform/neureka - https://github.com/pulp-platform/redmule -# Usage flow - +# Verification flow The typical full flow is: ``` diff --git a/rtl/hci_interconnect.sv b/rtl/hci_interconnect.sv index 9825891..1d6a771 100644 --- a/rtl/hci_interconnect.sv +++ b/rtl/hci_interconnect.sv @@ -142,8 +142,22 @@ module hci_interconnect EW: DEFAULT_EW, EHW: DEFAULT_EHW }; - `HCI_INTF_ARRAY(hwpe_mem_muxed, clk_i, 0:N_MEM-1); - + hci_core_intf #( + .DW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).DW ), + .AW ( 
`HCI_SIZE_PARAM(hwpe_mem_muxed).AW ), + .BW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).BW ), + .UW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).UW ), + .IW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).IW ), + .EW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).EW ), + .EHW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).EHW ) +`ifndef SYNTHESIS + , + .WAIVE_RQ3_ASSERT ( WAIVE_RQ3_ASSERT ), // hwpe_mem_muxed is an internal muxed signal, not a protocol-compliant port + .WAIVE_RQ4_ASSERT ( WAIVE_RQ4_ASSERT ) +`endif + ) hwpe_mem_muxed [0:N_MEM-1] ( + .clk ( clk_i ) + ); localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe_mem) = '{ DW: DEFAULT_DW, @@ -257,6 +271,8 @@ module hci_interconnect hci_arbiter_tree #( .NB_REQUESTS(N_HWPE), .NB_CHAN ( N_MEM ), + .WAIVE_RQ3_ASSERT ( WAIVE_RQ3_ASSERT ), + .WAIVE_RQ4_ASSERT ( WAIVE_RQ4_ASSERT ), .`HCI_SIZE_PARAM(out)(`HCI_SIZE_PARAM(hwpe_mem_muxed)) ) i_wide_port_arbiter_tree ( .clk_i ( clk_i ), diff --git a/rtl/interco/hci_arbiter.sv b/rtl/interco/hci_arbiter.sv index a291532..2447608 100644 --- a/rtl/interco/hci_arbiter.sv +++ b/rtl/interco/hci_arbiter.sv @@ -142,38 +142,48 @@ module hci_arbiter in_low [ii].gnt = '0; if(hs_pass_d[ii]) begin - out[ii].req = in_high[ii].req; - out[ii].add = in_high[ii].add; - out[ii].wen = in_high[ii].wen; - out[ii].be = in_high[ii].be; - out[ii].data = in_high[ii].data; - out[ii].id = in_high[ii].id; - out[ii].user = in_high[ii].user; - out[ii].ecc = in_high[ii].ecc; - in_high[ii].gnt = out[ii].gnt; - end + out[ii].req = in_high[ii].req; + out[ii].add = in_high[ii].add; + out[ii].wen = in_high[ii].wen; + out[ii].be = in_high[ii].be; + out[ii].data = in_high[ii].data; + out[ii].id = in_high[ii].id; + out[ii].user = in_high[ii].user; + out[ii].r_ready = in_high[ii].r_ready; + out[ii].ecc = in_high[ii].ecc; + out[ii].ereq = in_high[ii].ereq; + out[ii].r_eready = in_high[ii].r_eready; + in_high[ii].gnt = out[ii].gnt; + end else begin - out[ii].req = in_low[ii].req; - out[ii].add = in_low[ii].add; - out[ii].wen = in_low[ii].wen; - out[ii].be = in_low[ii].be; 
- out[ii].data = in_low[ii].data; - out[ii].id = in_low[ii].id; - out[ii].user = in_low[ii].user; - out[ii].ecc = in_low[ii].ecc; - in_low[ii].gnt = out[ii].gnt; + out[ii].req = in_low[ii].req; + out[ii].add = in_low[ii].add; + out[ii].wen = in_low[ii].wen; + out[ii].be = in_low[ii].be; + out[ii].data = in_low[ii].data; + out[ii].id = in_low[ii].id; + out[ii].user = in_low[ii].user; + out[ii].r_ready = in_low[ii].r_ready; + out[ii].ecc = in_low[ii].ecc; + out[ii].ereq = in_low[ii].ereq; + out[ii].r_eready = in_low[ii].r_eready; + in_low[ii].gnt = out[ii].gnt; end - in_high[ii].r_data = out[ii].r_data; - in_low [ii].r_data = out[ii].r_data; - in_high[ii].r_id = out[ii].r_id; - in_low [ii].r_id = out[ii].r_id; - in_high[ii].r_opc = out[ii].r_opc; - in_low [ii].r_opc = out[ii].r_opc; - in_high[ii].r_user = out[ii].r_user; - in_low [ii].r_user = out[ii].r_user; - in_high[ii].r_ecc = out[ii].r_ecc; - in_low [ii].r_ecc = out[ii].r_ecc; + in_high[ii].r_data = out[ii].r_data; + in_low [ii].r_data = out[ii].r_data; + in_high[ii].r_id = out[ii].r_id; + in_low [ii].r_id = out[ii].r_id; + in_high[ii].r_opc = out[ii].r_opc; + in_low [ii].r_opc = out[ii].r_opc; + in_high[ii].r_user = out[ii].r_user; + in_low [ii].r_user = out[ii].r_user; + in_high[ii].r_ecc = out[ii].r_ecc; + in_low [ii].r_ecc = out[ii].r_ecc; + in_high[ii].egnt = out[ii].egnt; + in_low [ii].egnt = out[ii].egnt; + in_high[ii].r_evalid = out[ii].r_evalid; + in_low [ii].r_evalid = out[ii].r_evalid; // r_valid signals are NOT propagated by the arbiter, they are generated at // routing stage. In previous HCI versions, we used a r_valid-less version // of the protocol here. 
diff --git a/rtl/interco/hci_arbiter_tree.sv b/rtl/interco/hci_arbiter_tree.sv index 2a10524..f304432 100644 --- a/rtl/interco/hci_arbiter_tree.sv +++ b/rtl/interco/hci_arbiter_tree.sv @@ -39,7 +39,9 @@ module hci_arbiter_tree #( parameter int unsigned NB_REQUESTS = 1, parameter int unsigned NB_CHAN = 16, - parameter hci_size_parameter_t `HCI_SIZE_PARAM(out) = '0 + parameter hci_size_parameter_t `HCI_SIZE_PARAM(out) = '0, + parameter bit WAIVE_RQ3_ASSERT = 1'b0, + parameter bit WAIVE_RQ4_ASSERT = 1'b0 ) ( input logic clk_i, @@ -74,7 +76,22 @@ module hci_arbiter_tree EHW: `HCI_SIZE_GET_EHW(out) }; - `HCI_INTF_ARRAY(arb_out, clk_i, 0:NB_LEVELS*MAX_ARBITERS_PER_LEVEL*NB_CHAN-1); + hci_core_intf #( + .DW ( `HCI_SIZE_PARAM(arb_out).DW ), + .AW ( `HCI_SIZE_PARAM(arb_out).AW ), + .BW ( `HCI_SIZE_PARAM(arb_out).BW ), + .UW ( `HCI_SIZE_PARAM(arb_out).UW ), + .IW ( `HCI_SIZE_PARAM(arb_out).IW ), + .EW ( `HCI_SIZE_PARAM(arb_out).EW ), + .EHW ( `HCI_SIZE_PARAM(arb_out).EHW ) +`ifndef SYNTHESIS + , + .WAIVE_RQ3_ASSERT ( WAIVE_RQ3_ASSERT ), // arb_out is an internal arbitration signal, not a protocol-compliant port + .WAIVE_RQ4_ASSERT ( WAIVE_RQ4_ASSERT ) +`endif + ) arb_out [0:NB_LEVELS*MAX_ARBITERS_PER_LEVEL*NB_CHAN-1] ( + .clk ( clk_i ) + ); generate for(genvar lvl=0; lvl Date: Fri, 6 Mar 2026 13:08:19 +0100 Subject: [PATCH 03/25] tb: Fix erroneous latency reporting --- target/verif/src/latency_monitor.sv | 8 ++++-- target/verif/src/simulation_report.sv | 38 +++++++++++++-------------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/target/verif/src/latency_monitor.sv b/target/verif/src/latency_monitor.sv index d89cdf5..b645ce1 100644 --- a/target/verif/src/latency_monitor.sv +++ b/target/verif/src/latency_monitor.sv @@ -74,8 +74,10 @@ module latency_monitor #( req_start_cycle_log <= '0; req_prev_log <= 1'b0; end else begin - if (hci_log_if[gi].req && !req_prev_log) begin + if (hci_log_if[gi].req && !req_prev_log && !hci_log_if[gi].gnt) begin 
req_start_cycle_log <= cycle_q; + end else if (hci_log_if[gi].gnt) begin + req_start_cycle_log <= cycle_q + 1; end if (hci_log_if[gi].req && hci_log_if[gi].gnt) begin @@ -130,8 +132,10 @@ module latency_monitor #( req_start_cycle_hwpe <= '0; req_prev_hwpe <= 1'b0; end else begin - if (hci_hwpe_if[gi].req && !req_prev_hwpe) begin + if (hci_hwpe_if[gi].req && !req_prev_hwpe && !hci_hwpe_if[gi].gnt) begin req_start_cycle_hwpe <= cycle_q; + end else if (hci_hwpe_if[gi].gnt) begin + req_start_cycle_hwpe <= cycle_q + 1; end if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt) begin diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index 4d4e2e4..c9e8423 100644 --- a/target/verif/src/simulation_report.sv +++ b/target/verif/src/simulation_report.sv @@ -205,11 +205,11 @@ module simulation_report $display("\n\\\\BANDWIDTH\\\\"); $display( - "Completion bandwidth (writes granted + reads completed): %0.1f bit/cycle", + "Completion bandwidth (writes granted + reads completed): %0.2f bit/cycle", throughput_complete_i ); - $display("Stimulus phase duration: %0.1f cycles", stim_latency_i); - $display("Completion phase duration: %0.1f cycles", tot_latency_i); + $display("Stimulus phase duration: %0.2f cycles", stim_latency_i); + $display("Completion phase duration: %0.2f cycles", tot_latency_i); $display( "Granted transactions: reads=%0d writes=%0d total=%0d", total_read_granted_transactions, @@ -222,28 +222,28 @@ module simulation_report ); $display("\n\\\\SIMULATION TIME\\\\"); - $display("Total simulation time: %0.1f cycles", tot_latency_i); + $display("Total simulation time: %0.2f cycles", tot_latency_i); for (int i = 0; i < N_CORE_REAL; i++) begin $display( - "Core%0d (master_log_%0d): %0.1f cycles", + "Core%0d (master_log_%0d): %0.2f cycles", i, i, latency_per_master_i[i] ); end for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin $display( - "DMA%0d (master_log_%0d): %0.1f cycles", + "DMA%0d (master_log_%0d): %0.2f cycles", i - N_CORE, 
i, latency_per_master_i[i] ); end for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin $display( - "EXT%0d (master_log_%0d): %0.1f cycles", + "EXT%0d (master_log_%0d): %0.2f cycles", i - (N_CORE + N_DMA), i, latency_per_master_i[i] ); end for (int i = N_MASTER - N_HWPE; i < N_MASTER - N_HWPE + N_HWPE_REAL; i++) begin $display( - "HWPE%0d (master_hwpe_%0d): %0.1f cycles", + "HWPE%0d (master_hwpe_%0d): %0.2f cycles", i - (N_MASTER - N_HWPE), i - (N_MASTER - N_HWPE), latency_per_master_i[i] @@ -340,7 +340,7 @@ module simulation_report $display("\n\\\\REQUEST-TO-GRANT LATENCY\\\\"); for (int i = 0; i < N_CORE_REAL; i++) begin $display( - "master_log_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_log_i[i] != 0) ? (sum_req_to_gnt_latency_log_i[i] / real'(n_gnt_transactions_log_i[i])) : @@ -350,7 +350,7 @@ module simulation_report end for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin $display( - "master_log_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_log_i[i] != 0) ? (sum_req_to_gnt_latency_log_i[i] / real'(n_gnt_transactions_log_i[i])) : @@ -360,7 +360,7 @@ module simulation_report end for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin $display( - "master_log_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_log_i[i] != 0) ? (sum_req_to_gnt_latency_log_i[i] / real'(n_gnt_transactions_log_i[i])) : @@ -370,7 +370,7 @@ module simulation_report end for (int i = 0; i < N_HWPE_REAL; i++) begin $display( - "master_hwpe_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_hwpe_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_hwpe_i[i] != 0) ? 
(sum_req_to_gnt_latency_hwpe_i[i] / real'(n_gnt_transactions_hwpe_i[i])) : @@ -380,32 +380,32 @@ module simulation_report end $display(""); $display( - "Total accumulated req->gnt latency: %0.1f cycles over %0d grants", + "Total accumulated req->gnt latency: %0d cycles over %0d grants", sum_req_to_gnt_latency_all, total_gnt_transactions_all ); $display( - "LOG avg req->gnt latency (weighted by grant count): %0.1f cycles", + "LOG avg req->gnt stall latency (weighted by grant count): %0.2f cycles", average_req_to_gnt_latency_log_weighted ); $display( - "LOG avg req->gnt latency (mean of per-master averages): %0.1f cycles", + "LOG avg req->gnt stall latency (mean of per-master averages): %0.2f cycles", average_req_to_gnt_latency_log_unweighted ); $display( - "HWPE avg req->gnt latency (weighted by grant count): %0.1f cycles", + "HWPE avg req->gnt stall latency (weighted by grant count): %0.2f cycles", average_req_to_gnt_latency_hwpe_weighted ); $display( - "HWPE avg req->gnt latency (mean of per-master averages): %0.1f cycles", + "HWPE avg req->gnt stall latency (mean of per-master averages): %0.2f cycles", average_req_to_gnt_latency_hwpe_unweighted ); $display( - "Global avg req->gnt latency (weighted by grant count): %0.1f cycles", + "Global avg req->gnt stall latency (weighted by grant count): %0.2f cycles", average_req_to_gnt_latency_weighted ); $display( - "Global avg req->gnt latency (mean of per-master averages): %0.1f cycles", + "Global avg req->gnt stall latency (mean of per-master averages): %0.2f cycles", average_req_to_gnt_latency_unweighted ); $display(""); From 6ea8a8817d4d7b0670594d8825d84e4470482236 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Fri, 6 Mar 2026 13:09:36 +0100 Subject: [PATCH 04/25] rtl: :bug: Set independent QoS config for HWPE branch (was same as LIC) --- rtl/hci_interconnect.sv | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/rtl/hci_interconnect.sv b/rtl/hci_interconnect.sv index 
1d6a771..53e10ad 100644 --- a/rtl/hci_interconnect.sv +++ b/rtl/hci_interconnect.sv @@ -268,6 +268,19 @@ module hci_interconnect end : hwpe_req2mem + // Set arbitration tree to be perfectly fair. It must not + // follow the max stall policy of the HWPE vs LIC arbiter. + // FIXME: it would be interesting to explore what happens + // with an unfair but configurable setting. Probably we need + // a generator to do that, I do not see a way to code it in + // pure SystemVerilog. + hci_interconnect_ctrl_t ctrl_arbiter_tree; + always_comb + begin + ctrl_arbiter_tree = ctrl_i; + ctrl_arbiter_tree.low_prio_max_stall = 1; + end + hci_arbiter_tree #( .NB_REQUESTS(N_HWPE), .NB_CHAN ( N_MEM ), @@ -278,7 +291,7 @@ module hci_interconnect .clk_i ( clk_i ), .rst_ni ( rst_ni ), .clear_i ( clear_i ), - .ctrl_i ( ctrl_i ), + .ctrl_i ( ctrl_arbiter_tree ), .in ( hwpe_mem ), .out ( hwpe_mem_muxed ) ); From 7c61e6c0211556f96cbfa53fe9a7736788fa4a0e Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Fri, 27 Feb 2026 16:05:40 +0100 Subject: [PATCH 05/25] [WIP] Implement LOG and MUX tb --- target/verif/bender.mk | 6 +- target/verif/config/generated/hardware.mk.tpl | 3 +- target/verif/config/generated/json_to_mk.py | 46 +++- .../verif/config/generated/testbench.mk.tpl | 1 - target/verif/config/hardware.json | 3 +- target/verif/config/testbench.json | 1 - .../verif/simvectors/hci_stimuli/processor.py | 10 +- target/verif/simvectors/main.py | 10 +- target/verif/src/application_driver.sv | 8 +- target/verif/src/simulation_report.sv | 68 +++--- target/verif/src/tb_hci.sv | 216 ++++++++++++++---- target/verif/src/tb_hci_pkg.sv | 78 ++++--- target/verif/src/throughput_monitor.sv | 4 +- target/verif/verif.mk | 5 +- target/verif/vsim/tb_hci.tcl | 95 +++++++- 15 files changed, 411 insertions(+), 143 deletions(-) diff --git a/target/verif/bender.mk b/target/verif/bender.mk index 2e1362e..8ccc663 100644 --- a/target/verif/bender.mk +++ b/target/verif/bender.mk @@ -8,7 +8,7 @@ VERIF_DEFS ?= VERIF_DEFS 
+= \ -D N_HWPE=$(N_HWPE) \ - -D HWPE_WIDTH=$(HWPE_WIDTH) \ + -D HWPE_WIDTH_FACT=$(HWPE_WIDTH_FACT) \ -D N_CORE=$(N_CORE) \ -D N_DMA=$(N_DMA) \ -D N_EXT=$(N_EXT) \ @@ -22,11 +22,11 @@ VERIF_DEFS += \ -D TRANSACTION_RATIO=$(TRANSACTION_RATIO) \ -D CLK_PERIOD=$(CLK_PERIOD) \ -D RST_CLK_CYCLES=$(RST_CLK_CYCLES) \ - -D MAX_CYCLES_BETWEEN_GNT_RVALID=$(MAX_CYCLES_BETWEEN_GNT_RVALID) \ -D RANDOM_GNT=$(RANDOM_GNT) \ + -D INTERCO_TYPE=$(INTERCO_TYPE) \ -D INVERT_PRIO=$(INVERT_PRIO) \ -D LOW_PRIO_MAX_STALL=$(LOW_PRIO_MAX_STALL) # Common targets for bender VERIF_TARGS ?= -VERIF_TARGS += -t hci_verif \ No newline at end of file +VERIF_TARGS += -t hci_verif diff --git a/target/verif/config/generated/hardware.mk.tpl b/target/verif/config/generated/hardware.mk.tpl index f06a13d..f6bdc14 100644 --- a/target/verif/config/generated/hardware.mk.tpl +++ b/target/verif/config/generated/hardware.mk.tpl @@ -6,10 +6,11 @@ # Hardware configuration parameters (from hardware.json) N_HWPE?=${N_HWPE} -HWPE_WIDTH?=${HWPE_WIDTH} +HWPE_WIDTH_FACT?=${HWPE_WIDTH_FACT} N_CORE?=${N_CORE} N_DMA?=${N_DMA} N_EXT?=${N_EXT} +INTERCO_TYPE?=${INTERCO_TYPE} TS_BIT?=${TS_BIT} EXPFIFO?=${EXPFIFO} SEL_LIC?=${SEL_LIC} diff --git a/target/verif/config/generated/json_to_mk.py b/target/verif/config/generated/json_to_mk.py index fcec54f..4a1c9ba 100755 --- a/target/verif/config/generated/json_to_mk.py +++ b/target/verif/config/generated/json_to_mk.py @@ -45,6 +45,34 @@ def flatten_dict(d, prefix=''): items.append((new_key, v)) return dict(items) + +def get_parameters(config): + """Return flattened parameters from config JSON.""" + params = config.get("parameters") + if not isinstance(params, dict): + return {} + return flatten_dict(params) + + +def load_all_parameters(config_dir): + """Load flattened parameters from all JSON files in config_dir.""" + merged = {} + for json_path in sorted(config_dir.glob("*.json")): + cfg = load_json_config(json_path) + merged.update(get_parameters(cfg)) + return merged + + +def 
template_variables(template_content): + """Extract Template variable names used by template content.""" + pattern = Template.pattern + vars_found = set() + for match in pattern.finditer(template_content): + name = match.group("named") or match.group("braced") + if name is not None: + vars_found.add(name) + return vars_found + def parse_args(argv=None): parser = argparse.ArgumentParser( description="Convert JSON configuration to Makefile fragment using templates." @@ -83,16 +111,22 @@ def main(): # Load template template_content = load_template(template_file) - # Flatten the parameters dict for template substitution - template_data = flatten_dict(config['parameters']) + # Build substitution dictionary: + # 1. all parameters from all configs (fallback) + # 2. parameters from selected config (override) + template_data = load_all_parameters(config_dir) + template_data.update(get_parameters(config)) # Apply template substitution template = Template(template_content) - try: - result = template.substitute(template_data) - except KeyError as e: - print(f"ERROR: Missing template variable: {e}", file=sys.stderr) + missing = sorted(v for v in template_variables(template_content) if v not in template_data) + if missing: + print( + f"ERROR: Missing template variable(s): {', '.join(missing)}", + file=sys.stderr, + ) sys.exit(1) + result = template.substitute(template_data) # Output to stdout print(result) diff --git a/target/verif/config/generated/testbench.mk.tpl b/target/verif/config/generated/testbench.mk.tpl index a1cf812..469736b 100644 --- a/target/verif/config/generated/testbench.mk.tpl +++ b/target/verif/config/generated/testbench.mk.tpl @@ -9,7 +9,6 @@ N_TRANSACTION_LOG?=${N_TRANSACTION_LOG} TRANSACTION_RATIO?=${TRANSACTION_RATIO} CLK_PERIOD?=${CLK_PERIOD} RST_CLK_CYCLES?=${RST_CLK_CYCLES} -MAX_CYCLES_BETWEEN_GNT_RVALID?=${MAX_CYCLES_BETWEEN_GNT_RVALID} RANDOM_GNT?=${RANDOM_GNT} INVERT_PRIO?=${INVERT_PRIO} LOW_PRIO_MAX_STALL?=${LOW_PRIO_MAX_STALL} diff --git 
a/target/verif/config/hardware.json b/target/verif/config/hardware.json index 678217b..028c714 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -2,10 +2,11 @@ "description": "Hardware configuration parameters for HCI interconnect", "parameters": { "N_HWPE": 2, - "HWPE_WIDTH": 8, + "HWPE_WIDTH_FACT": 8, "N_CORE": 8, "N_DMA": 1, "N_EXT": 1, + "INTERCO_TYPE": "LOG", "TS_BIT": 21, "EXPFIFO": 0, "SEL_LIC": 0, diff --git a/target/verif/config/testbench.json b/target/verif/config/testbench.json index a4daaf4..46005ab 100644 --- a/target/verif/config/testbench.json +++ b/target/verif/config/testbench.json @@ -5,7 +5,6 @@ "TRANSACTION_RATIO": 1, "CLK_PERIOD": 50, "RST_CLK_CYCLES": 10, - "MAX_CYCLES_BETWEEN_GNT_RVALID": 1, "RANDOM_GNT": 0, "INVERT_PRIO": 0, "LOW_PRIO_MAX_STALL": 10 diff --git a/target/verif/simvectors/hci_stimuli/processor.py b/target/verif/simvectors/hci_stimuli/processor.py index 91b4892..2f580be 100644 --- a/target/verif/simvectors/hci_stimuli/processor.py +++ b/target/verif/simvectors/hci_stimuli/processor.py @@ -6,7 +6,7 @@ # 1) ++UNFOLD++ the transactions in the .txt files into a cycle-level list. 
# -folder_path_raw --> String that specifies the path of the folder containing the raw txt files (where the cycle offset is still indicated) # -folder_path_processed --> String that specifies the path of the folder containing the new txt files created by this function -def unfold_raw_txt(folder_path_raw,folder_path_processed,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH): +def unfold_raw_txt(folder_path_raw,folder_path_processed,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH_FACT): file_names = [file for file in os.listdir(folder_path_raw) if file.endswith(".txt")] for file in file_names: filepath_read = os.path.join(folder_path_raw,file) @@ -27,18 +27,18 @@ def unfold_raw_txt(folder_path_raw,folder_path_processed,IW,DATA_WIDTH,ADD_WIDTH file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") else: for _ in range(int(cycle_offset)-1): - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") + file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH_FACT*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") file_write.write('1 ' + id + " " + wen + " " + data + " " + add + "\n") else: if "log" in file: file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") else: - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") + file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH_FACT*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") # 2) ++PAD++ txt files to have the same number of lines # -Folder_path --> path of the folder containing the txt files to be padded -def pad_txt_files(folder_path,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH): +def pad_txt_files(folder_path,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH_FACT): file_names = [file for file in os.listdir(folder_path) if file.endswith(".txt")] # List of the txt file names in the folder max_lines = 0 line_count = {} # Dictionary to store the number 
of lines in each txt file @@ -59,4 +59,4 @@ def pad_txt_files(folder_path,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH): f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") else: for _ in range(padding_needed): - f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") + f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH_FACT*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index 21c2e28..b164503 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -73,7 +73,7 @@ def main(argv=None): N_DMA = hw_params['N_DMA'] N_EXT = hw_params['N_EXT'] N_HWPE = hw_params['N_HWPE'] - HWPE_WIDTH = hw_params['HWPE_WIDTH'] + HWPE_WIDTH_FACT = hw_params['HWPE_WIDTH_FACT'] # Extract testbench parameters tb_params = testbench_config['parameters'] @@ -167,7 +167,7 @@ def _generate_master(filepath: Path, master_config: dict, *, is_hwpe: bool, mast - master_global_idx: global master id used by the generator """ nonlocal next_start_id - data_width = HWPE_WIDTH * DATA_WIDTH if is_hwpe else DATA_WIDTH + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH n_test = N_TEST_HWPE if is_hwpe else N_TEST_LOG cycle_offset = CYCLE_OFFSET_HWPE if is_hwpe else CYCLE_OFFSET_LOG @@ -196,7 +196,7 @@ def _generate_master(filepath: Path, master_config: dict, *, is_hwpe: bool, mast def _gen_hwpe_master(master_idx, master_config, global_idx): nonlocal next_start_id filepath = raw_dir / f"master_hwpe_{master_idx}.txt" - master = StimuliGenerator(IW, WIDTH_OF_MEMORY, N_BANKS, TOT_MEM_SIZE, HWPE_WIDTH * DATA_WIDTH, ADD_WIDTH, + master = StimuliGenerator(IW, WIDTH_OF_MEMORY, N_BANKS, TOT_MEM_SIZE, HWPE_WIDTH_FACT * DATA_WIDTH, ADD_WIDTH, str(filepath), N_TEST_HWPE, EXACT_OR_MAX_OFFSET, CYCLE_OFFSET_HWPE, global_idx) config = str(master_config.get('mem_access_type', '0')) start_address = 
str(master_config.get('start_address', '0')) @@ -250,10 +250,10 @@ def _gen_hwpe_master(master_idx, master_config, global_idx): # Process raw files simvector_raw_path = str(raw_dir) simvector_processed_path = str((raw_dir.parent / 'stimuli_processed').resolve()) - unfold_raw_txt(simvector_raw_path, simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH) + unfold_raw_txt(simvector_raw_path, simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT) print("STEP 1 COMPLETED: unfold txt files") - pad_txt_files(simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH) + pad_txt_files(simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT) print("STEP 2 COMPLETED: pad txt files") if args.golden: diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index e1651a5..c3b7ffa 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -21,14 +21,14 @@ module application_driver #( parameter int unsigned MASTER_NUMBER = 1, parameter int unsigned IS_HWPE = 1, parameter int unsigned DATA_WIDTH = 1, - parameter int unsigned ADD_WIDTH = 1, + parameter int unsigned ADDR_WIDTH = 1, parameter int unsigned APPL_DELAY = 2, // Delay on the input signals parameter int unsigned IW = 1, parameter string STIM_FILE = "" ) ( - hci_core_intf.initiator hci_if, - input logic rst_ni, input logic clk_i, + input logic rst_ni, + hci_core_intf.initiator hci_if, output logic end_stimuli_o, output logic end_latency_o, output int unsigned n_issued_transactions_o, @@ -42,7 +42,7 @@ module application_driver #( logic wen; logic req; logic [DATA_WIDTH-1:0] data; - logic [ADD_WIDTH-1:0] add; + logic [ADDR_WIDTH-1:0] add; int unsigned n_completed_read_transactions; logic pending_rsp_is_read[$]; diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index c9e8423..28f8b52 100644 --- a/target/verif/src/simulation_report.sv +++ 
b/target/verif/src/simulation_report.sv @@ -20,21 +20,21 @@ module simulation_report import tb_hci_pkg::*; ( - input logic [0:N_MASTER-1] end_stimuli_i, - input logic [0:N_MASTER-1] end_latency_i, + input logic [0:N_DRIVERS-1] end_stimuli_i, + input logic [0:N_DRIVERS-1] end_latency_i, input real throughput_complete_i, input real stim_latency_i, input real tot_latency_i, - input real latency_per_master_i[N_MASTER], - input real sum_req_to_gnt_latency_log_i[N_MASTER-N_HWPE], + input real latency_per_master_i[N_DRIVERS], + input real sum_req_to_gnt_latency_log_i[N_DRIVERS-N_HWPE], input real sum_req_to_gnt_latency_hwpe_i[N_HWPE], - input int unsigned n_gnt_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_gnt_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_gnt_transactions_hwpe_i[N_HWPE], - input int unsigned n_read_granted_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_read_granted_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_read_granted_transactions_hwpe_i[N_HWPE], - input int unsigned n_write_granted_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_write_granted_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_write_granted_transactions_hwpe_i[N_HWPE], - input int unsigned n_read_complete_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_read_complete_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_read_complete_transactions_hwpe_i[N_HWPE] ); @@ -86,7 +86,7 @@ module simulation_report wait (&end_latency_i); wait (throughput_complete_i >= 0); wait (tot_latency_i >= 0); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin total_read_granted_transactions += n_read_granted_transactions_log_i[i]; total_write_granted_transactions += n_write_granted_transactions_log_i[i]; total_read_complete_transactions += n_read_complete_transactions_log_i[i]; @@ -104,7 +104,7 @@ module simulation_report log_masters_with_grants++; end end - for (int i = N_CORE; i 
< N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin total_read_granted_transactions += n_read_granted_transactions_log_i[i]; total_write_granted_transactions += n_write_granted_transactions_log_i[i]; total_read_complete_transactions += n_read_complete_transactions_log_i[i]; @@ -122,7 +122,7 @@ module simulation_report log_masters_with_grants++; end end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin total_read_granted_transactions += n_read_granted_transactions_log_i[i]; total_write_granted_transactions += n_write_granted_transactions_log_i[i]; total_read_complete_transactions += n_read_complete_transactions_log_i[i]; @@ -140,7 +140,7 @@ module simulation_report log_masters_with_grants++; end end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin total_read_granted_transactions += n_read_granted_transactions_hwpe_i[i]; total_write_granted_transactions += n_write_granted_transactions_hwpe_i[i]; total_read_complete_transactions += n_read_complete_transactions_hwpe_i[i]; @@ -188,19 +188,19 @@ module simulation_report $display("\\\\HW CONFIG\\\\"); $display( "Masters: CORE=%0d DMA=%0d EXT=%0d HWPE=%0d (total=%0d)", - N_CORE_REAL, N_DMA_REAL, N_EXT_REAL, N_HWPE_REAL, N_MASTER_REAL + N_CORE, N_DMA, N_EXT, N_HWPE, N_DRIVERS ); $display( "Memory: banks=%0d total_size=%0d kB data_width=%0d bits hwpe_width=%0d lanes", - N_BANKS, TOT_MEM_SIZE, DATA_WIDTH, HWPE_WIDTH + N_BANKS, TOT_MEM_SIZE, DATA_WIDTH, HWPE_WIDTH_FACT ); $display( "Interconnect: SEL_LIC=%0d TS_BIT=%0d EXPFIFO=%0d", SEL_LIC, TS_BIT, EXPFIFO ); $display( - "ID/address: IW=%0d ADD_WIDTH=%0d AddrMemWidth=%0d", - IW, ADD_WIDTH, AddrMemWidth + "ID/address: IW=%0d ADDR_WIDTH=%0d ADDR_WIDTH_BANK=%0d", + IW, ADDR_WIDTH, ADDR_WIDTH_BANK ); $display("\n\\\\BANDWIDTH\\\\"); @@ -223,35 +223,35 @@ module simulation_report $display("\n\\\\SIMULATION 
TIME\\\\"); $display("Total simulation time: %0.2f cycles", tot_latency_i); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin $display( "Core%0d (master_log_%0d): %0.2f cycles", i, i, latency_per_master_i[i] ); end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin $display( "DMA%0d (master_log_%0d): %0.2f cycles", i - N_CORE, i, latency_per_master_i[i] ); end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin $display( "EXT%0d (master_log_%0d): %0.2f cycles", i - (N_CORE + N_DMA), i, latency_per_master_i[i] ); end - for (int i = N_MASTER - N_HWPE; i < N_MASTER - N_HWPE + N_HWPE_REAL; i++) begin + for (int i = N_DRIVERS - N_HWPE; i < N_DRIVERS - N_HWPE + N_HWPE; i++) begin $display( "HWPE%0d (master_hwpe_%0d): %0.2f cycles", - i - (N_MASTER - N_HWPE), - i - (N_MASTER - N_HWPE), + i - (N_DRIVERS - N_HWPE), + i - (N_DRIVERS - N_HWPE), latency_per_master_i[i] ); end $display("\n\\\\READ RESPONSE COVERAGE\\\\"); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin expected_reads = n_read_granted_transactions_log_i[i]; observed_reads = n_read_complete_transactions_log_i[i]; $display( @@ -262,7 +262,7 @@ module simulation_report missing_reads = 1'b1; end end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin expected_reads = n_read_granted_transactions_log_i[i]; observed_reads = n_read_complete_transactions_log_i[i]; $display( @@ -273,7 +273,7 @@ module simulation_report missing_reads = 1'b1; end end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin expected_reads = n_read_granted_transactions_log_i[i]; observed_reads = n_read_complete_transactions_log_i[i]; $display( @@ -284,7 +284,7 @@ module 
simulation_report missing_reads = 1'b1; end end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin expected_reads = n_read_granted_transactions_hwpe_i[i]; observed_reads = n_read_complete_transactions_hwpe_i[i]; $display( @@ -300,7 +300,7 @@ module simulation_report end $display("\n\\\\TRANSACTION COUNTS\\\\"); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin $display( "master_log_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -309,7 +309,7 @@ module simulation_report n_read_complete_transactions_log_i[i] ); end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin $display( "master_log_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -318,7 +318,7 @@ module simulation_report n_read_complete_transactions_log_i[i] ); end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin $display( "master_log_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -327,7 +327,7 @@ module simulation_report n_read_complete_transactions_log_i[i] ); end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin $display( "master_hwpe_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -338,7 +338,7 @@ module simulation_report end $display("\n\\\\REQUEST-TO-GRANT LATENCY\\\\"); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin $display( "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, @@ -348,7 +348,7 @@ module simulation_report n_gnt_transactions_log_i[i] ); end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin $display( "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, @@ -358,7 +358,7 @@ module simulation_report n_gnt_transactions_log_i[i] 
); end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin $display( "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, @@ -368,7 +368,7 @@ module simulation_report n_gnt_transactions_log_i[i] ); end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin $display( "master_hwpe_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 5fb3411..7ce7ea2 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -30,11 +30,6 @@ module tb_hci logic s_clear; hci_interconnect_ctrl_t s_hci_ctrl; - logic [0:N_MASTER-1] s_end_stimuli; - logic [0:N_MASTER-1] s_end_latency; - int unsigned s_issued_transactions[0:N_MASTER-1]; - int unsigned s_issued_read_transactions[0:N_MASTER-1]; - clk_rst_gen #( .ClkPeriod(CLK_PERIOD), .RstClkCycles(RST_CLK_CYCLES) @@ -59,7 +54,7 @@ module tb_hci }; localparam hci_size_parameter_t `HCI_SIZE_PARAM(mems) = '{ // Bank parameters DW: DEFAULT_DW, - AW: AddrMemWidth, + AW: ADDR_WIDTH_BANK, BW: DEFAULT_BW, UW: DEFAULT_UW, IW: IW, @@ -67,7 +62,7 @@ module tb_hci EHW: DEFAULT_EHW }; localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe) = '{ // HWPE parameters - DW: HWPE_WIDTH*DATA_WIDTH, + DW: HWPE_WIDTH_FACT*DATA_WIDTH, AW: DEFAULT_AW, BW: DEFAULT_BW, UW: DEFAULT_UW, @@ -76,6 +71,8 @@ module tb_hci EHW: DEFAULT_EHW }; + /* Application-driver-side interfaces */ + hci_core_intf #( .DW(HCI_SIZE_hwpe.DW), .AW(HCI_SIZE_hwpe.AW), @@ -96,10 +93,75 @@ module tb_hci .IW(HCI_SIZE_cores.IW), .EW(HCI_SIZE_cores.EW), .EHW(HCI_SIZE_cores.EHW) - ) hci_log_if [0:N_MASTER-N_HWPE-1] ( + ) hci_log_if [0:N_LOG_MASTERS-1] ( + .clk(clk) + ); + + /* Interconnect-side master interfaces */ + + hci_core_intf #( + .DW(HCI_SIZE_hwpe.DW), + .AW(HCI_SIZE_hwpe.AW), + .BW(HCI_SIZE_hwpe.BW), + .UW(HCI_SIZE_hwpe.UW), + 
.IW(HCI_SIZE_hwpe.IW), + .EW(HCI_SIZE_hwpe.EW), + .EHW(HCI_SIZE_hwpe.EHW) + ) hci_hwpe_wide_if [0:N_HWPE_MASTERS-1] ( + .clk(clk) + ); + + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_core_if [0:N_CORE+N_HWPE_LOG_MASTERS-1] ( + .clk(clk) + ); + + // LOG-only intermediate interfaces for HWPE split lanes. + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_hwpe_log_if [0:N_HWPE_LOG_MASTERS-1] ( + .clk(clk) + ); + + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_dma_if [0:N_DMA-1] ( + .clk(clk) + ); + + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_ext_if [0:N_EXT-1] ( .clk(clk) ); + /* Memory interface */ + hci_core_intf #( .DW(HCI_SIZE_mems.DW), .AW(HCI_SIZE_mems.AW), @@ -118,20 +180,87 @@ module tb_hci /* HCI instance */ + generate + for (genvar ii = 0; ii < N_CORE; ii++) begin : gen_core_to_log + hci_core_assign i_core_to_hci_assign ( + .tcdm_target (hci_log_if[ii]), + .tcdm_initiator (hci_core_if[ii]) + ); + end + for (genvar ii = 0; ii < N_DMA; ii++) begin : gen_dma_to_log + hci_core_assign i_dma_to_hci_assign ( + .tcdm_target (hci_log_if[N_CORE + ii]), + .tcdm_initiator (hci_dma_if[ii]) + ); + end + for (genvar ii = 0; ii < N_EXT; ii++) begin : gen_ext_to_log + hci_core_assign i_ext_to_hci_assign ( + .tcdm_target (hci_log_if[N_CORE + N_DMA + ii]), + .tcdm_initiator (hci_ext_if[ii]) + ); + end + + if (INTERCO_TYPE == HCI) begin : 
gen_hwpe_to_hci + for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_to_hci + hci_core_assign i_hwpe_to_hci_assign ( + .tcdm_target (hci_hwpe_if[ii]), + .tcdm_initiator (hci_hwpe_wide_if[ii]) + ); + end + end else if (INTERCO_TYPE == LOG) begin : gen_hwpe_to_log + for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_to_log + hci_core_split #( + .DW(HWPE_WIDTH_FACT * DATA_WIDTH), + .BW(DATA_WIDTH / 8), + .UW(1), + .NB_OUT_CHAN(HWPE_WIDTH_FACT), + .FIFO_DEPTH(2), + .`HCI_SIZE_PARAM(tcdm_target)(HCI_SIZE_hwpe) + ) i_hwpe_to_log_split ( + .clk_i(clk), + .rst_ni(rst_n), + .clear_i(s_clear), + .tcdm_target(hci_hwpe_if[ii]), + .tcdm_initiator(hci_hwpe_log_if[ii * HWPE_WIDTH_FACT : (ii + 1) * HWPE_WIDTH_FACT - 1]) + ); + + for (genvar f = 0; f < HWPE_WIDTH_FACT; f++) begin : gen_hwpe_log_rvalid_filter + localparam int unsigned IDX_LOG = ii * HWPE_WIDTH_FACT + f; + localparam int unsigned IDX_CORE = N_CORE + IDX_LOG; + hci_core_r_valid_filter #( + .`HCI_SIZE_PARAM(tcdm_target)(HCI_SIZE_cores) + ) i_hwpe_log_rvalid_filter ( + .clk_i(clk), + .rst_ni(rst_n), + .clear_i(s_clear), + .enable_i(1'b1), + .tcdm_target(hci_hwpe_log_if[IDX_LOG]), + .tcdm_initiator(hci_core_if[IDX_CORE]) + ); + end + end + end else begin + // Error: unsupported for now + $error("Unsupported INTERCO_TYPE"); + end + endgenerate + assign s_clear = 0; + assign s_hci_ctrl.arb_policy = ARBITER_MODE; assign s_hci_ctrl.invert_prio = INVERT_PRIO; assign s_hci_ctrl.low_prio_max_stall = LOW_PRIO_MAX_STALL; hci_interconnect #( - .N_HWPE(N_HWPE), // Number of HWPEs attached to the port - .N_CORE(N_CORE), // Number of Core ports - .N_DMA(N_DMA), // Number of DMA ports - .N_EXT(N_EXT), // Number of External ports - .N_MEM(N_BANKS), // Number of Memory banks - .TS_BIT(TS_BIT), // TEST_SET_BIT (for Log Interconnect) - .IW(IW), // ID Width - .EXPFIFO(EXPFIFO), // FIFO Depth for HWPE Interconnect - .SEL_LIC(SEL_LIC), // Log interconnect type selector + .N_HWPE(N_HWPE_MASTERS), // Number of HWPE ports + 
.N_CORE(N_CORE + N_HWPE_LOG_MASTERS), // Number of CORE ports + .N_DMA(N_DMA), // Number of DMA ports + .N_EXT(N_EXT), // Number of External ports + .N_MEM(N_BANKS), // Number of Memory banks + .TS_BIT(TS_BIT), // TEST_SET_BIT (for Log Interconnect) + .IW(IW), // ID Width + .EXPFIFO(EXPFIFO), // FIFO Depth for HWPE Interconnect + .SEL_LIC(SEL_LIC), // Log interconnect type selector + .FILTER_WRITE_R_VALID ( FILTER_WRITE_R_VALID ), .HCI_SIZE_cores(HCI_SIZE_cores), .HCI_SIZE_mems(HCI_SIZE_mems), .HCI_SIZE_hwpe(HCI_SIZE_hwpe), @@ -144,11 +273,11 @@ module tb_hci .rst_ni(rst_n), .clear_i(s_clear), .ctrl_i(s_hci_ctrl), - .cores(hci_log_if[0:N_CORE-1]), - .dma(hci_log_if[N_CORE:N_CORE+N_DMA-1]), - .ext(hci_log_if[N_CORE+N_DMA:N_CORE+N_DMA+N_EXT-1]), + .cores(hci_core_if), + .dma(hci_dma_if), + .ext(hci_ext_if), .mems(hci_mem_if), - .hwpe(hci_hwpe_if) + .hwpe(hci_hwpe_wide_if) ); ////////// @@ -159,7 +288,7 @@ module tb_hci .BankSize(N_WORDS), .NbBanks(N_BANKS), .DataWidth(DATA_WIDTH), - .AddrWidth(ADD_WIDTH), + .AddrWidth(ADDR_WIDTH), .BeWidth(DATA_WIDTH/8), .IdWidth(IW) ) i_tb_mem ( @@ -173,16 +302,21 @@ module tb_hci // Application drivers // ///////////////////////// + logic [0:N_DRIVERS-1] s_end_stimuli; + logic [0:N_DRIVERS-1] s_end_latency; + int unsigned s_issued_transactions[0:N_DRIVERS-1]; + int unsigned s_issued_read_transactions[0:N_DRIVERS-1]; + /* CORE + DMA + EXT */ generate - for (genvar ii = 0; ii < N_MASTER - N_HWPE; ii++) begin : gen_app_driver_log + for (genvar ii = 0; ii < N_CORE + N_DMA + N_EXT; ii++) begin : gen_app_driver_log localparam string STIM_FILE_LOG = $sformatf("../simvectors/generated/stimuli_processed/master_log_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), .IS_HWPE(0), .DATA_WIDTH(DATA_WIDTH), - .ADD_WIDTH(ADD_WIDTH), + .ADDR_WIDTH(ADDR_WIDTH), .APPL_DELAY(APPL_DELAY), // Delay on the input signals .IW(IW), .STIM_FILE(STIM_FILE_LOG) @@ -206,19 +340,19 @@ module tb_hci application_driver #( .MASTER_NUMBER(ii), .IS_HWPE(1), - 
.DATA_WIDTH(HWPE_WIDTH * DATA_WIDTH), - .ADD_WIDTH(ADD_WIDTH), + .DATA_WIDTH(HWPE_WIDTH_FACT * DATA_WIDTH), + .ADDR_WIDTH(ADDR_WIDTH), .APPL_DELAY(APPL_DELAY), // Delay on the input signals .IW(IW), .STIM_FILE(STIM_FILE_HWPE) ) i_app_driver_hwpe ( - .hci_if(hci_hwpe_if[ii]), - .rst_ni(rst_n), .clk_i(clk), - .end_stimuli_o(s_end_stimuli[N_MASTER-N_HWPE+ii]), - .end_latency_o(s_end_latency[N_MASTER-N_HWPE+ii]), - .n_issued_transactions_o(s_issued_transactions[N_MASTER-N_HWPE+ii]), - .n_issued_read_transactions_o(s_issued_read_transactions[N_MASTER-N_HWPE+ii]) + .rst_ni(rst_n), + .hci_if(hci_hwpe_if[ii]), + .end_stimuli_o(s_end_stimuli[N_CORE+N_DMA+N_EXT + ii]), + .end_latency_o(s_end_latency[N_CORE+N_DMA+N_EXT + ii]), + .n_issued_transactions_o(s_issued_transactions[N_CORE+N_DMA+N_EXT + ii]), + .n_issued_read_transactions_o(s_issued_read_transactions[N_CORE+N_DMA+N_EXT + ii]) ); end endgenerate @@ -227,30 +361,30 @@ module tb_hci // QoS // ///////// - real SUM_REQ_TO_GNT_LATENCY_LOG[N_MASTER-N_HWPE]; + real SUM_REQ_TO_GNT_LATENCY_LOG[N_CORE+N_DMA+N_EXT]; real SUM_REQ_TO_GNT_LATENCY_HWPE[N_HWPE]; - int unsigned N_GNT_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_GNT_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; int unsigned N_GNT_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_READ_GRANTED_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_READ_GRANTED_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; int unsigned N_READ_GRANTED_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_WRITE_GRANTED_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_WRITE_GRANTED_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; int unsigned N_WRITE_GRANTED_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_READ_COMPLETE_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_READ_COMPLETE_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; int unsigned N_READ_COMPLETE_TRANSACTIONS_HWPE[N_HWPE]; /* REAL THROUGHPUT AND SIMULATION TIME */ - real latency_per_master[N_MASTER]; + real latency_per_master[N_DRIVERS]; real throughput_completed; real 
stim_latency; real tot_latency; throughput_monitor #( - .N_MASTER(N_MASTER), + .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE), .CLK_PERIOD(CLK_PERIOD), .DATA_WIDTH(DATA_WIDTH), - .HWPE_WIDTH(HWPE_WIDTH) + .HWPE_WIDTH_FACT(HWPE_WIDTH_FACT) ) i_throughput_monitor ( .clk_i(clk), .rst_ni(rst_n), @@ -268,7 +402,7 @@ module tb_hci /* LATENCY MONITOR */ latency_monitor #( - .N_MASTER(N_MASTER), + .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE) ) i_latency_monitor ( .clk_i(clk), diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index cdaf055..d35ca6e 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -18,6 +18,12 @@ package tb_hci_pkg; + typedef enum logic [1:0] { + LOG = 2'd0, + MUX = 2'd1, + HCI = 2'd2 + } interco_e; + ////////////////////////// // Testbench parameters // ////////////////////////// @@ -37,7 +43,6 @@ package tb_hci_pkg; localparam int unsigned N_TRANSACTION_HWPE = int'(N_TRANSACTION_LOG * TRANSACTION_RATIO); // TCDM interface parameters - localparam int unsigned MAX_CYCLES_BETWEEN_GNT_RVALID = `ifdef MAX_CYCLES_BETWEEN_GNT_RVALID `MAX_CYCLES_BETWEEN_GNT_RVALID `else 1 `endif; localparam int unsigned RANDOM_GNT = `ifdef RANDOM_GNT `RANDOM_GNT `else 0 `endif; // Arbiter configuration @@ -52,19 +57,13 @@ package tb_hci_pkg; /* Config */ + localparam interco_e INTERCO_TYPE = `ifdef INTERCO_TYPE `INTERCO_TYPE `else HCI `endif; + // Master port counts - localparam int unsigned N_HWPE_REAL = `ifdef N_HWPE `N_HWPE `else 1 `endif; // Number of HWPEs attached to the port - localparam int unsigned N_CORE_REAL = `ifdef N_CORE `N_CORE `else 1 `endif; // Number of Core ports - localparam int unsigned N_DMA_REAL = `ifdef N_DMA `N_DMA `else 1 `endif; // Number of DMA ports - localparam int unsigned N_EXT_REAL = `ifdef N_EXT `N_EXT `else 1 `endif; // Number of External ports - - // Normalized master counts (minimum 1 for array sizing) - localparam int unsigned N_HWPE = (N_HWPE_REAL == 0) ? 
1 : N_HWPE_REAL; - localparam int unsigned N_CORE = (N_CORE_REAL == 0) ? 1 : N_CORE_REAL; - localparam int unsigned N_DMA = (N_DMA_REAL == 0) ? 1 : N_DMA_REAL; - localparam int unsigned N_EXT = (N_EXT_REAL == 0) ? 1 : N_EXT_REAL; - localparam int unsigned N_MASTER = N_HWPE + N_CORE + N_DMA + N_EXT; // Total number of masters - localparam int unsigned N_MASTER_REAL = N_HWPE_REAL + N_CORE_REAL + N_DMA_REAL + N_EXT_REAL; // Total number of masters (real) + localparam int unsigned N_HWPE = `ifdef N_HWPE `N_HWPE `else 1 `endif; // Number of HWPEs attached to the port + localparam int unsigned N_CORE = `ifdef N_CORE `N_CORE `else 1 `endif; // Number of Core ports + localparam int unsigned N_DMA = `ifdef N_DMA `N_DMA `else 1 `endif; // Number of DMA ports + localparam int unsigned N_EXT = `ifdef N_EXT `N_EXT `else 1 `endif; // Number of External ports // Interconnect configuration localparam int unsigned TS_BIT = `ifdef TS_BIT `TS_BIT `else 0 `endif; // TEST_SET_BIT (for Log Interconnect) @@ -72,23 +71,30 @@ package tb_hci_pkg; localparam int unsigned SEL_LIC = `ifdef SEL_LIC `SEL_LIC `else 0 `endif; // Log interconnect type selector // Data and memory parameters - localparam int unsigned DATA_WIDTH = `ifdef DATA_WIDTH `DATA_WIDTH `else 32 `endif; // Width of DATA in bits - localparam int unsigned HWPE_WIDTH = `ifdef HWPE_WIDTH `HWPE_WIDTH `else 4 `endif; // Width of an HWPE wide-word + localparam int unsigned DATA_WIDTH = `ifdef DATA_WIDTH `DATA_WIDTH `else 32 `endif; // Width of DATA in bits + localparam int unsigned HWPE_WIDTH_FACT = `ifdef HWPE_WIDTH_FACT `HWPE_WIDTH_FACT `else 4 `endif; // Width factor of an HWPE wide-word localparam int unsigned TOT_MEM_SIZE = `ifdef TOT_MEM_SIZE `TOT_MEM_SIZE `else 32 `endif; // Memory size (kB) - localparam int unsigned N_BANKS = `ifdef N_BANKS `N_BANKS `else 16 `endif; // Number of memory banks /* Derived parameters */ - localparam int unsigned ADD_WIDTH = $clog2(TOT_MEM_SIZE * 1024); // Width of ADDRESS in bits + localparam int 
unsigned N_DRIVERS = N_HWPE + N_CORE + N_DMA + N_EXT; // Total number of masters + localparam int unsigned N_BANKS = `ifdef N_BANKS `N_BANKS `else 16 `endif; // Number of memory banks + + localparam int unsigned N_LOG_MASTERS = N_CORE + N_DMA + N_EXT; + localparam int unsigned N_HWPE_LOG_MASTERS = (INTERCO_TYPE == LOG) ? (N_HWPE * HWPE_WIDTH_FACT) : 0; + localparam int unsigned N_HWPE_MASTERS = (INTERCO_TYPE == HCI) ? N_HWPE : ((INTERCO_TYPE == MUX) ? 1 : 0); + + localparam int unsigned ADDR_WIDTH = $clog2(TOT_MEM_SIZE * 1024); // Width of ADDRESS in bits localparam int unsigned WIDTH_OF_MEMORY = DATA_WIDTH; // Width of a memory bank (bits) localparam int unsigned WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8; // Width of a memory bank (bytes) localparam int unsigned BIT_BANK_INDEX = $clog2(N_BANKS); // Bits of the Bank index - localparam int unsigned AddrMemWidth = ADD_WIDTH - BIT_BANK_INDEX; // Number of address bits per TCDM bank + localparam int unsigned ADDR_WIDTH_BANK = ADDR_WIDTH - BIT_BANK_INDEX; // Number of address bits per TCDM bank localparam int unsigned N_WORDS = (TOT_MEM_SIZE * 1024 / N_BANKS) / WIDTH_OF_MEMORY_BYTE; // Number of words in a bank - localparam int unsigned FILTER_WRITE_R_VALID = '0; + localparam int unsigned FILTER_WRITE_R_VALID[0:N_HWPE_MASTERS-1] = '{default: 0}; // Enable filtering of only r_valid respons - localparam int unsigned IW = $clog2(N_TRANSACTION_LOG * (N_MASTER_REAL - N_HWPE_REAL) + N_TRANSACTION_HWPE * N_HWPE_REAL); // ID Width - localparam int unsigned TOT_CHECK = N_TRANSACTION_LOG * (N_CORE_REAL + N_DMA_REAL + N_EXT_REAL) + N_HWPE_REAL * N_TRANSACTION_HWPE * HWPE_WIDTH; + // Keep ID width consistent with stimuli generator. 
+ localparam int unsigned IW = $clog2(N_TRANSACTION_LOG * N_LOG_MASTERS + N_TRANSACTION_HWPE * N_HWPE); + localparam int unsigned TOT_CHECK = N_TRANSACTION_LOG * (N_CORE + N_DMA + N_EXT) + N_HWPE * N_TRANSACTION_HWPE * HWPE_WIDTH_FACT; /////////// // Types // @@ -97,17 +103,17 @@ package tb_hci_pkg; typedef struct packed { logic wen; logic [DATA_WIDTH-1:0] data; - logic [ADD_WIDTH-1:0] add; + logic [ADDR_WIDTH-1:0] add; } stimuli_t; typedef struct packed { - logic [DATA_WIDTH - 1 : 0] data; - logic [AddrMemWidth - 1 : 0] add; + logic [DATA_WIDTH-1:0] data; + logic [ADDR_WIDTH_BANK-1:0] add; } out_intc_to_mem_t; // Helper return type for HWPE address/data creation typedef struct { - logic [ADD_WIDTH-1:0] address; + logic [ADDR_WIDTH-1:0] address; logic [DATA_WIDTH-1:0] data; logic rolls_over; } hwpe_addr_data_t; @@ -118,8 +124,8 @@ package tb_hci_pkg; // Zero-time pure function returning address/data for an HWPE lane function automatic hwpe_addr_data_t create_address_and_data_hwpe( - input logic [ADD_WIDTH-1:0] address_before, - input logic [HWPE_WIDTH * DATA_WIDTH-1:0] data_before, + input logic [ADDR_WIDTH-1:0] address_before, + input logic [HWPE_WIDTH_FACT * DATA_WIDTH-1:0] data_before, input int index, input logic rolls_over_check_before ); @@ -135,7 +141,7 @@ package tb_hci_pkg; end ret.address = { - address_before[ADD_WIDTH-1:BIT_BANK_INDEX + 2] + ret.rolls_over, + address_before[ADDR_WIDTH-1:BIT_BANK_INDEX + 2] + ret.rolls_over, bank_index_after, address_before[1:0] }; @@ -145,7 +151,7 @@ package tb_hci_pkg; endfunction task calculate_bank_index( - input logic [ADD_WIDTH-1:0] address, + input logic [ADDR_WIDTH-1:0] address, output logic [BIT_BANK_INDEX-1:0] index ); index = address[BIT_BANK_INDEX-1 + 2:2]; @@ -160,8 +166,8 @@ package tb_hci_pkg; end else begin tot_time = N_TRANSACTION_LOG; end - tot_data = (N_TRANSACTION_LOG * DATA_WIDTH) * (N_MASTER_REAL - N_HWPE_REAL) + - (N_TRANSACTION_HWPE * HWPE_WIDTH * DATA_WIDTH) * N_HWPE_REAL; // bit + tot_data = 
(N_TRANSACTION_LOG * DATA_WIDTH) * (N_DRIVERS - N_HWPE) + + (N_TRANSACTION_HWPE * HWPE_WIDTH_FACT * DATA_WIDTH) * N_HWPE; // bit throughput_theo = tot_data / tot_time; // bit per cycle band_memory_limit = real'(N_BANKS * DATA_WIDTH); if (throughput_theo >= band_memory_limit) begin @@ -174,14 +180,14 @@ package tb_hci_pkg; /////////////// // Convert a full system address to per-bank local word address. - function int unsigned get_bank_local_address(input logic [ADD_WIDTH-1:0] addr_i); - logic [ADD_WIDTH-1:0] mapped_addr; - logic [ADD_WIDTH-BIT_BANK_INDEX-1:0] bank_local_addr; + function int unsigned get_bank_local_address(input logic [ADDR_WIDTH-1:0] addr_i); + logic [ADDR_WIDTH-1:0] mapped_addr; + logic [ADDR_WIDTH-BIT_BANK_INDEX-1:0] bank_local_addr; tb_hci_pkg::hwpe_addr_data_t hwpe_lane_addr_data; - hwpe_lane_addr_data = create_address_and_data_hwpe(addr_i, '0, HWPE_WIDTH, '0); + hwpe_lane_addr_data = create_address_and_data_hwpe(addr_i, '0, HWPE_WIDTH_FACT, '0); mapped_addr = hwpe_lane_addr_data.address; bank_local_addr = { - mapped_addr[ADD_WIDTH-1:BIT_BANK_INDEX + 2], + mapped_addr[ADDR_WIDTH-1:BIT_BANK_INDEX + 2], mapped_addr[1:0] }; return int'(bank_local_addr); diff --git a/target/verif/src/throughput_monitor.sv b/target/verif/src/throughput_monitor.sv index b25d16e..bc7294f 100644 --- a/target/verif/src/throughput_monitor.sv +++ b/target/verif/src/throughput_monitor.sv @@ -22,7 +22,7 @@ module throughput_monitor #( parameter int unsigned N_HWPE, parameter int unsigned CLK_PERIOD, parameter int unsigned DATA_WIDTH, - parameter int unsigned HWPE_WIDTH + parameter int unsigned HWPE_WIDTH_FACT ) ( input logic clk_i, input logic rst_ni, @@ -82,7 +82,7 @@ module throughput_monitor #( for (int i = 0; i < N_HWPE; i++) begin tot_data += real'( n_write_granted_hwpe_i[i] + n_read_complete_hwpe_i[i] - ) * real'(HWPE_WIDTH * DATA_WIDTH); + ) * real'(HWPE_WIDTH_FACT * DATA_WIDTH); end if (completion_time_cycles > 0.0) begin throughput_complete_o = tot_data / 
completion_time_cycles; // bits per cycle diff --git a/target/verif/verif.mk b/target/verif/verif.mk index 1dac0de..e9829ea 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -127,11 +127,12 @@ $(sim_vsim_lib)/$(sim_top_level)_optimized/.tb_opt_compiled: $(sim_vsim_lib)/.hw date > $@ .PHONY: run-verif -run-verif: $(sim_vsim_lib)/$(sim_top_level)_optimized/.tb_opt_compiled $(SIMVECTORS_GEN_DIR)/.stim_stamp +run-verif: $(HCI_VERIF_DIR)/vsim/$(sim_top_level).tcl $(sim_vsim_lib)/$(sim_top_level)_optimized/.tb_opt_compiled $(SIMVECTORS_GEN_DIR)/.stim_stamp cd $(HCI_VERIF_DIR)/vsim && \ $(SIM_VSIM) $(SIM_HCI_VSIM_ARGS) \ $(sim_top_level)_optimized \ - -do 'set GUI $(GUI); source $(HCI_VERIF_DIR)/vsim/$(sim_top_level).tcl' + -do 'set GUI $(GUI); source $<' + .PHONY: clean-verif clean-sim-verif: diff --git a/target/verif/vsim/tb_hci.tcl b/target/verif/vsim/tb_hci.tcl index 8935f49..feeefa0 100644 --- a/target/verif/vsim/tb_hci.tcl +++ b/target/verif/vsim/tb_hci.tcl @@ -1,7 +1,100 @@ # If GUI is 1, spawn waveforms if {$GUI == 1} { echo "GUI mode enabled" - add log -r /* + log -r /* + + set N_CORE [examine -radix dec /tb_hci_pkg/N_CORE] + set N_DMA [examine -radix dec /tb_hci_pkg/N_DMA] + set N_EXT [examine -radix dec /tb_hci_pkg/N_EXT] + set N_HWPE [examine -radix dec /tb_hci_pkg/N_HWPE] + set N_BANKS [examine -radix dec /tb_hci_pkg/N_BANKS] + + set INTERCO_TYPE [examine /tb_hci_pkg/INTERCO_TYPE] + set N_LOG_MASTERS [examine -radix dec /tb_hci_pkg/N_LOG_MASTERS] + set N_HWPE_LOG_MASTERS [examine -radix dec /tb_hci_pkg/N_HWPE_LOG_MASTERS] + set N_HWPE_MASTERS [examine -radix dec /tb_hci_pkg/N_HWPE_MASTERS] + + set HWPE_WIDTH_FACT [examine -radix dec /tb_hci_pkg/HWPE_WIDTH_FACT] + + add wave -noupdate /tb_hci/clk + add wave -noupdate /tb_hci/rst_n + + # Application driver interfaces + add wave -noupdate -group application_drivers -divider narrow_Cores + for {set i 0} {$i < $N_CORE} {incr i} { + add wave -noupdate -group application_drivers -group core_$i 
/tb_hci/hci_log_if[$i]/* + } + add wave -noupdate -group application_drivers -divider wide_HWPEs + for {set i 0} {$i < $N_HWPE} {incr i} { + add wave -noupdate -group application_drivers -group hwpe_$i /tb_hci/hci_hwpe_if[$i]/* + } + add wave -noupdate -group application_drivers -divider narrow_DMA + for {set i 0} {$i < $N_DMA} {incr i} { + set idx [expr {$i + $N_CORE}] + add wave -noupdate -group application_drivers -group dma_$i /tb_hci/hci_log_if[$idx]/* + } + add wave -noupdate -group application_drivers -divider narrow_External + for {set i 0} {$i < $N_EXT} {incr i} { + set idx [expr {$i + $N_CORE + $N_DMA}] + add wave -noupdate -group application_drivers -group ext_$i /tb_hci/hci_log_if[$idx]/* + } + # HCI-side interfaces + add wave -noupdate -group hci_interfaces -divider narrow_Cores + for {set i 0} {$i < $N_CORE} {incr i} { + add wave -noupdate -group hci_interfaces -group core_$i /tb_hci/hci_core_if[$i]/* + } + if {$INTERCO_TYPE == {LOG}} { + add wave -noupdate -group hci_interfaces -divider narrow_HWPEs_split + for {set i 0} {$i < $N_HWPE} {incr i} { + for {set f 0} {$f < $HWPE_WIDTH_FACT} {incr f} { + set idx [expr {$N_CORE + $i*$HWPE_WIDTH_FACT + $f}] + add wave -noupdate -group hci_interfaces -group hwpe_$i -group word_$f /tb_hci/hci_core_if[$idx]/* + } + } + } elseif {$INTERCO_TYPE == {HCI} || $INTERCO_TYPE == {MUX}} { + add wave -noupdate -group hci_interfaces -divider wide_HWPEs + for {set i 0} {$i < $N_HWPE_MASTERS} {incr i} { + add wave -noupdate -group hci_interfaces -group hwpe_$i /tb_hci/hci_hwpe_wide_if[$i]/* + } + } else { + echo "WARNING: unsupported INTERCO_TYPE value: $INTERCO_TYPE_SYM ($INTERCO_TYPE)" + } + add wave -noupdate -group hci_interfaces -divider narrow_DMA + for {set i 0} {$i < $N_DMA} {incr i} { + add wave -noupdate -group hci_interfaces -group dma_$i /tb_hci/hci_dma_if[$i]/* + } + add wave -noupdate -group hci_interfaces -divider narrow_External + for {set i 0} {$i < $N_EXT} {incr i} { + add wave -noupdate -group 
hci_interfaces -group ext_$i /tb_hci/hci_ext_if[$i]/* + } + # Memory slaves + add wave -noupdate -group memory_slaves + for {set i 0} {$i < $N_BANKS} {incr i} { + add wave -noupdate -group memory_slaves -group bank_$i /tb_hci/hci_mem_if[$i]/* + } + + add wave -noupdate -group metrics /tb_hci/s_end_latency + add wave -noupdate -group metrics /tb_hci/s_end_stimuli + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/s_issued_read_transactions[0]} {-format Analog-Step -height 84 -max 472.0} {/tb_hci/s_issued_read_transactions[1]} {-format Analog-Step -height 84 -max 493.0} {/tb_hci/s_issued_read_transactions[2]} {-format Analog-Step -height 84 -max 506.0} {/tb_hci/s_issued_read_transactions[3]} {-format Analog-Step -height 84 -max 481.0} {/tb_hci/s_issued_read_transactions[4]} {-format Analog-Step -height 84 -max 524.0} {/tb_hci/s_issued_read_transactions[5]} {-format Analog-Step -height 84 -max 512.0} {/tb_hci/s_issued_read_transactions[6]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/s_issued_read_transactions[7]} {-format Analog-Step -height 84 -max 518.0} {/tb_hci/s_issued_read_transactions[8]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/s_issued_read_transactions[9]} {-format Analog-Step -height 84 -max 503.0} {/tb_hci/s_issued_read_transactions[10]} {-format Analog-Step -height 84 -max 489.0} {/tb_hci/s_issued_read_transactions[11]} {-format Analog-Step -height 84 -max 493.0}} /tb_hci/s_issued_read_transactions + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/s_issued_transactions[0]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[1]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[2]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[3]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[4]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[5]} {-format Analog-Step -height 84 -max 1000.0} 
{/tb_hci/s_issued_transactions[6]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[7]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[8]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[9]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[10]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[11]} {-format Analog-Step -height 84 -max 1000.0}} /tb_hci/s_issued_transactions + add wave -noupdate -group metrics /tb_hci/stim_latency + add wave -noupdate -group metrics /tb_hci/tot_latency + add wave -noupdate -group metrics /tb_hci/latency_per_master + add wave -noupdate -group metrics /tb_hci/throughput_completed + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_GNT_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 1000.0}} /tb_hci/N_GNT_TRANSACTIONS_HWPE + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_GNT_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 1000.0}} /tb_hci/N_GNT_TRANSACTIONS_LOG + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE[0]} 
{-format Analog-Step -height 84 -max 489.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 493.0}} /tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 472.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 493.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 506.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 481.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 524.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 512.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 518.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 503.0}} /tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 489.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 493.0}} /tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 472.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 493.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 506.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 481.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 524.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[5]} {-format 
Analog-Step -height 84 -max 512.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 518.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 503.0}} /tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 511.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 507.0}} /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 528.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 507.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 494.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 519.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 475.99999999999994} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 488.00000000000006} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 508.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 482.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 508.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 497.0}} /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG + add wave -noupdate -group metrics -subitemconfig {{/tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE[0]} {-format Analog-Step -height 84 -max 1327824.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE[1]} {-format Analog-Step -height 84 -max 1336050.0}} /tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE + 
add wave -noupdate -group metrics -subitemconfig {{/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[0]} {-format Analog-Step -height 84 -max 580398.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[1]} {-format Analog-Step -height 84 -max 580783.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[2]} {-format Analog-Step -height 84 -max 596101.00000000012} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[3]} {-format Analog-Step -height 84 -max 590598.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[4]} {-format Analog-Step -height 84 -max 603272.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[5]} {-format Analog-Step -height 84 -max 601733.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[6]} {-format Analog-Step -height 84 -max 596110.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[7]} {-format Analog-Step -height 84 -max 593715.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[8]} {-format Analog-Step -height 84 -max 602346.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[9]} {-format Analog-Step -height 84 -max 597075.0}} /tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG + add wave -noupdate /tb_hci/s_clear + add wave -noupdate /tb_hci/s_hci_ctrl + + configure wave -signalnamewidth 1 } else { run -a } From f79d065b85799b12ea0cdf0820c6918a35a87869 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Fri, 27 Feb 2026 16:06:11 +0100 Subject: [PATCH 06/25] [WIP] Change structure of tb --- .../verif/simvectors/hci_stimuli/generator.py | 33 +- target/verif/simvectors/main.py | 8 +- target/verif/src/application_driver.sv | 96 ++++-- target/verif/src/latency_monitor.sv | 36 +-- target/verif/src/tb_hci.sv | 303 +++++++++--------- target/verif/src/tb_hci_pkg.sv | 172 ++++++---- target/verif/src/tcdm_banks_wrap.sv | 2 +- target/verif/vsim/tb_hci.tcl | 99 +++--- 8 files changed, 411 insertions(+), 338 deletions(-) diff --git a/target/verif/simvectors/hci_stimuli/generator.py b/target/verif/simvectors/hci_stimuli/generator.py index ca5e2d0..af121d6 100644 --- a/target/verif/simvectors/hci_stimuli/generator.py +++ b/target/verif/simvectors/hci_stimuli/generator.py @@ -2,7 +2,7 @@ import random 
import os -class StimuliGenerator: +class StimuliGenerator: def __init__(self,IW,WIDTH_OF_MEMORY,N_BANKS,TOT_MEM_SIZE,DATA_WIDTH,ADD_WIDTH,filepath,N_TEST,EXACT_OR_MAX_OFFSET,CYCLE_OFFSET,MASTER_NUMBER_IDENTIFICATION): self.WIDTH_OF_MEMORY = WIDTH_OF_MEMORY self.WIDTH_OF_MEMORY_BYTE = int(WIDTH_OF_MEMORY/8) @@ -15,8 +15,11 @@ def __init__(self,IW,WIDTH_OF_MEMORY,N_BANKS,TOT_MEM_SIZE,DATA_WIDTH,ADD_WIDTH,f self.N_TEST = N_TEST self.EXACT_OR_MAX_OFFSET = EXACT_OR_MAX_OFFSET self.CYCLE_OFFSET = CYCLE_OFFSET - self.IW = IW - self.MASTER_NUMBER_IDENTIFICATION = MASTER_NUMBER_IDENTIFICATION + self.IW = IW + self.MASTER_NUMBER_IDENTIFICATION = MASTER_NUMBER_IDENTIFICATION + + def _format_id(self, id_value): + return bin(id_value % (1 << self.IW))[2:].zfill(self.IW) def random_data(self): data_decimal = random.randint(0, (2**(self.DATA_WIDTH))-1) # generate random data @@ -58,8 +61,8 @@ def random_gen(self,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) break - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 + file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") + id = id + 1 LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) return id @@ -88,8 +91,8 @@ def linear_gen(self,stride0,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_R LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) break - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 + file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") + id = id + 1 
LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) @@ -116,8 +119,8 @@ def gen_2d(self,stride0,len_d0,stride1,start_address,id_start,LIST_OF_FORBIDDEN_ if wen: if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 + file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") + id = id + 1 if id - id_start >= self.N_TEST : STOP = 1 break @@ -125,8 +128,8 @@ def gen_2d(self,stride0,len_d0,stride1,start_address,id_start,LIST_OF_FORBIDDEN_ if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 + file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") + id = id + 1 if id - id_start >= self.N_TEST : STOP = 1 break @@ -159,8 +162,8 @@ def gen_3d(self,stride0,len_d0,stride1,len_d1,stride2,start_address,id_start,LIS if wen: if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 + file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") + id = id + 1 if id - id_start >= self.N_TEST : STOP = 1 break @@ -168,8 +171,8 @@ def gen_3d(self,stride0,len_d0,stride1,len_d1,stride2,start_address,id_start,LIS if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - 
file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 + file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") + id = id + 1 if id - id_start >= self.N_TEST : STOP = 1 break diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index b164503..bbdd853 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -98,7 +98,13 @@ def main(argv=None): N_TEST_HWPE = int(N_TEST_LOG * TEST_RATIO) N_LOG = N_CORE + N_DMA + N_EXT N_MASTER = N_LOG + N_HWPE - IW = int(np.ceil(np.log2(N_TEST_LOG * N_LOG + N_TEST_HWPE * N_HWPE))) + # Match tb_hci_pkg interface ID widths (hci_system-style model). + IW_CORES = 8 + IW_HWPE = IW_CORES + if IW_CORES != IW_HWPE: + print("ERROR: stimuli generator currently requires equal log/HWPE interface ID widths") + sys.exit(1) + IW = IW_CORES CORE_ZERO_FLAG = False EXT_ZERO_FLAG = False DMA_ZERO_FLAG = False diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index c3b7ffa..fd69399 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -24,6 +24,8 @@ module application_driver #( parameter int unsigned ADDR_WIDTH = 1, parameter int unsigned APPL_DELAY = 2, // Delay on the input signals parameter int unsigned IW = 1, + parameter bit USE_STREAMER_FIFO = 1'b1, + parameter int unsigned STREAMER_FIFO_DEPTH = 2, parameter string STIM_FILE = "" ) ( input logic clk_i, @@ -45,6 +47,48 @@ module application_driver #( logic [ADDR_WIDTH-1:0] add; int unsigned n_completed_read_transactions; logic pending_rsp_is_read[$]; + localparam hci_package::hci_size_parameter_t HCI_SIZE_driver = '{ + DW: DATA_WIDTH, + AW: ADDR_WIDTH, + BW: hci_package::DEFAULT_BW, + UW: hci_package::DEFAULT_UW, + IW: IW, + EW: hci_package::DEFAULT_EW, + EHW: hci_package::DEFAULT_EHW + }; + + hci_core_intf #( + 
.DW(HCI_SIZE_driver.DW), + .AW(HCI_SIZE_driver.AW), + .BW(HCI_SIZE_driver.BW), + .UW(HCI_SIZE_driver.UW), + .IW(HCI_SIZE_driver.IW), + .EW(HCI_SIZE_driver.EW), + .EHW(HCI_SIZE_driver.EHW) + ) hci_drv_if ( + .clk(clk_i) + ); + + generate + if (USE_STREAMER_FIFO) begin : gen_driver_streamer_fifo + hci_core_fifo #( + .FIFO_DEPTH(STREAMER_FIFO_DEPTH), + .HCI_SIZE_tcdm_initiator(HCI_SIZE_driver) + ) i_driver_streamer_fifo ( + .clk_i(clk_i), + .rst_ni(rst_ni), + .clear_i(1'b0), + .flags_o(), + .tcdm_target(hci_drv_if), + .tcdm_initiator(hci_if) + ); + end else begin : gen_driver_direct + hci_core_assign i_driver_direct_assign ( + .tcdm_target(hci_drv_if), + .tcdm_initiator(hci_if) + ); + end + endgenerate always_ff @(posedge clk_i or negedge rst_ni) begin : proc_read_response_counter logic retired_is_read; @@ -52,10 +96,10 @@ module application_driver #( n_completed_read_transactions <= '0; pending_rsp_is_read.delete(); end else begin - if (hci_if.req && hci_if.gnt) begin - pending_rsp_is_read.push_back(hci_if.wen); + if (hci_drv_if.req && hci_drv_if.gnt) begin + pending_rsp_is_read.push_back(hci_drv_if.wen); end - if (hci_if.r_valid && hci_if.r_ready) begin + if (hci_drv_if.r_valid && hci_drv_if.r_ready) begin if (pending_rsp_is_read.size() != 0) begin retired_is_read = pending_rsp_is_read.pop_front(); if (retired_is_read) begin @@ -67,17 +111,17 @@ module application_driver #( end initial begin : proc_application_driver - hci_if.id = '0; - hci_if.add = '0; - hci_if.data = '0; - hci_if.req = 1'b0; - hci_if.wen = 1'b0; - hci_if.ecc = '0; - hci_if.ereq = '0; - hci_if.r_eready = '0; - hci_if.be = '1; - hci_if.r_ready = 1'b1; - hci_if.user = '0; + hci_drv_if.id = '0; + hci_drv_if.add = '0; + hci_drv_if.data = '0; + hci_drv_if.req = 1'b0; + hci_drv_if.wen = 1'b0; + hci_drv_if.ecc = '0; + hci_drv_if.ereq = '0; + hci_drv_if.r_eready = '0; + hci_drv_if.be = '1; + hci_drv_if.r_ready = 1'b1; + hci_drv_if.user = '0; end_stimuli_o = 1'b0; end_latency_o = 1'b0; 
n_issued_transactions_o = '0; @@ -113,25 +157,25 @@ module application_driver #( break; end #(APPL_DELAY); - hci_if.id = id; - hci_if.data = data; - hci_if.add = add; - hci_if.wen = wen; - hci_if.req = req; + hci_drv_if.id = id; + hci_drv_if.data = data; + hci_drv_if.add = add; + hci_drv_if.wen = wen; + hci_drv_if.req = req; if (req) begin - @(posedge clk_i iff hci_if.gnt); + @(posedge clk_i iff hci_drv_if.gnt); n_issued_transactions_o++; if (wen) begin n_issued_read_transactions_o++; end // Deassert in NBA region so monitors sampling this edge see the handshake. - hci_if.id <= '0; - hci_if.data <= '0; - hci_if.add <= '0; - hci_if.wen <= 1'b0; - hci_if.req <= 1'b0; - wait (hci_if.req == 1'b0); + hci_drv_if.id <= '0; + hci_drv_if.data <= '0; + hci_drv_if.add <= '0; + hci_drv_if.wen <= 1'b0; + hci_drv_if.req <= 1'b0; + wait (hci_drv_if.req == 1'b0); end else begin @(posedge clk_i); end diff --git a/target/verif/src/latency_monitor.sv b/target/verif/src/latency_monitor.sv index b645ce1..3aff305 100644 --- a/target/verif/src/latency_monitor.sv +++ b/target/verif/src/latency_monitor.sv @@ -24,8 +24,8 @@ module latency_monitor #( input logic clk_i, input logic rst_ni, // Monitored interfaces - hci_core_intf.monitor hci_log_if [0:N_MASTER-N_HWPE-1], - hci_core_intf.monitor hci_hwpe_if [0:N_HWPE-1], + hci_core_intf.monitor hci_driver_log_if [0:N_MASTER-N_HWPE-1], + hci_core_intf.monitor hci_driver_hwpe_if [0:N_HWPE-1], // Accumulated request-to-grant latency. 
output real sum_req_to_gnt_latency_log_o[N_MASTER-N_HWPE], output real sum_req_to_gnt_latency_hwpe_o[N_HWPE], @@ -74,32 +74,32 @@ module latency_monitor #( req_start_cycle_log <= '0; req_prev_log <= 1'b0; end else begin - if (hci_log_if[gi].req && !req_prev_log && !hci_log_if[gi].gnt) begin + if (hci_driver_log_if[gi].req && !req_prev_log && !hci_driver_log_if[gi].gnt) begin req_start_cycle_log <= cycle_q; - end else if (hci_log_if[gi].gnt) begin + end else if (hci_driver_log_if[gi].gnt) begin req_start_cycle_log <= cycle_q + 1; end - if (hci_log_if[gi].req && hci_log_if[gi].gnt) begin + if (hci_driver_log_if[gi].req && hci_driver_log_if[gi].gnt) begin if (req_prev_log) begin sum_req_to_gnt_latency_log_o[gi] <= sum_req_to_gnt_latency_log_o[gi] + real'(cycle_q - req_start_cycle_log); end n_gnt_transactions_log_o[gi] <= n_gnt_transactions_log_o[gi] + 1; - pending_rsp_is_read_log.push_back(hci_log_if[gi].wen); + pending_rsp_is_read_log.push_back(hci_driver_log_if[gi].wen); end - req_prev_log <= hci_log_if[gi].req; + req_prev_log <= hci_driver_log_if[gi].req; - if (hci_log_if[gi].req && hci_log_if[gi].gnt && hci_log_if[gi].wen) begin + if (hci_driver_log_if[gi].req && hci_driver_log_if[gi].gnt && hci_driver_log_if[gi].wen) begin n_read_granted_log_o[gi] <= n_read_granted_log_o[gi] + 1; end - if (hci_log_if[gi].req && hci_log_if[gi].gnt && !hci_log_if[gi].wen) begin + if (hci_driver_log_if[gi].req && hci_driver_log_if[gi].gnt && !hci_driver_log_if[gi].wen) begin n_write_granted_log_o[gi] <= n_write_granted_log_o[gi] + 1; end - if (hci_log_if[gi].r_valid && hci_log_if[gi].r_ready) begin + if (hci_driver_log_if[gi].r_valid && hci_driver_log_if[gi].r_ready) begin if (pending_rsp_is_read_log.size() != 0) begin retired_is_read_log = pending_rsp_is_read_log.pop_front(); if (retired_is_read_log) begin @@ -132,32 +132,32 @@ module latency_monitor #( req_start_cycle_hwpe <= '0; req_prev_hwpe <= 1'b0; end else begin - if (hci_hwpe_if[gi].req && !req_prev_hwpe && 
!hci_hwpe_if[gi].gnt) begin + if (hci_driver_hwpe_if[gi].req && !req_prev_hwpe && !hci_driver_hwpe_if[gi].gnt) begin req_start_cycle_hwpe <= cycle_q; - end else if (hci_hwpe_if[gi].gnt) begin + end else if (hci_driver_hwpe_if[gi].gnt) begin req_start_cycle_hwpe <= cycle_q + 1; end - if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt) begin + if (hci_driver_hwpe_if[gi].req && hci_driver_hwpe_if[gi].gnt) begin if (req_prev_hwpe) begin sum_req_to_gnt_latency_hwpe_o[gi] <= sum_req_to_gnt_latency_hwpe_o[gi] + real'(cycle_q - req_start_cycle_hwpe); end n_gnt_transactions_hwpe_o[gi] <= n_gnt_transactions_hwpe_o[gi] + 1; - pending_rsp_is_read_hwpe.push_back(hci_hwpe_if[gi].wen); + pending_rsp_is_read_hwpe.push_back(hci_driver_hwpe_if[gi].wen); end - req_prev_hwpe <= hci_hwpe_if[gi].req; + req_prev_hwpe <= hci_driver_hwpe_if[gi].req; - if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt && hci_hwpe_if[gi].wen) begin + if (hci_driver_hwpe_if[gi].req && hci_driver_hwpe_if[gi].gnt && hci_driver_hwpe_if[gi].wen) begin n_read_granted_hwpe_o[gi] <= n_read_granted_hwpe_o[gi] + 1; end - if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt && !hci_hwpe_if[gi].wen) begin + if (hci_driver_hwpe_if[gi].req && hci_driver_hwpe_if[gi].gnt && !hci_driver_hwpe_if[gi].wen) begin n_write_granted_hwpe_o[gi] <= n_write_granted_hwpe_o[gi] + 1; end - if (hci_hwpe_if[gi].r_valid && hci_hwpe_if[gi].r_ready) begin + if (hci_driver_hwpe_if[gi].r_valid && hci_driver_hwpe_if[gi].r_ready) begin if (pending_rsp_is_read_hwpe.size() != 0) begin retired_is_read_hwpe = pending_rsp_is_read_hwpe.pop_front(); if (retired_is_read_hwpe) begin diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 7ce7ea2..13cf7ec 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -42,49 +42,38 @@ module tb_hci // HCI // ///////// - /* HCI interfaces */ - localparam hci_size_parameter_t `HCI_SIZE_PARAM(cores) = '{ // CORE + DMA + EXT parameters - DW: DEFAULT_DW, - AW: DEFAULT_AW, - BW: DEFAULT_BW, - 
UW: DEFAULT_UW, - IW: IW, - EW: DEFAULT_EW, - EHW: DEFAULT_EHW + localparam hci_size_parameter_t `HCI_SIZE_PARAM(cores) = '{ + DW: DW_cores, + AW: AW_cores, + BW: BW_cores, + UW: UW_cores, + IW: IW_cores, + EW: EW_cores, + EHW: EHW_cores }; - localparam hci_size_parameter_t `HCI_SIZE_PARAM(mems) = '{ // Bank parameters - DW: DEFAULT_DW, - AW: ADDR_WIDTH_BANK, - BW: DEFAULT_BW, - UW: DEFAULT_UW, - IW: IW, - EW: DEFAULT_EW, - EHW: DEFAULT_EHW + + localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe) = '{ + DW: DW_hwpe, + AW: AW_hwpe, + BW: BW_hwpe, + UW: UW_hwpe, + IW: IW_hwpe, + EW: EW_hwpe, + EHW: EHW_hwpe }; - localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe) = '{ // HWPE parameters - DW: HWPE_WIDTH_FACT*DATA_WIDTH, - AW: DEFAULT_AW, - BW: DEFAULT_BW, - UW: DEFAULT_UW, - IW: IW, - EW: DEFAULT_EW, - EHW: DEFAULT_EHW + + localparam hci_size_parameter_t `HCI_SIZE_PARAM(mems) = '{ + DW: DW_mems, + AW: AW_mems, + BW: BW_mems, + UW: UW_mems, + IW: IW_mems, + EW: EW_mems, + EHW: EHW_mems }; /* Application-driver-side interfaces */ - hci_core_intf #( - .DW(HCI_SIZE_hwpe.DW), - .AW(HCI_SIZE_hwpe.AW), - .BW(HCI_SIZE_hwpe.BW), - .UW(HCI_SIZE_hwpe.UW), - .IW(HCI_SIZE_hwpe.IW), - .EW(HCI_SIZE_hwpe.EW), - .EHW(HCI_SIZE_hwpe.EHW) - ) hci_hwpe_if [0:N_HWPE-1] ( - .clk(clk) - ); - hci_core_intf #( .DW(HCI_SIZE_cores.DW), .AW(HCI_SIZE_cores.AW), @@ -93,12 +82,10 @@ module tb_hci .IW(HCI_SIZE_cores.IW), .EW(HCI_SIZE_cores.EW), .EHW(HCI_SIZE_cores.EHW) - ) hci_log_if [0:N_LOG_MASTERS-1] ( + ) hci_driver_log_if [0:N_LOG_MASTERS-1] ( .clk(clk) ); - /* Interconnect-side master interfaces */ - hci_core_intf #( .DW(HCI_SIZE_hwpe.DW), .AW(HCI_SIZE_hwpe.AW), @@ -107,10 +94,12 @@ module tb_hci .IW(HCI_SIZE_hwpe.IW), .EW(HCI_SIZE_hwpe.EW), .EHW(HCI_SIZE_hwpe.EHW) - ) hci_hwpe_wide_if [0:N_HWPE_MASTERS-1] ( + ) hci_driver_hwpe_if [0:N_HWPE-1] ( .clk(clk) ); + /* Interconnect-side interfaces (hci_system-style organization) */ + hci_core_intf #( .DW(HCI_SIZE_cores.DW), 
.AW(HCI_SIZE_cores.AW), @@ -119,20 +108,19 @@ module tb_hci .IW(HCI_SIZE_cores.IW), .EW(HCI_SIZE_cores.EW), .EHW(HCI_SIZE_cores.EHW) - ) hci_core_if [0:N_CORE+N_HWPE_LOG_MASTERS-1] ( + ) hci_initiator_narrow [0:N_NARROW_HCI-1] ( .clk(clk) ); - // LOG-only intermediate interfaces for HWPE split lanes. hci_core_intf #( - .DW(HCI_SIZE_cores.DW), - .AW(HCI_SIZE_cores.AW), - .BW(HCI_SIZE_cores.BW), - .UW(HCI_SIZE_cores.UW), - .IW(HCI_SIZE_cores.IW), - .EW(HCI_SIZE_cores.EW), - .EHW(HCI_SIZE_cores.EHW) - ) hci_hwpe_log_if [0:N_HWPE_LOG_MASTERS-1] ( + .DW(HCI_SIZE_hwpe.DW), + .AW(HCI_SIZE_hwpe.AW), + .BW(HCI_SIZE_hwpe.BW), + .UW(HCI_SIZE_hwpe.UW), + .IW(HCI_SIZE_hwpe.IW), + .EW(HCI_SIZE_hwpe.EW), + .EHW(HCI_SIZE_hwpe.EHW) + ) hci_initiator_wide [0:N_WIDE_HCI-1] ( .clk(clk) ); @@ -144,7 +132,7 @@ module tb_hci .IW(HCI_SIZE_cores.IW), .EW(HCI_SIZE_cores.EW), .EHW(HCI_SIZE_cores.EHW) - ) hci_dma_if [0:N_DMA-1] ( + ) hci_initiator_dma [0:N_DMA-1] ( .clk(clk) ); @@ -156,12 +144,10 @@ module tb_hci .IW(HCI_SIZE_cores.IW), .EW(HCI_SIZE_cores.EW), .EHW(HCI_SIZE_cores.EHW) - ) hci_ext_if [0:N_EXT-1] ( + ) hci_initiator_ext [0:N_EXT-1] ( .clk(clk) ); - /* Memory interface */ - hci_core_intf #( .DW(HCI_SIZE_mems.DW), .AW(HCI_SIZE_mems.AW), @@ -174,93 +160,109 @@ module tb_hci .WAIVE_RQ4_ASSERT(1'b1), .WAIVE_RSP3_ASSERT(1'b1), .WAIVE_RSP5_ASSERT(1'b1) - ) hci_mem_if [0:N_BANKS-1] ( + ) hci_target_mems [0:N_BANKS-1] ( .clk(clk) ); - /* HCI instance */ - generate - for (genvar ii = 0; ii < N_CORE; ii++) begin : gen_core_to_log - hci_core_assign i_core_to_hci_assign ( - .tcdm_target (hci_log_if[ii]), - .tcdm_initiator (hci_core_if[ii]) + for (genvar ii = 0; ii < N_CORE; ii++) begin : gen_core_to_narrow + hci_core_assign i_core_to_narrow_assign ( + .tcdm_target(hci_driver_log_if[ii]), + .tcdm_initiator(hci_initiator_narrow[ii]) ); end - for (genvar ii = 0; ii < N_DMA; ii++) begin : gen_dma_to_log + + for (genvar ii = 0; ii < N_DMA; ii++) begin : gen_dma_to_hci hci_core_assign 
i_dma_to_hci_assign ( - .tcdm_target (hci_log_if[N_CORE + ii]), - .tcdm_initiator (hci_dma_if[ii]) + .tcdm_target(hci_driver_log_if[N_CORE + ii]), + .tcdm_initiator(hci_initiator_dma[ii]) ); end - for (genvar ii = 0; ii < N_EXT; ii++) begin : gen_ext_to_log + + for (genvar ii = 0; ii < N_EXT; ii++) begin : gen_ext_to_hci hci_core_assign i_ext_to_hci_assign ( - .tcdm_target (hci_log_if[N_CORE + N_DMA + ii]), - .tcdm_initiator (hci_ext_if[ii]) + .tcdm_target(hci_driver_log_if[N_CORE + N_DMA + ii]), + .tcdm_initiator(hci_initiator_ext[ii]) ); end - if (INTERCO_TYPE == HCI) begin : gen_hwpe_to_hci - for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_to_hci - hci_core_assign i_hwpe_to_hci_assign ( - .tcdm_target (hci_hwpe_if[ii]), - .tcdm_initiator (hci_hwpe_wide_if[ii]) + if (INTERCO_TYPE == HCI) begin : gen_hwpe_hci + for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_hci_assign + hci_core_assign i_hwpe_hci_assign ( + .tcdm_target(hci_driver_hwpe_if[ii]), + .tcdm_initiator(hci_initiator_wide[ii]) ); end - end else if (INTERCO_TYPE == LOG) begin : gen_hwpe_to_log - for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_to_log - hci_core_split #( - .DW(HWPE_WIDTH_FACT * DATA_WIDTH), - .BW(DATA_WIDTH / 8), - .UW(1), - .NB_OUT_CHAN(HWPE_WIDTH_FACT), - .FIFO_DEPTH(2), - .`HCI_SIZE_PARAM(tcdm_target)(HCI_SIZE_hwpe) - ) i_hwpe_to_log_split ( - .clk_i(clk), - .rst_ni(rst_n), - .clear_i(s_clear), - .tcdm_target(hci_hwpe_if[ii]), - .tcdm_initiator(hci_hwpe_log_if[ii * HWPE_WIDTH_FACT : (ii + 1) * HWPE_WIDTH_FACT - 1]) - ); + end else if (INTERCO_TYPE == MUX) begin : gen_hwpe_mux + hci_core_mux_static #( + .NB_CHAN(N_HWPE), + .`HCI_SIZE_PARAM(in)(HCI_SIZE_hwpe) + ) i_hwpe_mux ( + .clk_i(clk), + .rst_ni(rst_n), + .clear_i(s_clear), + .sel_i('0), + .in(hci_driver_hwpe_if), + .out(hci_initiator_wide[0]) + ); + end else if (INTERCO_TYPE == LOG) begin : gen_hwpe_split + for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_split_per_master + for (genvar f = 0; f < 
HWPE_WIDTH_FACT; f++) begin : gen_hwpe_split_lane + localparam int IDX = N_CORE + ii * HWPE_WIDTH_FACT + f; + + assign hci_initiator_narrow[IDX].req = hci_driver_hwpe_if[ii].req; + assign hci_initiator_narrow[IDX].add = hci_driver_hwpe_if[ii].add + f * WORD_SIZE; + assign hci_initiator_narrow[IDX].wen = hci_driver_hwpe_if[ii].wen; + assign hci_initiator_narrow[IDX].data = hci_driver_hwpe_if[ii].data[(f + 1) * WORD_SIZE * 8 - 1 : f * WORD_SIZE * 8]; + assign hci_initiator_narrow[IDX].be = hci_driver_hwpe_if[ii].be[(f + 1) * WORD_SIZE - 1 : f * WORD_SIZE]; + assign hci_initiator_narrow[IDX].r_ready = hci_driver_hwpe_if[ii].r_ready; + assign hci_initiator_narrow[IDX].user = hci_driver_hwpe_if[ii].user; + assign hci_initiator_narrow[IDX].id = hci_driver_hwpe_if[ii].id; + assign hci_driver_hwpe_if[ii].r_data[(f + 1) * WORD_SIZE * 8 - 1 : f * WORD_SIZE * 8] = hci_initiator_narrow[IDX].r_data; + assign hci_initiator_narrow[IDX].ecc = hci_driver_hwpe_if[ii].ecc; + assign hci_initiator_narrow[IDX].ereq = hci_driver_hwpe_if[ii].ereq; + assign hci_initiator_narrow[IDX].r_eready = hci_driver_hwpe_if[ii].r_eready; + end - for (genvar f = 0; f < HWPE_WIDTH_FACT; f++) begin : gen_hwpe_log_rvalid_filter - localparam int unsigned IDX_LOG = ii * HWPE_WIDTH_FACT + f; - localparam int unsigned IDX_CORE = N_CORE + IDX_LOG; - hci_core_r_valid_filter #( - .`HCI_SIZE_PARAM(tcdm_target)(HCI_SIZE_cores) - ) i_hwpe_log_rvalid_filter ( - .clk_i(clk), - .rst_ni(rst_n), - .clear_i(s_clear), - .enable_i(1'b1), - .tcdm_target(hci_hwpe_log_if[IDX_LOG]), - .tcdm_initiator(hci_core_if[IDX_CORE]) - ); + assign hci_driver_hwpe_if[ii].r_user = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_user; + assign hci_driver_hwpe_if[ii].r_id = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_id; + assign hci_driver_hwpe_if[ii].r_opc = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_opc; + assign hci_driver_hwpe_if[ii].r_ecc = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_ecc; + + logic 
[HWPE_WIDTH_FACT-1:0] gnt_vec, rvalid_vec, egnt_vec, revalid_vec; + for (genvar f = 0; f < HWPE_WIDTH_FACT; f++) begin : gen_hwpe_split_red + localparam int IDX = N_CORE + ii * HWPE_WIDTH_FACT + f; + assign gnt_vec[f] = hci_initiator_narrow[IDX].gnt; + assign rvalid_vec[f] = hci_initiator_narrow[IDX].r_valid; + assign egnt_vec[f] = hci_initiator_narrow[IDX].egnt; + assign revalid_vec[f] = hci_initiator_narrow[IDX].r_evalid; end + assign hci_driver_hwpe_if[ii].gnt = &gnt_vec; + assign hci_driver_hwpe_if[ii].r_valid = &rvalid_vec; + assign hci_driver_hwpe_if[ii].egnt = &egnt_vec; + assign hci_driver_hwpe_if[ii].r_evalid = &revalid_vec; end - end else begin - // Error: unsupported for now - $error("Unsupported INTERCO_TYPE"); + end else begin : gen_unsupported_mode + initial $error("Unsupported INTERCO_TYPE"); end endgenerate - assign s_clear = 0; + assign s_clear = 1'b0; assign s_hci_ctrl.arb_policy = ARBITER_MODE; assign s_hci_ctrl.invert_prio = INVERT_PRIO; assign s_hci_ctrl.low_prio_max_stall = LOW_PRIO_MAX_STALL; hci_interconnect #( - .N_HWPE(N_HWPE_MASTERS), // Number of HWPE ports - .N_CORE(N_CORE + N_HWPE_LOG_MASTERS), // Number of CORE ports - .N_DMA(N_DMA), // Number of DMA ports - .N_EXT(N_EXT), // Number of External ports - .N_MEM(N_BANKS), // Number of Memory banks - .TS_BIT(TS_BIT), // TEST_SET_BIT (for Log Interconnect) - .IW(IW), // ID Width - .EXPFIFO(EXPFIFO), // FIFO Depth for HWPE Interconnect - .SEL_LIC(SEL_LIC), // Log interconnect type selector - .FILTER_WRITE_R_VALID ( FILTER_WRITE_R_VALID ), + .N_HWPE(N_WIDE_HCI), + .N_CORE(N_NARROW_HCI), + .N_DMA(N_DMA), + .N_EXT(N_EXT), + .N_MEM(N_BANKS), + .TS_BIT(TS_BIT), + .IW(IW), + .EXPFIFO(EXPFIFO), + .SEL_LIC(SEL_LIC), + .FILTER_WRITE_R_VALID(FILTER_WRITE_R_VALID), .HCI_SIZE_cores(HCI_SIZE_cores), .HCI_SIZE_mems(HCI_SIZE_mems), .HCI_SIZE_hwpe(HCI_SIZE_hwpe), @@ -273,11 +275,11 @@ module tb_hci .rst_ni(rst_n), .clear_i(s_clear), .ctrl_i(s_hci_ctrl), - .cores(hci_core_if), - .dma(hci_dma_if), - 
.ext(hci_ext_if), - .mems(hci_mem_if), - .hwpe(hci_hwpe_wide_if) + .cores(hci_initiator_narrow), + .dma(hci_initiator_dma), + .ext(hci_initiator_ext), + .mems(hci_target_mems), + .hwpe(hci_initiator_wide) ); ////////// @@ -289,13 +291,13 @@ module tb_hci .NbBanks(N_BANKS), .DataWidth(DATA_WIDTH), .AddrWidth(ADDR_WIDTH), - .BeWidth(DATA_WIDTH/8), + .BeWidth(DATA_WIDTH / 8), .IdWidth(IW) ) i_tb_mem ( .clk_i(clk), .rst_ni(rst_n), .test_mode_i(1'b0), - .tcdm_slave(hci_mem_if) + .tcdm_slave(hci_target_mems) ); ///////////////////////// @@ -307,9 +309,8 @@ module tb_hci int unsigned s_issued_transactions[0:N_DRIVERS-1]; int unsigned s_issued_read_transactions[0:N_DRIVERS-1]; - /* CORE + DMA + EXT */ generate - for (genvar ii = 0; ii < N_CORE + N_DMA + N_EXT; ii++) begin : gen_app_driver_log + for (genvar ii = 0; ii < N_LOG_MASTERS; ii++) begin : gen_app_driver_log localparam string STIM_FILE_LOG = $sformatf("../simvectors/generated/stimuli_processed/master_log_%0d.txt", ii); application_driver #( @@ -317,13 +318,13 @@ module tb_hci .IS_HWPE(0), .DATA_WIDTH(DATA_WIDTH), .ADDR_WIDTH(ADDR_WIDTH), - .APPL_DELAY(APPL_DELAY), // Delay on the input signals - .IW(IW), + .APPL_DELAY(APPL_DELAY), + .IW(IW_cores), .STIM_FILE(STIM_FILE_LOG) ) i_app_driver_log ( - .hci_if(hci_log_if[ii]), - .rst_ni(rst_n), .clk_i(clk), + .rst_ni(rst_n), + .hci_if(hci_driver_log_if[ii]), .end_stimuli_o(s_end_stimuli[ii]), .end_latency_o(s_end_latency[ii]), .n_issued_transactions_o(s_issued_transactions[ii]), @@ -332,7 +333,6 @@ module tb_hci end endgenerate - /* HWPE */ generate for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_app_driver_hwpe localparam string STIM_FILE_HWPE = @@ -342,17 +342,17 @@ module tb_hci .IS_HWPE(1), .DATA_WIDTH(HWPE_WIDTH_FACT * DATA_WIDTH), .ADDR_WIDTH(ADDR_WIDTH), - .APPL_DELAY(APPL_DELAY), // Delay on the input signals - .IW(IW), + .APPL_DELAY(APPL_DELAY), + .IW(IW_hwpe), .STIM_FILE(STIM_FILE_HWPE) ) i_app_driver_hwpe ( .clk_i(clk), .rst_ni(rst_n), - 
.hci_if(hci_hwpe_if[ii]), - .end_stimuli_o(s_end_stimuli[N_CORE+N_DMA+N_EXT + ii]), - .end_latency_o(s_end_latency[N_CORE+N_DMA+N_EXT + ii]), - .n_issued_transactions_o(s_issued_transactions[N_CORE+N_DMA+N_EXT + ii]), - .n_issued_read_transactions_o(s_issued_read_transactions[N_CORE+N_DMA+N_EXT + ii]) + .hci_if(hci_driver_hwpe_if[ii]), + .end_stimuli_o(s_end_stimuli[N_LOG_MASTERS + ii]), + .end_latency_o(s_end_latency[N_LOG_MASTERS + ii]), + .n_issued_transactions_o(s_issued_transactions[N_LOG_MASTERS + ii]), + .n_issued_read_transactions_o(s_issued_read_transactions[N_LOG_MASTERS + ii]) ); end endgenerate @@ -361,19 +361,17 @@ module tb_hci // QoS // ///////// - real SUM_REQ_TO_GNT_LATENCY_LOG[N_CORE+N_DMA+N_EXT]; + real SUM_REQ_TO_GNT_LATENCY_LOG[N_LOG_MASTERS]; real SUM_REQ_TO_GNT_LATENCY_HWPE[N_HWPE]; - int unsigned N_GNT_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; + int unsigned N_GNT_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_GNT_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_READ_GRANTED_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; + int unsigned N_READ_GRANTED_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_READ_GRANTED_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_WRITE_GRANTED_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; + int unsigned N_WRITE_GRANTED_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_WRITE_GRANTED_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_READ_COMPLETE_TRANSACTIONS_LOG[N_CORE+N_DMA+N_EXT]; + int unsigned N_READ_COMPLETE_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_READ_COMPLETE_TRANSACTIONS_HWPE[N_HWPE]; - /* REAL THROUGHPUT AND SIMULATION TIME */ - real latency_per_master[N_DRIVERS]; real throughput_completed; real stim_latency; @@ -400,15 +398,14 @@ module tb_hci .latency_per_master_o(latency_per_master) ); - /* LATENCY MONITOR */ latency_monitor #( .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE) ) i_latency_monitor ( .clk_i(clk), .rst_ni(rst_n), - .hci_log_if(hci_log_if), - .hci_hwpe_if(hci_hwpe_if), + .hci_driver_log_if(hci_driver_log_if), + 
.hci_driver_hwpe_if(hci_driver_hwpe_if), .sum_req_to_gnt_latency_log_o(SUM_REQ_TO_GNT_LATENCY_LOG), .sum_req_to_gnt_latency_hwpe_o(SUM_REQ_TO_GNT_LATENCY_HWPE), .n_gnt_transactions_log_o(N_GNT_TRANSACTIONS_LOG), @@ -421,11 +418,6 @@ module tb_hci .n_read_complete_hwpe_o(N_READ_COMPLETE_TRANSACTIONS_HWPE) ); - /////////// - // Other // - /////////// - - /* SIMULATION REPORT */ simulation_report i_simulation_report ( .end_stimuli_i(s_end_stimuli), .end_latency_i(s_end_latency), @@ -445,7 +437,6 @@ module tb_hci .n_read_complete_transactions_hwpe_i(N_READ_COMPLETE_TRANSACTIONS_HWPE) ); - /* ASSERTIONS */ localparam int unsigned MAX_BANK_LOCAL_ADDR = TOT_MEM_SIZE * 1024 / N_BANKS - WIDTH_OF_MEMORY_BYTE; @@ -453,15 +444,15 @@ module tb_hci for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_assert_hwpe_address a_hwpe_addr_in_bounds: assert property ( @(posedge clk) - get_bank_local_address(hci_hwpe_if[ii].add) <= MAX_BANK_LOCAL_ADDR + get_bank_local_address(hci_driver_hwpe_if[ii].add) <= MAX_BANK_LOCAL_ADDR ) else begin $display("--------------------------------------------"); $display("Time %0t: Test stopped", $time); $error( "HWPE%0d generated an out-of-bounds address (raw=0x%0h, bank_local=0x%0h, max=0x%0h).", ii, - hci_hwpe_if[ii].add, - get_bank_local_address(hci_hwpe_if[ii].add), + hci_driver_hwpe_if[ii].add, + get_bank_local_address(hci_driver_hwpe_if[ii].add), MAX_BANK_LOCAL_ADDR ); $display("This workload is invalid; rerun with a different workload configuration."); diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index d35ca6e..cfcb3c6 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -19,9 +19,9 @@ package tb_hci_pkg; typedef enum logic [1:0] { - LOG = 2'd0, + LOG = 2'd0, MUX = 2'd1, - HCI = 2'd2 + HCI = 2'd2 } interco_e; ////////////////////////// @@ -29,72 +29,112 @@ package tb_hci_pkg; ////////////////////////// // from verif/config/testbench.mk - /* Timing parameters */ - - localparam time 
CLK_PERIOD = `ifdef CLK_PERIOD `CLK_PERIOD `else 6 `endif; - localparam time APPL_DELAY = 0; - localparam unsigned RST_CLK_CYCLES = `ifdef RST_CLK_CYCLES `RST_CLK_CYCLES `else 10 `endif; - - /* Simulation parameters */ + localparam time CLK_PERIOD = `ifdef CLK_PERIOD `CLK_PERIOD `else 6 `endif; + localparam time APPL_DELAY = 0; + localparam unsigned RST_CLK_CYCLES = `ifdef RST_CLK_CYCLES `RST_CLK_CYCLES `else 10 `endif; // Transaction counts - localparam int unsigned N_TRANSACTION_LOG = `ifdef N_TRANSACTION_LOG `N_TRANSACTION_LOG `else 10 `endif; - localparam int unsigned TRANSACTION_RATIO = `ifdef TRANSACTION_RATIO `TRANSACTION_RATIO `else 1 `endif; + localparam int unsigned N_TRANSACTION_LOG = `ifdef N_TRANSACTION_LOG `N_TRANSACTION_LOG `else 10 `endif; + localparam int unsigned TRANSACTION_RATIO = `ifdef TRANSACTION_RATIO `TRANSACTION_RATIO `else 1 `endif; localparam int unsigned N_TRANSACTION_HWPE = int'(N_TRANSACTION_LOG * TRANSACTION_RATIO); - // TCDM interface parameters - localparam int unsigned RANDOM_GNT = `ifdef RANDOM_GNT `RANDOM_GNT `else 0 `endif; - - // Arbiter configuration - localparam int unsigned ARBITER_MODE = 0; - localparam int unsigned INVERT_PRIO = `ifdef INVERT_PRIO `INVERT_PRIO `else 0 `endif; - localparam int unsigned LOW_PRIO_MAX_STALL = `ifdef LOW_PRIO_MAX_STALL `LOW_PRIO_MAX_STALL `else 3 `endif; + // TCDM and arbitration parameters + localparam int unsigned RANDOM_GNT = `ifdef RANDOM_GNT `RANDOM_GNT `else 0 `endif; + localparam int unsigned ARBITER_MODE = 0; + localparam int unsigned INVERT_PRIO = `ifdef INVERT_PRIO `INVERT_PRIO `else 0 `endif; + localparam int unsigned LOW_PRIO_MAX_STALL = `ifdef LOW_PRIO_MAX_STALL `LOW_PRIO_MAX_STALL `else 3 `endif; - ///////////////////////// - // Hardware parameters // - ///////////////////////// + ///////////////////////////// + // Configurable parameters // + ///////////////////////////// // from verif/config/hardware.mk - /* Config */ - + // Interconnect mode localparam interco_e INTERCO_TYPE = 
`ifdef INTERCO_TYPE `INTERCO_TYPE `else HCI `endif; - // Master port counts - localparam int unsigned N_HWPE = `ifdef N_HWPE `N_HWPE `else 1 `endif; // Number of HWPEs attached to the port - localparam int unsigned N_CORE = `ifdef N_CORE `N_CORE `else 1 `endif; // Number of Core ports - localparam int unsigned N_DMA = `ifdef N_DMA `N_DMA `else 1 `endif; // Number of DMA ports - localparam int unsigned N_EXT = `ifdef N_EXT `N_EXT `else 1 `endif; // Number of External ports + // Number of initiators + localparam int unsigned N_HWPE = `ifdef N_HWPE `N_HWPE `else 1 `endif; + localparam int unsigned N_CORE = `ifdef N_CORE `N_CORE `else 1 `endif; + localparam int unsigned N_DMA = `ifdef N_DMA `N_DMA `else 1 `endif; + localparam int unsigned N_EXT = `ifdef N_EXT `N_EXT `else 1 `endif; // Interconnect configuration - localparam int unsigned TS_BIT = `ifdef TS_BIT `TS_BIT `else 0 `endif; // TEST_SET_BIT (for Log Interconnect) - localparam int unsigned EXPFIFO = `ifdef EXPFIFO `EXPFIFO `else 0 `endif; // FIFO Depth for HWPE Interconnect - localparam int unsigned SEL_LIC = `ifdef SEL_LIC `SEL_LIC `else 0 `endif; // Log interconnect type selector + localparam int unsigned TS_BIT = `ifdef TS_BIT `TS_BIT `else 0 `endif; + localparam int unsigned EXPFIFO = `ifdef EXPFIFO `EXPFIFO `else 0 `endif; + localparam int unsigned SEL_LIC = `ifdef SEL_LIC `SEL_LIC `else 0 `endif; - // Data and memory parameters - localparam int unsigned DATA_WIDTH = `ifdef DATA_WIDTH `DATA_WIDTH `else 32 `endif; // Width of DATA in bits - localparam int unsigned HWPE_WIDTH_FACT = `ifdef HWPE_WIDTH_FACT `HWPE_WIDTH_FACT `else 4 `endif; // Width factor of an HWPE wide-word - localparam int unsigned TOT_MEM_SIZE = `ifdef TOT_MEM_SIZE `TOT_MEM_SIZE `else 32 `endif; // Memory size (kB) + // Memory system configuration + localparam int unsigned N_BANKS = `ifdef N_BANKS `N_BANKS `else 16 `endif; + localparam int unsigned TOT_MEM_SIZE = `ifdef TOT_MEM_SIZE `TOT_MEM_SIZE `else 32 `endif; + localparam int unsigned 
DATA_WIDTH = `ifdef DATA_WIDTH `DATA_WIDTH `else 32 `endif; + localparam int unsigned HWPE_WIDTH_FACT = `ifdef HWPE_WIDTH_FACT `HWPE_WIDTH_FACT `else 4 `endif; - /* Derived parameters */ + ////////////////////////// + // Hardcoded parameters // + ////////////////////////// - localparam int unsigned N_DRIVERS = N_HWPE + N_CORE + N_DMA + N_EXT; // Total number of masters - localparam int unsigned N_BANKS = `ifdef N_BANKS `N_BANKS `else 16 `endif; // Number of memory banks + localparam int unsigned WORD_SIZE = DATA_WIDTH / 8; + + ////////////////////////// + // Dependent parameters // + ////////////////////////// localparam int unsigned N_LOG_MASTERS = N_CORE + N_DMA + N_EXT; - localparam int unsigned N_HWPE_LOG_MASTERS = (INTERCO_TYPE == LOG) ? (N_HWPE * HWPE_WIDTH_FACT) : 0; - localparam int unsigned N_HWPE_MASTERS = (INTERCO_TYPE == HCI) ? N_HWPE : ((INTERCO_TYPE == MUX) ? 1 : 0); + localparam int unsigned N_DRIVERS = N_LOG_MASTERS + N_HWPE; + + // If fully log interconnect is used, instantiate HWPE_WIDTH_FACT narrow ports for each HWPE. + localparam int unsigned N_NARROW_HCI = + N_CORE + (N_HWPE * HWPE_WIDTH_FACT) * (INTERCO_TYPE == LOG); + + // If full HCI is used, instantiate one wide port per HWPE. + // If static MUX is used, instantiate a single shared wide port. + localparam int unsigned N_WIDE_HCI = + (INTERCO_TYPE == HCI) ? N_HWPE : ((INTERCO_TYPE == MUX) ? 
1 : 0); - localparam int unsigned ADDR_WIDTH = $clog2(TOT_MEM_SIZE * 1024); // Width of ADDRESS in bits - localparam int unsigned WIDTH_OF_MEMORY = DATA_WIDTH; // Width of a memory bank (bits) - localparam int unsigned WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8; // Width of a memory bank (bytes) - localparam int unsigned BIT_BANK_INDEX = $clog2(N_BANKS); // Bits of the Bank index - localparam int unsigned ADDR_WIDTH_BANK = ADDR_WIDTH - BIT_BANK_INDEX; // Number of address bits per TCDM bank - localparam int unsigned N_WORDS = (TOT_MEM_SIZE * 1024 / N_BANKS) / WIDTH_OF_MEMORY_BYTE; // Number of words in a bank - localparam int unsigned FILTER_WRITE_R_VALID[0:N_HWPE_MASTERS-1] = '{default: 0}; // Enable filtering of only r_valid respons + // One-hot response ID width used by the interconnect. + localparam int unsigned IW = N_NARROW_HCI + N_WIDE_HCI + N_DMA + N_EXT; - // Keep ID width consistent with stimuli generator. - localparam int unsigned IW = $clog2(N_TRANSACTION_LOG * N_LOG_MASTERS + N_TRANSACTION_HWPE * N_HWPE); - localparam int unsigned TOT_CHECK = N_TRANSACTION_LOG * (N_CORE + N_DMA + N_EXT) + N_HWPE * N_TRANSACTION_HWPE * HWPE_WIDTH_FACT; + localparam int unsigned FILTER_WRITE_R_VALID[0:N_WIDE_HCI-1] = '{default: 0}; + + localparam int unsigned ADDR_WIDTH = $clog2(TOT_MEM_SIZE * 1024); + localparam int unsigned WIDTH_OF_MEMORY = DATA_WIDTH; + localparam int unsigned WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8; + localparam int unsigned BIT_BANK_INDEX = $clog2(N_BANKS); + localparam int unsigned ADDR_WIDTH_BANK = ADDR_WIDTH - BIT_BANK_INDEX; + localparam int unsigned N_WORDS = + (TOT_MEM_SIZE * 1024 / N_BANKS) / WIDTH_OF_MEMORY_BYTE; + + localparam int unsigned TOT_CHECK = + N_TRANSACTION_LOG * N_LOG_MASTERS + + N_HWPE * N_TRANSACTION_HWPE * HWPE_WIDTH_FACT; + + /////////////// + // Bitwidths // + /////////////// + + localparam int unsigned DW_cores = DATA_WIDTH; + localparam int unsigned AW_cores = 32; + localparam int unsigned BW_cores = 8; + localparam 
int unsigned UW_cores = 1; + localparam int unsigned IW_cores = 8; + localparam int unsigned EW_cores = 1; + localparam int unsigned EHW_cores = 1; + + localparam int unsigned DW_hwpe = DW_cores * HWPE_WIDTH_FACT; + localparam int unsigned AW_hwpe = AW_cores; + localparam int unsigned BW_hwpe = BW_cores; + localparam int unsigned UW_hwpe = UW_cores; + localparam int unsigned IW_hwpe = IW_cores; + localparam int unsigned EW_hwpe = EW_cores; + localparam int unsigned EHW_hwpe = EHW_cores; + + localparam int unsigned DW_mems = DW_cores; + localparam int unsigned AW_mems = ADDR_WIDTH_BANK; + localparam int unsigned BW_mems = BW_cores; + localparam int unsigned UW_mems = UW_cores; + localparam int unsigned IW_mems = IW; + localparam int unsigned EW_mems = EW_cores; + localparam int unsigned EHW_mems = EHW_cores; /////////// // Types // @@ -107,35 +147,32 @@ package tb_hci_pkg; } stimuli_t; typedef struct packed { - logic [DATA_WIDTH-1:0] data; + logic [DATA_WIDTH-1:0] data; logic [ADDR_WIDTH_BANK-1:0] add; } out_intc_to_mem_t; - // Helper return type for HWPE address/data creation typedef struct { logic [ADDR_WIDTH-1:0] address; logic [DATA_WIDTH-1:0] data; - logic rolls_over; + logic rolls_over; } hwpe_addr_data_t; ///////////// // Helpers // ///////////// - // Zero-time pure function returning address/data for an HWPE lane function automatic hwpe_addr_data_t create_address_and_data_hwpe( - input logic [ADDR_WIDTH-1:0] address_before, + input logic [ADDR_WIDTH-1:0] address_before, input logic [HWPE_WIDTH_FACT * DATA_WIDTH-1:0] data_before, - input int index, - input logic rolls_over_check_before + input int index, + input logic rolls_over_check_before ); hwpe_addr_data_t ret; logic [BIT_BANK_INDEX-1:0] bank_index_before, bank_index_after; begin - bank_index_before = address_before[BIT_BANK_INDEX-1 + 2 : 2]; - // Legacy behavior: add full-width index and let truncation wrap - bank_index_after = index + bank_index_before; - ret.rolls_over = rolls_over_check_before; + 
bank_index_before = address_before[BIT_BANK_INDEX-1 + 2:2]; + bank_index_after = index + bank_index_before; + ret.rolls_over = rolls_over_check_before; if (bank_index_before > bank_index_after) begin ret.rolls_over = 1'b1; end @@ -157,8 +194,6 @@ package tb_hci_pkg; index = address[BIT_BANK_INDEX-1 + 2:2]; endtask - /* Metrics helpers */ - task calculate_theoretical_throughput(output real throughput_theo); real tot_data, band_memory_limit, tot_time; if (TRANSACTION_RATIO >= 1) begin @@ -166,20 +201,15 @@ package tb_hci_pkg; end else begin tot_time = N_TRANSACTION_LOG; end - tot_data = (N_TRANSACTION_LOG * DATA_WIDTH) * (N_DRIVERS - N_HWPE) + - (N_TRANSACTION_HWPE * HWPE_WIDTH_FACT * DATA_WIDTH) * N_HWPE; // bit - throughput_theo = tot_data / tot_time; // bit per cycle + tot_data = (N_TRANSACTION_LOG * DATA_WIDTH) * N_LOG_MASTERS + + (N_TRANSACTION_HWPE * HWPE_WIDTH_FACT * DATA_WIDTH) * N_HWPE; + throughput_theo = tot_data / tot_time; band_memory_limit = real'(N_BANKS * DATA_WIDTH); if (throughput_theo >= band_memory_limit) begin throughput_theo = band_memory_limit; end endtask - /////////////// - // Functions // - /////////////// - - // Convert a full system address to per-bank local word address. 
function int unsigned get_bank_local_address(input logic [ADDR_WIDTH-1:0] addr_i); logic [ADDR_WIDTH-1:0] mapped_addr; logic [ADDR_WIDTH-BIT_BANK_INDEX-1:0] bank_local_addr; diff --git a/target/verif/src/tcdm_banks_wrap.sv b/target/verif/src/tcdm_banks_wrap.sv index 46c251c..fc22a0b 100644 --- a/target/verif/src/tcdm_banks_wrap.sv +++ b/target/verif/src/tcdm_banks_wrap.sv @@ -92,9 +92,9 @@ module tcdm_banks_wrap #( tcdm_slave[i].r_valid <= 1'b1; end else begin tcdm_slave[i].r_valid <= 1'b0; + end end end - end end endmodule diff --git a/target/verif/vsim/tb_hci.tcl b/target/verif/vsim/tb_hci.tcl index feeefa0..33b1e72 100644 --- a/target/verif/vsim/tb_hci.tcl +++ b/target/verif/vsim/tb_hci.tcl @@ -9,88 +9,87 @@ if {$GUI == 1} { set N_HWPE [examine -radix dec /tb_hci_pkg/N_HWPE] set N_BANKS [examine -radix dec /tb_hci_pkg/N_BANKS] - set INTERCO_TYPE [examine /tb_hci_pkg/INTERCO_TYPE] set N_LOG_MASTERS [examine -radix dec /tb_hci_pkg/N_LOG_MASTERS] - set N_HWPE_LOG_MASTERS [examine -radix dec /tb_hci_pkg/N_HWPE_LOG_MASTERS] - set N_HWPE_MASTERS [examine -radix dec /tb_hci_pkg/N_HWPE_MASTERS] - + set N_NARROW_HCI [examine -radix dec /tb_hci_pkg/N_NARROW_HCI] + set N_WIDE_HCI [examine -radix dec /tb_hci_pkg/N_WIDE_HCI] set HWPE_WIDTH_FACT [examine -radix dec /tb_hci_pkg/HWPE_WIDTH_FACT] + set INTERCO_TYPE [examine /tb_hci_pkg/INTERCO_TYPE] add wave -noupdate /tb_hci/clk add wave -noupdate /tb_hci/rst_n - # Application driver interfaces - add wave -noupdate -group application_drivers -divider narrow_Cores - for {set i 0} {$i < $N_CORE} {incr i} { - add wave -noupdate -group application_drivers -group core_$i /tb_hci/hci_log_if[$i]/* + # Application-driver interfaces + add wave -noupdate -group application_drivers -divider log_masters + for {set i 0} {$i < $N_LOG_MASTERS} {incr i} { + add wave -noupdate -group application_drivers -group log_$i /tb_hci/hci_driver_log_if[$i]/* } - add wave -noupdate -group application_drivers -divider wide_HWPEs + + add wave -noupdate 
-group application_drivers -divider hwpe_masters for {set i 0} {$i < $N_HWPE} {incr i} { - add wave -noupdate -group application_drivers -group hwpe_$i /tb_hci/hci_hwpe_if[$i]/* - } - add wave -noupdate -group application_drivers -divider narrow_DMA - for {set i 0} {$i < $N_DMA} {incr i} { - set idx [expr {$i + $N_CORE}] - add wave -noupdate -group application_drivers -group dma_$i /tb_hci/hci_log_if[$idx]/* - } - add wave -noupdate -group application_drivers -divider narrow_External - for {set i 0} {$i < $N_EXT} {incr i} { - set idx [expr {$i + $N_CORE + $N_DMA}] - add wave -noupdate -group application_drivers -group ext_$i /tb_hci/hci_log_if[$idx]/* + add wave -noupdate -group application_drivers -group hwpe_$i /tb_hci/hci_driver_hwpe_if[$i]/* } - # HCI-side interfaces - add wave -noupdate -group hci_interfaces -divider narrow_Cores + + # Interconnect-side interfaces + add wave -noupdate -group hci_interfaces -divider narrow_cores for {set i 0} {$i < $N_CORE} {incr i} { - add wave -noupdate -group hci_interfaces -group core_$i /tb_hci/hci_core_if[$i]/* + add wave -noupdate -group hci_interfaces -group core_$i /tb_hci/hci_initiator_narrow[$i]/* } - if {$INTERCO_TYPE == {LOG}} { - add wave -noupdate -group hci_interfaces -divider narrow_HWPEs_split + + if {[string first "LOG" $INTERCO_TYPE] != -1} { + add wave -noupdate -group hci_interfaces -divider narrow_hwpe_split for {set i 0} {$i < $N_HWPE} {incr i} { for {set f 0} {$f < $HWPE_WIDTH_FACT} {incr f} { - set idx [expr {$N_CORE + $i*$HWPE_WIDTH_FACT + $f}] - add wave -noupdate -group hci_interfaces -group hwpe_$i -group word_$f /tb_hci/hci_core_if[$idx]/* + set idx [expr {$N_CORE + $i * $HWPE_WIDTH_FACT + $f}] + if {$idx < $N_NARROW_HCI} { + add wave -noupdate -group hci_interfaces -group hwpe_$i -group lane_$f /tb_hci/hci_initiator_narrow[$idx]/* + } } } - } elseif {$INTERCO_TYPE == {HCI} || $INTERCO_TYPE == {MUX}} { - add wave -noupdate -group hci_interfaces -divider wide_HWPEs - for {set i 0} {$i < 
$N_HWPE_MASTERS} {incr i} { - add wave -noupdate -group hci_interfaces -group hwpe_$i /tb_hci/hci_hwpe_wide_if[$i]/* + } + + if {$N_WIDE_HCI > 0} { + add wave -noupdate -group hci_interfaces -divider wide_hwpe + for {set i 0} {$i < $N_WIDE_HCI} {incr i} { + add wave -noupdate -group hci_interfaces -group wide_$i /tb_hci/hci_initiator_wide[$i]/* } - } else { - echo "WARNING: unsupported INTERCO_TYPE value: $INTERCO_TYPE_SYM ($INTERCO_TYPE)" } - add wave -noupdate -group hci_interfaces -divider narrow_DMA + + add wave -noupdate -group hci_interfaces -divider dma for {set i 0} {$i < $N_DMA} {incr i} { - add wave -noupdate -group hci_interfaces -group dma_$i /tb_hci/hci_dma_if[$i]/* + add wave -noupdate -group hci_interfaces -group dma_$i /tb_hci/hci_initiator_dma[$i]/* } - add wave -noupdate -group hci_interfaces -divider narrow_External + + add wave -noupdate -group hci_interfaces -divider ext for {set i 0} {$i < $N_EXT} {incr i} { - add wave -noupdate -group hci_interfaces -group ext_$i /tb_hci/hci_ext_if[$i]/* + add wave -noupdate -group hci_interfaces -group ext_$i /tb_hci/hci_initiator_ext[$i]/* } + # Memory slaves add wave -noupdate -group memory_slaves for {set i 0} {$i < $N_BANKS} {incr i} { - add wave -noupdate -group memory_slaves -group bank_$i /tb_hci/hci_mem_if[$i]/* + add wave -noupdate -group memory_slaves -group bank_$i /tb_hci/hci_target_mems[$i]/* } + # Metrics add wave -noupdate -group metrics /tb_hci/s_end_latency add wave -noupdate -group metrics /tb_hci/s_end_stimuli - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/s_issued_read_transactions[0]} {-format Analog-Step -height 84 -max 472.0} {/tb_hci/s_issued_read_transactions[1]} {-format Analog-Step -height 84 -max 493.0} {/tb_hci/s_issued_read_transactions[2]} {-format Analog-Step -height 84 -max 506.0} {/tb_hci/s_issued_read_transactions[3]} {-format Analog-Step -height 84 -max 481.0} {/tb_hci/s_issued_read_transactions[4]} {-format Analog-Step -height 84 -max 524.0} 
{/tb_hci/s_issued_read_transactions[5]} {-format Analog-Step -height 84 -max 512.0} {/tb_hci/s_issued_read_transactions[6]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/s_issued_read_transactions[7]} {-format Analog-Step -height 84 -max 518.0} {/tb_hci/s_issued_read_transactions[8]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/s_issued_read_transactions[9]} {-format Analog-Step -height 84 -max 503.0} {/tb_hci/s_issued_read_transactions[10]} {-format Analog-Step -height 84 -max 489.0} {/tb_hci/s_issued_read_transactions[11]} {-format Analog-Step -height 84 -max 493.0}} /tb_hci/s_issued_read_transactions - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/s_issued_transactions[0]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[1]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[2]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[3]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[4]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[5]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[6]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[7]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[8]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[9]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[10]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/s_issued_transactions[11]} {-format Analog-Step -height 84 -max 1000.0}} /tb_hci/s_issued_transactions + add wave -noupdate -group metrics /tb_hci/s_issued_transactions + add wave -noupdate -group metrics /tb_hci/s_issued_read_transactions add wave -noupdate -group metrics /tb_hci/stim_latency add wave -noupdate -group metrics /tb_hci/tot_latency add wave -noupdate -group metrics /tb_hci/latency_per_master add wave -noupdate 
-group metrics /tb_hci/throughput_completed - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_GNT_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 1000.0}} /tb_hci/N_GNT_TRANSACTIONS_HWPE - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_GNT_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 1000.0} {/tb_hci/N_GNT_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 1000.0}} /tb_hci/N_GNT_TRANSACTIONS_LOG - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 489.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 493.0}} /tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 472.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 493.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 506.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 481.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 524.0} 
{/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 512.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 518.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 503.0}} /tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 489.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 493.0}} /tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 472.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 493.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 506.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 481.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 524.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 512.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 518.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 492.0} {/tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 503.0}} /tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE[0]} {-format Analog-Step -height 84 -max 511.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE[1]} {-format Analog-Step -height 84 -max 507.0}} 
/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[0]} {-format Analog-Step -height 84 -max 528.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[1]} {-format Analog-Step -height 84 -max 507.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[2]} {-format Analog-Step -height 84 -max 494.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[3]} {-format Analog-Step -height 84 -max 519.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[4]} {-format Analog-Step -height 84 -max 475.99999999999994} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[5]} {-format Analog-Step -height 84 -max 488.00000000000006} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[6]} {-format Analog-Step -height 84 -max 508.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[7]} {-format Analog-Step -height 84 -max 482.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[8]} {-format Analog-Step -height 84 -max 508.0} {/tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG[9]} {-format Analog-Step -height 84 -max 497.0}} /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE[0]} {-format Analog-Step -height 84 -max 1327824.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE[1]} {-format Analog-Step -height 84 -max 1336050.0}} /tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE - add wave -noupdate -group metrics -subitemconfig {{/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[0]} {-format Analog-Step -height 84 -max 580398.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[1]} {-format Analog-Step -height 84 -max 580783.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[2]} {-format Analog-Step -height 84 -max 596101.00000000012} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[3]} {-format Analog-Step -height 84 -max 590598.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[4]} {-format Analog-Step -height 84 -max 603272.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[5]} {-format Analog-Step -height 84 -max 601733.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[6]} {-format Analog-Step -height 84 -max 596110.0} 
{/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[7]} {-format Analog-Step -height 84 -max 593715.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[8]} {-format Analog-Step -height 84 -max 602346.0} {/tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG[9]} {-format Analog-Step -height 84 -max 597075.0}} /tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG + add wave -noupdate -group metrics /tb_hci/N_GNT_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_GNT_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG + add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE + add wave -noupdate /tb_hci/s_clear add wave -noupdate /tb_hci/s_hci_ctrl From e5357a0ca9fc697076d2dc619cc38ac3062dae9f Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Thu, 5 Mar 2026 17:54:56 +0100 Subject: [PATCH 07/25] tb: Remove fifo from app_driver, remove r_valid gen from tcdm, reintroduce hci_core_split - Revert application_driver to initial state, which does not employ FIFOs at interface - Remove r_valid generation from TCDM wrapper (r_valid is generated in the LIC, generating it in TCDM wrapper creates HCI violation asserts) - Use hci_core_split instead of manual splitting of wide HWPE interfaces (higher perf thanks to FIFOs --- target/verif/src/application_driver.sv | 96 +++++++------------------- target/verif/src/tb_hci.sv | 48 ++++--------- target/verif/src/tcdm_banks_wrap.sv | 22 +++--- 3 files changed, 51 insertions(+), 115 deletions(-) diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv 
index fd69399..c3b7ffa 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -24,8 +24,6 @@ module application_driver #( parameter int unsigned ADDR_WIDTH = 1, parameter int unsigned APPL_DELAY = 2, // Delay on the input signals parameter int unsigned IW = 1, - parameter bit USE_STREAMER_FIFO = 1'b1, - parameter int unsigned STREAMER_FIFO_DEPTH = 2, parameter string STIM_FILE = "" ) ( input logic clk_i, @@ -47,48 +45,6 @@ module application_driver #( logic [ADDR_WIDTH-1:0] add; int unsigned n_completed_read_transactions; logic pending_rsp_is_read[$]; - localparam hci_package::hci_size_parameter_t HCI_SIZE_driver = '{ - DW: DATA_WIDTH, - AW: ADDR_WIDTH, - BW: hci_package::DEFAULT_BW, - UW: hci_package::DEFAULT_UW, - IW: IW, - EW: hci_package::DEFAULT_EW, - EHW: hci_package::DEFAULT_EHW - }; - - hci_core_intf #( - .DW(HCI_SIZE_driver.DW), - .AW(HCI_SIZE_driver.AW), - .BW(HCI_SIZE_driver.BW), - .UW(HCI_SIZE_driver.UW), - .IW(HCI_SIZE_driver.IW), - .EW(HCI_SIZE_driver.EW), - .EHW(HCI_SIZE_driver.EHW) - ) hci_drv_if ( - .clk(clk_i) - ); - - generate - if (USE_STREAMER_FIFO) begin : gen_driver_streamer_fifo - hci_core_fifo #( - .FIFO_DEPTH(STREAMER_FIFO_DEPTH), - .HCI_SIZE_tcdm_initiator(HCI_SIZE_driver) - ) i_driver_streamer_fifo ( - .clk_i(clk_i), - .rst_ni(rst_ni), - .clear_i(1'b0), - .flags_o(), - .tcdm_target(hci_drv_if), - .tcdm_initiator(hci_if) - ); - end else begin : gen_driver_direct - hci_core_assign i_driver_direct_assign ( - .tcdm_target(hci_drv_if), - .tcdm_initiator(hci_if) - ); - end - endgenerate always_ff @(posedge clk_i or negedge rst_ni) begin : proc_read_response_counter logic retired_is_read; @@ -96,10 +52,10 @@ module application_driver #( n_completed_read_transactions <= '0; pending_rsp_is_read.delete(); end else begin - if (hci_drv_if.req && hci_drv_if.gnt) begin - pending_rsp_is_read.push_back(hci_drv_if.wen); + if (hci_if.req && hci_if.gnt) begin + pending_rsp_is_read.push_back(hci_if.wen); end - if 
(hci_drv_if.r_valid && hci_drv_if.r_ready) begin + if (hci_if.r_valid && hci_if.r_ready) begin if (pending_rsp_is_read.size() != 0) begin retired_is_read = pending_rsp_is_read.pop_front(); if (retired_is_read) begin @@ -111,17 +67,17 @@ module application_driver #( end initial begin : proc_application_driver - hci_drv_if.id = '0; - hci_drv_if.add = '0; - hci_drv_if.data = '0; - hci_drv_if.req = 1'b0; - hci_drv_if.wen = 1'b0; - hci_drv_if.ecc = '0; - hci_drv_if.ereq = '0; - hci_drv_if.r_eready = '0; - hci_drv_if.be = '1; - hci_drv_if.r_ready = 1'b1; - hci_drv_if.user = '0; + hci_if.id = '0; + hci_if.add = '0; + hci_if.data = '0; + hci_if.req = 1'b0; + hci_if.wen = 1'b0; + hci_if.ecc = '0; + hci_if.ereq = '0; + hci_if.r_eready = '0; + hci_if.be = '1; + hci_if.r_ready = 1'b1; + hci_if.user = '0; end_stimuli_o = 1'b0; end_latency_o = 1'b0; n_issued_transactions_o = '0; @@ -157,25 +113,25 @@ module application_driver #( break; end #(APPL_DELAY); - hci_drv_if.id = id; - hci_drv_if.data = data; - hci_drv_if.add = add; - hci_drv_if.wen = wen; - hci_drv_if.req = req; + hci_if.id = id; + hci_if.data = data; + hci_if.add = add; + hci_if.wen = wen; + hci_if.req = req; if (req) begin - @(posedge clk_i iff hci_drv_if.gnt); + @(posedge clk_i iff hci_if.gnt); n_issued_transactions_o++; if (wen) begin n_issued_read_transactions_o++; end // Deassert in NBA region so monitors sampling this edge see the handshake. 
- hci_drv_if.id <= '0; - hci_drv_if.data <= '0; - hci_drv_if.add <= '0; - hci_drv_if.wen <= 1'b0; - hci_drv_if.req <= 1'b0; - wait (hci_drv_if.req == 1'b0); + hci_if.id <= '0; + hci_if.data <= '0; + hci_if.add <= '0; + hci_if.wen <= 1'b0; + hci_if.req <= 1'b0; + wait (hci_if.req == 1'b0); end else begin @(posedge clk_i); end diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 13cf7ec..0e51a48 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -207,40 +207,20 @@ module tb_hci ); end else if (INTERCO_TYPE == LOG) begin : gen_hwpe_split for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_split_per_master - for (genvar f = 0; f < HWPE_WIDTH_FACT; f++) begin : gen_hwpe_split_lane - localparam int IDX = N_CORE + ii * HWPE_WIDTH_FACT + f; - - assign hci_initiator_narrow[IDX].req = hci_driver_hwpe_if[ii].req; - assign hci_initiator_narrow[IDX].add = hci_driver_hwpe_if[ii].add + f * WORD_SIZE; - assign hci_initiator_narrow[IDX].wen = hci_driver_hwpe_if[ii].wen; - assign hci_initiator_narrow[IDX].data = hci_driver_hwpe_if[ii].data[(f + 1) * WORD_SIZE * 8 - 1 : f * WORD_SIZE * 8]; - assign hci_initiator_narrow[IDX].be = hci_driver_hwpe_if[ii].be[(f + 1) * WORD_SIZE - 1 : f * WORD_SIZE]; - assign hci_initiator_narrow[IDX].r_ready = hci_driver_hwpe_if[ii].r_ready; - assign hci_initiator_narrow[IDX].user = hci_driver_hwpe_if[ii].user; - assign hci_initiator_narrow[IDX].id = hci_driver_hwpe_if[ii].id; - assign hci_driver_hwpe_if[ii].r_data[(f + 1) * WORD_SIZE * 8 - 1 : f * WORD_SIZE * 8] = hci_initiator_narrow[IDX].r_data; - assign hci_initiator_narrow[IDX].ecc = hci_driver_hwpe_if[ii].ecc; - assign hci_initiator_narrow[IDX].ereq = hci_driver_hwpe_if[ii].ereq; - assign hci_initiator_narrow[IDX].r_eready = hci_driver_hwpe_if[ii].r_eready; - end - - assign hci_driver_hwpe_if[ii].r_user = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_user; - assign hci_driver_hwpe_if[ii].r_id = hci_initiator_narrow[N_CORE + ii * 
HWPE_WIDTH_FACT].r_id; - assign hci_driver_hwpe_if[ii].r_opc = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_opc; - assign hci_driver_hwpe_if[ii].r_ecc = hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT].r_ecc; - - logic [HWPE_WIDTH_FACT-1:0] gnt_vec, rvalid_vec, egnt_vec, revalid_vec; - for (genvar f = 0; f < HWPE_WIDTH_FACT; f++) begin : gen_hwpe_split_red - localparam int IDX = N_CORE + ii * HWPE_WIDTH_FACT + f; - assign gnt_vec[f] = hci_initiator_narrow[IDX].gnt; - assign rvalid_vec[f] = hci_initiator_narrow[IDX].r_valid; - assign egnt_vec[f] = hci_initiator_narrow[IDX].egnt; - assign revalid_vec[f] = hci_initiator_narrow[IDX].r_evalid; - end - assign hci_driver_hwpe_if[ii].gnt = &gnt_vec; - assign hci_driver_hwpe_if[ii].r_valid = &rvalid_vec; - assign hci_driver_hwpe_if[ii].egnt = &egnt_vec; - assign hci_driver_hwpe_if[ii].r_evalid = &revalid_vec; + hci_core_split #( + .DW(HWPE_WIDTH_FACT * DATA_WIDTH), + .BW(DATA_WIDTH / 8), + .UW(1), + .NB_OUT_CHAN(HWPE_WIDTH_FACT), + .FIFO_DEPTH(2), + .`HCI_SIZE_PARAM(tcdm_target)(HCI_SIZE_hwpe) + ) i_hwpe_to_log_split ( + .clk_i(clk), + .rst_ni(rst_n), + .clear_i(s_clear), + .tcdm_target(hci_driver_hwpe_if[ii]), + .tcdm_initiator(hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT : N_CORE + (ii + 1) * HWPE_WIDTH_FACT - 1]) + ); end end else begin : gen_unsupported_mode initial $error("Unsupported INTERCO_TYPE"); diff --git a/target/verif/src/tcdm_banks_wrap.sv b/target/verif/src/tcdm_banks_wrap.sv index fc22a0b..dd9e223 100644 --- a/target/verif/src/tcdm_banks_wrap.sv +++ b/target/verif/src/tcdm_banks_wrap.sv @@ -84,17 +84,17 @@ module tcdm_banks_wrap #( ); //r_valid - always_ff @(posedge clk_i or negedge rst_ni) begin : rvalid_gen - if(~rst_ni) begin - tcdm_slave[i].r_valid <= 1'b0; - end else begin - if(tcdm_slave[i].req && tcdm_slave[i].gnt && tcdm_slave[i].wen) begin - tcdm_slave[i].r_valid <= 1'b1; - end else begin - tcdm_slave[i].r_valid <= 1'b0; - end - end - end + // always_ff @(posedge clk_i or negedge 
rst_ni) begin : rvalid_gen + // if(~rst_ni) begin + // tcdm_slave[i].r_valid <= 1'b0; + // end else begin + // if(tcdm_slave[i].req && tcdm_slave[i].gnt && tcdm_slave[i].wen) begin + // tcdm_slave[i].r_valid <= 1'b1; + // end else begin + // tcdm_slave[i].r_valid <= 1'b0; + // end + // end + // end end endmodule From cfcaf1ade84d18e1545507d2960a6f4ce161af66 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Thu, 5 Mar 2026 17:55:28 +0100 Subject: [PATCH 08/25] rtl: :bug: Fix r_ready masking in hci_core_split --- rtl/core/hci_core_split.sv | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rtl/core/hci_core_split.sv b/rtl/core/hci_core_split.sv index c0c96fb..88e2521 100644 --- a/rtl/core/hci_core_split.sv +++ b/rtl/core/hci_core_split.sv @@ -224,7 +224,11 @@ module hci_core_split end // r_ready masking - assign tcdm_initiator_lrdy_masked_d = cs_rvalid==NO_RVALID ? tcdm_initiator_lrdy_masked_q | tcdm_initiator_r_valid | ~tcdm_initiator_req : tcdm_initiator_r_valid | ~tcdm_initiator_req; + // Track lanes that have produced a response for the current split transaction. + // Using "~req" here can mark a lane as completed before r_valid is observed. + assign tcdm_initiator_lrdy_masked_d = + cs_rvalid==NO_RVALID ? 
tcdm_initiator_lrdy_masked_q | tcdm_initiator_r_valid + : tcdm_initiator_r_valid; always_ff @(posedge clk_i or negedge rst_ni) begin if(~rst_ni) begin From e5bd4f44aedb6f131f07806efce068700fea2a0e Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Fri, 6 Mar 2026 18:51:04 +0100 Subject: [PATCH 09/25] src,config: Rewrite app_drivers with FSMs + improve json config make --- target/verif/bender.mk | 6 +- target/verif/config/hardware.json | 10 +- target/verif/src/application_driver.sv | 361 +++++++++++++++++++------ target/verif/src/tb_hci.sv | 24 +- target/verif/verif.mk | 26 +- 5 files changed, 310 insertions(+), 117 deletions(-) diff --git a/target/verif/bender.mk b/target/verif/bender.mk index 8ccc663..6b1fa59 100644 --- a/target/verif/bender.mk +++ b/target/verif/bender.mk @@ -12,12 +12,12 @@ VERIF_DEFS += \ -D N_CORE=$(N_CORE) \ -D N_DMA=$(N_DMA) \ -D N_EXT=$(N_EXT) \ - -D TS_BIT=$(TS_BIT) \ - -D EXPFIFO=$(EXPFIFO) \ - -D SEL_LIC=$(SEL_LIC) \ -D DATA_WIDTH=$(DATA_WIDTH) \ -D TOT_MEM_SIZE=$(TOT_MEM_SIZE) \ -D N_BANKS=$(N_BANKS) \ + -D TS_BIT=$(TS_BIT) \ + -D EXPFIFO=$(EXPFIFO) \ + -D SEL_LIC=$(SEL_LIC) \ -D N_TRANSACTION_LOG=$(N_TRANSACTION_LOG) \ -D TRANSACTION_RATIO=$(TRANSACTION_RATIO) \ -D CLK_PERIOD=$(CLK_PERIOD) \ diff --git a/target/verif/config/hardware.json b/target/verif/config/hardware.json index 028c714..b6c52a5 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -4,14 +4,14 @@ "N_HWPE": 2, "HWPE_WIDTH_FACT": 8, "N_CORE": 8, - "N_DMA": 1, + "N_DMA": 0, "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 128, + "N_BANKS": 32, "INTERCO_TYPE": "LOG", "TS_BIT": 21, "EXPFIFO": 0, - "SEL_LIC": 0, - "DATA_WIDTH": 32, - "TOT_MEM_SIZE": 32, - "N_BANKS": 64 + "SEL_LIC": 0 } } diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index c3b7ffa..d921563 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -19,54 +19,112 @@ module 
application_driver #( parameter int unsigned MASTER_NUMBER = 1, - parameter int unsigned IS_HWPE = 1, parameter int unsigned DATA_WIDTH = 1, parameter int unsigned ADDR_WIDTH = 1, - parameter int unsigned APPL_DELAY = 2, // Delay on the input signals parameter int unsigned IW = 1, parameter string STIM_FILE = "" ) ( input logic clk_i, input logic rst_ni, + input logic clear_i, // used to gate the driver or to reset it hci_core_intf.initiator hci_if, - output logic end_stimuli_o, - output logic end_latency_o, - output int unsigned n_issued_transactions_o, - output int unsigned n_issued_read_transactions_o + output logic end_req_o, + output logic end_resp_o, + output int unsigned n_issued_tr_o, + output int unsigned n_issued_rd_tr_o, + output int unsigned n_retired_rd_tr_o ); - logic [IW-1:0] id; - string file_path; - int stim; - int scan_status; - logic wen; - logic req; - logic [DATA_WIDTH-1:0] data; - logic [ADDR_WIDTH-1:0] add; - int unsigned n_completed_read_transactions; - logic pending_rsp_is_read[$]; - - always_ff @(posedge clk_i or negedge rst_ni) begin : proc_read_response_counter - logic retired_is_read; - if (!rst_ni) begin - n_completed_read_transactions <= '0; - pending_rsp_is_read.delete(); + int unsigned n_req_issued_q, n_req_issued_d; // total number of issued requests + int unsigned n_rd_req_issued_q, n_rd_req_issued_d; // total number of issued read requests + int unsigned n_rd_resp_retired_q, n_rd_resp_retired_d; // total number of retired read responses + + // Transaction queue from file + typedef struct { + logic req; + logic [IW-1:0] id; + logic wen; + logic [DATA_WIDTH-1:0] data; + logic [ADDR_WIDTH-1:0] add; + } transaction_t; + transaction_t transactions[$]; + + // Fill up the queue by reading the stimuli file until the end + initial begin + string file_path; + int stim; + int scan_status; + + if (STIM_FILE != "") begin + file_path = STIM_FILE; end else begin - if (hci_if.req && hci_if.gnt) begin - pending_rsp_is_read.push_back(hci_if.wen); 
- end - if (hci_if.r_valid && hci_if.r_ready) begin - if (pending_rsp_is_read.size() != 0) begin - retired_is_read = pending_rsp_is_read.pop_front(); - if (retired_is_read) begin - n_completed_read_transactions <= n_completed_read_transactions + 1; - end + $fatal("ERROR: Specify STIM_FILE path"); + end + // Open file + stim = $fopen(file_path, "r"); + if (stim == 0) begin + $fatal("ERROR: Could not open stimuli file"); + end + // Read every line + while (!$feof(stim)) begin + transaction_t transaction; + scan_status = $fscanf(stim, "%b %b %b %b %b\n", transaction.req, transaction.id, transaction.wen, transaction.data, transaction.add); + if (scan_status != 5) begin + if (!$feof(stim)) begin + $fatal(1, "ERROR: malformed stimuli line in %s", file_path); end + break; end + // First-in, first-out queue + transactions.push_back(transaction); + end + $fclose(stim); + end + + ////////////////// + // Requests FSM // + ////////////////// + + typedef enum logic [1:0] { + REQ_IDLE, + WAIT_GNT, + REQ_DONE, + RSP_DONE + } req_state_t; + + req_state_t req_state_q, req_state_d; + int unsigned tr_idx_q, tr_idx_d; // transaction ID + int unsigned last_op_issued_q, last_op_issued_d; // ID of the last issued operation (read or write) + + assign n_issued_tr_o = n_req_issued_q; + assign n_issued_rd_tr_o = n_rd_req_issued_q; + + // Sequential logic + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni || clear_i) begin + req_state_q <= REQ_IDLE; + tr_idx_q <= '0; + n_req_issued_q <= '0; + n_rd_req_issued_q <= '0; + last_op_issued_q <= '0; + end else begin + req_state_q <= req_state_d; + tr_idx_q <= tr_idx_d; + n_req_issued_q <= n_req_issued_d; + n_rd_req_issued_q <= n_rd_req_issued_d; + last_op_issued_q <= last_op_issued_d; end end - initial begin : proc_application_driver + // Combinational state logic + always_comb begin + // Defaults (FSM) + req_state_d = req_state_q; + tr_idx_d = tr_idx_q; + n_req_issued_d = n_req_issued_q; + n_rd_req_issued_d = n_rd_req_issued_q; + 
last_op_issued_d = last_op_issued_q; + // Defaults (HCI outputs) hci_if.id = '0; hci_if.add = '0; hci_if.data = '0; @@ -78,68 +136,197 @@ module application_driver #( hci_if.be = '1; hci_if.r_ready = 1'b1; hci_if.user = '0; - end_stimuli_o = 1'b0; - end_latency_o = 1'b0; - n_issued_transactions_o = '0; - n_issued_read_transactions_o = '0; + // Defaults (outputs) + end_req_o = 1'b0; + end_resp_o = 1'b0; + // inputs: gnt, r_data, r_valid, r_user, r_id - wait (rst_ni); - if (STIM_FILE != "") begin - file_path = STIM_FILE; - end else begin - if (IS_HWPE) begin - file_path = $sformatf( - "../simvectors/generated/stimuli_processed/master_hwpe_%0d.txt", - MASTER_NUMBER - ); - end else begin - file_path = $sformatf( - "../simvectors/generated/stimuli_processed/master_log_%0d.txt", - MASTER_NUMBER - ); + case (req_state_q) + REQ_IDLE: begin + // Check if there are still transactions to issue + if (tr_idx_q < transactions.size()) begin + // If so, increase transaction index for next iteration + tr_idx_d = tr_idx_q + 1; + // If this transaction is a request + if (transactions[tr_idx_q].req) begin + hci_if.req = 1'b1; + hci_if.id = transactions[tr_idx_q].id; + hci_if.wen = transactions[tr_idx_q].wen; + hci_if.data = transactions[tr_idx_q].data; + hci_if.add = transactions[tr_idx_q].add; + // Update counters + n_req_issued_d = n_req_issued_q + 1; + if (transactions[tr_idx_q].wen) begin + n_rd_req_issued_d = n_rd_req_issued_q + 1; + end + last_op_issued_d = tr_idx_q; + // If granted already, stay in REQ_IDLE, otherwise go to WAIT_GNT + if (hci_if.gnt) begin + req_state_d = REQ_IDLE; + end else begin + req_state_d = WAIT_GNT; + end + end + end else begin + // If no more transactions to issue + if (n_rd_req_issued_q > n_rd_resp_retired_q) begin + // If there are still read responses to retire, wait for them before finishing + req_state_d = REQ_DONE; + end else begin + req_state_d = RSP_DONE; + end + end + // Synchronously clear + if (clear_i) begin + req_state_d = REQ_IDLE; + end 
end - end - stim = $fopen(file_path, "r"); - if (stim == 0) begin - $fatal("ERROR: Could not open stimuli file!"); - end - @(posedge clk_i); - while (!$feof(stim)) begin - scan_status = $fscanf(stim, "%b %b %b %b %b\n", req, id, wen, data, add); - if (scan_status != 5) begin - if (!$feof(stim)) begin - $fatal(1, "ERROR: malformed stimuli line in %s", file_path); + WAIT_GNT: begin + hci_if.req = 1'b1; + hci_if.id = transactions[last_op_issued_q].id; + hci_if.wen = transactions[last_op_issued_q].wen; + hci_if.data = transactions[last_op_issued_q].data; + hci_if.add = transactions[last_op_issued_q].add; + // Check if there are still transactions to issue + if (tr_idx_q < transactions.size()) begin + // Check whether to stall or not transaction fetching, in case this is a idle transaction that can hide mem latency + if (!transactions[tr_idx_q].req) begin + tr_idx_d = tr_idx_q + 1; + end else begin + tr_idx_d = tr_idx_q; + end + // If grant received, go back to REQ_IDLE and pop another transaction, otherwise stay in WAIT_GNT + if (hci_if.gnt) begin + req_state_d = REQ_IDLE; + end else begin + req_state_d = WAIT_GNT; + end + end else begin + // If there are no more transactions, wait for last grant and then finish + if (hci_if.gnt) begin + if (transactions[last_op_issued_q].req && transactions[last_op_issued_q].wen) begin + // If the last transaction was a read, we need to wait for its response before finishing + req_state_d = REQ_DONE; + end else begin + // If the last transaction was a write, we can finish right away + req_state_d = RSP_DONE; + end + end else begin + req_state_d = WAIT_GNT; + end + end + // Synchronously clear + if (clear_i) begin + req_state_d = REQ_IDLE; end - break; end - #(APPL_DELAY); - hci_if.id = id; - hci_if.data = data; - hci_if.add = add; - hci_if.wen = wen; - hci_if.req = req; - - if (req) begin - @(posedge clk_i iff hci_if.gnt); - n_issued_transactions_o++; - if (wen) begin - n_issued_read_transactions_o++; + REQ_DONE: begin + end_req_o = 
1'b1; + if (n_rd_resp_retired_q >= n_rd_req_issued_q) begin + // All read responses have been retired + req_state_d = RSP_DONE; + end else begin + req_state_d = REQ_DONE; + end + // Synchronously clear + if (clear_i) begin + req_state_d = REQ_IDLE; + end + end + RSP_DONE: begin + end_req_o = 1'b1; + end_resp_o = 1'b1; + // Synchronously clear + if (clear_i) begin + req_state_d = REQ_IDLE; end - // Deassert in NBA region so monitors sampling this edge see the handshake. - hci_if.id <= '0; - hci_if.data <= '0; - hci_if.add <= '0; - hci_if.wen <= 1'b0; - hci_if.req <= 1'b0; - wait (hci_if.req == 1'b0); - end else begin - @(posedge clk_i); end + default: begin + req_state_d = REQ_IDLE; + end + endcase + end + + /////////////////////// + // Read response FSM // + /////////////////////// + + // We only consider read responses as write responses are not mandatory in HCI + + typedef enum logic { + RESP_IDLE, + RESP_WAIT_RVALID + } resp_state_t; + + resp_state_t resp_state_q, resp_state_d; + int unsigned n_rd_in_flight_q, n_rd_in_flight_d; // number of reads granted but not yet responded + + assign n_retired_rd_tr_o = n_rd_resp_retired_q; + + // Sequential logic + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni || clear_i) begin + resp_state_q <= RESP_IDLE; + n_rd_resp_retired_q <= '0; + n_rd_in_flight_q <= '0; + end else begin + resp_state_q <= resp_state_d; + n_rd_resp_retired_q <= n_rd_resp_retired_d; + n_rd_in_flight_q <= n_rd_in_flight_d; end + end - $fclose(stim); - end_stimuli_o = 1'b1; - wait (n_completed_read_transactions >= n_issued_read_transactions_o); - end_latency_o = 1'b1; + // Combinational state logic + always_comb begin + // Defaults (FSM) + resp_state_d = resp_state_q; + n_rd_resp_retired_d = n_rd_resp_retired_q; + n_rd_in_flight_d = n_rd_in_flight_q; + + case (resp_state_q) + RESP_IDLE: begin + // If a read request is granted, increment in-flight counter and go to RESP_WAIT_RVALID + if (hci_if.req && hci_if.wen && hci_if.gnt) begin + 
n_rd_in_flight_d = n_rd_in_flight_q + 1; + resp_state_d = RESP_WAIT_RVALID; + end + // Synchronously clear + if (clear_i) begin + resp_state_d = RESP_IDLE; + n_rd_in_flight_d = '0; + end + end + RESP_WAIT_RVALID: begin + // Track newly granted reads while waiting + if (hci_if.req && hci_if.wen && hci_if.gnt) begin + n_rd_in_flight_d = n_rd_in_flight_q + 1; + end + // When read response handshake happens, retire one response + if (hci_if.r_valid && hci_if.r_ready) begin + n_rd_resp_retired_d = n_rd_resp_retired_q + 1; + // Adjust in-flight: account for simultaneous grant (already added above) + if (hci_if.req && hci_if.wen && hci_if.gnt) begin + // net in-flight = in_flight + 1 (new grant) - 1 (retired) = in_flight + n_rd_in_flight_d = n_rd_in_flight_q; // undo the +1 above, net unchanged + end else begin + n_rd_in_flight_d = n_rd_in_flight_q - 1; + end + // If no more in-flight reads (after this retirement), go back to RESP_IDLE + if (n_rd_in_flight_q == 1 && !(hci_if.req && hci_if.wen && hci_if.gnt)) begin + resp_state_d = RESP_IDLE; + end + end + // Synchronously clear + if (clear_i) begin + resp_state_d = RESP_IDLE; + n_rd_in_flight_d = '0; + end + end + default: begin + resp_state_d = RESP_IDLE; + n_rd_in_flight_d = '0; + end + endcase end + endmodule diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 0e51a48..2413371 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -295,20 +295,20 @@ module tb_hci $sformatf("../simvectors/generated/stimuli_processed/master_log_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), - .IS_HWPE(0), .DATA_WIDTH(DATA_WIDTH), .ADDR_WIDTH(ADDR_WIDTH), - .APPL_DELAY(APPL_DELAY), .IW(IW_cores), .STIM_FILE(STIM_FILE_LOG) ) i_app_driver_log ( .clk_i(clk), .rst_ni(rst_n), + .clear_i(1'b0), .hci_if(hci_driver_log_if[ii]), - .end_stimuli_o(s_end_stimuli[ii]), - .end_latency_o(s_end_latency[ii]), - .n_issued_transactions_o(s_issued_transactions[ii]), - 
.n_issued_read_transactions_o(s_issued_read_transactions[ii]) + .end_req_o(s_end_stimuli[ii]), + .end_resp_o(s_end_latency[ii]), + .n_issued_tr_o(s_issued_transactions[ii]), + .n_issued_rd_tr_o(s_issued_read_transactions[ii]), + .n_retired_rd_tr_o() ); end endgenerate @@ -319,20 +319,20 @@ module tb_hci $sformatf("../simvectors/generated/stimuli_processed/master_hwpe_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), - .IS_HWPE(1), .DATA_WIDTH(HWPE_WIDTH_FACT * DATA_WIDTH), .ADDR_WIDTH(ADDR_WIDTH), - .APPL_DELAY(APPL_DELAY), .IW(IW_hwpe), .STIM_FILE(STIM_FILE_HWPE) ) i_app_driver_hwpe ( .clk_i(clk), .rst_ni(rst_n), + .clear_i(1'b0), .hci_if(hci_driver_hwpe_if[ii]), - .end_stimuli_o(s_end_stimuli[N_LOG_MASTERS + ii]), - .end_latency_o(s_end_latency[N_LOG_MASTERS + ii]), - .n_issued_transactions_o(s_issued_transactions[N_LOG_MASTERS + ii]), - .n_issued_read_transactions_o(s_issued_read_transactions[N_LOG_MASTERS + ii]) + .end_req_o(s_end_stimuli[N_LOG_MASTERS + ii]), + .end_resp_o(s_end_latency[N_LOG_MASTERS + ii]), + .n_issued_tr_o(s_issued_transactions[N_LOG_MASTERS + ii]), + .n_issued_rd_tr_o(s_issued_read_transactions[N_LOG_MASTERS + ii]), + .n_retired_rd_tr_o() ); end endgenerate diff --git a/target/verif/verif.mk b/target/verif/verif.mk index e9829ea..155fecb 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -32,10 +32,13 @@ PYTHON ?= python3 # Config gen # ############## -# Source-of-truth JSON configs -VERIF_CFG_JSON := $(HCI_VERIF_CFG_DIR)/hardware.json \ - $(HCI_VERIF_CFG_DIR)/testbench.json \ - $(HCI_VERIF_CFG_DIR)/workload.json +# JSON configs are configurable from env var (default to config/) +HARDWARE_JSON ?= $(HCI_VERIF_CFG_DIR)/hardware.json +TESTBENCH_JSON ?= $(HCI_VERIF_CFG_DIR)/testbench.json +WORKLOAD_JSON ?= $(HCI_VERIF_CFG_DIR)/workload.json + +# Source-of-truth JSON configs (used as stim-verif dependencies) +VERIF_CFG_JSON := $(HARDWARE_JSON) $(TESTBENCH_JSON) $(WORKLOAD_JSON) # Makefiles to generate from JSON configs 
VERIF_CFG_MK := $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk \ @@ -43,9 +46,12 @@ VERIF_CFG_MK := $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk \ .PHONY: config-verif config-verif: $(VERIF_CFG_MK) -# Generate Makefiles from JSON configs -$(HCI_VERIF_CFG_GEN_DIR)/%.mk: $(HCI_VERIF_CFG_DIR)/%.json $(HCI_VERIF_CFG_GEN_DIR)/%.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) - $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py $* $(HCI_VERIF_CFG_DIR) $(HCI_VERIF_CFG_GEN_DIR) > $@ + +$(HCI_VERIF_CFG_GEN_DIR)/hardware.mk: $(HARDWARE_JSON) $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) + $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py hardware $(dir $(HARDWARE_JSON)) $(HCI_VERIF_CFG_GEN_DIR) > $@ + +$(HCI_VERIF_CFG_GEN_DIR)/testbench.mk: $(TESTBENCH_JSON) $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) + $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py testbench $(dir $(TESTBENCH_JSON)) $(HCI_VERIF_CFG_GEN_DIR) > $@ $(HCI_VERIF_CFG_GEN_DIR): mkdir -p $@ @@ -67,9 +73,9 @@ stim-verif: $(SIMVECTORS_GEN_DIR)/.stim_stamp $(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(STIM_SRC_FILES) mkdir -p $(SIMVECTORS_GEN_DIR) $(PYTHON) $(GEN_STIM_SCRIPT) \ - --workload_config $(HCI_VERIF_CFG_DIR)/workload.json \ - --testbench_config $(HCI_VERIF_CFG_DIR)/testbench.json \ - --hardware_config $(HCI_VERIF_CFG_DIR)/hardware.json + --workload_config $(WORKLOAD_JSON) \ + --testbench_config $(TESTBENCH_JSON) \ + --hardware_config $(HARDWARE_JSON) date > $@ .PHONY: clean-stim-verif From 22705903d25f31e19236c24e896b48a25bb0cb85 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Fri, 6 Mar 2026 23:28:51 +0100 Subject: [PATCH 10/25] verif: Implement advanced verification - Stimuli generation also outputs a memory map that is gonna be accessed - HWPE dataflow can be flexibly configured from workload.json (see workload.schema.json) - Added a traffic opion to the access 
patterns to be generated - Added support for HWPE custom dataflow inside testbench (supporta all mode: HCI, LOG, MUX) - Removed old parameters substituted by more advanced functionalities - Added possibility to select custom ranges for specific access patterns in vector generation --- target/verif/README.md | 179 +++++ target/verif/bender.mk | 4 +- .../verif/config/generated/testbench.mk.tpl | 2 - target/verif/config/hardware.json | 4 +- target/verif/config/testbench.json | 2 - target/verif/config/workload.json | 178 ++--- target/verif/config/workload.schema.json | 239 +++++++ target/verif/simvectors/README.md | 23 - .../verif/simvectors/hci_stimuli/__init__.py | 5 +- .../verif/simvectors/hci_stimuli/generator.py | 259 +++----- .../verif/simvectors/hci_stimuli/patterns.py | 441 +++++++++++++ .../verif/simvectors/hci_stimuli/processor.py | 68 +- target/verif/simvectors/main.py | 614 ++++++++++++++---- target/verif/src/application_driver.sv | 49 +- target/verif/src/simulation_report.sv | 54 +- target/verif/src/tb_hci.sv | 65 +- target/verif/src/tb_hci_pkg.sv | 32 +- target/verif/src/throughput_monitor.sv | 10 +- target/verif/verif.mk | 59 +- 19 files changed, 1632 insertions(+), 655 deletions(-) create mode 100644 target/verif/README.md create mode 100644 target/verif/config/workload.schema.json delete mode 100644 target/verif/simvectors/README.md create mode 100644 target/verif/simvectors/hci_stimuli/patterns.py diff --git a/target/verif/README.md b/target/verif/README.md new file mode 100644 index 0000000..20db9b2 --- /dev/null +++ b/target/verif/README.md @@ -0,0 +1,179 @@ +# HCI Verification Framework + +## Overview + +The verification framework drives configurable memory traffic from multiple masters through the HCI interconnect and measures throughput and latency. It is fully driven by three JSON configuration files: + +| File | Purpose | +|------|---------| +| `config/hardware.json` | Interconnect topology (number of masters, banks, data widths, ...) 
| +| `config/testbench.json` | Simulation parameters (clock period, arbitration policy, ...) | +| `config/workload.json` | Per-master traffic patterns, transaction counts, and dataflow dependencies | + +--- + +## Configuration files + +### `hardware.json` + +Controls the hardware topology instantiated in the testbench. Generates `config/generated/hardware.mk` which is included by the build system and passed as Verilog defines. + +### `testbench.json` + +Controls simulation-level knobs (clock period, reset cycles, arbitration parameters). Generates `config/generated/testbench.mk`. + +### `workload.json` + +The most user-facing configuration. Describes what each master does and in what order. Structure: + +```json +{ + "description": "...", + "log_masters": [ ... ], + "hwpe_masters": [ ... ] +} +``` + +#### Per-master fields + +Each entry in `log_masters` or `hwpe_masters` supports: + +| Field | Required | Description | +|-------|----------|-------------| +| `id` | recommended | Index within the master array. Must match the positional index (0-based). Used for documentation; mapping is always positional. | +| `description` | no | Human-readable label. | +| `mem_access_type` | yes | Traffic pattern. See [Access patterns](#access-patterns). | +| `n_transactions` | yes* | Number of transactions to issue. Can be derived from geometry for structured patterns — see below. | +| `phase` | no | Phase name (string). Masters in the same phase can run concurrently. Default: `"default"`. | +| `wait_for` | no | List of phase names that must complete (`end_req_o` asserted on all masters of those phases) before this master starts. Default: `[]` (start immediately). | +| `start_delay_cycles` | no | Number of idle cycles prepended to this master's stimuli file (static start delay within a phase). Default: `0`. | +| `region_base_address` | no | Base byte address of the memory region for this master (int, hex `0x...`, or decimal string). Default: evenly partitioned. 
| +| `region_size_bytes` | no | Size in bytes of the memory region for this master. Default: evenly partitioned. | +| `start_address` | no | Starting address for `linear`/`2d`/`3d` patterns (int, hex `0x...`, or decimal string). Default: `"0"`. | +| `stride0`, `len_d0` | no | Inner dimension stride (in words) and length for `linear`/`2d`/`3d`. | +| `stride1`, `len_d1` | no | Middle dimension stride (in words) and length for `2d`/`3d`. | +| `stride2` | no | Outer dimension stride (in words) for `3d`. | +| `traffic_pct` | no | [`random`, `linear`] Bus utilization percentage (1–100). After each transaction emits `floor((100-pct)/pct)` idle cycles. Default: `100` (back-to-back). | +| `traffic_read_pct` | no | [`random`, `linear`] Percentage of accesses that are reads. If omitted, read/write is random per transaction. When set, all reads are issued first, then all writes. | +| `idle_cycles_between_phases` | no | [`2d`, `3d`, `matmul_phased`] Idle cycles inserted between phases (between outer rows for `2d`/`3d`, between read-A/read-B/write-C for `matmul_phased`). Models compute time. Default: `0`. | +| `matmul_ratio_a/b/c` | no | [`matmul_phased`] Phase ratio for read-A : read-B : write-C transaction split. Default: `1:1:1`. | +| `region_base_address_a/b/c` | no | [`matmul_phased`] Explicit per-phase base addresses, overriding the auto-split of the combined region. | +| `region_size_bytes_a/b/c` | no | [`matmul_phased`] Explicit per-phase region sizes (paired with `region_base_address_a/b/c`). | + +\* `n_transactions` can be omitted for structured patterns if geometry fields are provided — `main.py` derives it automatically and reports it. For `random` it is always required. 
+ +#### Deriving `n_transactions` from geometry + +| Pattern | Geometry fields | Derived count | +|---------|----------------|---------------| +| `linear` | `length` | `length` | +| `2d` | `len_d0`, `len_d1` | `len_d0 × len_d1` | +| `3d` | `len_d0`, `len_d1`, `len_d2` | `len_d0 × len_d1 × len_d2` | +| `matmul_phased` | `matrix_m`, `matrix_n`, `matrix_k` | `m×k + k×n + m×n` | +| `random` | — | must be set explicitly | +| `idle` | — | ignored | + +--- + +## Access patterns + +| `mem_access_type` | Description | +|-------------------|-------------| +| `random` | Uniformly random addresses within the assigned region. | +| `linear` | Strided 1-D sequential scan. | +| `2d` | Strided 2-D scan (inner `stride0`/`len_d0`, outer `stride1`). | +| `3d` | Strided 3-D scan (inner `stride0`/`len_d0`, mid `stride1`/`len_d1`, outer `stride2`). | +| `matmul_phased` | Deterministic phased traffic modelling matrix multiply: read-A phase, read-B phase, write-C phase. The assigned region is split into three equal sub-regions A, B, C. | +| `idle` | No transactions (driver idles). | + +--- + +## Dataflow: phases and dependencies + +### Concept + +Masters can be assigned to named **phases** and can declare **dependencies** on other phases. This allows modeling realistic dataflow graphs, e.g.: + +- HWPE A and HWPE B run in parallel (same phase, no dependencies) +- HWPE C starts only after HWPE B finishes (C `wait_for: ["phaseB"]`) + +```json +"hwpe_masters": [ + { "id": 0, "phase": "phaseA", "wait_for": [], ... }, + { "id": 1, "phase": "phaseB", "wait_for": [], ... }, + { "id": 2, "phase": "phaseC", "wait_for": ["phaseB"], ... } +] +``` + +### Mechanism + +`main.py` reads the `phase`/`wait_for` fields and computes a **wait mask** per driver: a bitmask of width `N_DRIVERS` where bit `j` is set if this driver must wait for driver `j`'s `end_req_o`. 
The masks are encoded as a SV unpacked array literal in `config/generated/wait_masks.mk` and passed to the simulator as the `WAIT_MASKS_PARAM` define. + +In `tb_hci_pkg.sv`, `WAIT_MASKS[i]` holds driver `i`'s mask. In `tb_hci.sv`, each driver's `clear_i` is combinationally held high until all drivers in its mask have asserted `end_req_o`: + +``` +clear_i[i] = (eff_mask[i] != 0) && ((s_end_req & eff_mask[i]) != eff_mask[i]) +``` + +A driver with an empty mask starts as soon as reset is released. + +The dependency is on `end_req_o` (all transactions issued), not `end_resp_o` (all read responses retired). This is intentional: a dependent master can begin issuing as soon as the dependency master has issued all its requests, which is the correct model for pipelined HWPE dataflow. + +### MUX mode (INTERCO_TYPE=MUX) + +`hci_core_mux_static` forwards exactly one HWPE to the interconnect at a time (selected by `sel_i`). Therefore, in MUX mode **at most one HWPE can be active at any given time**, regardless of what `wait_for` says in the workload. + +**The user-defined `wait_for` dependencies are ignored for HWPE drivers in MUX mode.** Instead, the testbench automatically enforces a strict sequential chain: + +- HWPE 0 starts first (no dependency) +- HWPE 1 waits for HWPE 0 +- HWPE 2 waits for HWPE 1 +- ... +- HWPE k waits for HWPE k-1 + +**Ordering is by index**, which equals the positional order in the `hwpe_masters` array in `workload.json` (0-based). To control execution order in MUX mode, reorder the entries in `hwpe_masters`. + +LOG master `wait_for` dependencies are always respected regardless of interconnect mode. + +`sel_i` is driven combinationally to the index of the currently active HWPE (the lowest-indexed HWPE whose `clear_i` is 0). + +This means **the same workload JSON can be used across all three interconnect modes** (HCI, MUX, LOG). In HCI/LOG mode the declared `wait_for` dependencies are honored; in MUX mode they are overridden by the sequential chain. 
+ +--- + +## Memory layout + +The memory uses a **bank-interleaved addressing scheme**: consecutive word addresses map to consecutive banks before wrapping. + +``` +byte addr 0x00 → bank 0 +byte addr 0x04 → bank 1 (assuming DATA_WIDTH=32, 4 bytes/word) +... +byte addr 0x7C → bank 31 (for N_BANKS=32) +byte addr 0x80 → bank 0 (wrap) +``` + +In general, bank index = `(byte_addr / (DATA_WIDTH/8)) % N_BANKS`. + +`main.py` reports the memory map for each master after stimuli generation, showing: +- First and last byte addresses accessed +- Total transfer size in bytes +- Number of distinct banks touched +- For matmul_phased: sub-region boundaries for A, B, C matrices + +This allows verifying that regions are correctly partitioned and identifying any unintended overlaps between masters. + +--- + +## Build targets + +| Target | Action | +|--------|--------| +| `make config-verif` | Generate `hardware.mk`, `testbench.mk` from JSON | +| `make stim-verif` | Generate stimuli and `wait_masks.mk` from workload/hardware/testbench JSON | +| `make compile-verif` | Compile RTL and testbench with QuestaSim | +| `make opt-verif` | Optimize compiled design | +| `make run-verif` | Run simulation | +| `make clean-verif` | Remove all generated artifacts | + +Pass `WORKLOAD_JSON=config/workload_.json` to `make stim-verif` / `make run-verif` to select an alternative workload. 
diff --git a/target/verif/bender.mk b/target/verif/bender.mk index 6b1fa59..4aac401 100644 --- a/target/verif/bender.mk +++ b/target/verif/bender.mk @@ -18,14 +18,12 @@ VERIF_DEFS += \ -D TS_BIT=$(TS_BIT) \ -D EXPFIFO=$(EXPFIFO) \ -D SEL_LIC=$(SEL_LIC) \ - -D N_TRANSACTION_LOG=$(N_TRANSACTION_LOG) \ - -D TRANSACTION_RATIO=$(TRANSACTION_RATIO) \ -D CLK_PERIOD=$(CLK_PERIOD) \ -D RST_CLK_CYCLES=$(RST_CLK_CYCLES) \ -D RANDOM_GNT=$(RANDOM_GNT) \ -D INTERCO_TYPE=$(INTERCO_TYPE) \ -D INVERT_PRIO=$(INVERT_PRIO) \ - -D LOW_PRIO_MAX_STALL=$(LOW_PRIO_MAX_STALL) + -D LOW_PRIO_MAX_STALL=$(LOW_PRIO_MAX_STALL) \ # Common targets for bender VERIF_TARGS ?= diff --git a/target/verif/config/generated/testbench.mk.tpl b/target/verif/config/generated/testbench.mk.tpl index 469736b..a65ee53 100644 --- a/target/verif/config/generated/testbench.mk.tpl +++ b/target/verif/config/generated/testbench.mk.tpl @@ -5,8 +5,6 @@ # This file is auto-generated from testbench.json - DO NOT EDIT MANUALLY # Testbench parameters (from testbench.json) -N_TRANSACTION_LOG?=${N_TRANSACTION_LOG} -TRANSACTION_RATIO?=${TRANSACTION_RATIO} CLK_PERIOD?=${CLK_PERIOD} RST_CLK_CYCLES?=${RST_CLK_CYCLES} RANDOM_GNT?=${RANDOM_GNT} diff --git a/target/verif/config/hardware.json b/target/verif/config/hardware.json index b6c52a5..c8db224 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -1,7 +1,7 @@ { "description": "Hardware configuration parameters for HCI interconnect", "parameters": { - "N_HWPE": 2, + "N_HWPE": 3, "HWPE_WIDTH_FACT": 8, "N_CORE": 8, "N_DMA": 0, @@ -9,7 +9,7 @@ "DATA_WIDTH": 32, "TOT_MEM_SIZE": 128, "N_BANKS": 32, - "INTERCO_TYPE": "LOG", + "INTERCO_TYPE": "HCI", "TS_BIT": 21, "EXPFIFO": 0, "SEL_LIC": 0 diff --git a/target/verif/config/testbench.json b/target/verif/config/testbench.json index 46005ab..9ce418f 100644 --- a/target/verif/config/testbench.json +++ b/target/verif/config/testbench.json @@ -1,8 +1,6 @@ { "description": "Testbench configuration 
parameters", "parameters": { - "N_TRANSACTION_LOG": 1000, - "TRANSACTION_RATIO": 1, "CLK_PERIOD": 50, "RST_CLK_CYCLES": 10, "RANDOM_GNT": 0, diff --git a/target/verif/config/workload.json b/target/verif/config/workload.json index 7da0838..e094564 100644 --- a/target/verif/config/workload.json +++ b/target/verif/config/workload.json @@ -1,144 +1,58 @@ { - "description": "Workload configuration for stimuli generation", - "simulation_parameters": { - "EXACT_OR_MAX_OFFSET": 0, - "CYCLE_OFFSET_LOG": 1, - "CYCLE_OFFSET_HWPE": 1 - }, + "description": "8 cores at 50% traffic in dedicated regions + 3 HWPEs: DMA imports tile B, GEMM reads tile B and writes output, Transpose reads GEMM output and writes transposed result. DMA and GEMM run concurrently (wave0), Transpose waits for wave0.", + "log_masters": [ + { "id": 0, "description": "Core 0", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00000000", "region_base_address": "0x00000000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 1, "description": "Core 1", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00001000", "region_base_address": "0x00001000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 2, "description": "Core 2", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00002000", "region_base_address": "0x00002000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 3, "description": "Core 3", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00003000", "region_base_address": "0x00003000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 4, "description": "Core 4", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00004000", "region_base_address": "0x00004000", "region_size_bytes": 4096, 
"traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 5, "description": "Core 5", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00005000", "region_base_address": "0x00005000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 6, "description": "Core 6", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00006000", "region_base_address": "0x00006000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 7, "description": "Core 7", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00007000", "region_base_address": "0x00007000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ { "id": 0, - "description": "Core 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "DMA — wide tile import into buffer B (0x08000–0x0FFFF)", + "mem_access_type": "random", + "n_transactions": 1000, + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00008000", + "region_size_bytes": 32768, + "phase": "hwpe_wave0" }, { "id": 1, - "description": "Core 1", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "GEMM — reads tile B from DMA buffer (0x08000–0x0FFFF), writes output to buffer C (0x10000–0x17FFF)", + "mem_access_type": "matmul_phased", + "n_transactions": 1000, + "region_base_address_a": "0x00008000", + "region_size_bytes_a": 32768, + "matmul_ratio_b": 0, + "region_base_address_c": "0x00010000", + "region_size_bytes_c": 32768, + "matmul_ratio_a": 1, + "matmul_ratio_c": 1, + "phase": "hwpe_wave0" }, { "id": 2, - "description": "Core 2", - "mem_access_type": 0, - "start_address": 0, - 
"stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 3, - "description": "Core 3", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 4, - "description": "Core 4", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 5, - "description": "Core 5", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 6, - "description": "Core 6", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 7, - "description": "Core 7", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 8, - "description": "DMA 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 9, - "description": "External 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - } - ], - "hwpe_masters": [ - { - "id": 0, - "description": "HWPE 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 1, - "description": "HWPE 1", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Transpose engine — reads GEMM output (0x10000–0x17FFF), writes transposed result (0x18000–0x1FFFF)", + "mem_access_type": "matmul_phased", + "n_transactions": 1000, + "region_base_address_a": "0x00010000", + "region_size_bytes_a": 32768, + "matmul_ratio_b": 0, + "region_base_address_c": "0x00018000", + "region_size_bytes_c": 32768, + 
"matmul_ratio_a": 1, + "matmul_ratio_c": 1, + "phase": "hwpe_wave1", + "wait_for": ["hwpe_wave0"] } ] -} \ No newline at end of file +} diff --git a/target/verif/config/workload.schema.json b/target/verif/config/workload.schema.json new file mode 100644 index 0000000..3b4c653 --- /dev/null +++ b/target/verif/config/workload.schema.json @@ -0,0 +1,239 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "workload.schema.json", + "title": "HCI Verification Workload Configuration", + "description": "Schema for workload.json consumed by target/verif/simvectors/main.py", + "type": "object", + "required": ["log_masters", "hwpe_masters"], + "additionalProperties": false, + "properties": { + "description": { + "type": "string", + "description": "Human-readable description of this workload." + }, + "log_masters": { + "type": "array", + "description": "Ordered list of LOG masters (CORE, then DMA, then EXT). Length must equal N_CORE + N_DMA + N_EXT in hardware.json.", + "items": { "$ref": "#/$defs/master" } + }, + "hwpe_masters": { + "type": "array", + "description": "Ordered list of HWPE masters. Length must equal N_HWPE in hardware.json.", + "items": { "$ref": "#/$defs/master" } + } + }, + + "$defs": { + "master": { + "type": "object", + "description": "Per-master stimulus configuration. The active pattern is selected by mem_access_type.", + "required": ["mem_access_type"], + "unevaluatedProperties": false, + "properties": { + + "id": { + "type": "integer", + "description": "Optional consistency label. Mapping is always positional; a mismatch triggers a warning." + }, + "description": { + "type": "string", + "description": "Human-readable label shown in the memory-map report." + }, + "mem_access_type": { + "type": "string", + "enum": ["idle", "random", "linear", "2d", "3d", "matmul_phased", "matmul"], + "description": "Access pattern selector. 'matmul' is an alias for 'matmul_phased'. 
Use traffic_pct/traffic_read_pct on 'random'/'linear' to model bus utilization." + }, + "phase": { + "type": "string", + "default": "default", + "description": "Phase group this master belongs to, referenced by other masters' wait_for." + }, + "wait_for": { + "type": "array", + "items": { "type": "string" }, + "default": [], + "description": "Phase names this master waits for before its WAIT_MASK clears in the testbench." + }, + "start_delay_cycles": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "Number of idle cycles prepended to the processed stimuli file (static start delay)." + }, + + "n_transactions": { + "type": "integer", + "minimum": 0, + "description": "Number of real memory accesses (idles are NOT counted). Mandatory for 'random'. For other patterns can be derived from geometry fields (see per-pattern rules)." + }, + + "region_base_address": { + "oneOf": [ + { "type": "integer", "minimum": 0 }, + { "type": "string", "description": "Decimal, hex (0x...) or binary string." } + ], + "description": "[random, matmul_phased] Base byte address of the memory region. Default: master_local_idx * (total_mem / n_peers_of_kind), aligned to access_bytes." + }, + "region_size_bytes": { + "oneOf": [ + { "type": "integer", "minimum": 0 }, + { "type": "string", "description": "Decimal, hex (0x...) or binary string." } + ], + "description": "[random, matmul_phased] Size in bytes of the memory region. Default: total_mem / n_peers_of_kind. Clamped so region does not exceed total memory." + }, + + "start_address": { + "type": "string", + "default": "0", + "description": "[linear, 2d, 3d] Start byte address as decimal, hex (0x...) or binary string." + }, + "stride0": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[linear, 2d, 3d] Innermost stride in words." + }, + "stride1": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[2d, 3d] Middle stride in words." 
+ }, + "stride2": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[3d] Outermost stride in words." + }, + "length": { + "type": "integer", + "minimum": 0, + "description": "[linear] Alias for n_transactions." + }, + "len_d0": { + "type": "integer", + "minimum": 1, + "description": "[2d, 3d] Innermost dimension length. Can replace n_transactions together with len_d1 (and len_d2 for 3d)." + }, + "len_d1": { + "type": "integer", + "minimum": 1, + "description": "[2d, 3d] Middle dimension length." + }, + "len_d2": { + "type": "integer", + "minimum": 1, + "description": "[3d] Outermost dimension length." + }, + + "matrix_m": { + "type": "integer", + "minimum": 1, + "description": "[matmul_phased] Rows of A and C. Can replace n_transactions together with matrix_n and matrix_k (total = m*k + k*n + m*n)." + }, + "matrix_n": { + "type": "integer", + "minimum": 1, + "description": "[matmul_phased] Columns of B and C." + }, + "matrix_k": { + "type": "integer", + "minimum": 1, + "description": "[matmul_phased] Columns of A / rows of B." + }, + "region_base_address_a": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Base address of the read-A region. If set together with region_size_bytes_a, overrides the auto-split for phase A." + }, + "region_size_bytes_a": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Size in bytes of the read-A region." + }, + "region_base_address_b": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Base address of the read-B region." + }, + "region_size_bytes_b": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Size in bytes of the read-B region." 
+ }, + "region_base_address_c": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Base address of the write-C region." + }, + "region_size_bytes_c": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Size in bytes of the write-C region." + }, + + "matmul_ratio_a": { + "type": "integer", + "minimum": 0, + "default": 1, + "description": "[matmul_phased] Relative weight of the read-A phase in the transaction split." + }, + "matmul_ratio_b": { + "type": "integer", + "minimum": 0, + "default": 1, + "description": "[matmul_phased] Relative weight of the read-B phase." + }, + "matmul_ratio_c": { + "type": "integer", + "minimum": 0, + "default": 1, + "description": "[matmul_phased] Relative weight of the write-C phase." + }, + + "traffic_pct": { + "type": "integer", + "minimum": 1, + "maximum": 100, + "default": 100, + "description": "[random, linear] Bus utilization percentage. After each transaction emit floor((100-pct)/pct) idle cycles. E.g. 50 => 1 idle per transaction (req, idle, req, idle, ...). Default 100 = back-to-back." + }, + "traffic_read_pct": { + "type": "integer", + "minimum": 0, + "maximum": 100, + "description": "[random, linear] Percentage of accesses that are reads (wen=1). If omitted, wen is random per transaction. When set, all reads are emitted first, then all writes." + }, + "idle_cycles_between_phases": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[2d, 3d, matmul_phased] Number of req=0 cycles inserted at each phase boundary (between outer rows for 2d/3d, between read-A/read-B/write-C for matmul_phased). Models compute time between memory bursts." 
+ } + }, + + "allOf": [ + { + "if": { "properties": { "mem_access_type": { "const": "random" } }, "required": ["mem_access_type"] }, + "then": { "required": ["n_transactions"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "linear" } }, "required": ["mem_access_type"] }, + "then": { "anyOf": [{ "required": ["n_transactions"] }, { "required": ["length"] }] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "2d" } }, "required": ["mem_access_type"] }, + "then": { "anyOf": [{ "required": ["n_transactions"] }, { "required": ["len_d0", "len_d1"] }] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "3d" } }, "required": ["mem_access_type"] }, + "then": { "anyOf": [{ "required": ["n_transactions"] }, { "required": ["len_d0", "len_d1", "len_d2"] }] } + }, + { + "if": { + "properties": { "mem_access_type": { "enum": ["matmul_phased", "matmul"] } }, + "required": ["mem_access_type"] + }, + "then": { + "anyOf": [{ "required": ["n_transactions"] }, { "required": ["matrix_m", "matrix_n", "matrix_k"] }] + } + } + ] + } + } +} diff --git a/target/verif/simvectors/README.md b/target/verif/simvectors/README.md deleted file mode 100644 index 97be798..0000000 --- a/target/verif/simvectors/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# HCI Stimuli Generator - -Python package for generating test stimuli for the HCI verification environment. 
- -## Usage - -### Generate Stimuli - -```bash -python target/verif/simvectors/main.py --workload_config --testbench_config --hardware_config -``` - -Or use the Makefile: -```bash -make stimuli -``` - -## Configuration - -Configuration files are located in `target/verif/config/`: -- `hardware.json` - HCI hardware parameters (auto-generates `generated/hardware.mk`) -- `testbench.json` - Testbench parameters (auto-generates `generated/testbench.mk`) -- `workload.json` - Workload configuration with simulation parameters and master-specific settings diff --git a/target/verif/simvectors/hci_stimuli/__init__.py b/target/verif/simvectors/hci_stimuli/__init__.py index 3f4cd89..6381011 100644 --- a/target/verif/simvectors/hci_stimuli/__init__.py +++ b/target/verif/simvectors/hci_stimuli/__init__.py @@ -6,7 +6,6 @@ """ from .generator import StimuliGenerator -from .processor import unfold_raw_txt, pad_txt_files - -__all__ = ['StimuliGenerator', 'unfold_raw_txt', 'pad_txt_files'] +from .processor import pad_txt_files +__all__ = ['StimuliGenerator', 'pad_txt_files'] diff --git a/target/verif/simvectors/hci_stimuli/generator.py b/target/verif/simvectors/hci_stimuli/generator.py index af121d6..aac6927 100644 --- a/target/verif/simvectors/hci_stimuli/generator.py +++ b/target/verif/simvectors/hci_stimuli/generator.py @@ -1,189 +1,80 @@ -"""Stimuli Generator class and access-pattern generators.""" -import random -import os - -class StimuliGenerator: - def __init__(self,IW,WIDTH_OF_MEMORY,N_BANKS,TOT_MEM_SIZE,DATA_WIDTH,ADD_WIDTH,filepath,N_TEST,EXACT_OR_MAX_OFFSET,CYCLE_OFFSET,MASTER_NUMBER_IDENTIFICATION): - self.WIDTH_OF_MEMORY = WIDTH_OF_MEMORY - self.WIDTH_OF_MEMORY_BYTE = int(WIDTH_OF_MEMORY/8) - self.N_BANKS = N_BANKS - self.TOT_MEM_SIZE = TOT_MEM_SIZE - self.DATA_WIDTH = DATA_WIDTH - self.ADD_WIDTH = int(ADD_WIDTH) - self.filepath = filepath - os.makedirs(os.path.dirname(filepath),exist_ok=True) - self.N_TEST = N_TEST - self.EXACT_OR_MAX_OFFSET = EXACT_OR_MAX_OFFSET - 
self.CYCLE_OFFSET = CYCLE_OFFSET +"""StimuliGenerator: infrastructure for writing cycle-accurate stimuli files. + +Each output file has one line per simulation cycle in the format: + req(1b) id(IWb) wen(1b) data(Nb) add(Ab) + +req=0 means no transaction that cycle (id/wen/data/add are don't-cares). +req=1 means an active transaction. +""" + +import os +import random + +from .patterns import PatternsMixin + + +class StimuliGenerator(PatternsMixin): + def __init__( + self, + IW, + WIDTH_OF_MEMORY, + N_BANKS, + TOT_MEM_SIZE, + DATA_WIDTH, + ADD_WIDTH, + filepath, + N_TEST, + MASTER_NUMBER_IDENTIFICATION, + ): + self.WIDTH_OF_MEMORY = WIDTH_OF_MEMORY + self.WIDTH_OF_MEMORY_BYTE = int(WIDTH_OF_MEMORY / 8) + self.N_BANKS = N_BANKS + self.TOT_MEM_SIZE = TOT_MEM_SIZE + self.DATA_WIDTH = DATA_WIDTH + self.ADD_WIDTH = int(ADD_WIDTH) + self.filepath = filepath + os.makedirs(os.path.dirname(filepath), exist_ok=True) + self.N_TEST = N_TEST self.IW = IW self.MASTER_NUMBER_IDENTIFICATION = MASTER_NUMBER_IDENTIFICATION def _format_id(self, id_value): return bin(id_value % (1 << self.IW))[2:].zfill(self.IW) - - def random_data(self): - data_decimal = random.randint(0, (2**(self.DATA_WIDTH))-1) # generate random data - data = bin(data_decimal)[2:].zfill(self.DATA_WIDTH) - return data - - def data_wen_offset(self): - wen = random.randint(0,1) # write enable signal (1 = read, 0 = write) - if (self.EXACT_OR_MAX_OFFSET): - cycle_offset = random.randint(1,self.CYCLE_OFFSET) - else: - cycle_offset = self.CYCLE_OFFSET - if wen: - data = "0" * self.DATA_WIDTH - else: - data = self.random_data() - return data, wen, cycle_offset - - - def random_gen(self,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - for test in range(self.N_TEST): - data, wen, 
cycle_offset = self.data_wen_offset() - while True: - add_decimal = int((random.randint(0, int((self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE)/self.WIDTH_OF_MEMORY_BYTE)))*(self.WIDTH_OF_MEMORY_BYTE)) # generate a random word-aligned memory address. - if add_decimal > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - add_decimal = add_decimal - self.TOT_MEM_SIZE*1024 #rolls over - add = bin(add_decimal)[2:].zfill(self.ADD_WIDTH) - - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - break - file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - return id - - def linear_gen(self,stride0,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - next_address = int(start_address,2) - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - for test in range(self.N_TEST): - data, wen, cycle_offset = self.data_wen_offset() - - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - next_address += (self.WIDTH_OF_MEMORY_BYTE)*stride0 #word-aligned memory address - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - 
LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - break - file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - - return id - - def gen_2d(self,stride0,len_d0,stride1,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - start_address = int(start_address,2) - next_address = start_address - j = 0 - STOP = 0 - while True: - for i in range(len_d0): - data, wen, cycle_offset = self.data_wen_offset() - next_address = start_address + i*(self.WIDTH_OF_MEMORY_BYTE)*stride0 + j*(self.WIDTH_OF_MEMORY_BYTE)*stride1 #word-aligned memory address - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= 
self.N_TEST : - STOP = 1 - break - if STOP: - break - j = j + 1 - - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - return id - - def gen_3d(self,stride0,len_d0,stride1,len_d1,stride2,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - start_address = int(start_address,2) - next_address = start_address - k = 0 - STOP = 0 - while True: - for j in range(len_d1): - for i in range(len_d0): - data, wen, cycle_offset = self.data_wen_offset() - next_address = start_address + i*(self.WIDTH_OF_MEMORY_BYTE)*stride0 + j*(self.WIDTH_OF_MEMORY_BYTE)*stride1 + k*(self.WIDTH_OF_MEMORY_BYTE)*stride2 #word-aligned memory address - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - file.write(self._format_id(id) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - if STOP: - break - if STOP: - break - k = k + 1 - - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - 
LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - return id - - + + def random_data(self): + data_decimal = random.randint(0, (2 ** self.DATA_WIDTH) - 1) + return bin(data_decimal)[2:].zfill(self.DATA_WIDTH) + + def _write_req(self, file_obj, id_value, wen, data, add): + """Write one active-request line (req=1).""" + file_obj.write( + "1 " + + self._format_id(id_value) + + " " + + str(wen) + + " " + + data + + " " + + add + + "\n" + ) + + def _write_idle(self, file_obj): + """Write one idle line (req=0).""" + file_obj.write( + "0 " + + "0" * self.IW + + " 0 " + + "0" * self.DATA_WIDTH + + " " + + "0" * self.ADD_WIDTH + + "\n" + ) + + def data_wen(self): + wen = random.randint(0, 1) # 1=read, 0=write + if wen: + data = "0" * self.DATA_WIDTH + else: + data = self.random_data() + return data, wen diff --git a/target/verif/simvectors/hci_stimuli/patterns.py b/target/verif/simvectors/hci_stimuli/patterns.py new file mode 100644 index 0000000..987e526 --- /dev/null +++ b/target/verif/simvectors/hci_stimuli/patterns.py @@ -0,0 +1,441 @@ +"""Access-pattern generators for StimuliGenerator. + +Each method writes a cycle-accurate stimuli file directly (one line per cycle): + req(1b) id(IWb) wen(1b) data(Nb) add(Ab) + +req=0 lines are idle cycles. req=1 lines are active transactions. +There is no intermediate raw format or cycle_offset expansion step. + +Patterns: + - random: uniformly random addresses, random read/write mix. + Optional: traffic_pct / traffic_read_pct for uniform idle interleaving. + - linear: strided 1-D scan. + Optional: traffic_pct / traffic_read_pct for uniform idle interleaving. + - 2d: strided 2-D scan (inner stride0, outer stride1). + Optional: idle_cycles_between_phases inserted after each outer row. + - 3d: strided 3-D scan (stride0, stride1, stride2). + Optional: idle_cycles_between_phases inserted after each mid-level block. + - idle: no transactions (single req=0 line). 
+  - matmul_phased: phased traffic modelling matrix multiply
+    (read-A phase, read-B phase, write-C phase).
+    Optional: idle_cycles_between_phases inserted between phases.
+
+traffic_pct semantics (random, linear):
+  After every transaction, emit round((100 - traffic_pct) / traffic_pct) idle cycles.
+  traffic_pct=50 => 1 idle per transaction (req, idle, req, idle, ...)
+  traffic_pct=100 => no idles (back-to-back)
+
+traffic_read_pct semantics (random, linear):
+  Fraction of transactions that are reads (wen=1); remainder are writes (wen=0).
+  Only meaningful when traffic_pct < 100 or as override of the default random mix.
+  When set, wen is no longer random: the first n_reads transactions are reads,
+  the rest are writes (pattern: all reads first, then all writes).
+
+idle_cycles_between_phases semantics (2d, 3d, matmul_phased):
+  Number of req=0 cycles inserted at each phase boundary (between outer rows for
+  2d/3d, between read-A/read-B/write-C for matmul_phased). Models compute time
+  between bursts of memory accesses.
+ +All patterns respect the cross-master forbidden address lists: + - forbidden_write: addresses already read -> no later master may write here + - forbidden_read: addresses already written -> no later master may read/write here +""" + +import random + + +class PatternsMixin: + """Mixin providing memory access pattern generators.""" + + # ------------------------------------------------------------------ # + # Shared helpers # + # ------------------------------------------------------------------ # + + @staticmethod + def _parse_address(addr_str): + """Parse a binary, hex (0x...), or decimal string to int.""" + s = str(addr_str) + if s.startswith('0x') or s.startswith('0X'): + return int(s, 16) + if set(s) <= {'0', '1'}: + return int(s, 2) + return int(s, 0) + + @staticmethod + def _align_down(value, alignment): + if alignment <= 0: + return value + return (value // alignment) * alignment + + @staticmethod + def _phase_counts(total_ops, ratio_a, ratio_b, ratio_c): + """Split total_ops into three phases according to ratios.""" + ra = max(0, int(ratio_a)) + rb = max(0, int(ratio_b)) + rc = max(0, int(ratio_c)) + if ra == 0 and rb == 0 and rc == 0: + ra, rb, rc = 1, 1, 1 + rsum = ra + rb + rc + cnt_a = (total_ops * ra) // rsum + cnt_b = (total_ops * rb) // rsum + cnt_c = total_ops - cnt_a - cnt_b + + if total_ops >= 3: + if ra > 0 and cnt_a == 0: + cnt_a = 1 + cnt_c = max(0, cnt_c - 1) + if rb > 0 and cnt_b == 0: + cnt_b = 1 + cnt_c = max(0, cnt_c - 1) + if rc > 0 and cnt_c == 0: + if cnt_a > 1: + cnt_a -= 1 + cnt_c = 1 + elif cnt_b > 1: + cnt_b -= 1 + cnt_c = 1 + return cnt_a, cnt_b, cnt_c + + @staticmethod + def _idles_per_req(traffic_pct): + """Number of idle cycles to emit after each transaction for a given traffic_pct.""" + traffic_pct = max(1, min(100, int(traffic_pct))) + if traffic_pct >= 100: + return 0 + return int(round((100 - traffic_pct) / traffic_pct)) + + def _is_allowed(self, add, wen, forbidden_read, forbidden_write): + """Return True if the address 
is not in the relevant forbidden list."""
+        return add not in (forbidden_read if wen else forbidden_write)
+
+    def _record_access(self, add, wen, forbidden_read_new, forbidden_write_new):
+        """Mark add as used by this master."""
+        forbidden_write_new.append(add)
+        if not wen:
+            forbidden_read_new.append(add)
+
+    # ------------------------------------------------------------------ #
+    # Access patterns                                                    #
+    # ------------------------------------------------------------------ #
+
+    def random_gen(
+        self,
+        id_start,
+        forbidden_read,
+        forbidden_write,
+        region_base=0,
+        region_size=None,
+        traffic_pct=100,
+        traffic_read_pct=None,
+    ):
+        """Uniformly random addresses within [region_base, region_base + region_size).
+
+        traffic_pct: bus utilisation percentage. After each transaction,
+        round((100 - traffic_pct) / traffic_pct) idle cycles are emitted.
+        Default 100 = back-to-back (no idles).
+
+        traffic_read_pct: if set, the first n_reads transactions are reads and
+        the rest are writes. If not set, wen is random per transaction.
+ """ + total_mem_bytes = int(self.TOT_MEM_SIZE * 1024) + if region_size is None: + region_size = total_mem_bytes + region_size = min(region_size, total_mem_bytes - region_base) + n_words = max(1, region_size // self.WIDTH_OF_MEMORY_BYTE) + n_idles = self._idles_per_req(traffic_pct) + + # Build wen sequence + if traffic_read_pct is not None: + rpct = max(0, min(100, int(traffic_read_pct))) + n_reads = (self.N_TEST * rpct) // 100 + wen_seq = [1] * n_reads + [0] * (self.N_TEST - n_reads) + else: + wen_seq = None + + id_value = id_start + forbidden_read_new = [] + forbidden_write_new = [] + with open(self.filepath, "w", encoding="ascii") as file: + for i in range(self.N_TEST): + if wen_seq is not None: + wen = wen_seq[i] + data = "0" * self.DATA_WIDTH if wen else self.random_data() + else: + data, wen = self.data_wen() + while True: + add_decimal = region_base + random.randint(0, int(n_words) - 1) * self.WIDTH_OF_MEMORY_BYTE + add = bin(int(add_decimal))[2:].zfill(self.ADD_WIDTH) + if self._is_allowed(add, wen, forbidden_read, forbidden_write): + self._record_access(add, wen, forbidden_read_new, forbidden_write_new) + break + self._write_req(file, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(file) + forbidden_read.extend(forbidden_read_new) + forbidden_write.extend(forbidden_write_new) + return id_value + + def linear_gen( + self, + stride0, + start_address, + id_start, + forbidden_read, + forbidden_write, + traffic_pct=100, + traffic_read_pct=None, + ): + """Strided 1-D linear scan. Forbidden addresses are skipped (no idle inserted). + + traffic_pct / traffic_read_pct: see random_gen. 
+ """ + n_idles = self._idles_per_req(traffic_pct) + + if traffic_read_pct is not None: + rpct = max(0, min(100, int(traffic_read_pct))) + n_reads = (self.N_TEST * rpct) // 100 + wen_seq = [1] * n_reads + [0] * (self.N_TEST - n_reads) + else: + wen_seq = None + + id_value = id_start + forbidden_read_new = [] + forbidden_write_new = [] + seq_idx = 0 + with open(self.filepath, "w", encoding="ascii") as file: + next_address = self._parse_address(start_address) + if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: + next_address -= self.TOT_MEM_SIZE * 1024 + for i in range(self.N_TEST): + if wen_seq is not None: + wen = wen_seq[i] + data = "0" * self.DATA_WIDTH if wen else self.random_data() + else: + data, wen = self.data_wen() + add = bin(next_address)[2:].zfill(self.ADD_WIDTH) + next_address += self.WIDTH_OF_MEMORY_BYTE * stride0 + if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: + next_address -= self.TOT_MEM_SIZE * 1024 + if not self._is_allowed(add, wen, forbidden_read, forbidden_write): + continue + self._record_access(add, wen, forbidden_read_new, forbidden_write_new) + self._write_req(file, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(file) + forbidden_read.extend(forbidden_read_new) + forbidden_write.extend(forbidden_write_new) + return id_value + + def gen_2d( + self, + stride0, + len_d0, + stride1, + start_address, + id_start, + forbidden_read, + forbidden_write, + idle_cycles_between_phases=0, + ): + """Strided 2-D scan: inner loop stride0/len_d0, outer loop stride1. + + idle_cycles_between_phases: req=0 cycles inserted after each completed inner row. 
+ """ + id_value = id_start + forbidden_read_new = [] + forbidden_write_new = [] + with open(self.filepath, "w", encoding="ascii") as file: + start_address_int = self._parse_address(start_address) + j = 0 + while id_value - id_start < self.N_TEST: + for i in range(len_d0): + data, wen = self.data_wen() + next_address = ( + start_address_int + + i * self.WIDTH_OF_MEMORY_BYTE * stride0 + + j * self.WIDTH_OF_MEMORY_BYTE * stride1 + ) + if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: + next_address -= self.TOT_MEM_SIZE * 1024 + add = bin(next_address)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, forbidden_read, forbidden_write): + continue + self._record_access(add, wen, forbidden_read_new, forbidden_write_new) + self._write_req(file, id_value, wen, data, add) + id_value += 1 + if id_value - id_start >= self.N_TEST: + break + for _ in range(idle_cycles_between_phases): + self._write_idle(file) + j += 1 + forbidden_read.extend(forbidden_read_new) + forbidden_write.extend(forbidden_write_new) + return id_value + + def gen_3d( + self, + stride0, + len_d0, + stride1, + len_d1, + stride2, + start_address, + id_start, + forbidden_read, + forbidden_write, + idle_cycles_between_phases=0, + ): + """Strided 3-D scan: inner stride0/len_d0, mid stride1/len_d1, outer stride2. + + idle_cycles_between_phases: req=0 cycles inserted after each completed mid-level block. 
+ """ + id_value = id_start + forbidden_read_new = [] + forbidden_write_new = [] + with open(self.filepath, "w", encoding="ascii") as file: + start_address_int = self._parse_address(start_address) + k = 0 + while id_value - id_start < self.N_TEST: + for j in range(len_d1): + for i in range(len_d0): + data, wen = self.data_wen() + next_address = ( + start_address_int + + i * self.WIDTH_OF_MEMORY_BYTE * stride0 + + j * self.WIDTH_OF_MEMORY_BYTE * stride1 + + k * self.WIDTH_OF_MEMORY_BYTE * stride2 + ) + if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: + next_address -= self.TOT_MEM_SIZE * 1024 + add = bin(next_address)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, forbidden_read, forbidden_write): + continue + self._record_access(add, wen, forbidden_read_new, forbidden_write_new) + self._write_req(file, id_value, wen, data, add) + id_value += 1 + if id_value - id_start >= self.N_TEST: + break + if id_value - id_start >= self.N_TEST: + break + for _ in range(idle_cycles_between_phases): + self._write_idle(file) + k += 1 + forbidden_read.extend(forbidden_read_new) + forbidden_write.extend(forbidden_write_new) + return id_value + + def idle_gen(self, id_start): + """Emit a single req=0 idle line (master never issues transactions).""" + with open(self.filepath, "w", encoding="ascii") as file: + self._write_idle(file) + return id_start + + def matmul_phased_gen( + self, + id_start, + forbidden_read, + forbidden_write, + region_base_address, + region_size_bytes, + matmul_ratio_a=1, + matmul_ratio_b=1, + matmul_ratio_c=1, + idle_cycles_between_phases=0, + region_base_address_a=None, + region_size_bytes_a=None, + region_base_address_b=None, + region_size_bytes_b=None, + region_base_address_c=None, + region_size_bytes_c=None, + ): + """Phased traffic modelling matrix multiply: read-A, read-B, write-C. + + Each phase reads/writes its own sub-region. 
If region_base_address_a/b/c + and region_size_bytes_a/b/c are provided, they are used directly for each + phase. Otherwise the combined region [region_base_address, +region_size_bytes) + is split into equal thirds automatically. + + idle_cycles_between_phases: req=0 cycles inserted between each pair of + active phases (after read-A and after read-B, if those phases are non-empty). + Models computation time between memory bursts. + """ + id_value = id_start + forbidden_read_new = [] + forbidden_write_new = [] + access_bytes = max(1, self.DATA_WIDTH // 8) + total_mem_bytes = int(self.TOT_MEM_SIZE * 1024) + + def _resolve_region(base_override, size_override, fallback_base, fallback_size): + if base_override is not None and size_override is not None: + b = self._align_down(int(base_override), access_bytes) + s = self._align_down(int(size_override), access_bytes) + else: + b = self._align_down(int(fallback_base), access_bytes) + s = self._align_down(int(fallback_size), access_bytes) + if b + s > total_mem_bytes: + s = self._align_down(total_mem_bytes - b, access_bytes) + return b, s + + # If per-phase regions are explicitly provided, use them directly + if region_base_address_a is not None and region_size_bytes_a is not None: + a_base, a_size = _resolve_region(region_base_address_a, region_size_bytes_a, 0, 0) + b_base, b_size = _resolve_region(region_base_address_b, region_size_bytes_b, a_base, a_size) + c_base, c_size = _resolve_region(region_base_address_c, region_size_bytes_c, a_base, a_size) + else: + # Auto-split combined region into thirds + base = self._align_down(int(region_base_address), access_bytes) + size = self._align_down(int(region_size_bytes), access_bytes) + if base + size > total_mem_bytes: + size = self._align_down(total_mem_bytes - base, access_bytes) + region_words = size // access_bytes + if region_words < 3: + with open(self.filepath, "w", encoding="ascii") as file: + self._write_idle(file) + return id_value + a_words = max(1, region_words // 3) 
+ b_words = max(1, region_words // 3) + c_words = region_words - a_words - b_words + a_base = base; a_size = a_words * access_bytes + b_base = a_base + a_size; b_size = b_words * access_bytes + c_base = b_base + b_size; c_size = c_words * access_bytes + + a_end = a_base + a_size + b_end = b_base + b_size + c_end = c_base + c_size + + cnt_a, cnt_b, cnt_c = self._phase_counts( + self.N_TEST, + int(matmul_ratio_a), + int(matmul_ratio_b), + int(matmul_ratio_c), + ) + + def _emit_phase(file_obj, count, wen, phase_base, phase_end): + nonlocal id_value + addr = phase_base + for _ in range(count): + data = "0" * self.DATA_WIDTH if wen else self.random_data() + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + self._write_req(file_obj, id_value, wen, data, add) + self._record_access(add, wen, forbidden_read_new, forbidden_write_new) + id_value += 1 + addr += access_bytes + if addr >= phase_end: + addr = phase_base + + with open(self.filepath, "w", encoding="ascii") as file: + _emit_phase(file, cnt_a, 1, a_base, a_end) # read A + if cnt_a > 0 and (cnt_b > 0 or cnt_c > 0): + for _ in range(idle_cycles_between_phases): + self._write_idle(file) + _emit_phase(file, cnt_b, 1, b_base, b_end) # read B + if cnt_b > 0 and cnt_c > 0: + for _ in range(idle_cycles_between_phases): + self._write_idle(file) + _emit_phase(file, cnt_c, 0, c_base, c_end) # write C + + forbidden_read.extend(forbidden_read_new) + forbidden_write.extend(forbidden_write_new) + return id_value diff --git a/target/verif/simvectors/hci_stimuli/processor.py b/target/verif/simvectors/hci_stimuli/processor.py index 2f580be..fa8b51b 100644 --- a/target/verif/simvectors/hci_stimuli/processor.py +++ b/target/verif/simvectors/hci_stimuli/processor.py @@ -1,62 +1,12 @@ -"""Processor helpers: unfold and pad stimuli text files.""" +"""Processor helpers: pad stimuli files to equal length.""" -import os +def pad_txt_files(folder_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT): + """No-op: padding stimuli files to a common length 
is not needed. -# 1) ++UNFOLD++ the transactions in the .txt files into a cycle-level list. -# -folder_path_raw --> String that specifies the path of the folder containing the raw txt files (where the cycle offset is still indicated) -# -folder_path_processed --> String that specifies the path of the folder containing the new txt files created by this function -def unfold_raw_txt(folder_path_raw,folder_path_processed,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH_FACT): - file_names = [file for file in os.listdir(folder_path_raw) if file.endswith(".txt")] - for file in file_names: - filepath_read = os.path.join(folder_path_raw,file) - filepath_write = os.path.join(folder_path_processed, file) - os.makedirs(os.path.dirname(filepath_write),exist_ok=True) - with open(filepath_read, 'r', encoding = "ascii") as file_read: - with open(filepath_write, 'w', encoding="ascii") as file_write: - for line in file_read: - if line != 'zero': - values = line.split() - id = values[0] - cycle_offset = values[1] - wen = values[2] - data = values[3] - add = values[4] - if "log" in file: - for _ in range(int(cycle_offset)-1): - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - else: - for _ in range(int(cycle_offset)-1): - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH_FACT*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - file_write.write('1 ' + id + " " + wen + " " + data + " " + add + "\n") - else: - if "log" in file: - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - else: - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH_FACT*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - - -# 2) ++PAD++ txt files to have the same number of lines -# -Folder_path --> path of the folder containing the txt files to be padded -def pad_txt_files(folder_path,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH_FACT): - file_names = [file for file in os.listdir(folder_path) if 
file.endswith(".txt")] # List of the txt file names in the folder - max_lines = 0 - line_count = {} # Dictionary to store the number of lines in each txt file - # Determining the maximum number of lines among the txt files - for file in file_names: - file_path = os.path.join(folder_path,file) - with open(file_path,'r', encoding = 'ascii') as f: - line_count[file] = sum(1 for _ in f) - max_lines = max(max_lines, line_count[file]) - # Pad files - for file in file_names: - padding_needed = max_lines - line_count[file] - if padding_needed > 0: - file_path = os.path.join(folder_path,file) - with open(file_path, 'a', encoding = 'ascii') as f: - if "log" in file: - for _ in range(padding_needed): - f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - else: - for _ in range(padding_needed): - f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH_FACT*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") + Each application_driver self-terminates when its transaction queue is + exhausted, and later-phase drivers are held in reset via clear_i until + their dependencies assert end_req_o. Cross-file length alignment would + only add unnecessary idle cycles at the end of early-phase files. + """ + pass diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index bbdd853..54ff02b 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -1,29 +1,26 @@ """Stimuli generator (reads JSON configs in `verif/config`). This script is invoked by the top-level Makefile and expects three -JSON config files: workload, testbench and hardware. It produces raw -and processed stimuli in `verif/simvectors/generated`. +JSON config files: workload, testbench and hardware. It produces +cycle-accurate stimuli in `verif/simvectors/generated/stimuli`. 
+ +Each stimuli file has one line per simulation cycle: + req(1b) id(IWb) wen(1b) data(Nb) add(Ab) """ -### LIBRARIES AND DEPENDENCIES ### import json +import math import sys from pathlib import Path import argparse -import numpy as np code_directory = Path(__file__).resolve().parent - -# Try to import the local package `hci_stimuli`. If the running -# environment doesn't include the `simvectors` directory on `sys.path` -# (for example when invoked from a different working directory), add -# `code_directory` to `sys.path` as a minimal fallback. try: - from hci_stimuli import StimuliGenerator, unfold_raw_txt, pad_txt_files + from hci_stimuli import StimuliGenerator, pad_txt_files except Exception: sys.path.insert(0, str(code_directory)) - from hci_stimuli import StimuliGenerator, unfold_raw_txt, pad_txt_files + from hci_stimuli import StimuliGenerator, pad_txt_files def parse_args(argv=None): @@ -31,6 +28,8 @@ def parse_args(argv=None): parser.add_argument('--workload_config', required=True, help="Path to JSON workload configuration file") parser.add_argument('--testbench_config', required=True, help="Path to JSON testbench configuration file") parser.add_argument('--hardware_config', required=True, help="Path to JSON hardware configuration file") + parser.add_argument('--emit_phases_mk', default=None, metavar='PATH', + help="Also write the phases.mk Makefile fragment to PATH") parser.add_argument( '--golden', action='store_true', @@ -52,19 +51,17 @@ def load_config(filename, description): except json.JSONDecodeError as e: print(f"ERROR: Invalid JSON in {description} file: {e}") sys.exit(1) + + ### MAIN ENTRYPOINT ### def main(argv=None): - # parse CLI args args = parse_args(argv) - # load configs hardware_config = load_config(args.hardware_config, "Hardware configuration") testbench_config = load_config(args.testbench_config, "Testbench configuration") workload_config = load_config(args.workload_config, "Workload configuration") - # helpers imported at module-level 
(with a small sys.path fallback) - - # Extract hardware parameters + # Hardware parameters hw_params = hardware_config['parameters'] N_BANKS = hw_params['N_BANKS'] TOT_MEM_SIZE = hw_params['TOT_MEM_SIZE'] @@ -75,136 +72,287 @@ def main(argv=None): N_HWPE = hw_params['N_HWPE'] HWPE_WIDTH_FACT = hw_params['HWPE_WIDTH_FACT'] - # Extract testbench parameters - tb_params = testbench_config['parameters'] - TEST_RATIO = tb_params['TRANSACTION_RATIO'] - N_TEST_LOG = tb_params['N_TRANSACTION_LOG'] - - # Extract workload simulation parameters - workload_sim_params = workload_config['simulation_parameters'] - CYCLE_OFFSET_LOG = workload_sim_params['CYCLE_OFFSET_LOG'] - CYCLE_OFFSET_HWPE = workload_sim_params['CYCLE_OFFSET_HWPE'] - EXACT_OR_MAX_OFFSET = workload_sim_params['EXACT_OR_MAX_OFFSET'] - - # Extract workload master parameters log_masters = workload_config['log_masters'] hwpe_masters = workload_config['hwpe_masters'] # Derived parameters - WIDTH_OF_MEMORY = DATA_WIDTH - WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8 - N_WORDS = (TOT_MEM_SIZE * 1000 / N_BANKS) / WIDTH_OF_MEMORY_BYTE - ADD_WIDTH = int(np.ceil(np.log2(TOT_MEM_SIZE * 1000))) - N_TEST_HWPE = int(N_TEST_LOG * TEST_RATIO) + ADD_WIDTH = math.ceil(math.log2(TOT_MEM_SIZE * 1000)) N_LOG = N_CORE + N_DMA + N_EXT - N_MASTER = N_LOG + N_HWPE - # Match tb_hci_pkg interface ID widths (hci_system-style model). 
- IW_CORES = 8 - IW_HWPE = IW_CORES - if IW_CORES != IW_HWPE: - print("ERROR: stimuli generator currently requires equal log/HWPE interface ID widths") - sys.exit(1) - IW = IW_CORES - CORE_ZERO_FLAG = False - EXT_ZERO_FLAG = False - DMA_ZERO_FLAG = False - HWPE_ZERO_FLAG = False + IW = 8 # Validations if len(log_masters) != N_LOG: print(f"ERROR: Number of log masters in workload config ({len(log_masters)}) doesn't match hardware config N_LOG ({N_LOG})") sys.exit(1) - if len(hwpe_masters) != N_HWPE: print(f"ERROR: Number of HWPE masters in workload config ({len(hwpe_masters)}) doesn't match hardware config N_HWPE ({N_HWPE})") sys.exit(1) - - if (not N_WORDS.is_integer()): - print("ERROR: the number of words is not an integer value") - sys.exit(1) - if (N_MASTER < 1): + if N_LOG + N_HWPE < 1: print("ERROR: the number of masters must be > 0") sys.exit(1) + n_words = (TOT_MEM_SIZE * 1000 / N_BANKS) / (DATA_WIDTH / 8) + if not n_words.is_integer(): + print("ERROR: the number of words is not an integer value") + sys.exit(1) # Prepare output dirs simvectors_dir = code_directory.resolve() generated_dir = (simvectors_dir / 'generated').resolve() - raw_dir = (generated_dir / 'stimuli_raw').resolve() - processed_dir = (generated_dir / 'stimuli_processed').resolve() + stimuli_dir = (generated_dir / 'stimuli').resolve() generated_dir.mkdir(parents=True, exist_ok=True) - raw_dir.mkdir(parents=True, exist_ok=True) - processed_dir.mkdir(parents=True, exist_ok=True) + stimuli_dir.mkdir(parents=True, exist_ok=True) - # Create zero files when a class of masters is absent. We keep the - # original behaviour of creating a single 'zero' file per missing - # class to preserve downstream expectations. 
- def _create_zero_file(path: Path): + def _create_idle_file(path: Path, data_width: int): + """Write a single idle line for a master that is not present in hardware.""" path.parent.mkdir(parents=True, exist_ok=True) - path.write_text('zero', encoding='ascii') + path.write_text( + "0 " + "0" * IW + " 0 " + "0" * data_width + " " + "0" * ADD_WIDTH + "\n", + encoding='ascii', + ) + + CORE_ZERO_FLAG = False + DMA_ZERO_FLAG = False + EXT_ZERO_FLAG = False + HWPE_ZERO_FLAG = False if N_CORE <= 0: CORE_ZERO_FLAG = True N_CORE = 1 - _create_zero_file(raw_dir / 'master_log_0.txt') + _create_idle_file(stimuli_dir / 'master_log_0.txt', DATA_WIDTH) if N_DMA <= 0: DMA_ZERO_FLAG = True N_DMA = 1 - _create_zero_file(raw_dir / f'master_log_{N_CORE}.txt') + _create_idle_file(stimuli_dir / f'master_log_{N_CORE}.txt', DATA_WIDTH) if N_EXT <= 0: EXT_ZERO_FLAG = True N_EXT = 1 - _create_zero_file(raw_dir / f'master_log_{N_CORE + N_DMA}.txt') + _create_idle_file(stimuli_dir / f'master_log_{N_CORE + N_DMA}.txt', DATA_WIDTH) if N_HWPE <= 0: HWPE_ZERO_FLAG = True N_HWPE = 1 - _create_zero_file(raw_dir / 'master_hwpe_0.txt') + _create_idle_file(stimuli_dir / 'master_hwpe_0.txt', HWPE_WIDTH_FACT * DATA_WIDTH) next_start_id = 0 LIST_OF_FORBIDDEN_ADDRESSES_WRITE = [] LIST_OF_FORBIDDEN_ADDRESSES_READ = [] - def _generate_master(filepath: Path, master_config: dict, *, is_hwpe: bool, master_global_idx: int): - """Create StimuliGenerator and run the configured generation method. 
+ # Memory map entries collected during generation, printed at the end + memory_map_entries = [] + + def _bank_of(byte_addr): + return (byte_addr // (DATA_WIDTH // 8)) % N_BANKS + + def _record_memory_map(kind, local_idx, description, config, n_test, data_width, access_bytes, + region_base, region_size, start_address, stride0, len_d0, + stride1, len_d1, stride2, master_config, total_mem_bytes): + label = f"{kind}_{local_idx}" + (f" ({description})" if description else "") + if config == 'idle' or n_test == 0: + memory_map_entries.append({'label': label, 'pattern': config, 'n': 0, + 'info': 'idle - no memory accesses'}) + return + + first_addr = last_addr = None + detail = {} + + if config == 'random': + first_addr = region_base + last_addr = region_base + region_size - access_bytes + detail['region'] = f"0x{region_base:08x} - 0x{region_base + region_size - 1:08x} ({region_size} B)" + tpct = master_config.get('traffic_pct') + if tpct is not None: + rpct = master_config.get('traffic_read_pct', 50) + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + detail['read_pct'] = f"{rpct}%" + elif config == 'matmul_phased': + ra = _parse_maybe_bin_int(master_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(master_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(master_config.get('region_base_address_b'), None) + sb = _parse_maybe_bin_int(master_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(master_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(master_config.get('region_size_bytes_c'), None) + if ra is not None and sa is not None: + # Explicit per-phase regions + a_base, a_size = ra, sa + b_base, b_size = (rb, sb) if rb is not None and sb is not None else (ra, sa) + c_base, c_size = (rc, sc) if rc is not None and sc is not None else (ra, sa) + detail['matrix_A 
(read)'] = f"0x{a_base:08x} - 0x{a_base + a_size - access_bytes:08x} ({a_size} B)" + if int(master_config.get('matmul_ratio_b', 1)) > 0: + detail['matrix_B (read)'] = f"0x{b_base:08x} - 0x{b_base + b_size - access_bytes:08x} ({b_size} B)" + detail['matrix_C (write)'] = f"0x{c_base:08x} - 0x{c_base + c_size - access_bytes:08x} ({c_size} B)" + first_addr = a_base + last_addr = c_base + c_size - access_bytes + else: + # Auto-split combined region into thirds + a_words = max(1, (region_size // access_bytes) // 3) + b_words = max(1, (region_size // access_bytes) // 3) + c_words = (region_size // access_bytes) - a_words - b_words + a_base = region_base + b_base = a_base + a_words * access_bytes + c_base = b_base + b_words * access_bytes + detail['region'] = f"0x{region_base:08x} - 0x{region_base + region_size - 1:08x} ({region_size} B) [auto-split]" + detail['matrix_A (read)'] = f"0x{a_base:08x} - 0x{b_base - access_bytes:08x} ({a_words * access_bytes} B)" + detail['matrix_B (read)'] = f"0x{b_base:08x} - 0x{c_base - access_bytes:08x} ({b_words * access_bytes} B)" + detail['matrix_C (write)'] = f"0x{c_base:08x} - 0x{c_base + c_words * access_bytes - access_bytes:08x} ({c_words * access_bytes} B)" + first_addr = a_base + last_addr = c_base + c_words * access_bytes - access_bytes + if all(k in master_config for k in ('matrix_m', 'matrix_n', 'matrix_k')): + m, n, k = int(master_config['matrix_m']), int(master_config['matrix_n']), int(master_config['matrix_k']) + detail['matrix_dims'] = f"M={m} N={n} K={k} (A: {m}x{k}, B: {k}x{n}, C: {m}x{n})" + idle_between = master_config.get('idle_cycles_between_phases', 0) + if idle_between: + detail['idle_between_phases'] = f"{idle_between} cycles" + elif config == 'linear': + base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) + first_addr = base + last_addr = base + (n_test - 1) * stride0 * access_bytes + last_addr = last_addr % total_mem_bytes + detail['start'] = f"0x{base:08x}" + 
detail['stride'] = f"{stride0} words ({stride0 * access_bytes} B)" + tpct = master_config.get('traffic_pct') + if tpct is not None: + rpct = master_config.get('traffic_read_pct', 50) + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + detail['read_pct'] = f"{rpct}%" + elif config == '2d': + base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) + first_addr = base + last_addr = (base + (len_d0 - 1) * stride0 * access_bytes + + (n_test // max(len_d0, 1) - 1) * stride1 * access_bytes) % total_mem_bytes + detail['dims'] = f"{len_d0} x (n_rows) stride0={stride0} stride1={stride1}" + idle_between = master_config.get('idle_cycles_between_phases', 0) + if idle_between: + detail['idle_between_phases'] = f"{idle_between} cycles" + elif config == '3d': + base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) + first_addr = base + detail['dims'] = f"{len_d0} x {len_d1} x (n_outer) stride0={stride0} stride1={stride1} stride2={stride2}" + last_addr = base # approximate for 3d + idle_between = master_config.get('idle_cycles_between_phases', 0) + if idle_between: + detail['idle_between_phases'] = f"{idle_between} cycles" + + if first_addr is not None: + detail['first_addr'] = f"0x{first_addr:08x} (bank {_bank_of(first_addr)})" + detail['last_addr'] = f"0x{last_addr:08x} (bank {_bank_of(last_addr)})" + detail['transfer'] = f"{n_test} transactions x {data_width // 8} B = {n_test * data_width // 8} B" + + memory_map_entries.append({'label': label, 'pattern': config, 'n': n_test, + 'detail': detail}) + + # (file_path, n_idle_cycles) -- populated during generation, + # prepended as idle lines before padding for static start delays. 
+ pending_start_delays = [] + + def _parse_maybe_bin_int(raw_value, default_value): + """Parse an int or binary/hex/decimal string; return default on failure.""" + if raw_value is None: + return default_value + if isinstance(raw_value, int): + return raw_value + if isinstance(raw_value, str): + v = raw_value.strip() + if not v: + return default_value + if set(v) <= {"0", "1"}: + return int(v, 2) + try: + return int(v, 0) + except ValueError: + return default_value + return default_value + + def _normalize_mem_access_type(raw_value, master_name): + allowed = {"random", "linear", "2d", "3d", "idle", "matmul_phased"} + aliases = {"matmul": "matmul_phased"} + + if not isinstance(raw_value, str): + print( + f"ERROR: {master_name} has invalid mem_access_type={raw_value} " + f"(type={type(raw_value).__name__}). Allowed: {', '.join(sorted(allowed))}" + ) + sys.exit(1) + + key = aliases.get(raw_value.strip().lower(), raw_value.strip().lower()) + if key not in allowed: + print( + f"ERROR: {master_name} has invalid mem_access_type='{raw_value}'. " + f"Allowed: {', '.join(sorted(allowed))}" + ) + sys.exit(1) + return key + + def _warn_if_id_mismatch(master_cfg, expected_idx, master_name): + raw_id = master_cfg.get("id", expected_idx) + try: + cfg_id = int(raw_id) + except (TypeError, ValueError): + print(f"WARNING: {master_name} has non-integer id={raw_id}; positional index {expected_idx} is used.") + return + if cfg_id != expected_idx: + print( + f"WARNING: {master_name} has id={cfg_id} but positional index is {expected_idx}; " + "stimuli-to-driver mapping is positional." 
+ ) + + def _resolve_n_transactions(master_config: dict, mem_access_type: str, data_width: int, kind: str, local_idx: int) -> int: + """Resolve n_transactions from explicit field or geometry, depending on pattern.""" + if 'n_transactions' in master_config: + return int(master_config['n_transactions']) + if mem_access_type == 'linear': + length = master_config.get('length') + if length is not None: + return int(length) + elif mem_access_type == '2d': + len_d0 = master_config.get('len_d0') + len_d1 = master_config.get('len_d1') + if len_d0 is not None and len_d1 is not None: + return int(len_d0) * int(len_d1) + elif mem_access_type == '3d': + len_d0 = master_config.get('len_d0') + len_d1 = master_config.get('len_d1') + len_d2 = master_config.get('len_d2') + if len_d0 is not None and len_d1 is not None and len_d2 is not None: + return int(len_d0) * int(len_d1) * int(len_d2) + elif mem_access_type == 'matmul_phased': + m = master_config.get('matrix_m') + n = master_config.get('matrix_n') + k = master_config.get('matrix_k') + if m is not None and n is not None and k is not None: + return int(m) * int(k) + int(k) * int(n) + int(m) * int(n) + elif mem_access_type == 'idle': + return 0 + print(f"ERROR: {kind}_{local_idx} has mem_access_type='{mem_access_type}' but no " + f"'n_transactions' and no geometry fields to derive it from.") + sys.exit(1) - Parameters: - - filepath: output raw txt file path - - master_config: dict from workload.json for this master - - is_hwpe: whether this is an HWPE master (affects data width and counts) - - master_global_idx: global master id used by the generator - """ + def _generate_master( + filepath: Path, + master_config: dict, + *, + is_hwpe: bool, + master_global_idx: int, + master_local_idx: int, + n_peers_of_kind: int, + ): nonlocal next_start_id data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH - n_test = N_TEST_HWPE if is_hwpe else N_TEST_LOG - cycle_offset = CYCLE_OFFSET_HWPE if is_hwpe else CYCLE_OFFSET_LOG + 
kind = 'master_hwpe' if is_hwpe else 'master_log' master = StimuliGenerator( - IW, WIDTH_OF_MEMORY, N_BANKS, TOT_MEM_SIZE, data_width, ADD_WIDTH, - str(filepath), n_test, EXACT_OR_MAX_OFFSET, cycle_offset, master_global_idx + IW, DATA_WIDTH, N_BANKS, TOT_MEM_SIZE, data_width, ADD_WIDTH, + str(filepath), 0, master_global_idx ) - config = str(master_config.get('mem_access_type', '0')) - start_address = str(master_config.get('start_address', '0')) - stride0 = int(master_config.get('stride0', 0)) - len_d0 = int(master_config.get('len_d0', 0)) - stride1 = int(master_config.get('stride1', 0)) - len_d1 = int(master_config.get('len_d1', 0)) - stride2 = int(master_config.get('stride2', 0)) - - if config == '0': - next_start_id = master.random_gen(next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '1': - next_start_id = master.linear_gen(stride0, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '2': - next_start_id = master.gen_2d(stride0, len_d0, stride1, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '3': - next_start_id = master.gen_3d(stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) + if 'mem_access_type' not in master_config: + print(f"ERROR: {kind}_{master_local_idx} is missing mem_access_type.") + sys.exit(1) - def _gen_hwpe_master(master_idx, master_config, global_idx): - nonlocal next_start_id - filepath = raw_dir / f"master_hwpe_{master_idx}.txt" - master = StimuliGenerator(IW, WIDTH_OF_MEMORY, N_BANKS, TOT_MEM_SIZE, HWPE_WIDTH_FACT * DATA_WIDTH, ADD_WIDTH, - str(filepath), N_TEST_HWPE, EXACT_OR_MAX_OFFSET, CYCLE_OFFSET_HWPE, global_idx) - config = str(master_config.get('mem_access_type', '0')) + config = _normalize_mem_access_type( + master_config['mem_access_type'], + 
f"{kind}_{master_local_idx}", + ) start_address = str(master_config.get('start_address', '0')) stride0 = int(master_config.get('stride0', 0)) len_d0 = int(master_config.get('len_d0', 0)) @@ -212,19 +360,112 @@ def _gen_hwpe_master(master_idx, master_config, global_idx): len_d1 = int(master_config.get('len_d1', 0)) stride2 = int(master_config.get('stride2', 0)) - if config == '0': - next_start_id = master.random_gen(next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '1': - next_start_id = master.linear_gen(stride0, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '2': - next_start_id = master.gen_2d(stride0, len_d0, stride1, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '3': - next_start_id = master.gen_3d(stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) + # Region parameters + total_mem_bytes = int(TOT_MEM_SIZE * 1024) + access_bytes = max(1, int(data_width // 8)) + default_region_size = total_mem_bytes // max(1, n_peers_of_kind) + default_region_base = master_local_idx * default_region_size + + region_base = _parse_maybe_bin_int(master_config.get('region_base_address'), default_region_base) + region_size = _parse_maybe_bin_int(master_config.get('region_size_bytes'), default_region_size) + + region_base = (region_base // access_bytes) * access_bytes + if region_base >= total_mem_bytes: + region_base = region_base % total_mem_bytes + region_size = (max(0, region_size) // access_bytes) * access_bytes + if region_size <= 0: + region_size = (default_region_size // access_bytes) * access_bytes + if region_base + region_size > total_mem_bytes: + region_size = ((total_mem_bytes - region_base) // access_bytes) * access_bytes + + n_test = _resolve_n_transactions(master_config, config, data_width, 
kind, master_local_idx) + master.N_TEST = n_test + + # Start delay: prepend idle lines directly to the stimuli file after generation + start_delay = int(master_config.get('start_delay_cycles', 0)) + if start_delay > 0: + pending_start_delays.append((filepath, start_delay, data_width)) + + if config == 'random': + tpct = master_config.get('traffic_pct') + next_start_id = master.random_gen( + next_start_id, + LIST_OF_FORBIDDEN_ADDRESSES_READ, + LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + region_base=region_base, + region_size=region_size, + traffic_pct=int(tpct) if tpct is not None else 100, + traffic_read_pct=master_config.get('traffic_read_pct'), + ) + elif config == 'linear': + tpct = master_config.get('traffic_pct') + next_start_id = master.linear_gen( + stride0, start_address, next_start_id, + LIST_OF_FORBIDDEN_ADDRESSES_READ, + LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + traffic_pct=int(tpct) if tpct is not None else 100, + traffic_read_pct=master_config.get('traffic_read_pct'), + ) + elif config == '2d': + next_start_id = master.gen_2d( + stride0, len_d0, stride1, start_address, next_start_id, + LIST_OF_FORBIDDEN_ADDRESSES_READ, + LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + idle_cycles_between_phases=int(master_config.get('idle_cycles_between_phases', 0)), + ) + elif config == '3d': + next_start_id = master.gen_3d( + stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, + LIST_OF_FORBIDDEN_ADDRESSES_READ, + LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + idle_cycles_between_phases=int(master_config.get('idle_cycles_between_phases', 0)), + ) + elif config == 'idle': + next_start_id = master.idle_gen(next_start_id) + elif config == 'matmul_phased': + if not is_hwpe: + print( + f"WARNING: mem_access_type='matmul_phased' is typically used for HWPE masters; " + f"{kind}_{master_local_idx} will still use requested phased behavior." 
+ ) + min_region_size = 3 * access_bytes + if region_size < min_region_size: + print( + f"ERROR: {kind}_{master_local_idx} region_size_bytes=" + f"{region_size} is too small for matmul_phased (minimum {min_region_size})." + ) + sys.exit(1) + next_start_id = master.matmul_phased_gen( + next_start_id, + LIST_OF_FORBIDDEN_ADDRESSES_READ, + LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + region_base, + region_size, + int(master_config.get('matmul_ratio_a', 1)), + int(master_config.get('matmul_ratio_b', 1)), + int(master_config.get('matmul_ratio_c', 1)), + idle_cycles_between_phases=int(master_config.get('idle_cycles_between_phases', 0)), + region_base_address_a=_parse_maybe_bin_int(master_config.get('region_base_address_a'), None), + region_size_bytes_a=_parse_maybe_bin_int(master_config.get('region_size_bytes_a'), None), + region_base_address_b=_parse_maybe_bin_int(master_config.get('region_base_address_b'), None), + region_size_bytes_b=_parse_maybe_bin_int(master_config.get('region_size_bytes_b'), None), + region_base_address_c=_parse_maybe_bin_int(master_config.get('region_base_address_c'), None), + region_size_bytes_c=_parse_maybe_bin_int(master_config.get('region_size_bytes_c'), None), + ) + + _record_memory_map( + kind, master_local_idx, + master_config.get('description', ''), + config, n_test, data_width, access_bytes, + region_base, region_size, + start_address, stride0, len_d0, stride1, len_d1, stride2, + master_config, total_mem_bytes, + ) global_idx = 0 - # Generate logarithmic masters (CORE, DMA, EXT) in order + + # Generate LOG masters (CORE, DMA, EXT) in order for i in range(N_LOG): - # determine class of this master (core/dma/ext) if i < N_CORE: if CORE_ZERO_FLAG: global_idx += 1 @@ -239,66 +480,175 @@ def _gen_hwpe_master(master_idx, master_config, global_idx): continue master_cfg = log_masters[i] - _generate_master(raw_dir / f"master_log_{i}.txt", master_cfg, is_hwpe=False, master_global_idx=global_idx) + _warn_if_id_mismatch(master_cfg, i, f"master_log_{i}") + 
_generate_master( + stimuli_dir / f"master_log_{i}.txt", + master_cfg, + is_hwpe=False, + master_global_idx=global_idx, + master_local_idx=i, + n_peers_of_kind=max(1, N_LOG), + ) global_idx += 1 - # Generate HWPE masters; their global index follows the previous masters + # Generate HWPE masters for hw_idx in range(N_HWPE): if HWPE_ZERO_FLAG: global_idx += 1 continue master_cfg = hwpe_masters[hw_idx] - _generate_master(raw_dir / f"master_hwpe_{hw_idx}.txt", master_cfg, is_hwpe=True, master_global_idx=global_idx) + _warn_if_id_mismatch(master_cfg, hw_idx, f"master_hwpe_{hw_idx}") + _generate_master( + stimuli_dir / f"master_hwpe_{hw_idx}.txt", + master_cfg, + is_hwpe=True, + master_global_idx=global_idx, + master_local_idx=hw_idx, + n_peers_of_kind=max(1, N_HWPE), + ) global_idx += 1 - print("STEP 0 COMPLETED: create raw txt files") - - # Process raw files - simvector_raw_path = str(raw_dir) - simvector_processed_path = str((raw_dir.parent / 'stimuli_processed').resolve()) - unfold_raw_txt(simvector_raw_path, simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT) - print("STEP 1 COMPLETED: unfold txt files") - - pad_txt_files(simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT) - print("STEP 2 COMPLETED: pad txt files") - + print("STEP 0 COMPLETED: generate stimuli files") + + # ----------------------------------------------------------------------- + # Compute WAIT_MASKS and emit phases.mk + # ----------------------------------------------------------------------- + N_DRIVERS = N_LOG + N_HWPE + + phase_to_drivers: dict[str, list[int]] = {} + driver_phases: list[str] = [] + + for i, m in enumerate(log_masters): + phase = str(m.get('phase', 'default')) + driver_phases.append(phase) + phase_to_drivers.setdefault(phase, []).append(i) + + for i, m in enumerate(hwpe_masters): + gidx = N_LOG + i + phase = str(m.get('phase', 'default')) + driver_phases.append(phase) + phase_to_drivers.setdefault(phase, []).append(gidx) + + wait_masks: 
list[int] = [0] * N_DRIVERS + + for i, m in enumerate(log_masters): + mask = 0 + for dep_phase in m.get('wait_for', []): + for dep_drv in phase_to_drivers.get(str(dep_phase), []): + mask |= (1 << dep_drv) + wait_masks[i] = mask + + for i, m in enumerate(hwpe_masters): + gidx = N_LOG + i + mask = 0 + for dep_phase in m.get('wait_for', []): + for dep_drv in phase_to_drivers.get(str(dep_phase), []): + mask |= (1 << dep_drv) + wait_masks[gidx] = mask + + hex_width = max(1, (N_DRIVERS + 3) // 4) + mask_literals = [f"{N_DRIVERS}'h{wait_masks[i]:0{hex_width}x}" for i in range(N_DRIVERS)] + wait_masks_param = "'{" + ", ".join(mask_literals) + "}" + + if args.emit_phases_mk: + phases_mk_path = Path(args.emit_phases_mk) + phases_mk_path.parent.mkdir(parents=True, exist_ok=True) + phases_mk_path.write_text( + "# Auto-generated by main.py - DO NOT EDIT MANUALLY\n" + "# Per-driver dependency masks for tb_hci.sv (WAIT_MASKS_PARAM).\n" + f"# Drivers 0..{N_LOG-1} = log masters, {N_LOG}..{N_DRIVERS-1} = HWPE masters.\n" + f"WAIT_MASKS_PARAM := {wait_masks_param}\n", + encoding='utf-8', + ) + print(f"PHASES.MK written: {phases_mk_path}") + + # ----------------------------------------------------------------------- + # Build and emit memory map report + # ----------------------------------------------------------------------- + word_bytes = DATA_WIDTH // 8 + bank_stride_bytes = N_BANKS * word_bytes + lines = [] + lines.append("=" * 72) + lines.append("MEMORY MAP REPORT") + lines.append(f" Total memory : {TOT_MEM_SIZE} KiB ({TOT_MEM_SIZE * 1024} B)") + lines.append(f" Banks : {N_BANKS} x {word_bytes} B/word (interleaved, stride {bank_stride_bytes} B)") + lines.append(f" Data width : {DATA_WIDTH} b LOG / {HWPE_WIDTH_FACT * DATA_WIDTH} b HWPE") + lines.append("=" * 72) + for entry in memory_map_entries: + lines.append(f"\n [{entry['label']}] pattern={entry['pattern']} n_transactions={entry['n']}") + if 'info' in entry: + lines.append(f" {entry['info']}") + for k, v in 
entry.get('detail', {}).items(): + lines.append(f" {k:<14}: {v}") + lines.append("") + lines.append(" Phase / dependency map:") + for phase, drivers in sorted(phase_to_drivers.items()): + driver_names = [f"log_{d}" if d < N_LOG else f"hwpe_{d - N_LOG}" for d in drivers] + lines.append(f" phase '{phase}': {', '.join(driver_names)}") + for i in range(N_DRIVERS): + if wait_masks[i]: + name = f"log_{i}" if i < N_LOG else f"hwpe_{i - N_LOG}" + deps = [f"log_{j}" if j < N_LOG else f"hwpe_{j - N_LOG}" + for j in range(N_DRIVERS) if wait_masks[i] & (1 << j)] + lines.append(f" {name} waits for: {', '.join(deps)}") + lines.append("=" * 72) + + report_text = "\n".join(lines) + "\n" + print("\n" + report_text) + memory_map_path = generated_dir / 'memory_map.txt' + memory_map_path.write_text(report_text, encoding='utf-8') + print(f"Memory map written: {memory_map_path}") + + # ----------------------------------------------------------------------- + # Apply per-master start delays + # ----------------------------------------------------------------------- + for fpath, delay, dw in pending_start_delays: + if fpath.exists(): + idle_line = "0 " + "0" * IW + " 0 " + "0" * dw + " " + "0" * ADD_WIDTH + "\n" + original = fpath.read_text(encoding='ascii') + fpath.write_text(idle_line * delay + original, encoding='ascii') + + # ----------------------------------------------------------------------- + # Pad all stimuli files to equal length + # ----------------------------------------------------------------------- + pad_txt_files(str(stimuli_dir), IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT) + print("STEP 1 COMPLETED: pad stimuli files") + + # ----------------------------------------------------------------------- + # Golden vectors + # ----------------------------------------------------------------------- if args.golden: golden_dir = (generated_dir / 'golden').resolve() golden_dir.mkdir(parents=True, exist_ok=True) - for stim_path in sorted(processed_dir.glob('master_*.txt')): + for 
stim_path in sorted(stimuli_dir.glob('master_*.txt')): try: text = stim_path.read_text(encoding='ascii') except OSError: continue - if text.strip() == 'zero': - continue - mem = {} out_lines = [] for raw_line in text.splitlines(): line = raw_line.strip() if not line: continue - parts = line.split() if len(parts) != 5: continue - req_s, id_s, wen_s, data_s, add_s = parts if req_s != '1': continue - if wen_s == '0': mem[add_s] = data_s continue - exp_s = mem.get(add_s, '1' * len(data_s)) out_lines.append(f"{id_s} {add_s} {exp_s}") - (golden_dir / f"golden_{stim_path.name}").write_text("\n".join(out_lines) + ("\n" if out_lines else ""), encoding='ascii') - print("STEP 3 COMPLETED: golden vectors") + (golden_dir / f"golden_{stim_path.name}").write_text( + "\n".join(out_lines) + ("\n" if out_lines else ""), encoding='ascii' + ) + print("STEP 2 COMPLETED: golden vectors") if __name__ == '__main__': diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index d921563..d4697e9 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -85,7 +85,8 @@ module application_driver #( // Requests FSM // ////////////////// - typedef enum logic [1:0] { + typedef enum logic [2:0] { + REQ_RESET, REQ_IDLE, WAIT_GNT, REQ_DONE, @@ -102,7 +103,7 @@ module application_driver #( // Sequential logic always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni || clear_i) begin - req_state_q <= REQ_IDLE; + req_state_q <= REQ_RESET; tr_idx_q <= '0; n_req_issued_q <= '0; n_rd_req_issued_q <= '0; @@ -142,6 +143,11 @@ module application_driver #( // inputs: gnt, r_data, r_valid, r_user, r_id case (req_state_q) + REQ_RESET: begin + if (!clear_i) begin + req_state_d = REQ_IDLE; + end + end REQ_IDLE: begin // Check if there are still transactions to issue if (tr_idx_q < transactions.size()) begin @@ -176,10 +182,6 @@ module application_driver #( req_state_d = RSP_DONE; end end - // Synchronously clear - if (clear_i) 
begin - req_state_d = REQ_IDLE; - end end WAIT_GNT: begin hci_if.req = 1'b1; @@ -215,10 +217,6 @@ module application_driver #( req_state_d = WAIT_GNT; end end - // Synchronously clear - if (clear_i) begin - req_state_d = REQ_IDLE; - end end REQ_DONE: begin end_req_o = 1'b1; @@ -228,21 +226,13 @@ module application_driver #( end else begin req_state_d = REQ_DONE; end - // Synchronously clear - if (clear_i) begin - req_state_d = REQ_IDLE; - end end RSP_DONE: begin end_req_o = 1'b1; end_resp_o = 1'b1; - // Synchronously clear - if (clear_i) begin - req_state_d = REQ_IDLE; - end end default: begin - req_state_d = REQ_IDLE; + req_state_d = REQ_RESET; end endcase end @@ -253,7 +243,8 @@ module application_driver #( // We only consider read responses as write responses are not mandatory in HCI - typedef enum logic { + typedef enum logic [1:0] { + RESP_RESET, RESP_IDLE, RESP_WAIT_RVALID } resp_state_t; @@ -284,17 +275,17 @@ module application_driver #( n_rd_in_flight_d = n_rd_in_flight_q; case (resp_state_q) + RESP_RESET: begin + if (!clear_i) begin + resp_state_d = RESP_IDLE; + end + end RESP_IDLE: begin // If a read request is granted, increment in-flight counter and go to RESP_WAIT_RVALID if (hci_if.req && hci_if.wen && hci_if.gnt) begin n_rd_in_flight_d = n_rd_in_flight_q + 1; resp_state_d = RESP_WAIT_RVALID; end - // Synchronously clear - if (clear_i) begin - resp_state_d = RESP_IDLE; - n_rd_in_flight_d = '0; - end end RESP_WAIT_RVALID: begin // Track newly granted reads while waiting @@ -316,15 +307,9 @@ module application_driver #( resp_state_d = RESP_IDLE; end end - // Synchronously clear - if (clear_i) begin - resp_state_d = RESP_IDLE; - n_rd_in_flight_d = '0; - end end default: begin - resp_state_d = RESP_IDLE; - n_rd_in_flight_d = '0; + resp_state_d = RESP_RESET; end endcase end diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index 28f8b52..e8b02bd 100644 --- a/target/verif/src/simulation_report.sv +++ 
b/target/verif/src/simulation_report.sv @@ -20,11 +20,11 @@ module simulation_report import tb_hci_pkg::*; ( - input logic [0:N_DRIVERS-1] end_stimuli_i, - input logic [0:N_DRIVERS-1] end_latency_i, - input real throughput_complete_i, - input real stim_latency_i, - input real tot_latency_i, + input logic [N_DRIVERS-1:0] end_req_i, + input logic [N_DRIVERS-1:0] end_resp_i, + input real throughput_complete_i, + input real stim_latency_i, + input real tot_latency_i, input real latency_per_master_i[N_DRIVERS], input real sum_req_to_gnt_latency_log_i[N_DRIVERS-N_HWPE], input real sum_req_to_gnt_latency_hwpe_i[N_HWPE], @@ -60,6 +60,13 @@ module simulation_report int unsigned log_masters_with_grants; int unsigned hwpe_masters_with_grants; logic missing_reads; + // Ideal bandwidth: maximum data the memory system can serve per cycle. + // Memory side: N_BANKS narrow ports, each DATA_WIDTH bits wide. + real ideal_bw_mem_side_bpc; // bits per cycle (memory-side ceiling) + // Master side: sum of all master bandwidths at 100% traffic. + real ideal_bw_master_side_bpc; // bits per cycle (master-side ceiling) + real ideal_bw_bpc; // min(mem, master) = bottleneck ideal BW + real actual_bw_utilization; // throughput_complete / ideal_bw sum_req_to_gnt_latency_all = 0.0; average_req_to_gnt_latency_weighted = 0.0; @@ -81,9 +88,9 @@ module simulation_report hwpe_masters_with_grants = '0; missing_reads = 1'b0; - wait (&end_stimuli_i); + wait (&end_req_i); wait (stim_latency_i >= 0); - wait (&end_latency_i); + wait (&end_resp_i); wait (throughput_complete_i >= 0); wait (tot_latency_i >= 0); for (int i = 0; i < N_CORE; i++) begin @@ -184,6 +191,19 @@ module simulation_report average_req_to_gnt_latency_hwpe_unweighted / real'(hwpe_masters_with_grants); end + // Ideal bandwidth computation. + // Memory side: each of the N_BANKS banks can serve one DATA_WIDTH word per cycle. 
+ ideal_bw_mem_side_bpc = real'(N_BANKS) * real'(DATA_WIDTH); + // Master side: N_LOG_MASTERS narrow ports + N_HWPE wide ports, all at 100% traffic. + ideal_bw_master_side_bpc = real'(N_LOG_MASTERS) * real'(DATA_WIDTH) + + real'(N_HWPE) * real'(HWPE_WIDTH_FACT * DATA_WIDTH); + // Bottleneck = minimum of the two sides. + ideal_bw_bpc = (ideal_bw_mem_side_bpc < ideal_bw_master_side_bpc) + ? ideal_bw_mem_side_bpc : ideal_bw_master_side_bpc; + // Utilization = actual / ideal. + actual_bw_utilization = (ideal_bw_bpc > 0.0) + ? (throughput_complete_i / ideal_bw_bpc * 100.0) : 0.0; + $display("------ Simulation Summary ------"); $display("\\\\HW CONFIG\\\\"); $display( @@ -205,10 +225,24 @@ module simulation_report $display("\n\\\\BANDWIDTH\\\\"); $display( - "Completion bandwidth (writes granted + reads completed): %0.2f bit/cycle", - throughput_complete_i + "Ideal BW (memory side): %0.0f bit/cycle [%0d banks x %0d bits]", + ideal_bw_mem_side_bpc, N_BANKS, DATA_WIDTH + ); + $display( + "Ideal BW (master side): %0.0f bit/cycle [%0d log x %0d bits + %0d hwpe x %0d bits]", + ideal_bw_master_side_bpc, + N_LOG_MASTERS, DATA_WIDTH, + N_HWPE, HWPE_WIDTH_FACT * DATA_WIDTH + ); + $display( + "Ideal BW (bottleneck): %0.0f bit/cycle", + ideal_bw_bpc + ); + $display( + "Actual BW (completion): %0.2f bit/cycle [utilization: %0.1f%%]", + throughput_complete_i, actual_bw_utilization ); - $display("Stimulus phase duration: %0.2f cycles", stim_latency_i); + $display("Stimulus phase duration: %0.2f cycles", stim_latency_i); $display("Completion phase duration: %0.2f cycles", tot_latency_i); $display( "Granted transactions: reads=%0d writes=%0d total=%0d", diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 2413371..840c452 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -27,7 +27,8 @@ module tb_hci (); logic clk, rst_n; - logic s_clear; + logic [N_DRIVERS-1:0] s_end_req; // end_req_o from all drivers, [N_DRIVERS-1:0] ordering + logic 
[N_DRIVERS-1:0] s_clear_drv; // per-driver clear_i (held 1 until dependencies done) hci_interconnect_ctrl_t s_hci_ctrl; clk_rst_gen #( @@ -194,6 +195,16 @@ module tb_hci ); end end else if (INTERCO_TYPE == MUX) begin : gen_hwpe_mux + // In MUX mode, sel_i must point at the currently active HWPE (the one not gated by + // clear_i). Priority: lowest index wins if multiple are somehow simultaneously active. + logic [$clog2(N_HWPE > 1 ? N_HWPE-1 : 1):0] s_mux_sel; + always_comb begin + s_mux_sel = '0; + for (int i = N_HWPE-1; i >= 0; i--) begin + if (!s_clear_drv[N_LOG_MASTERS + i]) + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); + end + end hci_core_mux_static #( .NB_CHAN(N_HWPE), .`HCI_SIZE_PARAM(in)(HCI_SIZE_hwpe) @@ -201,7 +212,7 @@ module tb_hci .clk_i(clk), .rst_ni(rst_n), .clear_i(s_clear), - .sel_i('0), + .sel_i(s_mux_sel), .in(hci_driver_hwpe_if), .out(hci_initiator_wide[0]) ); @@ -227,11 +238,34 @@ module tb_hci end endgenerate + logic s_clear; assign s_clear = 1'b0; + assign s_hci_ctrl.arb_policy = ARBITER_MODE; assign s_hci_ctrl.invert_prio = INVERT_PRIO; assign s_hci_ctrl.low_prio_max_stall = LOW_PRIO_MAX_STALL; + // Driver clear logic: driver i is held in reset until all drivers j in its effective wait + // mask have asserted end_req_o. If the effective mask is zero, the driver starts immediately. + // + // In MUX mode the user-defined WAIT_MASKS are ignored for HWPE drivers: instead a strict + // sequential chain is enforced (HWPE i waits for HWPE i-1), because hci_core_mux_static + // only forwards one HWPE at a time. HWPE ordering follows the index = position in + // hwpe_masters[] in workload.json. LOG master masks are always taken from WAIT_MASKS. 
+ generate + for (genvar ii = 0; ii < N_DRIVERS; ii++) begin : gen_driver_clear + logic [N_DRIVERS-1:0] eff_mask; + if (INTERCO_TYPE == MUX && ii >= N_LOG_MASTERS) begin : gen_mux_mask + // HWPE 0 (ii == N_LOG_MASTERS): no wait; HWPE k waits for HWPE k-1 + assign eff_mask = (ii == N_LOG_MASTERS) ? '0 : (N_DRIVERS'(1) << (ii - 1)); + end else begin : gen_default_mask + assign eff_mask = WAIT_MASKS[ii]; + end + assign s_clear_drv[ii] = (eff_mask != '0) && + ((s_end_req & eff_mask) != eff_mask); + end + endgenerate + hci_interconnect #( .N_HWPE(N_WIDE_HCI), .N_CORE(N_NARROW_HCI), @@ -284,15 +318,14 @@ module tb_hci // Application drivers // ///////////////////////// - logic [0:N_DRIVERS-1] s_end_stimuli; - logic [0:N_DRIVERS-1] s_end_latency; + logic [N_DRIVERS-1:0] s_end_resp; int unsigned s_issued_transactions[0:N_DRIVERS-1]; int unsigned s_issued_read_transactions[0:N_DRIVERS-1]; generate for (genvar ii = 0; ii < N_LOG_MASTERS; ii++) begin : gen_app_driver_log localparam string STIM_FILE_LOG = - $sformatf("../simvectors/generated/stimuli_processed/master_log_%0d.txt", ii); + $sformatf("../simvectors/generated/stimuli/master_log_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), .DATA_WIDTH(DATA_WIDTH), @@ -302,10 +335,10 @@ module tb_hci ) i_app_driver_log ( .clk_i(clk), .rst_ni(rst_n), - .clear_i(1'b0), + .clear_i(s_clear_drv[ii]), .hci_if(hci_driver_log_if[ii]), - .end_req_o(s_end_stimuli[ii]), - .end_resp_o(s_end_latency[ii]), + .end_req_o(s_end_req[ii]), + .end_resp_o(s_end_resp[ii]), .n_issued_tr_o(s_issued_transactions[ii]), .n_issued_rd_tr_o(s_issued_read_transactions[ii]), .n_retired_rd_tr_o() @@ -316,7 +349,7 @@ module tb_hci generate for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_app_driver_hwpe localparam string STIM_FILE_HWPE = - $sformatf("../simvectors/generated/stimuli_processed/master_hwpe_%0d.txt", ii); + $sformatf("../simvectors/generated/stimuli/master_hwpe_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), 
.DATA_WIDTH(HWPE_WIDTH_FACT * DATA_WIDTH), @@ -326,10 +359,10 @@ module tb_hci ) i_app_driver_hwpe ( .clk_i(clk), .rst_ni(rst_n), - .clear_i(1'b0), + .clear_i(s_clear_drv[N_LOG_MASTERS + ii]), .hci_if(hci_driver_hwpe_if[ii]), - .end_req_o(s_end_stimuli[N_LOG_MASTERS + ii]), - .end_resp_o(s_end_latency[N_LOG_MASTERS + ii]), + .end_req_o(s_end_req[N_LOG_MASTERS + ii]), + .end_resp_o(s_end_resp[N_LOG_MASTERS + ii]), .n_issued_tr_o(s_issued_transactions[N_LOG_MASTERS + ii]), .n_issued_rd_tr_o(s_issued_read_transactions[N_LOG_MASTERS + ii]), .n_retired_rd_tr_o() @@ -366,8 +399,8 @@ module tb_hci ) i_throughput_monitor ( .clk_i(clk), .rst_ni(rst_n), - .end_stimuli_i(s_end_stimuli), - .end_latency_i(s_end_latency), + .end_req_i(s_end_req), + .end_resp_i(s_end_resp), .n_read_complete_log_i(N_READ_COMPLETE_TRANSACTIONS_LOG), .n_read_complete_hwpe_i(N_READ_COMPLETE_TRANSACTIONS_HWPE), .n_write_granted_log_i(N_WRITE_GRANTED_TRANSACTIONS_LOG), @@ -399,8 +432,8 @@ module tb_hci ); simulation_report i_simulation_report ( - .end_stimuli_i(s_end_stimuli), - .end_latency_i(s_end_latency), + .end_req_i(s_end_req), + .end_resp_i(s_end_resp), .throughput_complete_i(throughput_completed), .stim_latency_i(stim_latency), .tot_latency_i(tot_latency), diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index cfcb3c6..f8812fd 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -33,11 +33,6 @@ package tb_hci_pkg; localparam time APPL_DELAY = 0; localparam unsigned RST_CLK_CYCLES = `ifdef RST_CLK_CYCLES `RST_CLK_CYCLES `else 10 `endif; - // Transaction counts - localparam int unsigned N_TRANSACTION_LOG = `ifdef N_TRANSACTION_LOG `N_TRANSACTION_LOG `else 10 `endif; - localparam int unsigned TRANSACTION_RATIO = `ifdef TRANSACTION_RATIO `TRANSACTION_RATIO `else 1 `endif; - localparam int unsigned N_TRANSACTION_HWPE = int'(N_TRANSACTION_LOG * TRANSACTION_RATIO); - // TCDM and arbitration parameters localparam int unsigned RANDOM_GNT = 
`ifdef RANDOM_GNT `RANDOM_GNT `else 0 `endif; localparam int unsigned ARBITER_MODE = 0; @@ -82,6 +77,13 @@ package tb_hci_pkg; localparam int unsigned N_LOG_MASTERS = N_CORE + N_DMA + N_EXT; localparam int unsigned N_DRIVERS = N_LOG_MASTERS + N_HWPE; + // Per-driver dependency masks: WAIT_MASKS[i][j]=1 means driver i must wait for driver j's + // end_req_o before starting. Generated by main.py, passed via WAIT_MASKS_PARAM define. + // NOTE: in MUX mode these masks are ignored for HWPE drivers and replaced by a sequential + // chain in tb_hci.sv. + localparam logic [N_DRIVERS-1:0] WAIT_MASKS [N_DRIVERS] = + `ifdef WAIT_MASKS_PARAM `WAIT_MASKS_PARAM `else '{default: '0} `endif; + // If fully log interconnect is used, instantiate HWPE_WIDTH_FACT narrow ports for each HWPE. localparam int unsigned N_NARROW_HCI = N_CORE + (N_HWPE * HWPE_WIDTH_FACT) * (INTERCO_TYPE == LOG); @@ -104,10 +106,6 @@ package tb_hci_pkg; localparam int unsigned N_WORDS = (TOT_MEM_SIZE * 1024 / N_BANKS) / WIDTH_OF_MEMORY_BYTE; - localparam int unsigned TOT_CHECK = - N_TRANSACTION_LOG * N_LOG_MASTERS + - N_HWPE * N_TRANSACTION_HWPE * HWPE_WIDTH_FACT; - /////////////// // Bitwidths // /////////////// @@ -194,22 +192,6 @@ package tb_hci_pkg; index = address[BIT_BANK_INDEX-1 + 2:2]; endtask - task calculate_theoretical_throughput(output real throughput_theo); - real tot_data, band_memory_limit, tot_time; - if (TRANSACTION_RATIO >= 1) begin - tot_time = N_TRANSACTION_HWPE; - end else begin - tot_time = N_TRANSACTION_LOG; - end - tot_data = (N_TRANSACTION_LOG * DATA_WIDTH) * N_LOG_MASTERS + - (N_TRANSACTION_HWPE * HWPE_WIDTH_FACT * DATA_WIDTH) * N_HWPE; - throughput_theo = tot_data / tot_time; - band_memory_limit = real'(N_BANKS * DATA_WIDTH); - if (throughput_theo >= band_memory_limit) begin - throughput_theo = band_memory_limit; - end - endtask - function int unsigned get_bank_local_address(input logic [ADDR_WIDTH-1:0] addr_i); logic [ADDR_WIDTH-1:0] mapped_addr; logic 
[ADDR_WIDTH-BIT_BANK_INDEX-1:0] bank_local_addr; diff --git a/target/verif/src/throughput_monitor.sv b/target/verif/src/throughput_monitor.sv index bc7294f..9a7e51b 100644 --- a/target/verif/src/throughput_monitor.sv +++ b/target/verif/src/throughput_monitor.sv @@ -26,8 +26,8 @@ module throughput_monitor #( ) ( input logic clk_i, input logic rst_ni, - input logic [0:N_MASTER-1] end_stimuli_i, - input logic [0:N_MASTER-1] end_latency_i, + input logic [N_MASTER-1:0] end_req_i, + input logic [N_MASTER-1:0] end_resp_i, // Read transactions number input int unsigned n_read_complete_log_i[N_MASTER-N_HWPE], input int unsigned n_read_complete_hwpe_i[N_HWPE], @@ -52,7 +52,7 @@ module throughput_monitor #( #(CLK_PERIOD/100); @(posedge clk_i); start_time = $time; - wait (&end_stimuli_i); + wait (&end_req_i); end_time = $time; stim_time_cycles = real'(end_time - start_time) / real'(CLK_PERIOD); // cycles stim_latency_o = stim_time_cycles; @@ -69,7 +69,7 @@ module throughput_monitor #( #(CLK_PERIOD/100); @(posedge clk_i); start_time = $time; - wait (&end_latency_i); + wait (&end_resp_i); end_time = $time; completion_time_cycles = real'(end_time - start_time) / real'(CLK_PERIOD); // cycles tot_latency_o = completion_time_cycles; @@ -101,7 +101,7 @@ module throughput_monitor #( #(CLK_PERIOD/100); @(posedge clk_i); start_time = $time; - wait (end_latency_i[ii]); + wait (end_resp_i[ii] == 1'b1); end_time = $time; latency_per_master_o[ii] = real'(end_time - start_time) / real'(CLK_PERIOD); end diff --git a/target/verif/verif.mk b/target/verif/verif.mk index 155fecb..3e1ceaa 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -11,6 +11,9 @@ HCI_VERIF_CFG_GEN_DIR = $(HCI_VERIF_CFG_DIR)/generated # Include generated Makefiles include $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk include $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk +ifeq (,$(filter clean%,$(MAKECMDGOALS))) +-include $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk +endif # Bender targets and defines include 
$(HCI_VERIF_DIR)/bender.mk @@ -28,6 +31,30 @@ SIM_VOPT ?= $(SIM_QUESTA) vopt PYTHON ?= python3 +################## +# Simvectors gen # +################## + +GEN_STIM_SCRIPT := $(HCI_VERIF_DIR)/simvectors/main.py +STIM_SRC_FILES := $(shell find $(HCI_VERIF_DIR)/config -type f -not -path '$(HCI_VERIF_CFG_GEN_DIR)/*') \ + $(shell find $(HCI_VERIF_DIR)/simvectors -type f -not -path '$(HCI_VERIF_DIR)/simvectors/generated/*') +SIMVECTORS_GEN_DIR := $(HCI_VERIF_DIR)/simvectors/generated + +.PHONY: stim-verif +stim-verif: $(SIMVECTORS_GEN_DIR)/.stim_stamp +$(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(STIM_SRC_FILES) $(GEN_STIM_SCRIPT) | $(HCI_VERIF_CFG_GEN_DIR) + mkdir -p $(SIMVECTORS_GEN_DIR) + $(PYTHON) $(GEN_STIM_SCRIPT) \ + --workload_config $(WORKLOAD_JSON) \ + --testbench_config $(TESTBENCH_JSON) \ + --hardware_config $(HARDWARE_JSON) \ + --emit_phases_mk $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk + date > $@ + +.PHONY: clean-stim-verif +clean-stim-verif: + rm -rf $(SIMVECTORS_GEN_DIR) + ############## # Config gen # ############## @@ -58,29 +85,7 @@ $(HCI_VERIF_CFG_GEN_DIR): .PHONY: clean-config-verif clean-config-verif: - rm -f $(VERIF_CFG_MK) - -################## -# Simvectors gen # -################## - -GEN_STIM_SCRIPT := $(HCI_VERIF_DIR)/simvectors/main.py -STIM_SRC_FILES := $(shell find {$(HCI_VERIF_DIR)/config,$(HCI_VERIF_DIR)/simvectors} -type f) -SIMVECTORS_GEN_DIR := $(HCI_VERIF_DIR)/simvectors/generated - -.PHONY: stim-verif -stim-verif: $(SIMVECTORS_GEN_DIR)/.stim_stamp -$(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(STIM_SRC_FILES) - mkdir -p $(SIMVECTORS_GEN_DIR) - $(PYTHON) $(GEN_STIM_SCRIPT) \ - --workload_config $(WORKLOAD_JSON) \ - --testbench_config $(TESTBENCH_JSON) \ - --hardware_config $(HARDWARE_JSON) - date > $@ - -.PHONY: clean-stim-verif -clean-stim-verif: - rm -rf $(SIMVECTORS_GEN_DIR) + rm -f $(VERIF_CFG_MK) $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk ############## # Simulation # @@ -113,9 +118,13 @@ ifeq ($(GUI),0) 
SIM_HCI_VSIM_ARGS += -c endif -$(HCI_VERIF_DIR)/vsim/compile.tcl: $(HCI_ROOT)/Bender.lock $(HCI_ROOT)/Bender.yml $(HCI_ROOT)/bender.mk $(HCI_VERIF_DIR)/bender.mk $(SIM_SRC_FILES) $(VERIF_CFG_MK) +WAIT_MASKS_MK := $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk +WAIT_MASKS_PARAM = $(shell grep '^WAIT_MASKS_PARAM' $(WAIT_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) + +$(HCI_VERIF_DIR)/vsim/compile.tcl: $(HCI_ROOT)/Bender.lock $(HCI_ROOT)/Bender.yml $(HCI_ROOT)/bender.mk $(HCI_VERIF_DIR)/bender.mk $(SIM_SRC_FILES) $(VERIF_CFG_MK) $(SIMVECTORS_GEN_DIR)/.stim_stamp mkdir -p $(HCI_VERIF_DIR)/vsim - $(BENDER) script vsim $(COMMON_DEFS) $(VERIF_DEFS) $(COMMON_TARGS) $(VERIF_TARGS) --vlog-arg="$(SIM_HCI_VLOG_ARGS)" > $@ + $(BENDER) script vsim $(COMMON_DEFS) $(VERIF_DEFS) $(COMMON_TARGS) $(VERIF_TARGS) \ + --vlog-arg="$(SIM_HCI_VLOG_ARGS) \"+define+WAIT_MASKS_PARAM=$(WAIT_MASKS_PARAM)\"" > $@ .PHONY: compile-verif compile-verif: $(sim_vsim_lib)/.hw_compiled From 02503990f1f9fcd81dc5c6910840f8f9343e4128 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 9 Mar 2026 13:28:21 +0100 Subject: [PATCH 11/25] tb: Fix error in testing of static mux config --- target/verif/src/tb_hci.sv | 23 +++++++++++++++-------- target/verif/verif.mk | 4 +++- target/verif/vsim/tb_hci.tcl | 5 +++-- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 840c452..63b720d 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -28,6 +28,7 @@ module tb_hci logic clk, rst_n; logic [N_DRIVERS-1:0] s_end_req; // end_req_o from all drivers, [N_DRIVERS-1:0] ordering + logic [N_DRIVERS-1:0] s_end_resp; // end_resp_o from all drivers, [N_DRIVERS-1:0] ordering logic [N_DRIVERS-1:0] s_clear_drv; // per-driver clear_i (held 1 until dependencies done) hci_interconnect_ctrl_t s_hci_ctrl; @@ -195,13 +196,15 @@ module tb_hci ); end end else if (INTERCO_TYPE == MUX) begin : gen_hwpe_mux - // In MUX mode, sel_i must point at the 
currently active HWPE (the one not gated by - // clear_i). Priority: lowest index wins if multiple are somehow simultaneously active. + // In MUX mode, sel_i points at the lowest-indexed HWPE that has not yet finished + // (i.e. whose end_resp_o has not fired). Once HWPE k asserts end_resp_o, sel_i + // advances to k+1. We cannot use s_clear_drv here because HWPE 0 always has + // eff_mask='0 so its clear_drv is permanently 0 even after it finishes. logic [$clog2(N_HWPE > 1 ? N_HWPE-1 : 1):0] s_mux_sel; always_comb begin - s_mux_sel = '0; + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(N_HWPE - 1); for (int i = N_HWPE-1; i >= 0; i--) begin - if (!s_clear_drv[N_LOG_MASTERS + i]) + if (!s_end_resp[N_LOG_MASTERS + i]) s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); end end @@ -246,23 +249,28 @@ module tb_hci assign s_hci_ctrl.low_prio_max_stall = LOW_PRIO_MAX_STALL; // Driver clear logic: driver i is held in reset until all drivers j in its effective wait - // mask have asserted end_req_o. If the effective mask is zero, the driver starts immediately. + // mask have asserted end_resp_o. If the effective mask is zero, the driver starts immediately. // // In MUX mode the user-defined WAIT_MASKS are ignored for HWPE drivers: instead a strict // sequential chain is enforced (HWPE i waits for HWPE i-1), because hci_core_mux_static // only forwards one HWPE at a time. HWPE ordering follows the index = position in // hwpe_masters[] in workload.json. LOG master masks are always taken from WAIT_MASKS. + // + // All drivers use end_resp_o (s_end_resp) as the handoff condition, guaranteeing that all + // in-flight reads from the predecessor have been fully retired before the successor starts. + // This is required for correctness in MUX mode (hci_core_mux_static gates r_valid to the + // non-selected channel), and is conservatively safe for all other modes. 
generate for (genvar ii = 0; ii < N_DRIVERS; ii++) begin : gen_driver_clear logic [N_DRIVERS-1:0] eff_mask; if (INTERCO_TYPE == MUX && ii >= N_LOG_MASTERS) begin : gen_mux_mask - // HWPE 0 (ii == N_LOG_MASTERS): no wait; HWPE k waits for HWPE k-1 + // HWPE 0 (ii == N_LOG_MASTERS): no wait; HWPE k waits for HWPE k-1's end_resp_o assign eff_mask = (ii == N_LOG_MASTERS) ? '0 : (N_DRIVERS'(1) << (ii - 1)); end else begin : gen_default_mask assign eff_mask = WAIT_MASKS[ii]; end assign s_clear_drv[ii] = (eff_mask != '0) && - ((s_end_req & eff_mask) != eff_mask); + ((s_end_resp & eff_mask) != eff_mask); end endgenerate @@ -318,7 +326,6 @@ module tb_hci // Application drivers // ///////////////////////// - logic [N_DRIVERS-1:0] s_end_resp; int unsigned s_issued_transactions[0:N_DRIVERS-1]; int unsigned s_issued_read_transactions[0:N_DRIVERS-1]; diff --git a/target/verif/verif.mk b/target/verif/verif.mk index 3e1ceaa..5ef479b 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -4,6 +4,8 @@ # # Sergio Mazzola +.DELETE_ON_ERROR: + HCI_VERIF_DIR = $(HCI_ROOT)/target/verif HCI_VERIF_CFG_DIR = $(HCI_VERIF_DIR)/config HCI_VERIF_CFG_GEN_DIR = $(HCI_VERIF_CFG_DIR)/generated @@ -42,7 +44,7 @@ SIMVECTORS_GEN_DIR := $(HCI_VERIF_DIR)/simvectors/generated .PHONY: stim-verif stim-verif: $(SIMVECTORS_GEN_DIR)/.stim_stamp -$(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(STIM_SRC_FILES) $(GEN_STIM_SCRIPT) | $(HCI_VERIF_CFG_GEN_DIR) +$(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(VERIF_CFG_MK) $(STIM_SRC_FILES) $(GEN_STIM_SCRIPT) | $(HCI_VERIF_CFG_GEN_DIR) mkdir -p $(SIMVECTORS_GEN_DIR) $(PYTHON) $(GEN_STIM_SCRIPT) \ --workload_config $(WORKLOAD_JSON) \ diff --git a/target/verif/vsim/tb_hci.tcl b/target/verif/vsim/tb_hci.tcl index 33b1e72..72d5007 100644 --- a/target/verif/vsim/tb_hci.tcl +++ b/target/verif/vsim/tb_hci.tcl @@ -71,8 +71,8 @@ if {$GUI == 1} { } # Metrics - add wave -noupdate -group metrics /tb_hci/s_end_latency - add wave -noupdate -group metrics 
/tb_hci/s_end_stimuli + add wave -noupdate -group metrics /tb_hci/s_end_req + add wave -noupdate -group metrics /tb_hci/s_end_resp + add wave -noupdate -group metrics /tb_hci/s_issued_transactions + add wave -noupdate -group metrics /tb_hci/s_issued_read_transactions + add wave -noupdate -group metrics /tb_hci/stim_latency @@ -91,6 +91,7 @@ if {$GUI == 1} { add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE add wave -noupdate /tb_hci/s_clear + add wave -noupdate /tb_hci/s_clear_drv add wave -noupdate /tb_hci/s_hci_ctrl configure wave -signalnamewidth 1 From 2719f316f5a264967be571243588ecb8d83386b6 Mon Sep 17 00:00:00 2001 From: Francesco Conti Date: Fri, 6 Mar 2026 23:41:02 +0100 Subject: [PATCH 12/25] Add support for fractional priority in arbiter Previously, HCI supported QoS by switching to the low-priority channel for 1 stall-cycle out of N, where N-1 was specified as the low_prio_max_stall configuration from a CSR in the system. This commit changes the mechanism into a more flexible fractional priority. The counter has two configuration params (8-bit), both from CSRs: priority_cnt_numerator and priority_cnt_denominator. The former is the same as low_prio_max_stall, i.e., the number of stall-cycles the arbiter spends prioritizing the high-priority side. The latter is the total number of stall cycles after which the counter is cleared, meaning that the arbiter will spend priority_cnt_denominator-priority_cnt_numerator stall-cycles prioritizing the low-priority side. In other words, the arbiter will dedicate a fraction numerator/denominator of the available bandwidth (on average) to the high-priority side. Hardware-wise the change is minimal, touching only how the counter works and the CSRs. 
--- rtl/common/hci_package.sv | 3 +- rtl/hci_interconnect.sv | 3 +- rtl/interco/hci_arbiter.sv | 36 ++++++++++--------- target/verif/bender.mk | 3 +- .../verif/config/generated/testbench.mk.tpl | 3 +- target/verif/config/testbench.json | 3 +- target/verif/src/tb_hci.sv | 3 +- target/verif/src/tb_hci_pkg.sv | 3 +- 8 files changed, 34 insertions(+), 23 deletions(-) diff --git a/rtl/common/hci_package.sv b/rtl/common/hci_package.sv index 1e3d1e1..15a36b8 100644 --- a/rtl/common/hci_package.sv +++ b/rtl/common/hci_package.sv @@ -51,7 +51,8 @@ package hci_package; typedef struct packed { logic [1:0] arb_policy; // used only in some systems logic invert_prio; - logic [7:0] low_prio_max_stall; + logic [7:0] priority_cnt_numerator; + logic [7:0] priority_cnt_denominator; } hci_interconnect_ctrl_t; typedef struct packed { diff --git a/rtl/hci_interconnect.sv b/rtl/hci_interconnect.sv index 53e10ad..fee4918 100644 --- a/rtl/hci_interconnect.sv +++ b/rtl/hci_interconnect.sv @@ -278,7 +278,8 @@ module hci_interconnect always_comb begin ctrl_arbiter_tree = ctrl_i; - ctrl_arbiter_tree.low_prio_max_stall = 1; + ctrl_arbiter_tree.priority_cnt_numerator = 1; + ctrl_arbiter_tree.priority_cnt_denominator = 2; end hci_arbiter_tree #( diff --git a/rtl/interco/hci_arbiter.sv b/rtl/interco/hci_arbiter.sv index 2447608..da75e51 100644 --- a/rtl/interco/hci_arbiter.sv +++ b/rtl/interco/hci_arbiter.sv @@ -44,13 +44,15 @@ * .. _hci_arbiter_ctrl: * .. table:: **hci_arbiter** input control signals. * - * +----------------------+------------------------+---------------------------------------------------------------+ - * | **Name** | **Type** | **Description** | - * +----------------------+------------------------+---------------------------------------------------------------+ - * | *invert_prio* | `logic` | When 1, invert priorities between `in_high` and `in_low`. 
| - * +----------------------+------------------------+---------------------------------------------------------------+ - * | *low_prio_max_stall* | `logic[7:0]` | Maximum number of consecutive stalls on low-priority channel. | - * +----------------------+------------------------+---------------------------------------------------------------+ + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | **Name** | **Type** | **Description** | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | *invert_prio* | `logic` | When 1, invert priorities between `in_high` and `in_low`. | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | *priority_cnt_numerator* | `logic[7:0]` | Maximum number of consecutive stalls on low-priority channel. | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | *priority_cnt_denominator* | `logic[7:0]` | Clear condition of priority counter (max low-prio stalls + high-prio stalls). | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ * */ @@ -75,8 +77,9 @@ module hci_arbiter logic [NB_CHAN-1:0] hs_pass_d; logic hs_req_d; logic ls_req_d; + logic hs_req_masked_d; logic switch_channels_d; - logic unsigned [7:0] ls_stall_ctr_d; + logic unsigned [7:0] priority_cnt_q; // priority_req is the OR of all requests coming out of the log interconnect. 
// it should be simplified to simply an OR of all requests coming *into* the @@ -85,10 +88,11 @@ module hci_arbiter begin hs_req_d = |hs_req_in; ls_req_d = |ls_req_in; - if (ctrl_i.low_prio_max_stall > 0) //Set to 0 to disable this functionality + hs_req_masked_d = hs_req_d; + if (ctrl_i.priority_cnt_numerator > 0) //Set to 0 to disable this functionality begin - if (ls_stall_ctr_d >= ctrl_i.low_prio_max_stall) - hs_req_d = 0; //Let low side through for once + if (priority_cnt_q >= ctrl_i.priority_cnt_numerator && priority_cnt_q < ctrl_i.priority_cnt_denominator) + hs_req_masked_d = 0; //Let low side through for once end end @@ -96,11 +100,11 @@ module hci_arbiter always_ff @(posedge clk_i or negedge rst_ni) begin if (~rst_ni) - ls_stall_ctr_d <= 0; + priority_cnt_q <= 0; + else if(priority_cnt_q == ctrl_i.priority_cnt_denominator-1) + priority_cnt_q <= 0; else if (hs_req_d & ls_req_d) - ls_stall_ctr_d <= ls_stall_ctr_d + 1; - else - ls_stall_ctr_d <= 0; + priority_cnt_q <= priority_cnt_q + 1; end assign switch_channels_d = ctrl_i.invert_prio; @@ -129,7 +133,7 @@ module hci_arbiter // Side select generate for(genvar ii=0; ii Date: Sat, 7 Mar 2026 22:03:05 +0100 Subject: [PATCH 13/25] add regression test (no CI yet) --- regr/basic.yml | 31 +++ regr/bwruntests.py | 393 ++++++++++++++++++++++++++++++++++ regr/full_regression.sh | 37 ++++ regr/hardware/hci.json | 17 ++ regr/hardware/log.json | 17 ++ regr/testbench/fair.json | 11 + regr/testbench/hwpe_prio.json | 11 + regr/testbench/log_prio.json | 11 + regr/testbench/testbench.json | 11 + 9 files changed, 539 insertions(+) create mode 100644 regr/basic.yml create mode 100755 regr/bwruntests.py create mode 100755 regr/full_regression.sh create mode 100644 regr/hardware/hci.json create mode 100644 regr/hardware/log.json create mode 100644 regr/testbench/fair.json create mode 100644 regr/testbench/hwpe_prio.json create mode 100644 regr/testbench/log_prio.json create mode 100644 regr/testbench/testbench.json diff --git 
a/regr/basic.yml b/regr/basic.yml new file mode 100644 index 0000000..ab21052 --- /dev/null +++ b/regr/basic.yml @@ -0,0 +1,31 @@ +# Copyright (C) 2024 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Francesco Conti (f.conti@unibo.it) +# + +hci_tests: + log_fair: + path: . + command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/fair.json HARDWARE_JSON=regr/hardware/log.json + hci_fair: + path: . + command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/fair.json HARDWARE_JSON=regr/hardware/hci.json + hci_hwpe_prio: + path: . + command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/hwpe_prio.json HARDWARE_JSON=regr/hardware/hci.json + hci_log_prio: + path: . 
+ command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/log_prio.json HARDWARE_JSON=regr/hardware/hci.json diff --git a/regr/bwruntests.py b/regr/bwruntests.py new file mode 100755 index 0000000..555a1d4 --- /dev/null +++ b/regr/bwruntests.py @@ -0,0 +1,393 @@ +#!/usr/bin/env python3 + +# Copyright 2020 ETH Zurich and University of Bologna +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# Run shell commands listed in a file separated by newlines in a parallel +# fashion. If requested the results (tuples consisting of command, stdout, +# stderr and returncode) will be gathered in a junit.xml file. There a few +# knobs to tune the number of spawned processes and the junit.xml formatting. 
+ +# Author: Robert Balas (balasr@iis.ee.ethz.ch) + +import argparse +import re +from subprocess import (Popen, TimeoutExpired, + CalledProcessError, PIPE) +from threading import Lock +import shlex +import sys +import signal +import os +import multiprocessing +import errno +import pprint +import time +import random +from collections import OrderedDict +import json + +runtest = argparse.ArgumentParser( + prog='bwruntests', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="""Run PULP tests in parallel""", + epilog=""" +Test_file needs to be either a .yaml file (set the --yaml switch) +which looks like this: + +mytests.yml +[...] +parallel_bare_tests: # name of the test set + parMatrixMul8: # name of the test + path: ./parallel_bare_tests/parMatrixMul8 # path to the test's folder + command: make clean all run # command to run in the test's folder +[...] + +or + +Test_file needs to be a list of commands to be executed. Each line corresponds +to a single command and a test + +commands.f +[...] +make -C ./ml_tests/mlGrad clean all run +make -C ./ml_tests/mlDct clean all run +[...] + +Example: +bwruntests.py --proc-verbose -v \\ + --report_junit -t 3600 --yaml \\ + -o simplified-runtime.xml runtime-tests.yaml + +This Runs a set of tests defined in runtime-tests.yaml and dumps the +resulting junit.xml into simplified-runtime.xml. The --proc-verbose +scripts makes sure to print the stdout of each process to the shell. To +prevent a broken process from running forever, a maximum timeout of 3600 +seconds was set. 
For debugging purposes we enabled -v (--verbose) which +shows the full set of commands being run.""") + +runtest.version = '0.2' + +runtest.add_argument('test_file', type=str, + help='file defining tests to be run') +runtest.add_argument('--version', action='version', + version='%(prog)s ' + runtest.version) +runtest.add_argument('-p', '--max_procs', type=int, + default=multiprocessing.cpu_count(), + help="""Number of parallel + processes used to run test. + Default is number of cpu cores.""") +runtest.add_argument('-t', '--timeout', type=float, + default=None, + help="""Timeout for all processes in seconds""") +runtest.add_argument('-v', '--verbose', action='store_true', + help="""Enable verbose output""") +runtest.add_argument('-s', '--proc_verbose', action='store_true', + help="""Write processes' stdout and stderr to shell stdout + after they terminate""") +runtest.add_argument('--report_junit', action='store_true', + help="""Generate a junit report""") +runtest.add_argument('--disable_junit_pp', action='store_true', + help="""Disable pretty print of junit report""") +runtest.add_argument('--disable_results_pp', action='store_true', + help="""Disable printing test results""") +runtest.add_argument('-y,', '--yaml', action='store_true', + help="""Read tests from yaml file instead of executing + from a list of commands""") +runtest.add_argument('-o,', '--output', type=str, + help="""Write junit.xml to file instead of stdout""") +runtest.add_argument('-P,', '--perf', type=str, default=None, + help="""Write performance results to JSON file""") +stdout_lock = Lock() + +shared_total = 0 +len_total = 0 + +class FinishedProcess(object): + """A process that has finished running. 
+ """ + def __init__(self, name, cwd, runargs, returncode, + stdout=None, stderr=None, time=None): + self.name = name + self.cwd = cwd + self.runargs = runargs + self.returncode = returncode + self.stdout = stdout + self.stderr = stderr + self.time = time + exec_time = 0 + throughput = 0 + workload = 0 + if returncode == 0: + matches = re.findall("# hwpe cycles =\s+(\d+)", stdout) + if matches: + exec_time = int(matches[0]) + self.exec_time = exec_time + + + def __repr__(self): + runargs = ['name={!r}'.format(self.name)] + runargs += ['cwd={!r}'.format(self.cwd)] + runargs += ['args={!r}'.format(self.runargs), + 'returncode={!r}'.format(self.returncode)] + if self.stdout is not None: + runargs.append('stdout={!r}'.format(self.stdout)) + if self.stderr is not None: + runargs.append('stderr={!r}'.format(self.stderr)) + if self.time is not None: + runargs.append('time={!r}'.format(self.time)) + return "{}({})".format(type(self).__name__, ', '.join(runargs)) + +def fork(name, cwd, *popenargs, check=False, shell=True, + **kwargs): + """Run subprocess and return process args, error code, stdout and stderr + """ + + def proc_out(cwd, stdout, stderr): + print('cwd={}'.format(cwd)) + print('stdout=') + print(stdout.decode('utf-8')) + print('stderr=') + print(stderr.decode('utf-8')) + + kwargs['stdout'] = PIPE + kwargs['stderr'] = PIPE + + with Popen(*popenargs, preexec_fn=os.setpgrp, cwd=cwd, + **kwargs) as process: + try: + # Child and parent are racing for setting/using the pgid so we have + # to set it in both processes. See glib manual. + try: + os.setpgid(process.pid, process.pid) + except OSError as e: + if e.errno != errno.EACCES: + raise + # measure runtime + start = time.time() + stdout, stderr = process.communicate(input, timeout=args.timeout) + except TimeoutExpired: + pgid = os.getpgid(process.pid) + os.killpg(pgid, signal.SIGKILL) + # process.kill() will only kill the immediate child but not its + # forks. 
This won't work since our commands will create a few forks + # (make -> vsim -> etc). We need to make a process group and kill + # that + stdout, stderr = process.communicate() + timeoutmsg = 'TIMEOUT after {:f}s'.format(args.timeout) + + if args.proc_verbose: + stdout_lock.acquire() + print(name) + print(timeoutmsg) + proc_out(cwd, stdout, stderr) + stdout_lock.release() + + return FinishedProcess(name, cwd, process.args, 1, + stdout.decode('utf-8'), + timeoutmsg + '\n' + + stderr.decode('utf-8'), + time.time() - start) + # Including KeyboardInterrupt, communicate handled that. + except: # noqa: E722 + pgid = os.getpgid(process.pid) + os.killpg(pgid, signal.SIGKILL) + # We don't call process.wait() as .__exit__ does that for us. + raise + retcode = process.poll() + if check and retcode: + raise CalledProcessError(retcode, process.args, + output=stdout, stderr=stderr) + if args.proc_verbose: + stdout_lock.acquire() + print(name) + proc_out(cwd, stdout, stderr) + stdout_lock.release() + + with lock: + shared_total.value += 1 + print("[%s][%d/%d] %s" % ("\033[1;32m OK \033[0m" if retcode == 0 else "\033[1;31mFAIL\033[0m", shared_total.value, len_total.value, name)) + + return FinishedProcess(name, cwd, process.args, retcode, + stdout.decode('utf-8'), + stderr.decode('utf-8'), + time.time() - start) + +def poolInit(s, t, l): + global shared_total + global len_total + global lock + shared_total = s + len_total = t + lock = l + +if __name__ == '__main__': + args = runtest.parse_args() + pp = pprint.PrettyPrinter(indent=4) + + # lazy importing so that we can work without junit_xml + if args.report_junit: + try: + from junit_xml import TestSuite, TestCase + except ImportError: + print("""Error: The --report_junit option requires +the junit_xml library which is not installed.""", + file=sys.stderr) + exit(1) + + # lazy import PrettyTable for displaying results + if not(args.disable_results_pp): + try: + from prettytable import PrettyTable + except ImportError: + 
print("""Warning: Displaying results requires the PrettyTable +library which is not installed""") + + tests = [] # list of tuple (testname, working dir, command) + + # load tests (yaml or command list) + if args.yaml: + try: + import yaml + except ImportError: + print("""Error: The --yaml option requires +the pyyaml library which is not installed.""", + file=sys.stderr) + exit(1) + with open(args.test_file) as f: + testyaml = yaml.load(f, Loader=yaml.Loader) + for testsetname, testv in testyaml.items(): + for testname, insn in testv.items(): + cmd = shlex.split(insn['command']) + cwd = insn['path'] + tests.append((testsetname + ':' + testname, cwd, cmd)) + if args.verbose: + pp.pprint(tests) + else: # (command list) + with open(args.test_file) as f: + testnames = list(map(str.rstrip, f)) + shellcmds = [shlex.split(e) for e in testnames] + cwds = ['./' for e in testnames] + tests = list(zip(testnames, cwds, shellcmds)) + if args.verbose: + print('Tests which we are running:') + pp.pprint(tests) + pp.pprint(shellcmds) + + # Spawning process pool + # Disable signals to prevent race. Child processes inherit SIGINT handler + original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN) + lock = multiprocessing.Lock() + shared_total = multiprocessing.Value('i', 0) + len_total = multiprocessing.Value('i', len(tests)) + pool = multiprocessing.Pool(processes=args.max_procs, initializer=poolInit, initargs=(shared_total, len_total, lock )) + # Restore SIGINT handler + signal.signal(signal.SIGINT, original_sigint_handler) + # Shuffle tests + random.shuffle(tests) + try: + procresults = pool.starmap(fork, tests) + except KeyboardInterrupt: + print("\nTerminating bwruntest.py") + pool.terminate() + pool.join() + exit(1) + + # pp.pprint(procresults) + pool.close() + pool.join() + + # Generate junit.xml file. Junit.xml differentiates between failure and + # errors but we treat everything as errors. 
+ if args.report_junit: + testcases = [] + for p in procresults: + # we can either expect p.name = testsetname:testname + # or p.name = testname + testcase = TestCase(p.name, + classname=((p.name).split(':'))[0], + stdout=p.stdout, + stderr=p.stderr, + elapsed_sec=p.time) + if p.returncode != 0: + testcase.add_failure_info(p.stderr) + testcases.append(testcase) + + testsuite = TestSuite('bwruntests', testcases) + if args.output: + with open(args.output, 'w') as f: + TestSuite.to_file(f, [testsuite], + prettyprint=not(args.disable_junit_pp)) + else: + print(TestSuite.to_xml_string([testsuite], + prettyprint=(args.disable_junit_pp))) + + # # print JSON for performance regression + # if args.perf is not None: + # # if file does not exist, create new dictionary: + # if not os.path.isfile(args.perf): + # d = OrderedDict([]) + # # else, load the existing dictionary + # else: + # with open(args.perf) as f: + # d = json.load(f, object_pairs_hook=OrderedDict) + # # save the new execution times + # for p in procresults: + # if p.returncode == 0: + # d[p.name] = p.exec_time + # with open(args.perf, 'w', encoding='utf-8') as f: + # json.dump(d, f, ensure_ascii=False, indent=4) + + # print JSON for performance regression + if args.perf is not None: + # if file does not exist, create new dictionary: + if not os.path.isfile(args.perf): + d = list([]) + # else, load the existing dictionary + else: + with open(args.perf) as f: + d = json.load(f) + # save the new execution times + for p in procresults: + if p.returncode == 0: + d.append({ 'name': p.name, 'value': p.exec_time, 'unit': 'cycles'}) + with open(args.perf, 'w', encoding='utf-8') as f: + json.dump(d, f, ensure_ascii=False, indent=4) + + # print summary of test results + if not(args.disable_results_pp): + testcount = sum(1 for x in tests) + testfailcount = sum(1 for p in procresults if p.returncode != 0) + testpassedcount = testcount - testfailcount + resulttable = PrettyTable(['test', 'cycles', 'time', 'passed/total']) + 
resulttable.align['test'] = "l" + for p in procresults: + testpassed = 1 if p.returncode == 0 else 0 + testname = p.name + resulttable.add_row([testname, + p.exec_time, + '{0:.2f}s'.format(p.time), + '{0:d}/{1:d}'.format(testpassed, 1)]) + resulttable.add_row(['total', '', '', '{0:d}/{1:d}'. + format(testpassedcount, testcount)]) + print(resulttable) + if testpassedcount != testcount: + import sys; sys.exit(1) + diff --git a/regr/full_regression.sh b/regr/full_regression.sh new file mode 100755 index 0000000..1e10f4e --- /dev/null +++ b/regr/full_regression.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright (C) 2020-2024 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Francesco Conti (f.conti@unibo.it) +# + +export N_PROC=1 +export P_STALL=0.04 +TIMEOUT=400 + +# Declare a string array with type +declare -a test_list=( + "regr/basic.yml" +) + +# Read the list values with space +for val in "${test_list[@]}"; do + nice -n10 regr/bwruntests.py --report_junit -t ${TIMEOUT} --yaml -o regr/hci_tests.xml -p${N_PROC} $val + if test $? 
-ne 0; then + echo "Error in test $val" + exit 1 + fi +done +unset P_STALL diff --git a/regr/hardware/hci.json b/regr/hardware/hci.json new file mode 100644 index 0000000..c8db224 --- /dev/null +++ b/regr/hardware/hci.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 3, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 128, + "N_BANKS": 32, + "INTERCO_TYPE": "HCI", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/regr/hardware/log.json b/regr/hardware/log.json new file mode 100644 index 0000000..6a6a3bd --- /dev/null +++ b/regr/hardware/log.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 3, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 128, + "N_BANKS": 32, + "INTERCO_TYPE": "LOG", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/regr/testbench/fair.json b/regr/testbench/fair.json new file mode 100644 index 0000000..011e20b --- /dev/null +++ b/regr/testbench/fair.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 0, + "PRIORITY_CNT_NUMERATOR": 1, + "PRIORITY_CNT_DENOMINATOR": 2 + } +} diff --git a/regr/testbench/hwpe_prio.json b/regr/testbench/hwpe_prio.json new file mode 100644 index 0000000..4eefeb8 --- /dev/null +++ b/regr/testbench/hwpe_prio.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 1, + "PRIORITY_CNT_NUMERATOR": 9, + "PRIORITY_CNT_DENOMINATOR": 10 + } +} diff --git a/regr/testbench/log_prio.json b/regr/testbench/log_prio.json new file mode 100644 index 0000000..473e3fd --- /dev/null +++ 
b/regr/testbench/log_prio.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 0, + "PRIORITY_CNT_NUMERATOR": 9, + "PRIORITY_CNT_DENOMINATOR": 10 + } +} diff --git a/regr/testbench/testbench.json b/regr/testbench/testbench.json new file mode 100644 index 0000000..011e20b --- /dev/null +++ b/regr/testbench/testbench.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 0, + "PRIORITY_CNT_NUMERATOR": 1, + "PRIORITY_CNT_DENOMINATOR": 2 + } +} From 2674f77b791588f39fa521707b2bd20238522558 Mon Sep 17 00:00:00 2001 From: Francesco Conti Date: Sat, 7 Mar 2026 22:08:13 +0100 Subject: [PATCH 14/25] Add basic github/gitlab CI workflow --- .github/workflows/gitlab-ci.yml | 29 ++++++++++++ .gitlab-ci.yml | 44 +++++++++++++++++++ regr/basic.yml | 8 ++-- regr/hardware/{hci.json => hci/hardware.json} | 0 regr/hardware/{log.json => log/hardware.json} | 0 regr/testbench/fair.json | 11 ----- regr/testbench/{ => fair}/testbench.json | 0 .../testbench.json} | 0 .../testbench.json} | 0 9 files changed, 77 insertions(+), 15 deletions(-) create mode 100644 .github/workflows/gitlab-ci.yml create mode 100644 .gitlab-ci.yml rename regr/hardware/{hci.json => hci/hardware.json} (100%) rename regr/hardware/{log.json => log/hardware.json} (100%) delete mode 100644 regr/testbench/fair.json rename regr/testbench/{ => fair}/testbench.json (100%) rename regr/testbench/{hwpe_prio.json => hwpe_prio/testbench.json} (100%) rename regr/testbench/{log_prio.json => log_prio/testbench.json} (100%) diff --git a/.github/workflows/gitlab-ci.yml b/.github/workflows/gitlab-ci.yml new file mode 100644 index 0000000..e89ce0d --- /dev/null +++ b/.github/workflows/gitlab-ci.yml @@ -0,0 +1,29 @@ +# Copyright 2022 ETH Zurich and University of Bologna. 
+# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Paul Scheffler + +name: gitlab-ci + +on: [ push, pull_request, workflow_dispatch ] + +permissions: + # deployments permission to deploy GitHub pages website + deployments: write + # contents permission to update benchmark contents in gh-pages branch + contents: write + +jobs: + gitlab-ci: + runs-on: ubuntu-latest + steps: + - name: Check Gitlab CI + uses: pulp-platform/pulp-actions/gitlab-ci@v1 + # Skip on forks or pull requests from forks due to missing secrets. + if: github.repository == 'pulp-platform/hci' && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) + with: + domain: iis-git.ee.ethz.ch + repo: github-mirror/hci + token: ${{ secrets.GITLAB_TOKEN }} + diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..a3d926f --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,44 @@ +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Paul Scheffler + +# Check out dependencies in the build stage, then run the verification regression in the test stage + +variables: + GIT_SUBMODULE_STRATEGY: recursive + # NOTE(review): no toolchain variables are needed for this regression + +before_script: + - python -V # Print out python version for debugging + - python -m pip install --user virtualenv + +.base: + artifacts: + when: always + expire_in: 1 week + +stages: + - build + - test + +build: + stage: build + script: + - make checkout + artifacts: + when: always + expire_in: 3 hours + paths: [ .bender ] + +testset: + extends: .base + needs: [ build ] + stage: test + script: + - regr/full_regression.sh + artifacts: + when: always + expire_in: 1 year + paths: [ regr/hci_tests.xml ] diff --git a/regr/basic.yml b/regr/basic.yml index ab21052..9f43cfa 100644 --- a/regr/basic.yml +++ b/regr/basic.yml @@ -19,13 +19,13 @@ hci_tests: log_fair: path: . - command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/fair.json HARDWARE_JSON=regr/hardware/log.json + command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/fair/testbench.json HARDWARE_JSON=regr/hardware/log/hardware.json hci_fair: path: . 
- command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/hwpe_prio.json HARDWARE_JSON=regr/hardware/hci.json + command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/hwpe_prio/testbench.json HARDWARE_JSON=regr/hardware/hci/hardware.json hci_log_prio: path: . - command: make clean-sim-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/log_prio.json HARDWARE_JSON=regr/hardware/hci.json + command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/log_prio/testbench.json HARDWARE_JSON=regr/hardware/hci/hardware.json diff --git a/regr/hardware/hci.json b/regr/hardware/hci/hardware.json similarity index 100% rename from regr/hardware/hci.json rename to regr/hardware/hci/hardware.json diff --git a/regr/hardware/log.json b/regr/hardware/log/hardware.json similarity index 100% rename from regr/hardware/log.json rename to regr/hardware/log/hardware.json diff --git a/regr/testbench/fair.json b/regr/testbench/fair.json deleted file mode 100644 index 011e20b..0000000 --- a/regr/testbench/fair.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "description": "Testbench configuration parameters", - "parameters": { - "CLK_PERIOD": 50, - "RST_CLK_CYCLES": 10, - "RANDOM_GNT": 0, - "INVERT_PRIO": 0, - "PRIORITY_CNT_NUMERATOR": 1, - "PRIORITY_CNT_DENOMINATOR": 2 - } -} diff --git a/regr/testbench/testbench.json b/regr/testbench/fair/testbench.json similarity index 100% rename from regr/testbench/testbench.json rename to regr/testbench/fair/testbench.json diff --git a/regr/testbench/hwpe_prio.json b/regr/testbench/hwpe_prio/testbench.json similarity index 100% rename from regr/testbench/hwpe_prio.json rename to regr/testbench/hwpe_prio/testbench.json diff --git a/regr/testbench/log_prio.json b/regr/testbench/log_prio/testbench.json similarity index 100% rename from 
regr/testbench/log_prio.json rename to regr/testbench/log_prio/testbench.json From 13aa12a70d745780fd948daccbb1413352ec18c3 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 9 Mar 2026 13:39:52 +0100 Subject: [PATCH 15/25] verif: Fix GUI makefile param --- target/verif/verif.mk | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/target/verif/verif.mk b/target/verif/verif.mk index 5ef479b..2285537 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -94,11 +94,7 @@ clean-config-verif: ############## # Parameters -ifdef gui - GUI ?= $(gui) -else - GUI ?= 0 -endif +GUI ?= $(if $(gui),$(gui),0) # Top-level to simulate sim_top_level ?= tb_hci sim_vsim_lib ?= $(HCI_VERIF_DIR)/vsim/work From 5a93c8c4c237a5b32e2bde4bb63c3376f640b738 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 9 Mar 2026 13:47:24 +0100 Subject: [PATCH 16/25] tb: Remove end_req signal (only track complete time to response) --- target/verif/src/application_driver.sv | 4 ---- target/verif/src/simulation_report.sv | 5 ----- target/verif/src/tb_hci.sv | 8 -------- target/verif/src/tb_hci_pkg.sv | 2 +- target/verif/src/throughput_monitor.sv | 18 ------------------ 5 files changed, 1 insertion(+), 36 deletions(-) diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index d4697e9..c2a920e 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -28,7 +28,6 @@ module application_driver #( input logic rst_ni, input logic clear_i, // used to gate the driver or to reset it hci_core_intf.initiator hci_if, - output logic end_req_o, output logic end_resp_o, output int unsigned n_issued_tr_o, output int unsigned n_issued_rd_tr_o, @@ -138,7 +137,6 @@ module application_driver #( hci_if.r_ready = 1'b1; hci_if.user = '0; // Defaults (outputs) - end_req_o = 1'b0; end_resp_o = 1'b0; // inputs: gnt, r_data, r_valid, r_user, r_id @@ -219,7 +217,6 @@ module application_driver #( end end REQ_DONE: 
begin - end_req_o = 1'b1; if (n_rd_resp_retired_q >= n_rd_req_issued_q) begin // All read responses have been retired req_state_d = RSP_DONE; @@ -228,7 +225,6 @@ module application_driver #( end end RSP_DONE: begin - end_req_o = 1'b1; end_resp_o = 1'b1; end default: begin diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index e8b02bd..420f015 100644 --- a/target/verif/src/simulation_report.sv +++ b/target/verif/src/simulation_report.sv @@ -20,10 +20,8 @@ module simulation_report import tb_hci_pkg::*; ( - input logic [N_DRIVERS-1:0] end_req_i, input logic [N_DRIVERS-1:0] end_resp_i, input real throughput_complete_i, - input real stim_latency_i, input real tot_latency_i, input real latency_per_master_i[N_DRIVERS], input real sum_req_to_gnt_latency_log_i[N_DRIVERS-N_HWPE], @@ -88,8 +86,6 @@ module simulation_report hwpe_masters_with_grants = '0; missing_reads = 1'b0; - wait (&end_req_i); - wait (stim_latency_i >= 0); wait (&end_resp_i); wait (throughput_complete_i >= 0); wait (tot_latency_i >= 0); @@ -242,7 +238,6 @@ module simulation_report "Actual BW (completion): %0.2f bit/cycle [utilization: %0.1f%%]", throughput_complete_i, actual_bw_utilization ); - $display("Stimulus phase duration: %0.2f cycles", stim_latency_i); $display("Completion phase duration: %0.2f cycles", tot_latency_i); $display( "Granted transactions: reads=%0d writes=%0d total=%0d", diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index f67eb46..80966b5 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -27,7 +27,6 @@ module tb_hci (); logic clk, rst_n; - logic [N_DRIVERS-1:0] s_end_req; // end_req_o from all drivers, [N_DRIVERS-1:0] ordering logic [N_DRIVERS-1:0] s_end_resp; // end_resp_o from all drivers, [N_DRIVERS-1:0] ordering logic [N_DRIVERS-1:0] s_clear_drv; // per-driver clear_i (held 1 until dependencies done) hci_interconnect_ctrl_t s_hci_ctrl; @@ -345,7 +344,6 @@ module tb_hci .rst_ni(rst_n), 
.clear_i(s_clear_drv[ii]), .hci_if(hci_driver_log_if[ii]), - .end_req_o(s_end_req[ii]), .end_resp_o(s_end_resp[ii]), .n_issued_tr_o(s_issued_transactions[ii]), .n_issued_rd_tr_o(s_issued_read_transactions[ii]), @@ -369,7 +367,6 @@ module tb_hci .rst_ni(rst_n), .clear_i(s_clear_drv[N_LOG_MASTERS + ii]), .hci_if(hci_driver_hwpe_if[ii]), - .end_req_o(s_end_req[N_LOG_MASTERS + ii]), .end_resp_o(s_end_resp[N_LOG_MASTERS + ii]), .n_issued_tr_o(s_issued_transactions[N_LOG_MASTERS + ii]), .n_issued_rd_tr_o(s_issued_read_transactions[N_LOG_MASTERS + ii]), @@ -395,7 +392,6 @@ module tb_hci real latency_per_master[N_DRIVERS]; real throughput_completed; - real stim_latency; real tot_latency; throughput_monitor #( @@ -407,14 +403,12 @@ module tb_hci ) i_throughput_monitor ( .clk_i(clk), .rst_ni(rst_n), - .end_req_i(s_end_req), .end_resp_i(s_end_resp), .n_read_complete_log_i(N_READ_COMPLETE_TRANSACTIONS_LOG), .n_read_complete_hwpe_i(N_READ_COMPLETE_TRANSACTIONS_HWPE), .n_write_granted_log_i(N_WRITE_GRANTED_TRANSACTIONS_LOG), .n_write_granted_hwpe_i(N_WRITE_GRANTED_TRANSACTIONS_HWPE), .throughput_complete_o(throughput_completed), - .stim_latency_o(stim_latency), .tot_latency_o(tot_latency), .latency_per_master_o(latency_per_master) ); @@ -440,10 +434,8 @@ module tb_hci ); simulation_report i_simulation_report ( - .end_req_i(s_end_req), .end_resp_i(s_end_resp), .throughput_complete_i(throughput_completed), - .stim_latency_i(stim_latency), .tot_latency_i(tot_latency), .latency_per_master_i(latency_per_master), .sum_req_to_gnt_latency_log_i(SUM_REQ_TO_GNT_LATENCY_LOG), diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index 0d16b40..0addc97 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -79,7 +79,7 @@ package tb_hci_pkg; localparam int unsigned N_DRIVERS = N_LOG_MASTERS + N_HWPE; // Per-driver dependency masks: WAIT_MASKS[i][j]=1 means driver i must wait for driver j's - // end_req_o before starting. 
Generated by main.py, passed via WAIT_MASKS_PARAM define. + // end_resp_o before starting. Generated by main.py, passed via WAIT_MASKS_PARAM define. // NOTE: in MUX mode these masks are ignored for HWPE drivers and replaced by a sequential // chain in tb_hci.sv. localparam logic [N_DRIVERS-1:0] WAIT_MASKS [N_DRIVERS] = diff --git a/target/verif/src/throughput_monitor.sv b/target/verif/src/throughput_monitor.sv index 9a7e51b..9298577 100644 --- a/target/verif/src/throughput_monitor.sv +++ b/target/verif/src/throughput_monitor.sv @@ -26,7 +26,6 @@ module throughput_monitor #( ) ( input logic clk_i, input logic rst_ni, - input logic [N_MASTER-1:0] end_req_i, input logic [N_MASTER-1:0] end_resp_i, // Read transactions number input int unsigned n_read_complete_log_i[N_MASTER-N_HWPE], @@ -36,28 +35,11 @@ module throughput_monitor #( input int unsigned n_write_granted_hwpe_i[N_HWPE], // Completion-side throughput: accepted writes + completed reads per elapsed completion cycle. output real throughput_complete_o, - // Elapsed cycles from reset release to end_stimuli. - output real stim_latency_o, // Total simulation time (cycles) and simulation time per master (cycles) output real tot_latency_o, output real latency_per_master_o[N_MASTER] ); - // Stimulus duration at stimulus completion. - initial begin - time start_time, end_time; - real stim_time_cycles; - stim_latency_o = -1; - wait (rst_ni); - #(CLK_PERIOD/100); - @(posedge clk_i); - start_time = $time; - wait (&end_req_i); - end_time = $time; - stim_time_cycles = real'(end_time - start_time) / real'(CLK_PERIOD); // cycles - stim_latency_o = stim_time_cycles; - end - // Completion-side throughput at full completion. 
initial begin time start_time, end_time; From d6d653792c6aa69687f62624294bde924f84ff3c Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 9 Mar 2026 13:58:12 +0100 Subject: [PATCH 17/25] verif: Rename tb monitors and fix license headers --- Bender.yml | 7 ++++--- bender.mk | 2 +- target/verif/bender.mk | 2 +- target/verif/config/generated/hardware.mk.tpl | 2 +- target/verif/config/generated/testbench.mk.tpl | 2 +- target/verif/src/application_driver.sv | 4 +++- .../{throughput_monitor.sv => bandwidth_monitor.sv} | 13 ++++++++----- .../src/{latency_monitor.sv => req_gnt_monitor.sv} | 13 ++++++++----- target/verif/src/simulation_report.sv | 5 ++++- target/verif/src/tb_hci.sv | 11 +++++------ target/verif/src/tb_hci_pkg.sv | 3 +-- target/verif/verif.mk | 2 +- 12 files changed, 38 insertions(+), 28 deletions(-) rename target/verif/src/{throughput_monitor.sv => bandwidth_monitor.sv} (91%) rename target/verif/src/{latency_monitor.sv => req_gnt_monitor.sv} (95%) diff --git a/Bender.yml b/Bender.yml index 516fe27..37aa7af 100644 --- a/Bender.yml +++ b/Bender.yml @@ -2,10 +2,11 @@ package: name: hci authors: - "Francesco Conti " - - "Gianna Paulin " - "Tobias Riedener " - "Luigi Ghionda " - "Arpan Suravi Prasad " + - "Sergio Mazzola " dependencies: hwpe-stream: { git: "https://github.com/pulp-platform/hwpe-stream.git", version: 1.9.0 } @@ -70,8 +71,8 @@ sources: # Level 1 - target/verif/src/application_driver.sv - target/verif/src/tcdm_banks_wrap.sv - - target/verif/src/latency_monitor.sv - - target/verif/src/throughput_monitor.sv + - target/verif/src/req_gnt_monitor.sv + - target/verif/src/bandwidth_monitor.sv # Level 2 - target/verif/src/simulation_report.sv # Level 3 diff --git a/bender.mk b/bender.mk index 5f78e57..788c13a 100644 --- a/bender.mk +++ b/bender.mk @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. 
# SPDX-License-Identifier: SHL-0.51 # diff --git a/target/verif/bender.mk b/target/verif/bender.mk index d4dadc3..02e640b 100644 --- a/target/verif/bender.mk +++ b/target/verif/bender.mk @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. # SPDX-License-Identifier: SHL-0.51 # diff --git a/target/verif/config/generated/hardware.mk.tpl b/target/verif/config/generated/hardware.mk.tpl index f6bdc14..031d817 100644 --- a/target/verif/config/generated/hardware.mk.tpl +++ b/target/verif/config/generated/hardware.mk.tpl @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. # SPDX-License-Identifier: SHL-0.51 # diff --git a/target/verif/config/generated/testbench.mk.tpl b/target/verif/config/generated/testbench.mk.tpl index 66b617d..c36dad7 100644 --- a/target/verif/config/generated/testbench.mk.tpl +++ b/target/verif/config/generated/testbench.mk.tpl @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. # SPDX-License-Identifier: SHL-0.51 # diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index c2a920e..35c66c4 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -1,7 +1,9 @@ /* * application_driver.sv * - * Copyright (C) 2019-2020 ETH Zurich, University of Bologna + * Sergio Mazzola + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. 
You may obtain a copy of the License at diff --git a/target/verif/src/throughput_monitor.sv b/target/verif/src/bandwidth_monitor.sv similarity index 91% rename from target/verif/src/throughput_monitor.sv rename to target/verif/src/bandwidth_monitor.sv index 9298577..524e1ff 100644 --- a/target/verif/src/throughput_monitor.sv +++ b/target/verif/src/bandwidth_monitor.sv @@ -1,7 +1,10 @@ /* - * throughput_monitor.sv + * bandwidth_monitor.sv * - * Copyright (C) 2019-2020 ETH Zurich, University of Bologna + * Sergio Mazzola + * Luca Codeluppi + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at @@ -13,11 +16,11 @@ */ /** - * Throughput monitor - * Measures actual throughput and simulation time for each master + * Bandwidth monitor + * Measures actual bandwidth and completion time for each master */ -module throughput_monitor #( +module bandwidth_monitor #( parameter int unsigned N_MASTER, parameter int unsigned N_HWPE, parameter int unsigned CLK_PERIOD, diff --git a/target/verif/src/latency_monitor.sv b/target/verif/src/req_gnt_monitor.sv similarity index 95% rename from target/verif/src/latency_monitor.sv rename to target/verif/src/req_gnt_monitor.sv index 3aff305..66fa4cb 100644 --- a/target/verif/src/latency_monitor.sv +++ b/target/verif/src/req_gnt_monitor.sv @@ -1,7 +1,10 @@ /* - * latency_monitor.sv + * req_gnt_monitor.sv * - * Copyright (C) 2019-2020 ETH Zurich, University of Bologna + * Sergio Mazzola + * Luca Codeluppi + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. 
You may obtain a copy of the License at @@ -13,11 +16,11 @@ */ /** - * Latency monitor - * Tracks request-to-grant latency and transaction counters for all masters + * Request-to-grant monitor + * Tracks request-to-grant stall latency and transaction counters for all masters */ -module latency_monitor #( +module req_gnt_monitor #( parameter int unsigned N_MASTER = 4, parameter int unsigned N_HWPE = 1 ) ( diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index 420f015..78be03d 100644 --- a/target/verif/src/simulation_report.sv +++ b/target/verif/src/simulation_report.sv @@ -1,7 +1,10 @@ /* * simulation_report.sv * - * Copyright (C) 2026 ETH Zurich, University of Bologna + * Sergio Mazzola + * Luca Codeluppi + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 80966b5..d95ce3b 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -4,8 +4,7 @@ * Sergio Mazzola * Luca Codeluppi * - * - * Copyright (C) 2019-2025 ETH Zurich, University of Bologna + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. 
You may obtain a copy of the License at @@ -394,13 +393,13 @@ module tb_hci real throughput_completed; real tot_latency; - throughput_monitor #( + bandwidth_monitor #( .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE), .CLK_PERIOD(CLK_PERIOD), .DATA_WIDTH(DATA_WIDTH), .HWPE_WIDTH_FACT(HWPE_WIDTH_FACT) - ) i_throughput_monitor ( + ) i_bandwidth_monitor ( .clk_i(clk), .rst_ni(rst_n), .end_resp_i(s_end_resp), @@ -413,10 +412,10 @@ module tb_hci .latency_per_master_o(latency_per_master) ); - latency_monitor #( + req_gnt_monitor #( .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE) - ) i_latency_monitor ( + ) i_req_gnt_monitor ( .clk_i(clk), .rst_ni(rst_n), .hci_driver_log_if(hci_driver_log_if), diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index 0addc97..47c5d7b 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -4,8 +4,7 @@ * Sergio Mazzola * Luca Codeluppi * - * - * Copyright (C) 2019-2025 ETH Zurich, University of Bologna + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at diff --git a/target/verif/verif.mk b/target/verif/verif.mk index 2285537..b94a3b4 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. 
# SPDX-License-Identifier: SHL-0.51 # From b4d59303f5f2d6bcac934b7f5223f0172b11f81d Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 9 Mar 2026 19:47:38 +0100 Subject: [PATCH 18/25] tb: Add support for multiple patterns per master, in different phases --- target/verif/README.md | 4 +- target/verif/config/hardware.json | 6 +- target/verif/config/workload.json | 538 ++++++++++++++++-- target/verif/config/workload.schema.json | 77 ++- .../verif/simvectors/hci_stimuli/generator.py | 4 + .../verif/simvectors/hci_stimuli/patterns.py | 512 +++++------------ target/verif/simvectors/main.py | 301 +++++++--- target/verif/src/application_driver.sv | 296 +++++----- target/verif/src/tb_hci.sv | 137 +++-- target/verif/src/tb_hci_pkg.sv | 23 +- target/verif/verif.mk | 14 +- target/verif/vsim/tb_hci.tcl | 116 +++- 12 files changed, 1315 insertions(+), 713 deletions(-) diff --git a/target/verif/README.md b/target/verif/README.md index 20db9b2..cd7c03f 100644 --- a/target/verif/README.md +++ b/target/verif/README.md @@ -107,7 +107,7 @@ Masters can be assigned to named **phases** and can declare **dependencies** on ### Mechanism -`main.py` reads the `phase`/`wait_for` fields and computes a **wait mask** per driver: a bitmask of width `N_DRIVERS` where bit `j` is set if this driver must wait for driver `j`'s `end_req_o`. The masks are encoded as a SV unpacked array literal in `config/generated/wait_masks.mk` and passed to the simulator as the `WAIT_MASKS_PARAM` define. +`main.py` reads the `phase`/`wait_for` fields and computes a **wait mask** per driver: a bitmask of width `N_DRIVERS` where bit `j` is set if this driver must wait for driver `j`'s `end_req_o`. The masks are encoded as a SV unpacked array literal in `config/generated/fence_masks.mk` and passed to the simulator as the `WAIT_MASKS_PARAM` define. In `tb_hci_pkg.sv`, `WAIT_MASKS[i]` holds driver `i`'s mask. 
In `tb_hci.sv`, each driver's `clear_i` is combinationally held high until all drivers in its mask have asserted `end_req_o`: @@ -170,7 +170,7 @@ This allows verifying that regions are correctly partitioned and identifying any | Target | Action | |--------|--------| | `make config-verif` | Generate `hardware.mk`, `testbench.mk` from JSON | -| `make stim-verif` | Generate stimuli and `wait_masks.mk` from workload/hardware/testbench JSON | +| `make stim-verif` | Generate stimuli and `fence_masks.mk` from workload/hardware/testbench JSON | | `make compile-verif` | Compile RTL and testbench with QuestaSim | | `make opt-verif` | Optimize compiled design | | `make run-verif` | Run simulation | diff --git a/target/verif/config/hardware.json b/target/verif/config/hardware.json index c8db224..b349a65 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -1,14 +1,14 @@ { "description": "Hardware configuration parameters for HCI interconnect", "parameters": { - "N_HWPE": 3, + "N_HWPE": 2, "HWPE_WIDTH_FACT": 8, "N_CORE": 8, "N_DMA": 0, "N_EXT": 1, "DATA_WIDTH": 32, - "TOT_MEM_SIZE": 128, - "N_BANKS": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, "INTERCO_TYPE": "HCI", "TS_BIT": 21, "EXPFIFO": 0, diff --git a/target/verif/config/workload.json b/target/verif/config/workload.json index e094564..6c0d227 100644 --- a/target/verif/config/workload.json +++ b/target/verif/config/workload.json @@ -1,58 +1,508 @@ { - "description": "8 cores at 50% traffic in dedicated regions + 3 HWPEs: DMA imports tile B, GEMM reads tile B and writes output, Transpose reads GEMM output and writes transposed result. DMA and GEMM run concurrently (wave0), Transpose waits for wave0.", + "description": "MobileViT-inspired materialized attention kernel. d_model=192, num_heads=4, d_head=48, N=256 tokens, B=128 (2 tiles). One head at a time. Static resident: K^T_head (0x00000, 16kB), V_head (0x04000, 16kB). 
Ping-pong workspaces: Buf0={Q0 (0x08000,8kB), A0/P0 (0x0A000,64kB), O0 (0x1A000,8kB)}, Buf1={Q1 (0x1C000,8kB), A1/P1 (0x1E000,64kB), O1 (0x2E000,8kB)}. Pipeline: DMA preloads all tensors (dma_setup); GEMM computes Q*K^T->A (gemm_qk_t0); 8 cores softmax A->P in-place (softmax_t0); GEMM computes P*V->O (gemm_pv_t0); 8 cores norm O in-place (norm_t0); repeat for tile 1. Single GEMM engine serializes all 4 GEMM ops.", "log_masters": [ - { "id": 0, "description": "Core 0", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00000000", "region_base_address": "0x00000000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 1, "description": "Core 1", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00001000", "region_base_address": "0x00001000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 2, "description": "Core 2", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00002000", "region_base_address": "0x00002000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 3, "description": "Core 3", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00003000", "region_base_address": "0x00003000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 4, "description": "Core 4", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00004000", "region_base_address": "0x00004000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 5, "description": "Core 5", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00005000", "region_base_address": "0x00005000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 6, "description": "Core 6", "mem_access_type": "linear", "n_transactions": 2000, 
"stride0": 1, "start_address": "0x00006000", "region_base_address": "0x00006000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 7, "description": "Core 7", "mem_access_type": "linear", "n_transactions": 2000, "stride0": 1, "start_address": "0x00007000", "region_base_address": "0x00007000", "region_size_bytes": 4096, "traffic_pct": 50, "traffic_read_pct": 50 }, - { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } - ], - - "hwpe_masters": [ { "id": 0, - "description": "DMA — wide tile import into buffer B (0x08000–0x0FFFF)", - "mem_access_type": "random", - "n_transactions": 1000, - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x00008000", - "region_size_bytes": 32768, - "phase": "hwpe_wave0" + "description": "Core 0", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] }, { 
"id": 1, - "description": "GEMM — reads tile B from DMA buffer (0x08000–0x0FFFF), writes output to buffer C (0x10000–0x17FFF)", - "mem_access_type": "matmul_phased", - "n_transactions": 1000, - "region_base_address_a": "0x00008000", - "region_size_bytes_a": 32768, - "matmul_ratio_b": 0, - "region_base_address_c": "0x00010000", - "region_size_bytes_c": 32768, - "matmul_ratio_a": 1, - "matmul_ratio_c": 1, - "phase": "hwpe_wave0" + "description": "Core 1", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] }, { "id": 2, - "description": "Transpose engine — reads GEMM output (0x10000–0x17FFF), writes transposed result (0x18000–0x1FFFF)", - "mem_access_type": "matmul_phased", - "n_transactions": 1000, - "region_base_address_a": "0x00010000", - "region_size_bytes_a": 32768, - "matmul_ratio_b": 0, - "region_base_address_c": "0x00018000", - "region_size_bytes_c": 
32768, - "matmul_ratio_a": 1, - "matmul_ratio_c": 1, - "phase": "hwpe_wave1", - "wait_for": ["hwpe_wave0"] + "description": "Core 2", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 3, + "description": "Core 3", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: 
read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 4, + "description": "Core 4", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 5, + "description": "Core 5", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + 
"n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 6, + "description": "Core 6", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 
in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 7, + "description": "Core 7", + "patterns": [ + { + "description": "softmax tile 0: read+write A0/P0 in-place", + "mem_access_type": "random", + "phase": "softmax_t0", + "wait_for": ["gemm_qk_t0"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x0A000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 0: read+write O0 in-place", + "mem_access_type": "random", + "phase": "norm_t0", + "wait_for": ["gemm_pv_t0"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1A000", + "region_size_bytes": 8192 + }, + { + "description": "softmax tile 1: read+write A1/P1 in-place", + "mem_access_type": "random", + "phase": "softmax_t1", + "wait_for": ["gemm_qk_t1"], + "n_transactions": 2048, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x1E000", + "region_size_bytes": 65536 + }, + { + "description": "norm tile 1: read+write O1 in-place", + "mem_access_type": "random", + "phase": "norm_t1", + "wait_for": ["gemm_pv_t1"], + "n_transactions": 384, + "traffic_pct": 75, + "traffic_read_pct": 50, + "region_base_address": "0x2E000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 8, + "description": "External 0 (idle)", + "mem_access_type": "idle" + } + ], + + "hwpe_masters": [ + { + "id": 0, + "description": "DMA: preload K^T_head, V_head, Q0, Q1 (all writes, one-shot)", + "patterns": [ + { + "description": "Preload K^T_head (0x00000, 12kB active / 16kB padded), V_head (0x04000), Q0 (0x08000), Q1 (0x1C000)", + "mem_access_type": "random", + "phase": "dma_setup", + "n_transactions": 9216, + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + 
"region_size_bytes": 122880 + } + ] + }, + { + "id": 1, + "description": "GEMM engine: 4 sequential ops (QK_t0, PV_t0, QK_t1, PV_t1)", + "patterns": [ + { + "description": "QK tile 0: read Q0 [128x48 int8], write A0 [128x256 int16]", + "mem_access_type": "matmul_phased", + "phase": "gemm_qk_t0", + "wait_for": ["dma_setup"], + "matrix_m": 128, + "matrix_n": 256, + "matrix_k": 48, + "region_base_address_a": "0x08000", + "region_size_bytes_a": 8192, + "matmul_ratio_a": 1, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 16384, + "matmul_ratio_b": 0, + "region_base_address_c": "0x0A000", + "region_size_bytes_c": 65536, + "matmul_ratio_c": 4 + }, + { + "description": "PV tile 0: read P0 [128x256 int16], write O0 [128x48 int8]", + "mem_access_type": "matmul_phased", + "phase": "gemm_pv_t0", + "wait_for": ["softmax_t0"], + "matrix_m": 128, + "matrix_n": 48, + "matrix_k": 256, + "region_base_address_a": "0x0A000", + "region_size_bytes_a": 65536, + "matmul_ratio_a": 4, + "region_base_address_b": "0x04000", + "region_size_bytes_b": 16384, + "matmul_ratio_b": 0, + "region_base_address_c": "0x1A000", + "region_size_bytes_c": 8192, + "matmul_ratio_c": 1 + }, + { + "description": "QK tile 1: read Q1 [128x48 int8], write A1 [128x256 int16]", + "mem_access_type": "matmul_phased", + "phase": "gemm_qk_t1", + "wait_for": ["norm_t0"], + "matrix_m": 128, + "matrix_n": 256, + "matrix_k": 48, + "region_base_address_a": "0x1C000", + "region_size_bytes_a": 8192, + "matmul_ratio_a": 1, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 16384, + "matmul_ratio_b": 0, + "region_base_address_c": "0x1E000", + "region_size_bytes_c": 65536, + "matmul_ratio_c": 4 + }, + { + "description": "PV tile 1: read P1 [128x256 int16], write O1 [128x48 int8]", + "mem_access_type": "matmul_phased", + "phase": "gemm_pv_t1", + "wait_for": ["softmax_t1"], + "matrix_m": 128, + "matrix_n": 48, + "matrix_k": 256, + "region_base_address_a": "0x1E000", + "region_size_bytes_a": 65536, + 
"matmul_ratio_a": 4, + "region_base_address_b": "0x04000", + "region_size_bytes_b": 16384, + "matmul_ratio_b": 0, + "region_base_address_c": "0x2E000", + "region_size_bytes_c": 8192, + "matmul_ratio_c": 1 + } + ] } ] } diff --git a/target/verif/config/workload.schema.json b/target/verif/config/workload.schema.json index 3b4c653..2140e37 100644 --- a/target/verif/config/workload.schema.json +++ b/target/verif/config/workload.schema.json @@ -24,48 +24,39 @@ }, "$defs": { - "master": { + + "pattern": { "type": "object", - "description": "Per-master stimulus configuration. The active pattern is selected by mem_access_type.", + "description": "A single traffic pattern segment. Multiple patterns on the same master are executed sequentially, separated by PAUSE fence tokens in the stimulus file.", "required": ["mem_access_type"], "unevaluatedProperties": false, "properties": { - "id": { - "type": "integer", - "description": "Optional consistency label. Mapping is always positional; a mismatch triggers a warning." - }, "description": { "type": "string", - "description": "Human-readable label shown in the memory-map report." + "description": "Human-readable label for this pattern segment, shown in the memory-map report." }, "mem_access_type": { "type": "string", "enum": ["idle", "random", "linear", "2d", "3d", "matmul_phased", "matmul"], - "description": "Access pattern selector. 'matmul' is an alias for 'matmul_phased'. Use traffic_pct/traffic_read_pct on 'random'/'linear' to model bus utilization." + "description": "Access pattern selector. 'matmul' is an alias for 'matmul_phased'." }, "phase": { "type": "string", "default": "default", - "description": "Phase group this master belongs to, referenced by other masters' wait_for." + "description": "Phase name for this pattern segment. Used by other patterns' wait_for to reference this segment. The master's phase identity is determined by its first pattern's phase." 
}, "wait_for": { "type": "array", "items": { "type": "string" }, "default": [], - "description": "Phase names this master waits for before its WAIT_MASK clears in the testbench." - }, - "start_delay_cycles": { - "type": "integer", - "minimum": 0, - "default": 0, - "description": "Number of idle cycles prepended to the processed stimuli file (static start delay)." + "description": "Phase names this pattern waits for before starting. Generates a PAUSE fence before this pattern segment and holds the driver until all referenced phases have advanced past the same fence level." }, "n_transactions": { "type": "integer", "minimum": 0, - "description": "Number of real memory accesses (idles are NOT counted). Mandatory for 'random'. For other patterns can be derived from geometry fields (see per-pattern rules)." + "description": "Number of real memory accesses (idles are NOT counted). Mandatory for 'random'. For other patterns can be derived from geometry fields." }, "region_base_address": { @@ -73,20 +64,20 @@ { "type": "integer", "minimum": 0 }, { "type": "string", "description": "Decimal, hex (0x...) or binary string." } ], - "description": "[random, matmul_phased] Base byte address of the memory region. Default: master_local_idx * (total_mem / n_peers_of_kind), aligned to access_bytes." + "description": "[random, matmul_phased] Base byte address of the memory region." }, "region_size_bytes": { "oneOf": [ { "type": "integer", "minimum": 0 }, { "type": "string", "description": "Decimal, hex (0x...) or binary string." } ], - "description": "[random, matmul_phased] Size in bytes of the memory region. Default: total_mem / n_peers_of_kind. Clamped so region does not exceed total memory." + "description": "[random, matmul_phased] Size in bytes of the memory region." }, "start_address": { "type": "string", "default": "0", - "description": "[linear, 2d, 3d] Start byte address as decimal, hex (0x...) or binary string." + "description": "[linear, 2d, 3d] Start byte address." 
}, "stride0": { "type": "integer", @@ -114,7 +105,7 @@ "len_d0": { "type": "integer", "minimum": 1, - "description": "[2d, 3d] Innermost dimension length. Can replace n_transactions together with len_d1 (and len_d2 for 3d)." + "description": "[2d, 3d] Innermost dimension length." }, "len_d1": { "type": "integer", @@ -130,7 +121,7 @@ "matrix_m": { "type": "integer", "minimum": 1, - "description": "[matmul_phased] Rows of A and C. Can replace n_transactions together with matrix_n and matrix_k (total = m*k + k*n + m*n)." + "description": "[matmul_phased] Rows of A and C." }, "matrix_n": { "type": "integer", @@ -144,7 +135,7 @@ }, "region_base_address_a": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], - "description": "[matmul_phased] Base address of the read-A region. If set together with region_size_bytes_a, overrides the auto-split for phase A." + "description": "[matmul_phased] Base address of the read-A region." }, "region_size_bytes_a": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], @@ -166,12 +157,11 @@ "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], "description": "[matmul_phased] Size in bytes of the write-C region." }, - "matmul_ratio_a": { "type": "integer", "minimum": 0, "default": 1, - "description": "[matmul_phased] Relative weight of the read-A phase in the transaction split." + "description": "[matmul_phased] Relative weight of the read-A phase." }, "matmul_ratio_b": { "type": "integer", @@ -191,19 +181,19 @@ "minimum": 1, "maximum": 100, "default": 100, - "description": "[random, linear] Bus utilization percentage. After each transaction emit floor((100-pct)/pct) idle cycles. E.g. 50 => 1 idle per transaction (req, idle, req, idle, ...). Default 100 = back-to-back." + "description": "[random, linear] Bus utilization percentage." }, "traffic_read_pct": { "type": "integer", "minimum": 0, "maximum": 100, - "description": "[random, linear] Percentage of accesses that are reads (wen=1). 
If omitted, wen is random per transaction. When set, all reads are emitted first, then all writes." + "description": "[random, linear] Percentage of accesses that are reads (wen=1)." }, "idle_cycles_between_phases": { "type": "integer", "minimum": 0, "default": 0, - "description": "[2d, 3d, matmul_phased] Number of req=0 cycles inserted at each phase boundary (between outer rows for 2d/3d, between read-A/read-B/write-C for matmul_phased). Models compute time between memory bursts." + "description": "[2d, 3d, matmul_phased] Idle cycles inserted at each inner phase boundary." } }, @@ -234,6 +224,37 @@ } } ] + }, + + "master": { + "type": "object", + "description": "Per-master configuration. A master either has a flat single-pattern config (legacy, backward compatible) or a 'patterns' list for multi-phase execution.", + "unevaluatedProperties": false, + "properties": { + "id": { + "type": "integer", + "description": "Optional consistency label. Mapping is always positional." + }, + "description": { + "type": "string", + "description": "Human-readable master label." + }, + "start_delay_cycles": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "Number of idle cycles prepended to the stimulus file before the first pattern." + }, + "patterns": { + "type": "array", + "minItems": 1, + "description": "Ordered list of pattern segments. Executed sequentially; a PAUSE fence token is inserted between each pair of consecutive patterns. The wait_for of pattern f defines which phases must have advanced past fence f before this master resumes. 
If omitted, the master config itself is treated as a single flat pattern (legacy format).", + "items": { "$ref": "#/$defs/pattern" } + } + }, + "allOf": [ + { "$ref": "#/$defs/pattern" } + ] } } } diff --git a/target/verif/simvectors/hci_stimuli/generator.py b/target/verif/simvectors/hci_stimuli/generator.py index aac6927..52993f5 100644 --- a/target/verif/simvectors/hci_stimuli/generator.py +++ b/target/verif/simvectors/hci_stimuli/generator.py @@ -71,6 +71,10 @@ def _write_idle(self, file_obj): + "\n" ) + def _write_pause(self, file_obj): + """Write a PAUSE fence token line.""" + file_obj.write("PAUSE\n") + def data_wen(self): wen = random.randint(0, 1) # 1=read, 0=write if wen: diff --git a/target/verif/simvectors/hci_stimuli/patterns.py b/target/verif/simvectors/hci_stimuli/patterns.py index 987e526..063c008 100644 --- a/target/verif/simvectors/hci_stimuli/patterns.py +++ b/target/verif/simvectors/hci_stimuli/patterns.py @@ -4,438 +4,240 @@ req(1b) id(IWb) wen(1b) data(Nb) add(Ab) req=0 lines are idle cycles. req=1 lines are active transactions. -There is no intermediate raw format or cycle_offset expansion step. -Patterns: - - random: uniformly random addresses, random read/write mix. - Optional: traffic_pct / traffic_read_pct for uniform idle interleaving. - - linear: strided 1-D scan. - Optional: traffic_pct / traffic_read_pct for uniform idle interleaving. - - 2d: strided 2-D scan (inner stride0, outer stride1). - Optional: idle_cycles_between_phases inserted after each outer row. - - 3d: strided 3-D scan (stride0, stride1, stride2). - Optional: idle_cycles_between_phases inserted after each mid-level block. - - idle: no transactions (single req=0 line). - - matmul_phased: phased traffic modelling matrix multiply - (read-A phase, read-B phase, write-C phase). - Optional: idle_cycles_between_phases inserted between phases. +Fence semantics (one trailing PAUSE per pattern): + Each pattern ends with a PAUSE. 
fence_idx[i] increments when resume_i fires while + fence_reached_o is high (i.e. while the driver is sitting at the PAUSE). + fence_idx[i] >= k means driver i has been granted to leave fence k-1, + i.e. pattern k-1 is complete and driver i is about to start pattern k. -traffic_pct semantics (random, linear): - After every transaction, emit floor((100 - traffic_pct) / traffic_pct) idle cycles. - traffic_pct=50 => 1 idle per transaction (req, idle, req, idle, ...) - traffic_pct=100 => no idles (back-to-back) + resume_i fires when the dependencies of the NEXT pattern are satisfied. + So resume_i = "your next job's inputs are ready, proceed". -traffic_read_pct semantics (random, linear): - Fraction of transactions that are reads (wen=1); remainder are writes (wen=0). - Only meaningful when traffic_pct < 100 or as override of the default random mix. - When set, wen is no longer random: the first n_reads transactions are reads, - the rest are writes (pattern: all reads first, then all writes). + Trailing PAUSE of the last pattern has mask=0 → resume_i fires in one cycle + → fence_idx advances to N_patterns, signalling final completion to dependents. -idle_cycles_between_phases semantics (2d, 3d, matmul_phased): - Number of req=0 cycles inserted at each phase boundary (between outer rows for - 2d/3d, between read-A/read-B/write-C for matmul_phased). Models compute time - between bursts of memory accesses. - -All patterns respect the cross-master forbidden address lists: - - forbidden_write: addresses already read -> no later master may write here - - forbidden_read: addresses already written -> no later master may read/write here +All generators accept append=True to open the file in append mode. 
""" import random class PatternsMixin: - """Mixin providing memory access pattern generators.""" - - # ------------------------------------------------------------------ # - # Shared helpers # - # ------------------------------------------------------------------ # @staticmethod def _parse_address(addr_str): - """Parse a binary, hex (0x...), or decimal string to int.""" s = str(addr_str) - if s.startswith('0x') or s.startswith('0X'): - return int(s, 16) - if set(s) <= {'0', '1'}: - return int(s, 2) + if s.startswith('0x') or s.startswith('0X'): return int(s, 16) + if set(s) <= {'0', '1'}: return int(s, 2) return int(s, 0) @staticmethod def _align_down(value, alignment): - if alignment <= 0: - return value + if alignment <= 0: return value return (value // alignment) * alignment @staticmethod def _phase_counts(total_ops, ratio_a, ratio_b, ratio_c): - """Split total_ops into three phases according to ratios.""" - ra = max(0, int(ratio_a)) - rb = max(0, int(ratio_b)) - rc = max(0, int(ratio_c)) - if ra == 0 and rb == 0 and rc == 0: - ra, rb, rc = 1, 1, 1 - rsum = ra + rb + rc - cnt_a = (total_ops * ra) // rsum - cnt_b = (total_ops * rb) // rsum - cnt_c = total_ops - cnt_a - cnt_b - + ra = max(0, int(ratio_a)); rb = max(0, int(ratio_b)); rc = max(0, int(ratio_c)) + if ra == 0 and rb == 0 and rc == 0: ra, rb, rc = 1, 1, 1 + s = ra + rb + rc + ca = (total_ops * ra) // s; cb = (total_ops * rb) // s; cc = total_ops - ca - cb if total_ops >= 3: - if ra > 0 and cnt_a == 0: - cnt_a = 1 - cnt_c = max(0, cnt_c - 1) - if rb > 0 and cnt_b == 0: - cnt_b = 1 - cnt_c = max(0, cnt_c - 1) - if rc > 0 and cnt_c == 0: - if cnt_a > 1: - cnt_a -= 1 - cnt_c = 1 - elif cnt_b > 1: - cnt_b -= 1 - cnt_c = 1 - return cnt_a, cnt_b, cnt_c + if ra > 0 and ca == 0: ca = 1; cc = max(0, cc-1) + if rb > 0 and cb == 0: cb = 1; cc = max(0, cc-1) + if rc > 0 and cc == 0: + if ca > 1: ca -= 1; cc = 1 + elif cb > 1: cb -= 1; cc = 1 + return ca, cb, cc @staticmethod def _idles_per_req(traffic_pct): - 
"""Number of idle cycles to emit after each transaction for a given traffic_pct.""" traffic_pct = max(1, min(100, int(traffic_pct))) - if traffic_pct >= 100: - return 0 - return int(round((100 - traffic_pct) / traffic_pct)) + return 0 if traffic_pct >= 100 else int(round((100 - traffic_pct) / traffic_pct)) + + def _is_allowed(self, add, wen, fr, fw): + return add not in (fr if wen else fw) - def _is_allowed(self, add, wen, forbidden_read, forbidden_write): - """Return True if the address is not in the relevant forbidden list.""" - return add not in (forbidden_read if wen else forbidden_write) + def _record_access(self, add, wen, fr_new, fw_new): + fw_new.append(add) + if not wen: fr_new.append(add) - def _record_access(self, add, wen, forbidden_read_new, forbidden_write_new): - """Mark add as used by this master.""" - forbidden_write_new.append(add) - if not wen: - forbidden_read_new.append(add) + def _open(self, append): + return open(self.filepath, "a" if append else "w", encoding="ascii") # ------------------------------------------------------------------ # - # Access patterns # + # Access patterns — each writes: transactions | PAUSE # # ------------------------------------------------------------------ # - def random_gen( - self, - id_start, - forbidden_read, - forbidden_write, - region_base=0, - region_size=None, - traffic_pct=100, - traffic_read_pct=None, - ): - """Uniformly random addresses within [region_base, region_base + region_size). - - traffic_pct: bus utilisation percentage. After each transaction, - floor((100 - traffic_pct) / traffic_pct) idle cycles are emitted. - Default 100 = back-to-back (no idles). - - traffic_read_pct: if set, the first n_reads transactions are reads and - the rest are writes. If not set, wen is random per transaction. 
- """ - total_mem_bytes = int(self.TOT_MEM_SIZE * 1024) - if region_size is None: - region_size = total_mem_bytes - region_size = min(region_size, total_mem_bytes - region_base) + def random_gen(self, id_start, forbidden_read, forbidden_write, + region_base=0, region_size=None, traffic_pct=100, + traffic_read_pct=None, append=False): + total = int(self.TOT_MEM_SIZE * 1024) + if region_size is None: region_size = total + region_size = min(region_size, total - region_base) n_words = max(1, region_size // self.WIDTH_OF_MEMORY_BYTE) n_idles = self._idles_per_req(traffic_pct) - - # Build wen sequence if traffic_read_pct is not None: rpct = max(0, min(100, int(traffic_read_pct))) n_reads = (self.N_TEST * rpct) // 100 - wen_seq = [1] * n_reads + [0] * (self.N_TEST - n_reads) + wen_seq = [1]*n_reads + [0]*(self.N_TEST-n_reads) else: wen_seq = None - - id_value = id_start - forbidden_read_new = [] - forbidden_write_new = [] - with open(self.filepath, "w", encoding="ascii") as file: + id_value = id_start; fr_new, fw_new = [], [] + with self._open(append) as f: for i in range(self.N_TEST): - if wen_seq is not None: - wen = wen_seq[i] - data = "0" * self.DATA_WIDTH if wen else self.random_data() - else: - data, wen = self.data_wen() + wen = wen_seq[i] if wen_seq is not None else None + if wen is None: data, wen = self.data_wen() + else: data = "0"*self.DATA_WIDTH if wen else self.random_data() while True: - add_decimal = region_base + random.randint(0, int(n_words) - 1) * self.WIDTH_OF_MEMORY_BYTE - add = bin(int(add_decimal))[2:].zfill(self.ADD_WIDTH) + ad = region_base + random.randint(0, int(n_words)-1)*self.WIDTH_OF_MEMORY_BYTE + add = bin(int(ad))[2:].zfill(self.ADD_WIDTH) if self._is_allowed(add, wen, forbidden_read, forbidden_write): - self._record_access(add, wen, forbidden_read_new, forbidden_write_new) - break - self._write_req(file, id_value, wen, data, add) - id_value += 1 - for _ in range(n_idles): - self._write_idle(file) - 
forbidden_read.extend(forbidden_read_new) - forbidden_write.extend(forbidden_write_new) + self._record_access(add, wen, fr_new, fw_new); break + self._write_req(f, id_value, wen, data, add); id_value += 1 + for _ in range(n_idles): self._write_idle(f) + self._write_pause(f) + forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) return id_value - def linear_gen( - self, - stride0, - start_address, - id_start, - forbidden_read, - forbidden_write, - traffic_pct=100, - traffic_read_pct=None, - ): - """Strided 1-D linear scan. Forbidden addresses are skipped (no idle inserted). - - traffic_pct / traffic_read_pct: see random_gen. - """ + def linear_gen(self, stride0, start_address, id_start, forbidden_read, forbidden_write, + traffic_pct=100, traffic_read_pct=None, append=False): n_idles = self._idles_per_req(traffic_pct) - if traffic_read_pct is not None: rpct = max(0, min(100, int(traffic_read_pct))) n_reads = (self.N_TEST * rpct) // 100 - wen_seq = [1] * n_reads + [0] * (self.N_TEST - n_reads) + wen_seq = [1]*n_reads + [0]*(self.N_TEST-n_reads) else: wen_seq = None - - id_value = id_start - forbidden_read_new = [] - forbidden_write_new = [] - seq_idx = 0 - with open(self.filepath, "w", encoding="ascii") as file: - next_address = self._parse_address(start_address) - if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: - next_address -= self.TOT_MEM_SIZE * 1024 + id_value = id_start; fr_new, fw_new = [], [] + with self._open(append) as f: + addr = self._parse_address(start_address) + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 for i in range(self.N_TEST): - if wen_seq is not None: - wen = wen_seq[i] - data = "0" * self.DATA_WIDTH if wen else self.random_data() - else: - data, wen = self.data_wen() - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - next_address += self.WIDTH_OF_MEMORY_BYTE * stride0 - if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: - next_address -= 
self.TOT_MEM_SIZE * 1024 - if not self._is_allowed(add, wen, forbidden_read, forbidden_write): - continue - self._record_access(add, wen, forbidden_read_new, forbidden_write_new) - self._write_req(file, id_value, wen, data, add) - id_value += 1 - for _ in range(n_idles): - self._write_idle(file) - forbidden_read.extend(forbidden_read_new) - forbidden_write.extend(forbidden_write_new) + wen = wen_seq[i] if wen_seq is not None else None + if wen is None: data, wen = self.data_wen() + else: data = "0"*self.DATA_WIDTH if wen else self.random_data() + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + addr += self.WIDTH_OF_MEMORY_BYTE * stride0 + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + if not self._is_allowed(add, wen, forbidden_read, forbidden_write): continue + self._record_access(add, wen, fr_new, fw_new) + self._write_req(f, id_value, wen, data, add); id_value += 1 + for _ in range(n_idles): self._write_idle(f) + self._write_pause(f) + forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) return id_value - def gen_2d( - self, - stride0, - len_d0, - stride1, - start_address, - id_start, - forbidden_read, - forbidden_write, - idle_cycles_between_phases=0, - ): - """Strided 2-D scan: inner loop stride0/len_d0, outer loop stride1. - - idle_cycles_between_phases: req=0 cycles inserted after each completed inner row. 
- """ - id_value = id_start - forbidden_read_new = [] - forbidden_write_new = [] - with open(self.filepath, "w", encoding="ascii") as file: - start_address_int = self._parse_address(start_address) - j = 0 + def gen_2d(self, stride0, len_d0, stride1, start_address, id_start, + forbidden_read, forbidden_write, idle_cycles_between_phases=0, append=False): + id_value = id_start; fr_new, fw_new = [], [] + with self._open(append) as f: + base = self._parse_address(start_address); j = 0 while id_value - id_start < self.N_TEST: for i in range(len_d0): data, wen = self.data_wen() - next_address = ( - start_address_int - + i * self.WIDTH_OF_MEMORY_BYTE * stride0 - + j * self.WIDTH_OF_MEMORY_BYTE * stride1 - ) - if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: - next_address -= self.TOT_MEM_SIZE * 1024 - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - if not self._is_allowed(add, wen, forbidden_read, forbidden_write): - continue - self._record_access(add, wen, forbidden_read_new, forbidden_write_new) - self._write_req(file, id_value, wen, data, add) - id_value += 1 - if id_value - id_start >= self.N_TEST: - break - for _ in range(idle_cycles_between_phases): - self._write_idle(file) + addr = base + i*self.WIDTH_OF_MEMORY_BYTE*stride0 + j*self.WIDTH_OF_MEMORY_BYTE*stride1 + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, forbidden_read, forbidden_write): continue + self._record_access(add, wen, fr_new, fw_new) + self._write_req(f, id_value, wen, data, add); id_value += 1 + if id_value - id_start >= self.N_TEST: break + for _ in range(idle_cycles_between_phases): self._write_idle(f) j += 1 - forbidden_read.extend(forbidden_read_new) - forbidden_write.extend(forbidden_write_new) + self._write_pause(f) + forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) return id_value - def gen_3d( - self, - stride0, - len_d0, - 
stride1, - len_d1, - stride2, - start_address, - id_start, - forbidden_read, - forbidden_write, - idle_cycles_between_phases=0, - ): - """Strided 3-D scan: inner stride0/len_d0, mid stride1/len_d1, outer stride2. - - idle_cycles_between_phases: req=0 cycles inserted after each completed mid-level block. - """ - id_value = id_start - forbidden_read_new = [] - forbidden_write_new = [] - with open(self.filepath, "w", encoding="ascii") as file: - start_address_int = self._parse_address(start_address) - k = 0 + def gen_3d(self, stride0, len_d0, stride1, len_d1, stride2, start_address, id_start, + forbidden_read, forbidden_write, idle_cycles_between_phases=0, append=False): + id_value = id_start; fr_new, fw_new = [], [] + with self._open(append) as f: + base = self._parse_address(start_address); k = 0 while id_value - id_start < self.N_TEST: for j in range(len_d1): for i in range(len_d0): data, wen = self.data_wen() - next_address = ( - start_address_int - + i * self.WIDTH_OF_MEMORY_BYTE * stride0 - + j * self.WIDTH_OF_MEMORY_BYTE * stride1 - + k * self.WIDTH_OF_MEMORY_BYTE * stride2 - ) - if next_address > self.TOT_MEM_SIZE * 1024 - self.WIDTH_OF_MEMORY_BYTE: - next_address -= self.TOT_MEM_SIZE * 1024 - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - if not self._is_allowed(add, wen, forbidden_read, forbidden_write): - continue - self._record_access(add, wen, forbidden_read_new, forbidden_write_new) - self._write_req(file, id_value, wen, data, add) - id_value += 1 - if id_value - id_start >= self.N_TEST: - break - if id_value - id_start >= self.N_TEST: - break - for _ in range(idle_cycles_between_phases): - self._write_idle(file) + addr = base + i*self.WIDTH_OF_MEMORY_BYTE*stride0 + j*self.WIDTH_OF_MEMORY_BYTE*stride1 + k*self.WIDTH_OF_MEMORY_BYTE*stride2 + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, forbidden_read, forbidden_write): 
continue + self._record_access(add, wen, fr_new, fw_new) + self._write_req(f, id_value, wen, data, add); id_value += 1 + if id_value - id_start >= self.N_TEST: break + if id_value - id_start >= self.N_TEST: break + for _ in range(idle_cycles_between_phases): self._write_idle(f) k += 1 - forbidden_read.extend(forbidden_read_new) - forbidden_write.extend(forbidden_write_new) + self._write_pause(f) + forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) return id_value - def idle_gen(self, id_start): - """Emit a single req=0 idle line (master never issues transactions).""" - with open(self.filepath, "w", encoding="ascii") as file: - self._write_idle(file) + def idle_gen(self, id_start, append=False): + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) return id_start - def matmul_phased_gen( - self, - id_start, - forbidden_read, - forbidden_write, - region_base_address, - region_size_bytes, - matmul_ratio_a=1, - matmul_ratio_b=1, - matmul_ratio_c=1, - idle_cycles_between_phases=0, - region_base_address_a=None, - region_size_bytes_a=None, - region_base_address_b=None, - region_size_bytes_b=None, - region_base_address_c=None, - region_size_bytes_c=None, - ): - """Phased traffic modelling matrix multiply: read-A, read-B, write-C. - - Each phase reads/writes its own sub-region. If region_base_address_a/b/c - and region_size_bytes_a/b/c are provided, they are used directly for each - phase. Otherwise the combined region [region_base_address, +region_size_bytes) - is split into equal thirds automatically. - - idle_cycles_between_phases: req=0 cycles inserted between each pair of - active phases (after read-A and after read-B, if those phases are non-empty). - Models computation time between memory bursts. 
- """ - id_value = id_start - forbidden_read_new = [] - forbidden_write_new = [] - access_bytes = max(1, self.DATA_WIDTH // 8) - total_mem_bytes = int(self.TOT_MEM_SIZE * 1024) - - def _resolve_region(base_override, size_override, fallback_base, fallback_size): - if base_override is not None and size_override is not None: - b = self._align_down(int(base_override), access_bytes) - s = self._align_down(int(size_override), access_bytes) - else: - b = self._align_down(int(fallback_base), access_bytes) - s = self._align_down(int(fallback_size), access_bytes) - if b + s > total_mem_bytes: - s = self._align_down(total_mem_bytes - b, access_bytes) + def matmul_phased_gen(self, id_start, forbidden_read, forbidden_write, + region_base_address, region_size_bytes, + matmul_ratio_a=1, matmul_ratio_b=1, matmul_ratio_c=1, + idle_cycles_between_phases=0, + region_base_address_a=None, region_size_bytes_a=None, + region_base_address_b=None, region_size_bytes_b=None, + region_base_address_c=None, region_size_bytes_c=None, + append=False): + id_value = id_start; fr_new, fw_new = [], [] + ab = max(1, self.DATA_WIDTH // 8); tm = int(self.TOT_MEM_SIZE * 1024) + + def _res(bo, so, fb, fs): + b = self._align_down(int(bo if bo is not None else fb), ab) + s = self._align_down(int(so if so is not None else fs), ab) + if b+s > tm: s = self._align_down(tm-b, ab) return b, s - # If per-phase regions are explicitly provided, use them directly if region_base_address_a is not None and region_size_bytes_a is not None: - a_base, a_size = _resolve_region(region_base_address_a, region_size_bytes_a, 0, 0) - b_base, b_size = _resolve_region(region_base_address_b, region_size_bytes_b, a_base, a_size) - c_base, c_size = _resolve_region(region_base_address_c, region_size_bytes_c, a_base, a_size) + a_base, a_size = _res(region_base_address_a, region_size_bytes_a, 0, 0) + b_base, b_size = _res(region_base_address_b, region_size_bytes_b, a_base, a_size) + c_base, c_size = _res(region_base_address_c, 
region_size_bytes_c, a_base, a_size) else: - # Auto-split combined region into thirds - base = self._align_down(int(region_base_address), access_bytes) - size = self._align_down(int(region_size_bytes), access_bytes) - if base + size > total_mem_bytes: - size = self._align_down(total_mem_bytes - base, access_bytes) - region_words = size // access_bytes - if region_words < 3: - with open(self.filepath, "w", encoding="ascii") as file: - self._write_idle(file) + base = self._align_down(int(region_base_address), ab) + size = self._align_down(int(region_size_bytes), ab) + if base+size > tm: size = self._align_down(tm-base, ab) + rw = size // ab + if rw < 3: + with self._open(append) as f: self._write_idle(f); self._write_pause(f) return id_value - a_words = max(1, region_words // 3) - b_words = max(1, region_words // 3) - c_words = region_words - a_words - b_words - a_base = base; a_size = a_words * access_bytes - b_base = a_base + a_size; b_size = b_words * access_bytes - c_base = b_base + b_size; c_size = c_words * access_bytes + aw=max(1,rw//3); bw=max(1,rw//3); cw=rw-aw-bw + a_base=base; a_size=aw*ab; b_base=a_base+a_size; b_size=bw*ab + c_base=b_base+b_size; c_size=cw*ab - a_end = a_base + a_size - b_end = b_base + b_size - c_end = c_base + c_size + ca, cb, cc = self._phase_counts(self.N_TEST, matmul_ratio_a, matmul_ratio_b, matmul_ratio_c) - cnt_a, cnt_b, cnt_c = self._phase_counts( - self.N_TEST, - int(matmul_ratio_a), - int(matmul_ratio_b), - int(matmul_ratio_c), - ) - - def _emit_phase(file_obj, count, wen, phase_base, phase_end): + def _emit(fobj, count, wen, pb, pe): nonlocal id_value - addr = phase_base + addr = pb for _ in range(count): - data = "0" * self.DATA_WIDTH if wen else self.random_data() + data = "0"*self.DATA_WIDTH if wen else self.random_data() add = bin(addr)[2:].zfill(self.ADD_WIDTH) - self._write_req(file_obj, id_value, wen, data, add) - self._record_access(add, wen, forbidden_read_new, forbidden_write_new) - id_value += 1 - addr += 
access_bytes - if addr >= phase_end: - addr = phase_base - - with open(self.filepath, "w", encoding="ascii") as file: - _emit_phase(file, cnt_a, 1, a_base, a_end) # read A - if cnt_a > 0 and (cnt_b > 0 or cnt_c > 0): - for _ in range(idle_cycles_between_phases): - self._write_idle(file) - _emit_phase(file, cnt_b, 1, b_base, b_end) # read B - if cnt_b > 0 and cnt_c > 0: - for _ in range(idle_cycles_between_phases): - self._write_idle(file) - _emit_phase(file, cnt_c, 0, c_base, c_end) # write C - - forbidden_read.extend(forbidden_read_new) - forbidden_write.extend(forbidden_write_new) + self._write_req(fobj, id_value, wen, data, add) + self._record_access(add, wen, fr_new, fw_new) + id_value += 1; addr += ab + if addr >= pe: addr = pb + + with self._open(append) as f: + _emit(f, ca, 1, a_base, a_base+a_size) + if ca > 0 and (cb > 0 or cc > 0): + for _ in range(idle_cycles_between_phases): self._write_idle(f) + _emit(f, cb, 1, b_base, b_base+b_size) + if cb > 0 and cc > 0: + for _ in range(idle_cycles_between_phases): self._write_idle(f) + _emit(f, cc, 0, c_base, c_base+c_size) + self._write_pause(f) + + forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) return id_value diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index 54ff02b..c6a0eae 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -327,15 +327,18 @@ def _resolve_n_transactions(master_config: dict, mem_access_type: str, data_widt f"'n_transactions' and no geometry fields to derive it from.") sys.exit(1) - def _generate_master( + def _generate_pattern( filepath: Path, - master_config: dict, + pattern_config: dict, *, is_hwpe: bool, master_global_idx: int, master_local_idx: int, n_peers_of_kind: int, + append: bool, ): + """Generate one pattern segment. append=True opens file in append mode. 
+ Every pattern always writes a trailing PAUSE (handled by the generator).""" nonlocal next_start_id data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH kind = 'master_hwpe' if is_hwpe else 'master_log' @@ -345,29 +348,28 @@ def _generate_master( str(filepath), 0, master_global_idx ) - if 'mem_access_type' not in master_config: - print(f"ERROR: {kind}_{master_local_idx} is missing mem_access_type.") + if 'mem_access_type' not in pattern_config: + print(f"ERROR: {kind}_{master_local_idx} pattern is missing mem_access_type.") sys.exit(1) config = _normalize_mem_access_type( - master_config['mem_access_type'], + pattern_config['mem_access_type'], f"{kind}_{master_local_idx}", ) - start_address = str(master_config.get('start_address', '0')) - stride0 = int(master_config.get('stride0', 0)) - len_d0 = int(master_config.get('len_d0', 0)) - stride1 = int(master_config.get('stride1', 0)) - len_d1 = int(master_config.get('len_d1', 0)) - stride2 = int(master_config.get('stride2', 0)) - - # Region parameters + start_address = str(pattern_config.get('start_address', '0')) + stride0 = int(pattern_config.get('stride0', 0)) + len_d0 = int(pattern_config.get('len_d0', 0)) + stride1 = int(pattern_config.get('stride1', 0)) + len_d1 = int(pattern_config.get('len_d1', 0)) + stride2 = int(pattern_config.get('stride2', 0)) + total_mem_bytes = int(TOT_MEM_SIZE * 1024) access_bytes = max(1, int(data_width // 8)) default_region_size = total_mem_bytes // max(1, n_peers_of_kind) default_region_base = master_local_idx * default_region_size - region_base = _parse_maybe_bin_int(master_config.get('region_base_address'), default_region_base) - region_size = _parse_maybe_bin_int(master_config.get('region_size_bytes'), default_region_size) + region_base = _parse_maybe_bin_int(pattern_config.get('region_base_address'), default_region_base) + region_size = _parse_maybe_bin_int(pattern_config.get('region_size_bytes'), default_region_size) region_base = (region_base // access_bytes) * 
access_bytes if region_base >= total_mem_bytes: @@ -378,16 +380,11 @@ def _generate_master( if region_base + region_size > total_mem_bytes: region_size = ((total_mem_bytes - region_base) // access_bytes) * access_bytes - n_test = _resolve_n_transactions(master_config, config, data_width, kind, master_local_idx) + n_test = _resolve_n_transactions(pattern_config, config, data_width, kind, master_local_idx) master.N_TEST = n_test - # Start delay: prepend idle lines directly to the stimuli file after generation - start_delay = int(master_config.get('start_delay_cycles', 0)) - if start_delay > 0: - pending_start_delays.append((filepath, start_delay, data_width)) - if config == 'random': - tpct = master_config.get('traffic_pct') + tpct = pattern_config.get('traffic_pct') next_start_id = master.random_gen( next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, @@ -395,33 +392,37 @@ def _generate_master( region_base=region_base, region_size=region_size, traffic_pct=int(tpct) if tpct is not None else 100, - traffic_read_pct=master_config.get('traffic_read_pct'), + traffic_read_pct=pattern_config.get('traffic_read_pct'), + append=append, ) elif config == 'linear': - tpct = master_config.get('traffic_pct') + tpct = pattern_config.get('traffic_pct') next_start_id = master.linear_gen( stride0, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE, traffic_pct=int(tpct) if tpct is not None else 100, - traffic_read_pct=master_config.get('traffic_read_pct'), + traffic_read_pct=pattern_config.get('traffic_read_pct'), + append=append, ) elif config == '2d': next_start_id = master.gen_2d( stride0, len_d0, stride1, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE, - idle_cycles_between_phases=int(master_config.get('idle_cycles_between_phases', 0)), + idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), + append=append, ) elif config == '3d': next_start_id = 
master.gen_3d( stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE, - idle_cycles_between_phases=int(master_config.get('idle_cycles_between_phases', 0)), + idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), + append=append, ) elif config == 'idle': - next_start_id = master.idle_gen(next_start_id) + next_start_id = master.idle_gen(next_start_id, append=append) elif config == 'matmul_phased': if not is_hwpe: print( @@ -441,27 +442,82 @@ def _generate_master( LIST_OF_FORBIDDEN_ADDRESSES_WRITE, region_base, region_size, - int(master_config.get('matmul_ratio_a', 1)), - int(master_config.get('matmul_ratio_b', 1)), - int(master_config.get('matmul_ratio_c', 1)), - idle_cycles_between_phases=int(master_config.get('idle_cycles_between_phases', 0)), - region_base_address_a=_parse_maybe_bin_int(master_config.get('region_base_address_a'), None), - region_size_bytes_a=_parse_maybe_bin_int(master_config.get('region_size_bytes_a'), None), - region_base_address_b=_parse_maybe_bin_int(master_config.get('region_base_address_b'), None), - region_size_bytes_b=_parse_maybe_bin_int(master_config.get('region_size_bytes_b'), None), - region_base_address_c=_parse_maybe_bin_int(master_config.get('region_base_address_c'), None), - region_size_bytes_c=_parse_maybe_bin_int(master_config.get('region_size_bytes_c'), None), + int(pattern_config.get('matmul_ratio_a', 1)), + int(pattern_config.get('matmul_ratio_b', 1)), + int(pattern_config.get('matmul_ratio_c', 1)), + idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), + region_base_address_a=_parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None), + region_size_bytes_a=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None), + region_base_address_b=_parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None), + 
region_size_bytes_b=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None), + region_base_address_c=_parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None), + region_size_bytes_c=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None), + append=append, ) _record_memory_map( kind, master_local_idx, - master_config.get('description', ''), + pattern_config.get('description', ''), config, n_test, data_width, access_bytes, region_base, region_size, start_address, stride0, len_d0, stride1, len_d1, stride2, - master_config, total_mem_bytes, + pattern_config, total_mem_bytes, ) + def _generate_master( + filepath: Path, + master_config: dict, + *, + is_hwpe: bool, + master_global_idx: int, + master_local_idx: int, + n_peers_of_kind: int, + ): + """Generate stimulus for a master, supporting single flat pattern or patterns list.""" + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + + # Resolve pattern list: either explicit 'patterns' list or a single flat pattern + if 'patterns' in master_config: + patterns = master_config['patterns'] + if not patterns: + kind = 'master_hwpe' if is_hwpe else 'master_log' + print(f"ERROR: {kind}_{master_local_idx} has empty patterns list.") + sys.exit(1) + else: + # Legacy flat format: treat the master config itself as a single pattern + patterns = [master_config] + + # Start delay applies to the whole master (prepended before first pattern) + start_delay = int(master_config.get('start_delay_cycles', 0)) + if start_delay > 0: + pending_start_delays.append((filepath, start_delay, data_width)) + + # For each pattern with wait_for, prepend a synthetic idle+PAUSE that acts as + # the blocking fence. The pattern's own trailing PAUSE is always mask=0 (free + # pass), so fence_idx advances immediately after the real work is done. 
+ # This separates "I am done" (trailing PAUSE, free) from "I may start" (idle + # gate, blocking), giving resume_i a single clean meaning: start your next job. + dw = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + first_written = False + for p_idx, pattern_config in enumerate(patterns): + if pattern_config.get('wait_for'): + # Synthetic idle+PAUSE gates this pattern + _idle = StimuliGenerator(IW, DATA_WIDTH, N_BANKS, TOT_MEM_SIZE, + dw, ADD_WIDTH, str(filepath), 0, master_global_idx) + _idle.N_TEST = 0 + _idle.idle_gen(next_start_id, append=first_written) + first_written = True + _generate_pattern( + filepath, + pattern_config, + is_hwpe=is_hwpe, + master_global_idx=master_global_idx, + master_local_idx=master_local_idx, + n_peers_of_kind=n_peers_of_kind, + append=first_written, + ) + first_written = True + global_idx = 0 # Generate LOG masters (CORE, DMA, EXT) in order @@ -511,56 +567,148 @@ def _generate_master( print("STEP 0 COMPLETED: generate stimuli files") # ----------------------------------------------------------------------- - # Compute WAIT_MASKS and emit phases.mk + # Compute FENCE_MASKS and emit fence_masks.mk + # + # Fence slot f corresponds to the PAUSE before pattern f in the stimulus + # file (i.e. between pattern f-1 and pattern f). The mask at slot f holds + # the set of drivers that must have passed fence f before this driver can + # resume from that PAUSE. + # + # For a master with N patterns, there are N fence slots (slot 0 = before + # pattern 0, slot f = before pattern f). The wait_for of pattern f defines + # the mask at fence slot f. + # + # Legacy flat masters (no 'patterns' key) are treated as single-pattern + # masters: one fence slot (slot 0) from the top-level wait_for field. 
# ----------------------------------------------------------------------- N_DRIVERS = N_LOG + N_HWPE - phase_to_drivers: dict[str, list[int]] = {} - driver_phases: list[str] = [] - - for i, m in enumerate(log_masters): - phase = str(m.get('phase', 'default')) - driver_phases.append(phase) - phase_to_drivers.setdefault(phase, []).append(i) - - for i, m in enumerate(hwpe_masters): - gidx = N_LOG + i - phase = str(m.get('phase', 'default')) - driver_phases.append(phase) - phase_to_drivers.setdefault(phase, []).append(gidx) - - wait_masks: list[int] = [0] * N_DRIVERS + def _patterns_of(master_config): + """Return the list of pattern configs for a master.""" + if 'patterns' in master_config: + return master_config['patterns'] + return [master_config] - for i, m in enumerate(log_masters): + # Build phase->driver map: every pattern of every driver registers its phase. + # This allows wait_for to reference any phase, not just first patterns. + # A phase may be associated with multiple drivers (e.g. 8 cores all in softmax_t0). + phase_to_drivers: dict[str, list[int]] = {} + all_masters = [(m, False) for m in log_masters] + [(m, True) for m in hwpe_masters] + for i, (m, _) in enumerate(all_masters): + for pat in _patterns_of(m): + phase = str(pat.get('phase', 'default')) + if i not in phase_to_drivers.get(phase, []): + phase_to_drivers.setdefault(phase, []).append(i) + + # Build phase->pattern_index map: for each phase, which pattern index within + # each driver corresponds to that phase. Used to compute FENCE_REQ_LEVELS. 
+ # phase_pattern_idx[phase][driver] = pattern index of that phase in that driver + phase_pattern_idx: dict[str, dict[int, int]] = {} + for i, (m, _) in enumerate(all_masters): + for p_idx, pat in enumerate(_patterns_of(m)): + phase = str(pat.get('phase', 'default')) + phase_pattern_idx.setdefault(phase, {})[i] = p_idx + + def _resolve_wait_mask(wait_for_list): mask = 0 - for dep_phase in m.get('wait_for', []): + for dep_phase in wait_for_list: for dep_drv in phase_to_drivers.get(str(dep_phase), []): mask |= (1 << dep_drv) - wait_masks[i] = mask - - for i, m in enumerate(hwpe_masters): - gidx = N_LOG + i - mask = 0 - for dep_phase in m.get('wait_for', []): + return mask + + # Precompute per-driver fence_idx value after finishing pattern p: + # = number of fences (synthetic idle gates + trailing PAUSEs) passed up to and + # including the trailing PAUSE of pattern p. + def _fence_idx_after_pattern(drv_idx, pat_idx): + pats = _patterns_of(all_masters[drv_idx][0]) + # Count synthetic idle gates for patterns 0..pat_idx (those with wait_for) + n_gates = sum(1 for k in range(pat_idx + 1) if pats[k].get('wait_for')) + # Plus trailing PAUSEs for patterns 0..pat_idx + n_trailing = pat_idx + 1 + return n_gates + n_trailing + + def _resolve_req_levels(wait_for_list): + """Required fence_idx[j] = fence_idx value of j after finishing pattern p_j.""" + levels = [0] * N_DRIVERS + for dep_phase in wait_for_list: for dep_drv in phase_to_drivers.get(str(dep_phase), []): - mask |= (1 << dep_drv) - wait_masks[gidx] = mask + p_idx = phase_pattern_idx.get(str(dep_phase), {}).get(dep_drv, 0) + levels[dep_drv] = _fence_idx_after_pattern(dep_drv, p_idx) + return levels + + # Build per-driver fence mask and req_level lists. + # Each pattern with wait_for gets a synthetic idle gate (mask = wait_for) before it. + # Trailing PAUSEs always have mask=0 (free pass — just advance fence_idx). 
+ # Fences are enumerated in file order: for each pattern p: + # if p has wait_for: synthetic idle fence (mask = wait_for of p) + # trailing PAUSE fence (mask = 0) + fence_masks: list[list[int]] = [] + req_levels: list[list[list[int]]] = [] + for i, (m, _) in enumerate(all_masters): + patterns = _patterns_of(m) + per_masks = [] + per_levels = [] + for pat in patterns: + if pat.get('wait_for'): + # Synthetic idle gate: blocking fence + per_masks.append(_resolve_wait_mask(pat['wait_for'])) + per_levels.append(_resolve_req_levels(pat['wait_for'])) + # Trailing PAUSE: free pass, just signals completion + per_masks.append(0) + per_levels.append([0] * N_DRIVERS) + fence_masks.append(per_masks) + req_levels.append(per_levels) + + max_fences = max((len(fm) for fm in fence_masks), default=1) + + # Pad to max_fences + for i in range(N_DRIVERS): + while len(fence_masks[i]) < max_fences: + fence_masks[i].append(0) + while len(req_levels[i]) < max_fences: + req_levels[i].append([0] * N_DRIVERS) + # Emit SV literals hex_width = max(1, (N_DRIVERS + 3) // 4) - mask_literals = [f"{N_DRIVERS}'h{wait_masks[i]:0{hex_width}x}" for i in range(N_DRIVERS)] - wait_masks_param = "'{" + ", ".join(mask_literals) + "}" + per_driver_literals = [] + for i in range(N_DRIVERS): + slot_literals = [f"{N_DRIVERS}'h{fence_masks[i][f]:0{hex_width}x}" for f in range(max_fences)] + per_driver_literals.append("'{" + ", ".join(slot_literals) + "}") + fence_masks_param = "'{" + ", ".join(per_driver_literals) + "}" + + # FENCE_REQ_LEVELS[N_DRIVERS][MAX_FENCES][N_DRIVERS] — int unsigned + # Pack FENCE_REQ_LEVELS as FENCE_REQ_LEVELS_PACKED[i][f] = N_DRIVERS*4-bit vector. + # Bits [j*4+3:j*4] = required fence_idx[j] (4 bits, supports 0..15). 
+ LEVEL_BITS = 4 + packed_width = N_DRIVERS * LEVEL_BITS + packed_hex_digits = (packed_width + 3) // 4 + req_driver_literals = [] + for i in range(N_DRIVERS): + fence_literals = [] + for f in range(max_fences): + val = 0 + for j in range(N_DRIVERS): + val |= (req_levels[i][f][j] & 0xF) << (j * LEVEL_BITS) + fence_literals.append(f"{packed_width}'h{val:0{packed_hex_digits}x}") + req_driver_literals.append("'{" + ", ".join(fence_literals) + "}") + fence_req_levels_packed_param = "'{" + ", ".join(req_driver_literals) + "}" if args.emit_phases_mk: phases_mk_path = Path(args.emit_phases_mk) phases_mk_path.parent.mkdir(parents=True, exist_ok=True) phases_mk_path.write_text( "# Auto-generated by main.py - DO NOT EDIT MANUALLY\n" - "# Per-driver dependency masks for tb_hci.sv (WAIT_MASKS_PARAM).\n" + "# Per-driver per-fence dependency data for tb_hci.sv.\n" f"# Drivers 0..{N_LOG-1} = log masters, {N_LOG}..{N_DRIVERS-1} = HWPE masters.\n" - f"WAIT_MASKS_PARAM := {wait_masks_param}\n", + f"# fence f = PAUSE after pattern f; fence_idx[i]==k means i completed k patterns.\n" + f"# FENCE_MASKS[i][f][j]=1: j is a dependency of i at fence f.\n" + f"# FENCE_REQ_LEVELS_PACKED[i][f]: packed {N_DRIVERS*4}-bit vector, bits [j*4+3:j*4] = min fence_idx[j].\n" + f"MAX_FENCES_PARAM := {max_fences}\n" + f"FENCE_MASKS_PARAM := {fence_masks_param}\n" + f"FENCE_REQ_LEVELS_PACKED_PARAM := {fence_req_levels_packed_param}\n", encoding='utf-8', ) - print(f"PHASES.MK written: {phases_mk_path}") + print(f"FENCE_MASKS.MK written: {phases_mk_path}") # ----------------------------------------------------------------------- # Build and emit memory map report @@ -586,11 +734,12 @@ def _generate_master( driver_names = [f"log_{d}" if d < N_LOG else f"hwpe_{d - N_LOG}" for d in drivers] lines.append(f" phase '{phase}': {', '.join(driver_names)}") for i in range(N_DRIVERS): - if wait_masks[i]: - name = f"log_{i}" if i < N_LOG else f"hwpe_{i - N_LOG}" - deps = [f"log_{j}" if j < N_LOG else f"hwpe_{j - N_LOG}" 
- for j in range(N_DRIVERS) if wait_masks[i] & (1 << j)] - lines.append(f" {name} waits for: {', '.join(deps)}") + name = f"log_{i}" if i < N_LOG else f"hwpe_{i - N_LOG}" + for f, mask in enumerate(fence_masks[i]): + if mask: + deps = [f"log_{j}" if j < N_LOG else f"hwpe_{j - N_LOG}" + for j in range(N_DRIVERS) if mask & (1 << j)] + lines.append(f" {name} after pattern[{f}] (fence {f}) waits for: {', '.join(deps)}") lines.append("=" * 72) report_text = "\n".join(lines) + "\n" diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index 35c66c4..6ed4787 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -16,7 +16,16 @@ /** * Application driver module - * Reads stimuli from file and drives transactions on HCI interface + * Reads stimuli from file and drives transactions on HCI interface. + * + * Stimulus file format (one line per cycle): + * req(1b) id(IWb) wen(1b) data(Nb) add(Ab) -- active transaction + * PAUSE -- fence synchronization point + * + * When a PAUSE token is encountered the driver drains all in-flight reads + * (waits in DRAIN_FOR_PAUSE), then enters PAUSED and holds fence_reached_o=1 + * until resume_i is asserted. This allows multi-phase execution on a single + * driver without resetting counters between phases. 
*/ module application_driver #( @@ -28,56 +37,75 @@ module application_driver #( ) ( input logic clk_i, input logic rst_ni, - input logic clear_i, // used to gate the driver or to reset it + input logic resume_i, // asserted by tb_hci when fence dependencies are met hci_core_intf.initiator hci_if, - output logic end_resp_o, + output logic fence_reached_o, // held HIGH while driver is paused at a fence + output logic end_resp_o, // held HIGH after all transactions and responses done output int unsigned n_issued_tr_o, output int unsigned n_issued_rd_tr_o, output int unsigned n_retired_rd_tr_o ); - int unsigned n_req_issued_q, n_req_issued_d; // total number of issued requests - int unsigned n_rd_req_issued_q, n_rd_req_issued_d; // total number of issued read requests - int unsigned n_rd_resp_retired_q, n_rd_resp_retired_d; // total number of retired read responses + int unsigned n_req_issued_q, n_req_issued_d; + int unsigned n_rd_req_issued_q, n_rd_req_issued_d; + int unsigned n_rd_resp_retired_q, n_rd_resp_retired_d; - // Transaction queue from file + // Transaction queue from file. is_pause=1 entries are fence tokens, not real transactions. typedef struct { - logic req; - logic [IW-1:0] id; - logic wen; + logic is_pause; + logic req; + logic [IW-1:0] id; + logic wen; logic [DATA_WIDTH-1:0] data; logic [ADDR_WIDTH-1:0] add; } transaction_t; transaction_t transactions[$]; - // Fill up the queue by reading the stimuli file until the end + // Fill up the queue by reading the stimuli file until the end. + // PAUSE lines are read as fence tokens with is_pause=1. 
initial begin string file_path; - int stim; - int scan_status; + int stim; + string line; if (STIM_FILE != "") begin file_path = STIM_FILE; end else begin $fatal("ERROR: Specify STIM_FILE path"); end - // Open file stim = $fopen(file_path, "r"); if (stim == 0) begin - $fatal("ERROR: Could not open stimuli file"); + $fatal("ERROR: Could not open stimuli file: %s", file_path); end - // Read every line while (!$feof(stim)) begin - transaction_t transaction; - scan_status = $fscanf(stim, "%b %b %b %b %b\n", transaction.req, transaction.id, transaction.wen, transaction.data, transaction.add); - if (scan_status != 5) begin - if (!$feof(stim)) begin - $fatal(1, "ERROR: malformed stimuli line in %s", file_path); + transaction_t t; + int scan_status; + void'($fgets(line, stim)); + // Strip trailing newline/CR for comparison + if (line.len() > 0 && (line[line.len()-1] == "\n" || line[line.len()-1] == "\r")) + line = line.substr(0, line.len()-2); + if (line.len() > 1 && line[line.len()-1] == "\r") + line = line.substr(0, line.len()-2); + if (line == "PAUSE") begin + t.is_pause = 1'b1; + t.req = 1'b0; + t.id = '0; + t.wen = 1'b0; + t.data = '0; + t.add = '0; + transactions.push_back(t); + end else if (line.len() > 0) begin + t.is_pause = 1'b0; + scan_status = $sscanf(line, "%b %b %b %b %b", + t.req, t.id, t.wen, t.data, t.add); + if (scan_status != 5) begin + if (!$feof(stim)) begin + $fatal(1, "ERROR: malformed stimuli line in %s: '%s'", file_path, line); + end + break; end - break; + transactions.push_back(t); end - // First-in, first-out queue - transactions.push_back(transaction); end $fclose(stim); end @@ -87,150 +115,155 @@ module application_driver #( ////////////////// typedef enum logic [2:0] { - REQ_RESET, REQ_IDLE, WAIT_GNT, REQ_DONE, + DRAIN_FOR_PAUSE, // drain in-flight reads before asserting fence_reached_o + PAUSED, // fence synchronization: hold fence_reached_o until resume_i RSP_DONE } req_state_t; - req_state_t req_state_q, req_state_d; - int unsigned 
tr_idx_q, tr_idx_d; // transaction ID - int unsigned last_op_issued_q, last_op_issued_d; // ID of the last issued operation (read or write) + req_state_t req_state_q, req_state_d; + int unsigned tr_idx_q, tr_idx_d; + int unsigned last_op_issued_q, last_op_issued_d; - assign n_issued_tr_o = n_req_issued_q; + assign n_issued_tr_o = n_req_issued_q; assign n_issued_rd_tr_o = n_rd_req_issued_q; - // Sequential logic always_ff @(posedge clk_i or negedge rst_ni) begin - if (!rst_ni || clear_i) begin - req_state_q <= REQ_RESET; - tr_idx_q <= '0; - n_req_issued_q <= '0; - n_rd_req_issued_q <= '0; - last_op_issued_q <= '0; + if (!rst_ni) begin + req_state_q <= REQ_IDLE; + tr_idx_q <= '0; + n_req_issued_q <= '0; + n_rd_req_issued_q <= '0; + last_op_issued_q <= '0; end else begin - req_state_q <= req_state_d; - tr_idx_q <= tr_idx_d; - n_req_issued_q <= n_req_issued_d; - n_rd_req_issued_q <= n_rd_req_issued_d; - last_op_issued_q <= last_op_issued_d; + req_state_q <= req_state_d; + tr_idx_q <= tr_idx_d; + n_req_issued_q <= n_req_issued_d; + n_rd_req_issued_q <= n_rd_req_issued_d; + last_op_issued_q <= last_op_issued_d; end end - // Combinational state logic always_comb begin - // Defaults (FSM) - req_state_d = req_state_q; - tr_idx_d = tr_idx_q; - n_req_issued_d = n_req_issued_q; + // FSM defaults + req_state_d = req_state_q; + tr_idx_d = tr_idx_q; + n_req_issued_d = n_req_issued_q; n_rd_req_issued_d = n_rd_req_issued_q; last_op_issued_d = last_op_issued_q; - // Defaults (HCI outputs) - hci_if.id = '0; - hci_if.add = '0; - hci_if.data = '0; - hci_if.req = 1'b0; - hci_if.wen = 1'b0; - hci_if.ecc = '0; - hci_if.ereq = '0; + // HCI output defaults + hci_if.id = '0; + hci_if.add = '0; + hci_if.data = '0; + hci_if.req = 1'b0; + hci_if.wen = 1'b0; + hci_if.ecc = '0; + hci_if.ereq = '0; hci_if.r_eready = '0; - hci_if.be = '1; - hci_if.r_ready = 1'b1; - hci_if.user = '0; - // Defaults (outputs) - end_resp_o = 1'b0; - // inputs: gnt, r_data, r_valid, r_user, r_id + hci_if.be = '1; + 
hci_if.r_ready = 1'b1; + hci_if.user = '0; + // Output defaults + fence_reached_o = 1'b0; + end_resp_o = 1'b0; case (req_state_q) - REQ_RESET: begin - if (!clear_i) begin - req_state_d = REQ_IDLE; - end - end REQ_IDLE: begin - // Check if there are still transactions to issue if (tr_idx_q < transactions.size()) begin - // If so, increase transaction index for next iteration - tr_idx_d = tr_idx_q + 1; - // If this transaction is a request - if (transactions[tr_idx_q].req) begin - hci_if.req = 1'b1; - hci_if.id = transactions[tr_idx_q].id; - hci_if.wen = transactions[tr_idx_q].wen; - hci_if.data = transactions[tr_idx_q].data; - hci_if.add = transactions[tr_idx_q].add; - // Update counters - n_req_issued_d = n_req_issued_q + 1; - if (transactions[tr_idx_q].wen) begin - n_rd_req_issued_d = n_rd_req_issued_q + 1; - end - last_op_issued_d = tr_idx_q; - // If granted already, stay in REQ_IDLE, otherwise go to WAIT_GNT - if (hci_if.gnt) begin - req_state_d = REQ_IDLE; + if (transactions[tr_idx_q].is_pause) begin + // Consume the PAUSE token and drain any in-flight reads before pausing + tr_idx_d = tr_idx_q + 1; + if (n_rd_req_issued_q > n_rd_resp_retired_q) begin + req_state_d = DRAIN_FOR_PAUSE; end else begin - req_state_d = WAIT_GNT; + req_state_d = PAUSED; + end + end else begin + tr_idx_d = tr_idx_q + 1; + if (transactions[tr_idx_q].req) begin + hci_if.req = 1'b1; + hci_if.id = transactions[tr_idx_q].id; + hci_if.wen = transactions[tr_idx_q].wen; + hci_if.data = transactions[tr_idx_q].data; + hci_if.add = transactions[tr_idx_q].add; + n_req_issued_d = n_req_issued_q + 1; + if (transactions[tr_idx_q].wen) begin + n_rd_req_issued_d = n_rd_req_issued_q + 1; + end + last_op_issued_d = tr_idx_q; + req_state_d = hci_if.gnt ? 
REQ_IDLE : WAIT_GNT; end end end else begin - // If no more transactions to issue + // No more transactions if (n_rd_req_issued_q > n_rd_resp_retired_q) begin - // If there are still read responses to retire, wait for them before finishing req_state_d = REQ_DONE; end else begin req_state_d = RSP_DONE; end end end + WAIT_GNT: begin - hci_if.req = 1'b1; - hci_if.id = transactions[last_op_issued_q].id; - hci_if.wen = transactions[last_op_issued_q].wen; + hci_if.req = 1'b1; + hci_if.id = transactions[last_op_issued_q].id; + hci_if.wen = transactions[last_op_issued_q].wen; hci_if.data = transactions[last_op_issued_q].data; - hci_if.add = transactions[last_op_issued_q].add; - // Check if there are still transactions to issue + hci_if.add = transactions[last_op_issued_q].add; if (tr_idx_q < transactions.size()) begin - // Check whether to stall or not transaction fetching, in case this is a idle transaction that can hide mem latency - if (!transactions[tr_idx_q].req) begin + // Advance over any idle (req=0, not-pause) entries to hide memory latency + if (!transactions[tr_idx_q].req && !transactions[tr_idx_q].is_pause) begin tr_idx_d = tr_idx_q + 1; - end else begin - tr_idx_d = tr_idx_q; - end - // If grant received, go back to REQ_IDLE and pop another transaction, otherwise stay in WAIT_GNT - if (hci_if.gnt) begin - req_state_d = REQ_IDLE; - end else begin - req_state_d = WAIT_GNT; end + req_state_d = hci_if.gnt ? 
REQ_IDLE : WAIT_GNT; end else begin - // If there are no more transactions, wait for last grant and then finish if (hci_if.gnt) begin if (transactions[last_op_issued_q].req && transactions[last_op_issued_q].wen) begin - // If the last transaction was a read, we need to wait for its response before finishing req_state_d = REQ_DONE; end else begin - // If the last transaction was a write, we can finish right away req_state_d = RSP_DONE; end - end else begin - req_state_d = WAIT_GNT; end end end + REQ_DONE: begin if (n_rd_resp_retired_q >= n_rd_req_issued_q) begin - // All read responses have been retired req_state_d = RSP_DONE; - end else begin - req_state_d = REQ_DONE; end end + + DRAIN_FOR_PAUSE: begin + // Wait for all in-flight reads to retire before asserting the fence + if (n_rd_resp_retired_q >= n_rd_req_issued_q) begin + req_state_d = PAUSED; + end + end + + PAUSED: begin + // Hold fence_reached_o HIGH until tb_hci asserts resume_i. + // If the next token is also a PAUSE (e.g. trailing free-pass followed by + // a blocking synthetic idle), consume it immediately and stay in PAUSED + // to avoid a spurious one-cycle REQ_IDLE bounce between consecutive fences. 
+ fence_reached_o = 1'b1; + if (resume_i) begin + if (tr_idx_q < transactions.size() && transactions[tr_idx_q].is_pause) begin + tr_idx_d = tr_idx_q + 1; + req_state_d = PAUSED; + end else begin + req_state_d = REQ_IDLE; + end + end + end + RSP_DONE: begin end_resp_o = 1'b1; end + default: begin - req_state_d = REQ_RESET; + req_state_d = REQ_IDLE; end endcase end @@ -239,75 +272,60 @@ module application_driver #( // Read response FSM // /////////////////////// - // We only consider read responses as write responses are not mandatory in HCI - typedef enum logic [1:0] { - RESP_RESET, RESP_IDLE, RESP_WAIT_RVALID } resp_state_t; resp_state_t resp_state_q, resp_state_d; - int unsigned n_rd_in_flight_q, n_rd_in_flight_d; // number of reads granted but not yet responded + int unsigned n_rd_in_flight_q, n_rd_in_flight_d; assign n_retired_rd_tr_o = n_rd_resp_retired_q; - // Sequential logic always_ff @(posedge clk_i or negedge rst_ni) begin - if (!rst_ni || clear_i) begin - resp_state_q <= RESP_IDLE; + if (!rst_ni) begin + resp_state_q <= RESP_IDLE; n_rd_resp_retired_q <= '0; - n_rd_in_flight_q <= '0; + n_rd_in_flight_q <= '0; end else begin - resp_state_q <= resp_state_d; + resp_state_q <= resp_state_d; n_rd_resp_retired_q <= n_rd_resp_retired_d; - n_rd_in_flight_q <= n_rd_in_flight_d; + n_rd_in_flight_q <= n_rd_in_flight_d; end end - // Combinational state logic always_comb begin - // Defaults (FSM) - resp_state_d = resp_state_q; + resp_state_d = resp_state_q; n_rd_resp_retired_d = n_rd_resp_retired_q; - n_rd_in_flight_d = n_rd_in_flight_q; + n_rd_in_flight_d = n_rd_in_flight_q; case (resp_state_q) - RESP_RESET: begin - if (!clear_i) begin - resp_state_d = RESP_IDLE; - end - end RESP_IDLE: begin - // If a read request is granted, increment in-flight counter and go to RESP_WAIT_RVALID if (hci_if.req && hci_if.wen && hci_if.gnt) begin n_rd_in_flight_d = n_rd_in_flight_q + 1; - resp_state_d = RESP_WAIT_RVALID; + resp_state_d = RESP_WAIT_RVALID; end end + RESP_WAIT_RVALID: 
begin - // Track newly granted reads while waiting if (hci_if.req && hci_if.wen && hci_if.gnt) begin n_rd_in_flight_d = n_rd_in_flight_q + 1; end - // When read response handshake happens, retire one response if (hci_if.r_valid && hci_if.r_ready) begin n_rd_resp_retired_d = n_rd_resp_retired_q + 1; - // Adjust in-flight: account for simultaneous grant (already added above) if (hci_if.req && hci_if.wen && hci_if.gnt) begin - // net in-flight = in_flight + 1 (new grant) - 1 (retired) = in_flight - n_rd_in_flight_d = n_rd_in_flight_q; // undo the +1 above, net unchanged + n_rd_in_flight_d = n_rd_in_flight_q; // +1 grant -1 retire = net 0 end else begin n_rd_in_flight_d = n_rd_in_flight_q - 1; end - // If no more in-flight reads (after this retirement), go back to RESP_IDLE if (n_rd_in_flight_q == 1 && !(hci_if.req && hci_if.wen && hci_if.gnt)) begin resp_state_d = RESP_IDLE; end end end + default: begin - resp_state_d = RESP_RESET; + resp_state_d = RESP_IDLE; end endcase end diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index d95ce3b..0f743e2 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -26,8 +26,10 @@ module tb_hci (); logic clk, rst_n; - logic [N_DRIVERS-1:0] s_end_resp; // end_resp_o from all drivers, [N_DRIVERS-1:0] ordering - logic [N_DRIVERS-1:0] s_clear_drv; // per-driver clear_i (held 1 until dependencies done) + logic [N_DRIVERS-1:0] s_end_resp; // end_resp_o from all drivers + logic [N_DRIVERS-1:0] s_fence_reached; // fence_reached_o from all drivers (level, HIGH while PAUSED) + logic [N_DRIVERS-1:0] s_resume; // resume_i to each driver (asserted when fence deps are met) + int unsigned fence_idx [N_DRIVERS]; // number of fences each driver has passed so far hci_interconnect_ctrl_t s_hci_ctrl; clk_rst_gen #( @@ -38,9 +40,9 @@ module tb_hci .rst_no(rst_n) ); - ///////// - // HCI // - ///////// + //////////////////// + // HCI interfaces // + //////////////////// localparam hci_size_parameter_t 
`HCI_SIZE_PARAM(cores) = '{ DW: DW_cores, @@ -164,6 +166,12 @@ module tb_hci .clk(clk) ); + /////////////////////////// + // Interface assignments // + /////////////////////////// + + /* Assignments of narrow initiators to LOG branch of HCI */ + generate for (genvar ii = 0; ii < N_CORE; ii++) begin : gen_core_to_narrow hci_core_assign i_core_to_narrow_assign ( @@ -185,7 +193,11 @@ module tb_hci .tcdm_initiator(hci_initiator_ext[ii]) ); end + endgenerate + /* Assignments of wide initiators to HCI (either LOG branch, HCI branch, or static MUX) */ + + generate if (INTERCO_TYPE == HCI) begin : gen_hwpe_hci for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_hci_assign hci_core_assign i_hwpe_hci_assign ( @@ -194,16 +206,36 @@ module tb_hci ); end end else if (INTERCO_TYPE == MUX) begin : gen_hwpe_mux - // In MUX mode, sel_i points at the lowest-indexed HWPE that has not yet finished - // (i.e. whose end_resp_o has not fired). Once HWPE k asserts end_resp_o, sel_i - // advances to k+1. We cannot use s_clear_drv here because HWPE 0 always has - // eff_mask='0 so its clear_drv is permanently 0 even after it finishes. + // Phase-ordered MUX arbitration: + // + // The mux is held by whichever HWPE is currently running (not paused, not done). + // In-flight reads are drained before any PAUSE (DRAIN_FOR_PAUSE state in the + // driver FSM), so sel_i is safe to switch as soon as fence_reached_o goes high. + // + // When no HWPE is running (all are either paused or done), the mux is granted + // to the lowest-indexed HWPE that is paused AND whose fence dependencies are + // satisfied (s_resume high). This serializes same-phase jobs by master ID and + // respects cross-phase data dependencies. logic [$clog2(N_HWPE > 1 ? 
N_HWPE-1 : 1):0] s_mux_sel; always_comb begin + automatic logic any_running; + any_running = 1'b0; + for (int i = 0; i < N_HWPE; i++) begin + if (!s_end_resp[N_LOG_MASTERS + i] && !s_fence_reached[N_LOG_MASTERS + i]) + any_running = 1'b1; + end s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(N_HWPE - 1); for (int i = N_HWPE-1; i >= 0; i--) begin - if (!s_end_resp[N_LOG_MASTERS + i]) - s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); + if (any_running) begin + // Active HWPE holds the mux; lowest index wins (descending loop) + if (!s_end_resp[N_LOG_MASTERS + i] && !s_fence_reached[N_LOG_MASTERS + i]) + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); + end else begin + // No HWPE running: grant to lowest-indexed paused+ready HWPE + if (!s_end_resp[N_LOG_MASTERS + i] && s_fence_reached[N_LOG_MASTERS + i] + && s_resume[N_LOG_MASTERS + i]) + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); + end end end hci_core_mux_static #( @@ -239,6 +271,10 @@ module tb_hci end endgenerate + ///////////////// + // Fence logic // + ///////////////// + logic s_clear; assign s_clear = 1'b0; @@ -247,31 +283,54 @@ module tb_hci assign s_hci_ctrl.priority_cnt_numerator = PRIORITY_CNT_NUMERATOR; assign s_hci_ctrl.priority_cnt_denominator = PRIORITY_CNT_DENOMINATOR; - // Driver clear logic: driver i is held in reset until all drivers j in its effective wait - // mask have asserted end_resp_o. If the effective mask is zero, the driver starts immediately. + // fence_idx[i]: number of fences driver i has fully passed (exited PAUSED state). + // Increments when the handshake fires: resume_i asserted while fence_reached_o is high. + // This is the same cycle the driver transitions out of PAUSED, so dependents see the + // updated count immediately on the next cycle (after the registered flip-flop updates). 
+ always_ff @(posedge clk or negedge rst_n) begin + if (!rst_n) begin + for (int i = 0; i < N_DRIVERS; i++) fence_idx[i] <= '0; + end else begin + for (int i = 0; i < N_DRIVERS; i++) begin + if (s_resume[i] && s_fence_reached[i]) + fence_idx[i] <= fence_idx[i] + 1; + end + end + end + + // s_resume[i]: fire when for every set bit j in FENCE_MASKS[i][fence_idx[i]], + // fence_idx[j] >= FENCE_REQ_LEVELS[i][fence_idx[i]][j]. // - // In MUX mode the user-defined WAIT_MASKS are ignored for HWPE drivers: instead a strict - // sequential chain is enforced (HWPE i waits for HWPE i-1), because hci_core_mux_static - // only forwards one HWPE at a time. HWPE ordering follows the index = position in - // hwpe_masters[] in workload.json. LOG master masks are always taken from WAIT_MASKS. + // fence_idx[j] >= p_j+1 means j has exited the trailing PAUSE after pattern p_j, + // i.e. j has completed pattern p_j. No s_fence_reached shortcut needed — the + // trailing PAUSE of a zero-mask fence is exited in one cycle, so fence_idx + // advances promptly and the check is unambiguous. // - // All drivers use end_resp_o (s_end_resp) as the handoff condition, guaranteeing that all - // in-flight reads from the predecessor have been fully retired before the successor starts. - // This is required for correctness in MUX mode (hci_core_mux_static gates r_valid to the - // non-selected channel), and is conservatively safe for all other modes. - generate - for (genvar ii = 0; ii < N_DRIVERS; ii++) begin : gen_driver_clear - logic [N_DRIVERS-1:0] eff_mask; - if (INTERCO_TYPE == MUX && ii >= N_LOG_MASTERS) begin : gen_mux_mask - // HWPE 0 (ii == N_LOG_MASTERS): no wait; HWPE k waits for HWPE k-1's end_resp_o - assign eff_mask = (ii == N_LOG_MASTERS) ? '0 : (N_DRIVERS'(1) << (ii - 1)); - end else begin : gen_default_mask - assign eff_mask = WAIT_MASKS[ii]; + // In MUX mode bus serialization is handled by s_mux_sel independently. 
+ always_comb begin + for (int i = 0; i < N_DRIVERS; i++) begin + automatic logic [N_DRIVERS-1:0] cur_mask; + automatic logic all_satisfied; + cur_mask = (fence_idx[i] < MAX_FENCES) ? FENCE_MASKS[i][fence_idx[i]] : '0; + all_satisfied = 1'b1; + for (int j = 0; j < N_DRIVERS; j++) begin + if (cur_mask[j]) begin + automatic logic [3:0] req; + req = (fence_idx[i] < MAX_FENCES) ? + FENCE_REQ_LEVELS_PACKED[i][fence_idx[i]][j*4+3 -: 4] : 4'h0; + if (fence_idx[j] < req) + all_satisfied = 1'b0; + end end - assign s_clear_drv[ii] = (eff_mask != '0) && - ((s_end_resp & eff_mask) != eff_mask); + // Only assert resume_i while the driver is actually in PAUSED state. + // Gating with fence_reached_o makes the signal a clean pulse. + s_resume[i] = all_satisfied && s_fence_reached[i]; end - endgenerate + end + + ///////// + // HCI // + ///////// hci_interconnect #( .N_HWPE(N_WIDE_HCI), @@ -341,8 +400,9 @@ module tb_hci ) i_app_driver_log ( .clk_i(clk), .rst_ni(rst_n), - .clear_i(s_clear_drv[ii]), + .resume_i(s_resume[ii]), .hci_if(hci_driver_log_if[ii]), + .fence_reached_o(s_fence_reached[ii]), .end_resp_o(s_end_resp[ii]), .n_issued_tr_o(s_issued_transactions[ii]), .n_issued_rd_tr_o(s_issued_read_transactions[ii]), @@ -364,8 +424,9 @@ module tb_hci ) i_app_driver_hwpe ( .clk_i(clk), .rst_ni(rst_n), - .clear_i(s_clear_drv[N_LOG_MASTERS + ii]), + .resume_i(s_resume[N_LOG_MASTERS + ii]), .hci_if(hci_driver_hwpe_if[ii]), + .fence_reached_o(s_fence_reached[N_LOG_MASTERS + ii]), .end_resp_o(s_end_resp[N_LOG_MASTERS + ii]), .n_issued_tr_o(s_issued_transactions[N_LOG_MASTERS + ii]), .n_issued_rd_tr_o(s_issued_read_transactions[N_LOG_MASTERS + ii]), @@ -432,6 +493,10 @@ module tb_hci .n_read_complete_hwpe_o(N_READ_COMPLETE_TRANSACTIONS_HWPE) ); + /////////////// + // Reporting // + /////////////// + simulation_report i_simulation_report ( .end_resp_i(s_end_resp), .throughput_complete_i(throughput_completed), @@ -449,6 +514,10 @@ module tb_hci 
.n_read_complete_transactions_hwpe_i(N_READ_COMPLETE_TRANSACTIONS_HWPE) ); + //////////////// + // Assertions // + //////////////// + localparam int unsigned MAX_BANK_LOCAL_ADDR = TOT_MEM_SIZE * 1024 / N_BANKS - WIDTH_OF_MEMORY_BYTE; diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index 47c5d7b..dc77085 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -77,12 +77,23 @@ package tb_hci_pkg; localparam int unsigned N_LOG_MASTERS = N_CORE + N_DMA + N_EXT; localparam int unsigned N_DRIVERS = N_LOG_MASTERS + N_HWPE; - // Per-driver dependency masks: WAIT_MASKS[i][j]=1 means driver i must wait for driver j's - // end_resp_o before starting. Generated by main.py, passed via WAIT_MASKS_PARAM define. - // NOTE: in MUX mode these masks are ignored for HWPE drivers and replaced by a sequential - // chain in tb_hci.sv. - localparam logic [N_DRIVERS-1:0] WAIT_MASKS [N_DRIVERS] = - `ifdef WAIT_MASKS_PARAM `WAIT_MASKS_PARAM `else '{default: '0} `endif; + // Fence parameters. Every pattern ends with a PAUSE; fence f = PAUSE after pattern f. + // fence_idx[i] == k means driver i has completed k patterns. + // + // FENCE_MASKS[i][f][j]=1: driver j is a dependency for driver i at fence f. + // FENCE_REQ_LEVELS[i][f][j]: minimum fence_idx[j] required before driver i can pass + // fence f. Equals the pattern index of the referenced phase within driver j plus 1 + // (i.e., j must have completed that specific pattern). + // Both are generated by main.py and passed via defines. + localparam int unsigned MAX_FENCES = + `ifdef MAX_FENCES_PARAM `MAX_FENCES_PARAM `else 1 `endif; + localparam logic [N_DRIVERS-1:0] FENCE_MASKS [N_DRIVERS][MAX_FENCES] = + `ifdef FENCE_MASKS_PARAM `FENCE_MASKS_PARAM `else '{default: '{default: '0}} `endif; + // FENCE_REQ_LEVELS_PACKED[i][f] is a packed vector of N_DRIVERS × 4 bits. + // Bits [j*4+3:j*4] hold the required fence_idx[j] before driver i can pass fence f. 
+ // 4 bits supports up to 15 (sufficient for up to 7 patterns × 2 fences/pattern). + localparam logic [N_DRIVERS*4-1:0] FENCE_REQ_LEVELS_PACKED [N_DRIVERS][MAX_FENCES] = + `ifdef FENCE_REQ_LEVELS_PARAM `FENCE_REQ_LEVELS_PARAM `else '{default: '{default: '0}} `endif; // If fully log interconnect is used, instantiate HWPE_WIDTH_FACT narrow ports for each HWPE. localparam int unsigned N_NARROW_HCI = diff --git a/target/verif/verif.mk b/target/verif/verif.mk index b94a3b4..93d8282 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -14,7 +14,7 @@ HCI_VERIF_CFG_GEN_DIR = $(HCI_VERIF_CFG_DIR)/generated include $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk include $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk ifeq (,$(filter clean%,$(MAKECMDGOALS))) --include $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk +-include $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk endif # Bender targets and defines @@ -50,7 +50,7 @@ $(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(VERIF_CFG_MK) $(STIM_SRC_ --workload_config $(WORKLOAD_JSON) \ --testbench_config $(TESTBENCH_JSON) \ --hardware_config $(HARDWARE_JSON) \ - --emit_phases_mk $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk + --emit_phases_mk $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk date > $@ .PHONY: clean-stim-verif @@ -87,7 +87,7 @@ $(HCI_VERIF_CFG_GEN_DIR): .PHONY: clean-config-verif clean-config-verif: - rm -f $(VERIF_CFG_MK) $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk + rm -f $(VERIF_CFG_MK) $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk ############## # Simulation # @@ -116,13 +116,15 @@ ifeq ($(GUI),0) SIM_HCI_VSIM_ARGS += -c endif -WAIT_MASKS_MK := $(HCI_VERIF_CFG_GEN_DIR)/wait_masks.mk -WAIT_MASKS_PARAM = $(shell grep '^WAIT_MASKS_PARAM' $(WAIT_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) +FENCE_MASKS_MK := $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk +MAX_FENCES_PARAM = $(shell grep '^MAX_FENCES_PARAM' $(FENCE_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) +FENCE_MASKS_PARAM = $(shell grep '^FENCE_MASKS_PARAM' $(FENCE_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) 
+FENCE_REQ_LEVELS_PARAM = $(shell grep '^FENCE_REQ_LEVELS_PACKED_PARAM' $(FENCE_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) $(HCI_VERIF_DIR)/vsim/compile.tcl: $(HCI_ROOT)/Bender.lock $(HCI_ROOT)/Bender.yml $(HCI_ROOT)/bender.mk $(HCI_VERIF_DIR)/bender.mk $(SIM_SRC_FILES) $(VERIF_CFG_MK) $(SIMVECTORS_GEN_DIR)/.stim_stamp mkdir -p $(HCI_VERIF_DIR)/vsim $(BENDER) script vsim $(COMMON_DEFS) $(VERIF_DEFS) $(COMMON_TARGS) $(VERIF_TARGS) \ - --vlog-arg="$(SIM_HCI_VLOG_ARGS) \"+define+WAIT_MASKS_PARAM=$(WAIT_MASKS_PARAM)\"" > $@ + --vlog-arg="$(SIM_HCI_VLOG_ARGS) \"+define+MAX_FENCES_PARAM=$(MAX_FENCES_PARAM) +define+FENCE_MASKS_PARAM=$(FENCE_MASKS_PARAM) +define+FENCE_REQ_LEVELS_PARAM=$(FENCE_REQ_LEVELS_PARAM)\"" > $@ .PHONY: compile-verif compile-verif: $(sim_vsim_lib)/.hw_compiled diff --git a/target/verif/vsim/tb_hci.tcl b/target/verif/vsim/tb_hci.tcl index 72d5007..e0de90a 100644 --- a/target/verif/vsim/tb_hci.tcl +++ b/target/verif/vsim/tb_hci.tcl @@ -10,72 +10,146 @@ if {$GUI == 1} { set N_BANKS [examine -radix dec /tb_hci_pkg/N_BANKS] set N_LOG_MASTERS [examine -radix dec /tb_hci_pkg/N_LOG_MASTERS] + set N_DRIVERS [examine -radix dec /tb_hci_pkg/N_DRIVERS] set N_NARROW_HCI [examine -radix dec /tb_hci_pkg/N_NARROW_HCI] set N_WIDE_HCI [examine -radix dec /tb_hci_pkg/N_WIDE_HCI] set HWPE_WIDTH_FACT [examine -radix dec /tb_hci_pkg/HWPE_WIDTH_FACT] set INTERCO_TYPE [examine /tb_hci_pkg/INTERCO_TYPE] + set MAX_FENCES [examine -radix dec /tb_hci_pkg/MAX_FENCES] add wave -noupdate /tb_hci/clk add wave -noupdate /tb_hci/rst_n + add wave -noupdate -divider Interfaces + # ------------------------------------------------------------------------- # Application-driver interfaces - add wave -noupdate -group application_drivers -divider log_masters + # ------------------------------------------------------------------------- + add wave -noupdate -group driver_side -divider narrow_masters for {set i 0} {$i < $N_LOG_MASTERS} {incr i} { - add wave -noupdate -group application_drivers 
-group log_$i /tb_hci/hci_driver_log_if[$i]/* + add wave -noupdate -group driver_side -group log_$i /tb_hci/hci_driver_log_if[$i]/* } - add wave -noupdate -group application_drivers -divider hwpe_masters + add wave -noupdate -group driver_side -divider hwpe_masters for {set i 0} {$i < $N_HWPE} {incr i} { - add wave -noupdate -group application_drivers -group hwpe_$i /tb_hci/hci_driver_hwpe_if[$i]/* + add wave -noupdate -group driver_side -group hwpe_$i /tb_hci/hci_driver_hwpe_if[$i]/* } + # ------------------------------------------------------------------------- # Interconnect-side interfaces - add wave -noupdate -group hci_interfaces -divider narrow_cores + # ------------------------------------------------------------------------- + add wave -noupdate -group hci_initiator_side -divider narrow_cores for {set i 0} {$i < $N_CORE} {incr i} { - add wave -noupdate -group hci_interfaces -group core_$i /tb_hci/hci_initiator_narrow[$i]/* + add wave -noupdate -group hci_initiator_side -group core_$i /tb_hci/hci_initiator_narrow[$i]/* } if {[string first "LOG" $INTERCO_TYPE] != -1} { - add wave -noupdate -group hci_interfaces -divider narrow_hwpe_split + add wave -noupdate -group hci_initiator_side -divider narrow_hwpe_split for {set i 0} {$i < $N_HWPE} {incr i} { for {set f 0} {$f < $HWPE_WIDTH_FACT} {incr f} { set idx [expr {$N_CORE + $i * $HWPE_WIDTH_FACT + $f}] if {$idx < $N_NARROW_HCI} { - add wave -noupdate -group hci_interfaces -group hwpe_$i -group lane_$f /tb_hci/hci_initiator_narrow[$idx]/* + add wave -noupdate -group hci_initiator_side -group hwpe_$i -group lane_$f /tb_hci/hci_initiator_narrow[$idx]/* } } } } if {$N_WIDE_HCI > 0} { - add wave -noupdate -group hci_interfaces -divider wide_hwpe + add wave -noupdate -group hci_initiator_side -divider wide_hwpe for {set i 0} {$i < $N_WIDE_HCI} {incr i} { - add wave -noupdate -group hci_interfaces -group wide_$i /tb_hci/hci_initiator_wide[$i]/* + add wave -noupdate -group hci_initiator_side -group wide_$i 
/tb_hci/hci_initiator_wide[$i]/* } } - add wave -noupdate -group hci_interfaces -divider dma + add wave -noupdate -group hci_initiator_side -divider dma for {set i 0} {$i < $N_DMA} {incr i} { - add wave -noupdate -group hci_interfaces -group dma_$i /tb_hci/hci_initiator_dma[$i]/* + add wave -noupdate -group hci_initiator_side -group dma_$i /tb_hci/hci_initiator_dma[$i]/* } - add wave -noupdate -group hci_interfaces -divider ext + add wave -noupdate -group hci_initiator_side -divider ext for {set i 0} {$i < $N_EXT} {incr i} { - add wave -noupdate -group hci_interfaces -group ext_$i /tb_hci/hci_initiator_ext[$i]/* + add wave -noupdate -group hci_initiator_side -group ext_$i /tb_hci/hci_initiator_ext[$i]/* } + # ------------------------------------------------------------------------- # Memory slaves - add wave -noupdate -group memory_slaves + # ------------------------------------------------------------------------- + add wave -noupdate -group memory_targets for {set i 0} {$i < $N_BANKS} {incr i} { - add wave -noupdate -group memory_slaves -group bank_$i /tb_hci/hci_target_mems[$i]/* + add wave -noupdate -group memory_targets -group bank_$i /tb_hci/hci_target_mems[$i]/* } + add wave -noupdate -divider "Application drivers" + # ------------------------------------------------------------------------- + # Per-driver driver internals (req/resp FSM states, counters) + # ------------------------------------------------------------------------- + add wave -noupdate -group driver_internals -divider log_masters + for {set i 0} {$i < $N_LOG_MASTERS} {incr i} { + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/req_state_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/resp_state_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/tr_idx_q + add wave -noupdate -group driver_internals -group 
log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/n_req_issued_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/n_rd_req_issued_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/n_rd_resp_retired_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/fence_reached_o + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/end_resp_o + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/resume_i + } + + add wave -noupdate -group driver_internals -divider hwpe_masters + for {set i 0} {$i < $N_HWPE} {incr i} { + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/req_state_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/resp_state_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/tr_idx_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/n_req_issued_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/n_rd_req_issued_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/n_rd_resp_retired_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/fence_reached_o + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/end_resp_o + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/resume_i + } + + add wave -noupdate 
-divider Testbench + # ------------------------------------------------------------------------- + # Fence / synchronization signals + # ------------------------------------------------------------------------- + add wave -noupdate -group fence_sync /tb_hci/s_end_resp + add wave -noupdate -group fence_sync /tb_hci/s_fence_reached + add wave -noupdate -group fence_sync /tb_hci/s_resume + + for {set i 0} {$i < $N_DRIVERS} {incr i} { + add wave -noupdate -group fence_sync -group fence_idx /tb_hci/fence_idx[$i] + } + + # MUX sel (only present when INTERCO_TYPE == MUX) + if {[string first "MUX" $INTERCO_TYPE] != -1} { + add wave -noupdate -group fence_sync /tb_hci/gen_hwpe_mux/s_mux_sel + } + + + # ------------------------------------------------------------------------- # Metrics - add wave -noupdate -group metrics /tb_hci/s_end_req - add wave -noupdate -group metrics /tb_hci/s_end_resp + # ------------------------------------------------------------------------- add wave -noupdate -group metrics /tb_hci/s_issued_transactions add wave -noupdate -group metrics /tb_hci/s_issued_read_transactions - add wave -noupdate -group metrics /tb_hci/stim_latency add wave -noupdate -group metrics /tb_hci/tot_latency add wave -noupdate -group metrics /tb_hci/latency_per_master add wave -noupdate -group metrics /tb_hci/throughput_completed @@ -90,8 +164,10 @@ if {$GUI == 1} { add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE + # ------------------------------------------------------------------------- + # HCI control + # ------------------------------------------------------------------------- add wave -noupdate /tb_hci/s_clear - add wave -noupdate /tb_hci/s_clear_drv add wave -noupdate /tb_hci/s_hci_ctrl configure wave -signalnamewidth 1 From 414bfb5dddefb77e90fd92ef733da35bdbe9fed6 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Mon, 9 Mar 2026 20:31:19 +0100 Subject: [PATCH 19/25] ci: Adapt 
tests to new hw config in workloads --- regr/hardware/hci/hardware.json | 6 +++--- regr/hardware/log/hardware.json | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/regr/hardware/hci/hardware.json b/regr/hardware/hci/hardware.json index c8db224..b349a65 100644 --- a/regr/hardware/hci/hardware.json +++ b/regr/hardware/hci/hardware.json @@ -1,14 +1,14 @@ { "description": "Hardware configuration parameters for HCI interconnect", "parameters": { - "N_HWPE": 3, + "N_HWPE": 2, "HWPE_WIDTH_FACT": 8, "N_CORE": 8, "N_DMA": 0, "N_EXT": 1, "DATA_WIDTH": 32, - "TOT_MEM_SIZE": 128, - "N_BANKS": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, "INTERCO_TYPE": "HCI", "TS_BIT": 21, "EXPFIFO": 0, diff --git a/regr/hardware/log/hardware.json b/regr/hardware/log/hardware.json index 6a6a3bd..a505fa1 100644 --- a/regr/hardware/log/hardware.json +++ b/regr/hardware/log/hardware.json @@ -1,14 +1,14 @@ { "description": "Hardware configuration parameters for HCI interconnect", "parameters": { - "N_HWPE": 3, + "N_HWPE": 2, "HWPE_WIDTH_FACT": 8, "N_CORE": 8, "N_DMA": 0, "N_EXT": 1, "DATA_WIDTH": 32, - "TOT_MEM_SIZE": 128, - "N_BANKS": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, "INTERCO_TYPE": "LOG", "TS_BIT": 21, "EXPFIFO": 0, From afa2bfcb8b877aa2f436efcad23c8de0f99b7d4e Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Tue, 10 Mar 2026 16:10:41 +0100 Subject: [PATCH 20/25] verif: Generate HTML dataflow from stimvectors --- target/verif/config/hardware.json | 2 +- target/verif/config/workload.json | 574 +++-------- target/verif/config/workload.schema.json | 34 +- .../verif/simvectors/hci_stimuli/patterns.py | 3 + target/verif/simvectors/main.py | 945 ++++++++++++++++-- 5 files changed, 1038 insertions(+), 520 deletions(-) diff --git a/target/verif/config/hardware.json b/target/verif/config/hardware.json index b349a65..84c47a1 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -9,7 +9,7 @@ "DATA_WIDTH": 32, "TOT_MEM_SIZE": 256, 
"N_BANKS": 64, - "INTERCO_TYPE": "HCI", + "INTERCO_TYPE": "MUX", "TS_BIT": 21, "EXPFIFO": 0, "SEL_LIC": 0 diff --git a/target/verif/config/workload.json b/target/verif/config/workload.json index 6c0d227..683a709 100644 --- a/target/verif/config/workload.json +++ b/target/verif/config/workload.json @@ -1,506 +1,178 @@ { - "description": "MobileViT-inspired materialized attention kernel. d_model=192, num_heads=4, d_head=48, N=256 tokens, B=128 (2 tiles). One head at a time. Static resident: K^T_head (0x00000, 16kB), V_head (0x04000, 16kB). Ping-pong workspaces: Buf0={Q0 (0x08000,8kB), A0/P0 (0x0A000,64kB), O0 (0x1A000,8kB)}, Buf1={Q1 (0x1C000,8kB), A1/P1 (0x1E000,64kB), O1 (0x2E000,8kB)}. Pipeline: DMA preloads all tensors (dma_setup); GEMM computes Q*K^T->A (gemm_qk_t0); 8 cores softmax A->P in-place (softmax_t0); GEMM computes P*V->O (gemm_pv_t0); 8 cores norm O in-place (norm_t0); repeat for tile 1. Single GEMM engine serializes all 4 GEMM ops.", - + "description": "Simple 4-tile double-buffer GEMM. 
DMA uses linear transfers sized only by region_size_bytes.", "log_masters": [ - { - "id": 0, - "description": "Core 0", - "patterns": [ - { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", - "region_size_bytes": 8192 - }, - { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", - "region_size_bytes": 8192 - } - ] - }, + { "id": 0, "description": "Core 0 (idle)", "mem_access_type": "idle" }, + { "id": 1, "description": "Core 1 (idle)", "mem_access_type": "idle" }, + { "id": 2, "description": "Core 2 (idle)", "mem_access_type": "idle" }, + { "id": 3, "description": "Core 3 (idle)", "mem_access_type": "idle" }, + { "id": 4, "description": "Core 4 (idle)", "mem_access_type": "idle" }, + { "id": 5, "description": "Core 5 (idle)", "mem_access_type": "idle" }, + { "id": 6, "description": "Core 6 (idle)", "mem_access_type": "idle" }, + { "id": 7, "description": "Core 7 (idle)", "mem_access_type": "idle" }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ { 
"id": 1, - "description": "Core 1", - "patterns": [ - { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", - "region_size_bytes": 8192 - }, - { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", - "region_size_bytes": 8192 - } - ] - }, - { - "id": 2, - "description": "Core 2", - "patterns": [ - { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", - "region_size_bytes": 8192 - }, - { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": 
["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", - "region_size_bytes": 8192 - } - ] - }, - { - "id": 3, - "description": "Core 3", + "description": "DMA engine (linear setup + ping/pong prefetch)", "patterns": [ { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", - "region_size_bytes": 8192 - }, - { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", "region_size_bytes": 8192 - } - ] - }, - { - "id": 4, - "description": "Core 4", - "patterns": [ - { - 
"description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 }, { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", "region_size_bytes": 8192 }, { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", + "description": "Setup preload: tile A1 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A1_buf1", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", "region_size_bytes": 8192 - } - ] - }, - { - "id": 5, - "description": "Core 5", - "patterns": [ - { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 }, { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": 
"random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", + "description": "Prefetch tile A2 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A2_buf0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", "region_size_bytes": 8192 }, { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", + "description": "Read back C0 after GEMM tile A0", + "mem_access_type": "linear", + "job": "dma_store_C0_buf2", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", "region_size_bytes": 8192 - } - ] - }, - { - "id": 6, - "description": "Core 6", - "patterns": [ - { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 }, { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", + "description": "Prefetch tile A3 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A3_buf1", + "wait_for_jobs": ["gemm_A1"], 
+ "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", "region_size_bytes": 8192 }, { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", + "description": "Read back C1 after GEMM tile A1", + "mem_access_type": "linear", + "job": "dma_store_C1_buf3", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", "region_size_bytes": 8192 - } - ] - }, - { - "id": 7, - "description": "Core 7", - "patterns": [ - { - "description": "softmax tile 0: read+write A0/P0 in-place", - "mem_access_type": "random", - "phase": "softmax_t0", - "wait_for": ["gemm_qk_t0"], - "n_transactions": 2048, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x0A000", - "region_size_bytes": 65536 }, { - "description": "norm tile 0: read+write O0 in-place", - "mem_access_type": "random", - "phase": "norm_t0", - "wait_for": ["gemm_pv_t0"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x1A000", + "description": "Read back C0 after GEMM tile A2", + "mem_access_type": "linear", + "job": "dma_store_C2_buf2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", "region_size_bytes": 8192 }, { - "description": "softmax tile 1: read+write A1/P1 in-place", - "mem_access_type": "random", - "phase": "softmax_t1", - "wait_for": ["gemm_qk_t1"], - "n_transactions": 2048, - "traffic_pct": 75, - 
"traffic_read_pct": 50, - "region_base_address": "0x1E000", - "region_size_bytes": 65536 - }, - { - "description": "norm tile 1: read+write O1 in-place", - "mem_access_type": "random", - "phase": "norm_t1", - "wait_for": ["gemm_pv_t1"], - "n_transactions": 384, - "traffic_pct": 75, - "traffic_read_pct": 50, - "region_base_address": "0x2E000", + "description": "Read back C1 after GEMM tile A3", + "mem_access_type": "linear", + "job": "dma_store_C3_buf3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", "region_size_bytes": 8192 } ] }, - { - "id": 8, - "description": "External 0 (idle)", - "mem_access_type": "idle" - } - ], - - "hwpe_masters": [ { "id": 0, - "description": "DMA: preload K^T_head, V_head, Q0, Q1 (all writes, one-shot)", + "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", "patterns": [ { - "description": "Preload K^T_head (0x00000, 12kB active / 16kB padded), V_head (0x04000), Q0 (0x08000), Q1 (0x1C000)", - "mem_access_type": "random", - "phase": "dma_setup", - "n_transactions": 9216, - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x00000", - "region_size_bytes": 122880 - } - ] - }, - { - "id": 1, - "description": "GEMM engine: 4 sequential ops (QK_t0, PV_t0, QK_t1, PV_t1)", - "patterns": [ - { - "description": "QK tile 0: read Q0 [128x48 int8], write A0 [128x256 int16]", + "description": "GEMM tile A0: A0 x B -> C0", "mem_access_type": "matmul_phased", - "phase": "gemm_qk_t0", - "wait_for": ["dma_setup"], - "matrix_m": 128, - "matrix_n": 256, - "matrix_k": 48, - "region_base_address_a": "0x08000", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", "region_size_bytes_a": 8192, - "matmul_ratio_a": 1, "region_base_address_b": "0x00000", - "region_size_bytes_b": 16384, - "matmul_ratio_b": 0, - 
"region_base_address_c": "0x0A000", - "region_size_bytes_c": 65536, - "matmul_ratio_c": 4 + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 }, { - "description": "PV tile 0: read P0 [128x256 int16], write O0 [128x48 int8]", + "description": "GEMM tile A1: A1 x B -> C1", "mem_access_type": "matmul_phased", - "phase": "gemm_pv_t0", - "wait_for": ["softmax_t0"], - "matrix_m": 128, - "matrix_n": 48, - "matrix_k": 256, - "region_base_address_a": "0x0A000", - "region_size_bytes_a": 65536, - "matmul_ratio_a": 4, - "region_base_address_b": "0x04000", - "region_size_bytes_b": 16384, - "matmul_ratio_b": 0, - "region_base_address_c": "0x1A000", - "region_size_bytes_c": 8192, - "matmul_ratio_c": 1 + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 }, { - "description": "QK tile 1: read Q1 [128x48 int8], write A1 [128x256 int16]", + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", "mem_access_type": "matmul_phased", - "phase": "gemm_qk_t1", - "wait_for": ["norm_t0"], - "matrix_m": 128, - "matrix_n": 256, - "matrix_k": 48, - "region_base_address_a": "0x1C000", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", "region_size_bytes_a": 8192, - "matmul_ratio_a": 1, "region_base_address_b": "0x00000", - "region_size_bytes_b": 16384, - "matmul_ratio_b": 0, - "region_base_address_c": "0x1E000", - "region_size_bytes_c": 65536, - "matmul_ratio_c": 4 + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 }, { - 
"description": "PV tile 1: read P1 [128x256 int16], write O1 [128x48 int8]", + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", "mem_access_type": "matmul_phased", - "phase": "gemm_pv_t1", - "wait_for": ["softmax_t1"], - "matrix_m": 128, - "matrix_n": 48, - "matrix_k": 256, - "region_base_address_a": "0x1E000", - "region_size_bytes_a": 65536, - "matmul_ratio_a": 4, - "region_base_address_b": "0x04000", - "region_size_bytes_b": 16384, - "matmul_ratio_b": 0, - "region_base_address_c": "0x2E000", - "region_size_bytes_c": 8192, - "matmul_ratio_c": 1 + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 } ] } diff --git a/target/verif/config/workload.schema.json b/target/verif/config/workload.schema.json index 2140e37..1037a0f 100644 --- a/target/verif/config/workload.schema.json +++ b/target/verif/config/workload.schema.json @@ -41,22 +41,22 @@ "enum": ["idle", "random", "linear", "2d", "3d", "matmul_phased", "matmul"], "description": "Access pattern selector. 'matmul' is an alias for 'matmul_phased'." }, - "phase": { + "job": { "type": "string", "default": "default", - "description": "Phase name for this pattern segment. Used by other patterns' wait_for to reference this segment. The master's phase identity is determined by its first pattern's phase." + "description": "Job name for this pattern segment. Used by other patterns' wait_for_jobs to reference this segment." }, - "wait_for": { + "wait_for_jobs": { "type": "array", "items": { "type": "string" }, "default": [], - "description": "Phase names this pattern waits for before starting. 
Generates a PAUSE fence before this pattern segment and holds the driver until all referenced phases have advanced past the same fence level." + "description": "Job names this pattern waits for before starting. Generates a PAUSE fence before this pattern segment and holds the driver until all referenced jobs have advanced past the same fence level." }, "n_transactions": { "type": "integer", "minimum": 0, - "description": "Number of real memory accesses (idles are NOT counted). Mandatory for 'random'. For other patterns can be derived from geometry fields." + "description": "Number of real memory accesses (idles are NOT counted). Mandatory for 'random'. For other patterns can be derived from geometry or from region_size_bytes (transaction-width based)." }, "region_base_address": { @@ -64,14 +64,14 @@ { "type": "integer", "minimum": 0 }, { "type": "string", "description": "Decimal, hex (0x...) or binary string." } ], - "description": "[random, matmul_phased] Base byte address of the memory region." + "description": "[random, linear, matmul_phased] Base byte address of the memory region." }, "region_size_bytes": { "oneOf": [ { "type": "integer", "minimum": 0 }, { "type": "string", "description": "Decimal, hex (0x...) or binary string." } ], - "description": "[random, matmul_phased] Size in bytes of the memory region." + "description": "[random, linear, matmul_phased] Size in bytes of the memory region. For linear/matmul_phased, if n_transactions is omitted this can be used to derive it from transaction width." }, "start_address": { @@ -181,7 +181,7 @@ "minimum": 1, "maximum": 100, "default": 100, - "description": "[random, linear] Bus utilization percentage." + "description": "[random, linear, matmul_phased] Modeled request utilization percentage (adds req=0 idles after each request when <100)." 
}, "traffic_read_pct": { "type": "integer", @@ -204,7 +204,13 @@ }, { "if": { "properties": { "mem_access_type": { "const": "linear" } }, "required": ["mem_access_type"] }, - "then": { "anyOf": [{ "required": ["n_transactions"] }, { "required": ["length"] }] } + "then": { + "anyOf": [ + { "required": ["n_transactions"] }, + { "required": ["length"] }, + { "required": ["region_size_bytes"] } + ] + } }, { "if": { "properties": { "mem_access_type": { "const": "2d" } }, "required": ["mem_access_type"] }, @@ -220,7 +226,11 @@ "required": ["mem_access_type"] }, "then": { - "anyOf": [{ "required": ["n_transactions"] }, { "required": ["matrix_m", "matrix_n", "matrix_k"] }] + "anyOf": [ + { "required": ["n_transactions"] }, + { "required": ["region_size_bytes"] }, + { "required": ["matrix_m", "matrix_n", "matrix_k"] } + ] } } ] @@ -228,7 +238,7 @@ "master": { "type": "object", - "description": "Per-master configuration. A master either has a flat single-pattern config (legacy, backward compatible) or a 'patterns' list for multi-phase execution.", + "description": "Per-master configuration. A master either has a flat single-pattern config or a 'patterns' list for multi-job execution.", "unevaluatedProperties": false, "properties": { "id": { @@ -248,7 +258,7 @@ "patterns": { "type": "array", "minItems": 1, - "description": "Ordered list of pattern segments. Executed sequentially; a PAUSE fence token is inserted between each pair of consecutive patterns. The wait_for of pattern f defines which phases must have advanced past fence f before this master resumes. If omitted, the master config itself is treated as a single flat pattern (legacy format).", + "description": "Ordered list of pattern segments. Executed sequentially; a PAUSE fence token is inserted between each pair of consecutive patterns. The wait_for_jobs of pattern f defines which jobs must have advanced past fence f before this master resumes. 
If omitted, the master config itself is treated as a single flat pattern.", "items": { "$ref": "#/$defs/pattern" } } }, diff --git a/target/verif/simvectors/hci_stimuli/patterns.py b/target/verif/simvectors/hci_stimuli/patterns.py index 063c008..afc6633 100644 --- a/target/verif/simvectors/hci_stimuli/patterns.py +++ b/target/verif/simvectors/hci_stimuli/patterns.py @@ -186,6 +186,7 @@ def idle_gen(self, id_start, append=False): def matmul_phased_gen(self, id_start, forbidden_read, forbidden_write, region_base_address, region_size_bytes, matmul_ratio_a=1, matmul_ratio_b=1, matmul_ratio_c=1, + traffic_pct=100, idle_cycles_between_phases=0, region_base_address_a=None, region_size_bytes_a=None, region_base_address_b=None, region_size_bytes_b=None, @@ -193,6 +194,7 @@ def matmul_phased_gen(self, id_start, forbidden_read, forbidden_write, append=False): id_value = id_start; fr_new, fw_new = [], [] ab = max(1, self.DATA_WIDTH // 8); tm = int(self.TOT_MEM_SIZE * 1024) + n_idles = self._idles_per_req(traffic_pct) def _res(bo, so, fb, fs): b = self._align_down(int(bo if bo is not None else fb), ab) @@ -228,6 +230,7 @@ def _emit(fobj, count, wen, pb, pe): self._record_access(add, wen, fr_new, fw_new) id_value += 1; addr += ab if addr >= pe: addr = pb + for _ in range(n_idles): self._write_idle(fobj) with self._open(append) as f: _emit(f, ca, 1, a_base, a_base+a_size) diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index c6a0eae..0cd3c12 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -11,6 +11,7 @@ import json import math import sys +import html from pathlib import Path import argparse @@ -71,6 +72,10 @@ def main(argv=None): N_EXT = hw_params['N_EXT'] N_HWPE = hw_params['N_HWPE'] HWPE_WIDTH_FACT = hw_params['HWPE_WIDTH_FACT'] + N_CORE_CFG = N_CORE + N_DMA_CFG = N_DMA + N_EXT_CFG = N_EXT + N_HWPE_CFG = N_HWPE log_masters = workload_config['log_masters'] hwpe_masters = workload_config['hwpe_masters'] @@ -78,8 
+83,21 @@ def main(argv=None): # Derived parameters ADD_WIDTH = math.ceil(math.log2(TOT_MEM_SIZE * 1000)) N_LOG = N_CORE + N_DMA + N_EXT + N_LOG_CFG = N_LOG IW = 8 + def _narrow_driver_name(local_idx: int) -> str: + idx = int(local_idx) + if idx < N_CORE_CFG: + return f"core_{idx}" + idx -= N_CORE_CFG + if idx < N_DMA_CFG: + return f"dma_{idx}" + idx -= N_DMA_CFG + if idx < N_EXT_CFG: + return f"ext_{idx}" + return f"narrow_{local_idx}" + # Validations if len(log_masters) != N_LOG: print(f"ERROR: Number of log masters in workload config ({len(log_masters)}) doesn't match hardware config N_LOG ({N_LOG})") @@ -145,7 +163,13 @@ def _bank_of(byte_addr): def _record_memory_map(kind, local_idx, description, config, n_test, data_width, access_bytes, region_base, region_size, start_address, stride0, len_d0, stride1, len_d1, stride2, master_config, total_mem_bytes): - label = f"{kind}_{local_idx}" + (f" ({description})" if description else "") + if kind == 'master_log': + label_prefix = _narrow_driver_name(local_idx) + elif kind == 'master_hwpe': + label_prefix = f"hwpe_{local_idx}" + else: + label_prefix = f"{kind}_{local_idx}" + label = label_prefix + (f" ({description})" if description else "") if config == 'idle' or n_test == 0: memory_map_entries.append({'label': label, 'pattern': config, 'n': 0, 'info': 'idle - no memory accesses'}) @@ -199,6 +223,10 @@ def _record_memory_map(kind, local_idx, description, config, n_test, data_width, if all(k in master_config for k in ('matrix_m', 'matrix_n', 'matrix_k')): m, n, k = int(master_config['matrix_m']), int(master_config['matrix_n']), int(master_config['matrix_k']) detail['matrix_dims'] = f"M={m} N={n} K={k} (A: {m}x{k}, B: {k}x{n}, C: {m}x{n})" + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" idle_between = 
master_config.get('idle_cycles_between_phases', 0) if idle_between: detail['idle_between_phases'] = f"{idle_between} cycles" @@ -283,6 +311,19 @@ def _normalize_mem_access_type(raw_value, master_name): sys.exit(1) return key + def _pattern_job_name(pattern_config: dict) -> str: + """Resolve job name from the mandatory 'job' key.""" + return str(pattern_config.get('job', 'default')) + + def _pattern_wait_for_jobs(pattern_config): + """Resolve dependency list from the mandatory 'wait_for_jobs' key.""" + raw = pattern_config.get('wait_for_jobs', []) + if raw is None: + return [] + if isinstance(raw, list): + return [str(x) for x in raw] + return [str(raw)] + def _warn_if_id_mismatch(master_cfg, expected_idx, master_name): raw_id = master_cfg.get("id", expected_idx) try: @@ -300,10 +341,16 @@ def _resolve_n_transactions(master_config: dict, mem_access_type: str, data_widt """Resolve n_transactions from explicit field or geometry, depending on pattern.""" if 'n_transactions' in master_config: return int(master_config['n_transactions']) + access_bytes = max(1, int(data_width // 8)) if mem_access_type == 'linear': length = master_config.get('length') if length is not None: return int(length) + raw_region_size = master_config.get('region_size_bytes') + if raw_region_size is not None: + region_size = _parse_maybe_bin_int(raw_region_size, None) + if region_size is not None: + return max(0, int(region_size) // access_bytes) elif mem_access_type == '2d': len_d0 = master_config.get('len_d0') len_d1 = master_config.get('len_d1') @@ -316,6 +363,13 @@ def _resolve_n_transactions(master_config: dict, mem_access_type: str, data_widt if len_d0 is not None and len_d1 is not None and len_d2 is not None: return int(len_d0) * int(len_d1) * int(len_d2) elif mem_access_type == 'matmul_phased': + # For non-random region-based traffic, allow deriving transactions + # from region size and transaction width. 
+ raw_region_size = master_config.get('region_size_bytes') + if raw_region_size is not None: + region_size = _parse_maybe_bin_int(raw_region_size, None) + if region_size is not None: + return max(0, int(region_size) // access_bytes) m = master_config.get('matrix_m') n = master_config.get('matrix_n') k = master_config.get('matrix_k') @@ -356,8 +410,19 @@ def _generate_pattern( pattern_config['mem_access_type'], f"{kind}_{master_local_idx}", ) - start_address = str(pattern_config.get('start_address', '0')) - stride0 = int(pattern_config.get('stride0', 0)) + if 'start_address' in pattern_config: + start_address = str(pattern_config['start_address']) + elif config == 'linear' and 'region_base_address' in pattern_config: + start_address = str(pattern_config['region_base_address']) + else: + start_address = '0' + + if 'stride0' in pattern_config: + stride0 = int(pattern_config['stride0']) + elif config == 'linear' and 'region_size_bytes' in pattern_config: + stride0 = 1 + else: + stride0 = 0 len_d0 = int(pattern_config.get('len_d0', 0)) stride1 = int(pattern_config.get('stride1', 0)) len_d1 = int(pattern_config.get('len_d1', 0)) @@ -369,7 +434,19 @@ def _generate_pattern( default_region_base = master_local_idx * default_region_size region_base = _parse_maybe_bin_int(pattern_config.get('region_base_address'), default_region_base) - region_size = _parse_maybe_bin_int(pattern_config.get('region_size_bytes'), default_region_size) + + # For non-random region-based patterns, if n_transactions is provided but + # region_size_bytes is omitted, span a full non-wrapping region that can + # hold all transactions once at the current transaction width. 
+ region_size_input = pattern_config.get('region_size_bytes') + if ( + config in {'linear', 'matmul_phased'} + and region_size_input is None + and 'n_transactions' in pattern_config + ): + region_size_input = int(pattern_config['n_transactions']) * access_bytes + + region_size = _parse_maybe_bin_int(region_size_input, default_region_size) region_base = (region_base // access_bytes) * access_bytes if region_base >= total_mem_bytes: @@ -382,13 +459,18 @@ def _generate_pattern( n_test = _resolve_n_transactions(pattern_config, config, data_width, kind, master_local_idx) master.N_TEST = n_test + # Keep forbidden-address filtering local to this pattern invocation. + # This allows intended buffer reuse across phases (e.g. double buffering) + # while still avoiding duplicates inside a single pattern generator call. + forbidden_read_local = list(LIST_OF_FORBIDDEN_ADDRESSES_READ) + forbidden_write_local = list(LIST_OF_FORBIDDEN_ADDRESSES_WRITE) if config == 'random': tpct = pattern_config.get('traffic_pct') next_start_id = master.random_gen( next_start_id, - LIST_OF_FORBIDDEN_ADDRESSES_READ, - LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + forbidden_read_local, + forbidden_write_local, region_base=region_base, region_size=region_size, traffic_pct=int(tpct) if tpct is not None else 100, @@ -399,8 +481,8 @@ def _generate_pattern( tpct = pattern_config.get('traffic_pct') next_start_id = master.linear_gen( stride0, start_address, next_start_id, - LIST_OF_FORBIDDEN_ADDRESSES_READ, - LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + forbidden_read_local, + forbidden_write_local, traffic_pct=int(tpct) if tpct is not None else 100, traffic_read_pct=pattern_config.get('traffic_read_pct'), append=append, @@ -408,16 +490,16 @@ def _generate_pattern( elif config == '2d': next_start_id = master.gen_2d( stride0, len_d0, stride1, start_address, next_start_id, - LIST_OF_FORBIDDEN_ADDRESSES_READ, - LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + forbidden_read_local, + forbidden_write_local, 
idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), append=append, ) elif config == '3d': next_start_id = master.gen_3d( stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, - LIST_OF_FORBIDDEN_ADDRESSES_READ, - LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + forbidden_read_local, + forbidden_write_local, idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), append=append, ) @@ -438,13 +520,14 @@ def _generate_pattern( sys.exit(1) next_start_id = master.matmul_phased_gen( next_start_id, - LIST_OF_FORBIDDEN_ADDRESSES_READ, - LIST_OF_FORBIDDEN_ADDRESSES_WRITE, + forbidden_read_local, + forbidden_write_local, region_base, region_size, int(pattern_config.get('matmul_ratio_a', 1)), int(pattern_config.get('matmul_ratio_b', 1)), int(pattern_config.get('matmul_ratio_c', 1)), + traffic_pct=int(pattern_config.get('traffic_pct', 100)), idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), region_base_address_a=_parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None), region_size_bytes_a=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None), @@ -492,7 +575,7 @@ def _generate_master( if start_delay > 0: pending_start_delays.append((filepath, start_delay, data_width)) - # For each pattern with wait_for, prepend a synthetic idle+PAUSE that acts as + # For each pattern with wait_for_jobs, prepend a synthetic idle+PAUSE that acts as # the blocking fence. The pattern's own trailing PAUSE is always mask=0 (free # pass), so fence_idx advances immediately after the real work is done. 
# This separates "I am done" (trailing PAUSE, free) from "I may start" (idle @@ -500,7 +583,7 @@ def _generate_master( dw = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH first_written = False for p_idx, pattern_config in enumerate(patterns): - if pattern_config.get('wait_for'): + if _pattern_wait_for_jobs(pattern_config): # Synthetic idle+PAUSE gates this pattern _idle = StimuliGenerator(IW, DATA_WIDTH, N_BANKS, TOT_MEM_SIZE, dw, ADD_WIDTH, str(filepath), 0, master_global_idx) @@ -575,11 +658,11 @@ def _generate_master( # resume from that PAUSE. # # For a master with N patterns, there are N fence slots (slot 0 = before - # pattern 0, slot f = before pattern f). The wait_for of pattern f defines + # pattern 0, slot f = before pattern f). The wait_for_jobs of pattern f defines # the mask at fence slot f. # # Legacy flat masters (no 'patterns' key) are treated as single-pattern - # masters: one fence slot (slot 0) from the top-level wait_for field. + # masters: one fence slot (slot 0) from the top-level wait_for_jobs field. # ----------------------------------------------------------------------- N_DRIVERS = N_LOG + N_HWPE @@ -589,30 +672,30 @@ def _patterns_of(master_config): return master_config['patterns'] return [master_config] - # Build phase->driver map: every pattern of every driver registers its phase. - # This allows wait_for to reference any phase, not just first patterns. - # A phase may be associated with multiple drivers (e.g. 8 cores all in softmax_t0). - phase_to_drivers: dict[str, list[int]] = {} + # Build job->driver map: every pattern of every driver registers its job. + # This allows wait_for_jobs to reference any job, not just first patterns. + # A job may be associated with multiple drivers (e.g. 8 cores all in softmax_t0). 
+ job_to_drivers = {} all_masters = [(m, False) for m in log_masters] + [(m, True) for m in hwpe_masters] for i, (m, _) in enumerate(all_masters): for pat in _patterns_of(m): - phase = str(pat.get('phase', 'default')) - if i not in phase_to_drivers.get(phase, []): - phase_to_drivers.setdefault(phase, []).append(i) - - # Build phase->pattern_index map: for each phase, which pattern index within - # each driver corresponds to that phase. Used to compute FENCE_REQ_LEVELS. - # phase_pattern_idx[phase][driver] = pattern index of that phase in that driver - phase_pattern_idx: dict[str, dict[int, int]] = {} + job = _pattern_job_name(pat) + if i not in job_to_drivers.get(job, []): + job_to_drivers.setdefault(job, []).append(i) + + # Build job->pattern_index map: for each job, which pattern index within + # each driver corresponds to that job. Used to compute FENCE_REQ_LEVELS. + # job_pattern_idx[job][driver] = pattern index of that job in that driver + job_pattern_idx = {} for i, (m, _) in enumerate(all_masters): for p_idx, pat in enumerate(_patterns_of(m)): - phase = str(pat.get('phase', 'default')) - phase_pattern_idx.setdefault(phase, {})[i] = p_idx + job = _pattern_job_name(pat) + job_pattern_idx.setdefault(job, {})[i] = p_idx - def _resolve_wait_mask(wait_for_list): + def _resolve_wait_mask(wait_for_jobs_list): mask = 0 - for dep_phase in wait_for_list: - for dep_drv in phase_to_drivers.get(str(dep_phase), []): + for dep_job in wait_for_jobs_list: + for dep_drv in job_to_drivers.get(str(dep_job), []): mask |= (1 << dep_drv) return mask @@ -621,38 +704,39 @@ def _resolve_wait_mask(wait_for_list): # including the trailing PAUSE of pattern p. 
def _fence_idx_after_pattern(drv_idx, pat_idx): pats = _patterns_of(all_masters[drv_idx][0]) - # Count synthetic idle gates for patterns 0..pat_idx (those with wait_for) - n_gates = sum(1 for k in range(pat_idx + 1) if pats[k].get('wait_for')) + # Count synthetic idle gates for patterns 0..pat_idx (those with wait_for_jobs) + n_gates = sum(1 for k in range(pat_idx + 1) if _pattern_wait_for_jobs(pats[k])) # Plus trailing PAUSEs for patterns 0..pat_idx n_trailing = pat_idx + 1 return n_gates + n_trailing - def _resolve_req_levels(wait_for_list): + def _resolve_req_levels(wait_for_jobs_list): """Required fence_idx[j] = fence_idx value of j after finishing pattern p_j.""" levels = [0] * N_DRIVERS - for dep_phase in wait_for_list: - for dep_drv in phase_to_drivers.get(str(dep_phase), []): - p_idx = phase_pattern_idx.get(str(dep_phase), {}).get(dep_drv, 0) + for dep_job in wait_for_jobs_list: + for dep_drv in job_to_drivers.get(str(dep_job), []): + p_idx = job_pattern_idx.get(str(dep_job), {}).get(dep_drv, 0) levels[dep_drv] = _fence_idx_after_pattern(dep_drv, p_idx) return levels # Build per-driver fence mask and req_level lists. - # Each pattern with wait_for gets a synthetic idle gate (mask = wait_for) before it. + # Each pattern with wait_for_jobs gets a synthetic idle gate (mask = wait_for_jobs) before it. # Trailing PAUSEs always have mask=0 (free pass — just advance fence_idx). 
# Fences are enumerated in file order: for each pattern p: - # if p has wait_for: synthetic idle fence (mask = wait_for of p) + # if p has wait_for_jobs: synthetic idle fence (mask = wait_for_jobs of p) # trailing PAUSE fence (mask = 0) - fence_masks: list[list[int]] = [] - req_levels: list[list[list[int]]] = [] + fence_masks = [] + req_levels = [] for i, (m, _) in enumerate(all_masters): patterns = _patterns_of(m) per_masks = [] per_levels = [] for pat in patterns: - if pat.get('wait_for'): + wait_for_jobs = _pattern_wait_for_jobs(pat) + if wait_for_jobs: # Synthetic idle gate: blocking fence - per_masks.append(_resolve_wait_mask(pat['wait_for'])) - per_levels.append(_resolve_req_levels(pat['wait_for'])) + per_masks.append(_resolve_wait_mask(wait_for_jobs)) + per_levels.append(_resolve_req_levels(wait_for_jobs)) # Trailing PAUSE: free pass, just signals completion per_masks.append(0) per_levels.append([0] * N_DRIVERS) @@ -699,7 +783,7 @@ def _resolve_req_levels(wait_for_list): phases_mk_path.write_text( "# Auto-generated by main.py - DO NOT EDIT MANUALLY\n" "# Per-driver per-fence dependency data for tb_hci.sv.\n" - f"# Drivers 0..{N_LOG-1} = log masters, {N_LOG}..{N_DRIVERS-1} = HWPE masters.\n" + f"# Drivers 0..{N_LOG-1} = narrow masters (core/dma/ext), {N_LOG}..{N_DRIVERS-1} = HWPE masters.\n" f"# fence f = PAUSE after pattern f; fence_idx[i]==k means i completed k patterns.\n" f"# FENCE_MASKS[i][f][j]=1: j is a dependency of i at fence f.\n" f"# FENCE_REQ_LEVELS_PACKED[i][f]: packed {N_DRIVERS*4}-bit vector, bits [j*4+3:j*4] = min fence_idx[j].\n" @@ -713,6 +797,337 @@ def _resolve_req_levels(wait_for_list): # ----------------------------------------------------------------------- # Build and emit memory map report # ----------------------------------------------------------------------- + INTERCO_TYPE = str(hw_params.get('INTERCO_TYPE', 'HCI')).strip().upper() + if INTERCO_TYPE not in {"LOG", "MUX", "HCI"}: + INTERCO_TYPE = "HCI" + DW_NARROW = int(DATA_WIDTH) 
+ DW_WIDE = int(HWPE_WIDTH_FACT * DATA_WIDTH) + N_NARROW_HCI_CFG = int( + N_CORE_CFG + N_DMA_CFG + N_EXT_CFG + + (N_HWPE_CFG * HWPE_WIDTH_FACT if INTERCO_TYPE == "LOG" else 0) + ) + N_WIDE_HCI_CFG = int(N_HWPE_CFG if INTERCO_TYPE == "HCI" else (1 if INTERCO_TYPE == "MUX" else 0)) + N_MASTER_PORTS_CFG = int(N_NARROW_HCI_CFG + N_WIDE_HCI_CFG) + + def _driver_name(driver_idx): + if driver_idx < N_LOG: + return _narrow_driver_name(driver_idx) + return f"hwpe_{driver_idx - N_LOG}" + + def _resolve_regions(pattern_config, mem_access_type, is_hwpe, local_idx, n_peers): + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + access_bytes = max(1, int(data_width // 8)) + total_mem_bytes = int(TOT_MEM_SIZE * 1024) + default_region_size = total_mem_bytes // max(1, n_peers) + default_region_base = local_idx * default_region_size + + region_base = _parse_maybe_bin_int(pattern_config.get('region_base_address'), default_region_base) + region_size = _parse_maybe_bin_int(pattern_config.get('region_size_bytes'), default_region_size) + + region_base = (region_base // access_bytes) * access_bytes + if region_base >= total_mem_bytes: + region_base = region_base % total_mem_bytes + region_size = (max(0, region_size) // access_bytes) * access_bytes + if region_size <= 0: + region_size = (default_region_size // access_bytes) * access_bytes + if region_base + region_size > total_mem_bytes: + region_size = ((total_mem_bytes - region_base) // access_bytes) * access_bytes + + if region_size <= 0: + return [] + + if mem_access_type == 'idle': + return [] + + if mem_access_type != 'matmul_phased': + return [{ + 'label': 'region', + 'base': region_base, + 'size': region_size, + 'end': region_base + region_size - 1, + }] + + ra = _parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None) + sb = 
_parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None) + + regions = [] + if ra is not None and sa is not None: + sub_defs = [ + ('A(read)', ra, sa), + ('B(read)', rb if rb is not None else ra, sb if sb is not None else sa), + ('C(write)', rc if rc is not None else ra, sc if sc is not None else sa), + ] + for label, base_raw, size_raw in sub_defs: + base = (int(base_raw) // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, int(size_raw)) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size > 0: + regions.append({ + 'label': label, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + + n_words = region_size // access_bytes + if n_words < 3: + return [{ + 'label': 'region', + 'base': region_base, + 'size': region_size, + 'end': region_base + region_size - 1, + }] + a_words = max(1, n_words // 3) + b_words = max(1, n_words // 3) + c_words = n_words - a_words - b_words + sub_regions = [ + ('A(read)', region_base, a_words * access_bytes), + ('B(read)', region_base + a_words * access_bytes, b_words * access_bytes), + ('C(write)', region_base + (a_words + b_words) * access_bytes, c_words * access_bytes), + ] + for label, base, size in sub_regions: + if size <= 0: + continue + regions.append({ + 'label': label, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + + def _estimate_pattern_cycles(pattern_config, mem_access_type, n_test): + # Intentionally simple temporal model: + # - one time unit per transaction, plus optional req=0 idles from traffic shaping + # - no interconnect stall/conflict modeling + # - explicit start_delay_cycles is still honored separately + base = max(0, int(n_test)) + tpct = 
pattern_config.get('traffic_pct') + n_idles_per_req = 0 + if tpct is not None: + tp = max(1, min(100, int(tpct))) + n_idles_per_req = 0 if tp >= 100 else int(round((100 - tp) / tp)) + cycles = base * (1 + n_idles_per_req) + # Keep explicit matmul phase-boundary idle modeling. + if mem_access_type == 'matmul_phased': + idle_between = int(pattern_config.get('idle_cycles_between_phases', 0)) + if idle_between > 0: + ra = max(0, int(pattern_config.get('matmul_ratio_a', 1))) + rb = max(0, int(pattern_config.get('matmul_ratio_b', 1))) + rc = max(0, int(pattern_config.get('matmul_ratio_c', 1))) + if ra == 0 and rb == 0 and rc == 0: + ra, rb, rc = 1, 1, 1 + s = ra + rb + rc + ca = (base * ra) // s + cb = (base * rb) // s + cc = base - ca - cb + phase_gaps = 0 + if ca > 0 and (cb > 0 or cc > 0): + phase_gaps += 1 + if cb > 0 and cc > 0: + phase_gaps += 1 + cycles += idle_between * phase_gaps + return int(cycles) + + pattern_nodes = [] + node_idx_by_driver_pattern = {} + job_to_nodes = {} + driver_last_node = {} + + for drv_idx, (master_cfg, is_hwpe) in enumerate(all_masters): + patterns = _patterns_of(master_cfg) + local_idx = drv_idx - N_LOG if is_hwpe else drv_idx + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + kind = 'master_hwpe' if is_hwpe else 'master_log' + n_peers = max(1, N_HWPE if is_hwpe else N_LOG) + start_delay = int(master_cfg.get('start_delay_cycles', 0)) + for p_idx, pat in enumerate(patterns): + raw_type = pat.get('mem_access_type', 'idle') + mem_access_type = _normalize_mem_access_type(raw_type, f"{kind}_{local_idx}") + n_test = _resolve_n_transactions(pat, mem_access_type, data_width, kind, local_idx) + declared_wait_for_jobs = _pattern_wait_for_jobs(pat) + # Timeline view follows declared dependencies from workload.json. 
+ effective_wait_for_jobs = declared_wait_for_jobs + node = { + 'node_idx': len(pattern_nodes), + 'driver_idx': drv_idx, + 'driver_name': _driver_name(drv_idx), + 'is_hwpe': is_hwpe, + 'local_idx': local_idx, + 'pattern_idx': p_idx, + 'description': str(pat.get('description', '')).strip(), + 'job': _pattern_job_name(pat), + 'wait_for_jobs_declared': declared_wait_for_jobs, + 'wait_for_jobs_effective': effective_wait_for_jobs, + 'n_transactions': int(n_test), + 'cycles': int(_estimate_pattern_cycles(pat, mem_access_type, n_test)), + 'mem_access_type': mem_access_type, + 'traffic_read_pct': pat.get('traffic_read_pct'), + 'txn_bytes': int(data_width // 8), + 'start_delay': start_delay if p_idx == 0 else 0, + 'regions': _resolve_regions(pat, mem_access_type, is_hwpe, local_idx, n_peers), + } + pattern_nodes.append(node) + node_idx_by_driver_pattern[(drv_idx, p_idx)] = node['node_idx'] + job_to_nodes.setdefault(node['job'], []).append(node['node_idx']) + driver_last_node[drv_idx] = node['node_idx'] + + n_nodes = len(pattern_nodes) + preds = [set() for _ in range(n_nodes)] + succs = [set() for _ in range(n_nodes)] + mux_serialization_applied = False + mux_phase_order = [] + + def _add_edge(src, dst): + if src == dst or src < 0 or dst < 0: + return + if src not in preds[dst]: + preds[dst].add(src) + succs[src].add(dst) + + for node in pattern_nodes: + n_idx = node['node_idx'] + drv_idx = node['driver_idx'] + p_idx = node['pattern_idx'] + if p_idx > 0: + _add_edge(node_idx_by_driver_pattern[(drv_idx, p_idx - 1)], n_idx) + for dep_job in node['wait_for_jobs_effective']: + for dep_idx in job_to_nodes.get(dep_job, []): + _add_edge(dep_idx, n_idx) + + if INTERCO_TYPE == "MUX": + # Match tb_hci MUX semantics in the temporal model: + # serialize HWPE execution by job order, and by HWPE ID within a job. 
+ hwpe_nodes = [n for n in pattern_nodes if n['is_hwpe']] + if hwpe_nodes: + job_first_seen = {} + for n in sorted(hwpe_nodes, key=lambda x: (x['pattern_idx'], x['local_idx'], x['node_idx'])): + job_first_seen.setdefault(n['job'], len(job_first_seen)) + + job_preds = {jb: set() for jb in job_first_seen} + job_succs = {jb: set() for jb in job_first_seen} + for n in hwpe_nodes: + cur = n['job'] + for dep_job in n['wait_for_jobs_effective']: + dep = str(dep_job) + if dep in job_first_seen and dep != cur: + job_preds[cur].add(dep) + job_succs[dep].add(cur) + + phase_indeg = {jb: len(job_preds[jb]) for jb in job_first_seen} + phase_ready = sorted([jb for jb, deg in phase_indeg.items() if deg == 0], + key=lambda jb: job_first_seen[jb]) + mux_phase_order = [] + while phase_ready: + cur = phase_ready.pop(0) + mux_phase_order.append(cur) + for nxt in sorted(job_succs[cur], key=lambda jb: job_first_seen[jb]): + phase_indeg[nxt] -= 1 + if phase_indeg[nxt] == 0: + phase_ready.append(nxt) + phase_ready.sort(key=lambda jb: job_first_seen[jb]) + if len(mux_phase_order) != len(job_first_seen): + mux_phase_order = sorted(job_first_seen.keys(), key=lambda jb: job_first_seen[jb]) + + phase_rank = {ph: i for i, ph in enumerate(mux_phase_order)} + hwpe_sorted = sorted( + hwpe_nodes, + key=lambda n: ( + phase_rank.get(n['job'], 10 ** 9), + n['local_idx'], + n['pattern_idx'], + n['node_idx'], + ), + ) + for i in range(1, len(hwpe_sorted)): + _add_edge(hwpe_sorted[i - 1]['node_idx'], hwpe_sorted[i]['node_idx']) + mux_serialization_applied = True + + indeg = [len(preds[i]) for i in range(n_nodes)] + ready = [i for i, d in enumerate(indeg) if d == 0] + ready.sort(key=lambda i: (pattern_nodes[i]['driver_idx'], pattern_nodes[i]['pattern_idx'])) + topo_order = [] + while ready: + cur = ready.pop(0) + topo_order.append(cur) + for nxt in sorted(succs[cur]): + indeg[nxt] -= 1 + if indeg[nxt] == 0: + ready.append(nxt) + ready.sort(key=lambda i: (pattern_nodes[i]['driver_idx'], 
pattern_nodes[i]['pattern_idx'])) + + schedule_has_cycle = len(topo_order) != n_nodes + if schedule_has_cycle: + topo_order = list(range(n_nodes)) + + node_start = [0 for _ in range(n_nodes)] + node_end = [0 for _ in range(n_nodes)] + for _ in range(max(1, n_nodes + 1)): + changed = False + for n_idx in topo_order: + dep_end = max((node_end[p] for p in preds[n_idx]), default=0) + start_time = max(int(pattern_nodes[n_idx]['start_delay']), dep_end) + end_time = start_time + max(0, int(pattern_nodes[n_idx]['cycles'])) + if start_time != node_start[n_idx] or end_time != node_end[n_idx]: + node_start[n_idx] = start_time + node_end[n_idx] = end_time + changed = True + if not changed: + break + + for n_idx, node in enumerate(pattern_nodes): + node['start_cycle'] = int(node_start[n_idx]) + node['end_cycle'] = int(node_end[n_idx]) + + total_cycles = max((n['end_cycle'] for n in pattern_nodes), default=0) + + driver_windows = {} + for node in pattern_nodes: + w = driver_windows.setdefault(node['driver_idx'], { + 'driver_idx': node['driver_idx'], + 'name': node['driver_name'], + 'is_hwpe': node['is_hwpe'], + 'start': node['start_cycle'], + 'end': node['end_cycle'], + }) + w['start'] = min(w['start'], node['start_cycle']) + w['end'] = max(w['end'], node['end_cycle']) + + regions_timeline = {} + for node in pattern_nodes: + for reg in node['regions']: + reg_key = (reg['base'], reg['size'], reg['label']) + entry = regions_timeline.setdefault(reg_key, { + 'base': reg['base'], + 'size': reg['size'], + 'end': reg['end'], + 'label': reg['label'], + 'accesses': [], + }) + entry['accesses'].append({ + 'driver_idx': node['driver_idx'], + 'driver_name': node['driver_name'], + 'job': node['job'], + 'start': node['start_cycle'], + 'end': node['end_cycle'], + 'pattern_idx': node['pattern_idx'], + 'description': node['description'], + }) + for reg in regions_timeline.values(): + reg['lifetime_start'] = min((a['start'] for a in reg['accesses']), default=0) + reg['lifetime_end'] = 
max((a['end'] for a in reg['accesses']), default=0) + + # ----------------------------------------------------------------------- + # Build memory_map.txt + # ----------------------------------------------------------------------- word_bytes = DATA_WIDTH // 8 bank_stride_bytes = N_BANKS * word_bytes lines = [] @@ -721,6 +1136,20 @@ def _resolve_req_levels(wait_for_list): lines.append(f" Total memory : {TOT_MEM_SIZE} KiB ({TOT_MEM_SIZE * 1024} B)") lines.append(f" Banks : {N_BANKS} x {word_bytes} B/word (interleaved, stride {bank_stride_bytes} B)") lines.append(f" Data width : {DATA_WIDTH} b LOG / {HWPE_WIDTH_FACT * DATA_WIDTH} b HWPE") + lines.append( + f" Drivers({DW_NARROW} bit) : " + f"CORE={N_CORE_CFG}, DMA={N_DMA_CFG}, EXT={N_EXT_CFG} (LOG total={N_LOG_CFG})" + ) + lines.append( + f" Drivers({DW_WIDE} bit) : " + f"HWPE={N_HWPE_CFG}" + ) + lines.append( + f" Interconnect type : {INTERCO_TYPE} | " + f"Narrow master ports ({DW_NARROW} bit)={N_NARROW_HCI_CFG} | " + f"Wide master ports ({DW_WIDE} bit)={N_WIDE_HCI_CFG} | " + f"Slave ports (banks)={N_BANKS}" + ) lines.append("=" * 72) for entry in memory_map_entries: lines.append(f"\n [{entry['label']}] pattern={entry['pattern']} n_transactions={entry['n']}") @@ -729,17 +1158,52 @@ def _resolve_req_levels(wait_for_list): for k, v in entry.get('detail', {}).items(): lines.append(f" {k:<14}: {v}") lines.append("") - lines.append(" Phase / dependency map:") - for phase, drivers in sorted(phase_to_drivers.items()): - driver_names = [f"log_{d}" if d < N_LOG else f"hwpe_{d - N_LOG}" for d in drivers] - lines.append(f" phase '{phase}': {', '.join(driver_names)}") + lines.append(" Job / dependency map:") + for job, drivers in sorted(job_to_drivers.items()): + driver_names = [_driver_name(d) for d in drivers] + lines.append(f" job '{job}': {', '.join(driver_names)}") for i in range(N_DRIVERS): - name = f"log_{i}" if i < N_LOG else f"hwpe_{i - N_LOG}" + name = _driver_name(i) for f, mask in enumerate(fence_masks[i]): if mask: 
- deps = [f"log_{j}" if j < N_LOG else f"hwpe_{j - N_LOG}" - for j in range(N_DRIVERS) if mask & (1 << j)] + deps = [_driver_name(j) for j in range(N_DRIVERS) if mask & (1 << j)] lines.append(f" {name} after pattern[{f}] (fence {f}) waits for: {', '.join(deps)}") + lines.append("") + lines.append(" Temporal schedule (transaction-count model):") + lines.append(f" Total modeled time: {total_cycles} units (1 unit = 1 transaction)") + lines.append(" Note: Declared wait_for_jobs dependencies are used for scheduling.") + lines.append(" Note: Per-driver list order is also enforced (pattern p[i] -> p[i+1]).") + lines.append(" Note: No interconnect contention/stall timing is modeled.") + if mux_serialization_applied: + lines.append(" Note: MUX mode serializes HWPE execution by job order, then HWPE ID (tb_hci-like).") + lines.append(f" MUX job order: {', '.join(mux_phase_order)}") + if schedule_has_cycle: + lines.append(" WARNING: dependency cycle detected while scheduling; using fallback order.") + for d in range(N_DRIVERS): + if d not in driver_windows: + continue + w = driver_windows[d] + lines.append(f" {w['name']:<8}: [{w['start']:>6}, {w['end']:>6}) dur={w['end'] - w['start']:>6}") + for node in [n for n in pattern_nodes if n['driver_idx'] == d]: + reg_tokens = [] + for reg in node['regions']: + reg_tokens.append(f"{reg['label']}@0x{reg['base']:08x}+{reg['size']}B") + reg_text = ", ".join(reg_tokens) if reg_tokens else "no regions" + lines.append( + f" p{node['pattern_idx']} job={node['job']} " + f"[{node['start_cycle']},{node['end_cycle']}) " + f"type={node['mem_access_type']} n={node['n_transactions']} {reg_text}" + ) + lines.append("") + lines.append(" Memory region lifetimes:") + for key in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2])): + reg = regions_timeline[key] + users = sorted({a['driver_name'] for a in reg['accesses']}) + lines.append( + f" {reg['label']:<8} 0x{reg['base']:08x}-0x{reg['end']:08x} " + f"({reg['size']:>6} B) 
lifetime=[{reg['lifetime_start']},{reg['lifetime_end']}) " + f"users={', '.join(users)}" + ) lines.append("=" * 72) report_text = "\n".join(lines) + "\n" @@ -748,6 +1212,375 @@ def _resolve_req_levels(wait_for_list): memory_map_path.write_text(report_text, encoding='utf-8') print(f"Memory map written: {memory_map_path}") + # ----------------------------------------------------------------------- + # Build memory_lifetime.html (simple SVG timeline view) + # ----------------------------------------------------------------------- + palette = [ + "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#17becf", + "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#9467bd", + "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", + "#e6ab02", "#a6761d", "#666666", + ] + + def _color_for_driver(drv_idx): + return palette[drv_idx % len(palette)] + + def _tick_step(total): + if total <= 10: + return 1 + raw = max(1, total // 10) + mag = 10 ** int(math.log10(raw)) + return int(math.ceil(raw / mag) * mag) + + total_for_plot = max(1, total_cycles) + chart_width = 1280 + x_left = 220 + x_right = 24 + plot_w = chart_width - x_left - x_right + row_h = 80 + y_top = 36 + exec_rows = [driver_windows[d] for d in sorted(driver_windows.keys())] + exec_h = y_top + row_h * len(exec_rows) + 72 + tick = _tick_step(total_for_plot) + ticks = list(range(0, total_for_plot + 1, tick)) + if ticks[-1] != total_for_plot: + ticks.append(total_for_plot) + + exec_svg = [] + exec_svg.append(f'') + exec_svg.append('') + exec_svg.append(f'Execution Timeline (transaction-count model)') + for t in ticks: + x = x_left + (t / total_for_plot) * plot_w + exec_svg.append(f'') + exec_svg.append(f'{t}') + exec_svg.append( + f'Transaction number' + ) + exec_svg.append( + f'' + 'Issued memory transactions (r/w) and computation cycles (i.e., req = 0) are both modeled here.' 
+ '' + ) + + def _rw_mix_text(node): + rpct = node.get('traffic_read_pct') + if rpct is None: + read_bytes = 0 + write_bytes = 0 + for reg in node.get('regions', []): + label_l = str(reg.get('label', '')).lower() + size_b = max(0, int(reg.get('size', 0))) + if 'read' in label_l and 'write' not in label_l: + read_bytes += size_b + elif 'write' in label_l and 'read' not in label_l: + write_bytes += size_b + if (read_bytes + write_bytes) > 0: + rpct = int(round(100.0 * read_bytes / (read_bytes + write_bytes))) + else: + rpct = 50 + rpct = max(0, min(100, int(rpct))) + if rpct == 0: + return "100% writes" + if rpct == 100: + return "100% reads" + return f"{rpct}% reads / {100 - rpct}% writes" + + def _fmt_kib(bytes_v): + return f"{(max(0, int(bytes_v)) / 1024.0):.1f} KiB" + + def _outside_detail_lines(node, base_addr, size_kib): + regs = list(node.get('regions', [])) + label_map = {str(r.get('label', '')): r for r in regs} + has_matmul_regs = any(k in label_map for k in ('A(read)', 'B(read)', 'C(write)')) + is_matmul = node.get('mem_access_type') in {'matmul', 'matmul_phased'} or has_matmul_regs + + if is_matmul: + mat_lines = [] + for lbl, short in [('A(read)', 'A'), ('B(read)', 'B'), ('C(write)', 'C')]: + if lbl in label_map: + r = label_map[lbl] + mat_lines.append(f"{short}@0x{int(r['base']):08x} ({_fmt_kib(r['size'])})") + if not mat_lines: + return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] + return mat_lines + + if regs: + r0 = regs[0] + return [f"@0x{int(r0['base']):08x}", f"{_fmt_kib(r0['size'])}"] + return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] + + for row_idx, row in enumerate(exec_rows): + y = y_top + row_idx * row_h + name = html.escape(row['name']) + label_color = "#111111" if row['is_hwpe'] else "#555555" + exec_svg.append(f'{name}') + exec_svg.append(f'') + nodes = [n for n in pattern_nodes if n['driver_idx'] == row['driver_idx']] + for node in nodes: + if node['end_cycle'] <= node['start_cycle']: + continue + x = x_left + 
(node['start_cycle'] / total_for_plot) * plot_w + w = max(1.0, ((node['end_cycle'] - node['start_cycle']) / total_for_plot) * plot_w) + color = _color_for_driver(node['driver_idx']) + base_addr = min((int(r.get('base', 0)) for r in node.get('regions', [])), default=0) + size_kib = (int(node['n_transactions']) * int(node.get('txn_bytes', 1))) / 1024.0 + rw_mix = _rw_mix_text(node) + line1_full = f"{node['job']}" + line2_full = f"{node['mem_access_type']}" + line3_full = rw_mix + outside_lines = _outside_detail_lines(node, base_addr, size_kib) + title = ( + f"{node['driver_name']} p{node['pattern_idx']} job={node['job']} " + f"[{node['start_cycle']}, {node['end_cycle']}) " + f"{node['mem_access_type']} n={node['n_transactions']} " + f"base=0x{base_addr:08x} {size_kib:.1f}KiB {rw_mix} " + f"{' | '.join(outside_lines)}" + ) + exec_svg.append('') + exec_svg.append(f'{html.escape(title)}') + exec_svg.append( + f'' + ) + line1 = html.escape(line1_full) + exec_svg.append( + f'{line1}' + ) + line2 = html.escape(line2_full) + exec_svg.append( + f'{line2}' + ) + line3 = html.escape(line3_full) + exec_svg.append( + f'{line3}' + ) + for ext_idx, ext in enumerate(outside_lines): + y_ext = y + 49 + (ext_idx * 9) + ext_txt = html.escape(ext) + exec_svg.append( + f'{ext_txt}' + ) + exec_svg.append('') + exec_svg.append('') + + region_rows = [regions_timeline[k] for k in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2]))] + overlap_rows = [] + overlaps_by_region = {i: [] for i in range(len(region_rows))} + for i in range(len(region_rows)): + a = region_rows[i] + for j in range(i + 1, len(region_rows)): + b = region_rows[j] + ov_base = max(a['base'], b['base']) + ov_end = min(a['end'], b['end']) + if ov_base <= ov_end: + ov_size = ov_end - ov_base + 1 + overlap_rows.append({ + 'a_idx': i, + 'b_idx': j, + 'ov_base': ov_base, + 'ov_end': ov_end, + 'ov_size': ov_size, + }) + overlaps_by_region[i].append((j, ov_base, ov_end, ov_size)) + overlaps_by_region[j].append((i, 
ov_base, ov_end, ov_size)) + + used_min = min((reg['base'] for reg in region_rows), default=0) + used_max = max((reg['end'] for reg in region_rows), default=0) + if used_max < used_min: + used_max = used_min + used_span = max(1, used_max - used_min + 1) + + map_left = 170 + map_right = 24 + map_plot_w = chart_width - map_left - map_right + map_h = 124 + bar_y = 56 + bar_h = 24 + region_map_svg = [] + region_map_svg.append(f'') + region_map_svg.append('') + region_map_svg.append( + f'' + f"Memory Region Blocks (used range 0x{used_min:08x} - 0x{used_max:08x})" + f"" + ) + region_map_svg.append( + f'' + ) + for pct in [0, 25, 50, 75, 100]: + x = map_left + (pct / 100.0) * map_plot_w + addr = used_min + int(((used_span - 1) * pct) / 100.0) + region_map_svg.append(f'') + region_map_svg.append( + f'0x{addr:08x}' + ) + for reg in region_rows: + x = map_left + ((reg['base'] - used_min) / used_span) * map_plot_w + w = max(1.0, (reg['size'] / used_span) * map_plot_w) + color = _color_for_driver(reg['accesses'][0]['driver_idx']) if reg['accesses'] else "#888888" + title = ( + f"{reg['label']} 0x{reg['base']:08x}-0x{reg['end']:08x} " + f"size={reg['size']}B accesses={len(reg['accesses'])}" + ) + region_map_svg.append( + f'' + f'{html.escape(title)}' + ) + if w >= 92: + region_map_svg.append( + f'{html.escape(reg["label"])}' + ) + region_map_svg.append('') + + legend_items = [] + for d in sorted(driver_windows.keys()): + n = _driver_name(d) + c = _color_for_driver(d) + legend_items.append( + f'' + f'' + f'{html.escape(n)}' + ) + + region_cards = [] + for idx, reg in enumerate(region_rows): + access_rows = [] + accesses = sorted(reg['accesses'], key=lambda a: (a['driver_idx'], a['pattern_idx'], a['start'])) + for acc in accesses: + desc = acc['description'] if acc['description'] else "-" + access_rows.append( + "" + f"{html.escape(acc['driver_name'])}" + f"p{acc['pattern_idx']}" + f"{html.escape(acc['job'])}" + f"{html.escape(desc)}" + f"[{acc['start']}, {acc['end']})" + "" + ) 
+ overlap_refs = overlaps_by_region.get(idx, []) + if overlap_refs: + ov_txt = ", ".join( + [ + f"{html.escape(region_rows[j]['label'])} " + f"(0x{ovb:08x}-0x{ove:08x}, {ovs} B)" + for (j, ovb, ove, ovs) in overlap_refs + ] + ) + else: + ov_txt = "none" + region_cards.append( + "
" + f"
{html.escape(reg['label'])} | base=0x{reg['base']:08x} | end=0x{reg['end']:08x} | size={reg['size']} B
" + f"
Overlaps: {ov_txt}
" + "" + "" + "" + f"{''.join(access_rows)}" + "
Driver/HWPEPatternJobDescriptionModeled interval
" + "
" + ) + + overlap_table_rows = [] + for ov in overlap_rows: + a = region_rows[ov['a_idx']] + b = region_rows[ov['b_idx']] + overlap_table_rows.append( + "" + f"{html.escape(a['label'])} (0x{a['base']:08x}-0x{a['end']:08x})" + f"{html.escape(b['label'])} (0x{b['base']:08x}-0x{b['end']:08x})" + f"0x{ov['ov_base']:08x}" + f"0x{ov['ov_end']:08x}" + f"{ov['ov_size']}" + "" + ) + + note = "Timeline follows declared wait_for_jobs dependencies from workload.json." + note_2 = ( + "Time axis is transaction-count based only " + "(no interconnect conflict/stall/arbitration modeling)." + ) + note_2b = "Per-driver list order is still enforced by stimulus/fence sequencing." + note_3 = "" + if mux_serialization_applied: + note_3 = ( + "MUX mode: HWPE execution is serialized by job order, " + "with lower HWPE ID first inside each job (tb_hci-like)." + ) + if mux_phase_order: + note_3 += f" Job order: {', '.join(mux_phase_order)}." + note_3_html = f"
{html.escape(note_3)}
" if note_3 else "" + cycle_warning_html = ( + "

Warning: dependency cycle detected; fallback scheduling order used.

" + if schedule_has_cycle else "" + ) + overlap_html = ( + "" + f"{''.join(overlap_table_rows)}" + "
Region ARegion BOverlap BaseOverlap EndOverlap Size (B)
" + if overlap_table_rows else + "
No overlaps detected among used regions.
" + ) + + html_doc = ( + "" + "Memory Access Region View (Transaction-Count Model)" + "" + "

Memory Access Region View

" + f"
Drivers ({DW_NARROW} bit): " + f"CORE={N_CORE_CFG}, DMA={N_DMA_CFG}, EXT={N_EXT_CFG}
" + f"
Drivers ({DW_WIDE} bit): HWPE={N_HWPE_CFG}
" + f"
Interconnect type: {html.escape(INTERCO_TYPE)} | " + f"Narrow master ports ({DW_NARROW} bit): {N_NARROW_HCI_CFG} | " + f"Wide master ports ({DW_WIDE} bit): {N_WIDE_HCI_CFG} | " + f"Slave ports (banks): {N_BANKS} | " + f"Total modeled time: {total_cycles} units
" + f"
{html.escape(note)}
" + f"
{html.escape(note_2)}
" + f"
{html.escape(note_2b)}
" + f"{note_3_html}" + f"{cycle_warning_html}" + "

Legend

" + f"{''.join(legend_items)}
" + "
" + f"{''.join(exec_svg)}" + "
" + "
" + f"{''.join(region_map_svg)}" + "
" + "

Region Usage Blocks

" + f"{''.join(region_cards)}" + "

Overlapping Regions

" + f"{overlap_html}" + "" + ) + + memory_lifetime_path = generated_dir / 'memory_lifetime.html' + memory_lifetime_path.write_text(html_doc, encoding='utf-8') + print(f"Memory lifetime plot written: {memory_lifetime_path}") + # ----------------------------------------------------------------------- # Apply per-master start delays # ----------------------------------------------------------------------- From 23362d398de9e219e95b867406cf3d38032c355c Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Tue, 10 Mar 2026 17:49:10 +0100 Subject: [PATCH 21/25] verif: Add framework for benchmarking sweep across different hw configs --- .gitignore | 1 + target/verif/config/generated/json_to_mk.py | 11 +- target/verif/config/hardware.json | 2 +- .../hardware_hci_2hwpe_8fact.json | 17 + .../hardware_log_2hwpe_8fact.json | 17 + .../hardware_mux_2hwpe_8fact.json | 17 + target/verif/config/workload.json | 4 +- .../config/workloads/workload_dma_gemm.json | 180 ++++++ .../workloads/workload_dma_gemm_cores.json | 540 ++++++++++++++++++ .../workloads/workload_dma_gemm_ideal.json | 121 ++++ .../results/hardware_hci_2hwpe_8fact.json | 431 ++++++++++++++ target/verif/scripts/parse_vsim.py | 359 ++++++++++++ target/verif/scripts/plot_sweep_results.py | 389 +++++++++++++ target/verif/scripts/run_sweep.sh | 27 + target/verif/src/simulation_report.sv | 45 +- target/verif/verif.mk | 11 +- 16 files changed, 2148 insertions(+), 24 deletions(-) create mode 100644 target/verif/config/sweep_hardware/hardware_hci_2hwpe_8fact.json create mode 100644 target/verif/config/sweep_hardware/hardware_log_2hwpe_8fact.json create mode 100644 target/verif/config/sweep_hardware/hardware_mux_2hwpe_8fact.json create mode 100644 target/verif/config/workloads/workload_dma_gemm.json create mode 100644 target/verif/config/workloads/workload_dma_gemm_cores.json create mode 100644 target/verif/config/workloads/workload_dma_gemm_ideal.json create mode 100644 target/verif/results/hardware_hci_2hwpe_8fact.json create 
mode 100644 target/verif/scripts/parse_vsim.py create mode 100644 target/verif/scripts/plot_sweep_results.py create mode 100644 target/verif/scripts/run_sweep.sh diff --git a/.gitignore b/.gitignore index ada3be7..0fbe9ba 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ target/verif/vsim/compile.tcl target/verif/vsim/modelsim.ini target/verif/vsim/transcript target/verif/vsim/vsim.wlf +target/verif/results diff --git a/target/verif/config/generated/json_to_mk.py b/target/verif/config/generated/json_to_mk.py index 4a1c9ba..1d6ad27 100755 --- a/target/verif/config/generated/json_to_mk.py +++ b/target/verif/config/generated/json_to_mk.py @@ -6,8 +6,8 @@ Templates are automatically discovered based on the config type argument. """ -import json import argparse +import json import sys from pathlib import Path from string import Template @@ -83,9 +83,9 @@ def parse_args(argv=None): help="Configuration type to generate.", ) parser.add_argument( - "config_dir", + "config_json", type=Path, - help="Directory containing source-of-truth JSON files.", + help="Path to selected source-of-truth JSON file.", ) parser.add_argument( "generated_dir", @@ -98,11 +98,10 @@ def parse_args(argv=None): def main(): args = parse_args() config_type = args.config_type - config_dir = args.config_dir.resolve() + json_file = args.config_json.resolve() + config_dir = json_file.parent generated_dir = args.generated_dir.resolve() - # Construct file paths based on config_type argument - json_file = config_dir / f"{config_type}.json" template_file = generated_dir / f"{config_type}.mk.tpl" # Load JSON config diff --git a/target/verif/config/hardware.json b/target/verif/config/hardware.json index 84c47a1..b349a65 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -9,7 +9,7 @@ "DATA_WIDTH": 32, "TOT_MEM_SIZE": 256, "N_BANKS": 64, - "INTERCO_TYPE": "MUX", + "INTERCO_TYPE": "HCI", "TS_BIT": 21, "EXPFIFO": 0, "SEL_LIC": 0 diff --git 
a/target/verif/config/sweep_hardware/hardware_hci_2hwpe_8fact.json b/target/verif/config/sweep_hardware/hardware_hci_2hwpe_8fact.json new file mode 100644 index 0000000..b349a65 --- /dev/null +++ b/target/verif/config/sweep_hardware/hardware_hci_2hwpe_8fact.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "HCI", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/target/verif/config/sweep_hardware/hardware_log_2hwpe_8fact.json b/target/verif/config/sweep_hardware/hardware_log_2hwpe_8fact.json new file mode 100644 index 0000000..a505fa1 --- /dev/null +++ b/target/verif/config/sweep_hardware/hardware_log_2hwpe_8fact.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "LOG", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/target/verif/config/sweep_hardware/hardware_mux_2hwpe_8fact.json b/target/verif/config/sweep_hardware/hardware_mux_2hwpe_8fact.json new file mode 100644 index 0000000..84c47a1 --- /dev/null +++ b/target/verif/config/sweep_hardware/hardware_mux_2hwpe_8fact.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "MUX", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/target/verif/config/workload.json b/target/verif/config/workload.json index 683a709..8299bf9 100644 --- a/target/verif/config/workload.json +++ 
b/target/verif/config/workload.json @@ -14,7 +14,7 @@ "hwpe_masters": [ { - "id": 1, + "id": 0, "description": "DMA engine (linear setup + ping/pong prefetch)", "patterns": [ { @@ -107,7 +107,7 @@ ] }, { - "id": 0, + "id": 1, "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", "patterns": [ { diff --git a/target/verif/config/workloads/workload_dma_gemm.json b/target/verif/config/workloads/workload_dma_gemm.json new file mode 100644 index 0000000..8299bf9 --- /dev/null +++ b/target/verif/config/workloads/workload_dma_gemm.json @@ -0,0 +1,180 @@ +{ + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes.", + "log_masters": [ + { "id": 0, "description": "Core 0 (idle)", "mem_access_type": "idle" }, + { "id": 1, "description": "Core 1 (idle)", "mem_access_type": "idle" }, + { "id": 2, "description": "Core 2 (idle)", "mem_access_type": "idle" }, + { "id": 3, "description": "Core 3 (idle)", "mem_access_type": "idle" }, + { "id": 4, "description": "Core 4 (idle)", "mem_access_type": "idle" }, + { "id": 5, "description": "Core 5 (idle)", "mem_access_type": "idle" }, + { "id": 6, "description": "Core 6 (idle)", "mem_access_type": "idle" }, + { "id": 7, "description": "Core 7 (idle)", "mem_access_type": "idle" }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ + { + "id": 0, + "description": "DMA engine (linear setup + ping/pong prefetch)", + "patterns": [ + { + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": 
"Setup preload: tile A1 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A1_buf1", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A2 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A2_buf0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A0", + "mem_access_type": "linear", + "job": "dma_store_C0_buf2", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A3 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A3_buf1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A1", + "mem_access_type": "linear", + "job": "dma_store_C1_buf3", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A2", + "mem_access_type": "linear", + "job": "dma_store_C2_buf2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A3", + "mem_access_type": "linear", + "job": "dma_store_C3_buf3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 1, + "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", + "patterns": [ + { + "description": "GEMM tile A0: A0 x B 
-> C0", + "mem_access_type": "matmul_phased", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A1: A1 x B -> C1", + "mem_access_type": "matmul_phased", + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", + "mem_access_type": "matmul_phased", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", + "mem_access_type": "matmul_phased", + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + } + ] + } + ] +} diff --git a/target/verif/config/workloads/workload_dma_gemm_cores.json b/target/verif/config/workloads/workload_dma_gemm_cores.json new 
file mode 100644 index 0000000..aa2da43 --- /dev/null +++ b/target/verif/config/workloads/workload_dma_gemm_cores.json @@ -0,0 +1,540 @@ +{ + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes. Cores generate linear 50% traffic after each GEMM on dedicated non-overlapping regions.", + "log_masters": [ + { + "id": 0, + "description": "Core 0 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 0 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core0_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core0_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core0_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core0_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 1, + "description": "Core 1 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 1 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core1_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core1_after_gemm_A1", + "wait_for_jobs": 
["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core1_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core1_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 2, + "description": "Core 2 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 2 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core2_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core2_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core2_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core2_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 3, + "description": "Core 3 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 3 traffic after gemm_A0", + "mem_access_type": "linear", + "job": 
"core3_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core3_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core3_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core3_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 4, + "description": "Core 4 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 4 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core4_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core4_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core4_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core4_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + 
"traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 5, + "description": "Core 5 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 5 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core5_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core5_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core5_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core5_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 6, + "description": "Core 6 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 6 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core6_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core6_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A2", + "mem_access_type": "linear", + "job": 
"core6_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core6_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 7, + "description": "Core 7 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 7 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core7_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core7_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core7_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core7_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + } + ] + }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ + { + "id": 0, + "description": "DMA engine (linear setup + ping/pong prefetch)", + "patterns": [ + { + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + 
"region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A1 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A1_buf1", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A2 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A2_buf0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A0", + "mem_access_type": "linear", + "job": "dma_store_C0_buf2", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A3 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A3_buf1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A1", + "mem_access_type": "linear", + "job": "dma_store_C1_buf3", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A2", + "mem_access_type": "linear", + "job": "dma_store_C2_buf2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A3", + "mem_access_type": "linear", + "job": "dma_store_C3_buf3", + 
"wait_for_jobs": ["gemm_A3"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 1, + "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", + "patterns": [ + { + "description": "GEMM tile A0: A0 x B -> C0", + "mem_access_type": "matmul_phased", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A1: A1 x B -> C1", + "mem_access_type": "matmul_phased", + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", + "mem_access_type": "matmul_phased", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", + "mem_access_type": "matmul_phased", + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + 
"region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + } + ] + } + ] +} diff --git a/target/verif/config/workloads/workload_dma_gemm_ideal.json b/target/verif/config/workloads/workload_dma_gemm_ideal.json new file mode 100644 index 0000000..b166c78 --- /dev/null +++ b/target/verif/config/workloads/workload_dma_gemm_ideal.json @@ -0,0 +1,121 @@ +{ + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes.", + "log_masters": [ + { "id": 0, "description": "Core 0 (idle)", "mem_access_type": "idle" }, + { "id": 1, "description": "Core 1 (idle)", "mem_access_type": "idle" }, + { "id": 2, "description": "Core 2 (idle)", "mem_access_type": "idle" }, + { "id": 3, "description": "Core 3 (idle)", "mem_access_type": "idle" }, + { "id": 4, "description": "Core 4 (idle)", "mem_access_type": "idle" }, + { "id": 5, "description": "Core 5 (idle)", "mem_access_type": "idle" }, + { "id": 6, "description": "Core 6 (idle)", "mem_access_type": "idle" }, + { "id": 7, "description": "Core 7 (idle)", "mem_access_type": "idle" }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ + { + "id": 0, + "description": "DMA engine (linear setup + ping/pong prefetch)", + "patterns": [ + { + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A3", + "mem_access_type": "linear", + "job": "dma_store_C3_buf3", + "wait_for_jobs": 
["gemm_A3"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 1, + "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", + "patterns": [ + { + "description": "GEMM tile A0: A0 x B -> C0", + "mem_access_type": "matmul_phased", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A1: A1 x B -> C1", + "mem_access_type": "matmul_phased", + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", + "mem_access_type": "matmul_phased", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", + "mem_access_type": "matmul_phased", + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 
8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + } + ] + } + ] +} diff --git a/target/verif/results/hardware_hci_2hwpe_8fact.json b/target/verif/results/hardware_hci_2hwpe_8fact.json new file mode 100644 index 0000000..1fd0fd4 --- /dev/null +++ b/target/verif/results/hardware_hci_2hwpe_8fact.json @@ -0,0 +1,431 @@ +{ + "hw_config": { + "masters": { + "core": 8, + "dma": 0, + "ext": 1, + "hwpe": 2, + "total": 11 + }, + "memory": { + "banks": 64, + "total_size_kb": 256, + "data_width_bits": 32, + "hwpe_width_lanes": 8 + }, + "interconnect": { + "sel_lic": 0, + "ts_bit": 21, + "expfifo": 0 + }, + "interconnect_side": { + "type": "HCI", + "n_narrow_hci": 8, + "n_wide_hci": 2, + "n_dma": 0, + "n_ext": 1, + "narrow_total_ports": 9, + "total_initiator_ports": 11 + }, + "id_address": { + "iw": 11, + "addr_width": 18, + "addr_width_bank": 12 + } + }, + "bandwidth": { + "ideal_memory_side_bit_per_cycle": 2048.0, + "ideal_interconnect_side_bit_per_cycle": 800.0, + "ideal_bottleneck_bit_per_cycle": 800.0, + "actual_completion_bit_per_cycle": 306.65, + "actual_completion_utilization_pct": 38.3, + "completion_phase_duration_cycles": 4488.0, + "granted_transactions": { + "reads": 3072, + "writes": 2304, + "total": 5376 + }, + "read_complete_responses": 3072 + }, + "simulation_time": { + "per_master": [ + { + "master_name": "master_hwpe_0", + "role_name": "HWPE0", + "sim_time_cycles": 4488.0 + }, + { + "master_name": "master_hwpe_1", + "role_name": "HWPE1", + "sim_time_cycles": 4228.0 + }, + { + "master_name": "master_log_0", + "role_name": "Core0", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_1", + "role_name": "Core1", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_2", + "role_name": "Core2", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_3", + "role_name": "Core3", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_4", + 
"role_name": "Core4", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_5", + "role_name": "Core5", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_6", + "role_name": "Core6", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_7", + "role_name": "Core7", + "sim_time_cycles": 3.0 + }, + { + "master_name": "master_log_8", + "role_name": "EXT0", + "sim_time_cycles": 1.0 + } + ], + "total_cycles": 4488.0 + }, + "read_response_coverage": { + "master_log_0": { + "observed": 0, + "expected": 0 + }, + "master_log_1": { + "observed": 0, + "expected": 0 + }, + "master_log_2": { + "observed": 0, + "expected": 0 + }, + "master_log_3": { + "observed": 0, + "expected": 0 + }, + "master_log_4": { + "observed": 0, + "expected": 0 + }, + "master_log_5": { + "observed": 0, + "expected": 0 + }, + "master_log_6": { + "observed": 0, + "expected": 0 + }, + "master_log_7": { + "observed": 0, + "expected": 0 + }, + "master_log_8": { + "observed": 0, + "expected": 0 + }, + "master_hwpe_0": { + "observed": 1024, + "expected": 1024 + }, + "master_hwpe_1": { + "observed": 2048, + "expected": 2048 + } + }, + "transaction_counts": { + "master_log_0": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_1": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_2": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_3": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_4": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_5": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_6": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_7": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_log_8": { + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0 + }, + "master_hwpe_0": 
{ + "granted_reads": 1024, + "granted_writes": 1280, + "read_complete": 1024 + }, + "master_hwpe_1": { + "granted_reads": 2048, + "granted_writes": 1024, + "read_complete": 2048 + } + }, + "request_to_grant_latency": { + "per_master": [ + { + "master_name": "master_hwpe_0", + "avg_req_to_gnt_stall_latency_cycles": 0.63, + "req_to_gnt_grants": 2304 + }, + { + "master_name": "master_hwpe_1", + "avg_req_to_gnt_stall_latency_cycles": 0.18, + "req_to_gnt_grants": 3072 + }, + { + "master_name": "master_log_0", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_1", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_2", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_3", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_4", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_5", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_6", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_7", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_8", + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + } + ], + "accumulated": { + "cycles": 1987.0, + "grants": 5376 + }, + "averages": { + "log": { + "weighted_cycles": 0.0, + "unweighted_cycles": 0.0 + }, + "hwpe": { + "weighted_cycles": 0.37, + "unweighted_cycles": 0.4 + }, + "global": { + "weighted_cycles": 0.37, + "unweighted_cycles": 0.4 + } + } + }, + "finish": { + "source": "/scratch/smazzola/projects/hci/hci/target/verif/src/simulation_report.sv", + "line": 464, + "time_ps": 224950, + "iteration": 4, + "instance": "/tb_hci/i_simulation_report" + }, + 
"masters": [ + { + "master_name": "master_hwpe_0", + "role_name": "HWPE0", + "sim_time_cycles": 4488.0, + "read_observed": 1024, + "read_expected": 1024, + "granted_reads": 1024, + "granted_writes": 1280, + "read_complete": 1024, + "avg_req_to_gnt_stall_latency_cycles": 0.63, + "req_to_gnt_grants": 2304 + }, + { + "master_name": "master_hwpe_1", + "role_name": "HWPE1", + "sim_time_cycles": 4228.0, + "read_observed": 2048, + "read_expected": 2048, + "granted_reads": 2048, + "granted_writes": 1024, + "read_complete": 2048, + "avg_req_to_gnt_stall_latency_cycles": 0.18, + "req_to_gnt_grants": 3072 + }, + { + "master_name": "master_log_0", + "role_name": "Core0", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_1", + "role_name": "Core1", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_2", + "role_name": "Core2", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_3", + "role_name": "Core3", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_4", + "role_name": "Core4", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": 
"master_log_5", + "role_name": "Core5", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_6", + "role_name": "Core6", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_7", + "role_name": "Core7", + "sim_time_cycles": 3.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + }, + { + "master_name": "master_log_8", + "role_name": "EXT0", + "sim_time_cycles": 1.0, + "read_observed": 0, + "read_expected": 0, + "granted_reads": 0, + "granted_writes": 0, + "read_complete": 0, + "avg_req_to_gnt_stall_latency_cycles": 0.0, + "req_to_gnt_grants": 0 + } + ] +} diff --git a/target/verif/scripts/parse_vsim.py b/target/verif/scripts/parse_vsim.py new file mode 100644 index 0000000..9c0f0ad --- /dev/null +++ b/target/verif/scripts/parse_vsim.py @@ -0,0 +1,359 @@ +#!/usr/bin/env python3 +"""Parse the final 'Simulation Summary' section from one transcript file.""" + +import argparse +import json +import re +import sys +from pathlib import Path +from typing import Dict, List + + +SUMMARY_MARKER = "------ Simulation Summary ------" + + +class ParseError(RuntimeError): + """Raised when the transcript summary cannot be parsed.""" + + +def _as_float(value: str) -> float: + return float(value) + + +def _as_int(value: str) -> int: + return int(value) + + +def _clean_line(raw: str) -> str: + line = raw.strip() + if line.startswith("#"): + line = line[1:].strip() + return line + + +def _summary_lines(transcript_text: str) -> List[str]: + idx = 
transcript_text.rfind(SUMMARY_MARKER) + if idx < 0: + raise ParseError(f"Summary marker '{SUMMARY_MARKER}' not found.") + return [_clean_line(line) for line in transcript_text[idx:].splitlines()] + + +def _ensure_master(masters: Dict[str, Dict[str, object]], master_name: str) -> Dict[str, object]: + entry = masters.get(master_name) + if entry is None: + entry = {"master_name": master_name} + masters[master_name] = entry + return entry + + +def parse_summary(transcript_text: str) -> Dict[str, object]: + lines = _summary_lines(transcript_text) + + result: Dict[str, object] = { + "hw_config": {}, + "bandwidth": {}, + "simulation_time": {"per_master": []}, + "read_response_coverage": {}, + "transaction_counts": {}, + "request_to_grant_latency": { + "per_master": [], + "accumulated": {}, + "averages": {}, + }, + "finish": {}, + } + + masters: Dict[str, Dict[str, object]] = {} + + patterns = { + "masters": re.compile(r"^Masters:\s*CORE=(\d+)\s*DMA=(\d+)\s*EXT=(\d+)\s*HWPE=(\d+)\s*\(total=(\d+)\)$"), + "memory": re.compile( + r"^Memory:\s*banks=(\d+)\s*total_size=(\d+)\s*kB\s*data_width=(\d+)\s*bits\s*hwpe_width=(\d+)\s*lanes$" + ), + "interconnect": re.compile(r"^Interconnect:\s*SEL_LIC=(\d+)\s*TS_BIT=(\d+)\s*EXPFIFO=(\d+)$"), + "interconnect_side": re.compile( + r"^Interconnect-side:\s*TYPE=(LOG|HCI|MUX|UNKNOWN)\s*N_NARROW_HCI=(\d+)\s*N_WIDE_HCI=(\d+)\s*N_DMA=(\d+)\s*N_EXT=(\d+)$" + ), + "id_addr": re.compile(r"^ID/address:\s*IW=(\d+)\s*ADDR_WIDTH=(\d+)\s*ADDR_WIDTH_BANK=(\d+)$"), + "ideal_mem_bw": re.compile(r"^Ideal BW \(memory side\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "ideal_interco_bw": re.compile(r"^Ideal BW \(interco side\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "ideal_master_bw_legacy": re.compile(r"^Ideal BW \(master side\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "ideal_bottleneck_bw": re.compile(r"^Ideal BW \(bottleneck\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "actual_bw": re.compile( + r"^Actual BW 
\(completion\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle\s*\[utilization:\s*([0-9]+(?:\.[0-9]+)?)%\]$" + ), + "completion_bw_legacy": re.compile(r"^Completion bandwidth .*:\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle$"), + "completion_cycles": re.compile(r"^Completion phase duration:\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$"), + "granted": re.compile(r"^Granted transactions:\s*reads=(\d+)\s*writes=(\d+)\s*total=(\d+)$"), + "read_complete": re.compile(r"^Read-complete responses:\s*(\d+)$"), + "total_sim_cycles": re.compile(r"^Total simulation time:\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$"), + "per_master_sim_time": re.compile(r"^([A-Za-z0-9_]+)\s*\((master_[^)]+)\):\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$"), + "coverage": re.compile(r"^(master_[^:]+):\s*observed\s*(\d+)\s*/\s*expected\s*(\d+)$"), + "tx_counts": re.compile(r"^(master_[^:]+):\s*granted reads=(\d+)\s*writes=(\d+),\s*read-complete=(\d+)$"), + "req_gnt": re.compile( + r"^(master_[^:]+):\s*avg req->gnt stall latency\s*([0-9]+(?:\.[0-9]+)?)\s*cycles over\s*(\d+)\s*grants$" + ), + "total_accum": re.compile( + r"^Total accumulated req->gnt latency:\s*([0-9]+(?:\.[0-9]+)?)\s*cycles over\s*(\d+)\s*grants$" + ), + "class_avg": re.compile( + r"^(LOG|HWPE|Global) avg req->gnt stall latency " + r"\((weighted by grant count|mean of per-master averages)\):\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$" + ), + "finish_note": re.compile(r"^\*\* Note: \$finish\s*:\s*(.+)\((\d+)\)$"), + "finish_time": re.compile(r"^Time:\s*([0-9]+)\s*ps\s*Iteration:\s*(\d+)\s*Instance:\s*(.+)$"), + } + + for line in lines: + if not line or line == SUMMARY_MARKER: + continue + + match = patterns["masters"].match(line) + if match: + result["hw_config"]["masters"] = { + "core": _as_int(match.group(1)), + "dma": _as_int(match.group(2)), + "ext": _as_int(match.group(3)), + "hwpe": _as_int(match.group(4)), + "total": _as_int(match.group(5)), + } + continue + + match = patterns["memory"].match(line) + if match: + result["hw_config"]["memory"] = { + "banks": _as_int(match.group(1)), + 
"total_size_kb": _as_int(match.group(2)), + "data_width_bits": _as_int(match.group(3)), + "hwpe_width_lanes": _as_int(match.group(4)), + } + continue + + match = patterns["interconnect"].match(line) + if match: + result["hw_config"]["interconnect"] = { + "sel_lic": _as_int(match.group(1)), + "ts_bit": _as_int(match.group(2)), + "expfifo": _as_int(match.group(3)), + } + continue + + match = patterns["interconnect_side"].match(line) + if match: + narrow_hci = _as_int(match.group(2)) + wide_hci = _as_int(match.group(3)) + n_dma = _as_int(match.group(4)) + n_ext = _as_int(match.group(5)) + result["hw_config"]["interconnect_side"] = { + "type": match.group(1), + "n_narrow_hci": narrow_hci, + "n_wide_hci": wide_hci, + "n_dma": n_dma, + "n_ext": n_ext, + "narrow_total_ports": narrow_hci + n_dma + n_ext, + "total_initiator_ports": narrow_hci + wide_hci + n_dma + n_ext, + } + continue + + match = patterns["id_addr"].match(line) + if match: + result["hw_config"]["id_address"] = { + "iw": _as_int(match.group(1)), + "addr_width": _as_int(match.group(2)), + "addr_width_bank": _as_int(match.group(3)), + } + continue + + match = patterns["ideal_mem_bw"].match(line) + if match: + result["bandwidth"]["ideal_memory_side_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["ideal_interco_bw"].match(line) + if match: + result["bandwidth"]["ideal_interconnect_side_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["ideal_master_bw_legacy"].match(line) + if match: + result["bandwidth"]["ideal_interconnect_side_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["ideal_bottleneck_bw"].match(line) + if match: + result["bandwidth"]["ideal_bottleneck_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["actual_bw"].match(line) + if match: + result["bandwidth"]["actual_completion_bit_per_cycle"] = _as_float(match.group(1)) + result["bandwidth"]["actual_completion_utilization_pct"] = 
_as_float(match.group(2)) + continue + + match = patterns["completion_bw_legacy"].match(line) + if match: + result["bandwidth"]["actual_completion_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["completion_cycles"].match(line) + if match: + result["bandwidth"]["completion_phase_duration_cycles"] = _as_float(match.group(1)) + continue + + match = patterns["granted"].match(line) + if match: + result["bandwidth"]["granted_transactions"] = { + "reads": _as_int(match.group(1)), + "writes": _as_int(match.group(2)), + "total": _as_int(match.group(3)), + } + continue + + match = patterns["read_complete"].match(line) + if match: + result["bandwidth"]["read_complete_responses"] = _as_int(match.group(1)) + continue + + match = patterns["total_sim_cycles"].match(line) + if match: + result["simulation_time"]["total_cycles"] = _as_float(match.group(1)) + continue + + match = patterns["per_master_sim_time"].match(line) + if match: + role_name = match.group(1) + master_name = match.group(2) + sim_cycles = _as_float(match.group(3)) + entry = _ensure_master(masters, master_name) + entry["role_name"] = role_name + entry["sim_time_cycles"] = sim_cycles + continue + + match = patterns["coverage"].match(line) + if match: + master_name = match.group(1) + observed = _as_int(match.group(2)) + expected = _as_int(match.group(3)) + entry = _ensure_master(masters, master_name) + entry["read_observed"] = observed + entry["read_expected"] = expected + result["read_response_coverage"][master_name] = { + "observed": observed, + "expected": expected, + } + continue + + match = patterns["tx_counts"].match(line) + if match: + master_name = match.group(1) + reads = _as_int(match.group(2)) + writes = _as_int(match.group(3)) + read_complete = _as_int(match.group(4)) + entry = _ensure_master(masters, master_name) + entry["granted_reads"] = reads + entry["granted_writes"] = writes + entry["read_complete"] = read_complete + result["transaction_counts"][master_name] = { + 
"granted_reads": reads, + "granted_writes": writes, + "read_complete": read_complete, + } + continue + + match = patterns["req_gnt"].match(line) + if match: + master_name = match.group(1) + avg_cycles = _as_float(match.group(2)) + grants = _as_int(match.group(3)) + entry = _ensure_master(masters, master_name) + entry["avg_req_to_gnt_stall_latency_cycles"] = avg_cycles + entry["req_to_gnt_grants"] = grants + continue + + match = patterns["total_accum"].match(line) + if match: + result["request_to_grant_latency"]["accumulated"] = { + "cycles": _as_float(match.group(1)), + "grants": _as_int(match.group(2)), + } + continue + + match = patterns["class_avg"].match(line) + if match: + group = match.group(1).lower() + avg_type = match.group(2) + value = _as_float(match.group(3)) + key = "weighted_cycles" if "weighted by grant count" in avg_type else "unweighted_cycles" + averages = result["request_to_grant_latency"]["averages"] + group_entry = averages.get(group, {}) + group_entry[key] = value + averages[group] = group_entry + continue + + match = patterns["finish_note"].match(line) + if match: + result["finish"]["source"] = match.group(1).strip() + result["finish"]["line"] = _as_int(match.group(2)) + continue + + match = patterns["finish_time"].match(line) + if match: + result["finish"]["time_ps"] = _as_int(match.group(1)) + result["finish"]["iteration"] = _as_int(match.group(2)) + result["finish"]["instance"] = match.group(3).strip() + continue + + sorted_masters = [masters[name] for name in sorted(masters.keys())] + result["simulation_time"]["per_master"] = [ + { + "master_name": row["master_name"], + "role_name": row.get("role_name"), + "sim_time_cycles": row.get("sim_time_cycles"), + } + for row in sorted_masters + ] + result["request_to_grant_latency"]["per_master"] = [ + { + "master_name": row["master_name"], + "avg_req_to_gnt_stall_latency_cycles": row.get("avg_req_to_gnt_stall_latency_cycles"), + "req_to_gnt_grants": row.get("req_to_gnt_grants"), + } + for row in 
sorted_masters + ] + result["masters"] = sorted_masters + + return result + + +def _cli_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Parse final Simulation Summary lines from one transcript.") + parser.add_argument("--transcript", required=True, help="Path to transcript file") + parser.add_argument("--out", default="", help="Optional output JSON file path") + return parser.parse_args() + + +def main() -> int: + args = _cli_args() + transcript_path = Path(args.transcript) + if not transcript_path.exists(): + raise ParseError(f"Transcript not found: {transcript_path}") + + text = transcript_path.read_text(encoding="utf-8", errors="replace") + parsed = parse_summary(text) + + output = json.dumps(parsed, indent=2, sort_keys=False) + if args.out: + out_path = Path(args.out) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(output + "\n", encoding="ascii") + else: + print(output) + return 0 + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except ParseError as exc: + print(f"ERROR: {exc}", file=sys.stderr) + raise SystemExit(2) diff --git a/target/verif/scripts/plot_sweep_results.py b/target/verif/scripts/plot_sweep_results.py new file mode 100644 index 0000000..cd24686 --- /dev/null +++ b/target/verif/scripts/plot_sweep_results.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python3 +"""Plot sweep metrics from parsed transcript JSON files.""" + +import argparse +import json +import math +import re +from pathlib import Path +from typing import Dict, List, Tuple + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.colors import ListedColormap +from matplotlib.lines import Line2D +from matplotlib.patches import Patch + +# Cycles of ideal workload runtime +IDEAL_WORKLOAD_DURATION = 3866.0 + +INTERCO_ORDER = {"LOG": 0, "MUX": 1, "HCI": 2} +INTERCO_COLORS = {"LOG": "#1f77b4", "MUX": "#9467bd", "HCI": "#ff7f0e"} +IDEAL_COLOR = "#7f7f7f" + + +def 
_to_int(value: object, default: int = 0) -> int: + try: + return int(value) + except Exception: + return default + + +def _to_float(value: object, default: float = float("nan")) -> float: + try: + return float(value) + except Exception: + return default + + +def _master_sort_key(master: str) -> Tuple[int, int]: + if master.startswith("master_log_"): + return (0, int(master.rsplit("_", 1)[1])) + if master.startswith("master_hwpe_"): + return (1, int(master.rsplit("_", 1)[1])) + return (9, 0) + + +def _parse_cfg_from_filename(path: Path) -> Tuple[str, int, int]: + match = re.match(r"^hardware_([a-zA-Z]+)_([0-9]+)hwpe_([0-9]+)fact\.json$", path.name) + if not match: + return ("UNK", 0, 0) + return (match.group(1).upper(), int(match.group(2)), int(match.group(3))) + + +def _derive_interco_side(hw_cfg: Dict[str, object]) -> Dict[str, int]: + masters = hw_cfg.get("masters", {}) if isinstance(hw_cfg, dict) else {} + memory = hw_cfg.get("memory", {}) if isinstance(hw_cfg, dict) else {} + interco_side = hw_cfg.get("interconnect_side", {}) if isinstance(hw_cfg, dict) else {} + + if isinstance(interco_side, dict) and "narrow_total_ports" in interco_side: + return { + "n_narrow_hci": _to_int(interco_side.get("n_narrow_hci")), + "n_wide_hci": _to_int(interco_side.get("n_wide_hci")), + "n_dma": _to_int(interco_side.get("n_dma")), + "n_ext": _to_int(interco_side.get("n_ext")), + "narrow_total_ports": _to_int(interco_side.get("narrow_total_ports")), + "total_initiator_ports": _to_int(interco_side.get("total_initiator_ports")), + } + + interco_type = str(interco_side.get("type", "UNK")).upper() + if interco_type == "UNK": + interco_type = str(hw_cfg.get("interco_type", "UNK")).upper() + + n_core = _to_int(masters.get("core")) + n_dma = _to_int(masters.get("dma")) + n_ext = _to_int(masters.get("ext")) + n_hwpe = _to_int(masters.get("hwpe")) + hwpe_width = _to_int(memory.get("hwpe_width_lanes"), 1) + + if interco_type == "LOG": + n_narrow_hci = n_core + n_hwpe * hwpe_width + 
n_wide_hci = 0 + elif interco_type == "MUX": + n_narrow_hci = n_core + n_wide_hci = 1 if n_hwpe > 0 else 0 + else: + n_narrow_hci = n_core + n_wide_hci = n_hwpe + + narrow_total = n_narrow_hci + n_dma + n_ext + return { + "n_narrow_hci": n_narrow_hci, + "n_wide_hci": n_wide_hci, + "n_dma": n_dma, + "n_ext": n_ext, + "narrow_total_ports": narrow_total, + "total_initiator_ports": narrow_total + n_wide_hci, + } + + +def _load_results(results_dir: Path) -> List[Dict[str, object]]: + entries: List[Dict[str, object]] = [] + for path in sorted(results_dir.glob("hardware_*.json")): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + continue + + hw_cfg = data.get("hw_config", {}) + masters = hw_cfg.get("masters", {}) if isinstance(hw_cfg, dict) else {} + memory = hw_cfg.get("memory", {}) if isinstance(hw_cfg, dict) else {} + bw = data.get("bandwidth", {}) + + interco_from_name, n_hwpe_name, wf_name = _parse_cfg_from_filename(path) + interco_type = str(hw_cfg.get("interconnect_side", {}).get("type", interco_from_name)).upper() + if interco_type not in INTERCO_ORDER: + interco_type = interco_from_name + + n_hwpe = _to_int(masters.get("hwpe"), n_hwpe_name) + hwpe_wf = _to_int(memory.get("hwpe_width_lanes"), wf_name) + cfg_label = f"{interco_type}_{n_hwpe}x{hwpe_wf}" + + interco_side = _derive_interco_side(hw_cfg if isinstance(hw_cfg, dict) else {}) + banks = _to_int(memory.get("banks")) + data_width = _to_int(memory.get("data_width_bits")) + ideal_mem = float(banks * data_width) + ideal_interco = float( + interco_side["narrow_total_ports"] * data_width + + interco_side["n_wide_hci"] * hwpe_wf * data_width + ) + ideal_bottleneck = min(ideal_mem, ideal_interco) + + actual_bw = _to_float(bw.get("actual_completion_bit_per_cycle")) + util_pct = (actual_bw / ideal_bottleneck * 100.0) if ideal_bottleneck > 0 and not math.isnan(actual_bw) else float("nan") + + entries.append( + { + "path": path, + "label": cfg_label, + "interco_type": interco_type, + 
"n_hwpe": n_hwpe, + "hwpe_width_fact": hwpe_wf, + "json": data, + "total_sim_cycles": _to_float(data.get("simulation_time", {}).get("total_cycles")), + "avg_req_to_gnt_per_master": data.get("request_to_grant_latency", {}).get("per_master", []), + "ideal_mem_bw": ideal_mem, + "ideal_interco_bw": ideal_interco, + "ideal_bottleneck_bw": ideal_bottleneck, + "actual_bw": actual_bw, + "utilization_pct": util_pct, + } + ) + + entries.sort(key=lambda e: (e["n_hwpe"], e["hwpe_width_fact"], INTERCO_ORDER.get(e["interco_type"], 9))) + return entries + + +def _plot_total_sim_time(entries: List[Dict[str, object]], out_path: Path) -> None: + labels = [e["label"] for e in entries] + values = [e["total_sim_cycles"] for e in entries] + colors = [INTERCO_COLORS.get(e["interco_type"], "#333333") for e in entries] + x = np.arange(len(entries), dtype=float) + + fig, ax = plt.subplots(figsize=(max(8, 1.2 * len(entries)), 5.6)) + bars = ax.bar(x, values, color=colors, width=0.68) + ax.set_title("Total simulation time vs ideal workload runtime") + ax.set_ylabel("cycles") + ax.set_xlabel("Configuration") + ax.set_xticks(x) + ax.set_xticklabels(labels, rotation=20, ha="right") + ax.set_axisbelow(True) + ax.grid(axis="y", alpha=0.25) + + for bar, val in zip(bars, values): + if math.isnan(val): + continue + mult_of_ideal = (val / IDEAL_WORKLOAD_DURATION) if val > 0 and IDEAL_WORKLOAD_DURATION > 0 else float("nan") + pct_txt = "n/a" if math.isnan(mult_of_ideal) else f"{mult_of_ideal:.2f}X of ideal runtime" + ax.text( + bar.get_x() + bar.get_width() / 2.0, + val, + f"{val:.0f}\n({pct_txt})", + ha="center", + va="bottom", + fontsize=8, + ) + + ax.axhline( + y=IDEAL_WORKLOAD_DURATION, + color="red", + linestyle="--", + linewidth=1.6, + label=f"Ideal workload runtime ({IDEAL_WORKLOAD_DURATION:.0f} cycles)", + ) + + legend = [Patch(facecolor=INTERCO_COLORS[k], label=k) for k in ("LOG", "MUX", "HCI")] + legend.append(Line2D([0], [0], color="red", linestyle="--", linewidth=1.6, label="Ideal workload 
runtime")) + ax.legend(handles=legend, loc="lower left") + ax.margins(y=0.24) + fig.tight_layout(rect=(0.0, 0.02, 1.0, 0.98)) + fig.savefig(out_path, dpi=150) + plt.close(fig) + + +def _plot_per_master_avg_req_to_gnt(entries: List[Dict[str, object]], out_path: Path) -> None: + labels = [e["label"] for e in entries] + masters = sorted( + { + row.get("master_name", "") + for e in entries + for row in e.get("avg_req_to_gnt_per_master", []) + if isinstance(row, dict) and row.get("master_name", "") + }, + key=_master_sort_key, + ) + + if not masters: + fig, ax = plt.subplots(figsize=(8, 3)) + ax.text(0.5, 0.5, "No per-master req->gnt data", ha="center", va="center") + ax.axis("off") + fig.tight_layout() + fig.savefig(out_path, dpi=150) + plt.close(fig) + return + + matrix = np.full((len(masters), len(entries)), np.nan, dtype=float) + master_idx = {m: i for i, m in enumerate(masters)} + for j, entry in enumerate(entries): + for row in entry.get("avg_req_to_gnt_per_master", []): + if not isinstance(row, dict): + continue + m = row.get("master_name", "") + if m not in master_idx: + continue + matrix[master_idx[m], j] = _to_float(row.get("avg_req_to_gnt_stall_latency_cycles")) + + cmap = ListedColormap(plt.cm.get_cmap("viridis")(np.linspace(0.0, 1.0, 256))) + cmap.set_bad(color="white") + + fig, ax = plt.subplots(figsize=(max(10, 1.2 * len(entries)), max(4, 0.35 * len(masters)))) + im = ax.imshow(np.ma.masked_invalid(matrix), aspect="auto", cmap=cmap, interpolation="nearest") + ax.set_title("Avg req->gnt stall latency per master") + ax.set_xlabel("Configuration") + ax.set_ylabel("Master") + ax.set_xticks(np.arange(len(entries))) + ax.set_xticklabels(labels, rotation=20, ha="right") + ax.set_yticks(np.arange(len(masters))) + ax.set_yticklabels(masters) + + vmax = np.nanmax(matrix) if np.any(~np.isnan(matrix)) else 0.0 + thresh = 0.6 * vmax if vmax > 0 else 0.0 + for r in range(matrix.shape[0]): + for c in range(matrix.shape[1]): + val = matrix[r, c] + if math.isnan(val): + 
continue + color = "white" if val < thresh else "black" + ax.text(c, r, f"{val:.2f}", ha="center", va="center", fontsize=6, color=color) + + fig.colorbar(im, ax=ax, label="cycles") + fig.tight_layout() + fig.savefig(out_path, dpi=150) + plt.close(fig) + + +def _plot_bandwidth(entries: List[Dict[str, object]], out_path: Path) -> None: + labels = [e["label"] for e in entries] + ideal_vals = [e["ideal_bottleneck_bw"] for e in entries] + actual_vals = [e["actual_bw"] for e in entries] + util_vals = [e["utilization_pct"] for e in entries] + sim_cycles_vals = [e["total_sim_cycles"] for e in entries] + ideal_app_vals = [] + for actual_bw, sim_cycles in zip(actual_vals, sim_cycles_vals): + if math.isnan(actual_bw) or math.isnan(sim_cycles) or IDEAL_WORKLOAD_DURATION <= 0.0: + ideal_app_vals.append(float("nan")) + else: + ideal_app_vals.append(actual_bw * sim_cycles / IDEAL_WORKLOAD_DURATION) + actual_colors = [INTERCO_COLORS.get(e["interco_type"], "#333333") for e in entries] + x = np.arange(len(entries), dtype=float) + width = 0.34 + + fig, ax = plt.subplots(figsize=(max(9, 1.25 * len(entries)), 5.0)) + ideal_bars = ax.bar( + x - width / 2.0, + ideal_vals, + width=width, + color=IDEAL_COLOR, + label="Max interco bandwidth", + ) + actual_bars = ax.bar(x + width / 2.0, actual_vals, width=width, color=actual_colors, label="Actual BW (completion)") + ax.set_title("Bandwidth: interconnect-side ideal vs actual") + ax.set_ylabel("bit/cycle") + ax.set_xlabel("Configuration") + ax.set_xticks(x) + ax.set_xticklabels(labels, rotation=20, ha="right") + ax.set_axisbelow(True) + ax.grid(axis="y", alpha=0.25) + + for bar, val in zip(ideal_bars, ideal_vals): + ax.text( + bar.get_x() + bar.get_width() / 2.0, + val, + f"{val:.0f}", + ha="center", + va="bottom", + fontsize=8, + zorder=8, + ) + for bar, val, util in zip(actual_bars, actual_vals, util_vals): + if math.isnan(val): + continue + util_txt = "n/a" if math.isnan(util) else f"{util:.1f}% interco util" + ax.text( + bar.get_x() + 
bar.get_width() / 2.0, + val, + f"{val:.1f}\n({util_txt})", + ha="center", + va="bottom", + fontsize=8, + zorder=8, + ) + + # Ideal app BW computed from moved data and ideal application duration: + # ideal_app_bw = effective_bw * total_real_sim_time / IDEAL_WORKLOAD_DURATION + valid_ideal_app_vals = [v for v in ideal_app_vals if not math.isnan(v)] + if valid_ideal_app_vals: + ideal_workload_bw = sum(valid_ideal_app_vals) / len(valid_ideal_app_vals) + ax.axhline( + y=ideal_workload_bw, + color="red", + linestyle="--", + linewidth=1.8, + label="Ideal workload bandwidth", + zorder=4, + ) + ax.text( + x[-1] + 0.35, + ideal_workload_bw, + f"{ideal_workload_bw:.1f}", + color="red", + fontsize=9, + ha="right", + va="bottom", + zorder=8, + ) + + interco_legend = [Patch(facecolor=INTERCO_COLORS[k], label=f"Actual {k}") for k in ("LOG", "MUX", "HCI")] + base_legend = [Patch(facecolor=IDEAL_COLOR, label="Max interco bandwidth")] + extra_legend = [Line2D([0], [0], color="red", linestyle="--", linewidth=1.8, label="Ideal workload bandwidth")] + ax.legend(handles=base_legend + interco_legend + extra_legend, loc="best") + fig.tight_layout() + fig.savefig(out_path, dpi=150) + plt.close(fig) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Plot sweep results from parsed transcript JSON files.") + parser.add_argument( + "--results-dir", + default="target/verif/results", + help="Directory containing parsed sweep JSON files (hardware_*.json).", + ) + parser.add_argument( + "--out-dir", + default="target/verif/results/plots", + help="Output directory for generated plots.", + ) + args = parser.parse_args() + + results_dir = Path(args.results_dir) + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + entries = _load_results(results_dir) + if not entries: + raise SystemExit(f"No sweep JSON files found in: {results_dir}") + + _plot_total_sim_time(entries, out_dir / "total_simulation_time.png") + _plot_per_master_avg_req_to_gnt(entries, 
out_dir / "avg_req_to_gnt_per_master.png") + _plot_bandwidth(entries, out_dir / "bandwidth_ideal_vs_actual.png") + + print(f"Plots written to: {out_dir}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/target/verif/scripts/run_sweep.sh b/target/verif/scripts/run_sweep.sh new file mode 100644 index 0000000..17f7404 --- /dev/null +++ b/target/verif/scripts/run_sweep.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +set -e + +# Make sure we are in hci root +if ! git_root=$(git rev-parse --show-toplevel 2>/dev/null) || [ "$(basename "$git_root")" != "hci" ]; then + echo "Error: This script must be run from within the hci project git repository." >&2 + exit 1 +fi +cd "$git_root" + +VERIF_DIR=./target/verif +RESULTS_DIR=$VERIF_DIR/results +PLOTS_DIR=$RESULTS_DIR/plots + +mkdir -p "$RESULTS_DIR" +rm -f "$RESULTS_DIR"/hardware_*.json + +# For each hardware*.json in target/verif/config/sweep_hardware, run simulation and parse transcript +for hardware_config in $VERIF_DIR/config/sweep_hardware/hardware_*.json; do + make clean-verif + echo "Running simulation with hardware config: $hardware_config" + HARDWARE_JSON="$hardware_config" GUI=0 make run-verif + python3 $VERIF_DIR/scripts/parse_vsim.py --transcript $VERIF_DIR/vsim/transcript --out "$RESULTS_DIR"/$(basename "$hardware_config" .json).json +done + +python3 $VERIF_DIR/scripts/plot_sweep_results.py --results-dir "$RESULTS_DIR" --out-dir "$PLOTS_DIR" diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index 78be03d..7f27050 100644 --- a/target/verif/src/simulation_report.sv +++ b/target/verif/src/simulation_report.sv @@ -63,11 +63,14 @@ module simulation_report logic missing_reads; // Ideal bandwidth: maximum data the memory system can serve per cycle. // Memory side: N_BANKS narrow ports, each DATA_WIDTH bits wide. - real ideal_bw_mem_side_bpc; // bits per cycle (memory-side ceiling) - // Master side: sum of all master bandwidths at 100% traffic. 
- real ideal_bw_master_side_bpc; // bits per cycle (master-side ceiling) - real ideal_bw_bpc; // min(mem, master) = bottleneck ideal BW + real ideal_bw_mem_side_bpc; // bits per cycle (memory-side ceiling) + // Interconnect side: HCI-facing narrow/wide initiator interfaces. + real ideal_bw_interco_side_bpc; // bits per cycle (interconnect-side ceiling) + real ideal_bw_bpc; // min(mem, interco) = bottleneck ideal BW real actual_bw_utilization; // throughput_complete / ideal_bw + int unsigned n_narrow_if_total; + int unsigned n_wide_if_total; + string interco_type_str; sum_req_to_gnt_latency_all = 0.0; average_req_to_gnt_latency_weighted = 0.0; @@ -193,15 +196,27 @@ module simulation_report // Ideal bandwidth computation. // Memory side: each of the N_BANKS banks can serve one DATA_WIDTH word per cycle. ideal_bw_mem_side_bpc = real'(N_BANKS) * real'(DATA_WIDTH); - // Master side: N_LOG_MASTERS narrow ports + N_HWPE wide ports, all at 100% traffic. - ideal_bw_master_side_bpc = real'(N_LOG_MASTERS) * real'(DATA_WIDTH) - + real'(N_HWPE) * real'(HWPE_WIDTH_FACT * DATA_WIDTH); + // Interconnect side: HCI interface ports (narrow + wide). + // NOTE: for MUX mode N_WIDE_HCI=1, i.e. one shared wide initiator. + n_narrow_if_total = N_NARROW_HCI + N_DMA + N_EXT; + n_wide_if_total = N_WIDE_HCI; + ideal_bw_interco_side_bpc = real'(n_narrow_if_total) * real'(DATA_WIDTH) + + real'(n_wide_if_total) * real'(HWPE_WIDTH_FACT * DATA_WIDTH); // Bottleneck = minimum of the two sides. - ideal_bw_bpc = (ideal_bw_mem_side_bpc < ideal_bw_master_side_bpc) - ? ideal_bw_mem_side_bpc : ideal_bw_master_side_bpc; + ideal_bw_bpc = (ideal_bw_mem_side_bpc < ideal_bw_interco_side_bpc) + ? ideal_bw_mem_side_bpc : ideal_bw_interco_side_bpc; // Utilization = actual / ideal. actual_bw_utilization = (ideal_bw_bpc > 0.0) ? 
(throughput_complete_i / ideal_bw_bpc * 100.0) : 0.0; + if (INTERCO_TYPE == LOG) begin + interco_type_str = "LOG"; + end else if (INTERCO_TYPE == HCI) begin + interco_type_str = "HCI"; + end else if (INTERCO_TYPE == MUX) begin + interco_type_str = "MUX"; + end else begin + interco_type_str = "UNKNOWN"; + end $display("------ Simulation Summary ------"); $display("\\\\HW CONFIG\\\\"); @@ -217,6 +232,10 @@ module simulation_report "Interconnect: SEL_LIC=%0d TS_BIT=%0d EXPFIFO=%0d", SEL_LIC, TS_BIT, EXPFIFO ); + $display( + "Interconnect-side: TYPE=%s N_NARROW_HCI=%0d N_WIDE_HCI=%0d N_DMA=%0d N_EXT=%0d", + interco_type_str, N_NARROW_HCI, N_WIDE_HCI, N_DMA, N_EXT + ); $display( "ID/address: IW=%0d ADDR_WIDTH=%0d ADDR_WIDTH_BANK=%0d", IW, ADDR_WIDTH, ADDR_WIDTH_BANK @@ -228,10 +247,10 @@ module simulation_report ideal_bw_mem_side_bpc, N_BANKS, DATA_WIDTH ); $display( - "Ideal BW (master side): %0.0f bit/cycle [%0d log x %0d bits + %0d hwpe x %0d bits]", - ideal_bw_master_side_bpc, - N_LOG_MASTERS, DATA_WIDTH, - N_HWPE, HWPE_WIDTH_FACT * DATA_WIDTH + "Ideal BW (interco side): %0.0f bit/cycle [%0d narrow-if x %0d bits + %0d wide-if x %0d bits]", + ideal_bw_interco_side_bpc, + n_narrow_if_total, DATA_WIDTH, + n_wide_if_total, HWPE_WIDTH_FACT * DATA_WIDTH ); $display( "Ideal BW (bottleneck): %0.0f bit/cycle", diff --git a/target/verif/verif.mk b/target/verif/verif.mk index 93d8282..e9f93e7 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -77,10 +77,10 @@ VERIF_CFG_MK := $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk \ config-verif: $(VERIF_CFG_MK) $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk: $(HARDWARE_JSON) $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) - $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py hardware $(dir $(HARDWARE_JSON)) $(HCI_VERIF_CFG_GEN_DIR) > $@ + $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py hardware $(HARDWARE_JSON) $(HCI_VERIF_CFG_GEN_DIR) > $@ $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk: 
$(TESTBENCH_JSON) $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) - $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py testbench $(dir $(TESTBENCH_JSON)) $(HCI_VERIF_CFG_GEN_DIR) > $@ + $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py testbench $(TESTBENCH_JSON) $(HCI_VERIF_CFG_GEN_DIR) > $@ $(HCI_VERIF_CFG_GEN_DIR): mkdir -p $@ @@ -157,6 +157,13 @@ clean-sim-verif: rm -f $(HCI_VERIF_DIR)/vsim/transcript rm -f $(HCI_VERIF_DIR)/vsim/vsim.wlf +################ +# Benchmarking # +################ + +benchmarking-sweep: + . $(HCI_VERIF_DIR)/scripts/run_sweep.sh + ########### # Helpers # ########### From f38178eab059d307933bb4a94da59062f8956f47 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Wed, 11 Mar 2026 16:21:57 +0100 Subject: [PATCH 22/25] verif/simvectors: Extract html and memory reporting --- .../verif/simvectors/hci_stimuli/__init__.py | 3 +- .../verif/simvectors/hci_stimuli/processor.py | 12 - target/verif/simvectors/html_report.py | 394 ++++++++++++++ target/verif/simvectors/main.py | 508 ++---------------- target/verif/simvectors/memory_report.py | 119 ++++ 5 files changed, 571 insertions(+), 465 deletions(-) delete mode 100644 target/verif/simvectors/hci_stimuli/processor.py create mode 100644 target/verif/simvectors/html_report.py create mode 100644 target/verif/simvectors/memory_report.py diff --git a/target/verif/simvectors/hci_stimuli/__init__.py b/target/verif/simvectors/hci_stimuli/__init__.py index 6381011..64b9b07 100644 --- a/target/verif/simvectors/hci_stimuli/__init__.py +++ b/target/verif/simvectors/hci_stimuli/__init__.py @@ -6,6 +6,5 @@ """ from .generator import StimuliGenerator -from .processor import pad_txt_files -__all__ = ['StimuliGenerator', 'pad_txt_files'] +__all__ = ['StimuliGenerator'] diff --git a/target/verif/simvectors/hci_stimuli/processor.py b/target/verif/simvectors/hci_stimuli/processor.py deleted file mode 100644 index fa8b51b..0000000 --- 
a/target/verif/simvectors/hci_stimuli/processor.py +++ /dev/null @@ -1,12 +0,0 @@ -"""Processor helpers: pad stimuli files to equal length.""" - - -def pad_txt_files(folder_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT): - """No-op: padding stimuli files to a common length is not needed. - - Each application_driver self-terminates when its transaction queue is - exhausted, and later-phase drivers are held in reset via clear_i until - their dependencies assert end_req_o. Cross-file length alignment would - only add unnecessary idle cycles at the end of early-phase files. - """ - pass diff --git a/target/verif/simvectors/html_report.py b/target/verif/simvectors/html_report.py new file mode 100644 index 0000000..98a2b1a --- /dev/null +++ b/target/verif/simvectors/html_report.py @@ -0,0 +1,394 @@ +"""HTML report generation for memory lifetime visualization.""" + +from pathlib import Path +import html +import math + + +def build_memory_lifetime_html( + *, + pattern_nodes, + driver_windows, + regions_timeline, + total_cycles, + mux_serialization_applied, + mux_phase_order, + schedule_has_cycle, + driver_name_fn, + interco_type, + n_core_cfg, + n_dma_cfg, + n_ext_cfg, + n_hwpe_cfg, + dw_narrow, + dw_wide, + n_narrow_hci_cfg, + n_wide_hci_cfg, + n_banks, +): + palette = [ + "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#17becf", + "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#9467bd", + "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", + "#e6ab02", "#a6761d", "#666666", + ] + + def _color_for_driver(drv_idx): + return palette[drv_idx % len(palette)] + + def _tick_step(total): + if total <= 10: + return 1 + raw = max(1, total // 10) + mag = 10 ** int(math.log10(raw)) + return int(math.ceil(raw / mag) * mag) + + total_for_plot = max(1, total_cycles) + chart_width = 1280 + x_left = 220 + x_right = 24 + plot_w = chart_width - x_left - x_right + row_h = 80 + y_top = 36 + exec_rows = [driver_windows[d] for d in sorted(driver_windows.keys())] + exec_h = y_top + row_h * 
len(exec_rows) + 72 + tick = _tick_step(total_for_plot) + ticks = list(range(0, total_for_plot + 1, tick)) + if ticks[-1] != total_for_plot: + ticks.append(total_for_plot) + + exec_svg = [] + exec_svg.append(f'') + exec_svg.append('') + exec_svg.append(f'Execution Timeline (transaction-count model)') + for t in ticks: + x = x_left + (t / total_for_plot) * plot_w + exec_svg.append(f'') + exec_svg.append(f'{t}') + exec_svg.append( + f'Transaction number' + ) + exec_svg.append( + f'' + 'Issued memory transactions (r/w) and computation cycles (i.e., req = 0) are both modeled here.' + '' + ) + + def _rw_mix_text(node): + rpct = node.get('traffic_read_pct') + if rpct is None: + read_bytes = 0 + write_bytes = 0 + for reg in node.get('regions', []): + label_l = str(reg.get('label', '')).lower() + size_b = max(0, int(reg.get('size', 0))) + if 'read' in label_l and 'write' not in label_l: + read_bytes += size_b + elif 'write' in label_l and 'read' not in label_l: + write_bytes += size_b + if (read_bytes + write_bytes) > 0: + rpct = int(round(100.0 * read_bytes / (read_bytes + write_bytes))) + else: + rpct = 50 + rpct = max(0, min(100, int(rpct))) + if rpct == 0: + return "100% writes" + if rpct == 100: + return "100% reads" + return f"{rpct}% reads / {100 - rpct}% writes" + + def _fmt_kib(bytes_v): + return f"{(max(0, int(bytes_v)) / 1024.0):.1f} KiB" + + def _outside_detail_lines(node, base_addr, size_kib): + regs = list(node.get('regions', [])) + label_map = {str(r.get('label', '')): r for r in regs} + has_matmul_regs = any(k in label_map for k in ('A(read)', 'B(read)', 'C(write)')) + is_matmul = node.get('mem_access_type') in {'matmul', 'matmul_phased'} or has_matmul_regs + + if is_matmul: + mat_lines = [] + for lbl, short in [('A(read)', 'A'), ('B(read)', 'B'), ('C(write)', 'C')]: + if lbl in label_map: + r = label_map[lbl] + mat_lines.append(f"{short}@0x{int(r['base']):08x} ({_fmt_kib(r['size'])})") + if not mat_lines: + return [f"@0x{base_addr:08x}", f"{size_kib:.1f} 
KiB"] + return mat_lines + + if regs: + r0 = regs[0] + return [f"@0x{int(r0['base']):08x}", f"{_fmt_kib(r0['size'])}"] + return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] + + for row_idx, row in enumerate(exec_rows): + y = y_top + row_idx * row_h + name = html.escape(row['name']) + label_color = "#111111" if row['is_hwpe'] else "#555555" + exec_svg.append(f'{name}') + exec_svg.append(f'') + nodes = [n for n in pattern_nodes if n['driver_idx'] == row['driver_idx']] + for node in nodes: + if node['end_cycle'] <= node['start_cycle']: + continue + x = x_left + (node['start_cycle'] / total_for_plot) * plot_w + w = max(1.0, ((node['end_cycle'] - node['start_cycle']) / total_for_plot) * plot_w) + color = _color_for_driver(node['driver_idx']) + base_addr = min((int(r.get('base', 0)) for r in node.get('regions', [])), default=0) + size_kib = (int(node['n_transactions']) * int(node.get('txn_bytes', 1))) / 1024.0 + rw_mix = _rw_mix_text(node) + line1_full = f"{node['job']}" + line2_full = f"{node['mem_access_type']}" + line3_full = rw_mix + outside_lines = _outside_detail_lines(node, base_addr, size_kib) + title = ( + f"{node['driver_name']} p{node['pattern_idx']} job={node['job']} " + f"[{node['start_cycle']}, {node['end_cycle']}) " + f"{node['mem_access_type']} n={node['n_transactions']} " + f"base=0x{base_addr:08x} {size_kib:.1f}KiB {rw_mix} " + f"{' | '.join(outside_lines)}" + ) + exec_svg.append('') + exec_svg.append(f'{html.escape(title)}') + exec_svg.append( + f'' + ) + line1 = html.escape(line1_full) + exec_svg.append( + f'{line1}' + ) + line2 = html.escape(line2_full) + exec_svg.append( + f'{line2}' + ) + line3 = html.escape(line3_full) + exec_svg.append( + f'{line3}' + ) + for ext_idx, ext in enumerate(outside_lines): + y_ext = y + 49 + (ext_idx * 9) + ext_txt = html.escape(ext) + exec_svg.append( + f'{ext_txt}' + ) + exec_svg.append('') + exec_svg.append('') + + region_rows = [regions_timeline[k] for k in sorted(regions_timeline.keys(), key=lambda k: (k[0], 
k[1], k[2]))] + overlap_rows = [] + overlaps_by_region = {i: [] for i in range(len(region_rows))} + for i in range(len(region_rows)): + a = region_rows[i] + for j in range(i + 1, len(region_rows)): + b = region_rows[j] + ov_base = max(a['base'], b['base']) + ov_end = min(a['end'], b['end']) + if ov_base <= ov_end: + ov_size = ov_end - ov_base + 1 + overlap_rows.append({ + 'a_idx': i, + 'b_idx': j, + 'ov_base': ov_base, + 'ov_end': ov_end, + 'ov_size': ov_size, + }) + overlaps_by_region[i].append((j, ov_base, ov_end, ov_size)) + overlaps_by_region[j].append((i, ov_base, ov_end, ov_size)) + + used_min = min((reg['base'] for reg in region_rows), default=0) + used_max = max((reg['end'] for reg in region_rows), default=0) + if used_max < used_min: + used_max = used_min + used_span = max(1, used_max - used_min + 1) + + map_left = 170 + map_right = 24 + map_plot_w = chart_width - map_left - map_right + map_h = 124 + bar_y = 56 + bar_h = 24 + region_map_svg = [] + region_map_svg.append(f'') + region_map_svg.append('') + region_map_svg.append( + f'' + f"Memory Region Blocks (used range 0x{used_min:08x} - 0x{used_max:08x})" + f"" + ) + region_map_svg.append( + f'' + ) + for pct in [0, 25, 50, 75, 100]: + x = map_left + (pct / 100.0) * map_plot_w + addr = used_min + int(((used_span - 1) * pct) / 100.0) + region_map_svg.append(f'') + region_map_svg.append( + f'0x{addr:08x}' + ) + for reg in region_rows: + x = map_left + ((reg['base'] - used_min) / used_span) * map_plot_w + w = max(1.0, (reg['size'] / used_span) * map_plot_w) + color = _color_for_driver(reg['accesses'][0]['driver_idx']) if reg['accesses'] else "#888888" + title = ( + f"{reg['label']} 0x{reg['base']:08x}-0x{reg['end']:08x} " + f"size={reg['size']}B accesses={len(reg['accesses'])}" + ) + region_map_svg.append( + f'' + f'{html.escape(title)}' + ) + if w >= 92: + region_map_svg.append( + f'{html.escape(reg["label"])}' + ) + region_map_svg.append('') + + legend_items = [] + for d in sorted(driver_windows.keys()): + 
n = driver_name_fn(d) + c = _color_for_driver(d) + legend_items.append( + f'' + f'' + f'{html.escape(n)}' + ) + + region_cards = [] + for idx, reg in enumerate(region_rows): + access_rows = [] + accesses = sorted(reg['accesses'], key=lambda a: (a['driver_idx'], a['pattern_idx'], a['start'])) + for acc in accesses: + desc = acc['description'] if acc['description'] else "-" + access_rows.append( + "" + f"{html.escape(acc['driver_name'])}" + f"p{acc['pattern_idx']}" + f"{html.escape(acc['job'])}" + f"{html.escape(desc)}" + f"[{acc['start']}, {acc['end']})" + "" + ) + overlap_refs = overlaps_by_region.get(idx, []) + if overlap_refs: + ov_txt = ", ".join( + [ + f"{html.escape(region_rows[j]['label'])} " + f"(0x{ovb:08x}-0x{ove:08x}, {ovs} B)" + for (j, ovb, ove, ovs) in overlap_refs + ] + ) + else: + ov_txt = "none" + region_cards.append( + "
" + f"
{html.escape(reg['label'])} | base=0x{reg['base']:08x} | end=0x{reg['end']:08x} | size={reg['size']} B
" + f"
Overlaps: {ov_txt}
" + "" + "" + "" + f"{''.join(access_rows)}" + "
Driver/HWPEPatternJobDescriptionModeled interval
" + "
" + ) + + overlap_table_rows = [] + for ov in overlap_rows: + a = region_rows[ov['a_idx']] + b = region_rows[ov['b_idx']] + overlap_table_rows.append( + "" + f"{html.escape(a['label'])} (0x{a['base']:08x}-0x{a['end']:08x})" + f"{html.escape(b['label'])} (0x{b['base']:08x}-0x{b['end']:08x})" + f"0x{ov['ov_base']:08x}" + f"0x{ov['ov_end']:08x}" + f"{ov['ov_size']}" + "" + ) + + note = "Timeline follows declared wait_for_jobs dependencies from workload.json." + note_2 = ( + "Time axis is transaction-count based only " + "(no interconnect conflict/stall/arbitration modeling)." + ) + note_2b = "Per-driver list order is still enforced by stimulus/fence sequencing." + note_3 = "" + if mux_serialization_applied: + note_3 = ( + "MUX mode: HWPE execution is serialized by job order, " + "with lower HWPE ID first inside each job (tb_hci-like)." + ) + if mux_phase_order: + note_3 += f" Job order: {', '.join(mux_phase_order)}." + note_3_html = f"
{html.escape(note_3)}
" if note_3 else "" + cycle_warning_html = ( + "

Warning: dependency cycle detected; fallback scheduling order used.

" + if schedule_has_cycle else "" + ) + overlap_html = ( + "" + f"{''.join(overlap_table_rows)}" + "
Region ARegion BOverlap BaseOverlap EndOverlap Size (B)
" + if overlap_table_rows else + "
No overlaps detected among used regions.
" + ) + + return ( + "" + "Memory Access Region View (Transaction-Count Model)" + "" + "

Memory Access Region View

" + f"
Drivers ({dw_narrow} bit): " + f"CORE={n_core_cfg}, DMA={n_dma_cfg}, EXT={n_ext_cfg}
" + f"
Drivers ({dw_wide} bit): HWPE={n_hwpe_cfg}
" + f"
Interconnect type: {html.escape(interco_type)} | " + f"Narrow master ports ({dw_narrow} bit): {n_narrow_hci_cfg} | " + f"Wide master ports ({dw_wide} bit): {n_wide_hci_cfg} | " + f"Slave ports (banks): {n_banks} | " + f"Total modeled time: {total_cycles} units
" + f"
{html.escape(note)}
" + f"
{html.escape(note_2)}
" + f"
{html.escape(note_2b)}
" + f"{note_3_html}" + f"{cycle_warning_html}" + "

Legend

" + f"{''.join(legend_items)}
" + "
" + f"{''.join(exec_svg)}" + "
" + "
" + f"{''.join(region_map_svg)}" + "
" + "

Region Usage Blocks

" + f"{''.join(region_cards)}" + "

Overlapping Regions

" + f"{overlap_html}" + "" + ) + + +def write_memory_lifetime_html(memory_lifetime_path: Path, **kwargs): + html_doc = build_memory_lifetime_html(**kwargs) + memory_lifetime_path.write_text(html_doc, encoding='utf-8') diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index 0cd3c12..3ff2f30 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -11,17 +11,20 @@ import json import math import sys -import html from pathlib import Path import argparse code_directory = Path(__file__).resolve().parent try: - from hci_stimuli import StimuliGenerator, pad_txt_files + from hci_stimuli import StimuliGenerator + from memory_report import write_memory_map_txt + from html_report import write_memory_lifetime_html except Exception: sys.path.insert(0, str(code_directory)) - from hci_stimuli import StimuliGenerator, pad_txt_files + from hci_stimuli import StimuliGenerator + from memory_report import write_memory_map_txt + from html_report import write_memory_lifetime_html def parse_args(argv=None): @@ -1128,457 +1131,64 @@ def _add_edge(src, dst): # ----------------------------------------------------------------------- # Build memory_map.txt # ----------------------------------------------------------------------- - word_bytes = DATA_WIDTH // 8 - bank_stride_bytes = N_BANKS * word_bytes - lines = [] - lines.append("=" * 72) - lines.append("MEMORY MAP REPORT") - lines.append(f" Total memory : {TOT_MEM_SIZE} KiB ({TOT_MEM_SIZE * 1024} B)") - lines.append(f" Banks : {N_BANKS} x {word_bytes} B/word (interleaved, stride {bank_stride_bytes} B)") - lines.append(f" Data width : {DATA_WIDTH} b LOG / {HWPE_WIDTH_FACT * DATA_WIDTH} b HWPE") - lines.append( - f" Drivers({DW_NARROW} bit) : " - f"CORE={N_CORE_CFG}, DMA={N_DMA_CFG}, EXT={N_EXT_CFG} (LOG total={N_LOG_CFG})" - ) - lines.append( - f" Drivers({DW_WIDE} bit) : " - f"HWPE={N_HWPE_CFG}" - ) - lines.append( - f" Interconnect type : {INTERCO_TYPE} | " - f"Narrow master ports 
({DW_NARROW} bit)={N_NARROW_HCI_CFG} | " - f"Wide master ports ({DW_WIDE} bit)={N_WIDE_HCI_CFG} | " - f"Slave ports (banks)={N_BANKS}" + memory_map_path = generated_dir / 'memory_map.txt' + report_text = write_memory_map_txt( + memory_map_path=memory_map_path, + total_mem_size_kib=TOT_MEM_SIZE, + n_banks=N_BANKS, + data_width=DATA_WIDTH, + hwpe_data_width=HWPE_WIDTH_FACT * DATA_WIDTH, + n_core_cfg=N_CORE_CFG, + n_dma_cfg=N_DMA_CFG, + n_ext_cfg=N_EXT_CFG, + n_log_cfg=N_LOG_CFG, + n_hwpe_cfg=N_HWPE_CFG, + interco_type=INTERCO_TYPE, + dw_narrow=DW_NARROW, + dw_wide=DW_WIDE, + n_narrow_hci_cfg=N_NARROW_HCI_CFG, + n_wide_hci_cfg=N_WIDE_HCI_CFG, + memory_map_entries=memory_map_entries, + job_to_drivers=job_to_drivers, + driver_name_fn=_driver_name, + n_drivers=N_DRIVERS, + fence_masks=fence_masks, + total_cycles=total_cycles, + mux_serialization_applied=mux_serialization_applied, + mux_phase_order=mux_phase_order, + schedule_has_cycle=schedule_has_cycle, + driver_windows=driver_windows, + pattern_nodes=pattern_nodes, + regions_timeline=regions_timeline, ) - lines.append("=" * 72) - for entry in memory_map_entries: - lines.append(f"\n [{entry['label']}] pattern={entry['pattern']} n_transactions={entry['n']}") - if 'info' in entry: - lines.append(f" {entry['info']}") - for k, v in entry.get('detail', {}).items(): - lines.append(f" {k:<14}: {v}") - lines.append("") - lines.append(" Job / dependency map:") - for job, drivers in sorted(job_to_drivers.items()): - driver_names = [_driver_name(d) for d in drivers] - lines.append(f" job '{job}': {', '.join(driver_names)}") - for i in range(N_DRIVERS): - name = _driver_name(i) - for f, mask in enumerate(fence_masks[i]): - if mask: - deps = [_driver_name(j) for j in range(N_DRIVERS) if mask & (1 << j)] - lines.append(f" {name} after pattern[{f}] (fence {f}) waits for: {', '.join(deps)}") - lines.append("") - lines.append(" Temporal schedule (transaction-count model):") - lines.append(f" Total modeled time: {total_cycles} units (1 
unit = 1 transaction)") - lines.append(" Note: Declared wait_for_jobs dependencies are used for scheduling.") - lines.append(" Note: Per-driver list order is also enforced (pattern p[i] -> p[i+1]).") - lines.append(" Note: No interconnect contention/stall timing is modeled.") - if mux_serialization_applied: - lines.append(" Note: MUX mode serializes HWPE execution by job order, then HWPE ID (tb_hci-like).") - lines.append(f" MUX job order: {', '.join(mux_phase_order)}") - if schedule_has_cycle: - lines.append(" WARNING: dependency cycle detected while scheduling; using fallback order.") - for d in range(N_DRIVERS): - if d not in driver_windows: - continue - w = driver_windows[d] - lines.append(f" {w['name']:<8}: [{w['start']:>6}, {w['end']:>6}) dur={w['end'] - w['start']:>6}") - for node in [n for n in pattern_nodes if n['driver_idx'] == d]: - reg_tokens = [] - for reg in node['regions']: - reg_tokens.append(f"{reg['label']}@0x{reg['base']:08x}+{reg['size']}B") - reg_text = ", ".join(reg_tokens) if reg_tokens else "no regions" - lines.append( - f" p{node['pattern_idx']} job={node['job']} " - f"[{node['start_cycle']},{node['end_cycle']}) " - f"type={node['mem_access_type']} n={node['n_transactions']} {reg_text}" - ) - lines.append("") - lines.append(" Memory region lifetimes:") - for key in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2])): - reg = regions_timeline[key] - users = sorted({a['driver_name'] for a in reg['accesses']}) - lines.append( - f" {reg['label']:<8} 0x{reg['base']:08x}-0x{reg['end']:08x} " - f"({reg['size']:>6} B) lifetime=[{reg['lifetime_start']},{reg['lifetime_end']}) " - f"users={', '.join(users)}" - ) - lines.append("=" * 72) - - report_text = "\n".join(lines) + "\n" print("\n" + report_text) - memory_map_path = generated_dir / 'memory_map.txt' - memory_map_path.write_text(report_text, encoding='utf-8') print(f"Memory map written: {memory_map_path}") # ----------------------------------------------------------------------- # 
Build memory_lifetime.html (simple SVG timeline view) # ----------------------------------------------------------------------- - palette = [ - "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#17becf", - "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#9467bd", - "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", - "#e6ab02", "#a6761d", "#666666", - ] - - def _color_for_driver(drv_idx): - return palette[drv_idx % len(palette)] - - def _tick_step(total): - if total <= 10: - return 1 - raw = max(1, total // 10) - mag = 10 ** int(math.log10(raw)) - return int(math.ceil(raw / mag) * mag) - - total_for_plot = max(1, total_cycles) - chart_width = 1280 - x_left = 220 - x_right = 24 - plot_w = chart_width - x_left - x_right - row_h = 80 - y_top = 36 - exec_rows = [driver_windows[d] for d in sorted(driver_windows.keys())] - exec_h = y_top + row_h * len(exec_rows) + 72 - tick = _tick_step(total_for_plot) - ticks = list(range(0, total_for_plot + 1, tick)) - if ticks[-1] != total_for_plot: - ticks.append(total_for_plot) - - exec_svg = [] - exec_svg.append(f'') - exec_svg.append('') - exec_svg.append(f'Execution Timeline (transaction-count model)') - for t in ticks: - x = x_left + (t / total_for_plot) * plot_w - exec_svg.append(f'') - exec_svg.append(f'{t}') - exec_svg.append( - f'Transaction number' - ) - exec_svg.append( - f'' - 'Issued memory transactions (r/w) and computation cycles (i.e., req = 0) are both modeled here.' 
- '' - ) - - def _rw_mix_text(node): - rpct = node.get('traffic_read_pct') - if rpct is None: - read_bytes = 0 - write_bytes = 0 - for reg in node.get('regions', []): - label_l = str(reg.get('label', '')).lower() - size_b = max(0, int(reg.get('size', 0))) - if 'read' in label_l and 'write' not in label_l: - read_bytes += size_b - elif 'write' in label_l and 'read' not in label_l: - write_bytes += size_b - if (read_bytes + write_bytes) > 0: - rpct = int(round(100.0 * read_bytes / (read_bytes + write_bytes))) - else: - rpct = 50 - rpct = max(0, min(100, int(rpct))) - if rpct == 0: - return "100% writes" - if rpct == 100: - return "100% reads" - return f"{rpct}% reads / {100 - rpct}% writes" - - def _fmt_kib(bytes_v): - return f"{(max(0, int(bytes_v)) / 1024.0):.1f} KiB" - - def _outside_detail_lines(node, base_addr, size_kib): - regs = list(node.get('regions', [])) - label_map = {str(r.get('label', '')): r for r in regs} - has_matmul_regs = any(k in label_map for k in ('A(read)', 'B(read)', 'C(write)')) - is_matmul = node.get('mem_access_type') in {'matmul', 'matmul_phased'} or has_matmul_regs - - if is_matmul: - mat_lines = [] - for lbl, short in [('A(read)', 'A'), ('B(read)', 'B'), ('C(write)', 'C')]: - if lbl in label_map: - r = label_map[lbl] - mat_lines.append(f"{short}@0x{int(r['base']):08x} ({_fmt_kib(r['size'])})") - if not mat_lines: - return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] - return mat_lines - - if regs: - r0 = regs[0] - return [f"@0x{int(r0['base']):08x}", f"{_fmt_kib(r0['size'])}"] - return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] - - for row_idx, row in enumerate(exec_rows): - y = y_top + row_idx * row_h - name = html.escape(row['name']) - label_color = "#111111" if row['is_hwpe'] else "#555555" - exec_svg.append(f'{name}') - exec_svg.append(f'') - nodes = [n for n in pattern_nodes if n['driver_idx'] == row['driver_idx']] - for node in nodes: - if node['end_cycle'] <= node['start_cycle']: - continue - x = x_left + 
(node['start_cycle'] / total_for_plot) * plot_w - w = max(1.0, ((node['end_cycle'] - node['start_cycle']) / total_for_plot) * plot_w) - color = _color_for_driver(node['driver_idx']) - base_addr = min((int(r.get('base', 0)) for r in node.get('regions', [])), default=0) - size_kib = (int(node['n_transactions']) * int(node.get('txn_bytes', 1))) / 1024.0 - rw_mix = _rw_mix_text(node) - line1_full = f"{node['job']}" - line2_full = f"{node['mem_access_type']}" - line3_full = rw_mix - outside_lines = _outside_detail_lines(node, base_addr, size_kib) - title = ( - f"{node['driver_name']} p{node['pattern_idx']} job={node['job']} " - f"[{node['start_cycle']}, {node['end_cycle']}) " - f"{node['mem_access_type']} n={node['n_transactions']} " - f"base=0x{base_addr:08x} {size_kib:.1f}KiB {rw_mix} " - f"{' | '.join(outside_lines)}" - ) - exec_svg.append('') - exec_svg.append(f'{html.escape(title)}') - exec_svg.append( - f'' - ) - line1 = html.escape(line1_full) - exec_svg.append( - f'{line1}' - ) - line2 = html.escape(line2_full) - exec_svg.append( - f'{line2}' - ) - line3 = html.escape(line3_full) - exec_svg.append( - f'{line3}' - ) - for ext_idx, ext in enumerate(outside_lines): - y_ext = y + 49 + (ext_idx * 9) - ext_txt = html.escape(ext) - exec_svg.append( - f'{ext_txt}' - ) - exec_svg.append('') - exec_svg.append('') - - region_rows = [regions_timeline[k] for k in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2]))] - overlap_rows = [] - overlaps_by_region = {i: [] for i in range(len(region_rows))} - for i in range(len(region_rows)): - a = region_rows[i] - for j in range(i + 1, len(region_rows)): - b = region_rows[j] - ov_base = max(a['base'], b['base']) - ov_end = min(a['end'], b['end']) - if ov_base <= ov_end: - ov_size = ov_end - ov_base + 1 - overlap_rows.append({ - 'a_idx': i, - 'b_idx': j, - 'ov_base': ov_base, - 'ov_end': ov_end, - 'ov_size': ov_size, - }) - overlaps_by_region[i].append((j, ov_base, ov_end, ov_size)) - overlaps_by_region[j].append((i, 
ov_base, ov_end, ov_size)) - - used_min = min((reg['base'] for reg in region_rows), default=0) - used_max = max((reg['end'] for reg in region_rows), default=0) - if used_max < used_min: - used_max = used_min - used_span = max(1, used_max - used_min + 1) - - map_left = 170 - map_right = 24 - map_plot_w = chart_width - map_left - map_right - map_h = 124 - bar_y = 56 - bar_h = 24 - region_map_svg = [] - region_map_svg.append(f'') - region_map_svg.append('') - region_map_svg.append( - f'' - f"Memory Region Blocks (used range 0x{used_min:08x} - 0x{used_max:08x})" - f"" - ) - region_map_svg.append( - f'' - ) - for pct in [0, 25, 50, 75, 100]: - x = map_left + (pct / 100.0) * map_plot_w - addr = used_min + int(((used_span - 1) * pct) / 100.0) - region_map_svg.append(f'') - region_map_svg.append( - f'0x{addr:08x}' - ) - for reg in region_rows: - x = map_left + ((reg['base'] - used_min) / used_span) * map_plot_w - w = max(1.0, (reg['size'] / used_span) * map_plot_w) - color = _color_for_driver(reg['accesses'][0]['driver_idx']) if reg['accesses'] else "#888888" - title = ( - f"{reg['label']} 0x{reg['base']:08x}-0x{reg['end']:08x} " - f"size={reg['size']}B accesses={len(reg['accesses'])}" - ) - region_map_svg.append( - f'' - f'{html.escape(title)}' - ) - if w >= 92: - region_map_svg.append( - f'{html.escape(reg["label"])}' - ) - region_map_svg.append('') - - legend_items = [] - for d in sorted(driver_windows.keys()): - n = _driver_name(d) - c = _color_for_driver(d) - legend_items.append( - f'' - f'' - f'{html.escape(n)}' - ) - - region_cards = [] - for idx, reg in enumerate(region_rows): - access_rows = [] - accesses = sorted(reg['accesses'], key=lambda a: (a['driver_idx'], a['pattern_idx'], a['start'])) - for acc in accesses: - desc = acc['description'] if acc['description'] else "-" - access_rows.append( - "" - f"{html.escape(acc['driver_name'])}" - f"p{acc['pattern_idx']}" - f"{html.escape(acc['job'])}" - f"{html.escape(desc)}" - f"[{acc['start']}, {acc['end']})" - "" - ) 
- overlap_refs = overlaps_by_region.get(idx, []) - if overlap_refs: - ov_txt = ", ".join( - [ - f"{html.escape(region_rows[j]['label'])} " - f"(0x{ovb:08x}-0x{ove:08x}, {ovs} B)" - for (j, ovb, ove, ovs) in overlap_refs - ] - ) - else: - ov_txt = "none" - region_cards.append( - "
" - f"
{html.escape(reg['label'])} | base=0x{reg['base']:08x} | end=0x{reg['end']:08x} | size={reg['size']} B
" - f"
Overlaps: {ov_txt}
" - "" - "" - "" - f"{''.join(access_rows)}" - "
Driver/HWPEPatternJobDescriptionModeled interval
" - "
" - ) - - overlap_table_rows = [] - for ov in overlap_rows: - a = region_rows[ov['a_idx']] - b = region_rows[ov['b_idx']] - overlap_table_rows.append( - "" - f"{html.escape(a['label'])} (0x{a['base']:08x}-0x{a['end']:08x})" - f"{html.escape(b['label'])} (0x{b['base']:08x}-0x{b['end']:08x})" - f"0x{ov['ov_base']:08x}" - f"0x{ov['ov_end']:08x}" - f"{ov['ov_size']}" - "" - ) - - note = "Timeline follows declared wait_for_jobs dependencies from workload.json." - note_2 = ( - "Time axis is transaction-count based only " - "(no interconnect conflict/stall/arbitration modeling)." - ) - note_2b = "Per-driver list order is still enforced by stimulus/fence sequencing." - note_3 = "" - if mux_serialization_applied: - note_3 = ( - "MUX mode: HWPE execution is serialized by job order, " - "with lower HWPE ID first inside each job (tb_hci-like)." - ) - if mux_phase_order: - note_3 += f" Job order: {', '.join(mux_phase_order)}." - note_3_html = f"
{html.escape(note_3)}
" if note_3 else "" - cycle_warning_html = ( - "

Warning: dependency cycle detected; fallback scheduling order used.

" - if schedule_has_cycle else "" - ) - overlap_html = ( - "" - f"{''.join(overlap_table_rows)}" - "
Region ARegion BOverlap BaseOverlap EndOverlap Size (B)
" - if overlap_table_rows else - "
No overlaps detected among used regions.
" - ) - - html_doc = ( - "" - "Memory Access Region View (Transaction-Count Model)" - "" - "

Memory Access Region View

" - f"
Drivers ({DW_NARROW} bit): " - f"CORE={N_CORE_CFG}, DMA={N_DMA_CFG}, EXT={N_EXT_CFG}
" - f"
Drivers ({DW_WIDE} bit): HWPE={N_HWPE_CFG}
" - f"
Interconnect type: {html.escape(INTERCO_TYPE)} | " - f"Narrow master ports ({DW_NARROW} bit): {N_NARROW_HCI_CFG} | " - f"Wide master ports ({DW_WIDE} bit): {N_WIDE_HCI_CFG} | " - f"Slave ports (banks): {N_BANKS} | " - f"Total modeled time: {total_cycles} units
" - f"
{html.escape(note)}
" - f"
{html.escape(note_2)}
" - f"
{html.escape(note_2b)}
" - f"{note_3_html}" - f"{cycle_warning_html}" - "

Legend

" - f"{''.join(legend_items)}
" - "
" - f"{''.join(exec_svg)}" - "
" - "
" - f"{''.join(region_map_svg)}" - "
" - "

Region Usage Blocks

" - f"{''.join(region_cards)}" - "

Overlapping Regions

" - f"{overlap_html}" - "" - ) - memory_lifetime_path = generated_dir / 'memory_lifetime.html' - memory_lifetime_path.write_text(html_doc, encoding='utf-8') + write_memory_lifetime_html( + memory_lifetime_path=memory_lifetime_path, + pattern_nodes=pattern_nodes, + driver_windows=driver_windows, + regions_timeline=regions_timeline, + total_cycles=total_cycles, + mux_serialization_applied=mux_serialization_applied, + mux_phase_order=mux_phase_order, + schedule_has_cycle=schedule_has_cycle, + driver_name_fn=_driver_name, + interco_type=INTERCO_TYPE, + n_core_cfg=N_CORE_CFG, + n_dma_cfg=N_DMA_CFG, + n_ext_cfg=N_EXT_CFG, + n_hwpe_cfg=N_HWPE_CFG, + dw_narrow=DW_NARROW, + dw_wide=DW_WIDE, + n_narrow_hci_cfg=N_NARROW_HCI_CFG, + n_wide_hci_cfg=N_WIDE_HCI_CFG, + n_banks=N_BANKS, + ) print(f"Memory lifetime plot written: {memory_lifetime_path}") # ----------------------------------------------------------------------- @@ -1590,11 +1200,7 @@ def _outside_detail_lines(node, base_addr, size_kib): original = fpath.read_text(encoding='ascii') fpath.write_text(idle_line * delay + original, encoding='ascii') - # ----------------------------------------------------------------------- - # Pad all stimuli files to equal length - # ----------------------------------------------------------------------- - pad_txt_files(str(stimuli_dir), IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH_FACT) - print("STEP 1 COMPLETED: pad stimuli files") + print("STEP 1 COMPLETED: generate documents and apply start delays to stimuli") # ----------------------------------------------------------------------- # Golden vectors diff --git a/target/verif/simvectors/memory_report.py b/target/verif/simvectors/memory_report.py new file mode 100644 index 0000000..2da9cfb --- /dev/null +++ b/target/verif/simvectors/memory_report.py @@ -0,0 +1,119 @@ +"""Text memory map report generation.""" + +from pathlib import Path + + +def build_memory_map_text( + *, + total_mem_size_kib, + n_banks, + data_width, + hwpe_data_width, + 
n_core_cfg, + n_dma_cfg, + n_ext_cfg, + n_log_cfg, + n_hwpe_cfg, + interco_type, + dw_narrow, + dw_wide, + n_narrow_hci_cfg, + n_wide_hci_cfg, + memory_map_entries, + job_to_drivers, + driver_name_fn, + n_drivers, + fence_masks, + total_cycles, + mux_serialization_applied, + mux_phase_order, + schedule_has_cycle, + driver_windows, + pattern_nodes, + regions_timeline, +): + word_bytes = data_width // 8 + bank_stride_bytes = n_banks * word_bytes + lines = [] + lines.append("=" * 72) + lines.append("MEMORY MAP REPORT") + lines.append(f" Total memory : {total_mem_size_kib} KiB ({total_mem_size_kib * 1024} B)") + lines.append(f" Banks : {n_banks} x {word_bytes} B/word (interleaved, stride {bank_stride_bytes} B)") + lines.append(f" Data width : {data_width} b LOG / {hwpe_data_width} b HWPE") + lines.append( + f" Drivers({dw_narrow} bit) : " + f"CORE={n_core_cfg}, DMA={n_dma_cfg}, EXT={n_ext_cfg} (LOG total={n_log_cfg})" + ) + lines.append( + f" Drivers({dw_wide} bit) : " + f"HWPE={n_hwpe_cfg}" + ) + lines.append( + f" Interconnect type : {interco_type} | " + f"Narrow master ports ({dw_narrow} bit)={n_narrow_hci_cfg} | " + f"Wide master ports ({dw_wide} bit)={n_wide_hci_cfg} | " + f"Slave ports (banks)={n_banks}" + ) + lines.append("=" * 72) + for entry in memory_map_entries: + lines.append(f"\n [{entry['label']}] pattern={entry['pattern']} n_transactions={entry['n']}") + if 'info' in entry: + lines.append(f" {entry['info']}") + for k, v in entry.get('detail', {}).items(): + lines.append(f" {k:<14}: {v}") + lines.append("") + lines.append(" Job / dependency map:") + for job, drivers in sorted(job_to_drivers.items()): + driver_names = [driver_name_fn(d) for d in drivers] + lines.append(f" job '{job}': {', '.join(driver_names)}") + for i in range(n_drivers): + name = driver_name_fn(i) + for f, mask in enumerate(fence_masks[i]): + if mask: + deps = [driver_name_fn(j) for j in range(n_drivers) if mask & (1 << j)] + lines.append(f" {name} after pattern[{f}] (fence {f}) waits 
for: {', '.join(deps)}") + lines.append("") + lines.append(" Temporal schedule (transaction-count model):") + lines.append(f" Total modeled time: {total_cycles} units (1 unit = 1 transaction)") + lines.append(" Note: Declared wait_for_jobs dependencies are used for scheduling.") + lines.append(" Note: Per-driver list order is also enforced (pattern p[i] -> p[i+1]).") + lines.append(" Note: No interconnect contention/stall timing is modeled.") + if mux_serialization_applied: + lines.append(" Note: MUX mode serializes HWPE execution by job order, then HWPE ID (tb_hci-like).") + lines.append(f" MUX job order: {', '.join(mux_phase_order)}") + if schedule_has_cycle: + lines.append(" WARNING: dependency cycle detected while scheduling; using fallback order.") + for d in range(n_drivers): + if d not in driver_windows: + continue + w = driver_windows[d] + lines.append(f" {w['name']:<8}: [{w['start']:>6}, {w['end']:>6}) dur={w['end'] - w['start']:>6}") + for node in [n for n in pattern_nodes if n['driver_idx'] == d]: + reg_tokens = [] + for reg in node['regions']: + reg_tokens.append(f"{reg['label']}@0x{reg['base']:08x}+{reg['size']}B") + reg_text = ", ".join(reg_tokens) if reg_tokens else "no regions" + lines.append( + f" p{node['pattern_idx']} job={node['job']} " + f"[{node['start_cycle']},{node['end_cycle']}) " + f"type={node['mem_access_type']} n={node['n_transactions']} {reg_text}" + ) + lines.append("") + lines.append(" Memory region lifetimes:") + for key in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2])): + reg = regions_timeline[key] + users = sorted({a['driver_name'] for a in reg['accesses']}) + lines.append( + f" {reg['label']:<8} 0x{reg['base']:08x}-0x{reg['end']:08x} " + f"({reg['size']:>6} B) lifetime=[{reg['lifetime_start']},{reg['lifetime_end']}) " + f"users={', '.join(users)}" + ) + lines.append("=" * 72) + + return "\n".join(lines) + "\n" + + +def write_memory_map_txt(memory_map_path: Path, **kwargs): + report_text = 
build_memory_map_text(**kwargs) + memory_map_path.write_text(report_text, encoding='utf-8') + return report_text From 9fed2aa447ffc4d36e30d7cab7eb428a4cbf3a59 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Wed, 11 Mar 2026 16:22:41 +0100 Subject: [PATCH 23/25] verif/exploration: Move exploration stuff to subdir --- .gitignore | 2 +- target/verif/config/workload.json | 378 ++++++++++++++- .../config/workloads/workload_dma_gemm.json | 180 -------- .../hardware}/hardware_hci_2hwpe_8fact.json | 0 .../hardware}/hardware_log_2hwpe_8fact.json | 0 .../hardware}/hardware_mux_2hwpe_8fact.json | 0 .../workloads/workload_dma_gemm_cores.json | 0 .../workload_dma_gemm_cores_ideal.json} | 43 +- target/verif/exploration/exploration.mk | 17 + .../{ => exploration}/scripts/parse_vsim.py | 0 .../scripts/plot_sweep_results.py | 14 +- target/verif/exploration/scripts/run_sweep.sh | 46 ++ .../results/hardware_hci_2hwpe_8fact.json | 431 ------------------ target/verif/scripts/run_sweep.sh | 27 -- target/verif/simvectors/main.py | 11 +- target/verif/verif.mk | 10 +- 16 files changed, 472 insertions(+), 687 deletions(-) delete mode 100644 target/verif/config/workloads/workload_dma_gemm.json rename target/verif/{config/sweep_hardware => exploration/config/hardware}/hardware_hci_2hwpe_8fact.json (100%) rename target/verif/{config/sweep_hardware => exploration/config/hardware}/hardware_log_2hwpe_8fact.json (100%) rename target/verif/{config/sweep_hardware => exploration/config/hardware}/hardware_mux_2hwpe_8fact.json (100%) rename target/verif/{ => exploration}/config/workloads/workload_dma_gemm_cores.json (100%) rename target/verif/{config/workloads/workload_dma_gemm_ideal.json => exploration/config/workloads/workload_dma_gemm_cores_ideal.json} (75%) create mode 100644 target/verif/exploration/exploration.mk rename target/verif/{ => exploration}/scripts/parse_vsim.py (100%) rename target/verif/{ => exploration}/scripts/plot_sweep_results.py (97%) create mode 100644 
target/verif/exploration/scripts/run_sweep.sh delete mode 100644 target/verif/results/hardware_hci_2hwpe_8fact.json delete mode 100644 target/verif/scripts/run_sweep.sh diff --git a/.gitignore b/.gitignore index 0fbe9ba..6212410 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,4 @@ target/verif/vsim/compile.tcl target/verif/vsim/modelsim.ini target/verif/vsim/transcript target/verif/vsim/vsim.wlf -target/verif/results +target/verif/exploration/results diff --git a/target/verif/config/workload.json b/target/verif/config/workload.json index 8299bf9..aa2da43 100644 --- a/target/verif/config/workload.json +++ b/target/verif/config/workload.json @@ -1,14 +1,374 @@ { - "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes.", + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes. Cores generate linear 50% traffic after each GEMM on dedicated non-overlapping regions.", "log_masters": [ - { "id": 0, "description": "Core 0 (idle)", "mem_access_type": "idle" }, - { "id": 1, "description": "Core 1 (idle)", "mem_access_type": "idle" }, - { "id": 2, "description": "Core 2 (idle)", "mem_access_type": "idle" }, - { "id": 3, "description": "Core 3 (idle)", "mem_access_type": "idle" }, - { "id": 4, "description": "Core 4 (idle)", "mem_access_type": "idle" }, - { "id": 5, "description": "Core 5 (idle)", "mem_access_type": "idle" }, - { "id": 6, "description": "Core 6 (idle)", "mem_access_type": "idle" }, - { "id": 7, "description": "Core 7 (idle)", "mem_access_type": "idle" }, + { + "id": 0, + "description": "Core 0 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 0 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core0_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A1", 
+ "mem_access_type": "linear", + "job": "core0_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core0_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core0_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 1, + "description": "Core 1 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 1 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core1_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core1_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core1_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core1_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 2, + "description": "Core 2 post-GEMM background traffic", + "patterns": [ + { + 
"description": "Core 2 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core2_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core2_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core2_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core2_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 3, + "description": "Core 3 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 3 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core3_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core3_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core3_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A3", + 
"mem_access_type": "linear", + "job": "core3_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 4, + "description": "Core 4 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 4 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core4_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core4_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core4_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core4_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 5, + "description": "Core 5 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 5 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core5_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core5_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + 
"description": "Core 5 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core5_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core5_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 6, + "description": "Core 6 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 6 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core6_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core6_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core6_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core6_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 7, + "description": "Core 7 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 7 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core7_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", 
+ "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core7_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core7_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core7_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + } + ] + }, { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } ], diff --git a/target/verif/config/workloads/workload_dma_gemm.json b/target/verif/config/workloads/workload_dma_gemm.json deleted file mode 100644 index 8299bf9..0000000 --- a/target/verif/config/workloads/workload_dma_gemm.json +++ /dev/null @@ -1,180 +0,0 @@ -{ - "description": "Simple 4-tile double-buffer GEMM. 
DMA uses linear transfers sized only by region_size_bytes.", - "log_masters": [ - { "id": 0, "description": "Core 0 (idle)", "mem_access_type": "idle" }, - { "id": 1, "description": "Core 1 (idle)", "mem_access_type": "idle" }, - { "id": 2, "description": "Core 2 (idle)", "mem_access_type": "idle" }, - { "id": 3, "description": "Core 3 (idle)", "mem_access_type": "idle" }, - { "id": 4, "description": "Core 4 (idle)", "mem_access_type": "idle" }, - { "id": 5, "description": "Core 5 (idle)", "mem_access_type": "idle" }, - { "id": 6, "description": "Core 6 (idle)", "mem_access_type": "idle" }, - { "id": 7, "description": "Core 7 (idle)", "mem_access_type": "idle" }, - { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } - ], - - "hwpe_masters": [ - { - "id": 0, - "description": "DMA engine (linear setup + ping/pong prefetch)", - "patterns": [ - { - "description": "Setup preload: B matrix (8 KiB)", - "mem_access_type": "linear", - "job": "dma_load_B", - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x00000", - "region_size_bytes": 8192 - }, - { - "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", - "mem_access_type": "linear", - "job": "dma_load_A0_buf0", - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x02000", - "region_size_bytes": 8192 - }, - { - "description": "Setup preload: tile A1 into buffer #1 (8 KiB)", - "mem_access_type": "linear", - "job": "dma_load_A1_buf1", - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x04000", - "region_size_bytes": 8192 - }, - { - "description": "Prefetch tile A2 into buffer #0 (8 KiB)", - "mem_access_type": "linear", - "job": "dma_load_A2_buf0", - "wait_for_jobs": ["gemm_A0"], - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x02000", - "region_size_bytes": 8192 - }, - { - "description": "Read back C0 after GEMM tile A0", - "mem_access_type": "linear", - "job": "dma_store_C0_buf2", - "wait_for_jobs": 
["gemm_A0"], - "traffic_pct": 100, - "traffic_read_pct": 100, - "region_base_address": "0x06000", - "region_size_bytes": 8192 - }, - { - "description": "Prefetch tile A3 into buffer #1 (8 KiB)", - "mem_access_type": "linear", - "job": "dma_load_A3_buf1", - "wait_for_jobs": ["gemm_A1"], - "traffic_pct": 100, - "traffic_read_pct": 0, - "region_base_address": "0x04000", - "region_size_bytes": 8192 - }, - { - "description": "Read back C1 after GEMM tile A1", - "mem_access_type": "linear", - "job": "dma_store_C1_buf3", - "wait_for_jobs": ["gemm_A1"], - "traffic_pct": 100, - "traffic_read_pct": 100, - "region_base_address": "0x08000", - "region_size_bytes": 8192 - }, - { - "description": "Read back C0 after GEMM tile A2", - "mem_access_type": "linear", - "job": "dma_store_C2_buf2", - "wait_for_jobs": ["gemm_A2"], - "traffic_pct": 100, - "traffic_read_pct": 100, - "region_base_address": "0x06000", - "region_size_bytes": 8192 - }, - { - "description": "Read back C1 after GEMM tile A3", - "mem_access_type": "linear", - "job": "dma_store_C3_buf3", - "wait_for_jobs": ["gemm_A3"], - "traffic_pct": 100, - "traffic_read_pct": 100, - "region_base_address": "0x08000", - "region_size_bytes": 8192 - } - ] - }, - { - "id": 1, - "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", - "patterns": [ - { - "description": "GEMM tile A0: A0 x B -> C0", - "mem_access_type": "matmul_phased", - "job": "gemm_A0", - "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], - "traffic_pct": 90, - "matrix_m": 16, - "matrix_n": 16, - "matrix_k": 16, - "region_base_address_a": "0x02000", - "region_size_bytes_a": 8192, - "region_base_address_b": "0x00000", - "region_size_bytes_b": 8192, - "region_base_address_c": "0x06000", - "region_size_bytes_c": 8192 - }, - { - "description": "GEMM tile A1: A1 x B -> C1", - "mem_access_type": "matmul_phased", - "job": "gemm_A1", - "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], - "traffic_pct": 90, - "matrix_m": 16, - "matrix_n": 16, - 
"matrix_k": 16, - "region_base_address_a": "0x04000", - "region_size_bytes_a": 8192, - "region_base_address_b": "0x00000", - "region_size_bytes_b": 8192, - "region_base_address_c": "0x08000", - "region_size_bytes_c": 8192 - }, - { - "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", - "mem_access_type": "matmul_phased", - "job": "gemm_A2", - "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], - "traffic_pct": 90, - "matrix_m": 16, - "matrix_n": 16, - "matrix_k": 16, - "region_base_address_a": "0x02000", - "region_size_bytes_a": 8192, - "region_base_address_b": "0x00000", - "region_size_bytes_b": 8192, - "region_base_address_c": "0x06000", - "region_size_bytes_c": 8192 - }, - { - "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", - "mem_access_type": "matmul_phased", - "job": "gemm_A3", - "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], - "traffic_pct": 90, - "matrix_m": 16, - "matrix_n": 16, - "matrix_k": 16, - "region_base_address_a": "0x04000", - "region_size_bytes_a": 8192, - "region_base_address_b": "0x00000", - "region_size_bytes_b": 8192, - "region_base_address_c": "0x08000", - "region_size_bytes_c": 8192 - } - ] - } - ] -} diff --git a/target/verif/config/sweep_hardware/hardware_hci_2hwpe_8fact.json b/target/verif/exploration/config/hardware/hardware_hci_2hwpe_8fact.json similarity index 100% rename from target/verif/config/sweep_hardware/hardware_hci_2hwpe_8fact.json rename to target/verif/exploration/config/hardware/hardware_hci_2hwpe_8fact.json diff --git a/target/verif/config/sweep_hardware/hardware_log_2hwpe_8fact.json b/target/verif/exploration/config/hardware/hardware_log_2hwpe_8fact.json similarity index 100% rename from target/verif/config/sweep_hardware/hardware_log_2hwpe_8fact.json rename to target/verif/exploration/config/hardware/hardware_log_2hwpe_8fact.json diff --git a/target/verif/config/sweep_hardware/hardware_mux_2hwpe_8fact.json 
b/target/verif/exploration/config/hardware/hardware_mux_2hwpe_8fact.json similarity index 100% rename from target/verif/config/sweep_hardware/hardware_mux_2hwpe_8fact.json rename to target/verif/exploration/config/hardware/hardware_mux_2hwpe_8fact.json diff --git a/target/verif/config/workloads/workload_dma_gemm_cores.json b/target/verif/exploration/config/workloads/workload_dma_gemm_cores.json similarity index 100% rename from target/verif/config/workloads/workload_dma_gemm_cores.json rename to target/verif/exploration/config/workloads/workload_dma_gemm_cores.json diff --git a/target/verif/config/workloads/workload_dma_gemm_ideal.json b/target/verif/exploration/config/workloads/workload_dma_gemm_cores_ideal.json similarity index 75% rename from target/verif/config/workloads/workload_dma_gemm_ideal.json rename to target/verif/exploration/config/workloads/workload_dma_gemm_cores_ideal.json index b166c78..c8ab84c 100644 --- a/target/verif/config/workloads/workload_dma_gemm_ideal.json +++ b/target/verif/exploration/config/workloads/workload_dma_gemm_cores_ideal.json @@ -1,14 +1,29 @@ { - "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes.", + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes. 
Cores generate linear 50% traffic after each GEMM on dedicated non-overlapping regions.", "log_masters": [ - { "id": 0, "description": "Core 0 (idle)", "mem_access_type": "idle" }, - { "id": 1, "description": "Core 1 (idle)", "mem_access_type": "idle" }, - { "id": 2, "description": "Core 2 (idle)", "mem_access_type": "idle" }, - { "id": 3, "description": "Core 3 (idle)", "mem_access_type": "idle" }, - { "id": 4, "description": "Core 4 (idle)", "mem_access_type": "idle" }, - { "id": 5, "description": "Core 5 (idle)", "mem_access_type": "idle" }, - { "id": 6, "description": "Core 6 (idle)", "mem_access_type": "idle" }, - { "id": 7, "description": "Core 7 (idle)", "mem_access_type": "idle" }, + {"id": 0, "description": "Core 0 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 1, "description": "Core 1 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 2, "description": "Core 2 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 3, "description": "Core 3 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 4, "description": "Core 4 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 5, "description": "Core 5 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 6, "description": "Core 6 post-GEMM background traffic", "mem_access_type": "idle" }, + { + "id": 7, + "description": "Core 7 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 7 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core7_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + } + ] + }, { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } ], @@ -34,16 +49,6 @@ "traffic_read_pct": 0, "region_base_address": "0x02000", "region_size_bytes": 8192 - }, - { - "description": "Read back C1 after GEMM tile A3", - "mem_access_type": "linear", - 
"job": "dma_store_C3_buf3", - "wait_for_jobs": ["gemm_A3"], - "traffic_pct": 100, - "traffic_read_pct": 100, - "region_base_address": "0x08000", - "region_size_bytes": 8192 } ] }, diff --git a/target/verif/exploration/exploration.mk b/target/verif/exploration/exploration.mk new file mode 100644 index 0000000..1a93e7b --- /dev/null +++ b/target/verif/exploration/exploration.mk @@ -0,0 +1,17 @@ +# Copyright 2026 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. +# SPDX-License-Identifier: SHL-0.51 +# +# Sergio Mazzola + +HCI_VERIF_EXPL_DIR = $(HCI_ROOT)/target/verif/exploration + +################ +# Benchmarking # +################ + +# Modify this script to configure parameters (e.g., workload to run) +BENCHMARK_SCRIPT := $(HCI_VERIF_EXPL_DIR)/scripts/run_sweep.sh + +benchmarking-sweep: + . $(BENCHMARK_SCRIPT) \ No newline at end of file diff --git a/target/verif/scripts/parse_vsim.py b/target/verif/exploration/scripts/parse_vsim.py similarity index 100% rename from target/verif/scripts/parse_vsim.py rename to target/verif/exploration/scripts/parse_vsim.py diff --git a/target/verif/scripts/plot_sweep_results.py b/target/verif/exploration/scripts/plot_sweep_results.py similarity index 97% rename from target/verif/scripts/plot_sweep_results.py rename to target/verif/exploration/scripts/plot_sweep_results.py index cd24686..786561e 100644 --- a/target/verif/scripts/plot_sweep_results.py +++ b/target/verif/exploration/scripts/plot_sweep_results.py @@ -18,7 +18,7 @@ from matplotlib.patches import Patch # Cycles of ideal workload runtime -IDEAL_WORKLOAD_DURATION = 3866.0 +IDEAL_WORKLOAD_RUNTIME = 4121.0 INTERCO_ORDER = {"LOG": 0, "MUX": 1, "HCI": 2} INTERCO_COLORS = {"LOG": "#1f77b4", "MUX": "#9467bd", "HCI": "#ff7f0e"} @@ -176,7 +176,7 @@ def _plot_total_sim_time(entries: List[Dict[str, object]], out_path: Path) -> No for bar, val in zip(bars, values): if math.isnan(val): continue - mult_of_ideal = (val 
/ IDEAL_WORKLOAD_DURATION) if val > 0 and IDEAL_WORKLOAD_DURATION > 0 else float("nan") + mult_of_ideal = (val / IDEAL_WORKLOAD_RUNTIME) if val > 0 and IDEAL_WORKLOAD_RUNTIME > 0 else float("nan") pct_txt = "n/a" if math.isnan(mult_of_ideal) else f"{mult_of_ideal:.2f}X of ideal runtime" ax.text( bar.get_x() + bar.get_width() / 2.0, @@ -188,11 +188,11 @@ def _plot_total_sim_time(entries: List[Dict[str, object]], out_path: Path) -> No ) ax.axhline( - y=IDEAL_WORKLOAD_DURATION, + y=IDEAL_WORKLOAD_RUNTIME, color="red", linestyle="--", linewidth=1.6, - label=f"Ideal workload runtime ({IDEAL_WORKLOAD_DURATION:.0f} cycles)", + label=f"Ideal workload runtime ({IDEAL_WORKLOAD_RUNTIME:.0f} cycles)", ) legend = [Patch(facecolor=INTERCO_COLORS[k], label=k) for k in ("LOG", "MUX", "HCI")] @@ -273,10 +273,10 @@ def _plot_bandwidth(entries: List[Dict[str, object]], out_path: Path) -> None: sim_cycles_vals = [e["total_sim_cycles"] for e in entries] ideal_app_vals = [] for actual_bw, sim_cycles in zip(actual_vals, sim_cycles_vals): - if math.isnan(actual_bw) or math.isnan(sim_cycles) or IDEAL_WORKLOAD_DURATION <= 0.0: + if math.isnan(actual_bw) or math.isnan(sim_cycles) or IDEAL_WORKLOAD_RUNTIME <= 0.0: ideal_app_vals.append(float("nan")) else: - ideal_app_vals.append(actual_bw * sim_cycles / IDEAL_WORKLOAD_DURATION) + ideal_app_vals.append(actual_bw * sim_cycles / IDEAL_WORKLOAD_RUNTIME) actual_colors = [INTERCO_COLORS.get(e["interco_type"], "#333333") for e in entries] x = np.arange(len(entries), dtype=float) width = 0.34 @@ -323,7 +323,7 @@ def _plot_bandwidth(entries: List[Dict[str, object]], out_path: Path) -> None: ) # Ideal app BW computed from moved data and ideal application duration: - # ideal_app_bw = effective_bw * total_real_sim_time / IDEAL_WORKLOAD_DURATION + # ideal_app_bw = effective_bw * total_real_sim_time / IDEAL_WORKLOAD_RUNTIME valid_ideal_app_vals = [v for v in ideal_app_vals if not math.isnan(v)] if valid_ideal_app_vals: ideal_workload_bw = 
sum(valid_ideal_app_vals) / len(valid_ideal_app_vals)
diff --git a/target/verif/exploration/scripts/run_sweep.sh b/target/verif/exploration/scripts/run_sweep.sh
new file mode 100644
index 0000000..8759a9f
--- /dev/null
+++ b/target/verif/exploration/scripts/run_sweep.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+set -e
+
+# Make sure we are in hci root
+if ! git_root=$(git rev-parse --show-toplevel 2>/dev/null) || [ "$(basename "$git_root")" != "hci" ]; then
+  echo "Error: This script must be run from within the hci project git repository." >&2
+  exit 1
+fi
+cd "$git_root"
+
+# Directories
+VERIF_DIR=./target/verif
+VERIF_EXPL_DIR=$VERIF_DIR/exploration
+RESULTS_DIR=$VERIF_EXPL_DIR/results
+PLOTS_DIR=$RESULTS_DIR/plots
+
+RUN_NAME=gemm_cores_double_buffer
+
+# Makefile settings (verif.mk)
+export GUI=0
+export WORKLOAD_JSON=$VERIF_EXPL_DIR/config/workloads/workload_dma_gemm_cores.json
+export TESTBENCH_JSON=$VERIF_DIR/config/testbench.json
+# HARDWARE_JSON is swept in the loop below
+
+mkdir -p "$RESULTS_DIR/$RUN_NAME"
+
+# For each hardware*.json, run simulation and parse transcript
+for hardware_config in $VERIF_EXPL_DIR/config/hardware/hardware_*.json; do
+  make clean-verif
+  echo -e "\033[32;1mRunning simulation with hardware config: $hardware_config\033[0m"
+  export HARDWARE_JSON="$hardware_config"
+  make run-verif
+  python3 $VERIF_EXPL_DIR/scripts/parse_vsim.py --transcript $VERIF_DIR/vsim/transcript --out $RESULTS_DIR/$RUN_NAME/$(basename "$hardware_config" .json).json
+  # Copy html report
+  cp $VERIF_DIR/simvectors/generated/dataflow.html $RESULTS_DIR/$RUN_NAME/$(basename "$hardware_config" .json).html
+done
+
+python3 $VERIF_EXPL_DIR/scripts/plot_sweep_results.py --results-dir $RESULTS_DIR/$RUN_NAME --out-dir $RESULTS_DIR/$RUN_NAME
+
+# Ideal run (manual step: first rerun the sweep with the ideal workload config, then parse that transcript here)
+# do not forget to change IDEAL_WORKLOAD_RUNTIME in plot_sweep_results.py
+python3 $VERIF_EXPL_DIR/scripts/parse_vsim.py --transcript $VERIF_DIR/vsim/transcript --out $RESULTS_DIR/$RUN_NAME/ideal.json
+cp 
$VERIF_DIR/simvectors/generated/dataflow.html $RESULTS_DIR/$RUN_NAME/ideal.html +# Regenerate plots with correct IDEAL_WORKLOAD_RUNTIME +python3 $VERIF_EXPL_DIR/scripts/plot_sweep_results.py --results-dir $RESULTS_DIR/$RUN_NAME --out-dir $RESULTS_DIR/$RUN_NAME diff --git a/target/verif/results/hardware_hci_2hwpe_8fact.json b/target/verif/results/hardware_hci_2hwpe_8fact.json deleted file mode 100644 index 1fd0fd4..0000000 --- a/target/verif/results/hardware_hci_2hwpe_8fact.json +++ /dev/null @@ -1,431 +0,0 @@ -{ - "hw_config": { - "masters": { - "core": 8, - "dma": 0, - "ext": 1, - "hwpe": 2, - "total": 11 - }, - "memory": { - "banks": 64, - "total_size_kb": 256, - "data_width_bits": 32, - "hwpe_width_lanes": 8 - }, - "interconnect": { - "sel_lic": 0, - "ts_bit": 21, - "expfifo": 0 - }, - "interconnect_side": { - "type": "HCI", - "n_narrow_hci": 8, - "n_wide_hci": 2, - "n_dma": 0, - "n_ext": 1, - "narrow_total_ports": 9, - "total_initiator_ports": 11 - }, - "id_address": { - "iw": 11, - "addr_width": 18, - "addr_width_bank": 12 - } - }, - "bandwidth": { - "ideal_memory_side_bit_per_cycle": 2048.0, - "ideal_interconnect_side_bit_per_cycle": 800.0, - "ideal_bottleneck_bit_per_cycle": 800.0, - "actual_completion_bit_per_cycle": 306.65, - "actual_completion_utilization_pct": 38.3, - "completion_phase_duration_cycles": 4488.0, - "granted_transactions": { - "reads": 3072, - "writes": 2304, - "total": 5376 - }, - "read_complete_responses": 3072 - }, - "simulation_time": { - "per_master": [ - { - "master_name": "master_hwpe_0", - "role_name": "HWPE0", - "sim_time_cycles": 4488.0 - }, - { - "master_name": "master_hwpe_1", - "role_name": "HWPE1", - "sim_time_cycles": 4228.0 - }, - { - "master_name": "master_log_0", - "role_name": "Core0", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_1", - "role_name": "Core1", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_2", - "role_name": "Core2", - "sim_time_cycles": 3.0 - }, - { - "master_name": 
"master_log_3", - "role_name": "Core3", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_4", - "role_name": "Core4", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_5", - "role_name": "Core5", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_6", - "role_name": "Core6", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_7", - "role_name": "Core7", - "sim_time_cycles": 3.0 - }, - { - "master_name": "master_log_8", - "role_name": "EXT0", - "sim_time_cycles": 1.0 - } - ], - "total_cycles": 4488.0 - }, - "read_response_coverage": { - "master_log_0": { - "observed": 0, - "expected": 0 - }, - "master_log_1": { - "observed": 0, - "expected": 0 - }, - "master_log_2": { - "observed": 0, - "expected": 0 - }, - "master_log_3": { - "observed": 0, - "expected": 0 - }, - "master_log_4": { - "observed": 0, - "expected": 0 - }, - "master_log_5": { - "observed": 0, - "expected": 0 - }, - "master_log_6": { - "observed": 0, - "expected": 0 - }, - "master_log_7": { - "observed": 0, - "expected": 0 - }, - "master_log_8": { - "observed": 0, - "expected": 0 - }, - "master_hwpe_0": { - "observed": 1024, - "expected": 1024 - }, - "master_hwpe_1": { - "observed": 2048, - "expected": 2048 - } - }, - "transaction_counts": { - "master_log_0": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_1": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_2": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_3": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_4": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_5": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_6": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_log_7": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - 
"master_log_8": { - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0 - }, - "master_hwpe_0": { - "granted_reads": 1024, - "granted_writes": 1280, - "read_complete": 1024 - }, - "master_hwpe_1": { - "granted_reads": 2048, - "granted_writes": 1024, - "read_complete": 2048 - } - }, - "request_to_grant_latency": { - "per_master": [ - { - "master_name": "master_hwpe_0", - "avg_req_to_gnt_stall_latency_cycles": 0.63, - "req_to_gnt_grants": 2304 - }, - { - "master_name": "master_hwpe_1", - "avg_req_to_gnt_stall_latency_cycles": 0.18, - "req_to_gnt_grants": 3072 - }, - { - "master_name": "master_log_0", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_1", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_2", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_3", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_4", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_5", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_6", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_7", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_8", - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - } - ], - "accumulated": { - "cycles": 1987.0, - "grants": 5376 - }, - "averages": { - "log": { - "weighted_cycles": 0.0, - "unweighted_cycles": 0.0 - }, - "hwpe": { - "weighted_cycles": 0.37, - "unweighted_cycles": 0.4 - }, - "global": { - "weighted_cycles": 0.37, - "unweighted_cycles": 0.4 - } - } - }, - "finish": { - "source": 
"/scratch/smazzola/projects/hci/hci/target/verif/src/simulation_report.sv", - "line": 464, - "time_ps": 224950, - "iteration": 4, - "instance": "/tb_hci/i_simulation_report" - }, - "masters": [ - { - "master_name": "master_hwpe_0", - "role_name": "HWPE0", - "sim_time_cycles": 4488.0, - "read_observed": 1024, - "read_expected": 1024, - "granted_reads": 1024, - "granted_writes": 1280, - "read_complete": 1024, - "avg_req_to_gnt_stall_latency_cycles": 0.63, - "req_to_gnt_grants": 2304 - }, - { - "master_name": "master_hwpe_1", - "role_name": "HWPE1", - "sim_time_cycles": 4228.0, - "read_observed": 2048, - "read_expected": 2048, - "granted_reads": 2048, - "granted_writes": 1024, - "read_complete": 2048, - "avg_req_to_gnt_stall_latency_cycles": 0.18, - "req_to_gnt_grants": 3072 - }, - { - "master_name": "master_log_0", - "role_name": "Core0", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_1", - "role_name": "Core1", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_2", - "role_name": "Core2", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_3", - "role_name": "Core3", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_4", - "role_name": "Core4", - "sim_time_cycles": 3.0, - "read_observed": 0, - 
"read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_5", - "role_name": "Core5", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_6", - "role_name": "Core6", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_7", - "role_name": "Core7", - "sim_time_cycles": 3.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - }, - { - "master_name": "master_log_8", - "role_name": "EXT0", - "sim_time_cycles": 1.0, - "read_observed": 0, - "read_expected": 0, - "granted_reads": 0, - "granted_writes": 0, - "read_complete": 0, - "avg_req_to_gnt_stall_latency_cycles": 0.0, - "req_to_gnt_grants": 0 - } - ] -} diff --git a/target/verif/scripts/run_sweep.sh b/target/verif/scripts/run_sweep.sh deleted file mode 100644 index 17f7404..0000000 --- a/target/verif/scripts/run_sweep.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -set -e - -# Make sure we are in hci root -if ! git_root=$(git rev-parse --show-toplevel 2>/dev/null) || [ "$(basename "$git_root")" != "hci" ]; then - echo "Error: This script must be run from within the hci project git repository." 
>&2 - exit 1 -fi -cd "$git_root" - -VERIF_DIR=./target/verif -RESULTS_DIR=$VERIF_DIR/results -PLOTS_DIR=$RESULTS_DIR/plots - -mkdir -p "$RESULTS_DIR" -rm -f "$RESULTS_DIR"/hardware_*.json - -# For each hardware*.json in target/verif/config/sweep_hardware, run simulation and parse transcript -for hardware_config in $VERIF_DIR/config/sweep_hardware/hardware_*.json; do - make clean-verif - echo "Running simulation with hardware config: $hardware_config" - HARDWARE_JSON="$hardware_config" GUI=0 make run-verif - python3 $VERIF_DIR/scripts/parse_vsim.py --transcript $VERIF_DIR/vsim/transcript --out "$RESULTS_DIR"/$(basename "$hardware_config" .json).json -done - -python3 $VERIF_DIR/scripts/plot_sweep_results.py --results-dir "$RESULTS_DIR" --out-dir "$PLOTS_DIR" diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index 3ff2f30..e2dd321 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -1132,7 +1132,7 @@ def _add_edge(src, dst): # Build memory_map.txt # ----------------------------------------------------------------------- memory_map_path = generated_dir / 'memory_map.txt' - report_text = write_memory_map_txt( + write_memory_map_txt( memory_map_path=memory_map_path, total_mem_size_kib=TOT_MEM_SIZE, n_banks=N_BANKS, @@ -1161,15 +1161,14 @@ def _add_edge(src, dst): pattern_nodes=pattern_nodes, regions_timeline=regions_timeline, ) - print("\n" + report_text) print(f"Memory map written: {memory_map_path}") # ----------------------------------------------------------------------- - # Build memory_lifetime.html (simple SVG timeline view) + # Build dataflow.html (simple SVG timeline view) # ----------------------------------------------------------------------- - memory_lifetime_path = generated_dir / 'memory_lifetime.html' + dataflow_path = generated_dir / 'dataflow.html' write_memory_lifetime_html( - memory_lifetime_path=memory_lifetime_path, + memory_lifetime_path=dataflow_path, pattern_nodes=pattern_nodes, 
driver_windows=driver_windows, regions_timeline=regions_timeline, @@ -1189,7 +1188,7 @@ def _add_edge(src, dst): n_wide_hci_cfg=N_WIDE_HCI_CFG, n_banks=N_BANKS, ) - print(f"Memory lifetime plot written: {memory_lifetime_path}") + print(f"Dataflow plot written: {dataflow_path}") # ----------------------------------------------------------------------- # Apply per-master start delays diff --git a/target/verif/verif.mk b/target/verif/verif.mk index e9f93e7..5d08eba 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -10,6 +10,9 @@ HCI_VERIF_DIR = $(HCI_ROOT)/target/verif HCI_VERIF_CFG_DIR = $(HCI_VERIF_DIR)/config HCI_VERIF_CFG_GEN_DIR = $(HCI_VERIF_CFG_DIR)/generated +# Other Makefiles +include $(HCI_VERIF_DIR)/exploration/exploration.mk + # Include generated Makefiles include $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk include $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk @@ -157,13 +160,6 @@ clean-sim-verif: rm -f $(HCI_VERIF_DIR)/vsim/transcript rm -f $(HCI_VERIF_DIR)/vsim/vsim.wlf -################ -# Benchmarking # -################ - -benchmarking-sweep: - . 
$(HCI_VERIF_DIR)/scripts/run_sweep.sh - ########### # Helpers # ########### From bf63bf558e7df718fdfd0aa6ece4429e25e613a7 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Wed, 11 Mar 2026 18:08:24 +0100 Subject: [PATCH 24/25] simvectors: Add new mem access patterns, fine-tune idle cycles gen, fix RAW deps model --- target/verif/README.md | 2 + target/verif/config/workload.schema.json | 209 +++++- target/verif/simvectors/README.md | 293 ++++++++ .../verif/simvectors/hci_stimuli/patterns.py | 666 +++++++++++++++++- target/verif/simvectors/html_report.py | 2 +- target/verif/simvectors/main.py | 564 +++++++++++++-- 6 files changed, 1647 insertions(+), 89 deletions(-) create mode 100644 target/verif/simvectors/README.md diff --git a/target/verif/README.md b/target/verif/README.md index cd7c03f..756efbe 100644 --- a/target/verif/README.md +++ b/target/verif/README.md @@ -4,6 +4,8 @@ The verification framework drives configurable memory traffic from multiple masters through the HCI interconnect and measures throughput and latency. It is fully driven by three JSON configuration files: +For the up-to-date stimuli-generator pattern catalog and output format, see `target/verif/simvectors/README.md`. + | File | Purpose | |------|---------| | `config/hardware.json` | Interconnect topology (number of masters, banks, data widths, ...) | diff --git a/target/verif/config/workload.schema.json b/target/verif/config/workload.schema.json index 1037a0f..fde1870 100644 --- a/target/verif/config/workload.schema.json +++ b/target/verif/config/workload.schema.json @@ -38,8 +38,23 @@ }, "mem_access_type": { "type": "string", - "enum": ["idle", "random", "linear", "2d", "3d", "matmul_phased", "matmul"], - "description": "Access pattern selector. 'matmul' is an alias for 'matmul_phased'." 
+ "enum": [ + "idle", + "random", + "linear", + "2d", + "3d", + "matmul_phased", + "matmul", + "multi_linear", + "bank_group_linear", + "rw_rowwise", + "gather_scatter", + "matmul_tiled_interleave", + "matmul_tiled", + "hotspot_random" + ], + "description": "Access pattern selector. Aliases: 'matmul' -> 'matmul_phased', 'matmul_tiled' -> 'matmul_tiled_interleave'." }, "job": { "type": "string", @@ -157,6 +172,160 @@ "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], "description": "[matmul_phased] Size in bytes of the write-C region." }, + "regions": { + "type": "array", + "description": "[multi_linear] Subregions to stream in schedule order.", + "items": { + "type": "object", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "stride_words": { "type": "integer", "minimum": 1, "default": 1 }, + "read_pct": { "type": "integer", "minimum": 0, "maximum": 100 } + } + } + }, + "schedule": { + "type": "string", + "description": "[multi_linear, gather_scatter] Access schedule selector (e.g. round_robin, 4read_1write)." + }, + "burst_len": { + "type": "integer", + "minimum": 1, + "default": 1, + "description": "[multi_linear] Number of consecutive accesses per selected region before switching." + }, + "start_bank": { + "type": "integer", + "minimum": 0, + "description": "[bank_group_linear] Starting bank index." + }, + "bank_group_span": { + "type": "integer", + "minimum": 1, + "description": "[bank_group_linear] Number of banks in the active group." + }, + "stride_beats": { + "type": "integer", + "minimum": 1, + "default": 1, + "description": "[bank_group_linear] Stride in beats through the bank-group phase." 
+ }, + "bank_group_hop": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[bank_group_linear] Optional phase hop applied when advancing to the next group window." + }, + "wen": { + "type": "integer", + "enum": [0, 1], + "description": "[bank_group_linear] Fixed direction: 1=read, 0=write. If omitted, reads/writes are mixed." + }, + "row_base_address": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[rw_rowwise] Base address of row 0." + }, + "row_size_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[rw_rowwise] Bytes touched inside each row." + }, + "n_rows": { + "type": "integer", + "minimum": 1, + "description": "[rw_rowwise] Number of rows." + }, + "row_stride_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[rw_rowwise] Byte stride between consecutive row bases." + }, + "reads_per_row": { + "type": "integer", + "minimum": 0, + "description": "[rw_rowwise] Number of reads emitted per row." + }, + "writes_per_row": { + "type": "integer", + "minimum": 0, + "description": "[rw_rowwise] Number of writes emitted per row." + }, + "idle_cycles_between_rows": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[rw_rowwise] Idle cycles inserted between rows." 
+ }, + "read_regions": { + "type": "array", + "description": "[gather_scatter] Source regions for gather reads.", + "items": { + "type": "object", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }] } + } + } + }, + "write_region": { + "type": "object", + "description": "[gather_scatter] Destination region for scatter writes.", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }] } + } + }, + "chunk_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[gather_scatter] Address increment size in bytes for gather/scatter stepping." + }, + "tile_a_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[matmul_tiled_interleave] Bytes read from A per tile schedule step." + }, + "tile_b_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[matmul_tiled_interleave] Bytes read from B per tile schedule step." + }, + "tile_c_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[matmul_tiled_interleave] Bytes written to C per tile schedule step." + }, + "tiles": { + "type": "integer", + "minimum": 1, + "default": 1, + "description": "[matmul_tiled_interleave] Number of tile iterations before the pattern repeats." + }, + "ab_c_schedule": { + "type": "string", + "description": "[matmul_tiled_interleave] Tile phase order string, e.g. A_B_B_C." 
+ }, + "idle_cycles_between_tiles": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[matmul_tiled_interleave] Idle cycles inserted between tile iterations." + }, + "hot_regions": { + "type": "array", + "description": "[hotspot_random] Weighted hot regions used for random accesses.", + "items": { + "type": "object", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }] }, + "weight": { "type": "integer", "minimum": 1, "default": 1 } + } + } + }, "matmul_ratio_a": { "type": "integer", "minimum": 0, @@ -181,13 +350,13 @@ "minimum": 1, "maximum": 100, "default": 100, - "description": "[random, linear, matmul_phased] Modeled request utilization percentage (adds req=0 idles after each request when <100)." + "description": "[random, linear, matmul_phased, multi_linear, bank_group_linear, rw_rowwise, gather_scatter, matmul_tiled_interleave, hotspot_random] Modeled request utilization percentage (adds req=0 idles after each request when <100)." }, "traffic_read_pct": { "type": "integer", "minimum": 0, "maximum": 100, - "description": "[random, linear] Percentage of accesses that are reads (wen=1)." + "description": "[random, linear, hotspot_random] Percentage of accesses that are reads (wen=1)." 
}, "idle_cycles_between_phases": { "type": "integer", @@ -232,6 +401,38 @@ { "required": ["matrix_m", "matrix_n", "matrix_k"] } ] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "multi_linear" } }, "required": ["mem_access_type"] }, + "then": { "required": ["regions"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "bank_group_linear" } }, "required": ["mem_access_type"] }, + "then": { "required": ["start_bank", "bank_group_span", "n_transactions"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "rw_rowwise" } }, "required": ["mem_access_type"] }, + "then": { "required": ["row_base_address", "row_size_bytes", "n_rows", "row_stride_bytes", "reads_per_row", "writes_per_row"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "gather_scatter" } }, "required": ["mem_access_type"] }, + "then": { "required": ["read_regions", "write_region"] } + }, + { + "if": { + "properties": { "mem_access_type": { "enum": ["matmul_tiled_interleave", "matmul_tiled"] } }, + "required": ["mem_access_type"] + }, + "then": { + "anyOf": [ + { "required": ["n_transactions"] }, + { "required": ["region_base_address_a", "region_size_bytes_a", "region_base_address_b", "region_size_bytes_b", "region_base_address_c", "region_size_bytes_c"] } + ] + } + }, + { + "if": { "properties": { "mem_access_type": { "const": "hotspot_random" } }, "required": ["mem_access_type"] }, + "then": { "required": ["hot_regions"] } } ] }, diff --git a/target/verif/simvectors/README.md b/target/verif/simvectors/README.md new file mode 100644 index 0000000..472323b --- /dev/null +++ b/target/verif/simvectors/README.md @@ -0,0 +1,293 @@ +# Simvectors Stimuli Generator + +## Scope +`main.py` generates per-master stimuli vectors from: +- `workload.json` +- `hardware.json` +- `testbench.json` + +This README summarizes: +- available `mem_access_type` patterns and JSON parameters +- every source of `req=0` cycles +- read/write blocked-set behavior +- 
generated outputs and formats + +## JSON Structure (workload) +Top-level: +- `log_masters`: list of narrow masters +- `hwpe_masters`: list of wide masters + +### Field Format Conventions +| Kind | Accepted JSON format | Unit / meaning | +|---|---|---| +| Address fields (`*_address`, `base`) | integer or string (`"1234"`, `"0x4000"`, `"101010"`) | byte address | +| Size fields (`*_size_bytes`, `chunk_bytes`, `tile_*_bytes`) | integer or numeric string | bytes | +| Strides (`stride0/1/2`) | integer | words (word = `DATA_WIDTH/8` for that master) | +| `row_stride_bytes` | integer or numeric string | bytes | +| `stride_beats` | integer | beats (beat = transaction width in bytes for that master) | +| Counters (`n_transactions`, `len_*`, `tiles`, `reads_per_row`, `writes_per_row`) | integer | count | +| Percentages (`traffic_pct`, `traffic_read_pct`, `read_pct`) | integer | percent | +| `wen` | `0` or `1` | `0`=write, `1`=read | + +### Master-Level Fields +| Field | Required | Default | Notes | +|---|---|---|---| +| `id` | no | positional index | Informational/consistency warning only. | +| `description` | no | empty | Human-readable label. | +| `start_delay_cycles` | no | `0` | Prepended `req=0` cycles before first pattern. | +| `patterns` | no | absent | If present, this list drives generation. | + +Precedence/exclusivity: +- Master format is effectively either: + - flat single-pattern master (no `patterns`) + - or `patterns` list. +- If `patterns` exists, flat pattern fields at master level are ignored for traffic generation (except master-level fields like `start_delay_cycles`). + +### Common Pattern Fields (apply to every pattern type) +| Field | Required | Default | Notes | +|---|---|---|---| +| `mem_access_type` | yes | none | Pattern selector. | +| `description` | no | empty | Label used in reports. | +| `job` | no | `"default"` | Dependency graph node name. | +| `wait_for_jobs` | no | `[]` | Inserts dependency gate before pattern. 
| +| `n_transactions` | conditional | derivable for many patterns | If omitted, derived when supported. | +| `traffic_pct` | no | `100` | Adds per-request idle shaping (`req=0`) on patterns that implement traffic shaping. | + +## Pattern Catalog +The tables below list pattern-specific fields. +Complete field set for a pattern = **common fields above + pattern-specific fields below**. +`Required = conditional` means: required unless the documented derivation path is present. + +### `idle` +No memory transaction. Emits idle and trailing `PAUSE`. + +Pattern-specific fields: none. + +### `random` +Uniform random over a region. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | yes | none | int | Not derivable for this pattern. | +| `region_base_address` | no | evenly partitioned per master | address (bytes) | | +| `region_size_bytes` | no | evenly partitioned per master | bytes | | +| `traffic_read_pct` | no | random R/W mix | % | If set, deterministic read/write split. | + +### `linear` +1D strided stream. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from `length` or `region_size_bytes`. | +| `length` | no | none | int | Alias source for derived `n_transactions`. | +| `start_address` | no | `"0"` | address (bytes) | If absent and `region_base_address` exists, uses `region_base_address`. | +| `stride0` | no | `0` (or `1` if `region_size_bytes` set) | words | | +| `region_base_address` | no | evenly partitioned per master | address (bytes) | Used for region context and start fallback. | +| `region_size_bytes` | no | evenly partitioned per master | bytes | Can derive `n_transactions`. | +| `traffic_read_pct` | no | random R/W mix | % | | + +### `2d` +2D walk. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from `len_d0 * len_d1`. 
| +| `start_address` | no | `"0"` | address (bytes) | | +| `stride0` | no | `0` | words | | +| `len_d0` | conditional | none | int | | +| `stride1` | no | `0` | words | | +| `len_d1` | conditional | none | int | | +| `idle_cycles_between_phases` | no | `0` | cycles | Inserts explicit boundary idles. | + +### `3d` +3D walk. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from `len_d0 * len_d1 * len_d2`. | +| `start_address` | no | `"0"` | address (bytes) | | +| `stride0` | no | `0` | words | | +| `len_d0` | conditional | none | int | | +| `stride1` | no | `0` | words | | +| `len_d1` | conditional | none | int | | +| `stride2` | no | `0` | words | | +| `len_d2` | conditional | none | int | | +| `idle_cycles_between_phases` | no | `0` | cycles | Inserts explicit boundary idles. | + +### `matmul_phased` (alias: `matmul`) +Phased A-read / B-read / C-write traffic. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from region size or matrix dims. | +| `region_base_address`, `region_size_bytes` | conditional | evenly partitioned | bytes | Combined region (auto A/B/C split). | +| `matrix_m`, `matrix_n`, `matrix_k` | no | none | int | Alternative source for derived `n_transactions`. | +| `region_base_address_a/b/c`, `region_size_bytes_a/b/c` | no | none | bytes | Explicit per-phase regions. | +| `matmul_ratio_a/b/c` | no | `1/1/1` | relative weights | | +| `idle_cycles_between_phases` | no | `0` | cycles | Inserts explicit phase-boundary idles. | + +Mutual exclusivity / precedence: +- If explicit `*_a/b/c` regions are provided, they take precedence over combined-region auto-split. + +### `multi_linear` +Multiple subregions, schedule-driven interleave. 
+ +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `regions` | yes | none | array | Each entry has `base`, `size_bytes`, optional `stride_words`, `read_pct`. | +| `schedule` | no | `round_robin` | string | | +| `burst_len` | no | `1` | int | | +| `n_transactions` | conditional | derived | int | Derivable from sum of region sizes. | + +### `bank_group_linear` +Linear stream constrained by bank group phase controls. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `start_bank` | yes | none | int | 0-based bank index. | +| `bank_group_span` | yes | none | int | Number of banks in active group. | +| `stride_beats` | no | `1` | beats | | +| `bank_group_hop` | no | `0` | int | Group-phase hop per wrap. | +| `wen` | no | mixed R/W | `0` or `1` | Fixed direction if set. | +| `n_transactions` | yes | none | int | Required for this pattern. | + +### `rw_rowwise` +Per-row read phase then write phase. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `row_base_address` | yes | none | address (bytes) | | +| `row_size_bytes` | yes | none | bytes | | +| `n_rows` | yes | none | int | | +| `row_stride_bytes` | yes | none | bytes | | +| `reads_per_row` | yes | none | int | | +| `writes_per_row` | yes | none | int | | +| `idle_cycles_between_rows` | no | `0` | cycles | Inserts explicit row-boundary idles. | +| `n_transactions` | conditional | derived | int | Derivable as `n_rows * (reads_per_row + writes_per_row)`. | + +### `gather_scatter` +Gather from multiple read regions, scatter to write region. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `read_regions` | yes | none | array | Each entry: `base`, `size_bytes`. | +| `write_region` | yes | none | object | `base`, `size_bytes`. | +| `chunk_bytes` | no | transaction width | bytes | Address increment granularity. 
| +| `schedule` | no | `4read_1write` | string | | +| `n_transactions` | conditional | derived | int | Derivable from region sizes and chunk. | + +### `matmul_tiled_interleave` (alias: `matmul_tiled`) +Tile-like interleaving among A/B/C streams. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `region_base_address`, `region_size_bytes` | no | evenly partitioned | bytes | Used as fallback context for auto split when explicit A/B/C are absent. | +| `region_base_address_a/b/c`, `region_size_bytes_a/b/c` | conditional | fallback split | bytes | Preferred explicit mode. | +| `tile_a_bytes`, `tile_b_bytes`, `tile_c_bytes` | no | transaction width | bytes | Tile step payloads per stream. | +| `tiles` | no | `1` | int | | +| `ab_c_schedule` | no | `A_B_C` | string | | +| `idle_cycles_between_tiles` | no | `0` | cycles | Inserts explicit tile-boundary idles. | +| `n_transactions` | conditional | derived | int | Can be derived from tile parameters/schedule. | + +Mutual exclusivity / precedence: +- Preferred: explicit A/B/C regions. +- If missing, generator falls back to splitting combined region context. + +### `hotspot_random` +Weighted random traffic across hot regions. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `hot_regions` | yes | none | array | Each entry: `base`, `size_bytes`, optional `weight` (default `1`). | +| `n_transactions` | no | weak fallback | int | Prefer setting explicitly. | +| `traffic_read_pct` | no | random R/W mix | % | | + +## All Sources of `req=0` Cycles +`req=0` can be generated by: + +1. `traffic_pct` shaping +- For every emitted request, inserts `idles_per_req = round((100-pct)/pct)` idle lines. +- Applies in random/linear and all new patterns that support `traffic_pct`. + +2. 
Boundary idle knobs (JSON) +- `idle_cycles_between_phases` in `2d`, `3d`, `matmul_phased` +- `idle_cycles_between_rows` in `rw_rowwise` +- `idle_cycles_between_tiles` in `matmul_tiled_interleave` + +3. `start_delay_cycles` (per master) +- Prepends idle lines before first pattern. + +4. Dependency gate for `wait_for_jobs` +- For each dependent pattern, generator inserts a synthetic idle+`PAUSE` gate before real traffic. + +5. `idle` pattern +- Explicitly emits idle and `PAUSE`. + +## Read/Write Blocked-Set Functionality +Address filtering is implemented inside pattern generators via: +- `_is_allowed(add, wen, read_blocked_set, write_blocked_set)` +- `_record_access(add, wen, read_blocked_set, write_blocked_set)` + +Behavior (within one pattern invocation): +- Read checks (`wen=1`) consult `read_blocked_set`. +- Write checks (`wen=0`) consult `write_blocked_set`. +- On every emitted access, the address is added to `write_blocked_set`. +- On emitted writes only, the address is also added to `read_blocked_set`. + +Effective policy: +- read after read: allowed +- write after read: blocked +- read after write: blocked +- write after write: blocked + +Notes: +- Blocking state is pattern-local (it does not persist across patterns). +- Generators are strict about transaction count: each non-idle pattern must emit exactly `n_transactions` (`N_TEST`). +- If blocking rules make the requested count unreachable for a pattern, generation fails with an explicit error instead of silently under-emitting. + +## Outputs + +### 1. 
Stimuli vectors +Path: +- `target/verif/simvectors/generated/stimuli/master_log_.txt` +- `target/verif/simvectors/generated/stimuli/master_hwpe_.txt` + +Per-cycle vector line format: +- `req id wen data add` +- `req`: `1` active request, `0` idle +- `id`: request ID (`IW` bits) +- `wen`: `1` read, `0` write +- `data`: payload (`DATA_WIDTH` bits for narrow, `HWPE_WIDTH_FACT*DATA_WIDTH` for wide) +- `add`: byte address (`ADD_WIDTH` bits) + +Fence token: +- A standalone line `PAUSE` is emitted at end of each pattern segment. + +### 2. Memory map report +Path: +- `target/verif/simvectors/generated/memory_map.txt` + +Contains: +- per-pattern region and traffic summary +- dependency/fence map +- temporal schedule summary +- region lifetimes and overlaps context + +### 3. Dataflow visualization +Path: +- `target/verif/simvectors/generated/dataflow.html` + +Contains: +- execution timeline (SVG) +- region-map blocks +- per-region usage cards +- overlap table + +### 4. Optional outputs +- `--golden`: emits expected read-data vectors under `generated/golden/` +- `--emit_phases_mk `: emits fence/dependency Makefile fragment + +## Recommended Extra Documentation +- one minimal JSON example per pattern +- exact dependency semantics for `job` / `wait_for_jobs` with 2-3 pattern chain examples +- known caveat: timeline model in report is simplified and may differ from full RTL contention timing diff --git a/target/verif/simvectors/hci_stimuli/patterns.py b/target/verif/simvectors/hci_stimuli/patterns.py index afc6633..cf080fe 100644 --- a/target/verif/simvectors/hci_stimuli/patterns.py +++ b/target/verif/simvectors/hci_stimuli/patterns.py @@ -22,7 +22,6 @@ import random - class PatternsMixin: @staticmethod @@ -56,21 +55,128 @@ def _idles_per_req(traffic_pct): traffic_pct = max(1, min(100, int(traffic_pct))) return 0 if traffic_pct >= 100 else int(round((100 - traffic_pct) / traffic_pct)) - def _is_allowed(self, add, wen, fr, fw): - return add not in (fr if wen else fw) + def 
_is_allowed(self, add, wen, read_blocked_set, write_blocked_set): + return add not in (read_blocked_set if wen else write_blocked_set) + + def _record_access(self, add, wen, read_blocked_set, write_blocked_set): + write_blocked_set.add(add) + if not wen: + read_blocked_set.add(add) + + @staticmethod + def _init_blocked_sets(read_blocked, write_blocked): + return set(read_blocked or []), set(write_blocked or []) + + @staticmethod + def _extend_unique_sorted(target, values): + if not isinstance(target, list): + return + known = set(target) + for v in sorted(values): + if v in known: + continue + target.append(v) + known.add(v) - def _record_access(self, add, wen, fr_new, fw_new): - fw_new.append(add) - if not wen: fr_new.append(add) + def _commit_blocked_sets(self, read_blocked, write_blocked, read_blocked_set, write_blocked_set): + self._extend_unique_sorted(read_blocked, read_blocked_set) + self._extend_unique_sorted(write_blocked, write_blocked_set) + + def _require_exact_emits(self, pattern_name, id_start, id_value): + emitted = int(id_value - id_start) + expected = int(self.N_TEST) + if emitted == expected: + return + raise RuntimeError( + f"{pattern_name}: emitted {emitted} transaction(s), expected {expected}. " + "Adjust region/shape/traffic to satisfy the read/write blocked policy." 
+ ) def _open(self, append): return open(self.filepath, "a" if append else "w", encoding="ascii") + def _total_mem_bytes(self): + return int(self.TOT_MEM_SIZE * 1024) + + def _normalize_addr(self, addr): + total = self._total_mem_bytes() + if total <= self.WIDTH_OF_MEMORY_BYTE: + return 0 + max_addr = total - self.WIDTH_OF_MEMORY_BYTE + a = int(addr) % total + if a > max_addr: + a = max_addr + return a + + @staticmethod + def _parse_read_write_schedule(schedule, default="4read_1write"): + raw = str(schedule if schedule is not None else default).strip().lower() + if not raw: + raw = default + tokens = [] + for chunk in raw.replace("-", "_").split("_"): + c = chunk.strip() + if not c: + continue + count = 1 + word = c + if c[0].isdigit(): + i = 0 + while i < len(c) and c[i].isdigit(): + i += 1 + count = max(1, int(c[:i])) + word = c[i:] + elif c[-1].isdigit(): + i = len(c) - 1 + while i >= 0 and c[i].isdigit(): + i -= 1 + count = max(1, int(c[i + 1:])) + word = c[:i + 1] + word = word.strip() + if word in {"read", "r"}: + tokens.extend(["R"] * count) + elif word in {"write", "w"}: + tokens.extend(["W"] * count) + if not tokens: + return ["R", "R", "R", "R", "W"] + return tokens + + @staticmethod + def _parse_abc_schedule(schedule, default="A_B_C"): + raw = str(schedule if schedule is not None else default).strip().upper() + if not raw: + raw = default + tokens = [] + for chunk in raw.replace("-", "_").split("_"): + c = chunk.strip() + if not c: + continue + count = 1 + letter = c + if c[0].isdigit(): + i = 0 + while i < len(c) and c[i].isdigit(): + i += 1 + count = max(1, int(c[:i])) + letter = c[i:] + elif c[-1].isdigit(): + i = len(c) - 1 + while i >= 0 and c[i].isdigit(): + i -= 1 + count = max(1, int(c[i + 1:])) + letter = c[:i + 1] + letter = letter.strip().upper() + if letter in {"A", "B", "C"}: + tokens.extend([letter] * count) + if not tokens: + return ["A", "B", "C"] + return tokens + # ------------------------------------------------------------------ # # 
Access patterns — each writes: transactions | PAUSE # # ------------------------------------------------------------------ # - def random_gen(self, id_start, forbidden_read, forbidden_write, + def random_gen(self, id_start, read_blocked, write_blocked, region_base=0, region_size=None, traffic_pct=100, traffic_read_pct=None, append=False): total = int(self.TOT_MEM_SIZE * 1024) @@ -84,24 +190,32 @@ def random_gen(self, id_start, forbidden_read, forbidden_write, wen_seq = [1]*n_reads + [0]*(self.N_TEST-n_reads) else: wen_seq = None - id_value = id_start; fr_new, fw_new = [], [] + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + max_attempts = max(1, n_words * 4) with self._open(append) as f: for i in range(self.N_TEST): wen = wen_seq[i] if wen_seq is not None else None if wen is None: data, wen = self.data_wen() else: data = "0"*self.DATA_WIDTH if wen else self.random_data() - while True: + placed = False + for _ in range(max_attempts): ad = region_base + random.randint(0, int(n_words)-1)*self.WIDTH_OF_MEMORY_BYTE add = bin(int(ad))[2:].zfill(self.ADD_WIDTH) - if self._is_allowed(add, wen, forbidden_read, forbidden_write): - self._record_access(add, wen, fr_new, fw_new); break + if self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + self._record_access(add, wen, read_blocked_set, write_blocked_set) + placed = True + break + if not placed: + continue self._write_req(f, id_value, wen, data, add); id_value += 1 for _ in range(n_idles): self._write_idle(f) self._write_pause(f) - forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("random", id_start, id_value) return id_value - def linear_gen(self, stride0, start_address, id_start, forbidden_read, forbidden_write, + def linear_gen(self, stride0, start_address, id_start, read_blocked, write_blocked, 
traffic_pct=100, traffic_read_pct=None, append=False): n_idles = self._idles_per_req(traffic_pct) if traffic_read_pct is not None: @@ -110,7 +224,8 @@ def linear_gen(self, stride0, start_address, id_start, forbidden_read, forbidden wen_seq = [1]*n_reads + [0]*(self.N_TEST-n_reads) else: wen_seq = None - id_value = id_start; fr_new, fw_new = [], [] + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) with self._open(append) as f: addr = self._parse_address(start_address) if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: @@ -123,42 +238,50 @@ def linear_gen(self, stride0, start_address, id_start, forbidden_read, forbidden addr += self.WIDTH_OF_MEMORY_BYTE * stride0 if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: addr -= self.TOT_MEM_SIZE*1024 - if not self._is_allowed(add, wen, forbidden_read, forbidden_write): continue - self._record_access(add, wen, fr_new, fw_new) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) self._write_req(f, id_value, wen, data, add); id_value += 1 for _ in range(n_idles): self._write_idle(f) self._write_pause(f) - forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("linear", id_start, id_value) return id_value def gen_2d(self, stride0, len_d0, stride1, start_address, id_start, - forbidden_read, forbidden_write, idle_cycles_between_phases=0, append=False): - id_value = id_start; fr_new, fw_new = [], [] + read_blocked, write_blocked, idle_cycles_between_phases=0, append=False): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) with self._open(append) as f: base = self._parse_address(start_address); j = 0 while id_value - id_start < self.N_TEST: + emitted_before 
= id_value for i in range(len_d0): data, wen = self.data_wen() addr = base + i*self.WIDTH_OF_MEMORY_BYTE*stride0 + j*self.WIDTH_OF_MEMORY_BYTE*stride1 if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: addr -= self.TOT_MEM_SIZE*1024 add = bin(addr)[2:].zfill(self.ADD_WIDTH) - if not self._is_allowed(add, wen, forbidden_read, forbidden_write): continue - self._record_access(add, wen, fr_new, fw_new) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) self._write_req(f, id_value, wen, data, add); id_value += 1 if id_value - id_start >= self.N_TEST: break for _ in range(idle_cycles_between_phases): self._write_idle(f) + if id_value == emitted_before: + break j += 1 self._write_pause(f) - forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("2d", id_start, id_value) return id_value def gen_3d(self, stride0, len_d0, stride1, len_d1, stride2, start_address, id_start, - forbidden_read, forbidden_write, idle_cycles_between_phases=0, append=False): - id_value = id_start; fr_new, fw_new = [], [] + read_blocked, write_blocked, idle_cycles_between_phases=0, append=False): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) with self._open(append) as f: base = self._parse_address(start_address); k = 0 while id_value - id_start < self.N_TEST: + emitted_before = id_value for j in range(len_d1): for i in range(len_d0): data, wen = self.data_wen() @@ -166,15 +289,18 @@ def gen_3d(self, stride0, len_d0, stride1, len_d1, stride2, start_address, id_st if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: addr -= self.TOT_MEM_SIZE*1024 add = bin(addr)[2:].zfill(self.ADD_WIDTH) - if not self._is_allowed(add, wen, forbidden_read, forbidden_write): continue - 
self._record_access(add, wen, fr_new, fw_new) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) self._write_req(f, id_value, wen, data, add); id_value += 1 if id_value - id_start >= self.N_TEST: break if id_value - id_start >= self.N_TEST: break for _ in range(idle_cycles_between_phases): self._write_idle(f) + if id_value == emitted_before: + break k += 1 self._write_pause(f) - forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("3d", id_start, id_value) return id_value def idle_gen(self, id_start, append=False): @@ -183,7 +309,7 @@ def idle_gen(self, id_start, append=False): self._write_pause(f) return id_start - def matmul_phased_gen(self, id_start, forbidden_read, forbidden_write, + def matmul_phased_gen(self, id_start, read_blocked, write_blocked, region_base_address, region_size_bytes, matmul_ratio_a=1, matmul_ratio_b=1, matmul_ratio_c=1, traffic_pct=100, @@ -192,7 +318,8 @@ def matmul_phased_gen(self, id_start, forbidden_read, forbidden_write, region_base_address_b=None, region_size_bytes_b=None, region_base_address_c=None, region_size_bytes_c=None, append=False): - id_value = id_start; fr_new, fw_new = [], [] + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) ab = max(1, self.DATA_WIDTH // 8); tm = int(self.TOT_MEM_SIZE * 1024) n_idles = self._idles_per_req(traffic_pct) @@ -213,6 +340,7 @@ def _res(bo, so, fb, fs): rw = size // ab if rw < 3: with self._open(append) as f: self._write_idle(f); self._write_pause(f) + self._require_exact_emits("matmul_phased", id_start, id_value) return id_value aw=max(1,rw//3); bw=max(1,rw//3); cw=rw-aw-bw a_base=base; a_size=aw*ab; b_base=a_base+a_size; b_size=bw*ab @@ -226,8 +354,13 @@ def _emit(fobj, count, wen, pb, pe): for _ in 
range(count): data = "0"*self.DATA_WIDTH if wen else self.random_data() add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + addr += ab + if addr >= pe: + addr = pb + continue self._write_req(fobj, id_value, wen, data, add) - self._record_access(add, wen, fr_new, fw_new) + self._record_access(add, wen, read_blocked_set, write_blocked_set) id_value += 1; addr += ab if addr >= pe: addr = pb for _ in range(n_idles): self._write_idle(fobj) @@ -242,5 +375,476 @@ def _emit(fobj, count, wen, pb, pe): _emit(f, cc, 0, c_base, c_base+c_size) self._write_pause(f) - forbidden_read.extend(fr_new); forbidden_write.extend(fw_new) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("matmul_phased", id_start, id_value) + return id_value + + def multi_linear_gen( + self, + id_start, + read_blocked, + write_blocked, + regions, + schedule="round_robin", + burst_len=1, + traffic_pct=100, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + burst = max(1, int(burst_len)) + + norm_regions = [] + for reg in regions or []: + base = self._align_down(int(reg.get("base", 0)), ab) + size = self._align_down(int(reg.get("size_bytes", 0)), ab) + if size <= 0: + continue + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + if size <= 0: + continue + stride_words = max(1, int(reg.get("stride_words", 1))) + read_pct = reg.get("read_pct") + if read_pct is not None: + read_pct = max(0, min(100, int(read_pct))) + norm_regions.append({ + "base": base, + "size": size, + "stride_words": stride_words, + "read_pct": read_pct, + "offset": 0, + }) + + if not norm_regions: + with self._open(append) as f: + self._write_idle(f) + 
self._write_pause(f) + self._require_exact_emits("multi_linear", id_start, id_value) + return id_value + + rr = 0 + stalled_rounds = 0 + with self._open(append) as f: + while id_value - id_start < self.N_TEST: + emitted_before = id_value + reg = norm_regions[rr % len(norm_regions)] + rr += 1 + chunk = burst if str(schedule).strip().lower() == "round_robin" else max(1, self.N_TEST) + for _ in range(chunk): + if id_value - id_start >= self.N_TEST: + break + addr = reg["base"] + reg["offset"] + add = bin(self._normalize_addr(addr))[2:].zfill(self.ADD_WIDTH) + if reg["read_pct"] is None: + data, wen = self.data_wen() + else: + wen = 1 if random.randint(1, 100) <= reg["read_pct"] else 0 + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + step = reg["stride_words"] * ab + reg["offset"] = (reg["offset"] + step) % reg["size"] + if id_value == emitted_before: + stalled_rounds += 1 + if stalled_rounds >= len(norm_regions): + break + else: + stalled_rounds = 0 + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("multi_linear", id_start, id_value) + return id_value + + def bank_group_linear_gen( + self, + id_start, + read_blocked, + write_blocked, + start_bank, + bank_group_span, + stride_beats=1, + bank_group_hop=0, + wen=None, + traffic_pct=100, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + span = max(1, min(int(bank_group_span), int(self.N_BANKS))) + start_bank = int(start_bank) % max(1, int(self.N_BANKS)) + stride = max(1, 
int(stride_beats)) + hop = max(0, int(bank_group_hop)) + + with self._open(append) as f: + for tx in range(self.N_TEST): + phase = tx * stride + group_idx = phase // span + bank_base = (start_bank + group_idx * hop * span) % self.N_BANKS + bank = (bank_base + (phase % span)) % self.N_BANKS + row = group_idx + word_idx = row * self.N_BANKS + bank + addr = self._normalize_addr(word_idx * ab) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if wen is None: + data, wen_cur = self.data_wen() + else: + wen_cur = 1 if int(wen) else 0 + data = "0" * self.DATA_WIDTH if wen_cur else self.random_data() + if not self._is_allowed(add, wen_cur, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen_cur, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen_cur, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("bank_group_linear", id_start, id_value) + return id_value + + def rw_rowwise_gen( + self, + id_start, + read_blocked, + write_blocked, + row_base_address, + row_size_bytes, + n_rows, + row_stride_bytes, + reads_per_row, + writes_per_row, + traffic_pct=100, + idle_cycles_between_rows=0, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + n_idles = self._idles_per_req(traffic_pct) + base = self._align_down(int(row_base_address), ab) + row_size = max(ab, self._align_down(int(row_size_bytes), ab)) + row_stride = max(ab, self._align_down(int(row_stride_bytes), ab)) + n_rows = max(0, int(n_rows)) + reads_per_row = max(0, int(reads_per_row)) + writes_per_row = max(0, int(writes_per_row)) + + with self._open(append) as f: + for r in range(n_rows): + if id_value - id_start >= self.N_TEST: + break + row_base = self._normalize_addr(base + r * row_stride) + 
for i in range(reads_per_row): + if id_value - id_start >= self.N_TEST: + break + addr = self._normalize_addr(row_base + (i * ab) % row_size) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + wen = 1 + data = "0" * self.DATA_WIDTH + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + for i in range(writes_per_row): + if id_value - id_start >= self.N_TEST: + break + addr = self._normalize_addr(row_base + (i * ab) % row_size) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + wen = 0 + data = self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + if r < n_rows - 1: + for _ in range(max(0, int(idle_cycles_between_rows))): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("rw_rowwise", id_start, id_value) + return id_value + + def gather_scatter_gen( + self, + id_start, + read_blocked, + write_blocked, + read_regions, + write_region, + chunk_bytes=0, + schedule="4read_1write", + traffic_pct=100, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + chunk_val = ab if chunk_bytes is None else int(chunk_bytes) + step = max(ab, self._align_down(chunk_val if chunk_val > 0 else ab, ab)) + tokens = self._parse_read_write_schedule(schedule) + + reads = [] + for reg in read_regions or []: + base = self._align_down(int(reg.get("base", 0)), ab) 
+ size = self._align_down(int(reg.get("size_bytes", 0)), ab) + if size <= 0: + continue + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + if size <= 0: + continue + reads.append({"base": base, "size": size, "offset": 0}) + + wb = self._align_down(int((write_region or {}).get("base", 0)), ab) + ws = self._align_down(int((write_region or {}).get("size_bytes", 0)), ab) + if wb >= tm: + wb %= tm + if wb + ws > tm: + ws = self._align_down(tm - wb, ab) + + if not reads and ws <= 0: + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + self._require_exact_emits("gather_scatter", id_start, id_value) + return id_value + + read_rr = 0 + token_idx = 0 + write_offset = 0 + max_no_progress = max(32, len(tokens) * max(1, len(reads) + (1 if ws > 0 else 0))) + no_progress_iters = 0 + with self._open(append) as f: + while id_value - id_start < self.N_TEST: + token = tokens[token_idx % len(tokens)] + token_idx += 1 + wen = 1 if token == "R" else 0 + if token == "R" and reads: + reg = reads[read_rr % len(reads)] + read_rr += 1 + addr = self._normalize_addr(reg["base"] + reg["offset"]) + reg["offset"] = (reg["offset"] + step) % reg["size"] + elif ws > 0: + addr = self._normalize_addr(wb + write_offset) + write_offset = (write_offset + step) % ws + elif reads: + reg = reads[read_rr % len(reads)] + read_rr += 1 + wen = 1 + addr = self._normalize_addr(reg["base"] + reg["offset"]) + reg["offset"] = (reg["offset"] + step) % reg["size"] + else: + break + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + no_progress_iters += 1 + if no_progress_iters >= max_no_progress: + break + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + no_progress_iters = 0 + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + 
self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("gather_scatter", id_start, id_value) + return id_value + + def matmul_tiled_interleave_gen( + self, + id_start, + read_blocked, + write_blocked, + region_base_address_a, + region_size_bytes_a, + region_base_address_b, + region_size_bytes_b, + region_base_address_c, + region_size_bytes_c, + tile_a_bytes=0, + tile_b_bytes=0, + tile_c_bytes=0, + tiles=1, + ab_c_schedule="A_B_C", + traffic_pct=100, + idle_cycles_between_tiles=0, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + tile_idle = max(0, int(idle_cycles_between_tiles)) + tokens = self._parse_abc_schedule(ab_c_schedule) + + def _res(base_raw, size_raw): + base = self._align_down(int(base_raw), ab) + size = self._align_down(int(size_raw), ab) + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + return base, size + + a_base, a_size = _res(region_base_address_a, region_size_bytes_a) + b_base, b_size = _res(region_base_address_b, region_size_bytes_b) + c_base, c_size = _res(region_base_address_c, region_size_bytes_c) + if a_size <= 0 or b_size <= 0 or c_size <= 0: + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + self._require_exact_emits("matmul_tiled_interleave", id_start, id_value) + return id_value + + ta = ab if tile_a_bytes is None else int(tile_a_bytes) + tb = ab if tile_b_bytes is None else int(tile_b_bytes) + tc = ab if tile_c_bytes is None else int(tile_c_bytes) + cnt_a = max(1, self._align_down(ta if ta > 0 else ab, ab) // ab) + cnt_b = max(1, self._align_down(tb if tb > 0 else ab, ab) // ab) + cnt_c = max(1, self._align_down(tc if tc > 0 else ab, ab) // ab) + counts = {"A": cnt_a, "B": cnt_b, 
"C": cnt_c} + ptr = {"A": 0, "B": 0, "C": 0} + base = {"A": a_base, "B": b_base, "C": c_base} + size = {"A": a_size, "B": b_size, "C": c_size} + max_tiles = max(1, int(tiles)) + + with self._open(append) as f: + tile_idx = 0 + stalled_tiles = 0 + while id_value - id_start < self.N_TEST: + emitted_before = id_value + for tok in tokens: + for _ in range(counts[tok]): + if id_value - id_start >= self.N_TEST: + break + addr = self._normalize_addr(base[tok] + ptr[tok]) + ptr[tok] = (ptr[tok] + ab) % size[tok] + wen = 0 if tok == "C" else 1 + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + tile_idx += 1 + if tile_idle > 0 and id_value - id_start < self.N_TEST: + for _ in range(tile_idle): + self._write_idle(f) + if tile_idx >= max_tiles: + tile_idx = 0 + if id_value == emitted_before: + stalled_tiles += 1 + if stalled_tiles >= max_tiles: + break + else: + stalled_tiles = 0 + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("matmul_tiled_interleave", id_start, id_value) + return id_value + + def hotspot_random_gen( + self, + id_start, + read_blocked, + write_blocked, + hot_regions, + traffic_pct=100, + traffic_read_pct=None, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + + regions = [] + weights = [] + for reg in hot_regions or []: + base = self._align_down(int(reg.get("base", 0)), ab) + size = self._align_down(int(reg.get("size_bytes", 0)), ab) + weight = max(1, 
int(reg.get("weight", 1))) + if size <= 0: + continue + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + if size <= 0: + continue + regions.append({"base": base, "size": size}) + weights.append(weight) + + if not regions: + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + self._require_exact_emits("hotspot_random", id_start, id_value) + return id_value + + if traffic_read_pct is not None: + rpct = max(0, min(100, int(traffic_read_pct))) + n_reads = (self.N_TEST * rpct) // 100 + wen_seq = [1] * n_reads + [0] * (self.N_TEST - n_reads) + else: + wen_seq = None + + with self._open(append) as f: + for i in range(self.N_TEST): + reg = random.choices(regions, weights=weights, k=1)[0] + n_words = max(1, reg["size"] // ab) + ad = reg["base"] + random.randint(0, n_words - 1) * ab + addr = self._normalize_addr(ad) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + wen = wen_seq[i] if wen_seq is not None else None + if wen is None: + data, wen = self.data_wen() + else: + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("hotspot_random", id_start, id_value) return id_value diff --git a/target/verif/simvectors/html_report.py b/target/verif/simvectors/html_report.py index 98a2b1a..5784714 100644 --- a/target/verif/simvectors/html_report.py +++ b/target/verif/simvectors/html_report.py @@ -106,7 +106,7 @@ def _outside_detail_lines(node, base_addr, size_kib): regs = list(node.get('regions', [])) label_map = {str(r.get('label', '')): r for r in regs} has_matmul_regs = any(k in label_map for k in 
('A(read)', 'B(read)', 'C(write)')) - is_matmul = node.get('mem_access_type') in {'matmul', 'matmul_phased'} or has_matmul_regs + is_matmul = node.get('mem_access_type') in {'matmul', 'matmul_phased', 'matmul_tiled_interleave'} or has_matmul_regs if is_matmul: mat_lines = [] diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index e2dd321..190d0bd 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -84,7 +84,7 @@ def main(argv=None): hwpe_masters = workload_config['hwpe_masters'] # Derived parameters - ADD_WIDTH = math.ceil(math.log2(TOT_MEM_SIZE * 1000)) + ADD_WIDTH = math.ceil(math.log2(TOT_MEM_SIZE * 1024)) N_LOG = N_CORE + N_DMA + N_EXT N_LOG_CFG = N_LOG IW = 8 @@ -111,7 +111,7 @@ def _narrow_driver_name(local_idx: int) -> str: if N_LOG + N_HWPE < 1: print("ERROR: the number of masters must be > 0") sys.exit(1) - n_words = (TOT_MEM_SIZE * 1000 / N_BANKS) / (DATA_WIDTH / 8) + n_words = (TOT_MEM_SIZE * 1024 / N_BANKS) / (DATA_WIDTH / 8) if not n_words.is_integer(): print("ERROR: the number of words is not an integer value") sys.exit(1) @@ -154,8 +154,6 @@ def _create_idle_file(path: Path, data_width: int): _create_idle_file(stimuli_dir / 'master_hwpe_0.txt', HWPE_WIDTH_FACT * DATA_WIDTH) next_start_id = 0 - LIST_OF_FORBIDDEN_ADDRESSES_WRITE = [] - LIST_OF_FORBIDDEN_ADDRESSES_READ = [] # Memory map entries collected during generation, printed at the end memory_map_entries = [] @@ -233,6 +231,135 @@ def _record_memory_map(kind, local_idx, description, config, n_test, data_width, idle_between = master_config.get('idle_cycles_between_phases', 0) if idle_between: detail['idle_between_phases'] = f"{idle_between} cycles" + elif config == 'multi_linear': + regs = master_config.get('regions', []) or [] + detail['schedule'] = str(master_config.get('schedule', 'round_robin')) + detail['burst_len'] = int(master_config.get('burst_len', 1)) + for idx, reg in enumerate(regs): + base = 
_parse_maybe_bin_int(reg.get('base'), 0) + size = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + stride_w = int(reg.get('stride_words', 1)) + rpct = reg.get('read_pct') + rpct_txt = f", read={int(rpct)}%" if rpct is not None else "" + detail[f"region_{idx}"] = ( + f"0x{base:08x} - 0x{base + max(0, size) - 1:08x} " + f"({size} B, stride={stride_w} words{rpct_txt})" + ) + if regs: + first_addr = _parse_maybe_bin_int(regs[0].get('base'), 0) + last_reg = regs[-1] + lb = _parse_maybe_bin_int(last_reg.get('base'), 0) + ls = _parse_maybe_bin_int(last_reg.get('size_bytes'), 0) + last_addr = lb + max(0, ls) - access_bytes + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'bank_group_linear': + span = max(1, int(master_config.get('bank_group_span', 1))) + start_bank = int(master_config.get('start_bank', 0)) % max(1, int(N_BANKS)) + stride_beats = max(1, int(master_config.get('stride_beats', 1))) + first_addr = start_bank * access_bytes + phase = max(0, n_test - 1) * stride_beats + group_idx = phase // span + bank = (start_bank + (phase % span)) % max(1, int(N_BANKS)) + last_addr = (group_idx * N_BANKS + bank) * access_bytes + last_addr = last_addr % total_mem_bytes + detail['start_bank'] = start_bank + detail['bank_group_span'] = span + detail['stride_beats'] = stride_beats + if 'bank_group_hop' in master_config: + detail['bank_group_hop'] = int(master_config.get('bank_group_hop', 0)) + if 'wen' in master_config: + detail['wen'] = int(master_config.get('wen', 1)) + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'rw_rowwise': + row_base = 
_parse_maybe_bin_int(master_config.get('row_base_address'), region_base) + row_size = _parse_maybe_bin_int(master_config.get('row_size_bytes'), access_bytes) + n_rows = max(0, int(master_config.get('n_rows', 0))) + row_stride = _parse_maybe_bin_int(master_config.get('row_stride_bytes'), row_size) + rpr = max(0, int(master_config.get('reads_per_row', 0))) + wpr = max(0, int(master_config.get('writes_per_row', 0))) + first_addr = row_base + last_addr = row_base + max(0, n_rows - 1) * row_stride + max(0, row_size - access_bytes) + last_addr = last_addr % total_mem_bytes + detail['rows'] = f"n_rows={n_rows}, row_size={row_size} B, row_stride={row_stride} B" + detail['per_row'] = f"reads={rpr}, writes={wpr}" + idle_between = int(master_config.get('idle_cycles_between_rows', 0)) + if idle_between: + detail['idle_between_rows'] = f"{idle_between} cycles" + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'gather_scatter': + rr = master_config.get('read_regions', []) or [] + wr = master_config.get('write_region', {}) or {} + for idx, reg in enumerate(rr): + b = _parse_maybe_bin_int(reg.get('base'), 0) + s = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + detail[f"read_region_{idx}"] = f"0x{b:08x} - 0x{b + max(0, s) - 1:08x} ({s} B)" + wb = _parse_maybe_bin_int(wr.get('base'), 0) + ws = _parse_maybe_bin_int(wr.get('size_bytes'), 0) + detail['write_region'] = f"0x{wb:08x} - 0x{wb + max(0, ws) - 1:08x} ({ws} B)" + detail['schedule'] = str(master_config.get('schedule', '4read_1write')) + detail['chunk_bytes'] = int(_parse_maybe_bin_int(master_config.get('chunk_bytes'), access_bytes)) + if rr: + first_addr = _parse_maybe_bin_int(rr[0].get('base'), 0) + else: + first_addr = wb + last_addr = wb + max(0, ws) - access_bytes if ws > 0 else first_addr + tpct = 
master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'matmul_tiled_interleave': + ra = _parse_maybe_bin_int(master_config.get('region_base_address_a'), region_base) + sa = _parse_maybe_bin_int(master_config.get('region_size_bytes_a'), region_size // 3) + rb = _parse_maybe_bin_int(master_config.get('region_base_address_b'), ra + sa) + sb = _parse_maybe_bin_int(master_config.get('region_size_bytes_b'), region_size // 3) + rc = _parse_maybe_bin_int(master_config.get('region_base_address_c'), rb + sb) + sc = _parse_maybe_bin_int(master_config.get('region_size_bytes_c'), region_size - max(0, sa) - max(0, sb)) + detail['matrix_A (read)'] = f"0x{ra:08x} - 0x{ra + max(0, sa) - access_bytes:08x} ({sa} B)" + detail['matrix_B (read)'] = f"0x{rb:08x} - 0x{rb + max(0, sb) - access_bytes:08x} ({sb} B)" + detail['matrix_C (write)'] = f"0x{rc:08x} - 0x{rc + max(0, sc) - access_bytes:08x} ({sc} B)" + detail['tile_bytes'] = ( + f"A={int(_parse_maybe_bin_int(master_config.get('tile_a_bytes'), access_bytes))}, " + f"B={int(_parse_maybe_bin_int(master_config.get('tile_b_bytes'), access_bytes))}, " + f"C={int(_parse_maybe_bin_int(master_config.get('tile_c_bytes'), access_bytes))}" + ) + detail['tiles'] = int(master_config.get('tiles', 1)) + detail['ab_c_schedule'] = str(master_config.get('ab_c_schedule', 'A_B_C')) + idle_tiles = int(master_config.get('idle_cycles_between_tiles', 0)) + if idle_tiles: + detail['idle_between_tiles'] = f"{idle_tiles} cycles" + first_addr = ra + last_addr = rc + max(0, sc) - access_bytes + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'hotspot_random': + 
hrs = master_config.get('hot_regions', []) or [] + for idx, reg in enumerate(hrs): + b = _parse_maybe_bin_int(reg.get('base'), 0) + s = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + w = int(reg.get('weight', 1)) + detail[f"hot_region_{idx}"] = f"0x{b:08x} - 0x{b + max(0, s) - 1:08x} ({s} B, weight={w})" + if hrs: + first_addr = _parse_maybe_bin_int(hrs[0].get('base'), 0) + lb = _parse_maybe_bin_int(hrs[-1].get('base'), 0) + ls = _parse_maybe_bin_int(hrs[-1].get('size_bytes'), 0) + last_addr = lb + max(0, ls) - access_bytes + tpct = master_config.get('traffic_pct') + if tpct is not None: + rpct = master_config.get('traffic_read_pct', 50) + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + detail['read_pct'] = f"{rpct}%" elif config == 'linear': base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) first_addr = base @@ -295,8 +422,24 @@ def _parse_maybe_bin_int(raw_value, default_value): return default_value def _normalize_mem_access_type(raw_value, master_name): - allowed = {"random", "linear", "2d", "3d", "idle", "matmul_phased"} - aliases = {"matmul": "matmul_phased"} + allowed = { + "random", + "linear", + "2d", + "3d", + "idle", + "matmul_phased", + "multi_linear", + "bank_group_linear", + "rw_rowwise", + "gather_scatter", + "matmul_tiled_interleave", + "hotspot_random", + } + aliases = { + "matmul": "matmul_phased", + "matmul_tiled": "matmul_tiled_interleave", + } if not isinstance(raw_value, str): print( @@ -378,6 +521,60 @@ def _resolve_n_transactions(master_config: dict, mem_access_type: str, data_widt k = master_config.get('matrix_k') if m is not None and n is not None and k is not None: return int(m) * int(k) + int(k) * int(n) + int(m) * int(n) + elif mem_access_type == 'multi_linear': + total = 0 + for reg in master_config.get('regions', []) or []: + size_v = 
_parse_maybe_bin_int(reg.get('size_bytes'), 0) + total += max(0, int(size_v)) // access_bytes + if total > 0: + return total + elif mem_access_type == 'bank_group_linear': + print( + f"ERROR: {kind}_{local_idx} mem_access_type='bank_group_linear' " + "requires explicit 'n_transactions'." + ) + sys.exit(1) + elif mem_access_type == 'rw_rowwise': + n_rows = master_config.get('n_rows') + rpr = master_config.get('reads_per_row') + wpr = master_config.get('writes_per_row') + if n_rows is not None and rpr is not None and wpr is not None: + return max(0, int(n_rows)) * (max(0, int(rpr)) + max(0, int(wpr))) + elif mem_access_type == 'gather_scatter': + chunk = _parse_maybe_bin_int(master_config.get('chunk_bytes'), access_bytes) + step = max(access_bytes, int(chunk) if chunk is not None else access_bytes) + total = 0 + for reg in master_config.get('read_regions', []) or []: + total += max(0, int(_parse_maybe_bin_int(reg.get('size_bytes'), 0))) // step + wr = master_config.get('write_region', {}) or {} + total += max(0, int(_parse_maybe_bin_int(wr.get('size_bytes'), 0))) // step + if total > 0: + return total + elif mem_access_type == 'matmul_tiled_interleave': + tiles = max(1, int(master_config.get('tiles', 1))) + sched = str(master_config.get('ab_c_schedule', 'A_B_C')).upper().replace('-', '_') + toks = [t for t in sched.split('_') if t] + if not toks: + toks = ['A', 'B', 'C'] + cnt_a = max(1, int(_parse_maybe_bin_int(master_config.get('tile_a_bytes'), access_bytes)) // access_bytes) + cnt_b = max(1, int(_parse_maybe_bin_int(master_config.get('tile_b_bytes'), access_bytes)) // access_bytes) + cnt_c = max(1, int(_parse_maybe_bin_int(master_config.get('tile_c_bytes'), access_bytes)) // access_bytes) + per_tile = 0 + for t in toks: + if t == 'A': + per_tile += cnt_a + elif t == 'B': + per_tile += cnt_b + elif t == 'C': + per_tile += cnt_c + if per_tile > 0: + return tiles * per_tile + elif mem_access_type == 'hotspot_random': + total = 0 + for reg in 
master_config.get('hot_regions', []) or []: + total += max(0, int(_parse_maybe_bin_int(reg.get('size_bytes'), 0))) // access_bytes + if total > 0: + return total elif mem_access_type == 'idle': return 0 print(f"ERROR: {kind}_{local_idx} has mem_access_type='{mem_access_type}' but no " @@ -443,7 +640,7 @@ def _generate_pattern( # hold all transactions once at the current transaction width. region_size_input = pattern_config.get('region_size_bytes') if ( - config in {'linear', 'matmul_phased'} + config in {'linear', 'matmul_phased', 'matmul_tiled_interleave'} and region_size_input is None and 'n_transactions' in pattern_config ): @@ -462,47 +659,74 @@ def _generate_pattern( n_test = _resolve_n_transactions(pattern_config, config, data_width, kind, master_local_idx) master.N_TEST = n_test - # Keep forbidden-address filtering local to this pattern invocation. - # This allows intended buffer reuse across phases (e.g. double buffering) - # while still avoiding duplicates inside a single pattern generator call. - forbidden_read_local = list(LIST_OF_FORBIDDEN_ADDRESSES_READ) - forbidden_write_local = list(LIST_OF_FORBIDDEN_ADDRESSES_WRITE) + # Read/write blocked filtering is pattern-local only. 
+ read_blocked_local = [] + write_blocked_local = [] + tpct_raw = pattern_config.get('traffic_pct', 100) + tpct = 100 if tpct_raw is None else int(tpct_raw) + + multi_regions_cfg = [] + for reg in pattern_config.get('regions', []) or []: + multi_regions_cfg.append({ + 'base': _parse_maybe_bin_int(reg.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(reg.get('size_bytes'), 0), + 'stride_words': int(reg.get('stride_words', 1)), + 'read_pct': reg.get('read_pct'), + }) + + read_regions_cfg = [] + for reg in pattern_config.get('read_regions', []) or []: + read_regions_cfg.append({ + 'base': _parse_maybe_bin_int(reg.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(reg.get('size_bytes'), 0), + }) + wr_cfg_raw = pattern_config.get('write_region', {}) or {} + write_region_cfg = { + 'base': _parse_maybe_bin_int(wr_cfg_raw.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(wr_cfg_raw.get('size_bytes'), 0), + } + + hot_regions_cfg = [] + for reg in pattern_config.get('hot_regions', []) or []: + hot_regions_cfg.append({ + 'base': _parse_maybe_bin_int(reg.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(reg.get('size_bytes'), 0), + 'weight': int(reg.get('weight', 1)), + }) if config == 'random': - tpct = pattern_config.get('traffic_pct') next_start_id = master.random_gen( next_start_id, - forbidden_read_local, - forbidden_write_local, + read_blocked_local, + write_blocked_local, region_base=region_base, region_size=region_size, - traffic_pct=int(tpct) if tpct is not None else 100, + traffic_pct=tpct, traffic_read_pct=pattern_config.get('traffic_read_pct'), append=append, ) elif config == 'linear': - tpct = pattern_config.get('traffic_pct') next_start_id = master.linear_gen( stride0, start_address, next_start_id, - forbidden_read_local, - forbidden_write_local, - traffic_pct=int(tpct) if tpct is not None else 100, + read_blocked_local, + write_blocked_local, + traffic_pct=tpct, traffic_read_pct=pattern_config.get('traffic_read_pct'), append=append, ) elif config 
== '2d': next_start_id = master.gen_2d( stride0, len_d0, stride1, start_address, next_start_id, - forbidden_read_local, - forbidden_write_local, + read_blocked_local, + write_blocked_local, idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), append=append, ) elif config == '3d': next_start_id = master.gen_3d( stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, - forbidden_read_local, - forbidden_write_local, + read_blocked_local, + write_blocked_local, idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), append=append, ) @@ -523,8 +747,8 @@ def _generate_pattern( sys.exit(1) next_start_id = master.matmul_phased_gen( next_start_id, - forbidden_read_local, - forbidden_write_local, + read_blocked_local, + write_blocked_local, region_base, region_size, int(pattern_config.get('matmul_ratio_a', 1)), @@ -540,6 +764,105 @@ def _generate_pattern( region_size_bytes_c=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None), append=append, ) + elif config == 'multi_linear': + next_start_id = master.multi_linear_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + regions=multi_regions_cfg, + schedule=pattern_config.get('schedule', 'round_robin'), + burst_len=int(pattern_config.get('burst_len', 1)), + traffic_pct=tpct, + append=append, + ) + elif config == 'bank_group_linear': + next_start_id = master.bank_group_linear_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + start_bank=int(pattern_config.get('start_bank', 0)), + bank_group_span=int(pattern_config.get('bank_group_span', 1)), + stride_beats=int(pattern_config.get('stride_beats', 1)), + bank_group_hop=int(pattern_config.get('bank_group_hop', 0)), + wen=pattern_config.get('wen'), + traffic_pct=tpct, + append=append, + ) + elif config == 'rw_rowwise': + next_start_id = master.rw_rowwise_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + 
row_base_address=_parse_maybe_bin_int(pattern_config.get('row_base_address'), region_base), + row_size_bytes=_parse_maybe_bin_int(pattern_config.get('row_size_bytes'), access_bytes), + n_rows=int(pattern_config.get('n_rows', 1)), + row_stride_bytes=_parse_maybe_bin_int(pattern_config.get('row_stride_bytes'), access_bytes), + reads_per_row=int(pattern_config.get('reads_per_row', 0)), + writes_per_row=int(pattern_config.get('writes_per_row', 0)), + traffic_pct=tpct, + idle_cycles_between_rows=int(pattern_config.get('idle_cycles_between_rows', 0)), + append=append, + ) + elif config == 'gather_scatter': + next_start_id = master.gather_scatter_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + read_regions=read_regions_cfg, + write_region=write_region_cfg, + chunk_bytes=_parse_maybe_bin_int(pattern_config.get('chunk_bytes'), access_bytes), + schedule=pattern_config.get('schedule', '4read_1write'), + traffic_pct=tpct, + append=append, + ) + elif config == 'matmul_tiled_interleave': + ra = _parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None) + sb = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None) + if ra is None or sa is None or rb is None or sb is None or rc is None or sc is None: + # Fallback to split the combined region into A/B/C thirds. 
+ n_words = max(3, region_size // access_bytes) + a_words = max(1, n_words // 3) + b_words = max(1, n_words // 3) + c_words = max(1, n_words - a_words - b_words) + ra = region_base + sa = a_words * access_bytes + rb = ra + sa + sb = b_words * access_bytes + rc = rb + sb + sc = c_words * access_bytes + next_start_id = master.matmul_tiled_interleave_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + region_base_address_a=ra, + region_size_bytes_a=sa, + region_base_address_b=rb, + region_size_bytes_b=sb, + region_base_address_c=rc, + region_size_bytes_c=sc, + tile_a_bytes=_parse_maybe_bin_int(pattern_config.get('tile_a_bytes'), access_bytes), + tile_b_bytes=_parse_maybe_bin_int(pattern_config.get('tile_b_bytes'), access_bytes), + tile_c_bytes=_parse_maybe_bin_int(pattern_config.get('tile_c_bytes'), access_bytes), + tiles=int(pattern_config.get('tiles', 1)), + ab_c_schedule=pattern_config.get('ab_c_schedule', 'A_B_C'), + traffic_pct=tpct, + idle_cycles_between_tiles=int(pattern_config.get('idle_cycles_between_tiles', 0)), + append=append, + ) + elif config == 'hotspot_random': + next_start_id = master.hotspot_random_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + hot_regions=hot_regions_cfg, + traffic_pct=tpct, + traffic_read_pct=pattern_config.get('traffic_read_pct'), + append=append, + ) _record_memory_map( kind, master_local_idx, @@ -841,6 +1164,162 @@ def _resolve_regions(pattern_config, mem_access_type, is_hwpe, local_idx, n_peer if mem_access_type == 'idle': return [] + if mem_access_type == 'multi_linear': + regions = [] + for idx, reg in enumerate(pattern_config.get('regions', []) or []): + base = _parse_maybe_bin_int(reg.get('base'), region_base) + size = _parse_maybe_bin_int(reg.get('size_bytes'), region_size) + base = (base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, size) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = 
((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + continue + rpct = reg.get('read_pct') + if rpct is None: + lbl = f"R{idx}" + else: + lbl = f"R{idx}({'read' if int(rpct) >= 50 else 'write'})" + regions.append({ + 'label': lbl, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + if mem_access_type == 'bank_group_linear': + span = max(1, int(pattern_config.get('bank_group_span', 1))) + start_bank = int(pattern_config.get('start_bank', 0)) % max(1, int(N_BANKS)) + n_tx = _parse_maybe_bin_int(pattern_config.get('n_transactions'), 1) + n_tx = max(1, int(n_tx)) + rows = max(1, math.ceil(n_tx / span)) + size = min(total_mem_bytes, rows * span * access_bytes) + base = (start_bank * access_bytes) % max(1, total_mem_bytes) + if base + size > total_mem_bytes: + size = max(access_bytes, total_mem_bytes - base) + return [{ + 'label': 'bank_group', + 'base': base, + 'size': size, + 'end': base + size - 1, + }] + if mem_access_type == 'rw_rowwise': + row_base = _parse_maybe_bin_int(pattern_config.get('row_base_address'), region_base) + row_size = _parse_maybe_bin_int(pattern_config.get('row_size_bytes'), access_bytes) + n_rows = max(1, int(pattern_config.get('n_rows', 1))) + row_stride = _parse_maybe_bin_int(pattern_config.get('row_stride_bytes'), row_size) + base = (row_base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = ((max(0, row_stride) * max(0, n_rows - 1)) + max(0, row_size)) + size = (size // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + size = access_bytes + return [{ + 'label': 'rowwise', + 'base': base, + 'size': size, + 'end': base + size - 1, + }] + if mem_access_type == 'gather_scatter': + regions = [] + for idx, reg in enumerate(pattern_config.get('read_regions', []) or []): + base = _parse_maybe_bin_int(reg.get('base'), region_base) + size = 
_parse_maybe_bin_int(reg.get('size_bytes'), 0) + base = (base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, size) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + continue + regions.append({ + 'label': f"gather_{idx}(read)", + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + wr = pattern_config.get('write_region', {}) or {} + wb = _parse_maybe_bin_int(wr.get('base'), region_base) + ws = _parse_maybe_bin_int(wr.get('size_bytes'), 0) + wb = (wb // access_bytes) * access_bytes + if wb >= total_mem_bytes: + wb = wb % total_mem_bytes + ws = (max(0, ws) // access_bytes) * access_bytes + if wb + ws > total_mem_bytes: + ws = ((total_mem_bytes - wb) // access_bytes) * access_bytes + if ws > 0: + regions.append({ + 'label': 'scatter(write)', + 'base': wb, + 'size': ws, + 'end': wb + ws - 1, + }) + return regions + if mem_access_type == 'hotspot_random': + regions = [] + for idx, reg in enumerate(pattern_config.get('hot_regions', []) or []): + base = _parse_maybe_bin_int(reg.get('base'), region_base) + size = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + base = (base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, size) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + continue + regions.append({ + 'label': f"hot_{idx}", + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + if mem_access_type == 'matmul_tiled_interleave': + ra = _parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None) + sb = 
_parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None) + regions = [] + if ra is not None and sa is not None and rb is not None and sb is not None and rc is not None and sc is not None: + sub_defs = [ + ('A(read)', ra, sa), + ('B(read)', rb, sb), + ('C(write)', rc, sc), + ] + else: + n_words = max(3, region_size // access_bytes) + a_words = max(1, n_words // 3) + b_words = max(1, n_words // 3) + c_words = max(1, n_words - a_words - b_words) + sub_defs = [ + ('A(read)', region_base, a_words * access_bytes), + ('B(read)', region_base + a_words * access_bytes, b_words * access_bytes), + ('C(write)', region_base + (a_words + b_words) * access_bytes, c_words * access_bytes), + ] + for label, base_raw, size_raw in sub_defs: + base = (int(base_raw) // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, int(size_raw)) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size > 0: + regions.append({ + 'label': label, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions if mem_access_type != 'matmul_phased': return [{ @@ -907,38 +1386,17 @@ def _resolve_regions(pattern_config, mem_access_type, is_hwpe, local_idx, n_peer }) return regions - def _estimate_pattern_cycles(pattern_config, mem_access_type, n_test): - # Intentionally simple temporal model: - # - one time unit per transaction, plus optional req=0 idles from traffic shaping - # - no interconnect stall/conflict modeling - # - explicit start_delay_cycles is still honored separately + def _estimate_pattern_cycles(pattern_config, _mem_access_type, n_test, _txn_bytes): + # Temporal model intentionally follows emitted traffic only: + # one unit per transaction plus req=0 idles from 
traffic_pct shaping. + # No absolute/phase/tile/row cycle estimation is applied here. base = max(0, int(n_test)) tpct = pattern_config.get('traffic_pct') n_idles_per_req = 0 if tpct is not None: tp = max(1, min(100, int(tpct))) n_idles_per_req = 0 if tp >= 100 else int(round((100 - tp) / tp)) - cycles = base * (1 + n_idles_per_req) - # Keep explicit matmul phase-boundary idle modeling. - if mem_access_type == 'matmul_phased': - idle_between = int(pattern_config.get('idle_cycles_between_phases', 0)) - if idle_between > 0: - ra = max(0, int(pattern_config.get('matmul_ratio_a', 1))) - rb = max(0, int(pattern_config.get('matmul_ratio_b', 1))) - rc = max(0, int(pattern_config.get('matmul_ratio_c', 1))) - if ra == 0 and rb == 0 and rc == 0: - ra, rb, rc = 1, 1, 1 - s = ra + rb + rc - ca = (base * ra) // s - cb = (base * rb) // s - cc = base - ca - cb - phase_gaps = 0 - if ca > 0 and (cb > 0 or cc > 0): - phase_gaps += 1 - if cb > 0 and cc > 0: - phase_gaps += 1 - cycles += idle_between * phase_gaps - return int(cycles) + return int(base * (1 + n_idles_per_req)) pattern_nodes = [] node_idx_by_driver_pattern = {} @@ -971,7 +1429,7 @@ def _estimate_pattern_cycles(pattern_config, mem_access_type, n_test): 'wait_for_jobs_declared': declared_wait_for_jobs, 'wait_for_jobs_effective': effective_wait_for_jobs, 'n_transactions': int(n_test), - 'cycles': int(_estimate_pattern_cycles(pat, mem_access_type, n_test)), + 'cycles': int(_estimate_pattern_cycles(pat, mem_access_type, n_test, int(data_width // 8))), 'mem_access_type': mem_access_type, 'traffic_read_pct': pat.get('traffic_read_pct'), 'txn_bytes': int(data_width // 8), From a3e4008b271cd2fe808265bd6a88d6ac9fb486c4 Mon Sep 17 00:00:00 2001 From: Sergio Mazzola Date: Wed, 11 Mar 2026 19:04:36 +0100 Subject: [PATCH 25/25] verif: Improve documentation --- target/verif/simvectors/main.py | 22 ++++++++++++++- target/verif/src/application_driver.sv | 14 ++++++++-- target/verif/src/tb_hci.sv | 37 +++++++++++++++++--------- 
target/verif/src/tb_hci_pkg.sv | 29 ++++++++++++++------ target/verif/src/tcdm_banks_wrap.sv | 2 ++ 5 files changed, 81 insertions(+), 23 deletions(-) diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index 190d0bd..da740bb 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -4,7 +4,14 @@ JSON config files: workload, testbench and hardware. It produces cycle-accurate stimuli in `verif/simvectors/generated/stimuli`. -Each stimuli file has one line per simulation cycle: +Each stimuli file encodes an offered per-cycle request stream plus PAUSE fence tokens. + +Idle lines (req=0) represent intended issue gaps in the absence of backpressure. +The application driver may consume some idle entries while stalled on an earlier +request grant, and hide some memory/interconnect latency. So the file is not a +strict wall-clock replay under contention. + +Stimuli line format: req(1b) id(IWb) wen(1b) data(Nb) add(Ab) """ @@ -1089,6 +1096,19 @@ def _resolve_req_levels(wait_for_jobs_list): # FENCE_REQ_LEVELS[N_DRIVERS][MAX_FENCES][N_DRIVERS] — int unsigned # Pack FENCE_REQ_LEVELS as FENCE_REQ_LEVELS_PACKED[i][f] = N_DRIVERS*4-bit vector. # Bits [j*4+3:j*4] = required fence_idx[j] (4 bits, supports 0..15). + max_req_level = 0 + for i in range(N_DRIVERS): + for f in range(max_fences): + for j in range(N_DRIVERS): + max_req_level = max(max_req_level, int(req_levels[i][f][j])) + if max_req_level > 15: + print( + "ERROR: Fence dependency level overflow: " + f"required fence_idx={max_req_level}, but packed format supports only 0..15. " + "Reduce the number of fence crossings per dependent job or widen LEVEL_BITS." 
+ ) + sys.exit(1) + LEVEL_BITS = 4 packed_width = N_DRIVERS * LEVEL_BITS packed_hex_digits = (packed_width + 3) // 4 diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index 6ed4787..a6b7aa5 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -20,12 +20,20 @@ * * Stimulus file format (one line per cycle): * req(1b) id(IWb) wen(1b) data(Nb) add(Ab) -- active transaction - * PAUSE -- fence synchronization point + * PAUSE -- fence synchronization point + * + * Idle entries (req=0) are consumed as issue gaps when the driver is free to + * advance. While stalled waiting for a grant, the driver may advance over later + * idle entries, so the stimuli file represents offered traffic order and fence + * structure rather than an exact wall-clock replay under backpressure. * * When a PAUSE token is encountered the driver drains all in-flight reads * (waits in DRAIN_FOR_PAUSE), then enters PAUSED and holds fence_reached_o=1 * until resume_i is asserted. This allows multi-phase execution on a single * driver without resetting counters between phases. + * Multiple consecutive PAUSE tokens are legal and represent multiple fence slots + * with no intervening traffic (e.g. free-pass completion fence followed by a + * synthetic blocking fence for the next pattern). */ module application_driver #( @@ -213,7 +221,9 @@ module application_driver #( hci_if.data = transactions[last_op_issued_q].data; hci_if.add = transactions[last_op_issued_q].add; if (tr_idx_q < transactions.size()) begin - // Advance over any idle (req=0, not-pause) entries to hide memory latency + // Consume later idle entries while stalled so the driver can hide memory + // latency/backpressure when the workload permits it. This makes req=0 tokens + // issue-gap hints, not strict simulation-time no-op cycles. 
if (!transactions[tr_idx_q].req && !transactions[tr_idx_q].is_pause) begin tr_idx_d = tr_idx_q + 1; end diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 0f743e2..5ada60b 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -283,10 +283,12 @@ module tb_hci assign s_hci_ctrl.priority_cnt_numerator = PRIORITY_CNT_NUMERATOR; assign s_hci_ctrl.priority_cnt_denominator = PRIORITY_CNT_DENOMINATOR; - // fence_idx[i]: number of fences driver i has fully passed (exited PAUSED state). - // Increments when the handshake fires: resume_i asserted while fence_reached_o is high. - // This is the same cycle the driver transitions out of PAUSED, so dependents see the - // updated count immediately on the next cycle (after the registered flip-flop updates). + // fence_idx[i] = number of PAUSE tokens driver i has passed. + // This counts all fences in file order, including synthetic blocking fences + // and trailing completion fences. + // + // fence_idx increments when resume_i is asserted while fence_reached_o is high, + // i.e. when the PAUSED-state handshake completes and the driver leaves that fence. always_ff @(posedge clk or negedge rst_n) begin if (!rst_n) begin for (int i = 0; i < N_DRIVERS; i++) fence_idx[i] <= '0; @@ -298,15 +300,13 @@ module tb_hci end end - // s_resume[i]: fire when for every set bit j in FENCE_MASKS[i][fence_idx[i]], - // fence_idx[j] >= FENCE_REQ_LEVELS[i][fence_idx[i]][j]. - // - // fence_idx[j] >= p_j+1 means j has exited the trailing PAUSE after pattern p_j, - // i.e. j has completed pattern p_j. No s_fence_reached shortcut needed — the - // trailing PAUSE of a zero-mask fence is exited in one cycle, so fence_idx - // advances promptly and the check is unambiguous. + // s_resume[i] is asserted only while driver i is paused at its current fence. 
+ // Driver i may pass that fence when, for every dependency bit j set in the + // current FENCE_MASKS entry, fence_idx[j] is at least the required level + // encoded in FENCE_REQ_LEVELS_PACKED. // - // In MUX mode bus serialization is handled by s_mux_sel independently. + // In other words: blocking fences wait for explicit dependency completion; + // trailing zero-mask fences are free passes. always_comb begin for (int i = 0; i < N_DRIVERS; i++) begin automatic logic [N_DRIVERS-1:0] cur_mask; @@ -542,4 +542,17 @@ module tb_hci end endgenerate + // Advisory check only. The hard overflow guard is in Python generation + // before packing FENCE_REQ_LEVELS into 4-bit fields. + // In case of failure due to this asser, modify tb_hci_pkg.sv and generation of fence_masks.mk + initial begin + if (MAX_FENCES > 16) begin + $warning( + "MAX_FENCES=%0d exceeds the nominal 4-bit fence-level range; " + "ensure no dependency requires a level > 15.", + MAX_FENCES + ); + end + end + endmodule diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index dc77085..507565c 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -77,14 +77,27 @@ package tb_hci_pkg; localparam int unsigned N_LOG_MASTERS = N_CORE + N_DMA + N_EXT; localparam int unsigned N_DRIVERS = N_LOG_MASTERS + N_HWPE; - // Fence parameters. Every pattern ends with a PAUSE; fence f = PAUSE after pattern f. - // fence_idx[i] == k means driver i has completed k patterns. - // - // FENCE_MASKS[i][f][j]=1: driver j is a dependency for driver i at fence f. - // FENCE_REQ_LEVELS[i][f][j]: minimum fence_idx[j] required before driver i can pass - // fence f. Equals the pattern index of the referenced phase within driver j plus 1 - // (i.e., j must have completed that specific pattern). - // Both are generated by main.py and passed via defines. +// Fence parameters. +// +// Fence slots enumerate every PAUSE token in the stimulus file, in file order. 
+// This includes: +// (1) synthetic blocking PAUSEs inserted before patterns that have wait_for_jobs, +// (2) trailing PAUSEs emitted after every pattern. +// +// fence_idx[i] counts how many PAUSE tokens driver i has passed so far. +// It is therefore a "passed-fence count", not a "completed-pattern count". +// +// FENCE_MASKS[i][f][j] = 1: +// at fence slot f of driver i, driver j is a dependency. +// +// FENCE_REQ_LEVELS_PACKED[i][f][j]: +// minimum fence_idx[j] required before driver i may pass fence slot f. +// +// For a synthetic pre-pattern fence, the required level corresponds to the +// dependency driver's fence count after the referenced job has completed. +// For a trailing pattern fence, the mask is zero and the fence is a free pass. +// +// Both arrays are generated by main.py and passed via defines. localparam int unsigned MAX_FENCES = `ifdef MAX_FENCES_PARAM `MAX_FENCES_PARAM `else 1 `endif; localparam logic [N_DRIVERS-1:0] FENCE_MASKS [N_DRIVERS][MAX_FENCES] = diff --git a/target/verif/src/tcdm_banks_wrap.sv b/target/verif/src/tcdm_banks_wrap.sv index dd9e223..7aad9a8 100644 --- a/target/verif/src/tcdm_banks_wrap.sv +++ b/target/verif/src/tcdm_banks_wrap.sv @@ -83,6 +83,8 @@ module tcdm_banks_wrap #( .rdata_o(tcdm_slave[i].r_data ) // read data ); + //NOTE: Commented out. r_valid response is handled by interconnect + //r_valid // always_ff @(posedge clk_i or negedge rst_ni) begin : rvalid_gen // if(~rst_ni) begin