diff --git a/.github/workflows/gitlab-ci.yml b/.github/workflows/gitlab-ci.yml new file mode 100644 index 0000000..e89ce0d --- /dev/null +++ b/.github/workflows/gitlab-ci.yml @@ -0,0 +1,29 @@ +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Paul Scheffler + +name: gitlab-ci + +on: [ push, pull_request, workflow_dispatch ] + +permissions: + # deployments permission to deploy GitHub pages website + deployments: write + # contents permission to update benchmark contents in gh-pages branch + contents: write + +jobs: + gitlab-ci: + runs-on: ubuntu-latest + steps: + - name: Check Gitlab CI + uses: pulp-platform/pulp-actions/gitlab-ci@v1 + # Skip on forks or pull requests from forks due to missing secrets. + if: github.repository == 'pulp-platform/hci' && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) + with: + domain: iis-git.ee.ethz.ch + repo: github-mirror/hci + token: ${{ secrets.GITLAB_TOKEN }} + diff --git a/.gitignore b/.gitignore index ada3be7..6212410 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ target/verif/vsim/compile.tcl target/verif/vsim/modelsim.ini target/verif/vsim/transcript target/verif/vsim/vsim.wlf +target/verif/exploration/results diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..a3d926f --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,44 @@ +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Paul Scheffler + +# We initialize the nonfree repo, then spawn a sub-pipeline from it + +variables: + GIT_SUBMODULE_STRATEGY: recursive + # Our reference GCC toolchain for reproducible builds + +before_script: + - python -V # Print out python version for debugging + - python -m pip install --user virtualenv + +.base: + artifacts: + when: always + expire_in: 1 week + +stages: + - build + - test + +build: + stage: build + script: + - make checkout + artifacts: + when: always + expire_in: 3 hours + paths: [ .bender ] + +testset: + extends: .base + needs: [ build ] + stage: test + script: + - regr/full_regression.sh + artifacts: + when: always + expire_in: 1 year + paths: [ regr/hci_tests.xml ] diff --git a/Bender.yml b/Bender.yml index 516fe27..37aa7af 100644 --- a/Bender.yml +++ b/Bender.yml @@ -2,10 +2,11 @@ package: name: hci authors: - "Francesco Conti " - - "Gianna Paulin " - "Tobias Riedener " - "Luigi Ghionda " - "Arpan Suravi Prasad " + - "Sergio Mazzola " dependencies: hwpe-stream: { git: "https://github.com/pulp-platform/hwpe-stream.git", version: 1.9.0 } @@ -70,8 +71,8 @@ sources: # Level 1 - target/verif/src/application_driver.sv - target/verif/src/tcdm_banks_wrap.sv - - target/verif/src/latency_monitor.sv - - target/verif/src/throughput_monitor.sv + - target/verif/src/req_gnt_monitor.sv + - target/verif/src/bandwidth_monitor.sv # Level 2 - target/verif/src/simulation_report.sv # Level 3 diff --git a/README.md b/README.md index 690bc53..fc10244 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,38 @@ The `hci` repository contains the definition of the Heterogeneous Cluster Interc - https://github.com/pulp-platform/neureka - https://github.com/pulp-platform/redmule +# Verification flow +The typical full flow is: + +``` +make checkout # Fetch and check out dependencies via Bender +make config-verif # Generate Makefiles from JSON verification configs +make stim-verif # Generate simulation stimulus vectors (requires 
Python 3) +make compile-verif # Compile RTL and testbench with QuestaSim +make opt-verif # Optimize the compiled design with vopt +make run-verif # Run the simulation (batch mode by default) +``` + +To open the simulation in the QuestaSim GUI with waveforms, pass `GUI=1`: + +``` +make run-verif GUI=1 +``` + +Cleanup targets: + +| Target | Effect | +|----------------------|----------------------------------------------------| +| `clean-config-verif` | Remove generated configuration Makefiles | +| `clean-stim-verif` | Remove generated stimulus vectors | +| `clean-sim-verif` | Remove QuestaSim build artifacts (work lib, logs) | +| `clean-verif` | Run all three clean targets above | + +**Notes:** +- On IIS machines, defaults to QuestaSim (`questa-2022.3`) (can be overridden with `SIM_QUESTA=`). On non-IIS machines, defaults to QuestaSim available in `PATH`. +- Verification configuration is driven by JSON files under `target/verif/config/`. Edit those before running `config-verif` and `stim-verif`. +- `run-verif` depends on `opt-verif` and `stim-verif`, so after `checkout` and `config-verif` you can jump straight to it. + # Style guide These IPs use a slightly different style than other PULP IPs. Refer to `STYLE.md` for some indications. diff --git a/bender.mk b/bender.mk index 5f78e57..788c13a 100644 --- a/bender.mk +++ b/bender.mk @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. # SPDX-License-Identifier: SHL-0.51 # diff --git a/regr/basic.yml b/regr/basic.yml new file mode 100644 index 0000000..9f43cfa --- /dev/null +++ b/regr/basic.yml @@ -0,0 +1,31 @@ +# Copyright (C) 2024 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Francesco Conti (f.conti@unibo.it) +# + +hci_tests: + log_fair: + path: . + command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/fair/testbench.json HARDWARE_JSON=regr/hardware/log/hardware.json + hci_fair: + path: . + command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/fair/testbench.json HARDWARE_JSON=regr/hardware/hci/hardware.json + hci_hwpe_prio: + path: . + command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/hwpe_prio/testbench.json HARDWARE_JSON=regr/hardware/hci/hardware.json + hci_log_prio: + path: . 
+ command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/log_prio/testbench.json HARDWARE_JSON=regr/hardware/hci/hardware.json diff --git a/regr/bwruntests.py b/regr/bwruntests.py new file mode 100755 index 0000000..555a1d4 --- /dev/null +++ b/regr/bwruntests.py @@ -0,0 +1,393 @@ +#!/usr/bin/env python3 + +# Copyright 2020 ETH Zurich and University of Bologna +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# Run shell commands listed in a file separated by newlines in a parallel +# fashion. If requested the results (tuples consisting of command, stdout, +# stderr and returncode) will be gathered in a junit.xml file. There a few +# knobs to tune the number of spawned processes and the junit.xml formatting. 
+ +# Author: Robert Balas (balasr@iis.ee.ethz.ch) + +import argparse +import re +from subprocess import (Popen, TimeoutExpired, + CalledProcessError, PIPE) +from threading import Lock +import shlex +import sys +import signal +import os +import multiprocessing +import errno +import pprint +import time +import random +from collections import OrderedDict +import json + +runtest = argparse.ArgumentParser( + prog='bwruntests', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="""Run PULP tests in parallel""", + epilog=""" +Test_file needs to be either a .yaml file (set the --yaml switch) +which looks like this: + +mytests.yml +[...] +parallel_bare_tests: # name of the test set + parMatrixMul8: # name of the test + path: ./parallel_bare_tests/parMatrixMul8 # path to the test's folder + command: make clean all run # command to run in the test's folder +[...] + +or + +Test_file needs to be a list of commands to be executed. Each line corresponds +to a single command and a test + +commands.f +[...] +make -C ./ml_tests/mlGrad clean all run +make -C ./ml_tests/mlDct clean all run +[...] + +Example: +bwruntests.py --proc-verbose -v \\ + --report_junit -t 3600 --yaml \\ + -o simplified-runtime.xml runtime-tests.yaml + +This Runs a set of tests defined in runtime-tests.yaml and dumps the +resulting junit.xml into simplified-runtime.xml. The --proc-verbose +scripts makes sure to print the stdout of each process to the shell. To +prevent a broken process from running forever, a maximum timeout of 3600 +seconds was set. 
For debugging purposes we enabled -v (--verbose) which +shows the full set of commands being run.""") + +runtest.version = '0.2' + +runtest.add_argument('test_file', type=str, + help='file defining tests to be run') +runtest.add_argument('--version', action='version', + version='%(prog)s ' + runtest.version) +runtest.add_argument('-p', '--max_procs', type=int, + default=multiprocessing.cpu_count(), + help="""Number of parallel + processes used to run test. + Default is number of cpu cores.""") +runtest.add_argument('-t', '--timeout', type=float, + default=None, + help="""Timeout for all processes in seconds""") +runtest.add_argument('-v', '--verbose', action='store_true', + help="""Enable verbose output""") +runtest.add_argument('-s', '--proc_verbose', action='store_true', + help="""Write processes' stdout and stderr to shell stdout + after they terminate""") +runtest.add_argument('--report_junit', action='store_true', + help="""Generate a junit report""") +runtest.add_argument('--disable_junit_pp', action='store_true', + help="""Disable pretty print of junit report""") +runtest.add_argument('--disable_results_pp', action='store_true', + help="""Disable printing test results""") +runtest.add_argument('-y,', '--yaml', action='store_true', + help="""Read tests from yaml file instead of executing + from a list of commands""") +runtest.add_argument('-o,', '--output', type=str, + help="""Write junit.xml to file instead of stdout""") +runtest.add_argument('-P,', '--perf', type=str, default=None, + help="""Write performance results to JSON file""") +stdout_lock = Lock() + +shared_total = 0 +len_total = 0 + +class FinishedProcess(object): + """A process that has finished running. 
+ """ + def __init__(self, name, cwd, runargs, returncode, + stdout=None, stderr=None, time=None): + self.name = name + self.cwd = cwd + self.runargs = runargs + self.returncode = returncode + self.stdout = stdout + self.stderr = stderr + self.time = time + exec_time = 0 + throughput = 0 + workload = 0 + if returncode == 0: + matches = re.findall("# hwpe cycles =\s+(\d+)", stdout) + if matches: + exec_time = int(matches[0]) + self.exec_time = exec_time + + + def __repr__(self): + runargs = ['name={!r}'.format(self.name)] + runargs += ['cwd={!r}'.format(self.cwd)] + runargs += ['args={!r}'.format(self.runargs), + 'returncode={!r}'.format(self.returncode)] + if self.stdout is not None: + runargs.append('stdout={!r}'.format(self.stdout)) + if self.stderr is not None: + runargs.append('stderr={!r}'.format(self.stderr)) + if self.time is not None: + runargs.append('time={!r}'.format(self.time)) + return "{}({})".format(type(self).__name__, ', '.join(runargs)) + +def fork(name, cwd, *popenargs, check=False, shell=True, + **kwargs): + """Run subprocess and return process args, error code, stdout and stderr + """ + + def proc_out(cwd, stdout, stderr): + print('cwd={}'.format(cwd)) + print('stdout=') + print(stdout.decode('utf-8')) + print('stderr=') + print(stderr.decode('utf-8')) + + kwargs['stdout'] = PIPE + kwargs['stderr'] = PIPE + + with Popen(*popenargs, preexec_fn=os.setpgrp, cwd=cwd, + **kwargs) as process: + try: + # Child and parent are racing for setting/using the pgid so we have + # to set it in both processes. See glib manual. + try: + os.setpgid(process.pid, process.pid) + except OSError as e: + if e.errno != errno.EACCES: + raise + # measure runtime + start = time.time() + stdout, stderr = process.communicate(input, timeout=args.timeout) + except TimeoutExpired: + pgid = os.getpgid(process.pid) + os.killpg(pgid, signal.SIGKILL) + # process.kill() will only kill the immediate child but not its + # forks. 
This won't work since our commands will create a few forks + # (make -> vsim -> etc). We need to make a process group and kill + # that + stdout, stderr = process.communicate() + timeoutmsg = 'TIMEOUT after {:f}s'.format(args.timeout) + + if args.proc_verbose: + stdout_lock.acquire() + print(name) + print(timeoutmsg) + proc_out(cwd, stdout, stderr) + stdout_lock.release() + + return FinishedProcess(name, cwd, process.args, 1, + stdout.decode('utf-8'), + timeoutmsg + '\n' + + stderr.decode('utf-8'), + time.time() - start) + # Including KeyboardInterrupt, communicate handled that. + except: # noqa: E722 + pgid = os.getpgid(process.pid) + os.killpg(pgid, signal.SIGKILL) + # We don't call process.wait() as .__exit__ does that for us. + raise + retcode = process.poll() + if check and retcode: + raise CalledProcessError(retcode, process.args, + output=stdout, stderr=stderr) + if args.proc_verbose: + stdout_lock.acquire() + print(name) + proc_out(cwd, stdout, stderr) + stdout_lock.release() + + with lock: + shared_total.value += 1 + print("[%s][%d/%d] %s" % ("\033[1;32m OK \033[0m" if retcode == 0 else "\033[1;31mFAIL\033[0m", shared_total.value, len_total.value, name)) + + return FinishedProcess(name, cwd, process.args, retcode, + stdout.decode('utf-8'), + stderr.decode('utf-8'), + time.time() - start) + +def poolInit(s, t, l): + global shared_total + global len_total + global lock + shared_total = s + len_total = t + lock = l + +if __name__ == '__main__': + args = runtest.parse_args() + pp = pprint.PrettyPrinter(indent=4) + + # lazy importing so that we can work without junit_xml + if args.report_junit: + try: + from junit_xml import TestSuite, TestCase + except ImportError: + print("""Error: The --report_junit option requires +the junit_xml library which is not installed.""", + file=sys.stderr) + exit(1) + + # lazy import PrettyTable for displaying results + if not(args.disable_results_pp): + try: + from prettytable import PrettyTable + except ImportError: + 
print("""Warning: Displaying results requires the PrettyTable +library which is not installed""") + + tests = [] # list of tuple (testname, working dir, command) + + # load tests (yaml or command list) + if args.yaml: + try: + import yaml + except ImportError: + print("""Error: The --yaml option requires +the pyyaml library which is not installed.""", + file=sys.stderr) + exit(1) + with open(args.test_file) as f: + testyaml = yaml.load(f, Loader=yaml.Loader) + for testsetname, testv in testyaml.items(): + for testname, insn in testv.items(): + cmd = shlex.split(insn['command']) + cwd = insn['path'] + tests.append((testsetname + ':' + testname, cwd, cmd)) + if args.verbose: + pp.pprint(tests) + else: # (command list) + with open(args.test_file) as f: + testnames = list(map(str.rstrip, f)) + shellcmds = [shlex.split(e) for e in testnames] + cwds = ['./' for e in testnames] + tests = list(zip(testnames, cwds, shellcmds)) + if args.verbose: + print('Tests which we are running:') + pp.pprint(tests) + pp.pprint(shellcmds) + + # Spawning process pool + # Disable signals to prevent race. Child processes inherit SIGINT handler + original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN) + lock = multiprocessing.Lock() + shared_total = multiprocessing.Value('i', 0) + len_total = multiprocessing.Value('i', len(tests)) + pool = multiprocessing.Pool(processes=args.max_procs, initializer=poolInit, initargs=(shared_total, len_total, lock )) + # Restore SIGINT handler + signal.signal(signal.SIGINT, original_sigint_handler) + # Shuffle tests + random.shuffle(tests) + try: + procresults = pool.starmap(fork, tests) + except KeyboardInterrupt: + print("\nTerminating bwruntest.py") + pool.terminate() + pool.join() + exit(1) + + # pp.pprint(procresults) + pool.close() + pool.join() + + # Generate junit.xml file. Junit.xml differentiates between failure and + # errors but we treat everything as errors. 
+ if args.report_junit: + testcases = [] + for p in procresults: + # we can either expect p.name = testsetname:testname + # or p.name = testname + testcase = TestCase(p.name, + classname=((p.name).split(':'))[0], + stdout=p.stdout, + stderr=p.stderr, + elapsed_sec=p.time) + if p.returncode != 0: + testcase.add_failure_info(p.stderr) + testcases.append(testcase) + + testsuite = TestSuite('bwruntests', testcases) + if args.output: + with open(args.output, 'w') as f: + TestSuite.to_file(f, [testsuite], + prettyprint=not(args.disable_junit_pp)) + else: + print(TestSuite.to_xml_string([testsuite], + prettyprint=(args.disable_junit_pp))) + + # # print JSON for performance regression + # if args.perf is not None: + # # if file does not exist, create new dictionary: + # if not os.path.isfile(args.perf): + # d = OrderedDict([]) + # # else, load the existing dictionary + # else: + # with open(args.perf) as f: + # d = json.load(f, object_pairs_hook=OrderedDict) + # # save the new execution times + # for p in procresults: + # if p.returncode == 0: + # d[p.name] = p.exec_time + # with open(args.perf, 'w', encoding='utf-8') as f: + # json.dump(d, f, ensure_ascii=False, indent=4) + + # print JSON for performance regression + if args.perf is not None: + # if file does not exist, create new dictionary: + if not os.path.isfile(args.perf): + d = list([]) + # else, load the existing dictionary + else: + with open(args.perf) as f: + d = json.load(f) + # save the new execution times + for p in procresults: + if p.returncode == 0: + d.append({ 'name': p.name, 'value': p.exec_time, 'unit': 'cycles'}) + with open(args.perf, 'w', encoding='utf-8') as f: + json.dump(d, f, ensure_ascii=False, indent=4) + + # print summary of test results + if not(args.disable_results_pp): + testcount = sum(1 for x in tests) + testfailcount = sum(1 for p in procresults if p.returncode != 0) + testpassedcount = testcount - testfailcount + resulttable = PrettyTable(['test', 'cycles', 'time', 'passed/total']) + 
resulttable.align['test'] = "l" + for p in procresults: + testpassed = 1 if p.returncode == 0 else 0 + testname = p.name + resulttable.add_row([testname, + p.exec_time, + '{0:.2f}s'.format(p.time), + '{0:d}/{1:d}'.format(testpassed, 1)]) + resulttable.add_row(['total', '', '', '{0:d}/{1:d}'. + format(testpassedcount, testcount)]) + print(resulttable) + if testpassedcount != testcount: + import sys; sys.exit(1) + diff --git a/regr/full_regression.sh b/regr/full_regression.sh new file mode 100755 index 0000000..1e10f4e --- /dev/null +++ b/regr/full_regression.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright (C) 2020-2024 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Francesco Conti (f.conti@unibo.it) +# + +export N_PROC=1 +export P_STALL=0.04 +TIMEOUT=400 + +# Declare a string array with type +declare -a test_list=( + "regr/basic.yml" +) + +# Read the list values with space +for val in "${test_list[@]}"; do + nice -n10 regr/bwruntests.py --report_junit -t ${TIMEOUT} --yaml -o regr/hci_tests.xml -p${N_PROC} $val + if test $? 
-ne 0; then + echo "Error in test $val" + exit 1 + fi +done +unset P_STALL diff --git a/regr/hardware/hci/hardware.json b/regr/hardware/hci/hardware.json new file mode 100644 index 0000000..b349a65 --- /dev/null +++ b/regr/hardware/hci/hardware.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "HCI", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/regr/hardware/log/hardware.json b/regr/hardware/log/hardware.json new file mode 100644 index 0000000..a505fa1 --- /dev/null +++ b/regr/hardware/log/hardware.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "LOG", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/regr/testbench/fair/testbench.json b/regr/testbench/fair/testbench.json new file mode 100644 index 0000000..011e20b --- /dev/null +++ b/regr/testbench/fair/testbench.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 0, + "PRIORITY_CNT_NUMERATOR": 1, + "PRIORITY_CNT_DENOMINATOR": 2 + } +} diff --git a/regr/testbench/hwpe_prio/testbench.json b/regr/testbench/hwpe_prio/testbench.json new file mode 100644 index 0000000..4eefeb8 --- /dev/null +++ b/regr/testbench/hwpe_prio/testbench.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 1, + "PRIORITY_CNT_NUMERATOR": 9, + "PRIORITY_CNT_DENOMINATOR": 10 + } +} diff --git 
a/regr/testbench/log_prio/testbench.json b/regr/testbench/log_prio/testbench.json new file mode 100644 index 0000000..473e3fd --- /dev/null +++ b/regr/testbench/log_prio/testbench.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 0, + "PRIORITY_CNT_NUMERATOR": 9, + "PRIORITY_CNT_DENOMINATOR": 10 + } +} diff --git a/rtl/common/hci_package.sv b/rtl/common/hci_package.sv index 1e3d1e1..15a36b8 100644 --- a/rtl/common/hci_package.sv +++ b/rtl/common/hci_package.sv @@ -51,7 +51,8 @@ package hci_package; typedef struct packed { logic [1:0] arb_policy; // used only in some systems logic invert_prio; - logic [7:0] low_prio_max_stall; + logic [7:0] priority_cnt_numerator; + logic [7:0] priority_cnt_denominator; } hci_interconnect_ctrl_t; typedef struct packed { diff --git a/rtl/core/hci_core_split.sv b/rtl/core/hci_core_split.sv index c0c96fb..88e2521 100644 --- a/rtl/core/hci_core_split.sv +++ b/rtl/core/hci_core_split.sv @@ -224,7 +224,11 @@ module hci_core_split end // r_ready masking - assign tcdm_initiator_lrdy_masked_d = cs_rvalid==NO_RVALID ? tcdm_initiator_lrdy_masked_q | tcdm_initiator_r_valid | ~tcdm_initiator_req : tcdm_initiator_r_valid | ~tcdm_initiator_req; + // Track lanes that have produced a response for the current split transaction. + // Using "~req" here can mark a lane as completed before r_valid is observed. + assign tcdm_initiator_lrdy_masked_d = + cs_rvalid==NO_RVALID ? 
tcdm_initiator_lrdy_masked_q | tcdm_initiator_r_valid + : tcdm_initiator_r_valid; always_ff @(posedge clk_i or negedge rst_ni) begin if(~rst_ni) begin diff --git a/rtl/hci_interconnect.sv b/rtl/hci_interconnect.sv index 9825891..fee4918 100644 --- a/rtl/hci_interconnect.sv +++ b/rtl/hci_interconnect.sv @@ -142,8 +142,22 @@ module hci_interconnect EW: DEFAULT_EW, EHW: DEFAULT_EHW }; - `HCI_INTF_ARRAY(hwpe_mem_muxed, clk_i, 0:N_MEM-1); - + hci_core_intf #( + .DW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).DW ), + .AW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).AW ), + .BW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).BW ), + .UW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).UW ), + .IW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).IW ), + .EW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).EW ), + .EHW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).EHW ) +`ifndef SYNTHESIS + , + .WAIVE_RQ3_ASSERT ( WAIVE_RQ3_ASSERT ), // hwpe_mem_muxed is an internal muxed signal, not a protocol-compliant port + .WAIVE_RQ4_ASSERT ( WAIVE_RQ4_ASSERT ) +`endif + ) hwpe_mem_muxed [0:N_MEM-1] ( + .clk ( clk_i ) + ); localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe_mem) = '{ DW: DEFAULT_DW, @@ -254,15 +268,31 @@ module hci_interconnect end : hwpe_req2mem + // Set arbitration tree to be perfectly fair. It must not + // follow the max stall policy of the HWPE vs LIC arbiter. + // FIXME: it would be interesting to explore what happens + // with an unfair but configurable setting. Probably we need + // a generator to do that, I do not see a way to code it in + // pure SystemVerilog. 
+ hci_interconnect_ctrl_t ctrl_arbiter_tree; + always_comb + begin + ctrl_arbiter_tree = ctrl_i; + ctrl_arbiter_tree.priority_cnt_numerator = 1; + ctrl_arbiter_tree.priority_cnt_denominator = 2; + end + hci_arbiter_tree #( .NB_REQUESTS(N_HWPE), .NB_CHAN ( N_MEM ), + .WAIVE_RQ3_ASSERT ( WAIVE_RQ3_ASSERT ), + .WAIVE_RQ4_ASSERT ( WAIVE_RQ4_ASSERT ), .`HCI_SIZE_PARAM(out)(`HCI_SIZE_PARAM(hwpe_mem_muxed)) ) i_wide_port_arbiter_tree ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), .clear_i ( clear_i ), - .ctrl_i ( ctrl_i ), + .ctrl_i ( ctrl_arbiter_tree ), .in ( hwpe_mem ), .out ( hwpe_mem_muxed ) ); diff --git a/rtl/interco/hci_arbiter.sv b/rtl/interco/hci_arbiter.sv index a291532..da75e51 100644 --- a/rtl/interco/hci_arbiter.sv +++ b/rtl/interco/hci_arbiter.sv @@ -44,13 +44,15 @@ * .. _hci_arbiter_ctrl: * .. table:: **hci_arbiter** input control signals. * - * +----------------------+------------------------+---------------------------------------------------------------+ - * | **Name** | **Type** | **Description** | - * +----------------------+------------------------+---------------------------------------------------------------+ - * | *invert_prio* | `logic` | When 1, invert priorities between `in_high` and `in_low`. | - * +----------------------+------------------------+---------------------------------------------------------------+ - * | *low_prio_max_stall* | `logic[7:0]` | Maximum number of consecutive stalls on low-priority channel. | - * +----------------------+------------------------+---------------------------------------------------------------+ + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | **Name** | **Type** | **Description** | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | *invert_prio* | `logic` | When 1, invert priorities between `in_high` and `in_low`. 
| + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | *priority_cnt_numerator* | `logic[7:0]` | Maximum number of consecutive stalls on low-priority channel. | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | *priority_cnt_denominator* | `logic[7:0]` | Clear condition of priority counter (max low-prio stalls + high-prio stalls). | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ * */ @@ -75,8 +77,9 @@ module hci_arbiter logic [NB_CHAN-1:0] hs_pass_d; logic hs_req_d; logic ls_req_d; + logic hs_req_masked_d; logic switch_channels_d; - logic unsigned [7:0] ls_stall_ctr_d; + logic unsigned [7:0] priority_cnt_q; // priority_req is the OR of all requests coming out of the log interconnect. // it should be simplified to simply an OR of all requests coming *into* the @@ -85,10 +88,11 @@ module hci_arbiter begin hs_req_d = |hs_req_in; ls_req_d = |ls_req_in; - if (ctrl_i.low_prio_max_stall > 0) //Set to 0 to disable this functionality + hs_req_masked_d = hs_req_d; + if (ctrl_i.priority_cnt_numerator > 0) //Set to 0 to disable this functionality begin - if (ls_stall_ctr_d >= ctrl_i.low_prio_max_stall) - hs_req_d = 0; //Let low side through for once + if (priority_cnt_q >= ctrl_i.priority_cnt_numerator && priority_cnt_q < ctrl_i.priority_cnt_denominator) + hs_req_masked_d = 0; //Let low side through for once end end @@ -96,11 +100,11 @@ module hci_arbiter always_ff @(posedge clk_i or negedge rst_ni) begin if (~rst_ni) - ls_stall_ctr_d <= 0; + priority_cnt_q <= 0; + else if(priority_cnt_q == ctrl_i.priority_cnt_denominator-1) + priority_cnt_q <= 0; else if (hs_req_d & ls_req_d) - ls_stall_ctr_d <= ls_stall_ctr_d + 1; - else - ls_stall_ctr_d <= 0; + priority_cnt_q <= priority_cnt_q + 1; end assign switch_channels_d = 
ctrl_i.invert_prio; @@ -129,7 +133,7 @@ module hci_arbiter // Side select generate for(genvar ii=0; ii.json` to `make stim-verif` / `make run-verif` to select an alternative workload. diff --git a/target/verif/bender.mk b/target/verif/bender.mk index 2e1362e..02e640b 100644 --- a/target/verif/bender.mk +++ b/target/verif/bender.mk @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. # SPDX-License-Identifier: SHL-0.51 # @@ -8,25 +8,24 @@ VERIF_DEFS ?= VERIF_DEFS += \ -D N_HWPE=$(N_HWPE) \ - -D HWPE_WIDTH=$(HWPE_WIDTH) \ + -D HWPE_WIDTH_FACT=$(HWPE_WIDTH_FACT) \ -D N_CORE=$(N_CORE) \ -D N_DMA=$(N_DMA) \ -D N_EXT=$(N_EXT) \ - -D TS_BIT=$(TS_BIT) \ - -D EXPFIFO=$(EXPFIFO) \ - -D SEL_LIC=$(SEL_LIC) \ -D DATA_WIDTH=$(DATA_WIDTH) \ -D TOT_MEM_SIZE=$(TOT_MEM_SIZE) \ -D N_BANKS=$(N_BANKS) \ - -D N_TRANSACTION_LOG=$(N_TRANSACTION_LOG) \ - -D TRANSACTION_RATIO=$(TRANSACTION_RATIO) \ + -D TS_BIT=$(TS_BIT) \ + -D EXPFIFO=$(EXPFIFO) \ + -D SEL_LIC=$(SEL_LIC) \ -D CLK_PERIOD=$(CLK_PERIOD) \ -D RST_CLK_CYCLES=$(RST_CLK_CYCLES) \ - -D MAX_CYCLES_BETWEEN_GNT_RVALID=$(MAX_CYCLES_BETWEEN_GNT_RVALID) \ -D RANDOM_GNT=$(RANDOM_GNT) \ + -D INTERCO_TYPE=$(INTERCO_TYPE) \ -D INVERT_PRIO=$(INVERT_PRIO) \ - -D LOW_PRIO_MAX_STALL=$(LOW_PRIO_MAX_STALL) + -D PRIORITY_CNT_NUMERATOR=$(PRIORITY_CNT_NUMERATOR) \ + -D PRIORITY_CNT_DENOMINATOR=$(PRIORITY_CNT_DENOMINATOR) # Common targets for bender VERIF_TARGS ?= -VERIF_TARGS += -t hci_verif \ No newline at end of file +VERIF_TARGS += -t hci_verif diff --git a/target/verif/config/generated/hardware.mk.tpl b/target/verif/config/generated/hardware.mk.tpl index f06a13d..031d817 100644 --- a/target/verif/config/generated/hardware.mk.tpl +++ b/target/verif/config/generated/hardware.mk.tpl @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. 
+# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. # SPDX-License-Identifier: SHL-0.51 # @@ -6,10 +6,11 @@ # Hardware configuration parameters (from hardware.json) N_HWPE?=${N_HWPE} -HWPE_WIDTH?=${HWPE_WIDTH} +HWPE_WIDTH_FACT?=${HWPE_WIDTH_FACT} N_CORE?=${N_CORE} N_DMA?=${N_DMA} N_EXT?=${N_EXT} +INTERCO_TYPE?=${INTERCO_TYPE} TS_BIT?=${TS_BIT} EXPFIFO?=${EXPFIFO} SEL_LIC?=${SEL_LIC} diff --git a/target/verif/config/generated/json_to_mk.py b/target/verif/config/generated/json_to_mk.py index fcec54f..1d6ad27 100755 --- a/target/verif/config/generated/json_to_mk.py +++ b/target/verif/config/generated/json_to_mk.py @@ -6,8 +6,8 @@ Templates are automatically discovered based on the config type argument. """ -import json import argparse +import json import sys from pathlib import Path from string import Template @@ -45,6 +45,34 @@ def flatten_dict(d, prefix=''): items.append((new_key, v)) return dict(items) + +def get_parameters(config): + """Return flattened parameters from config JSON.""" + params = config.get("parameters") + if not isinstance(params, dict): + return {} + return flatten_dict(params) + + +def load_all_parameters(config_dir): + """Load flattened parameters from all JSON files in config_dir.""" + merged = {} + for json_path in sorted(config_dir.glob("*.json")): + cfg = load_json_config(json_path) + merged.update(get_parameters(cfg)) + return merged + + +def template_variables(template_content): + """Extract Template variable names used by template content.""" + pattern = Template.pattern + vars_found = set() + for match in pattern.finditer(template_content): + name = match.group("named") or match.group("braced") + if name is not None: + vars_found.add(name) + return vars_found + def parse_args(argv=None): parser = argparse.ArgumentParser( description="Convert JSON configuration to Makefile fragment using templates." 
@@ -55,9 +83,9 @@ def parse_args(argv=None): help="Configuration type to generate.", ) parser.add_argument( - "config_dir", + "config_json", type=Path, - help="Directory containing source-of-truth JSON files.", + help="Path to selected source-of-truth JSON file.", ) parser.add_argument( "generated_dir", @@ -70,11 +98,10 @@ def parse_args(argv=None): def main(): args = parse_args() config_type = args.config_type - config_dir = args.config_dir.resolve() + json_file = args.config_json.resolve() + config_dir = json_file.parent generated_dir = args.generated_dir.resolve() - # Construct file paths based on config_type argument - json_file = config_dir / f"{config_type}.json" template_file = generated_dir / f"{config_type}.mk.tpl" # Load JSON config @@ -83,16 +110,22 @@ def main(): # Load template template_content = load_template(template_file) - # Flatten the parameters dict for template substitution - template_data = flatten_dict(config['parameters']) + # Build substitution dictionary: + # 1. all parameters from all configs (fallback) + # 2. 
parameters from selected config (override) + template_data = load_all_parameters(config_dir) + template_data.update(get_parameters(config)) # Apply template substitution template = Template(template_content) - try: - result = template.substitute(template_data) - except KeyError as e: - print(f"ERROR: Missing template variable: {e}", file=sys.stderr) + missing = sorted(v for v in template_variables(template_content) if v not in template_data) + if missing: + print( + f"ERROR: Missing template variable(s): {', '.join(missing)}", + file=sys.stderr, + ) sys.exit(1) + result = template.substitute(template_data) # Output to stdout print(result) diff --git a/target/verif/config/generated/testbench.mk.tpl b/target/verif/config/generated/testbench.mk.tpl index a1cf812..c36dad7 100644 --- a/target/verif/config/generated/testbench.mk.tpl +++ b/target/verif/config/generated/testbench.mk.tpl @@ -1,15 +1,13 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. 
# SPDX-License-Identifier: SHL-0.51 # # This file is auto-generated from testbench.json - DO NOT EDIT MANUALLY # Testbench parameters (from testbench.json) -N_TRANSACTION_LOG?=${N_TRANSACTION_LOG} -TRANSACTION_RATIO?=${TRANSACTION_RATIO} CLK_PERIOD?=${CLK_PERIOD} RST_CLK_CYCLES?=${RST_CLK_CYCLES} -MAX_CYCLES_BETWEEN_GNT_RVALID?=${MAX_CYCLES_BETWEEN_GNT_RVALID} RANDOM_GNT?=${RANDOM_GNT} INVERT_PRIO?=${INVERT_PRIO} -LOW_PRIO_MAX_STALL?=${LOW_PRIO_MAX_STALL} +PRIORITY_CNT_NUMERATOR?=${PRIORITY_CNT_NUMERATOR} +PRIORITY_CNT_DENOMINATOR?=${PRIORITY_CNT_DENOMINATOR} diff --git a/target/verif/config/hardware.json b/target/verif/config/hardware.json index 678217b..b349a65 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -2,15 +2,16 @@ "description": "Hardware configuration parameters for HCI interconnect", "parameters": { "N_HWPE": 2, - "HWPE_WIDTH": 8, + "HWPE_WIDTH_FACT": 8, "N_CORE": 8, - "N_DMA": 1, + "N_DMA": 0, "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "HCI", "TS_BIT": 21, "EXPFIFO": 0, - "SEL_LIC": 0, - "DATA_WIDTH": 32, - "TOT_MEM_SIZE": 32, - "N_BANKS": 64 + "SEL_LIC": 0 } } diff --git a/target/verif/config/testbench.json b/target/verif/config/testbench.json index a4daaf4..924c947 100644 --- a/target/verif/config/testbench.json +++ b/target/verif/config/testbench.json @@ -1,13 +1,11 @@ { "description": "Testbench configuration parameters", "parameters": { - "N_TRANSACTION_LOG": 1000, - "TRANSACTION_RATIO": 1, "CLK_PERIOD": 50, "RST_CLK_CYCLES": 10, - "MAX_CYCLES_BETWEEN_GNT_RVALID": 1, "RANDOM_GNT": 0, "INVERT_PRIO": 0, - "LOW_PRIO_MAX_STALL": 10 + "PRIORITY_CNT_NUMERATOR": 10, + "PRIORITY_CNT_DENOMINATOR": 11 } } diff --git a/target/verif/config/workload.json b/target/verif/config/workload.json index 7da0838..aa2da43 100644 --- a/target/verif/config/workload.json +++ b/target/verif/config/workload.json @@ -1,144 +1,540 @@ { - "description": "Workload configuration for stimuli 
generation", - "simulation_parameters": { - "EXACT_OR_MAX_OFFSET": 0, - "CYCLE_OFFSET_LOG": 1, - "CYCLE_OFFSET_HWPE": 1 - }, + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes. Cores generate linear 50% traffic after each GEMM on dedicated non-overlapping regions.", "log_masters": [ { "id": 0, - "description": "Core 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 0 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 0 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core0_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core0_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core0_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core0_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + } + ] }, { "id": 1, - "description": "Core 1", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 1 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 1 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core1_after_gemm_A0", + 
"wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core1_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core1_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core1_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + } + ] }, { "id": 2, - "description": "Core 2", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 2 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 2 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core2_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core2_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core2_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 
traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core2_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + } + ] }, { "id": 3, - "description": "Core 3", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 3 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 3 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core3_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core3_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core3_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core3_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + } + ] }, { "id": 4, - "description": "Core 4", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 4 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 4 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core4_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", 
+ "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core4_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core4_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core4_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + } + ] }, { "id": 5, - "description": "Core 5", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 5 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 5 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core5_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core5_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core5_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core5_after_gemm_A3", + "wait_for_jobs": 
["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + } + ] }, { "id": 6, - "description": "Core 6", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 6 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 6 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core6_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core6_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core6_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core6_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + } + ] }, { "id": 7, - "description": "Core 7", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 7 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 7 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core7_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A1", + 
"mem_access_type": "linear", + "job": "core7_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core7_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core7_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + } + ] }, - { - "id": 8, - "description": "DMA 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 9, - "description": "External 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - } + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } ], + "hwpe_masters": [ { "id": 0, - "description": "HWPE 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "DMA engine (linear setup + ping/pong prefetch)", + "patterns": [ + { + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A1 into buffer #1 (8 KiB)", + "mem_access_type": 
"linear", + "job": "dma_load_A1_buf1", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A2 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A2_buf0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A0", + "mem_access_type": "linear", + "job": "dma_store_C0_buf2", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A3 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A3_buf1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A1", + "mem_access_type": "linear", + "job": "dma_store_C1_buf3", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A2", + "mem_access_type": "linear", + "job": "dma_store_C2_buf2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A3", + "mem_access_type": "linear", + "job": "dma_store_C3_buf3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + } + ] }, { "id": 1, - "description": "HWPE 1", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Single GEMM engine processing 4 tiles with 
ping/pong buffers", + "patterns": [ + { + "description": "GEMM tile A0: A0 x B -> C0", + "mem_access_type": "matmul_phased", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A1: A1 x B -> C1", + "mem_access_type": "matmul_phased", + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", + "mem_access_type": "matmul_phased", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", + "mem_access_type": "matmul_phased", + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + } + ] } ] -} \ No newline at end of file +} diff --git 
a/target/verif/config/workload.schema.json b/target/verif/config/workload.schema.json new file mode 100644 index 0000000..fde1870 --- /dev/null +++ b/target/verif/config/workload.schema.json @@ -0,0 +1,471 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "workload.schema.json", + "title": "HCI Verification Workload Configuration", + "description": "Schema for workload.json consumed by target/verif/simvectors/main.py", + "type": "object", + "required": ["log_masters", "hwpe_masters"], + "additionalProperties": false, + "properties": { + "description": { + "type": "string", + "description": "Human-readable description of this workload." + }, + "log_masters": { + "type": "array", + "description": "Ordered list of LOG masters (CORE, then DMA, then EXT). Length must equal N_CORE + N_DMA + N_EXT in hardware.json.", + "items": { "$ref": "#/$defs/master" } + }, + "hwpe_masters": { + "type": "array", + "description": "Ordered list of HWPE masters. Length must equal N_HWPE in hardware.json.", + "items": { "$ref": "#/$defs/master" } + } + }, + + "$defs": { + + "pattern": { + "type": "object", + "description": "A single traffic pattern segment. Multiple patterns on the same master are executed sequentially, separated by PAUSE fence tokens in the stimulus file.", + "required": ["mem_access_type"], + "unevaluatedProperties": false, + "properties": { + + "description": { + "type": "string", + "description": "Human-readable label for this pattern segment, shown in the memory-map report." + }, + "mem_access_type": { + "type": "string", + "enum": [ + "idle", + "random", + "linear", + "2d", + "3d", + "matmul_phased", + "matmul", + "multi_linear", + "bank_group_linear", + "rw_rowwise", + "gather_scatter", + "matmul_tiled_interleave", + "matmul_tiled", + "hotspot_random" + ], + "description": "Access pattern selector. Aliases: 'matmul' -> 'matmul_phased', 'matmul_tiled' -> 'matmul_tiled_interleave'." 
+ }, + "job": { + "type": "string", + "default": "default", + "description": "Job name for this pattern segment. Used by other patterns' wait_for_jobs to reference this segment." + }, + "wait_for_jobs": { + "type": "array", + "items": { "type": "string" }, + "default": [], + "description": "Job names this pattern waits for before starting. Generates a PAUSE fence before this pattern segment and holds the driver until all referenced jobs have advanced past the same fence level." + }, + + "n_transactions": { + "type": "integer", + "minimum": 0, + "description": "Number of real memory accesses (idles are NOT counted). Mandatory for 'random'. For other patterns can be derived from geometry or from region_size_bytes (transaction-width based)." + }, + + "region_base_address": { + "oneOf": [ + { "type": "integer", "minimum": 0 }, + { "type": "string", "description": "Decimal, hex (0x...) or binary string." } + ], + "description": "[random, linear, matmul_phased] Base byte address of the memory region." + }, + "region_size_bytes": { + "oneOf": [ + { "type": "integer", "minimum": 0 }, + { "type": "string", "description": "Decimal, hex (0x...) or binary string." } + ], + "description": "[random, linear, matmul_phased] Size in bytes of the memory region. For linear/matmul_phased, if n_transactions is omitted this can be used to derive it from transaction width." + }, + + "start_address": { + "type": "string", + "default": "0", + "description": "[linear, 2d, 3d] Start byte address." + }, + "stride0": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[linear, 2d, 3d] Innermost stride in words." + }, + "stride1": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[2d, 3d] Middle stride in words." + }, + "stride2": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[3d] Outermost stride in words." + }, + "length": { + "type": "integer", + "minimum": 0, + "description": "[linear] Alias for n_transactions." 
+ }, + "len_d0": { + "type": "integer", + "minimum": 1, + "description": "[2d, 3d] Innermost dimension length." + }, + "len_d1": { + "type": "integer", + "minimum": 1, + "description": "[2d, 3d] Middle dimension length." + }, + "len_d2": { + "type": "integer", + "minimum": 1, + "description": "[3d] Outermost dimension length." + }, + + "matrix_m": { + "type": "integer", + "minimum": 1, + "description": "[matmul_phased] Rows of A and C." + }, + "matrix_n": { + "type": "integer", + "minimum": 1, + "description": "[matmul_phased] Columns of B and C." + }, + "matrix_k": { + "type": "integer", + "minimum": 1, + "description": "[matmul_phased] Columns of A / rows of B." + }, + "region_base_address_a": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Base address of the read-A region." + }, + "region_size_bytes_a": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Size in bytes of the read-A region." + }, + "region_base_address_b": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Base address of the read-B region." + }, + "region_size_bytes_b": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Size in bytes of the read-B region." + }, + "region_base_address_c": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Base address of the write-C region." + }, + "region_size_bytes_c": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Size in bytes of the write-C region." 
+ }, + "regions": { + "type": "array", + "description": "[multi_linear] Subregions to stream in schedule order.", + "items": { + "type": "object", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "stride_words": { "type": "integer", "minimum": 1, "default": 1 }, + "read_pct": { "type": "integer", "minimum": 0, "maximum": 100 } + } + } + }, + "schedule": { + "type": "string", + "description": "[multi_linear, gather_scatter] Access schedule selector (e.g. round_robin, 4read_1write)." + }, + "burst_len": { + "type": "integer", + "minimum": 1, + "default": 1, + "description": "[multi_linear] Number of consecutive accesses per selected region before switching." + }, + "start_bank": { + "type": "integer", + "minimum": 0, + "description": "[bank_group_linear] Starting bank index." + }, + "bank_group_span": { + "type": "integer", + "minimum": 1, + "description": "[bank_group_linear] Number of banks in the active group." + }, + "stride_beats": { + "type": "integer", + "minimum": 1, + "default": 1, + "description": "[bank_group_linear] Stride in beats through the bank-group phase." + }, + "bank_group_hop": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[bank_group_linear] Optional phase hop applied when advancing to the next group window." + }, + "wen": { + "type": "integer", + "enum": [0, 1], + "description": "[bank_group_linear] Fixed direction: 1=read, 0=write. If omitted, reads/writes are mixed." + }, + "row_base_address": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[rw_rowwise] Base address of row 0." + }, + "row_size_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[rw_rowwise] Bytes touched inside each row." 
+ }, + "n_rows": { + "type": "integer", + "minimum": 1, + "description": "[rw_rowwise] Number of rows." + }, + "row_stride_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[rw_rowwise] Byte stride between consecutive row bases." + }, + "reads_per_row": { + "type": "integer", + "minimum": 0, + "description": "[rw_rowwise] Number of reads emitted per row." + }, + "writes_per_row": { + "type": "integer", + "minimum": 0, + "description": "[rw_rowwise] Number of writes emitted per row." + }, + "idle_cycles_between_rows": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[rw_rowwise] Idle cycles inserted between rows." + }, + "read_regions": { + "type": "array", + "description": "[gather_scatter] Source regions for gather reads.", + "items": { + "type": "object", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }] } + } + } + }, + "write_region": { + "type": "object", + "description": "[gather_scatter] Destination region for scatter writes.", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }] } + } + }, + "chunk_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[gather_scatter] Address increment size in bytes for gather/scatter stepping." + }, + "tile_a_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[matmul_tiled_interleave] Bytes read from A per tile schedule step." 
+ }, + "tile_b_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[matmul_tiled_interleave] Bytes read from B per tile schedule step." + }, + "tile_c_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[matmul_tiled_interleave] Bytes written to C per tile schedule step." + }, + "tiles": { + "type": "integer", + "minimum": 1, + "default": 1, + "description": "[matmul_tiled_interleave] Number of tile iterations before the pattern repeats." + }, + "ab_c_schedule": { + "type": "string", + "description": "[matmul_tiled_interleave] Tile phase order string, e.g. A_B_B_C." + }, + "idle_cycles_between_tiles": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[matmul_tiled_interleave] Idle cycles inserted between tile iterations." + }, + "hot_regions": { + "type": "array", + "description": "[hotspot_random] Weighted hot regions used for random accesses.", + "items": { + "type": "object", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }] }, + "weight": { "type": "integer", "minimum": 1, "default": 1 } + } + } + }, + "matmul_ratio_a": { + "type": "integer", + "minimum": 0, + "default": 1, + "description": "[matmul_phased] Relative weight of the read-A phase." + }, + "matmul_ratio_b": { + "type": "integer", + "minimum": 0, + "default": 1, + "description": "[matmul_phased] Relative weight of the read-B phase." + }, + "matmul_ratio_c": { + "type": "integer", + "minimum": 0, + "default": 1, + "description": "[matmul_phased] Relative weight of the write-C phase." 
+ }, + + "traffic_pct": { + "type": "integer", + "minimum": 1, + "maximum": 100, + "default": 100, + "description": "[random, linear, matmul_phased, multi_linear, bank_group_linear, rw_rowwise, gather_scatter, matmul_tiled_interleave, hotspot_random] Modeled request utilization percentage (adds req=0 idles after each request when <100)." + }, + "traffic_read_pct": { + "type": "integer", + "minimum": 0, + "maximum": 100, + "description": "[random, linear, hotspot_random] Percentage of accesses that are reads (wen=1)." + }, + "idle_cycles_between_phases": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[2d, 3d, matmul_phased] Idle cycles inserted at each inner phase boundary." + } + }, + + "allOf": [ + { + "if": { "properties": { "mem_access_type": { "const": "random" } }, "required": ["mem_access_type"] }, + "then": { "required": ["n_transactions"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "linear" } }, "required": ["mem_access_type"] }, + "then": { + "anyOf": [ + { "required": ["n_transactions"] }, + { "required": ["length"] }, + { "required": ["region_size_bytes"] } + ] + } + }, + { + "if": { "properties": { "mem_access_type": { "const": "2d" } }, "required": ["mem_access_type"] }, + "then": { "anyOf": [{ "required": ["n_transactions"] }, { "required": ["len_d0", "len_d1"] }] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "3d" } }, "required": ["mem_access_type"] }, + "then": { "anyOf": [{ "required": ["n_transactions"] }, { "required": ["len_d0", "len_d1", "len_d2"] }] } + }, + { + "if": { + "properties": { "mem_access_type": { "enum": ["matmul_phased", "matmul"] } }, + "required": ["mem_access_type"] + }, + "then": { + "anyOf": [ + { "required": ["n_transactions"] }, + { "required": ["region_size_bytes"] }, + { "required": ["matrix_m", "matrix_n", "matrix_k"] } + ] + } + }, + { + "if": { "properties": { "mem_access_type": { "const": "multi_linear" } }, "required": ["mem_access_type"] }, + 
"then": { "required": ["regions"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "bank_group_linear" } }, "required": ["mem_access_type"] }, + "then": { "required": ["start_bank", "bank_group_span", "n_transactions"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "rw_rowwise" } }, "required": ["mem_access_type"] }, + "then": { "required": ["row_base_address", "row_size_bytes", "n_rows", "row_stride_bytes", "reads_per_row", "writes_per_row"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "gather_scatter" } }, "required": ["mem_access_type"] }, + "then": { "required": ["read_regions", "write_region"] } + }, + { + "if": { + "properties": { "mem_access_type": { "enum": ["matmul_tiled_interleave", "matmul_tiled"] } }, + "required": ["mem_access_type"] + }, + "then": { + "anyOf": [ + { "required": ["n_transactions"] }, + { "required": ["region_base_address_a", "region_size_bytes_a", "region_base_address_b", "region_size_bytes_b", "region_base_address_c", "region_size_bytes_c"] } + ] + } + }, + { + "if": { "properties": { "mem_access_type": { "const": "hotspot_random" } }, "required": ["mem_access_type"] }, + "then": { "required": ["hot_regions"] } + } + ] + }, + + "master": { + "type": "object", + "description": "Per-master configuration. A master either has a flat single-pattern config or a 'patterns' list for multi-job execution.", + "unevaluatedProperties": false, + "properties": { + "id": { + "type": "integer", + "description": "Optional consistency label. Mapping is always positional." + }, + "description": { + "type": "string", + "description": "Human-readable master label." + }, + "start_delay_cycles": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "Number of idle cycles prepended to the stimulus file before the first pattern." + }, + "patterns": { + "type": "array", + "minItems": 1, + "description": "Ordered list of pattern segments. 
Executed sequentially; a PAUSE fence token is inserted between each pair of consecutive patterns. The wait_for_jobs of pattern f defines which jobs must have advanced past fence f before this master resumes. If omitted, the master config itself is treated as a single flat pattern.", + "items": { "$ref": "#/$defs/pattern" } + } + }, + "allOf": [ + { "$ref": "#/$defs/pattern" } + ] + } + } +} diff --git a/target/verif/exploration/config/hardware/hardware_hci_2hwpe_8fact.json b/target/verif/exploration/config/hardware/hardware_hci_2hwpe_8fact.json new file mode 100644 index 0000000..b349a65 --- /dev/null +++ b/target/verif/exploration/config/hardware/hardware_hci_2hwpe_8fact.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "HCI", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/target/verif/exploration/config/hardware/hardware_log_2hwpe_8fact.json b/target/verif/exploration/config/hardware/hardware_log_2hwpe_8fact.json new file mode 100644 index 0000000..a505fa1 --- /dev/null +++ b/target/verif/exploration/config/hardware/hardware_log_2hwpe_8fact.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "LOG", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/target/verif/exploration/config/hardware/hardware_mux_2hwpe_8fact.json b/target/verif/exploration/config/hardware/hardware_mux_2hwpe_8fact.json new file mode 100644 index 0000000..84c47a1 --- /dev/null +++ b/target/verif/exploration/config/hardware/hardware_mux_2hwpe_8fact.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration 
parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "MUX", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/target/verif/exploration/config/workloads/workload_dma_gemm_cores.json b/target/verif/exploration/config/workloads/workload_dma_gemm_cores.json new file mode 100644 index 0000000..aa2da43 --- /dev/null +++ b/target/verif/exploration/config/workloads/workload_dma_gemm_cores.json @@ -0,0 +1,540 @@ +{ + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes. Cores generate linear 50% traffic after each GEMM on dedicated non-overlapping regions.", + "log_masters": [ + { + "id": 0, + "description": "Core 0 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 0 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core0_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core0_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core0_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core0_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 1, + "description": "Core 1 
post-GEMM background traffic", + "patterns": [ + { + "description": "Core 1 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core1_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core1_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core1_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core1_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 2, + "description": "Core 2 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 2 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core2_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core2_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core2_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + 
"description": "Core 2 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core2_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 3, + "description": "Core 3 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 3 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core3_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core3_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core3_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core3_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 4, + "description": "Core 4 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 4 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core4_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core4_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", 
+ "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core4_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core4_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 5, + "description": "Core 5 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 5 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core5_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core5_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core5_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core5_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 6, + "description": "Core 6 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 6 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core6_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 
50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core6_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core6_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core6_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 7, + "description": "Core 7 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 7 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core7_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core7_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core7_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core7_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + 
"region_size_bytes": 1024 + } + ] + }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ + { + "id": 0, + "description": "DMA engine (linear setup + ping/pong prefetch)", + "patterns": [ + { + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A1 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A1_buf1", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A2 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A2_buf0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A0", + "mem_access_type": "linear", + "job": "dma_store_C0_buf2", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A3 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A3_buf1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A1", + "mem_access_type": "linear", + "job": "dma_store_C1_buf3", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": 
"0x08000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A2", + "mem_access_type": "linear", + "job": "dma_store_C2_buf2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A3", + "mem_access_type": "linear", + "job": "dma_store_C3_buf3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 1, + "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", + "patterns": [ + { + "description": "GEMM tile A0: A0 x B -> C0", + "mem_access_type": "matmul_phased", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A1: A1 x B -> C1", + "mem_access_type": "matmul_phased", + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", + "mem_access_type": "matmul_phased", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + 
"region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", + "mem_access_type": "matmul_phased", + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + } + ] + } + ] +} diff --git a/target/verif/exploration/config/workloads/workload_dma_gemm_cores_ideal.json b/target/verif/exploration/config/workloads/workload_dma_gemm_cores_ideal.json new file mode 100644 index 0000000..c8ab84c --- /dev/null +++ b/target/verif/exploration/config/workloads/workload_dma_gemm_cores_ideal.json @@ -0,0 +1,126 @@ +{ + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes. 
Cores generate linear 50% traffic after each GEMM on dedicated non-overlapping regions.", + "log_masters": [ + {"id": 0, "description": "Core 0 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 1, "description": "Core 1 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 2, "description": "Core 2 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 3, "description": "Core 3 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 4, "description": "Core 4 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 5, "description": "Core 5 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 6, "description": "Core 6 post-GEMM background traffic", "mem_access_type": "idle" }, + { + "id": 7, + "description": "Core 7 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 7 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core7_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + } + ] + }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ + { + "id": 0, + "description": "DMA engine (linear setup + ping/pong prefetch)", + "patterns": [ + { + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 1, + "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", + "patterns": [ + { + "description": "GEMM tile A0: A0 x B -> C0", + "mem_access_type": 
"matmul_phased", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A1: A1 x B -> C1", + "mem_access_type": "matmul_phased", + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", + "mem_access_type": "matmul_phased", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", + "mem_access_type": "matmul_phased", + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + } + ] + } + ] +} diff --git a/target/verif/exploration/exploration.mk b/target/verif/exploration/exploration.mk new file mode 100644 index 0000000..1a93e7b --- /dev/null +++ 
b/target/verif/exploration/exploration.mk @@ -0,0 +1,17 @@ +# Copyright 2026 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. +# SPDX-License-Identifier: SHL-0.51 +# +# Sergio Mazzola + +HCI_VERIF_EXPL_DIR = $(HCI_ROOT)/target/verif/exploration + +################ +# Benchmarking # +################ + +# Modify this script to configure parameters (e.g., workload to run) +BENCHMARK_SCRIPT := $(HCI_VERIF_EXPL_DIR)/scripts/run_sweep.sh + +benchmarking-sweep: + . $(BENCHMARK_SCRIPT) \ No newline at end of file diff --git a/target/verif/exploration/scripts/parse_vsim.py b/target/verif/exploration/scripts/parse_vsim.py new file mode 100644 index 0000000..9c0f0ad --- /dev/null +++ b/target/verif/exploration/scripts/parse_vsim.py @@ -0,0 +1,359 @@ +#!/usr/bin/env python3 +"""Parse the final 'Simulation Summary' section from one transcript file.""" + +import argparse +import json +import re +import sys +from pathlib import Path +from typing import Dict, List + + +SUMMARY_MARKER = "------ Simulation Summary ------" + + +class ParseError(RuntimeError): + """Raised when the transcript summary cannot be parsed.""" + + +def _as_float(value: str) -> float: + return float(value) + + +def _as_int(value: str) -> int: + return int(value) + + +def _clean_line(raw: str) -> str: + line = raw.strip() + if line.startswith("#"): + line = line[1:].strip() + return line + + +def _summary_lines(transcript_text: str) -> List[str]: + idx = transcript_text.rfind(SUMMARY_MARKER) + if idx < 0: + raise ParseError(f"Summary marker '{SUMMARY_MARKER}' not found.") + return [_clean_line(line) for line in transcript_text[idx:].splitlines()] + + +def _ensure_master(masters: Dict[str, Dict[str, object]], master_name: str) -> Dict[str, object]: + entry = masters.get(master_name) + if entry is None: + entry = {"master_name": master_name} + masters[master_name] = entry + return entry + + +def parse_summary(transcript_text: str) -> Dict[str, 
object]: + lines = _summary_lines(transcript_text) + + result: Dict[str, object] = { + "hw_config": {}, + "bandwidth": {}, + "simulation_time": {"per_master": []}, + "read_response_coverage": {}, + "transaction_counts": {}, + "request_to_grant_latency": { + "per_master": [], + "accumulated": {}, + "averages": {}, + }, + "finish": {}, + } + + masters: Dict[str, Dict[str, object]] = {} + + patterns = { + "masters": re.compile(r"^Masters:\s*CORE=(\d+)\s*DMA=(\d+)\s*EXT=(\d+)\s*HWPE=(\d+)\s*\(total=(\d+)\)$"), + "memory": re.compile( + r"^Memory:\s*banks=(\d+)\s*total_size=(\d+)\s*kB\s*data_width=(\d+)\s*bits\s*hwpe_width=(\d+)\s*lanes$" + ), + "interconnect": re.compile(r"^Interconnect:\s*SEL_LIC=(\d+)\s*TS_BIT=(\d+)\s*EXPFIFO=(\d+)$"), + "interconnect_side": re.compile( + r"^Interconnect-side:\s*TYPE=(LOG|HCI|MUX|UNKNOWN)\s*N_NARROW_HCI=(\d+)\s*N_WIDE_HCI=(\d+)\s*N_DMA=(\d+)\s*N_EXT=(\d+)$" + ), + "id_addr": re.compile(r"^ID/address:\s*IW=(\d+)\s*ADDR_WIDTH=(\d+)\s*ADDR_WIDTH_BANK=(\d+)$"), + "ideal_mem_bw": re.compile(r"^Ideal BW \(memory side\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "ideal_interco_bw": re.compile(r"^Ideal BW \(interco side\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "ideal_master_bw_legacy": re.compile(r"^Ideal BW \(master side\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "ideal_bottleneck_bw": re.compile(r"^Ideal BW \(bottleneck\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "actual_bw": re.compile( + r"^Actual BW \(completion\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle\s*\[utilization:\s*([0-9]+(?:\.[0-9]+)?)%\]$" + ), + "completion_bw_legacy": re.compile(r"^Completion bandwidth .*:\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle$"), + "completion_cycles": re.compile(r"^Completion phase duration:\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$"), + "granted": re.compile(r"^Granted transactions:\s*reads=(\d+)\s*writes=(\d+)\s*total=(\d+)$"), + "read_complete": re.compile(r"^Read-complete responses:\s*(\d+)$"), + "total_sim_cycles": re.compile(r"^Total simulation 
time:\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$"), + "per_master_sim_time": re.compile(r"^([A-Za-z0-9_]+)\s*\((master_[^)]+)\):\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$"), + "coverage": re.compile(r"^(master_[^:]+):\s*observed\s*(\d+)\s*/\s*expected\s*(\d+)$"), + "tx_counts": re.compile(r"^(master_[^:]+):\s*granted reads=(\d+)\s*writes=(\d+),\s*read-complete=(\d+)$"), + "req_gnt": re.compile( + r"^(master_[^:]+):\s*avg req->gnt stall latency\s*([0-9]+(?:\.[0-9]+)?)\s*cycles over\s*(\d+)\s*grants$" + ), + "total_accum": re.compile( + r"^Total accumulated req->gnt latency:\s*([0-9]+(?:\.[0-9]+)?)\s*cycles over\s*(\d+)\s*grants$" + ), + "class_avg": re.compile( + r"^(LOG|HWPE|Global) avg req->gnt stall latency " + r"\((weighted by grant count|mean of per-master averages)\):\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$" + ), + "finish_note": re.compile(r"^\*\* Note: \$finish\s*:\s*(.+)\((\d+)\)$"), + "finish_time": re.compile(r"^Time:\s*([0-9]+)\s*ps\s*Iteration:\s*(\d+)\s*Instance:\s*(.+)$"), + } + + for line in lines: + if not line or line == SUMMARY_MARKER: + continue + + match = patterns["masters"].match(line) + if match: + result["hw_config"]["masters"] = { + "core": _as_int(match.group(1)), + "dma": _as_int(match.group(2)), + "ext": _as_int(match.group(3)), + "hwpe": _as_int(match.group(4)), + "total": _as_int(match.group(5)), + } + continue + + match = patterns["memory"].match(line) + if match: + result["hw_config"]["memory"] = { + "banks": _as_int(match.group(1)), + "total_size_kb": _as_int(match.group(2)), + "data_width_bits": _as_int(match.group(3)), + "hwpe_width_lanes": _as_int(match.group(4)), + } + continue + + match = patterns["interconnect"].match(line) + if match: + result["hw_config"]["interconnect"] = { + "sel_lic": _as_int(match.group(1)), + "ts_bit": _as_int(match.group(2)), + "expfifo": _as_int(match.group(3)), + } + continue + + match = patterns["interconnect_side"].match(line) + if match: + narrow_hci = _as_int(match.group(2)) + wide_hci = _as_int(match.group(3)) + n_dma 
= _as_int(match.group(4)) + n_ext = _as_int(match.group(5)) + result["hw_config"]["interconnect_side"] = { + "type": match.group(1), + "n_narrow_hci": narrow_hci, + "n_wide_hci": wide_hci, + "n_dma": n_dma, + "n_ext": n_ext, + "narrow_total_ports": narrow_hci + n_dma + n_ext, + "total_initiator_ports": narrow_hci + wide_hci + n_dma + n_ext, + } + continue + + match = patterns["id_addr"].match(line) + if match: + result["hw_config"]["id_address"] = { + "iw": _as_int(match.group(1)), + "addr_width": _as_int(match.group(2)), + "addr_width_bank": _as_int(match.group(3)), + } + continue + + match = patterns["ideal_mem_bw"].match(line) + if match: + result["bandwidth"]["ideal_memory_side_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["ideal_interco_bw"].match(line) + if match: + result["bandwidth"]["ideal_interconnect_side_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["ideal_master_bw_legacy"].match(line) + if match: + result["bandwidth"]["ideal_interconnect_side_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["ideal_bottleneck_bw"].match(line) + if match: + result["bandwidth"]["ideal_bottleneck_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["actual_bw"].match(line) + if match: + result["bandwidth"]["actual_completion_bit_per_cycle"] = _as_float(match.group(1)) + result["bandwidth"]["actual_completion_utilization_pct"] = _as_float(match.group(2)) + continue + + match = patterns["completion_bw_legacy"].match(line) + if match: + result["bandwidth"]["actual_completion_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["completion_cycles"].match(line) + if match: + result["bandwidth"]["completion_phase_duration_cycles"] = _as_float(match.group(1)) + continue + + match = patterns["granted"].match(line) + if match: + result["bandwidth"]["granted_transactions"] = { + "reads": _as_int(match.group(1)), + "writes": _as_int(match.group(2)), + 
"total": _as_int(match.group(3)), + } + continue + + match = patterns["read_complete"].match(line) + if match: + result["bandwidth"]["read_complete_responses"] = _as_int(match.group(1)) + continue + + match = patterns["total_sim_cycles"].match(line) + if match: + result["simulation_time"]["total_cycles"] = _as_float(match.group(1)) + continue + + match = patterns["per_master_sim_time"].match(line) + if match: + role_name = match.group(1) + master_name = match.group(2) + sim_cycles = _as_float(match.group(3)) + entry = _ensure_master(masters, master_name) + entry["role_name"] = role_name + entry["sim_time_cycles"] = sim_cycles + continue + + match = patterns["coverage"].match(line) + if match: + master_name = match.group(1) + observed = _as_int(match.group(2)) + expected = _as_int(match.group(3)) + entry = _ensure_master(masters, master_name) + entry["read_observed"] = observed + entry["read_expected"] = expected + result["read_response_coverage"][master_name] = { + "observed": observed, + "expected": expected, + } + continue + + match = patterns["tx_counts"].match(line) + if match: + master_name = match.group(1) + reads = _as_int(match.group(2)) + writes = _as_int(match.group(3)) + read_complete = _as_int(match.group(4)) + entry = _ensure_master(masters, master_name) + entry["granted_reads"] = reads + entry["granted_writes"] = writes + entry["read_complete"] = read_complete + result["transaction_counts"][master_name] = { + "granted_reads": reads, + "granted_writes": writes, + "read_complete": read_complete, + } + continue + + match = patterns["req_gnt"].match(line) + if match: + master_name = match.group(1) + avg_cycles = _as_float(match.group(2)) + grants = _as_int(match.group(3)) + entry = _ensure_master(masters, master_name) + entry["avg_req_to_gnt_stall_latency_cycles"] = avg_cycles + entry["req_to_gnt_grants"] = grants + continue + + match = patterns["total_accum"].match(line) + if match: + result["request_to_grant_latency"]["accumulated"] = { + "cycles": 
_as_float(match.group(1)), + "grants": _as_int(match.group(2)), + } + continue + + match = patterns["class_avg"].match(line) + if match: + group = match.group(1).lower() + avg_type = match.group(2) + value = _as_float(match.group(3)) + key = "weighted_cycles" if "weighted by grant count" in avg_type else "unweighted_cycles" + averages = result["request_to_grant_latency"]["averages"] + group_entry = averages.get(group, {}) + group_entry[key] = value + averages[group] = group_entry + continue + + match = patterns["finish_note"].match(line) + if match: + result["finish"]["source"] = match.group(1).strip() + result["finish"]["line"] = _as_int(match.group(2)) + continue + + match = patterns["finish_time"].match(line) + if match: + result["finish"]["time_ps"] = _as_int(match.group(1)) + result["finish"]["iteration"] = _as_int(match.group(2)) + result["finish"]["instance"] = match.group(3).strip() + continue + + sorted_masters = [masters[name] for name in sorted(masters.keys())] + result["simulation_time"]["per_master"] = [ + { + "master_name": row["master_name"], + "role_name": row.get("role_name"), + "sim_time_cycles": row.get("sim_time_cycles"), + } + for row in sorted_masters + ] + result["request_to_grant_latency"]["per_master"] = [ + { + "master_name": row["master_name"], + "avg_req_to_gnt_stall_latency_cycles": row.get("avg_req_to_gnt_stall_latency_cycles"), + "req_to_gnt_grants": row.get("req_to_gnt_grants"), + } + for row in sorted_masters + ] + result["masters"] = sorted_masters + + return result + + +def _cli_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Parse final Simulation Summary lines from one transcript.") + parser.add_argument("--transcript", required=True, help="Path to transcript file") + parser.add_argument("--out", default="", help="Optional output JSON file path") + return parser.parse_args() + + +def main() -> int: + args = _cli_args() + transcript_path = Path(args.transcript) + if not transcript_path.exists(): + 
raise ParseError(f"Transcript not found: {transcript_path}") + + text = transcript_path.read_text(encoding="utf-8", errors="replace") + parsed = parse_summary(text) + + output = json.dumps(parsed, indent=2, sort_keys=False) + if args.out: + out_path = Path(args.out) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(output + "\n", encoding="ascii") + else: + print(output) + return 0 + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except ParseError as exc: + print(f"ERROR: {exc}", file=sys.stderr) + raise SystemExit(2) diff --git a/target/verif/exploration/scripts/plot_sweep_results.py b/target/verif/exploration/scripts/plot_sweep_results.py new file mode 100644 index 0000000..786561e --- /dev/null +++ b/target/verif/exploration/scripts/plot_sweep_results.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python3 +"""Plot sweep metrics from parsed transcript JSON files.""" + +import argparse +import json +import math +import re +from pathlib import Path +from typing import Dict, List, Tuple + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.colors import ListedColormap +from matplotlib.lines import Line2D +from matplotlib.patches import Patch + +# Cycles of ideal workload runtime +IDEAL_WORKLOAD_RUNTIME = 4121.0 + +INTERCO_ORDER = {"LOG": 0, "MUX": 1, "HCI": 2} +INTERCO_COLORS = {"LOG": "#1f77b4", "MUX": "#9467bd", "HCI": "#ff7f0e"} +IDEAL_COLOR = "#7f7f7f" + + +def _to_int(value: object, default: int = 0) -> int: + try: + return int(value) + except Exception: + return default + + +def _to_float(value: object, default: float = float("nan")) -> float: + try: + return float(value) + except Exception: + return default + + +def _master_sort_key(master: str) -> Tuple[int, int]: + if master.startswith("master_log_"): + return (0, int(master.rsplit("_", 1)[1])) + if master.startswith("master_hwpe_"): + return (1, int(master.rsplit("_", 1)[1])) + return (9, 0) + + +def 
_parse_cfg_from_filename(path: Path) -> Tuple[str, int, int]: + match = re.match(r"^hardware_([a-zA-Z]+)_([0-9]+)hwpe_([0-9]+)fact\.json$", path.name) + if not match: + return ("UNK", 0, 0) + return (match.group(1).upper(), int(match.group(2)), int(match.group(3))) + + +def _derive_interco_side(hw_cfg: Dict[str, object]) -> Dict[str, int]: + masters = hw_cfg.get("masters", {}) if isinstance(hw_cfg, dict) else {} + memory = hw_cfg.get("memory", {}) if isinstance(hw_cfg, dict) else {} + interco_side = hw_cfg.get("interconnect_side", {}) if isinstance(hw_cfg, dict) else {} + + if isinstance(interco_side, dict) and "narrow_total_ports" in interco_side: + return { + "n_narrow_hci": _to_int(interco_side.get("n_narrow_hci")), + "n_wide_hci": _to_int(interco_side.get("n_wide_hci")), + "n_dma": _to_int(interco_side.get("n_dma")), + "n_ext": _to_int(interco_side.get("n_ext")), + "narrow_total_ports": _to_int(interco_side.get("narrow_total_ports")), + "total_initiator_ports": _to_int(interco_side.get("total_initiator_ports")), + } + + interco_type = str(interco_side.get("type", "UNK")).upper() + if interco_type == "UNK": + interco_type = str(hw_cfg.get("interco_type", "UNK")).upper() + + n_core = _to_int(masters.get("core")) + n_dma = _to_int(masters.get("dma")) + n_ext = _to_int(masters.get("ext")) + n_hwpe = _to_int(masters.get("hwpe")) + hwpe_width = _to_int(memory.get("hwpe_width_lanes"), 1) + + if interco_type == "LOG": + n_narrow_hci = n_core + n_hwpe * hwpe_width + n_wide_hci = 0 + elif interco_type == "MUX": + n_narrow_hci = n_core + n_wide_hci = 1 if n_hwpe > 0 else 0 + else: + n_narrow_hci = n_core + n_wide_hci = n_hwpe + + narrow_total = n_narrow_hci + n_dma + n_ext + return { + "n_narrow_hci": n_narrow_hci, + "n_wide_hci": n_wide_hci, + "n_dma": n_dma, + "n_ext": n_ext, + "narrow_total_ports": narrow_total, + "total_initiator_ports": narrow_total + n_wide_hci, + } + + +def _load_results(results_dir: Path) -> List[Dict[str, object]]: + entries: List[Dict[str, 
object]] = [] + for path in sorted(results_dir.glob("hardware_*.json")): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + continue + + hw_cfg = data.get("hw_config", {}) + masters = hw_cfg.get("masters", {}) if isinstance(hw_cfg, dict) else {} + memory = hw_cfg.get("memory", {}) if isinstance(hw_cfg, dict) else {} + bw = data.get("bandwidth", {}) + + interco_from_name, n_hwpe_name, wf_name = _parse_cfg_from_filename(path) + interco_type = str(hw_cfg.get("interconnect_side", {}).get("type", interco_from_name)).upper() + if interco_type not in INTERCO_ORDER: + interco_type = interco_from_name + + n_hwpe = _to_int(masters.get("hwpe"), n_hwpe_name) + hwpe_wf = _to_int(memory.get("hwpe_width_lanes"), wf_name) + cfg_label = f"{interco_type}_{n_hwpe}x{hwpe_wf}" + + interco_side = _derive_interco_side(hw_cfg if isinstance(hw_cfg, dict) else {}) + banks = _to_int(memory.get("banks")) + data_width = _to_int(memory.get("data_width_bits")) + ideal_mem = float(banks * data_width) + ideal_interco = float( + interco_side["narrow_total_ports"] * data_width + + interco_side["n_wide_hci"] * hwpe_wf * data_width + ) + ideal_bottleneck = min(ideal_mem, ideal_interco) + + actual_bw = _to_float(bw.get("actual_completion_bit_per_cycle")) + util_pct = (actual_bw / ideal_bottleneck * 100.0) if ideal_bottleneck > 0 and not math.isnan(actual_bw) else float("nan") + + entries.append( + { + "path": path, + "label": cfg_label, + "interco_type": interco_type, + "n_hwpe": n_hwpe, + "hwpe_width_fact": hwpe_wf, + "json": data, + "total_sim_cycles": _to_float(data.get("simulation_time", {}).get("total_cycles")), + "avg_req_to_gnt_per_master": data.get("request_to_grant_latency", {}).get("per_master", []), + "ideal_mem_bw": ideal_mem, + "ideal_interco_bw": ideal_interco, + "ideal_bottleneck_bw": ideal_bottleneck, + "actual_bw": actual_bw, + "utilization_pct": util_pct, + } + ) + + entries.sort(key=lambda e: (e["n_hwpe"], e["hwpe_width_fact"], 
INTERCO_ORDER.get(e["interco_type"], 9))) + return entries + + +def _plot_total_sim_time(entries: List[Dict[str, object]], out_path: Path) -> None: + labels = [e["label"] for e in entries] + values = [e["total_sim_cycles"] for e in entries] + colors = [INTERCO_COLORS.get(e["interco_type"], "#333333") for e in entries] + x = np.arange(len(entries), dtype=float) + + fig, ax = plt.subplots(figsize=(max(8, 1.2 * len(entries)), 5.6)) + bars = ax.bar(x, values, color=colors, width=0.68) + ax.set_title("Total simulation time vs ideal workload runtime") + ax.set_ylabel("cycles") + ax.set_xlabel("Configuration") + ax.set_xticks(x) + ax.set_xticklabels(labels, rotation=20, ha="right") + ax.set_axisbelow(True) + ax.grid(axis="y", alpha=0.25) + + for bar, val in zip(bars, values): + if math.isnan(val): + continue + mult_of_ideal = (val / IDEAL_WORKLOAD_RUNTIME) if val > 0 and IDEAL_WORKLOAD_RUNTIME > 0 else float("nan") + pct_txt = "n/a" if math.isnan(mult_of_ideal) else f"{mult_of_ideal:.2f}X of ideal runtime" + ax.text( + bar.get_x() + bar.get_width() / 2.0, + val, + f"{val:.0f}\n({pct_txt})", + ha="center", + va="bottom", + fontsize=8, + ) + + ax.axhline( + y=IDEAL_WORKLOAD_RUNTIME, + color="red", + linestyle="--", + linewidth=1.6, + label=f"Ideal workload runtime ({IDEAL_WORKLOAD_RUNTIME:.0f} cycles)", + ) + + legend = [Patch(facecolor=INTERCO_COLORS[k], label=k) for k in ("LOG", "MUX", "HCI")] + legend.append(Line2D([0], [0], color="red", linestyle="--", linewidth=1.6, label="Ideal workload runtime")) + ax.legend(handles=legend, loc="lower left") + ax.margins(y=0.24) + fig.tight_layout(rect=(0.0, 0.02, 1.0, 0.98)) + fig.savefig(out_path, dpi=150) + plt.close(fig) + + +def _plot_per_master_avg_req_to_gnt(entries: List[Dict[str, object]], out_path: Path) -> None: + labels = [e["label"] for e in entries] + masters = sorted( + { + row.get("master_name", "") + for e in entries + for row in e.get("avg_req_to_gnt_per_master", []) + if isinstance(row, dict) and 
row.get("master_name", "") + }, + key=_master_sort_key, + ) + + if not masters: + fig, ax = plt.subplots(figsize=(8, 3)) + ax.text(0.5, 0.5, "No per-master req->gnt data", ha="center", va="center") + ax.axis("off") + fig.tight_layout() + fig.savefig(out_path, dpi=150) + plt.close(fig) + return + + matrix = np.full((len(masters), len(entries)), np.nan, dtype=float) + master_idx = {m: i for i, m in enumerate(masters)} + for j, entry in enumerate(entries): + for row in entry.get("avg_req_to_gnt_per_master", []): + if not isinstance(row, dict): + continue + m = row.get("master_name", "") + if m not in master_idx: + continue + matrix[master_idx[m], j] = _to_float(row.get("avg_req_to_gnt_stall_latency_cycles")) + + cmap = ListedColormap(plt.cm.get_cmap("viridis")(np.linspace(0.0, 1.0, 256))) + cmap.set_bad(color="white") + + fig, ax = plt.subplots(figsize=(max(10, 1.2 * len(entries)), max(4, 0.35 * len(masters)))) + im = ax.imshow(np.ma.masked_invalid(matrix), aspect="auto", cmap=cmap, interpolation="nearest") + ax.set_title("Avg req->gnt stall latency per master") + ax.set_xlabel("Configuration") + ax.set_ylabel("Master") + ax.set_xticks(np.arange(len(entries))) + ax.set_xticklabels(labels, rotation=20, ha="right") + ax.set_yticks(np.arange(len(masters))) + ax.set_yticklabels(masters) + + vmax = np.nanmax(matrix) if np.any(~np.isnan(matrix)) else 0.0 + thresh = 0.6 * vmax if vmax > 0 else 0.0 + for r in range(matrix.shape[0]): + for c in range(matrix.shape[1]): + val = matrix[r, c] + if math.isnan(val): + continue + color = "white" if val < thresh else "black" + ax.text(c, r, f"{val:.2f}", ha="center", va="center", fontsize=6, color=color) + + fig.colorbar(im, ax=ax, label="cycles") + fig.tight_layout() + fig.savefig(out_path, dpi=150) + plt.close(fig) + + +def _plot_bandwidth(entries: List[Dict[str, object]], out_path: Path) -> None: + labels = [e["label"] for e in entries] + ideal_vals = [e["ideal_bottleneck_bw"] for e in entries] + actual_vals = [e["actual_bw"] for e 
in entries] + util_vals = [e["utilization_pct"] for e in entries] + sim_cycles_vals = [e["total_sim_cycles"] for e in entries] + ideal_app_vals = [] + for actual_bw, sim_cycles in zip(actual_vals, sim_cycles_vals): + if math.isnan(actual_bw) or math.isnan(sim_cycles) or IDEAL_WORKLOAD_RUNTIME <= 0.0: + ideal_app_vals.append(float("nan")) + else: + ideal_app_vals.append(actual_bw * sim_cycles / IDEAL_WORKLOAD_RUNTIME) + actual_colors = [INTERCO_COLORS.get(e["interco_type"], "#333333") for e in entries] + x = np.arange(len(entries), dtype=float) + width = 0.34 + + fig, ax = plt.subplots(figsize=(max(9, 1.25 * len(entries)), 5.0)) + ideal_bars = ax.bar( + x - width / 2.0, + ideal_vals, + width=width, + color=IDEAL_COLOR, + label="Max interco bandwidth", + ) + actual_bars = ax.bar(x + width / 2.0, actual_vals, width=width, color=actual_colors, label="Actual BW (completion)") + ax.set_title("Bandwidth: interconnect-side ideal vs actual") + ax.set_ylabel("bit/cycle") + ax.set_xlabel("Configuration") + ax.set_xticks(x) + ax.set_xticklabels(labels, rotation=20, ha="right") + ax.set_axisbelow(True) + ax.grid(axis="y", alpha=0.25) + + for bar, val in zip(ideal_bars, ideal_vals): + ax.text( + bar.get_x() + bar.get_width() / 2.0, + val, + f"{val:.0f}", + ha="center", + va="bottom", + fontsize=8, + zorder=8, + ) + for bar, val, util in zip(actual_bars, actual_vals, util_vals): + if math.isnan(val): + continue + util_txt = "n/a" if math.isnan(util) else f"{util:.1f}% interco util" + ax.text( + bar.get_x() + bar.get_width() / 2.0, + val, + f"{val:.1f}\n({util_txt})", + ha="center", + va="bottom", + fontsize=8, + zorder=8, + ) + + # Ideal app BW computed from moved data and ideal application duration: + # ideal_app_bw = effective_bw * total_real_sim_time / IDEAL_WORKLOAD_RUNTIME + valid_ideal_app_vals = [v for v in ideal_app_vals if not math.isnan(v)] + if valid_ideal_app_vals: + ideal_workload_bw = sum(valid_ideal_app_vals) / len(valid_ideal_app_vals) + ax.axhline( + 
y=ideal_workload_bw, + color="red", + linestyle="--", + linewidth=1.8, + label="Ideal workload bandwidth", + zorder=4, + ) + ax.text( + x[-1] + 0.35, + ideal_workload_bw, + f"{ideal_workload_bw:.1f}", + color="red", + fontsize=9, + ha="right", + va="bottom", + zorder=8, + ) + + interco_legend = [Patch(facecolor=INTERCO_COLORS[k], label=f"Actual {k}") for k in ("LOG", "MUX", "HCI")] + base_legend = [Patch(facecolor=IDEAL_COLOR, label="Max interco bandwidth")] + extra_legend = [Line2D([0], [0], color="red", linestyle="--", linewidth=1.8, label="Ideal workload bandwidth")] + ax.legend(handles=base_legend + interco_legend + extra_legend, loc="best") + fig.tight_layout() + fig.savefig(out_path, dpi=150) + plt.close(fig) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Plot sweep results from parsed transcript JSON files.") + parser.add_argument( + "--results-dir", + default="target/verif/results", + help="Directory containing parsed sweep JSON files (hardware_*.json).", + ) + parser.add_argument( + "--out-dir", + default="target/verif/results/plots", + help="Output directory for generated plots.", + ) + args = parser.parse_args() + + results_dir = Path(args.results_dir) + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + entries = _load_results(results_dir) + if not entries: + raise SystemExit(f"No sweep JSON files found in: {results_dir}") + + _plot_total_sim_time(entries, out_dir / "total_simulation_time.png") + _plot_per_master_avg_req_to_gnt(entries, out_dir / "avg_req_to_gnt_per_master.png") + _plot_bandwidth(entries, out_dir / "bandwidth_ideal_vs_actual.png") + + print(f"Plots written to: {out_dir}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/target/verif/exploration/scripts/run_sweep.sh b/target/verif/exploration/scripts/run_sweep.sh new file mode 100644 index 0000000..8759a9f --- /dev/null +++ b/target/verif/exploration/scripts/run_sweep.sh @@ -0,0 +1,46 @@ +#!/bin/bash + 
+set -e + +# Make sure we are in hci root +if ! git_root=$(git rev-parse --show-toplevel 2>/dev/null) || [ "$(basename "$git_root")" != "hci" ]; then + echo "Error: This script must be run from within the hci project git repository." >&2 + exit 1 +fi +cd "$git_root" + +# Directories +VERIF_DIR=./target/verif +VERIF_EXPL_DIR=$VERIF_DIR/exploration +RESULTS_DIR=$VERIF_EXPL_DIR/results +PLOTS_DIR=$RESULTS_DIR/plots + +RUN_NAME=gemm_cores_double_buffer + +# Makefile settings (verif.mk) +export GUI=0 +export WORKLOAD_JSON=$VERIF_EXPL_DIR/config/workloads/workload_dma_gemm_cores.json +export TESTBENCH_JSON=$VERIF_DIR/config/testbench.json +# HARDWARE_JSON is swept in the loop below + +mkdir -p "$RESULTS_DIR" + +# For each hardware*.json, run simulation and parse transcript +for hardware_config in $VERIF_EXPL_DIR/config/hardware/hardware_*.json; do + make clean-verif + echo -e "\033[32;1mRunning simulation with hardware config: $hardware_config\033[0m" + export HARDWARE_JSON="$hardware_config" + make run-verif + python3 $VERIF_EXPL_DIR/scripts/parse_vsim.py --transcript $VERIF_DIR/vsim/transcript --out $RESULTS_DIR/$RUN_NAME/$(basename "$hardware_config" .json).json + # Copy html report + cp $VERIF_DIR/simvectors/generated/dataflow.html $RESULTS_DIR/$RUN_NAME/$(basename "$hardware_config" .json).html +done + +python3 $VERIF_EXPL_DIR/scripts/plot_sweep_results.py --results-dir $RESULTS_DIR/$RUN_NAME --out-dir $RESULTS_DIR/$RUN_NAME + +# Ideal run (manual) +# do not forget to change IDEAL_WORKLOAD_RUNTIME in plot_sweep_results.py +python3 $VERIF_EXPL_DIR/scripts/parse_vsim.py --transcript $VERIF_DIR/vsim/transcript --out $RESULTS_DIR/$RUN_NAME/ideal.json +cp $VERIF_DIR/simvectors/generated/dataflow.html $RESULTS_DIR/$RUN_NAME/ideal.html +# Regenerate plots with correct IDEAL_WORKLOAD_RUNTIME +python3 $VERIF_EXPL_DIR/scripts/plot_sweep_results.py --results-dir $RESULTS_DIR/$RUN_NAME --out-dir $RESULTS_DIR/$RUN_NAME diff --git a/target/verif/simvectors/README.md 
b/target/verif/simvectors/README.md index 97be798..472323b 100644 --- a/target/verif/simvectors/README.md +++ b/target/verif/simvectors/README.md @@ -1,23 +1,293 @@ -# HCI Stimuli Generator +# Simvectors Stimuli Generator -Python package for generating test stimuli for the HCI verification environment. +## Scope +`main.py` generates per-master stimuli vectors from: +- `workload.json` +- `hardware.json` +- `testbench.json` -## Usage +This README summarizes: +- available `mem_access_type` patterns and JSON parameters +- every source of `req=0` cycles +- read/write blocked-set behavior +- generated outputs and formats -### Generate Stimuli +## JSON Structure (workload) +Top-level: +- `log_masters`: list of narrow masters +- `hwpe_masters`: list of wide masters -```bash -python target/verif/simvectors/main.py --workload_config --testbench_config --hardware_config -``` +### Field Format Conventions +| Kind | Accepted JSON format | Unit / meaning | +|---|---|---| +| Address fields (`*_address`, `base`) | integer or string (`"1234"`, `"0x4000"`, `"101010"`) | byte address | +| Size fields (`*_size_bytes`, `chunk_bytes`, `tile_*_bytes`) | integer or numeric string | bytes | +| Strides (`stride0/1/2`) | integer | words (word = `DATA_WIDTH/8` for that master) | +| `row_stride_bytes` | integer or numeric string | bytes | +| `stride_beats` | integer | beats (beat = transaction width in bytes for that master) | +| Counters (`n_transactions`, `len_*`, `tiles`, `reads_per_row`, `writes_per_row`) | integer | count | +| Percentages (`traffic_pct`, `traffic_read_pct`, `read_pct`) | integer | percent | +| `wen` | `0` or `1` | `0`=write, `1`=read | -Or use the Makefile: -```bash -make stimuli -``` +### Master-Level Fields +| Field | Required | Default | Notes | +|---|---|---|---| +| `id` | no | positional index | Informational/consistency warning only. | +| `description` | no | empty | Human-readable label. 
| +| `start_delay_cycles` | no | `0` | Prepended `req=0` cycles before first pattern. | +| `patterns` | no | absent | If present, this list drives generation. | -## Configuration +Precedence/exclusivity: +- Master format is effectively either: + - flat single-pattern master (no `patterns`) + - or `patterns` list. +- If `patterns` exists, flat pattern fields at master level are ignored for traffic generation (except master-level fields like `start_delay_cycles`). -Configuration files are located in `target/verif/config/`: -- `hardware.json` - HCI hardware parameters (auto-generates `generated/hardware.mk`) -- `testbench.json` - Testbench parameters (auto-generates `generated/testbench.mk`) -- `workload.json` - Workload configuration with simulation parameters and master-specific settings +### Common Pattern Fields (apply to every pattern type) +| Field | Required | Default | Notes | +|---|---|---|---| +| `mem_access_type` | yes | none | Pattern selector. | +| `description` | no | empty | Label used in reports. | +| `job` | no | `"default"` | Dependency graph node name. | +| `wait_for_jobs` | no | `[]` | Inserts dependency gate before pattern. | +| `n_transactions` | conditional | derivable for many patterns | If omitted, derived when supported. | +| `traffic_pct` | no | `100` | Adds per-request idle shaping (`req=0`) on patterns that implement traffic shaping. | + +## Pattern Catalog +The tables below list pattern-specific fields. +Complete field set for a pattern = **common fields above + pattern-specific fields below**. +`Required = conditional` means: required unless the documented derivation path is present. + +### `idle` +No memory transaction. Emits idle and trailing `PAUSE`. + +Pattern-specific fields: none. + +### `random` +Uniform random over a region. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | yes | none | int | Not derivable for this pattern. 
| +| `region_base_address` | no | evenly partitioned per master | address (bytes) | | +| `region_size_bytes` | no | evenly partitioned per master | bytes | | +| `traffic_read_pct` | no | random R/W mix | % | If set, deterministic read/write split. | + +### `linear` +1D strided stream. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from `length` or `region_size_bytes`. | +| `length` | no | none | int | Alias source for derived `n_transactions`. | +| `start_address` | no | `"0"` | address (bytes) | If absent and `region_base_address` exists, uses `region_base_address`. | +| `stride0` | no | `0` (or `1` if `region_size_bytes` set) | words | | +| `region_base_address` | no | evenly partitioned per master | address (bytes) | Used for region context and start fallback. | +| `region_size_bytes` | no | evenly partitioned per master | bytes | Can derive `n_transactions`. | +| `traffic_read_pct` | no | random R/W mix | % | | + +### `2d` +2D walk. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from `len_d0 * len_d1`. | +| `start_address` | no | `"0"` | address (bytes) | | +| `stride0` | no | `0` | words | | +| `len_d0` | conditional | none | int | | +| `stride1` | no | `0` | words | | +| `len_d1` | conditional | none | int | | +| `idle_cycles_between_phases` | no | `0` | cycles | Inserts explicit boundary idles. | + +### `3d` +3D walk. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from `len_d0 * len_d1 * len_d2`. 
| +| `start_address` | no | `"0"` | address (bytes) | | +| `stride0` | no | `0` | words | | +| `len_d0` | conditional | none | int | | +| `stride1` | no | `0` | words | | +| `len_d1` | conditional | none | int | | +| `stride2` | no | `0` | words | | +| `len_d2` | conditional | none | int | | +| `idle_cycles_between_phases` | no | `0` | cycles | Inserts explicit boundary idles. | + +### `matmul_phased` (alias: `matmul`) +Phased A-read / B-read / C-write traffic. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from region size or matrix dims. | +| `region_base_address`, `region_size_bytes` | conditional | evenly partitioned | bytes | Combined region (auto A/B/C split). | +| `matrix_m`, `matrix_n`, `matrix_k` | no | none | int | Alternative source for derived `n_transactions`. | +| `region_base_address_a/b/c`, `region_size_bytes_a/b/c` | no | none | bytes | Explicit per-phase regions. | +| `matmul_ratio_a/b/c` | no | `1/1/1` | relative weights | | +| `idle_cycles_between_phases` | no | `0` | cycles | Inserts explicit phase-boundary idles. | + +Mutual exclusivity / precedence: +- If explicit `*_a/b/c` regions are provided, they take precedence over combined-region auto-split. + +### `multi_linear` +Multiple subregions, schedule-driven interleave. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `regions` | yes | none | array | Each entry has `base`, `size_bytes`, optional `stride_words`, `read_pct`. | +| `schedule` | no | `round_robin` | string | | +| `burst_len` | no | `1` | int | | +| `n_transactions` | conditional | derived | int | Derivable from sum of region sizes. | + +### `bank_group_linear` +Linear stream constrained by bank group phase controls. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `start_bank` | yes | none | int | 0-based bank index. 
| +| `bank_group_span` | yes | none | int | Number of banks in active group. | +| `stride_beats` | no | `1` | beats | | +| `bank_group_hop` | no | `0` | int | Group-phase hop per wrap. | +| `wen` | no | mixed R/W | `0` or `1` | Fixed direction if set. | +| `n_transactions` | yes | none | int | Required for this pattern. | + +### `rw_rowwise` +Per-row read phase then write phase. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `row_base_address` | yes | none | address (bytes) | | +| `row_size_bytes` | yes | none | bytes | | +| `n_rows` | yes | none | int | | +| `row_stride_bytes` | yes | none | bytes | | +| `reads_per_row` | yes | none | int | | +| `writes_per_row` | yes | none | int | | +| `idle_cycles_between_rows` | no | `0` | cycles | Inserts explicit row-boundary idles. | +| `n_transactions` | conditional | derived | int | Derivable as `n_rows * (reads_per_row + writes_per_row)`. | + +### `gather_scatter` +Gather from multiple read regions, scatter to write region. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `read_regions` | yes | none | array | Each entry: `base`, `size_bytes`. | +| `write_region` | yes | none | object | `base`, `size_bytes`. | +| `chunk_bytes` | no | transaction width | bytes | Address increment granularity. | +| `schedule` | no | `4read_1write` | string | | +| `n_transactions` | conditional | derived | int | Derivable from region sizes and chunk. | + +### `matmul_tiled_interleave` (alias: `matmul_tiled`) +Tile-like interleaving among A/B/C streams. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `region_base_address`, `region_size_bytes` | no | evenly partitioned | bytes | Used as fallback context for auto split when explicit A/B/C are absent. | +| `region_base_address_a/b/c`, `region_size_bytes_a/b/c` | conditional | fallback split | bytes | Preferred explicit mode. 
| +| `tile_a_bytes`, `tile_b_bytes`, `tile_c_bytes` | no | transaction width | bytes | Tile step payloads per stream. | +| `tiles` | no | `1` | int | | +| `ab_c_schedule` | no | `A_B_C` | string | | +| `idle_cycles_between_tiles` | no | `0` | cycles | Inserts explicit tile-boundary idles. | +| `n_transactions` | conditional | derived | int | Can be derived from tile parameters/schedule. | + +Mutual exclusivity / precedence: +- Preferred: explicit A/B/C regions. +- If missing, generator falls back to splitting combined region context. + +### `hotspot_random` +Weighted random traffic across hot regions. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `hot_regions` | yes | none | array | Each entry: `base`, `size_bytes`, optional `weight` (default `1`). | +| `n_transactions` | no | weak fallback | int | Prefer setting explicitly. | +| `traffic_read_pct` | no | random R/W mix | % | | + +## All Sources of `req=0` Cycles +`req=0` can be generated by: + +1. `traffic_pct` shaping +- For every emitted request, inserts `idles_per_req = round((100-pct)/pct)` idle lines. +- Applies in random/linear and all new patterns that support `traffic_pct`. + +2. Boundary idle knobs (JSON) +- `idle_cycles_between_phases` in `2d`, `3d`, `matmul_phased` +- `idle_cycles_between_rows` in `rw_rowwise` +- `idle_cycles_between_tiles` in `matmul_tiled_interleave` + +3. `start_delay_cycles` (per master) +- Prepends idle lines before first pattern. + +4. Dependency gate for `wait_for_jobs` +- For each dependent pattern, generator inserts a synthetic idle+`PAUSE` gate before real traffic. + +5. `idle` pattern +- Explicitly emits idle and `PAUSE`. 
+ +## Read/Write Blocked-Set Functionality +Address filtering is implemented inside pattern generators via: +- `_is_allowed(add, wen, read_blocked_set, write_blocked_set)` +- `_record_access(add, wen, read_blocked_set, write_blocked_set)` + +Behavior (within one pattern invocation): +- Read checks (`wen=1`) consult `read_blocked_set`. +- Write checks (`wen=0`) consult `write_blocked_set`. +- On every emitted access, the address is added to `write_blocked_set`. +- On emitted writes only, the address is also added to `read_blocked_set`. + +Effective policy: +- read after read: allowed +- write after read: blocked +- read after write: blocked +- write after write: blocked + +Notes: +- Blocking state is pattern-local (it does not persist across patterns). +- Generators are strict about transaction count: each non-idle pattern must emit exactly `n_transactions` (`N_TEST`). +- If blocking rules make the requested count unreachable for a pattern, generation fails with an explicit error instead of silently under-emitting. + +## Outputs + +### 1. Stimuli vectors +Path: +- `target/verif/simvectors/generated/stimuli/master_log_.txt` +- `target/verif/simvectors/generated/stimuli/master_hwpe_.txt` + +Per-cycle vector line format: +- `req id wen data add` +- `req`: `1` active request, `0` idle +- `id`: request ID (`IW` bits) +- `wen`: `1` read, `0` write +- `data`: payload (`DATA_WIDTH` bits for narrow, `HWPE_WIDTH_FACT*DATA_WIDTH` for wide) +- `add`: byte address (`ADD_WIDTH` bits) + +Fence token: +- A standalone line `PAUSE` is emitted at end of each pattern segment. + +### 2. Memory map report +Path: +- `target/verif/simvectors/generated/memory_map.txt` + +Contains: +- per-pattern region and traffic summary +- dependency/fence map +- temporal schedule summary +- region lifetimes and overlaps context + +### 3. 
Dataflow visualization +Path: +- `target/verif/simvectors/generated/dataflow.html` + +Contains: +- execution timeline (SVG) +- region-map blocks +- per-region usage cards +- overlap table + +### 4. Optional outputs +- `--golden`: emits expected read-data vectors under `generated/golden/` +- `--emit_phases_mk `: emits fence/dependency Makefile fragment + +## Recommended Extra Documentation +- one minimal JSON example per pattern +- exact dependency semantics for `job` / `wait_for_jobs` with 2-3 pattern chain examples +- known caveat: timeline model in report is simplified and may differ from full RTL contention timing diff --git a/target/verif/simvectors/hci_stimuli/__init__.py b/target/verif/simvectors/hci_stimuli/__init__.py index 3f4cd89..64b9b07 100644 --- a/target/verif/simvectors/hci_stimuli/__init__.py +++ b/target/verif/simvectors/hci_stimuli/__init__.py @@ -6,7 +6,5 @@ """ from .generator import StimuliGenerator -from .processor import unfold_raw_txt, pad_txt_files - -__all__ = ['StimuliGenerator', 'unfold_raw_txt', 'pad_txt_files'] +__all__ = ['StimuliGenerator'] diff --git a/target/verif/simvectors/hci_stimuli/generator.py b/target/verif/simvectors/hci_stimuli/generator.py index ca5e2d0..52993f5 100644 --- a/target/verif/simvectors/hci_stimuli/generator.py +++ b/target/verif/simvectors/hci_stimuli/generator.py @@ -1,186 +1,84 @@ -"""Stimuli Generator class and access-pattern generators.""" -import random -import os - -class StimuliGenerator: - def __init__(self,IW,WIDTH_OF_MEMORY,N_BANKS,TOT_MEM_SIZE,DATA_WIDTH,ADD_WIDTH,filepath,N_TEST,EXACT_OR_MAX_OFFSET,CYCLE_OFFSET,MASTER_NUMBER_IDENTIFICATION): - self.WIDTH_OF_MEMORY = WIDTH_OF_MEMORY - self.WIDTH_OF_MEMORY_BYTE = int(WIDTH_OF_MEMORY/8) - self.N_BANKS = N_BANKS - self.TOT_MEM_SIZE = TOT_MEM_SIZE - self.DATA_WIDTH = DATA_WIDTH - self.ADD_WIDTH = int(ADD_WIDTH) - self.filepath = filepath - os.makedirs(os.path.dirname(filepath),exist_ok=True) - self.N_TEST = N_TEST - self.EXACT_OR_MAX_OFFSET = 
EXACT_OR_MAX_OFFSET - self.CYCLE_OFFSET = CYCLE_OFFSET - self.IW = IW - self.MASTER_NUMBER_IDENTIFICATION = MASTER_NUMBER_IDENTIFICATION - - def random_data(self): - data_decimal = random.randint(0, (2**(self.DATA_WIDTH))-1) # generate random data - data = bin(data_decimal)[2:].zfill(self.DATA_WIDTH) - return data - - def data_wen_offset(self): - wen = random.randint(0,1) # write enable signal (1 = read, 0 = write) - if (self.EXACT_OR_MAX_OFFSET): - cycle_offset = random.randint(1,self.CYCLE_OFFSET) - else: - cycle_offset = self.CYCLE_OFFSET - if wen: - data = "0" * self.DATA_WIDTH - else: - data = self.random_data() - return data, wen, cycle_offset - - - def random_gen(self,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - for test in range(self.N_TEST): - data, wen, cycle_offset = self.data_wen_offset() - while True: - add_decimal = int((random.randint(0, int((self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE)/self.WIDTH_OF_MEMORY_BYTE)))*(self.WIDTH_OF_MEMORY_BYTE)) # generate a random word-aligned memory address. 
- if add_decimal > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - add_decimal = add_decimal - self.TOT_MEM_SIZE*1024 #rolls over - add = bin(add_decimal)[2:].zfill(self.ADD_WIDTH) - - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - break - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - return id - - def linear_gen(self,stride0,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - next_address = int(start_address,2) - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - for test in range(self.N_TEST): - data, wen, cycle_offset = self.data_wen_offset() - - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - next_address += (self.WIDTH_OF_MEMORY_BYTE)*stride0 #word-aligned memory address - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - break - file.write(bin(id)[2:].zfill(self.IW) + " " + 
str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - - return id - - def gen_2d(self,stride0,len_d0,stride1,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - start_address = int(start_address,2) - next_address = start_address - j = 0 - STOP = 0 - while True: - for i in range(len_d0): - data, wen, cycle_offset = self.data_wen_offset() - next_address = start_address + i*(self.WIDTH_OF_MEMORY_BYTE)*stride0 + j*(self.WIDTH_OF_MEMORY_BYTE)*stride1 #word-aligned memory address - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - if STOP: - break - j = j + 1 - - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - return id - - def 
gen_3d(self,stride0,len_d0,stride1,len_d1,stride2,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - start_address = int(start_address,2) - next_address = start_address - k = 0 - STOP = 0 - while True: - for j in range(len_d1): - for i in range(len_d0): - data, wen, cycle_offset = self.data_wen_offset() - next_address = start_address + i*(self.WIDTH_OF_MEMORY_BYTE)*stride0 + j*(self.WIDTH_OF_MEMORY_BYTE)*stride1 + k*(self.WIDTH_OF_MEMORY_BYTE)*stride2 #word-aligned memory address - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - if STOP: - break - if STOP: - break - k = k + 1 - - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - return id - - +"""StimuliGenerator: infrastructure for writing cycle-accurate stimuli files. 
+ +Each output file has one line per simulation cycle in the format: + req(1b) id(IWb) wen(1b) data(Nb) add(Ab) + +req=0 means no transaction that cycle (id/wen/data/add are don't-cares). +req=1 means an active transaction. +""" + +import os +import random + +from .patterns import PatternsMixin + + +class StimuliGenerator(PatternsMixin): + def __init__( + self, + IW, + WIDTH_OF_MEMORY, + N_BANKS, + TOT_MEM_SIZE, + DATA_WIDTH, + ADD_WIDTH, + filepath, + N_TEST, + MASTER_NUMBER_IDENTIFICATION, + ): + self.WIDTH_OF_MEMORY = WIDTH_OF_MEMORY + self.WIDTH_OF_MEMORY_BYTE = int(WIDTH_OF_MEMORY / 8) + self.N_BANKS = N_BANKS + self.TOT_MEM_SIZE = TOT_MEM_SIZE + self.DATA_WIDTH = DATA_WIDTH + self.ADD_WIDTH = int(ADD_WIDTH) + self.filepath = filepath + os.makedirs(os.path.dirname(filepath), exist_ok=True) + self.N_TEST = N_TEST + self.IW = IW + self.MASTER_NUMBER_IDENTIFICATION = MASTER_NUMBER_IDENTIFICATION + + def _format_id(self, id_value): + return bin(id_value % (1 << self.IW))[2:].zfill(self.IW) + + def random_data(self): + data_decimal = random.randint(0, (2 ** self.DATA_WIDTH) - 1) + return bin(data_decimal)[2:].zfill(self.DATA_WIDTH) + + def _write_req(self, file_obj, id_value, wen, data, add): + """Write one active-request line (req=1).""" + file_obj.write( + "1 " + + self._format_id(id_value) + + " " + + str(wen) + + " " + + data + + " " + + add + + "\n" + ) + + def _write_idle(self, file_obj): + """Write one idle line (req=0).""" + file_obj.write( + "0 " + + "0" * self.IW + + " 0 " + + "0" * self.DATA_WIDTH + + " " + + "0" * self.ADD_WIDTH + + "\n" + ) + + def _write_pause(self, file_obj): + """Write a PAUSE fence token line.""" + file_obj.write("PAUSE\n") + + def data_wen(self): + wen = random.randint(0, 1) # 1=read, 0=write + if wen: + data = "0" * self.DATA_WIDTH + else: + data = self.random_data() + return data, wen diff --git a/target/verif/simvectors/hci_stimuli/patterns.py b/target/verif/simvectors/hci_stimuli/patterns.py new file mode 100644 index 
0000000..cf080fe --- /dev/null +++ b/target/verif/simvectors/hci_stimuli/patterns.py @@ -0,0 +1,850 @@ +"""Access-pattern generators for StimuliGenerator. + +Each method writes a cycle-accurate stimuli file directly (one line per cycle): + req(1b) id(IWb) wen(1b) data(Nb) add(Ab) + +req=0 lines are idle cycles. req=1 lines are active transactions. + +Fence semantics (one trailing PAUSE per pattern): + Each pattern ends with a PAUSE. fence_idx[i] increments when resume_i fires while + fence_reached_o is high (i.e. while the driver is sitting at the PAUSE). + fence_idx[i] >= k means driver i has been granted to leave fence k-1, + i.e. pattern k-1 is complete and driver i is about to start pattern k. + + resume_i fires when the dependencies of the NEXT pattern are satisfied. + So resume_i = "your next job's inputs are ready, proceed". + + Trailing PAUSE of the last pattern has mask=0 → resume_i fires in one cycle + → fence_idx advances to N_patterns, signalling final completion to dependents. + +All generators accept append=True to open the file in append mode. 
+""" + +import random + +class PatternsMixin: + + @staticmethod + def _parse_address(addr_str): + s = str(addr_str) + if s.startswith('0x') or s.startswith('0X'): return int(s, 16) + if set(s) <= {'0', '1'}: return int(s, 2) + return int(s, 0) + + @staticmethod + def _align_down(value, alignment): + if alignment <= 0: return value + return (value // alignment) * alignment + + @staticmethod + def _phase_counts(total_ops, ratio_a, ratio_b, ratio_c): + ra = max(0, int(ratio_a)); rb = max(0, int(ratio_b)); rc = max(0, int(ratio_c)) + if ra == 0 and rb == 0 and rc == 0: ra, rb, rc = 1, 1, 1 + s = ra + rb + rc + ca = (total_ops * ra) // s; cb = (total_ops * rb) // s; cc = total_ops - ca - cb + if total_ops >= 3: + if ra > 0 and ca == 0: ca = 1; cc = max(0, cc-1) + if rb > 0 and cb == 0: cb = 1; cc = max(0, cc-1) + if rc > 0 and cc == 0: + if ca > 1: ca -= 1; cc = 1 + elif cb > 1: cb -= 1; cc = 1 + return ca, cb, cc + + @staticmethod + def _idles_per_req(traffic_pct): + traffic_pct = max(1, min(100, int(traffic_pct))) + return 0 if traffic_pct >= 100 else int(round((100 - traffic_pct) / traffic_pct)) + + def _is_allowed(self, add, wen, read_blocked_set, write_blocked_set): + return add not in (read_blocked_set if wen else write_blocked_set) + + def _record_access(self, add, wen, read_blocked_set, write_blocked_set): + write_blocked_set.add(add) + if not wen: + read_blocked_set.add(add) + + @staticmethod + def _init_blocked_sets(read_blocked, write_blocked): + return set(read_blocked or []), set(write_blocked or []) + + @staticmethod + def _extend_unique_sorted(target, values): + if not isinstance(target, list): + return + known = set(target) + for v in sorted(values): + if v in known: + continue + target.append(v) + known.add(v) + + def _commit_blocked_sets(self, read_blocked, write_blocked, read_blocked_set, write_blocked_set): + self._extend_unique_sorted(read_blocked, read_blocked_set) + self._extend_unique_sorted(write_blocked, write_blocked_set) + + def 
_require_exact_emits(self, pattern_name, id_start, id_value): + emitted = int(id_value - id_start) + expected = int(self.N_TEST) + if emitted == expected: + return + raise RuntimeError( + f"{pattern_name}: emitted {emitted} transaction(s), expected {expected}. " + "Adjust region/shape/traffic to satisfy the read/write blocked policy." + ) + + def _open(self, append): + return open(self.filepath, "a" if append else "w", encoding="ascii") + + def _total_mem_bytes(self): + return int(self.TOT_MEM_SIZE * 1024) + + def _normalize_addr(self, addr): + total = self._total_mem_bytes() + if total <= self.WIDTH_OF_MEMORY_BYTE: + return 0 + max_addr = total - self.WIDTH_OF_MEMORY_BYTE + a = int(addr) % total + if a > max_addr: + a = max_addr + return a + + @staticmethod + def _parse_read_write_schedule(schedule, default="4read_1write"): + raw = str(schedule if schedule is not None else default).strip().lower() + if not raw: + raw = default + tokens = [] + for chunk in raw.replace("-", "_").split("_"): + c = chunk.strip() + if not c: + continue + count = 1 + word = c + if c[0].isdigit(): + i = 0 + while i < len(c) and c[i].isdigit(): + i += 1 + count = max(1, int(c[:i])) + word = c[i:] + elif c[-1].isdigit(): + i = len(c) - 1 + while i >= 0 and c[i].isdigit(): + i -= 1 + count = max(1, int(c[i + 1:])) + word = c[:i + 1] + word = word.strip() + if word in {"read", "r"}: + tokens.extend(["R"] * count) + elif word in {"write", "w"}: + tokens.extend(["W"] * count) + if not tokens: + return ["R", "R", "R", "R", "W"] + return tokens + + @staticmethod + def _parse_abc_schedule(schedule, default="A_B_C"): + raw = str(schedule if schedule is not None else default).strip().upper() + if not raw: + raw = default + tokens = [] + for chunk in raw.replace("-", "_").split("_"): + c = chunk.strip() + if not c: + continue + count = 1 + letter = c + if c[0].isdigit(): + i = 0 + while i < len(c) and c[i].isdigit(): + i += 1 + count = max(1, int(c[:i])) + letter = c[i:] + elif c[-1].isdigit(): + i 
= len(c) - 1 + while i >= 0 and c[i].isdigit(): + i -= 1 + count = max(1, int(c[i + 1:])) + letter = c[:i + 1] + letter = letter.strip().upper() + if letter in {"A", "B", "C"}: + tokens.extend([letter] * count) + if not tokens: + return ["A", "B", "C"] + return tokens + + # ------------------------------------------------------------------ # + # Access patterns — each writes: transactions | PAUSE # + # ------------------------------------------------------------------ # + + def random_gen(self, id_start, read_blocked, write_blocked, + region_base=0, region_size=None, traffic_pct=100, + traffic_read_pct=None, append=False): + total = int(self.TOT_MEM_SIZE * 1024) + if region_size is None: region_size = total + region_size = min(region_size, total - region_base) + n_words = max(1, region_size // self.WIDTH_OF_MEMORY_BYTE) + n_idles = self._idles_per_req(traffic_pct) + if traffic_read_pct is not None: + rpct = max(0, min(100, int(traffic_read_pct))) + n_reads = (self.N_TEST * rpct) // 100 + wen_seq = [1]*n_reads + [0]*(self.N_TEST-n_reads) + else: + wen_seq = None + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + max_attempts = max(1, n_words * 4) + with self._open(append) as f: + for i in range(self.N_TEST): + wen = wen_seq[i] if wen_seq is not None else None + if wen is None: data, wen = self.data_wen() + else: data = "0"*self.DATA_WIDTH if wen else self.random_data() + placed = False + for _ in range(max_attempts): + ad = region_base + random.randint(0, int(n_words)-1)*self.WIDTH_OF_MEMORY_BYTE + add = bin(int(ad))[2:].zfill(self.ADD_WIDTH) + if self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + self._record_access(add, wen, read_blocked_set, write_blocked_set) + placed = True + break + if not placed: + continue + self._write_req(f, id_value, wen, data, add); id_value += 1 + for _ in range(n_idles): self._write_idle(f) + self._write_pause(f) + self._commit_blocked_sets(read_blocked, 
write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("random", id_start, id_value) + return id_value + + def linear_gen(self, stride0, start_address, id_start, read_blocked, write_blocked, + traffic_pct=100, traffic_read_pct=None, append=False): + n_idles = self._idles_per_req(traffic_pct) + if traffic_read_pct is not None: + rpct = max(0, min(100, int(traffic_read_pct))) + n_reads = (self.N_TEST * rpct) // 100 + wen_seq = [1]*n_reads + [0]*(self.N_TEST-n_reads) + else: + wen_seq = None + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + with self._open(append) as f: + addr = self._parse_address(start_address) + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + for i in range(self.N_TEST): + wen = wen_seq[i] if wen_seq is not None else None + if wen is None: data, wen = self.data_wen() + else: data = "0"*self.DATA_WIDTH if wen else self.random_data() + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + addr += self.WIDTH_OF_MEMORY_BYTE * stride0 + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add); id_value += 1 + for _ in range(n_idles): self._write_idle(f) + self._write_pause(f) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("linear", id_start, id_value) + return id_value + + def gen_2d(self, stride0, len_d0, stride1, start_address, id_start, + read_blocked, write_blocked, idle_cycles_between_phases=0, append=False): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + with self._open(append) as f: + base = self._parse_address(start_address); j = 0 + while 
id_value - id_start < self.N_TEST: + emitted_before = id_value + for i in range(len_d0): + data, wen = self.data_wen() + addr = base + i*self.WIDTH_OF_MEMORY_BYTE*stride0 + j*self.WIDTH_OF_MEMORY_BYTE*stride1 + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add); id_value += 1 + if id_value - id_start >= self.N_TEST: break + for _ in range(idle_cycles_between_phases): self._write_idle(f) + if id_value == emitted_before: + break + j += 1 + self._write_pause(f) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("2d", id_start, id_value) + return id_value + + def gen_3d(self, stride0, len_d0, stride1, len_d1, stride2, start_address, id_start, + read_blocked, write_blocked, idle_cycles_between_phases=0, append=False): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + with self._open(append) as f: + base = self._parse_address(start_address); k = 0 + while id_value - id_start < self.N_TEST: + emitted_before = id_value + for j in range(len_d1): + for i in range(len_d0): + data, wen = self.data_wen() + addr = base + i*self.WIDTH_OF_MEMORY_BYTE*stride0 + j*self.WIDTH_OF_MEMORY_BYTE*stride1 + k*self.WIDTH_OF_MEMORY_BYTE*stride2 + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add); id_value += 1 + if id_value - id_start >= self.N_TEST: break + if id_value - id_start >= 
self.N_TEST: break + for _ in range(idle_cycles_between_phases): self._write_idle(f) + if id_value == emitted_before: + break + k += 1 + self._write_pause(f) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("3d", id_start, id_value) + return id_value + + def idle_gen(self, id_start, append=False): + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + return id_start + + def matmul_phased_gen(self, id_start, read_blocked, write_blocked, + region_base_address, region_size_bytes, + matmul_ratio_a=1, matmul_ratio_b=1, matmul_ratio_c=1, + traffic_pct=100, + idle_cycles_between_phases=0, + region_base_address_a=None, region_size_bytes_a=None, + region_base_address_b=None, region_size_bytes_b=None, + region_base_address_c=None, region_size_bytes_c=None, + append=False): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = max(1, self.DATA_WIDTH // 8); tm = int(self.TOT_MEM_SIZE * 1024) + n_idles = self._idles_per_req(traffic_pct) + + def _res(bo, so, fb, fs): + b = self._align_down(int(bo if bo is not None else fb), ab) + s = self._align_down(int(so if so is not None else fs), ab) + if b+s > tm: s = self._align_down(tm-b, ab) + return b, s + + if region_base_address_a is not None and region_size_bytes_a is not None: + a_base, a_size = _res(region_base_address_a, region_size_bytes_a, 0, 0) + b_base, b_size = _res(region_base_address_b, region_size_bytes_b, a_base, a_size) + c_base, c_size = _res(region_base_address_c, region_size_bytes_c, a_base, a_size) + else: + base = self._align_down(int(region_base_address), ab) + size = self._align_down(int(region_size_bytes), ab) + if base+size > tm: size = self._align_down(tm-base, ab) + rw = size // ab + if rw < 3: + with self._open(append) as f: self._write_idle(f); self._write_pause(f) + self._require_exact_emits("matmul_phased", id_start, id_value) + return 
id_value + aw=max(1,rw//3); bw=max(1,rw//3); cw=rw-aw-bw + a_base=base; a_size=aw*ab; b_base=a_base+a_size; b_size=bw*ab + c_base=b_base+b_size; c_size=cw*ab + + ca, cb, cc = self._phase_counts(self.N_TEST, matmul_ratio_a, matmul_ratio_b, matmul_ratio_c) + + def _emit(fobj, count, wen, pb, pe): + nonlocal id_value + addr = pb + for _ in range(count): + data = "0"*self.DATA_WIDTH if wen else self.random_data() + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + addr += ab + if addr >= pe: + addr = pb + continue + self._write_req(fobj, id_value, wen, data, add) + self._record_access(add, wen, read_blocked_set, write_blocked_set) + id_value += 1; addr += ab + if addr >= pe: addr = pb + for _ in range(n_idles): self._write_idle(fobj) + + with self._open(append) as f: + _emit(f, ca, 1, a_base, a_base+a_size) + if ca > 0 and (cb > 0 or cc > 0): + for _ in range(idle_cycles_between_phases): self._write_idle(f) + _emit(f, cb, 1, b_base, b_base+b_size) + if cb > 0 and cc > 0: + for _ in range(idle_cycles_between_phases): self._write_idle(f) + _emit(f, cc, 0, c_base, c_base+c_size) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("matmul_phased", id_start, id_value) + return id_value + + def multi_linear_gen( + self, + id_start, + read_blocked, + write_blocked, + regions, + schedule="round_robin", + burst_len=1, + traffic_pct=100, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + burst = max(1, int(burst_len)) + + norm_regions = [] + for reg in regions or []: + base = self._align_down(int(reg.get("base", 0)), ab) + size = self._align_down(int(reg.get("size_bytes", 0)), ab) + if size <= 0: + continue + if base >= tm: + 
base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + if size <= 0: + continue + stride_words = max(1, int(reg.get("stride_words", 1))) + read_pct = reg.get("read_pct") + if read_pct is not None: + read_pct = max(0, min(100, int(read_pct))) + norm_regions.append({ + "base": base, + "size": size, + "stride_words": stride_words, + "read_pct": read_pct, + "offset": 0, + }) + + if not norm_regions: + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + self._require_exact_emits("multi_linear", id_start, id_value) + return id_value + + rr = 0 + stalled_rounds = 0 + with self._open(append) as f: + while id_value - id_start < self.N_TEST: + emitted_before = id_value + reg = norm_regions[rr % len(norm_regions)] + rr += 1 + chunk = burst if str(schedule).strip().lower() == "round_robin" else max(1, self.N_TEST) + for _ in range(chunk): + if id_value - id_start >= self.N_TEST: + break + addr = reg["base"] + reg["offset"] + add = bin(self._normalize_addr(addr))[2:].zfill(self.ADD_WIDTH) + if reg["read_pct"] is None: + data, wen = self.data_wen() + else: + wen = 1 if random.randint(1, 100) <= reg["read_pct"] else 0 + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + step = reg["stride_words"] * ab + reg["offset"] = (reg["offset"] + step) % reg["size"] + if id_value == emitted_before: + stalled_rounds += 1 + if stalled_rounds >= len(norm_regions): + break + else: + stalled_rounds = 0 + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("multi_linear", id_start, id_value) + return id_value + + def bank_group_linear_gen( + self, + id_start, + read_blocked, + write_blocked, + 
start_bank, + bank_group_span, + stride_beats=1, + bank_group_hop=0, + wen=None, + traffic_pct=100, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + span = max(1, min(int(bank_group_span), int(self.N_BANKS))) + start_bank = int(start_bank) % max(1, int(self.N_BANKS)) + stride = max(1, int(stride_beats)) + hop = max(0, int(bank_group_hop)) + + with self._open(append) as f: + for tx in range(self.N_TEST): + phase = tx * stride + group_idx = phase // span + bank_base = (start_bank + group_idx * hop * span) % self.N_BANKS + bank = (bank_base + (phase % span)) % self.N_BANKS + row = group_idx + word_idx = row * self.N_BANKS + bank + addr = self._normalize_addr(word_idx * ab) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if wen is None: + data, wen_cur = self.data_wen() + else: + wen_cur = 1 if int(wen) else 0 + data = "0" * self.DATA_WIDTH if wen_cur else self.random_data() + if not self._is_allowed(add, wen_cur, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen_cur, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen_cur, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("bank_group_linear", id_start, id_value) + return id_value + + def rw_rowwise_gen( + self, + id_start, + read_blocked, + write_blocked, + row_base_address, + row_size_bytes, + n_rows, + row_stride_bytes, + reads_per_row, + writes_per_row, + traffic_pct=100, + idle_cycles_between_rows=0, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + n_idles = 
self._idles_per_req(traffic_pct) + base = self._align_down(int(row_base_address), ab) + row_size = max(ab, self._align_down(int(row_size_bytes), ab)) + row_stride = max(ab, self._align_down(int(row_stride_bytes), ab)) + n_rows = max(0, int(n_rows)) + reads_per_row = max(0, int(reads_per_row)) + writes_per_row = max(0, int(writes_per_row)) + + with self._open(append) as f: + for r in range(n_rows): + if id_value - id_start >= self.N_TEST: + break + row_base = self._normalize_addr(base + r * row_stride) + for i in range(reads_per_row): + if id_value - id_start >= self.N_TEST: + break + addr = self._normalize_addr(row_base + (i * ab) % row_size) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + wen = 1 + data = "0" * self.DATA_WIDTH + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + for i in range(writes_per_row): + if id_value - id_start >= self.N_TEST: + break + addr = self._normalize_addr(row_base + (i * ab) % row_size) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + wen = 0 + data = self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + if r < n_rows - 1: + for _ in range(max(0, int(idle_cycles_between_rows))): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("rw_rowwise", id_start, id_value) + return id_value + + def gather_scatter_gen( + self, + id_start, + read_blocked, + write_blocked, + read_regions, + write_region, + chunk_bytes=0, + schedule="4read_1write", + traffic_pct=100, + append=False, + ): + 
id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + chunk_val = ab if chunk_bytes is None else int(chunk_bytes) + step = max(ab, self._align_down(chunk_val if chunk_val > 0 else ab, ab)) + tokens = self._parse_read_write_schedule(schedule) + + reads = [] + for reg in read_regions or []: + base = self._align_down(int(reg.get("base", 0)), ab) + size = self._align_down(int(reg.get("size_bytes", 0)), ab) + if size <= 0: + continue + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + if size <= 0: + continue + reads.append({"base": base, "size": size, "offset": 0}) + + wb = self._align_down(int((write_region or {}).get("base", 0)), ab) + ws = self._align_down(int((write_region or {}).get("size_bytes", 0)), ab) + if wb >= tm: + wb %= tm + if wb + ws > tm: + ws = self._align_down(tm - wb, ab) + + if not reads and ws <= 0: + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + self._require_exact_emits("gather_scatter", id_start, id_value) + return id_value + + read_rr = 0 + token_idx = 0 + write_offset = 0 + max_no_progress = max(32, len(tokens) * max(1, len(reads) + (1 if ws > 0 else 0))) + no_progress_iters = 0 + with self._open(append) as f: + while id_value - id_start < self.N_TEST: + token = tokens[token_idx % len(tokens)] + token_idx += 1 + wen = 1 if token == "R" else 0 + if token == "R" and reads: + reg = reads[read_rr % len(reads)] + read_rr += 1 + addr = self._normalize_addr(reg["base"] + reg["offset"]) + reg["offset"] = (reg["offset"] + step) % reg["size"] + elif ws > 0: + addr = self._normalize_addr(wb + write_offset) + write_offset = (write_offset + step) % ws + elif reads: + reg = reads[read_rr % len(reads)] + read_rr += 1 + wen = 1 + addr = self._normalize_addr(reg["base"] + reg["offset"]) + reg["offset"] = (reg["offset"] 
+ step) % reg["size"] + else: + break + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + no_progress_iters += 1 + if no_progress_iters >= max_no_progress: + break + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + no_progress_iters = 0 + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("gather_scatter", id_start, id_value) + return id_value + + def matmul_tiled_interleave_gen( + self, + id_start, + read_blocked, + write_blocked, + region_base_address_a, + region_size_bytes_a, + region_base_address_b, + region_size_bytes_b, + region_base_address_c, + region_size_bytes_c, + tile_a_bytes=0, + tile_b_bytes=0, + tile_c_bytes=0, + tiles=1, + ab_c_schedule="A_B_C", + traffic_pct=100, + idle_cycles_between_tiles=0, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + tile_idle = max(0, int(idle_cycles_between_tiles)) + tokens = self._parse_abc_schedule(ab_c_schedule) + + def _res(base_raw, size_raw): + base = self._align_down(int(base_raw), ab) + size = self._align_down(int(size_raw), ab) + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + return base, size + + a_base, a_size = _res(region_base_address_a, region_size_bytes_a) + b_base, b_size = _res(region_base_address_b, region_size_bytes_b) + c_base, c_size = _res(region_base_address_c, region_size_bytes_c) + if a_size <= 0 or b_size <= 0 or c_size <= 0: + with self._open(append) as f: + self._write_idle(f) + 
self._write_pause(f) + self._require_exact_emits("matmul_tiled_interleave", id_start, id_value) + return id_value + + ta = ab if tile_a_bytes is None else int(tile_a_bytes) + tb = ab if tile_b_bytes is None else int(tile_b_bytes) + tc = ab if tile_c_bytes is None else int(tile_c_bytes) + cnt_a = max(1, self._align_down(ta if ta > 0 else ab, ab) // ab) + cnt_b = max(1, self._align_down(tb if tb > 0 else ab, ab) // ab) + cnt_c = max(1, self._align_down(tc if tc > 0 else ab, ab) // ab) + counts = {"A": cnt_a, "B": cnt_b, "C": cnt_c} + ptr = {"A": 0, "B": 0, "C": 0} + base = {"A": a_base, "B": b_base, "C": c_base} + size = {"A": a_size, "B": b_size, "C": c_size} + max_tiles = max(1, int(tiles)) + + with self._open(append) as f: + tile_idx = 0 + stalled_tiles = 0 + while id_value - id_start < self.N_TEST: + emitted_before = id_value + for tok in tokens: + for _ in range(counts[tok]): + if id_value - id_start >= self.N_TEST: + break + addr = self._normalize_addr(base[tok] + ptr[tok]) + ptr[tok] = (ptr[tok] + ab) % size[tok] + wen = 0 if tok == "C" else 1 + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + tile_idx += 1 + if tile_idle > 0 and id_value - id_start < self.N_TEST: + for _ in range(tile_idle): + self._write_idle(f) + if tile_idx >= max_tiles: + tile_idx = 0 + if id_value == emitted_before: + stalled_tiles += 1 + if stalled_tiles >= max_tiles: + break + else: + stalled_tiles = 0 + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("matmul_tiled_interleave", id_start, id_value) + return id_value + + def hotspot_random_gen( + self, + id_start, + 
read_blocked, + write_blocked, + hot_regions, + traffic_pct=100, + traffic_read_pct=None, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + + regions = [] + weights = [] + for reg in hot_regions or []: + base = self._align_down(int(reg.get("base", 0)), ab) + size = self._align_down(int(reg.get("size_bytes", 0)), ab) + weight = max(1, int(reg.get("weight", 1))) + if size <= 0: + continue + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + if size <= 0: + continue + regions.append({"base": base, "size": size}) + weights.append(weight) + + if not regions: + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + self._require_exact_emits("hotspot_random", id_start, id_value) + return id_value + + if traffic_read_pct is not None: + rpct = max(0, min(100, int(traffic_read_pct))) + n_reads = (self.N_TEST * rpct) // 100 + wen_seq = [1] * n_reads + [0] * (self.N_TEST - n_reads) + else: + wen_seq = None + + with self._open(append) as f: + for i in range(self.N_TEST): + reg = random.choices(regions, weights=weights, k=1)[0] + n_words = max(1, reg["size"] // ab) + ad = reg["base"] + random.randint(0, n_words - 1) * ab + addr = self._normalize_addr(ad) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + wen = wen_seq[i] if wen_seq is not None else None + if wen is None: + data, wen = self.data_wen() + else: + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, 
read_blocked_set, write_blocked_set) + self._require_exact_emits("hotspot_random", id_start, id_value) + return id_value diff --git a/target/verif/simvectors/hci_stimuli/processor.py b/target/verif/simvectors/hci_stimuli/processor.py deleted file mode 100644 index 91b4892..0000000 --- a/target/verif/simvectors/hci_stimuli/processor.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Processor helpers: unfold and pad stimuli text files.""" - -import os - - -# 1) ++UNFOLD++ the transactions in the .txt files into a cycle-level list. -# -folder_path_raw --> String that specifies the path of the folder containing the raw txt files (where the cycle offset is still indicated) -# -folder_path_processed --> String that specifies the path of the folder containing the new txt files created by this function -def unfold_raw_txt(folder_path_raw,folder_path_processed,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH): - file_names = [file for file in os.listdir(folder_path_raw) if file.endswith(".txt")] - for file in file_names: - filepath_read = os.path.join(folder_path_raw,file) - filepath_write = os.path.join(folder_path_processed, file) - os.makedirs(os.path.dirname(filepath_write),exist_ok=True) - with open(filepath_read, 'r', encoding = "ascii") as file_read: - with open(filepath_write, 'w', encoding="ascii") as file_write: - for line in file_read: - if line != 'zero': - values = line.split() - id = values[0] - cycle_offset = values[1] - wen = values[2] - data = values[3] - add = values[4] - if "log" in file: - for _ in range(int(cycle_offset)-1): - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - else: - for _ in range(int(cycle_offset)-1): - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - file_write.write('1 ' + id + " " + wen + " " + data + " " + add + "\n") - else: - if "log" in file: - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + 
"\n") - else: - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - - -# 2) ++PAD++ txt files to have the same number of lines -# -Folder_path --> path of the folder containing the txt files to be padded -def pad_txt_files(folder_path,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH): - file_names = [file for file in os.listdir(folder_path) if file.endswith(".txt")] # List of the txt file names in the folder - max_lines = 0 - line_count = {} # Dictionary to store the number of lines in each txt file - # Determining the maximum number of lines among the txt files - for file in file_names: - file_path = os.path.join(folder_path,file) - with open(file_path,'r', encoding = 'ascii') as f: - line_count[file] = sum(1 for _ in f) - max_lines = max(max_lines, line_count[file]) - # Pad files - for file in file_names: - padding_needed = max_lines - line_count[file] - if padding_needed > 0: - file_path = os.path.join(folder_path,file) - with open(file_path, 'a', encoding = 'ascii') as f: - if "log" in file: - for _ in range(padding_needed): - f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - else: - for _ in range(padding_needed): - f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") diff --git a/target/verif/simvectors/html_report.py b/target/verif/simvectors/html_report.py new file mode 100644 index 0000000..5784714 --- /dev/null +++ b/target/verif/simvectors/html_report.py @@ -0,0 +1,394 @@ +"""HTML report generation for memory lifetime visualization.""" + +from pathlib import Path +import html +import math + + +def build_memory_lifetime_html( + *, + pattern_nodes, + driver_windows, + regions_timeline, + total_cycles, + mux_serialization_applied, + mux_phase_order, + schedule_has_cycle, + driver_name_fn, + interco_type, + n_core_cfg, + n_dma_cfg, + n_ext_cfg, + n_hwpe_cfg, + dw_narrow, + dw_wide, + n_narrow_hci_cfg, + n_wide_hci_cfg, 
+ n_banks, +): + palette = [ + "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#17becf", + "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#9467bd", + "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", + "#e6ab02", "#a6761d", "#666666", + ] + + def _color_for_driver(drv_idx): + return palette[drv_idx % len(palette)] + + def _tick_step(total): + if total <= 10: + return 1 + raw = max(1, total // 10) + mag = 10 ** int(math.log10(raw)) + return int(math.ceil(raw / mag) * mag) + + total_for_plot = max(1, total_cycles) + chart_width = 1280 + x_left = 220 + x_right = 24 + plot_w = chart_width - x_left - x_right + row_h = 80 + y_top = 36 + exec_rows = [driver_windows[d] for d in sorted(driver_windows.keys())] + exec_h = y_top + row_h * len(exec_rows) + 72 + tick = _tick_step(total_for_plot) + ticks = list(range(0, total_for_plot + 1, tick)) + if ticks[-1] != total_for_plot: + ticks.append(total_for_plot) + + exec_svg = [] + exec_svg.append(f'') + exec_svg.append('') + exec_svg.append(f'Execution Timeline (transaction-count model)') + for t in ticks: + x = x_left + (t / total_for_plot) * plot_w + exec_svg.append(f'') + exec_svg.append(f'{t}') + exec_svg.append( + f'Transaction number' + ) + exec_svg.append( + f'' + 'Issued memory transactions (r/w) and computation cycles (i.e., req = 0) are both modeled here.' 
+ '' + ) + + def _rw_mix_text(node): + rpct = node.get('traffic_read_pct') + if rpct is None: + read_bytes = 0 + write_bytes = 0 + for reg in node.get('regions', []): + label_l = str(reg.get('label', '')).lower() + size_b = max(0, int(reg.get('size', 0))) + if 'read' in label_l and 'write' not in label_l: + read_bytes += size_b + elif 'write' in label_l and 'read' not in label_l: + write_bytes += size_b + if (read_bytes + write_bytes) > 0: + rpct = int(round(100.0 * read_bytes / (read_bytes + write_bytes))) + else: + rpct = 50 + rpct = max(0, min(100, int(rpct))) + if rpct == 0: + return "100% writes" + if rpct == 100: + return "100% reads" + return f"{rpct}% reads / {100 - rpct}% writes" + + def _fmt_kib(bytes_v): + return f"{(max(0, int(bytes_v)) / 1024.0):.1f} KiB" + + def _outside_detail_lines(node, base_addr, size_kib): + regs = list(node.get('regions', [])) + label_map = {str(r.get('label', '')): r for r in regs} + has_matmul_regs = any(k in label_map for k in ('A(read)', 'B(read)', 'C(write)')) + is_matmul = node.get('mem_access_type') in {'matmul', 'matmul_phased', 'matmul_tiled_interleave'} or has_matmul_regs + + if is_matmul: + mat_lines = [] + for lbl, short in [('A(read)', 'A'), ('B(read)', 'B'), ('C(write)', 'C')]: + if lbl in label_map: + r = label_map[lbl] + mat_lines.append(f"{short}@0x{int(r['base']):08x} ({_fmt_kib(r['size'])})") + if not mat_lines: + return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] + return mat_lines + + if regs: + r0 = regs[0] + return [f"@0x{int(r0['base']):08x}", f"{_fmt_kib(r0['size'])}"] + return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] + + for row_idx, row in enumerate(exec_rows): + y = y_top + row_idx * row_h + name = html.escape(row['name']) + label_color = "#111111" if row['is_hwpe'] else "#555555" + exec_svg.append(f'{name}') + exec_svg.append(f'') + nodes = [n for n in pattern_nodes if n['driver_idx'] == row['driver_idx']] + for node in nodes: + if node['end_cycle'] <= node['start_cycle']: + continue + x = 
x_left + (node['start_cycle'] / total_for_plot) * plot_w + w = max(1.0, ((node['end_cycle'] - node['start_cycle']) / total_for_plot) * plot_w) + color = _color_for_driver(node['driver_idx']) + base_addr = min((int(r.get('base', 0)) for r in node.get('regions', [])), default=0) + size_kib = (int(node['n_transactions']) * int(node.get('txn_bytes', 1))) / 1024.0 + rw_mix = _rw_mix_text(node) + line1_full = f"{node['job']}" + line2_full = f"{node['mem_access_type']}" + line3_full = rw_mix + outside_lines = _outside_detail_lines(node, base_addr, size_kib) + title = ( + f"{node['driver_name']} p{node['pattern_idx']} job={node['job']} " + f"[{node['start_cycle']}, {node['end_cycle']}) " + f"{node['mem_access_type']} n={node['n_transactions']} " + f"base=0x{base_addr:08x} {size_kib:.1f}KiB {rw_mix} " + f"{' | '.join(outside_lines)}" + ) + exec_svg.append('') + exec_svg.append(f'{html.escape(title)}') + exec_svg.append( + f'' + ) + line1 = html.escape(line1_full) + exec_svg.append( + f'{line1}' + ) + line2 = html.escape(line2_full) + exec_svg.append( + f'{line2}' + ) + line3 = html.escape(line3_full) + exec_svg.append( + f'{line3}' + ) + for ext_idx, ext in enumerate(outside_lines): + y_ext = y + 49 + (ext_idx * 9) + ext_txt = html.escape(ext) + exec_svg.append( + f'{ext_txt}' + ) + exec_svg.append('') + exec_svg.append('') + + region_rows = [regions_timeline[k] for k in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2]))] + overlap_rows = [] + overlaps_by_region = {i: [] for i in range(len(region_rows))} + for i in range(len(region_rows)): + a = region_rows[i] + for j in range(i + 1, len(region_rows)): + b = region_rows[j] + ov_base = max(a['base'], b['base']) + ov_end = min(a['end'], b['end']) + if ov_base <= ov_end: + ov_size = ov_end - ov_base + 1 + overlap_rows.append({ + 'a_idx': i, + 'b_idx': j, + 'ov_base': ov_base, + 'ov_end': ov_end, + 'ov_size': ov_size, + }) + overlaps_by_region[i].append((j, ov_base, ov_end, ov_size)) + 
overlaps_by_region[j].append((i, ov_base, ov_end, ov_size)) + + used_min = min((reg['base'] for reg in region_rows), default=0) + used_max = max((reg['end'] for reg in region_rows), default=0) + if used_max < used_min: + used_max = used_min + used_span = max(1, used_max - used_min + 1) + + map_left = 170 + map_right = 24 + map_plot_w = chart_width - map_left - map_right + map_h = 124 + bar_y = 56 + bar_h = 24 + region_map_svg = [] + region_map_svg.append(f'') + region_map_svg.append('') + region_map_svg.append( + f'' + f"Memory Region Blocks (used range 0x{used_min:08x} - 0x{used_max:08x})" + f"" + ) + region_map_svg.append( + f'' + ) + for pct in [0, 25, 50, 75, 100]: + x = map_left + (pct / 100.0) * map_plot_w + addr = used_min + int(((used_span - 1) * pct) / 100.0) + region_map_svg.append(f'') + region_map_svg.append( + f'0x{addr:08x}' + ) + for reg in region_rows: + x = map_left + ((reg['base'] - used_min) / used_span) * map_plot_w + w = max(1.0, (reg['size'] / used_span) * map_plot_w) + color = _color_for_driver(reg['accesses'][0]['driver_idx']) if reg['accesses'] else "#888888" + title = ( + f"{reg['label']} 0x{reg['base']:08x}-0x{reg['end']:08x} " + f"size={reg['size']}B accesses={len(reg['accesses'])}" + ) + region_map_svg.append( + f'' + f'{html.escape(title)}' + ) + if w >= 92: + region_map_svg.append( + f'{html.escape(reg["label"])}' + ) + region_map_svg.append('') + + legend_items = [] + for d in sorted(driver_windows.keys()): + n = driver_name_fn(d) + c = _color_for_driver(d) + legend_items.append( + f'' + f'' + f'{html.escape(n)}' + ) + + region_cards = [] + for idx, reg in enumerate(region_rows): + access_rows = [] + accesses = sorted(reg['accesses'], key=lambda a: (a['driver_idx'], a['pattern_idx'], a['start'])) + for acc in accesses: + desc = acc['description'] if acc['description'] else "-" + access_rows.append( + "" + f"{html.escape(acc['driver_name'])}" + f"p{acc['pattern_idx']}" + f"{html.escape(acc['job'])}" + f"{html.escape(desc)}" + 
f"[{acc['start']}, {acc['end']})" + "" + ) + overlap_refs = overlaps_by_region.get(idx, []) + if overlap_refs: + ov_txt = ", ".join( + [ + f"{html.escape(region_rows[j]['label'])} " + f"(0x{ovb:08x}-0x{ove:08x}, {ovs} B)" + for (j, ovb, ove, ovs) in overlap_refs + ] + ) + else: + ov_txt = "none" + region_cards.append( + "
" + f"
{html.escape(reg['label'])} | base=0x{reg['base']:08x} | end=0x{reg['end']:08x} | size={reg['size']} B
" + f"
Overlaps: {ov_txt}
" + "" + "" + "" + f"{''.join(access_rows)}" + "
Driver/HWPEPatternJobDescriptionModeled interval
" + "
" + ) + + overlap_table_rows = [] + for ov in overlap_rows: + a = region_rows[ov['a_idx']] + b = region_rows[ov['b_idx']] + overlap_table_rows.append( + "" + f"{html.escape(a['label'])} (0x{a['base']:08x}-0x{a['end']:08x})" + f"{html.escape(b['label'])} (0x{b['base']:08x}-0x{b['end']:08x})" + f"0x{ov['ov_base']:08x}" + f"0x{ov['ov_end']:08x}" + f"{ov['ov_size']}" + "" + ) + + note = "Timeline follows declared wait_for_jobs dependencies from workload.json." + note_2 = ( + "Time axis is transaction-count based only " + "(no interconnect conflict/stall/arbitration modeling)." + ) + note_2b = "Per-driver list order is still enforced by stimulus/fence sequencing." + note_3 = "" + if mux_serialization_applied: + note_3 = ( + "MUX mode: HWPE execution is serialized by job order, " + "with lower HWPE ID first inside each job (tb_hci-like)." + ) + if mux_phase_order: + note_3 += f" Job order: {', '.join(mux_phase_order)}." + note_3_html = f"
{html.escape(note_3)}
" if note_3 else "" + cycle_warning_html = ( + "

Warning: dependency cycle detected; fallback scheduling order used.

" + if schedule_has_cycle else "" + ) + overlap_html = ( + "" + f"{''.join(overlap_table_rows)}" + "
Region ARegion BOverlap BaseOverlap EndOverlap Size (B)
" + if overlap_table_rows else + "
No overlaps detected among used regions.
" + ) + + return ( + "" + "Memory Access Region View (Transaction-Count Model)" + "" + "

Memory Access Region View

" + f"
Drivers ({dw_narrow} bit): " + f"CORE={n_core_cfg}, DMA={n_dma_cfg}, EXT={n_ext_cfg}
" + f"
Drivers ({dw_wide} bit): HWPE={n_hwpe_cfg}
" + f"
Interconnect type: {html.escape(interco_type)} | " + f"Narrow master ports ({dw_narrow} bit): {n_narrow_hci_cfg} | " + f"Wide master ports ({dw_wide} bit): {n_wide_hci_cfg} | " + f"Slave ports (banks): {n_banks} | " + f"Total modeled time: {total_cycles} units
" + f"
{html.escape(note)}
" + f"
{html.escape(note_2)}
" + f"
{html.escape(note_2b)}
" + f"{note_3_html}" + f"{cycle_warning_html}" + "

Legend

" + f"{''.join(legend_items)}
" + "
" + f"{''.join(exec_svg)}" + "
" + "
" + f"{''.join(region_map_svg)}" + "
" + "

Region Usage Blocks

" + f"{''.join(region_cards)}" + "

Overlapping Regions

" + f"{overlap_html}" + "" + ) + + +def write_memory_lifetime_html(memory_lifetime_path: Path, **kwargs): + html_doc = build_memory_lifetime_html(**kwargs) + memory_lifetime_path.write_text(html_doc, encoding='utf-8') diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index 21c2e28..da740bb 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -1,29 +1,37 @@ """Stimuli generator (reads JSON configs in `verif/config`). This script is invoked by the top-level Makefile and expects three -JSON config files: workload, testbench and hardware. It produces raw -and processed stimuli in `verif/simvectors/generated`. +JSON config files: workload, testbench and hardware. It produces +cycle-accurate stimuli in `verif/simvectors/generated/stimuli`. + +Each stimuli file encodes an offered per-cycle request stream plus PAUSE fence tokens. + +Idle lines (req=0) represent intended issue gaps in the absence of backpressure. +The application driver may consume some idle entries while stalled on an earlier +request grant, and hide some memory/interconnect latency. So the file is not a +strict wall-clock replay under contention. + +Stimuli line format: + req(1b) id(IWb) wen(1b) data(Nb) add(Ab) """ -### LIBRARIES AND DEPENDENCIES ### import json +import math import sys from pathlib import Path import argparse -import numpy as np code_directory = Path(__file__).resolve().parent - -# Try to import the local package `hci_stimuli`. If the running -# environment doesn't include the `simvectors` directory on `sys.path` -# (for example when invoked from a different working directory), add -# `code_directory` to `sys.path` as a minimal fallback. 
try: - from hci_stimuli import StimuliGenerator, unfold_raw_txt, pad_txt_files + from hci_stimuli import StimuliGenerator + from memory_report import write_memory_map_txt + from html_report import write_memory_lifetime_html except Exception: sys.path.insert(0, str(code_directory)) - from hci_stimuli import StimuliGenerator, unfold_raw_txt, pad_txt_files + from hci_stimuli import StimuliGenerator + from memory_report import write_memory_map_txt + from html_report import write_memory_lifetime_html def parse_args(argv=None): @@ -31,6 +39,8 @@ def parse_args(argv=None): parser.add_argument('--workload_config', required=True, help="Path to JSON workload configuration file") parser.add_argument('--testbench_config', required=True, help="Path to JSON testbench configuration file") parser.add_argument('--hardware_config', required=True, help="Path to JSON hardware configuration file") + parser.add_argument('--emit_phases_mk', default=None, metavar='PATH', + help="Also write the phases.mk Makefile fragment to PATH") parser.add_argument( '--golden', action='store_true', @@ -52,19 +62,17 @@ def load_config(filename, description): except json.JSONDecodeError as e: print(f"ERROR: Invalid JSON in {description} file: {e}") sys.exit(1) + + ### MAIN ENTRYPOINT ### def main(argv=None): - # parse CLI args args = parse_args(argv) - # load configs hardware_config = load_config(args.hardware_config, "Hardware configuration") testbench_config = load_config(args.testbench_config, "Testbench configuration") workload_config = load_config(args.workload_config, "Workload configuration") - # helpers imported at module-level (with a small sys.path fallback) - - # Extract hardware parameters + # Hardware parameters hw_params = hardware_config['parameters'] N_BANKS = hw_params['N_BANKS'] TOT_MEM_SIZE = hw_params['TOT_MEM_SIZE'] @@ -73,152 +81,863 @@ def main(argv=None): N_DMA = hw_params['N_DMA'] N_EXT = hw_params['N_EXT'] N_HWPE = hw_params['N_HWPE'] - HWPE_WIDTH = hw_params['HWPE_WIDTH'] - - # 
Extract testbench parameters - tb_params = testbench_config['parameters'] - TEST_RATIO = tb_params['TRANSACTION_RATIO'] - N_TEST_LOG = tb_params['N_TRANSACTION_LOG'] - - # Extract workload simulation parameters - workload_sim_params = workload_config['simulation_parameters'] - CYCLE_OFFSET_LOG = workload_sim_params['CYCLE_OFFSET_LOG'] - CYCLE_OFFSET_HWPE = workload_sim_params['CYCLE_OFFSET_HWPE'] - EXACT_OR_MAX_OFFSET = workload_sim_params['EXACT_OR_MAX_OFFSET'] + HWPE_WIDTH_FACT = hw_params['HWPE_WIDTH_FACT'] + N_CORE_CFG = N_CORE + N_DMA_CFG = N_DMA + N_EXT_CFG = N_EXT + N_HWPE_CFG = N_HWPE - # Extract workload master parameters log_masters = workload_config['log_masters'] hwpe_masters = workload_config['hwpe_masters'] # Derived parameters - WIDTH_OF_MEMORY = DATA_WIDTH - WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8 - N_WORDS = (TOT_MEM_SIZE * 1000 / N_BANKS) / WIDTH_OF_MEMORY_BYTE - ADD_WIDTH = int(np.ceil(np.log2(TOT_MEM_SIZE * 1000))) - N_TEST_HWPE = int(N_TEST_LOG * TEST_RATIO) + ADD_WIDTH = math.ceil(math.log2(TOT_MEM_SIZE * 1024)) N_LOG = N_CORE + N_DMA + N_EXT - N_MASTER = N_LOG + N_HWPE - IW = int(np.ceil(np.log2(N_TEST_LOG * N_LOG + N_TEST_HWPE * N_HWPE))) - CORE_ZERO_FLAG = False - EXT_ZERO_FLAG = False - DMA_ZERO_FLAG = False - HWPE_ZERO_FLAG = False + N_LOG_CFG = N_LOG + IW = 8 + + def _narrow_driver_name(local_idx: int) -> str: + idx = int(local_idx) + if idx < N_CORE_CFG: + return f"core_{idx}" + idx -= N_CORE_CFG + if idx < N_DMA_CFG: + return f"dma_{idx}" + idx -= N_DMA_CFG + if idx < N_EXT_CFG: + return f"ext_{idx}" + return f"narrow_{local_idx}" # Validations if len(log_masters) != N_LOG: print(f"ERROR: Number of log masters in workload config ({len(log_masters)}) doesn't match hardware config N_LOG ({N_LOG})") sys.exit(1) - if len(hwpe_masters) != N_HWPE: print(f"ERROR: Number of HWPE masters in workload config ({len(hwpe_masters)}) doesn't match hardware config N_HWPE ({N_HWPE})") sys.exit(1) - - if (not N_WORDS.is_integer()): - print("ERROR: 
the number of words is not an integer value") - sys.exit(1) - if (N_MASTER < 1): + if N_LOG + N_HWPE < 1: print("ERROR: the number of masters must be > 0") sys.exit(1) + n_words = (TOT_MEM_SIZE * 1024 / N_BANKS) / (DATA_WIDTH / 8) + if not n_words.is_integer(): + print("ERROR: the number of words is not an integer value") + sys.exit(1) # Prepare output dirs simvectors_dir = code_directory.resolve() generated_dir = (simvectors_dir / 'generated').resolve() - raw_dir = (generated_dir / 'stimuli_raw').resolve() - processed_dir = (generated_dir / 'stimuli_processed').resolve() + stimuli_dir = (generated_dir / 'stimuli').resolve() generated_dir.mkdir(parents=True, exist_ok=True) - raw_dir.mkdir(parents=True, exist_ok=True) - processed_dir.mkdir(parents=True, exist_ok=True) + stimuli_dir.mkdir(parents=True, exist_ok=True) - # Create zero files when a class of masters is absent. We keep the - # original behaviour of creating a single 'zero' file per missing - # class to preserve downstream expectations. 
- def _create_zero_file(path: Path): + def _create_idle_file(path: Path, data_width: int): + """Write a single idle line for a master that is not present in hardware.""" path.parent.mkdir(parents=True, exist_ok=True) - path.write_text('zero', encoding='ascii') + path.write_text( + "0 " + "0" * IW + " 0 " + "0" * data_width + " " + "0" * ADD_WIDTH + "\n", + encoding='ascii', + ) + + CORE_ZERO_FLAG = False + DMA_ZERO_FLAG = False + EXT_ZERO_FLAG = False + HWPE_ZERO_FLAG = False if N_CORE <= 0: CORE_ZERO_FLAG = True N_CORE = 1 - _create_zero_file(raw_dir / 'master_log_0.txt') + _create_idle_file(stimuli_dir / 'master_log_0.txt', DATA_WIDTH) if N_DMA <= 0: DMA_ZERO_FLAG = True N_DMA = 1 - _create_zero_file(raw_dir / f'master_log_{N_CORE}.txt') + _create_idle_file(stimuli_dir / f'master_log_{N_CORE}.txt', DATA_WIDTH) if N_EXT <= 0: EXT_ZERO_FLAG = True N_EXT = 1 - _create_zero_file(raw_dir / f'master_log_{N_CORE + N_DMA}.txt') + _create_idle_file(stimuli_dir / f'master_log_{N_CORE + N_DMA}.txt', DATA_WIDTH) if N_HWPE <= 0: HWPE_ZERO_FLAG = True N_HWPE = 1 - _create_zero_file(raw_dir / 'master_hwpe_0.txt') + _create_idle_file(stimuli_dir / 'master_hwpe_0.txt', HWPE_WIDTH_FACT * DATA_WIDTH) next_start_id = 0 - LIST_OF_FORBIDDEN_ADDRESSES_WRITE = [] - LIST_OF_FORBIDDEN_ADDRESSES_READ = [] - - def _generate_master(filepath: Path, master_config: dict, *, is_hwpe: bool, master_global_idx: int): - """Create StimuliGenerator and run the configured generation method. 
- - Parameters: - - filepath: output raw txt file path - - master_config: dict from workload.json for this master - - is_hwpe: whether this is an HWPE master (affects data width and counts) - - master_global_idx: global master id used by the generator - """ + + # Memory map entries collected during generation, printed at the end + memory_map_entries = [] + + def _bank_of(byte_addr): + return (byte_addr // (DATA_WIDTH // 8)) % N_BANKS + + def _record_memory_map(kind, local_idx, description, config, n_test, data_width, access_bytes, + region_base, region_size, start_address, stride0, len_d0, + stride1, len_d1, stride2, master_config, total_mem_bytes): + if kind == 'master_log': + label_prefix = _narrow_driver_name(local_idx) + elif kind == 'master_hwpe': + label_prefix = f"hwpe_{local_idx}" + else: + label_prefix = f"{kind}_{local_idx}" + label = label_prefix + (f" ({description})" if description else "") + if config == 'idle' or n_test == 0: + memory_map_entries.append({'label': label, 'pattern': config, 'n': 0, + 'info': 'idle - no memory accesses'}) + return + + first_addr = last_addr = None + detail = {} + + if config == 'random': + first_addr = region_base + last_addr = region_base + region_size - access_bytes + detail['region'] = f"0x{region_base:08x} - 0x{region_base + region_size - 1:08x} ({region_size} B)" + tpct = master_config.get('traffic_pct') + if tpct is not None: + rpct = master_config.get('traffic_read_pct', 50) + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + detail['read_pct'] = f"{rpct}%" + elif config == 'matmul_phased': + ra = _parse_maybe_bin_int(master_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(master_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(master_config.get('region_base_address_b'), None) + sb = _parse_maybe_bin_int(master_config.get('region_size_bytes_b'), 
None) + rc = _parse_maybe_bin_int(master_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(master_config.get('region_size_bytes_c'), None) + if ra is not None and sa is not None: + # Explicit per-phase regions + a_base, a_size = ra, sa + b_base, b_size = (rb, sb) if rb is not None and sb is not None else (ra, sa) + c_base, c_size = (rc, sc) if rc is not None and sc is not None else (ra, sa) + detail['matrix_A (read)'] = f"0x{a_base:08x} - 0x{a_base + a_size - access_bytes:08x} ({a_size} B)" + if int(master_config.get('matmul_ratio_b', 1)) > 0: + detail['matrix_B (read)'] = f"0x{b_base:08x} - 0x{b_base + b_size - access_bytes:08x} ({b_size} B)" + detail['matrix_C (write)'] = f"0x{c_base:08x} - 0x{c_base + c_size - access_bytes:08x} ({c_size} B)" + first_addr = a_base + last_addr = c_base + c_size - access_bytes + else: + # Auto-split combined region into thirds + a_words = max(1, (region_size // access_bytes) // 3) + b_words = max(1, (region_size // access_bytes) // 3) + c_words = (region_size // access_bytes) - a_words - b_words + a_base = region_base + b_base = a_base + a_words * access_bytes + c_base = b_base + b_words * access_bytes + detail['region'] = f"0x{region_base:08x} - 0x{region_base + region_size - 1:08x} ({region_size} B) [auto-split]" + detail['matrix_A (read)'] = f"0x{a_base:08x} - 0x{b_base - access_bytes:08x} ({a_words * access_bytes} B)" + detail['matrix_B (read)'] = f"0x{b_base:08x} - 0x{c_base - access_bytes:08x} ({b_words * access_bytes} B)" + detail['matrix_C (write)'] = f"0x{c_base:08x} - 0x{c_base + c_words * access_bytes - access_bytes:08x} ({c_words * access_bytes} B)" + first_addr = a_base + last_addr = c_base + c_words * access_bytes - access_bytes + if all(k in master_config for k in ('matrix_m', 'matrix_n', 'matrix_k')): + m, n, k = int(master_config['matrix_m']), int(master_config['matrix_n']), int(master_config['matrix_k']) + detail['matrix_dims'] = f"M={m} N={n} K={k} (A: {m}x{k}, B: {k}x{n}, C: {m}x{n})" + tpct 
= master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + idle_between = master_config.get('idle_cycles_between_phases', 0) + if idle_between: + detail['idle_between_phases'] = f"{idle_between} cycles" + elif config == 'multi_linear': + regs = master_config.get('regions', []) or [] + detail['schedule'] = str(master_config.get('schedule', 'round_robin')) + detail['burst_len'] = int(master_config.get('burst_len', 1)) + for idx, reg in enumerate(regs): + base = _parse_maybe_bin_int(reg.get('base'), 0) + size = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + stride_w = int(reg.get('stride_words', 1)) + rpct = reg.get('read_pct') + rpct_txt = f", read={int(rpct)}%" if rpct is not None else "" + detail[f"region_{idx}"] = ( + f"0x{base:08x} - 0x{base + max(0, size) - 1:08x} " + f"({size} B, stride={stride_w} words{rpct_txt})" + ) + if regs: + first_addr = _parse_maybe_bin_int(regs[0].get('base'), 0) + last_reg = regs[-1] + lb = _parse_maybe_bin_int(last_reg.get('base'), 0) + ls = _parse_maybe_bin_int(last_reg.get('size_bytes'), 0) + last_addr = lb + max(0, ls) - access_bytes + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'bank_group_linear': + span = max(1, int(master_config.get('bank_group_span', 1))) + start_bank = int(master_config.get('start_bank', 0)) % max(1, int(N_BANKS)) + stride_beats = max(1, int(master_config.get('stride_beats', 1))) + first_addr = start_bank * access_bytes + phase = max(0, n_test - 1) * stride_beats + group_idx = phase // span + bank = (start_bank + (phase % span)) % max(1, int(N_BANKS)) + last_addr = (group_idx * N_BANKS + bank) * access_bytes + 
last_addr = last_addr % total_mem_bytes + detail['start_bank'] = start_bank + detail['bank_group_span'] = span + detail['stride_beats'] = stride_beats + if 'bank_group_hop' in master_config: + detail['bank_group_hop'] = int(master_config.get('bank_group_hop', 0)) + if 'wen' in master_config: + detail['wen'] = int(master_config.get('wen', 1)) + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'rw_rowwise': + row_base = _parse_maybe_bin_int(master_config.get('row_base_address'), region_base) + row_size = _parse_maybe_bin_int(master_config.get('row_size_bytes'), access_bytes) + n_rows = max(0, int(master_config.get('n_rows', 0))) + row_stride = _parse_maybe_bin_int(master_config.get('row_stride_bytes'), row_size) + rpr = max(0, int(master_config.get('reads_per_row', 0))) + wpr = max(0, int(master_config.get('writes_per_row', 0))) + first_addr = row_base + last_addr = row_base + max(0, n_rows - 1) * row_stride + max(0, row_size - access_bytes) + last_addr = last_addr % total_mem_bytes + detail['rows'] = f"n_rows={n_rows}, row_size={row_size} B, row_stride={row_stride} B" + detail['per_row'] = f"reads={rpr}, writes={wpr}" + idle_between = int(master_config.get('idle_cycles_between_rows', 0)) + if idle_between: + detail['idle_between_rows'] = f"{idle_between} cycles" + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'gather_scatter': + rr = master_config.get('read_regions', []) or [] + wr = master_config.get('write_region', {}) or {} + for idx, reg in enumerate(rr): + b = _parse_maybe_bin_int(reg.get('base'), 0) + s = 
_parse_maybe_bin_int(reg.get('size_bytes'), 0) + detail[f"read_region_{idx}"] = f"0x{b:08x} - 0x{b + max(0, s) - 1:08x} ({s} B)" + wb = _parse_maybe_bin_int(wr.get('base'), 0) + ws = _parse_maybe_bin_int(wr.get('size_bytes'), 0) + detail['write_region'] = f"0x{wb:08x} - 0x{wb + max(0, ws) - 1:08x} ({ws} B)" + detail['schedule'] = str(master_config.get('schedule', '4read_1write')) + detail['chunk_bytes'] = int(_parse_maybe_bin_int(master_config.get('chunk_bytes'), access_bytes)) + if rr: + first_addr = _parse_maybe_bin_int(rr[0].get('base'), 0) + else: + first_addr = wb + last_addr = wb + max(0, ws) - access_bytes if ws > 0 else first_addr + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'matmul_tiled_interleave': + ra = _parse_maybe_bin_int(master_config.get('region_base_address_a'), region_base) + sa = _parse_maybe_bin_int(master_config.get('region_size_bytes_a'), region_size // 3) + rb = _parse_maybe_bin_int(master_config.get('region_base_address_b'), ra + sa) + sb = _parse_maybe_bin_int(master_config.get('region_size_bytes_b'), region_size // 3) + rc = _parse_maybe_bin_int(master_config.get('region_base_address_c'), rb + sb) + sc = _parse_maybe_bin_int(master_config.get('region_size_bytes_c'), region_size - max(0, sa) - max(0, sb)) + detail['matrix_A (read)'] = f"0x{ra:08x} - 0x{ra + max(0, sa) - access_bytes:08x} ({sa} B)" + detail['matrix_B (read)'] = f"0x{rb:08x} - 0x{rb + max(0, sb) - access_bytes:08x} ({sb} B)" + detail['matrix_C (write)'] = f"0x{rc:08x} - 0x{rc + max(0, sc) - access_bytes:08x} ({sc} B)" + detail['tile_bytes'] = ( + f"A={int(_parse_maybe_bin_int(master_config.get('tile_a_bytes'), access_bytes))}, " + f"B={int(_parse_maybe_bin_int(master_config.get('tile_b_bytes'), access_bytes))}, " + 
f"C={int(_parse_maybe_bin_int(master_config.get('tile_c_bytes'), access_bytes))}" + ) + detail['tiles'] = int(master_config.get('tiles', 1)) + detail['ab_c_schedule'] = str(master_config.get('ab_c_schedule', 'A_B_C')) + idle_tiles = int(master_config.get('idle_cycles_between_tiles', 0)) + if idle_tiles: + detail['idle_between_tiles'] = f"{idle_tiles} cycles" + first_addr = ra + last_addr = rc + max(0, sc) - access_bytes + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'hotspot_random': + hrs = master_config.get('hot_regions', []) or [] + for idx, reg in enumerate(hrs): + b = _parse_maybe_bin_int(reg.get('base'), 0) + s = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + w = int(reg.get('weight', 1)) + detail[f"hot_region_{idx}"] = f"0x{b:08x} - 0x{b + max(0, s) - 1:08x} ({s} B, weight={w})" + if hrs: + first_addr = _parse_maybe_bin_int(hrs[0].get('base'), 0) + lb = _parse_maybe_bin_int(hrs[-1].get('base'), 0) + ls = _parse_maybe_bin_int(hrs[-1].get('size_bytes'), 0) + last_addr = lb + max(0, ls) - access_bytes + tpct = master_config.get('traffic_pct') + if tpct is not None: + rpct = master_config.get('traffic_read_pct', 50) + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + detail['read_pct'] = f"{rpct}%" + elif config == 'linear': + base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) + first_addr = base + last_addr = base + (n_test - 1) * stride0 * access_bytes + last_addr = last_addr % total_mem_bytes + detail['start'] = f"0x{base:08x}" + detail['stride'] = f"{stride0} words ({stride0 * access_bytes} B)" + tpct = master_config.get('traffic_pct') + if tpct is not None: + rpct = 
master_config.get('traffic_read_pct', 50) + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + detail['read_pct'] = f"{rpct}%" + elif config == '2d': + base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) + first_addr = base + last_addr = (base + (len_d0 - 1) * stride0 * access_bytes + + (n_test // max(len_d0, 1) - 1) * stride1 * access_bytes) % total_mem_bytes + detail['dims'] = f"{len_d0} x (n_rows) stride0={stride0} stride1={stride1}" + idle_between = master_config.get('idle_cycles_between_phases', 0) + if idle_between: + detail['idle_between_phases'] = f"{idle_between} cycles" + elif config == '3d': + base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) + first_addr = base + detail['dims'] = f"{len_d0} x {len_d1} x (n_outer) stride0={stride0} stride1={stride1} stride2={stride2}" + last_addr = base # approximate for 3d + idle_between = master_config.get('idle_cycles_between_phases', 0) + if idle_between: + detail['idle_between_phases'] = f"{idle_between} cycles" + + if first_addr is not None: + detail['first_addr'] = f"0x{first_addr:08x} (bank {_bank_of(first_addr)})" + detail['last_addr'] = f"0x{last_addr:08x} (bank {_bank_of(last_addr)})" + detail['transfer'] = f"{n_test} transactions x {data_width // 8} B = {n_test * data_width // 8} B" + + memory_map_entries.append({'label': label, 'pattern': config, 'n': n_test, + 'detail': detail}) + + # (file_path, n_idle_cycles) -- populated during generation, + # prepended as idle lines before padding for static start delays. 
+ pending_start_delays = [] + + def _parse_maybe_bin_int(raw_value, default_value): + """Parse an int or binary/hex/decimal string; return default on failure.""" + if raw_value is None: + return default_value + if isinstance(raw_value, int): + return raw_value + if isinstance(raw_value, str): + v = raw_value.strip() + if not v: + return default_value + if set(v) <= {"0", "1"}: + return int(v, 2) + try: + return int(v, 0) + except ValueError: + return default_value + return default_value + + def _normalize_mem_access_type(raw_value, master_name): + allowed = { + "random", + "linear", + "2d", + "3d", + "idle", + "matmul_phased", + "multi_linear", + "bank_group_linear", + "rw_rowwise", + "gather_scatter", + "matmul_tiled_interleave", + "hotspot_random", + } + aliases = { + "matmul": "matmul_phased", + "matmul_tiled": "matmul_tiled_interleave", + } + + if not isinstance(raw_value, str): + print( + f"ERROR: {master_name} has invalid mem_access_type={raw_value} " + f"(type={type(raw_value).__name__}). Allowed: {', '.join(sorted(allowed))}" + ) + sys.exit(1) + + key = aliases.get(raw_value.strip().lower(), raw_value.strip().lower()) + if key not in allowed: + print( + f"ERROR: {master_name} has invalid mem_access_type='{raw_value}'. 
" + f"Allowed: {', '.join(sorted(allowed))}" + ) + sys.exit(1) + return key + + def _pattern_job_name(pattern_config: dict) -> str: + """Resolve job name from the mandatory 'job' key.""" + return str(pattern_config.get('job', 'default')) + + def _pattern_wait_for_jobs(pattern_config): + """Resolve dependency list from the mandatory 'wait_for_jobs' key.""" + raw = pattern_config.get('wait_for_jobs', []) + if raw is None: + return [] + if isinstance(raw, list): + return [str(x) for x in raw] + return [str(raw)] + + def _warn_if_id_mismatch(master_cfg, expected_idx, master_name): + raw_id = master_cfg.get("id", expected_idx) + try: + cfg_id = int(raw_id) + except (TypeError, ValueError): + print(f"WARNING: {master_name} has non-integer id={raw_id}; positional index {expected_idx} is used.") + return + if cfg_id != expected_idx: + print( + f"WARNING: {master_name} has id={cfg_id} but positional index is {expected_idx}; " + "stimuli-to-driver mapping is positional." + ) + + def _resolve_n_transactions(master_config: dict, mem_access_type: str, data_width: int, kind: str, local_idx: int) -> int: + """Resolve n_transactions from explicit field or geometry, depending on pattern.""" + if 'n_transactions' in master_config: + return int(master_config['n_transactions']) + access_bytes = max(1, int(data_width // 8)) + if mem_access_type == 'linear': + length = master_config.get('length') + if length is not None: + return int(length) + raw_region_size = master_config.get('region_size_bytes') + if raw_region_size is not None: + region_size = _parse_maybe_bin_int(raw_region_size, None) + if region_size is not None: + return max(0, int(region_size) // access_bytes) + elif mem_access_type == '2d': + len_d0 = master_config.get('len_d0') + len_d1 = master_config.get('len_d1') + if len_d0 is not None and len_d1 is not None: + return int(len_d0) * int(len_d1) + elif mem_access_type == '3d': + len_d0 = master_config.get('len_d0') + len_d1 = master_config.get('len_d1') + len_d2 = 
master_config.get('len_d2') + if len_d0 is not None and len_d1 is not None and len_d2 is not None: + return int(len_d0) * int(len_d1) * int(len_d2) + elif mem_access_type == 'matmul_phased': + # For non-random region-based traffic, allow deriving transactions + # from region size and transaction width. + raw_region_size = master_config.get('region_size_bytes') + if raw_region_size is not None: + region_size = _parse_maybe_bin_int(raw_region_size, None) + if region_size is not None: + return max(0, int(region_size) // access_bytes) + m = master_config.get('matrix_m') + n = master_config.get('matrix_n') + k = master_config.get('matrix_k') + if m is not None and n is not None and k is not None: + return int(m) * int(k) + int(k) * int(n) + int(m) * int(n) + elif mem_access_type == 'multi_linear': + total = 0 + for reg in master_config.get('regions', []) or []: + size_v = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + total += max(0, int(size_v)) // access_bytes + if total > 0: + return total + elif mem_access_type == 'bank_group_linear': + print( + f"ERROR: {kind}_{local_idx} mem_access_type='bank_group_linear' " + "requires explicit 'n_transactions'." 
+ ) + sys.exit(1) + elif mem_access_type == 'rw_rowwise': + n_rows = master_config.get('n_rows') + rpr = master_config.get('reads_per_row') + wpr = master_config.get('writes_per_row') + if n_rows is not None and rpr is not None and wpr is not None: + return max(0, int(n_rows)) * (max(0, int(rpr)) + max(0, int(wpr))) + elif mem_access_type == 'gather_scatter': + chunk = _parse_maybe_bin_int(master_config.get('chunk_bytes'), access_bytes) + step = max(access_bytes, int(chunk) if chunk is not None else access_bytes) + total = 0 + for reg in master_config.get('read_regions', []) or []: + total += max(0, int(_parse_maybe_bin_int(reg.get('size_bytes'), 0))) // step + wr = master_config.get('write_region', {}) or {} + total += max(0, int(_parse_maybe_bin_int(wr.get('size_bytes'), 0))) // step + if total > 0: + return total + elif mem_access_type == 'matmul_tiled_interleave': + tiles = max(1, int(master_config.get('tiles', 1))) + sched = str(master_config.get('ab_c_schedule', 'A_B_C')).upper().replace('-', '_') + toks = [t for t in sched.split('_') if t] + if not toks: + toks = ['A', 'B', 'C'] + cnt_a = max(1, int(_parse_maybe_bin_int(master_config.get('tile_a_bytes'), access_bytes)) // access_bytes) + cnt_b = max(1, int(_parse_maybe_bin_int(master_config.get('tile_b_bytes'), access_bytes)) // access_bytes) + cnt_c = max(1, int(_parse_maybe_bin_int(master_config.get('tile_c_bytes'), access_bytes)) // access_bytes) + per_tile = 0 + for t in toks: + if t == 'A': + per_tile += cnt_a + elif t == 'B': + per_tile += cnt_b + elif t == 'C': + per_tile += cnt_c + if per_tile > 0: + return tiles * per_tile + elif mem_access_type == 'hotspot_random': + total = 0 + for reg in master_config.get('hot_regions', []) or []: + total += max(0, int(_parse_maybe_bin_int(reg.get('size_bytes'), 0))) // access_bytes + if total > 0: + return total + elif mem_access_type == 'idle': + return 0 + print(f"ERROR: {kind}_{local_idx} has mem_access_type='{mem_access_type}' but no " + f"'n_transactions' 
and no geometry fields to derive it from.") + sys.exit(1) + + def _generate_pattern( + filepath: Path, + pattern_config: dict, + *, + is_hwpe: bool, + master_global_idx: int, + master_local_idx: int, + n_peers_of_kind: int, + append: bool, + ): + """Generate one pattern segment. append=True opens file in append mode. + Every pattern always writes a trailing PAUSE (handled by the generator).""" nonlocal next_start_id - data_width = HWPE_WIDTH * DATA_WIDTH if is_hwpe else DATA_WIDTH - n_test = N_TEST_HWPE if is_hwpe else N_TEST_LOG - cycle_offset = CYCLE_OFFSET_HWPE if is_hwpe else CYCLE_OFFSET_LOG + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + kind = 'master_hwpe' if is_hwpe else 'master_log' master = StimuliGenerator( - IW, WIDTH_OF_MEMORY, N_BANKS, TOT_MEM_SIZE, data_width, ADD_WIDTH, - str(filepath), n_test, EXACT_OR_MAX_OFFSET, cycle_offset, master_global_idx + IW, DATA_WIDTH, N_BANKS, TOT_MEM_SIZE, data_width, ADD_WIDTH, + str(filepath), 0, master_global_idx ) - config = str(master_config.get('mem_access_type', '0')) - start_address = str(master_config.get('start_address', '0')) - stride0 = int(master_config.get('stride0', 0)) - len_d0 = int(master_config.get('len_d0', 0)) - stride1 = int(master_config.get('stride1', 0)) - len_d1 = int(master_config.get('len_d1', 0)) - stride2 = int(master_config.get('stride2', 0)) - - if config == '0': - next_start_id = master.random_gen(next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '1': - next_start_id = master.linear_gen(stride0, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '2': - next_start_id = master.gen_2d(stride0, len_d0, stride1, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '3': - next_start_id = master.gen_3d(stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, 
LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - - def _gen_hwpe_master(master_idx, master_config, global_idx): - nonlocal next_start_id - filepath = raw_dir / f"master_hwpe_{master_idx}.txt" - master = StimuliGenerator(IW, WIDTH_OF_MEMORY, N_BANKS, TOT_MEM_SIZE, HWPE_WIDTH * DATA_WIDTH, ADD_WIDTH, - str(filepath), N_TEST_HWPE, EXACT_OR_MAX_OFFSET, CYCLE_OFFSET_HWPE, global_idx) - config = str(master_config.get('mem_access_type', '0')) - start_address = str(master_config.get('start_address', '0')) - stride0 = int(master_config.get('stride0', 0)) - len_d0 = int(master_config.get('len_d0', 0)) - stride1 = int(master_config.get('stride1', 0)) - len_d1 = int(master_config.get('len_d1', 0)) - stride2 = int(master_config.get('stride2', 0)) - - if config == '0': - next_start_id = master.random_gen(next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '1': - next_start_id = master.linear_gen(stride0, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '2': - next_start_id = master.gen_2d(stride0, len_d0, stride1, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '3': - next_start_id = master.gen_3d(stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) + if 'mem_access_type' not in pattern_config: + print(f"ERROR: {kind}_{master_local_idx} pattern is missing mem_access_type.") + sys.exit(1) + + config = _normalize_mem_access_type( + pattern_config['mem_access_type'], + f"{kind}_{master_local_idx}", + ) + if 'start_address' in pattern_config: + start_address = str(pattern_config['start_address']) + elif config == 'linear' and 'region_base_address' in pattern_config: + start_address = str(pattern_config['region_base_address']) + else: + start_address = '0' + + if 'stride0' in pattern_config: + 
stride0 = int(pattern_config['stride0']) + elif config == 'linear' and 'region_size_bytes' in pattern_config: + stride0 = 1 + else: + stride0 = 0 + len_d0 = int(pattern_config.get('len_d0', 0)) + stride1 = int(pattern_config.get('stride1', 0)) + len_d1 = int(pattern_config.get('len_d1', 0)) + stride2 = int(pattern_config.get('stride2', 0)) + + total_mem_bytes = int(TOT_MEM_SIZE * 1024) + access_bytes = max(1, int(data_width // 8)) + default_region_size = total_mem_bytes // max(1, n_peers_of_kind) + default_region_base = master_local_idx * default_region_size + + region_base = _parse_maybe_bin_int(pattern_config.get('region_base_address'), default_region_base) + + # For non-random region-based patterns, if n_transactions is provided but + # region_size_bytes is omitted, span a full non-wrapping region that can + # hold all transactions once at the current transaction width. + region_size_input = pattern_config.get('region_size_bytes') + if ( + config in {'linear', 'matmul_phased', 'matmul_tiled_interleave'} + and region_size_input is None + and 'n_transactions' in pattern_config + ): + region_size_input = int(pattern_config['n_transactions']) * access_bytes + + region_size = _parse_maybe_bin_int(region_size_input, default_region_size) + + region_base = (region_base // access_bytes) * access_bytes + if region_base >= total_mem_bytes: + region_base = region_base % total_mem_bytes + region_size = (max(0, region_size) // access_bytes) * access_bytes + if region_size <= 0: + region_size = (default_region_size // access_bytes) * access_bytes + if region_base + region_size > total_mem_bytes: + region_size = ((total_mem_bytes - region_base) // access_bytes) * access_bytes + + n_test = _resolve_n_transactions(pattern_config, config, data_width, kind, master_local_idx) + master.N_TEST = n_test + # Read/write blocked filtering is pattern-local only. 
+ read_blocked_local = [] + write_blocked_local = [] + tpct_raw = pattern_config.get('traffic_pct', 100) + tpct = 100 if tpct_raw is None else int(tpct_raw) + + multi_regions_cfg = [] + for reg in pattern_config.get('regions', []) or []: + multi_regions_cfg.append({ + 'base': _parse_maybe_bin_int(reg.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(reg.get('size_bytes'), 0), + 'stride_words': int(reg.get('stride_words', 1)), + 'read_pct': reg.get('read_pct'), + }) + + read_regions_cfg = [] + for reg in pattern_config.get('read_regions', []) or []: + read_regions_cfg.append({ + 'base': _parse_maybe_bin_int(reg.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(reg.get('size_bytes'), 0), + }) + wr_cfg_raw = pattern_config.get('write_region', {}) or {} + write_region_cfg = { + 'base': _parse_maybe_bin_int(wr_cfg_raw.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(wr_cfg_raw.get('size_bytes'), 0), + } + + hot_regions_cfg = [] + for reg in pattern_config.get('hot_regions', []) or []: + hot_regions_cfg.append({ + 'base': _parse_maybe_bin_int(reg.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(reg.get('size_bytes'), 0), + 'weight': int(reg.get('weight', 1)), + }) + + if config == 'random': + next_start_id = master.random_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + region_base=region_base, + region_size=region_size, + traffic_pct=tpct, + traffic_read_pct=pattern_config.get('traffic_read_pct'), + append=append, + ) + elif config == 'linear': + next_start_id = master.linear_gen( + stride0, start_address, next_start_id, + read_blocked_local, + write_blocked_local, + traffic_pct=tpct, + traffic_read_pct=pattern_config.get('traffic_read_pct'), + append=append, + ) + elif config == '2d': + next_start_id = master.gen_2d( + stride0, len_d0, stride1, start_address, next_start_id, + read_blocked_local, + write_blocked_local, + idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), + append=append, + ) + elif 
config == '3d': + next_start_id = master.gen_3d( + stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, + read_blocked_local, + write_blocked_local, + idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), + append=append, + ) + elif config == 'idle': + next_start_id = master.idle_gen(next_start_id, append=append) + elif config == 'matmul_phased': + if not is_hwpe: + print( + f"WARNING: mem_access_type='matmul_phased' is typically used for HWPE masters; " + f"{kind}_{master_local_idx} will still use requested phased behavior." + ) + min_region_size = 3 * access_bytes + if region_size < min_region_size: + print( + f"ERROR: {kind}_{master_local_idx} region_size_bytes=" + f"{region_size} is too small for matmul_phased (minimum {min_region_size})." + ) + sys.exit(1) + next_start_id = master.matmul_phased_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + region_base, + region_size, + int(pattern_config.get('matmul_ratio_a', 1)), + int(pattern_config.get('matmul_ratio_b', 1)), + int(pattern_config.get('matmul_ratio_c', 1)), + traffic_pct=int(pattern_config.get('traffic_pct', 100)), + idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), + region_base_address_a=_parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None), + region_size_bytes_a=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None), + region_base_address_b=_parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None), + region_size_bytes_b=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None), + region_base_address_c=_parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None), + region_size_bytes_c=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None), + append=append, + ) + elif config == 'multi_linear': + next_start_id = master.multi_linear_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + 
regions=multi_regions_cfg, + schedule=pattern_config.get('schedule', 'round_robin'), + burst_len=int(pattern_config.get('burst_len', 1)), + traffic_pct=tpct, + append=append, + ) + elif config == 'bank_group_linear': + next_start_id = master.bank_group_linear_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + start_bank=int(pattern_config.get('start_bank', 0)), + bank_group_span=int(pattern_config.get('bank_group_span', 1)), + stride_beats=int(pattern_config.get('stride_beats', 1)), + bank_group_hop=int(pattern_config.get('bank_group_hop', 0)), + wen=pattern_config.get('wen'), + traffic_pct=tpct, + append=append, + ) + elif config == 'rw_rowwise': + next_start_id = master.rw_rowwise_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + row_base_address=_parse_maybe_bin_int(pattern_config.get('row_base_address'), region_base), + row_size_bytes=_parse_maybe_bin_int(pattern_config.get('row_size_bytes'), access_bytes), + n_rows=int(pattern_config.get('n_rows', 1)), + row_stride_bytes=_parse_maybe_bin_int(pattern_config.get('row_stride_bytes'), access_bytes), + reads_per_row=int(pattern_config.get('reads_per_row', 0)), + writes_per_row=int(pattern_config.get('writes_per_row', 0)), + traffic_pct=tpct, + idle_cycles_between_rows=int(pattern_config.get('idle_cycles_between_rows', 0)), + append=append, + ) + elif config == 'gather_scatter': + next_start_id = master.gather_scatter_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + read_regions=read_regions_cfg, + write_region=write_region_cfg, + chunk_bytes=_parse_maybe_bin_int(pattern_config.get('chunk_bytes'), access_bytes), + schedule=pattern_config.get('schedule', '4read_1write'), + traffic_pct=tpct, + append=append, + ) + elif config == 'matmul_tiled_interleave': + ra = _parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None) + rb = 
_parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None) + sb = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None) + if ra is None or sa is None or rb is None or sb is None or rc is None or sc is None: + # Fallback to split the combined region into A/B/C thirds. + n_words = max(3, region_size // access_bytes) + a_words = max(1, n_words // 3) + b_words = max(1, n_words // 3) + c_words = max(1, n_words - a_words - b_words) + ra = region_base + sa = a_words * access_bytes + rb = ra + sa + sb = b_words * access_bytes + rc = rb + sb + sc = c_words * access_bytes + next_start_id = master.matmul_tiled_interleave_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + region_base_address_a=ra, + region_size_bytes_a=sa, + region_base_address_b=rb, + region_size_bytes_b=sb, + region_base_address_c=rc, + region_size_bytes_c=sc, + tile_a_bytes=_parse_maybe_bin_int(pattern_config.get('tile_a_bytes'), access_bytes), + tile_b_bytes=_parse_maybe_bin_int(pattern_config.get('tile_b_bytes'), access_bytes), + tile_c_bytes=_parse_maybe_bin_int(pattern_config.get('tile_c_bytes'), access_bytes), + tiles=int(pattern_config.get('tiles', 1)), + ab_c_schedule=pattern_config.get('ab_c_schedule', 'A_B_C'), + traffic_pct=tpct, + idle_cycles_between_tiles=int(pattern_config.get('idle_cycles_between_tiles', 0)), + append=append, + ) + elif config == 'hotspot_random': + next_start_id = master.hotspot_random_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + hot_regions=hot_regions_cfg, + traffic_pct=tpct, + traffic_read_pct=pattern_config.get('traffic_read_pct'), + append=append, + ) + + _record_memory_map( + kind, master_local_idx, + pattern_config.get('description', ''), + config, n_test, data_width, access_bytes, + region_base, region_size, + start_address, stride0, len_d0, 
stride1, len_d1, stride2, + pattern_config, total_mem_bytes, + ) + + def _generate_master( + filepath: Path, + master_config: dict, + *, + is_hwpe: bool, + master_global_idx: int, + master_local_idx: int, + n_peers_of_kind: int, + ): + """Generate stimulus for a master, supporting single flat pattern or patterns list.""" + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + + # Resolve pattern list: either explicit 'patterns' list or a single flat pattern + if 'patterns' in master_config: + patterns = master_config['patterns'] + if not patterns: + kind = 'master_hwpe' if is_hwpe else 'master_log' + print(f"ERROR: {kind}_{master_local_idx} has empty patterns list.") + sys.exit(1) + else: + # Legacy flat format: treat the master config itself as a single pattern + patterns = [master_config] + + # Start delay applies to the whole master (prepended before first pattern) + start_delay = int(master_config.get('start_delay_cycles', 0)) + if start_delay > 0: + pending_start_delays.append((filepath, start_delay, data_width)) + + # For each pattern with wait_for_jobs, prepend a synthetic idle+PAUSE that acts as + # the blocking fence. The pattern's own trailing PAUSE is always mask=0 (free + # pass), so fence_idx advances immediately after the real work is done. + # This separates "I am done" (trailing PAUSE, free) from "I may start" (idle + # gate, blocking), giving resume_i a single clean meaning: start your next job. 
+ dw = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + first_written = False + for p_idx, pattern_config in enumerate(patterns): + if _pattern_wait_for_jobs(pattern_config): + # Synthetic idle+PAUSE gates this pattern + _idle = StimuliGenerator(IW, DATA_WIDTH, N_BANKS, TOT_MEM_SIZE, + dw, ADD_WIDTH, str(filepath), 0, master_global_idx) + _idle.N_TEST = 0 + _idle.idle_gen(next_start_id, append=first_written) + first_written = True + _generate_pattern( + filepath, + pattern_config, + is_hwpe=is_hwpe, + master_global_idx=master_global_idx, + master_local_idx=master_local_idx, + n_peers_of_kind=n_peers_of_kind, + append=first_written, + ) + first_written = True global_idx = 0 - # Generate logarithmic masters (CORE, DMA, EXT) in order + + # Generate LOG masters (CORE, DMA, EXT) in order for i in range(N_LOG): - # determine class of this master (core/dma/ext) if i < N_CORE: if CORE_ZERO_FLAG: global_idx += 1 @@ -233,66 +952,768 @@ def _gen_hwpe_master(master_idx, master_config, global_idx): continue master_cfg = log_masters[i] - _generate_master(raw_dir / f"master_log_{i}.txt", master_cfg, is_hwpe=False, master_global_idx=global_idx) + _warn_if_id_mismatch(master_cfg, i, f"master_log_{i}") + _generate_master( + stimuli_dir / f"master_log_{i}.txt", + master_cfg, + is_hwpe=False, + master_global_idx=global_idx, + master_local_idx=i, + n_peers_of_kind=max(1, N_LOG), + ) global_idx += 1 - # Generate HWPE masters; their global index follows the previous masters + # Generate HWPE masters for hw_idx in range(N_HWPE): if HWPE_ZERO_FLAG: global_idx += 1 continue master_cfg = hwpe_masters[hw_idx] - _generate_master(raw_dir / f"master_hwpe_{hw_idx}.txt", master_cfg, is_hwpe=True, master_global_idx=global_idx) + _warn_if_id_mismatch(master_cfg, hw_idx, f"master_hwpe_{hw_idx}") + _generate_master( + stimuli_dir / f"master_hwpe_{hw_idx}.txt", + master_cfg, + is_hwpe=True, + master_global_idx=global_idx, + master_local_idx=hw_idx, + n_peers_of_kind=max(1, N_HWPE), + ) 
global_idx += 1 - print("STEP 0 COMPLETED: create raw txt files") + print("STEP 0 COMPLETED: generate stimuli files") + + # ----------------------------------------------------------------------- + # Compute FENCE_MASKS and emit fence_masks.mk + # + # Fence slot f corresponds to the PAUSE before pattern f in the stimulus + # file (i.e. between pattern f-1 and pattern f). The mask at slot f holds + # the set of drivers that must have passed fence f before this driver can + # resume from that PAUSE. + # + # For a master with N patterns, there are N fence slots (slot 0 = before + # pattern 0, slot f = before pattern f). The wait_for_jobs of pattern f defines + # the mask at fence slot f. + # + # Legacy flat masters (no 'patterns' key) are treated as single-pattern + # masters: one fence slot (slot 0) from the top-level wait_for_jobs field. + # ----------------------------------------------------------------------- + N_DRIVERS = N_LOG + N_HWPE + + def _patterns_of(master_config): + """Return the list of pattern configs for a master.""" + if 'patterns' in master_config: + return master_config['patterns'] + return [master_config] + + # Build job->driver map: every pattern of every driver registers its job. + # This allows wait_for_jobs to reference any job, not just first patterns. + # A job may be associated with multiple drivers (e.g. 8 cores all in softmax_t0). + job_to_drivers = {} + all_masters = [(m, False) for m in log_masters] + [(m, True) for m in hwpe_masters] + for i, (m, _) in enumerate(all_masters): + for pat in _patterns_of(m): + job = _pattern_job_name(pat) + if i not in job_to_drivers.get(job, []): + job_to_drivers.setdefault(job, []).append(i) + + # Build job->pattern_index map: for each job, which pattern index within + # each driver corresponds to that job. Used to compute FENCE_REQ_LEVELS. 
+ # job_pattern_idx[job][driver] = pattern index of that job in that driver + job_pattern_idx = {} + for i, (m, _) in enumerate(all_masters): + for p_idx, pat in enumerate(_patterns_of(m)): + job = _pattern_job_name(pat) + job_pattern_idx.setdefault(job, {})[i] = p_idx + + def _resolve_wait_mask(wait_for_jobs_list): + mask = 0 + for dep_job in wait_for_jobs_list: + for dep_drv in job_to_drivers.get(str(dep_job), []): + mask |= (1 << dep_drv) + return mask + + # Precompute per-driver fence_idx value after finishing pattern p: + # = number of fences (synthetic idle gates + trailing PAUSEs) passed up to and + # including the trailing PAUSE of pattern p. + def _fence_idx_after_pattern(drv_idx, pat_idx): + pats = _patterns_of(all_masters[drv_idx][0]) + # Count synthetic idle gates for patterns 0..pat_idx (those with wait_for_jobs) + n_gates = sum(1 for k in range(pat_idx + 1) if _pattern_wait_for_jobs(pats[k])) + # Plus trailing PAUSEs for patterns 0..pat_idx + n_trailing = pat_idx + 1 + return n_gates + n_trailing + + def _resolve_req_levels(wait_for_jobs_list): + """Required fence_idx[j] = fence_idx value of j after finishing pattern p_j.""" + levels = [0] * N_DRIVERS + for dep_job in wait_for_jobs_list: + for dep_drv in job_to_drivers.get(str(dep_job), []): + p_idx = job_pattern_idx.get(str(dep_job), {}).get(dep_drv, 0) + levels[dep_drv] = _fence_idx_after_pattern(dep_drv, p_idx) + return levels + + # Build per-driver fence mask and req_level lists. + # Each pattern with wait_for_jobs gets a synthetic idle gate (mask = wait_for_jobs) before it. + # Trailing PAUSEs always have mask=0 (free pass — just advance fence_idx). 
+ # Fences are enumerated in file order: for each pattern p: + # if p has wait_for_jobs: synthetic idle fence (mask = wait_for_jobs of p) + # trailing PAUSE fence (mask = 0) + fence_masks = [] + req_levels = [] + for i, (m, _) in enumerate(all_masters): + patterns = _patterns_of(m) + per_masks = [] + per_levels = [] + for pat in patterns: + wait_for_jobs = _pattern_wait_for_jobs(pat) + if wait_for_jobs: + # Synthetic idle gate: blocking fence + per_masks.append(_resolve_wait_mask(wait_for_jobs)) + per_levels.append(_resolve_req_levels(wait_for_jobs)) + # Trailing PAUSE: free pass, just signals completion + per_masks.append(0) + per_levels.append([0] * N_DRIVERS) + fence_masks.append(per_masks) + req_levels.append(per_levels) + + max_fences = max((len(fm) for fm in fence_masks), default=1) + + # Pad to max_fences + for i in range(N_DRIVERS): + while len(fence_masks[i]) < max_fences: + fence_masks[i].append(0) + while len(req_levels[i]) < max_fences: + req_levels[i].append([0] * N_DRIVERS) + + # Emit SV literals + hex_width = max(1, (N_DRIVERS + 3) // 4) + per_driver_literals = [] + for i in range(N_DRIVERS): + slot_literals = [f"{N_DRIVERS}'h{fence_masks[i][f]:0{hex_width}x}" for f in range(max_fences)] + per_driver_literals.append("'{" + ", ".join(slot_literals) + "}") + fence_masks_param = "'{" + ", ".join(per_driver_literals) + "}" + + # FENCE_REQ_LEVELS[N_DRIVERS][MAX_FENCES][N_DRIVERS] — int unsigned + # Pack FENCE_REQ_LEVELS as FENCE_REQ_LEVELS_PACKED[i][f] = N_DRIVERS*4-bit vector. + # Bits [j*4+3:j*4] = required fence_idx[j] (4 bits, supports 0..15). + max_req_level = 0 + for i in range(N_DRIVERS): + for f in range(max_fences): + for j in range(N_DRIVERS): + max_req_level = max(max_req_level, int(req_levels[i][f][j])) + if max_req_level > 15: + print( + "ERROR: Fence dependency level overflow: " + f"required fence_idx={max_req_level}, but packed format supports only 0..15. " + "Reduce the number of fence crossings per dependent job or widen LEVEL_BITS." 
+ ) + sys.exit(1) + + LEVEL_BITS = 4 + packed_width = N_DRIVERS * LEVEL_BITS + packed_hex_digits = (packed_width + 3) // 4 + req_driver_literals = [] + for i in range(N_DRIVERS): + fence_literals = [] + for f in range(max_fences): + val = 0 + for j in range(N_DRIVERS): + val |= (req_levels[i][f][j] & 0xF) << (j * LEVEL_BITS) + fence_literals.append(f"{packed_width}'h{val:0{packed_hex_digits}x}") + req_driver_literals.append("'{" + ", ".join(fence_literals) + "}") + fence_req_levels_packed_param = "'{" + ", ".join(req_driver_literals) + "}" + + if args.emit_phases_mk: + phases_mk_path = Path(args.emit_phases_mk) + phases_mk_path.parent.mkdir(parents=True, exist_ok=True) + phases_mk_path.write_text( + "# Auto-generated by main.py - DO NOT EDIT MANUALLY\n" + "# Per-driver per-fence dependency data for tb_hci.sv.\n" + f"# Drivers 0..{N_LOG-1} = narrow masters (core/dma/ext), {N_LOG}..{N_DRIVERS-1} = HWPE masters.\n" + f"# fence f = PAUSE after pattern f; fence_idx[i]==k means i completed k patterns.\n" + f"# FENCE_MASKS[i][f][j]=1: j is a dependency of i at fence f.\n" + f"# FENCE_REQ_LEVELS_PACKED[i][f]: packed {N_DRIVERS*4}-bit vector, bits [j*4+3:j*4] = min fence_idx[j].\n" + f"MAX_FENCES_PARAM := {max_fences}\n" + f"FENCE_MASKS_PARAM := {fence_masks_param}\n" + f"FENCE_REQ_LEVELS_PACKED_PARAM := {fence_req_levels_packed_param}\n", + encoding='utf-8', + ) + print(f"FENCE_MASKS.MK written: {phases_mk_path}") + + # ----------------------------------------------------------------------- + # Build and emit memory map report + # ----------------------------------------------------------------------- + INTERCO_TYPE = str(hw_params.get('INTERCO_TYPE', 'HCI')).strip().upper() + if INTERCO_TYPE not in {"LOG", "MUX", "HCI"}: + INTERCO_TYPE = "HCI" + DW_NARROW = int(DATA_WIDTH) + DW_WIDE = int(HWPE_WIDTH_FACT * DATA_WIDTH) + N_NARROW_HCI_CFG = int( + N_CORE_CFG + N_DMA_CFG + N_EXT_CFG + + (N_HWPE_CFG * HWPE_WIDTH_FACT if INTERCO_TYPE == "LOG" else 0) + ) + N_WIDE_HCI_CFG = 
int(N_HWPE_CFG if INTERCO_TYPE == "HCI" else (1 if INTERCO_TYPE == "MUX" else 0)) + N_MASTER_PORTS_CFG = int(N_NARROW_HCI_CFG + N_WIDE_HCI_CFG) + + def _driver_name(driver_idx): + if driver_idx < N_LOG: + return _narrow_driver_name(driver_idx) + return f"hwpe_{driver_idx - N_LOG}" + + def _resolve_regions(pattern_config, mem_access_type, is_hwpe, local_idx, n_peers): + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + access_bytes = max(1, int(data_width // 8)) + total_mem_bytes = int(TOT_MEM_SIZE * 1024) + default_region_size = total_mem_bytes // max(1, n_peers) + default_region_base = local_idx * default_region_size + + region_base = _parse_maybe_bin_int(pattern_config.get('region_base_address'), default_region_base) + region_size = _parse_maybe_bin_int(pattern_config.get('region_size_bytes'), default_region_size) + + region_base = (region_base // access_bytes) * access_bytes + if region_base >= total_mem_bytes: + region_base = region_base % total_mem_bytes + region_size = (max(0, region_size) // access_bytes) * access_bytes + if region_size <= 0: + region_size = (default_region_size // access_bytes) * access_bytes + if region_base + region_size > total_mem_bytes: + region_size = ((total_mem_bytes - region_base) // access_bytes) * access_bytes + + if region_size <= 0: + return [] + + if mem_access_type == 'idle': + return [] + if mem_access_type == 'multi_linear': + regions = [] + for idx, reg in enumerate(pattern_config.get('regions', []) or []): + base = _parse_maybe_bin_int(reg.get('base'), region_base) + size = _parse_maybe_bin_int(reg.get('size_bytes'), region_size) + base = (base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, size) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + continue + rpct = reg.get('read_pct') + if rpct is None: + lbl = f"R{idx}" + else: + lbl = 
f"R{idx}({'read' if int(rpct) >= 50 else 'write'})" + regions.append({ + 'label': lbl, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + if mem_access_type == 'bank_group_linear': + span = max(1, int(pattern_config.get('bank_group_span', 1))) + start_bank = int(pattern_config.get('start_bank', 0)) % max(1, int(N_BANKS)) + n_tx = _parse_maybe_bin_int(pattern_config.get('n_transactions'), 1) + n_tx = max(1, int(n_tx)) + rows = max(1, math.ceil(n_tx / span)) + size = min(total_mem_bytes, rows * span * access_bytes) + base = (start_bank * access_bytes) % max(1, total_mem_bytes) + if base + size > total_mem_bytes: + size = max(access_bytes, total_mem_bytes - base) + return [{ + 'label': 'bank_group', + 'base': base, + 'size': size, + 'end': base + size - 1, + }] + if mem_access_type == 'rw_rowwise': + row_base = _parse_maybe_bin_int(pattern_config.get('row_base_address'), region_base) + row_size = _parse_maybe_bin_int(pattern_config.get('row_size_bytes'), access_bytes) + n_rows = max(1, int(pattern_config.get('n_rows', 1))) + row_stride = _parse_maybe_bin_int(pattern_config.get('row_stride_bytes'), row_size) + base = (row_base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = ((max(0, row_stride) * max(0, n_rows - 1)) + max(0, row_size)) + size = (size // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + size = access_bytes + return [{ + 'label': 'rowwise', + 'base': base, + 'size': size, + 'end': base + size - 1, + }] + if mem_access_type == 'gather_scatter': + regions = [] + for idx, reg in enumerate(pattern_config.get('read_regions', []) or []): + base = _parse_maybe_bin_int(reg.get('base'), region_base) + size = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + base = (base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, 
size) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + continue + regions.append({ + 'label': f"gather_{idx}(read)", + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + wr = pattern_config.get('write_region', {}) or {} + wb = _parse_maybe_bin_int(wr.get('base'), region_base) + ws = _parse_maybe_bin_int(wr.get('size_bytes'), 0) + wb = (wb // access_bytes) * access_bytes + if wb >= total_mem_bytes: + wb = wb % total_mem_bytes + ws = (max(0, ws) // access_bytes) * access_bytes + if wb + ws > total_mem_bytes: + ws = ((total_mem_bytes - wb) // access_bytes) * access_bytes + if ws > 0: + regions.append({ + 'label': 'scatter(write)', + 'base': wb, + 'size': ws, + 'end': wb + ws - 1, + }) + return regions + if mem_access_type == 'hotspot_random': + regions = [] + for idx, reg in enumerate(pattern_config.get('hot_regions', []) or []): + base = _parse_maybe_bin_int(reg.get('base'), region_base) + size = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + base = (base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, size) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + continue + regions.append({ + 'label': f"hot_{idx}", + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + if mem_access_type == 'matmul_tiled_interleave': + ra = _parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None) + sb = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None) + sc = 
_parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None) + regions = [] + if ra is not None and sa is not None and rb is not None and sb is not None and rc is not None and sc is not None: + sub_defs = [ + ('A(read)', ra, sa), + ('B(read)', rb, sb), + ('C(write)', rc, sc), + ] + else: + n_words = max(3, region_size // access_bytes) + a_words = max(1, n_words // 3) + b_words = max(1, n_words // 3) + c_words = max(1, n_words - a_words - b_words) + sub_defs = [ + ('A(read)', region_base, a_words * access_bytes), + ('B(read)', region_base + a_words * access_bytes, b_words * access_bytes), + ('C(write)', region_base + (a_words + b_words) * access_bytes, c_words * access_bytes), + ] + for label, base_raw, size_raw in sub_defs: + base = (int(base_raw) // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, int(size_raw)) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size > 0: + regions.append({ + 'label': label, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + + if mem_access_type != 'matmul_phased': + return [{ + 'label': 'region', + 'base': region_base, + 'size': region_size, + 'end': region_base + region_size - 1, + }] + + ra = _parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None) + sb = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None) + + regions = [] + if ra is not None and sa is not None: + sub_defs = [ + ('A(read)', ra, sa), + ('B(read)', rb if rb is not None else ra, sb if sb is not None else sa), + ('C(write)', rc if rc is not 
None else ra, sc if sc is not None else sa), + ] + for label, base_raw, size_raw in sub_defs: + base = (int(base_raw) // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, int(size_raw)) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size > 0: + regions.append({ + 'label': label, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + + n_words = region_size // access_bytes + if n_words < 3: + return [{ + 'label': 'region', + 'base': region_base, + 'size': region_size, + 'end': region_base + region_size - 1, + }] + a_words = max(1, n_words // 3) + b_words = max(1, n_words // 3) + c_words = n_words - a_words - b_words + sub_regions = [ + ('A(read)', region_base, a_words * access_bytes), + ('B(read)', region_base + a_words * access_bytes, b_words * access_bytes), + ('C(write)', region_base + (a_words + b_words) * access_bytes, c_words * access_bytes), + ] + for label, base, size in sub_regions: + if size <= 0: + continue + regions.append({ + 'label': label, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + + def _estimate_pattern_cycles(pattern_config, _mem_access_type, n_test, _txn_bytes): + # Temporal model intentionally follows emitted traffic only: + # one unit per transaction plus req=0 idles from traffic_pct shaping. + # No absolute/phase/tile/row cycle estimation is applied here. 
+ base = max(0, int(n_test)) + tpct = pattern_config.get('traffic_pct') + n_idles_per_req = 0 + if tpct is not None: + tp = max(1, min(100, int(tpct))) + n_idles_per_req = 0 if tp >= 100 else int(round((100 - tp) / tp)) + return int(base * (1 + n_idles_per_req)) + + pattern_nodes = [] + node_idx_by_driver_pattern = {} + job_to_nodes = {} + driver_last_node = {} + + for drv_idx, (master_cfg, is_hwpe) in enumerate(all_masters): + patterns = _patterns_of(master_cfg) + local_idx = drv_idx - N_LOG if is_hwpe else drv_idx + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + kind = 'master_hwpe' if is_hwpe else 'master_log' + n_peers = max(1, N_HWPE if is_hwpe else N_LOG) + start_delay = int(master_cfg.get('start_delay_cycles', 0)) + for p_idx, pat in enumerate(patterns): + raw_type = pat.get('mem_access_type', 'idle') + mem_access_type = _normalize_mem_access_type(raw_type, f"{kind}_{local_idx}") + n_test = _resolve_n_transactions(pat, mem_access_type, data_width, kind, local_idx) + declared_wait_for_jobs = _pattern_wait_for_jobs(pat) + # Timeline view follows declared dependencies from workload.json. 
+ effective_wait_for_jobs = declared_wait_for_jobs + node = { + 'node_idx': len(pattern_nodes), + 'driver_idx': drv_idx, + 'driver_name': _driver_name(drv_idx), + 'is_hwpe': is_hwpe, + 'local_idx': local_idx, + 'pattern_idx': p_idx, + 'description': str(pat.get('description', '')).strip(), + 'job': _pattern_job_name(pat), + 'wait_for_jobs_declared': declared_wait_for_jobs, + 'wait_for_jobs_effective': effective_wait_for_jobs, + 'n_transactions': int(n_test), + 'cycles': int(_estimate_pattern_cycles(pat, mem_access_type, n_test, int(data_width // 8))), + 'mem_access_type': mem_access_type, + 'traffic_read_pct': pat.get('traffic_read_pct'), + 'txn_bytes': int(data_width // 8), + 'start_delay': start_delay if p_idx == 0 else 0, + 'regions': _resolve_regions(pat, mem_access_type, is_hwpe, local_idx, n_peers), + } + pattern_nodes.append(node) + node_idx_by_driver_pattern[(drv_idx, p_idx)] = node['node_idx'] + job_to_nodes.setdefault(node['job'], []).append(node['node_idx']) + driver_last_node[drv_idx] = node['node_idx'] + + n_nodes = len(pattern_nodes) + preds = [set() for _ in range(n_nodes)] + succs = [set() for _ in range(n_nodes)] + mux_serialization_applied = False + mux_phase_order = [] + + def _add_edge(src, dst): + if src == dst or src < 0 or dst < 0: + return + if src not in preds[dst]: + preds[dst].add(src) + succs[src].add(dst) + + for node in pattern_nodes: + n_idx = node['node_idx'] + drv_idx = node['driver_idx'] + p_idx = node['pattern_idx'] + if p_idx > 0: + _add_edge(node_idx_by_driver_pattern[(drv_idx, p_idx - 1)], n_idx) + for dep_job in node['wait_for_jobs_effective']: + for dep_idx in job_to_nodes.get(dep_job, []): + _add_edge(dep_idx, n_idx) + + if INTERCO_TYPE == "MUX": + # Match tb_hci MUX semantics in the temporal model: + # serialize HWPE execution by job order, and by HWPE ID within a job. 
+ hwpe_nodes = [n for n in pattern_nodes if n['is_hwpe']] + if hwpe_nodes: + job_first_seen = {} + for n in sorted(hwpe_nodes, key=lambda x: (x['pattern_idx'], x['local_idx'], x['node_idx'])): + job_first_seen.setdefault(n['job'], len(job_first_seen)) + + job_preds = {jb: set() for jb in job_first_seen} + job_succs = {jb: set() for jb in job_first_seen} + for n in hwpe_nodes: + cur = n['job'] + for dep_job in n['wait_for_jobs_effective']: + dep = str(dep_job) + if dep in job_first_seen and dep != cur: + job_preds[cur].add(dep) + job_succs[dep].add(cur) + + phase_indeg = {jb: len(job_preds[jb]) for jb in job_first_seen} + phase_ready = sorted([jb for jb, deg in phase_indeg.items() if deg == 0], + key=lambda jb: job_first_seen[jb]) + mux_phase_order = [] + while phase_ready: + cur = phase_ready.pop(0) + mux_phase_order.append(cur) + for nxt in sorted(job_succs[cur], key=lambda jb: job_first_seen[jb]): + phase_indeg[nxt] -= 1 + if phase_indeg[nxt] == 0: + phase_ready.append(nxt) + phase_ready.sort(key=lambda jb: job_first_seen[jb]) + if len(mux_phase_order) != len(job_first_seen): + mux_phase_order = sorted(job_first_seen.keys(), key=lambda jb: job_first_seen[jb]) + + phase_rank = {ph: i for i, ph in enumerate(mux_phase_order)} + hwpe_sorted = sorted( + hwpe_nodes, + key=lambda n: ( + phase_rank.get(n['job'], 10 ** 9), + n['local_idx'], + n['pattern_idx'], + n['node_idx'], + ), + ) + for i in range(1, len(hwpe_sorted)): + _add_edge(hwpe_sorted[i - 1]['node_idx'], hwpe_sorted[i]['node_idx']) + mux_serialization_applied = True + + indeg = [len(preds[i]) for i in range(n_nodes)] + ready = [i for i, d in enumerate(indeg) if d == 0] + ready.sort(key=lambda i: (pattern_nodes[i]['driver_idx'], pattern_nodes[i]['pattern_idx'])) + topo_order = [] + while ready: + cur = ready.pop(0) + topo_order.append(cur) + for nxt in sorted(succs[cur]): + indeg[nxt] -= 1 + if indeg[nxt] == 0: + ready.append(nxt) + ready.sort(key=lambda i: (pattern_nodes[i]['driver_idx'], 
pattern_nodes[i]['pattern_idx'])) - # Process raw files - simvector_raw_path = str(raw_dir) - simvector_processed_path = str((raw_dir.parent / 'stimuli_processed').resolve()) - unfold_raw_txt(simvector_raw_path, simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH) - print("STEP 1 COMPLETED: unfold txt files") + schedule_has_cycle = len(topo_order) != n_nodes + if schedule_has_cycle: + topo_order = list(range(n_nodes)) - pad_txt_files(simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH) - print("STEP 2 COMPLETED: pad txt files") + node_start = [0 for _ in range(n_nodes)] + node_end = [0 for _ in range(n_nodes)] + for _ in range(max(1, n_nodes + 1)): + changed = False + for n_idx in topo_order: + dep_end = max((node_end[p] for p in preds[n_idx]), default=0) + start_time = max(int(pattern_nodes[n_idx]['start_delay']), dep_end) + end_time = start_time + max(0, int(pattern_nodes[n_idx]['cycles'])) + if start_time != node_start[n_idx] or end_time != node_end[n_idx]: + node_start[n_idx] = start_time + node_end[n_idx] = end_time + changed = True + if not changed: + break + for n_idx, node in enumerate(pattern_nodes): + node['start_cycle'] = int(node_start[n_idx]) + node['end_cycle'] = int(node_end[n_idx]) + + total_cycles = max((n['end_cycle'] for n in pattern_nodes), default=0) + + driver_windows = {} + for node in pattern_nodes: + w = driver_windows.setdefault(node['driver_idx'], { + 'driver_idx': node['driver_idx'], + 'name': node['driver_name'], + 'is_hwpe': node['is_hwpe'], + 'start': node['start_cycle'], + 'end': node['end_cycle'], + }) + w['start'] = min(w['start'], node['start_cycle']) + w['end'] = max(w['end'], node['end_cycle']) + + regions_timeline = {} + for node in pattern_nodes: + for reg in node['regions']: + reg_key = (reg['base'], reg['size'], reg['label']) + entry = regions_timeline.setdefault(reg_key, { + 'base': reg['base'], + 'size': reg['size'], + 'end': reg['end'], + 'label': reg['label'], + 'accesses': [], + }) + 
entry['accesses'].append({ + 'driver_idx': node['driver_idx'], + 'driver_name': node['driver_name'], + 'job': node['job'], + 'start': node['start_cycle'], + 'end': node['end_cycle'], + 'pattern_idx': node['pattern_idx'], + 'description': node['description'], + }) + for reg in regions_timeline.values(): + reg['lifetime_start'] = min((a['start'] for a in reg['accesses']), default=0) + reg['lifetime_end'] = max((a['end'] for a in reg['accesses']), default=0) + + # ----------------------------------------------------------------------- + # Build memory_map.txt + # ----------------------------------------------------------------------- + memory_map_path = generated_dir / 'memory_map.txt' + write_memory_map_txt( + memory_map_path=memory_map_path, + total_mem_size_kib=TOT_MEM_SIZE, + n_banks=N_BANKS, + data_width=DATA_WIDTH, + hwpe_data_width=HWPE_WIDTH_FACT * DATA_WIDTH, + n_core_cfg=N_CORE_CFG, + n_dma_cfg=N_DMA_CFG, + n_ext_cfg=N_EXT_CFG, + n_log_cfg=N_LOG_CFG, + n_hwpe_cfg=N_HWPE_CFG, + interco_type=INTERCO_TYPE, + dw_narrow=DW_NARROW, + dw_wide=DW_WIDE, + n_narrow_hci_cfg=N_NARROW_HCI_CFG, + n_wide_hci_cfg=N_WIDE_HCI_CFG, + memory_map_entries=memory_map_entries, + job_to_drivers=job_to_drivers, + driver_name_fn=_driver_name, + n_drivers=N_DRIVERS, + fence_masks=fence_masks, + total_cycles=total_cycles, + mux_serialization_applied=mux_serialization_applied, + mux_phase_order=mux_phase_order, + schedule_has_cycle=schedule_has_cycle, + driver_windows=driver_windows, + pattern_nodes=pattern_nodes, + regions_timeline=regions_timeline, + ) + print(f"Memory map written: {memory_map_path}") + + # ----------------------------------------------------------------------- + # Build dataflow.html (simple SVG timeline view) + # ----------------------------------------------------------------------- + dataflow_path = generated_dir / 'dataflow.html' + write_memory_lifetime_html( + memory_lifetime_path=dataflow_path, + pattern_nodes=pattern_nodes, + driver_windows=driver_windows, + 
regions_timeline=regions_timeline, + total_cycles=total_cycles, + mux_serialization_applied=mux_serialization_applied, + mux_phase_order=mux_phase_order, + schedule_has_cycle=schedule_has_cycle, + driver_name_fn=_driver_name, + interco_type=INTERCO_TYPE, + n_core_cfg=N_CORE_CFG, + n_dma_cfg=N_DMA_CFG, + n_ext_cfg=N_EXT_CFG, + n_hwpe_cfg=N_HWPE_CFG, + dw_narrow=DW_NARROW, + dw_wide=DW_WIDE, + n_narrow_hci_cfg=N_NARROW_HCI_CFG, + n_wide_hci_cfg=N_WIDE_HCI_CFG, + n_banks=N_BANKS, + ) + print(f"Dataflow plot written: {dataflow_path}") + + # ----------------------------------------------------------------------- + # Apply per-master start delays + # ----------------------------------------------------------------------- + for fpath, delay, dw in pending_start_delays: + if fpath.exists(): + idle_line = "0 " + "0" * IW + " 0 " + "0" * dw + " " + "0" * ADD_WIDTH + "\n" + original = fpath.read_text(encoding='ascii') + fpath.write_text(idle_line * delay + original, encoding='ascii') + + print("STEP 1 COMPLETED: generate documents and apply start delays to stimuli") + + # ----------------------------------------------------------------------- + # Golden vectors + # ----------------------------------------------------------------------- if args.golden: golden_dir = (generated_dir / 'golden').resolve() golden_dir.mkdir(parents=True, exist_ok=True) - for stim_path in sorted(processed_dir.glob('master_*.txt')): + for stim_path in sorted(stimuli_dir.glob('master_*.txt')): try: text = stim_path.read_text(encoding='ascii') except OSError: continue - if text.strip() == 'zero': - continue - mem = {} out_lines = [] for raw_line in text.splitlines(): line = raw_line.strip() if not line: continue - parts = line.split() if len(parts) != 5: continue - req_s, id_s, wen_s, data_s, add_s = parts if req_s != '1': continue - if wen_s == '0': mem[add_s] = data_s continue - exp_s = mem.get(add_s, '1' * len(data_s)) out_lines.append(f"{id_s} {add_s} {exp_s}") - (golden_dir / 
f"golden_{stim_path.name}").write_text("\n".join(out_lines) + ("\n" if out_lines else ""), encoding='ascii') - print("STEP 3 COMPLETED: golden vectors") + (golden_dir / f"golden_{stim_path.name}").write_text( + "\n".join(out_lines) + ("\n" if out_lines else ""), encoding='ascii' + ) + print("STEP 2 COMPLETED: golden vectors") if __name__ == '__main__': diff --git a/target/verif/simvectors/memory_report.py b/target/verif/simvectors/memory_report.py new file mode 100644 index 0000000..2da9cfb --- /dev/null +++ b/target/verif/simvectors/memory_report.py @@ -0,0 +1,119 @@ +"""Text memory map report generation.""" + +from pathlib import Path + + +def build_memory_map_text( + *, + total_mem_size_kib, + n_banks, + data_width, + hwpe_data_width, + n_core_cfg, + n_dma_cfg, + n_ext_cfg, + n_log_cfg, + n_hwpe_cfg, + interco_type, + dw_narrow, + dw_wide, + n_narrow_hci_cfg, + n_wide_hci_cfg, + memory_map_entries, + job_to_drivers, + driver_name_fn, + n_drivers, + fence_masks, + total_cycles, + mux_serialization_applied, + mux_phase_order, + schedule_has_cycle, + driver_windows, + pattern_nodes, + regions_timeline, +): + word_bytes = data_width // 8 + bank_stride_bytes = n_banks * word_bytes + lines = [] + lines.append("=" * 72) + lines.append("MEMORY MAP REPORT") + lines.append(f" Total memory : {total_mem_size_kib} KiB ({total_mem_size_kib * 1024} B)") + lines.append(f" Banks : {n_banks} x {word_bytes} B/word (interleaved, stride {bank_stride_bytes} B)") + lines.append(f" Data width : {data_width} b LOG / {hwpe_data_width} b HWPE") + lines.append( + f" Drivers({dw_narrow} bit) : " + f"CORE={n_core_cfg}, DMA={n_dma_cfg}, EXT={n_ext_cfg} (LOG total={n_log_cfg})" + ) + lines.append( + f" Drivers({dw_wide} bit) : " + f"HWPE={n_hwpe_cfg}" + ) + lines.append( + f" Interconnect type : {interco_type} | " + f"Narrow master ports ({dw_narrow} bit)={n_narrow_hci_cfg} | " + f"Wide master ports ({dw_wide} bit)={n_wide_hci_cfg} | " + f"Slave ports (banks)={n_banks}" + ) + lines.append("=" 
* 72) + for entry in memory_map_entries: + lines.append(f"\n [{entry['label']}] pattern={entry['pattern']} n_transactions={entry['n']}") + if 'info' in entry: + lines.append(f" {entry['info']}") + for k, v in entry.get('detail', {}).items(): + lines.append(f" {k:<14}: {v}") + lines.append("") + lines.append(" Job / dependency map:") + for job, drivers in sorted(job_to_drivers.items()): + driver_names = [driver_name_fn(d) for d in drivers] + lines.append(f" job '{job}': {', '.join(driver_names)}") + for i in range(n_drivers): + name = driver_name_fn(i) + for f, mask in enumerate(fence_masks[i]): + if mask: + deps = [driver_name_fn(j) for j in range(n_drivers) if mask & (1 << j)] + lines.append(f" {name} after pattern[{f}] (fence {f}) waits for: {', '.join(deps)}") + lines.append("") + lines.append(" Temporal schedule (transaction-count model):") + lines.append(f" Total modeled time: {total_cycles} units (1 unit = 1 transaction)") + lines.append(" Note: Declared wait_for_jobs dependencies are used for scheduling.") + lines.append(" Note: Per-driver list order is also enforced (pattern p[i] -> p[i+1]).") + lines.append(" Note: No interconnect contention/stall timing is modeled.") + if mux_serialization_applied: + lines.append(" Note: MUX mode serializes HWPE execution by job order, then HWPE ID (tb_hci-like).") + lines.append(f" MUX job order: {', '.join(mux_phase_order)}") + if schedule_has_cycle: + lines.append(" WARNING: dependency cycle detected while scheduling; using fallback order.") + for d in range(n_drivers): + if d not in driver_windows: + continue + w = driver_windows[d] + lines.append(f" {w['name']:<8}: [{w['start']:>6}, {w['end']:>6}) dur={w['end'] - w['start']:>6}") + for node in [n for n in pattern_nodes if n['driver_idx'] == d]: + reg_tokens = [] + for reg in node['regions']: + reg_tokens.append(f"{reg['label']}@0x{reg['base']:08x}+{reg['size']}B") + reg_text = ", ".join(reg_tokens) if reg_tokens else "no regions" + lines.append( + f" 
p{node['pattern_idx']} job={node['job']} " + f"[{node['start_cycle']},{node['end_cycle']}) " + f"type={node['mem_access_type']} n={node['n_transactions']} {reg_text}" + ) + lines.append("") + lines.append(" Memory region lifetimes:") + for key in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2])): + reg = regions_timeline[key] + users = sorted({a['driver_name'] for a in reg['accesses']}) + lines.append( + f" {reg['label']:<8} 0x{reg['base']:08x}-0x{reg['end']:08x} " + f"({reg['size']:>6} B) lifetime=[{reg['lifetime_start']},{reg['lifetime_end']}) " + f"users={', '.join(users)}" + ) + lines.append("=" * 72) + + return "\n".join(lines) + "\n" + + +def write_memory_map_txt(memory_map_path: Path, **kwargs): + report_text = build_memory_map_text(**kwargs) + memory_map_path.write_text(report_text, encoding='utf-8') + return report_text diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index e1651a5..a6b7aa5 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -1,7 +1,9 @@ /* * application_driver.sv * - * Copyright (C) 2019-2020 ETH Zurich, University of Bologna + * Sergio Mazzola + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at @@ -14,132 +16,328 @@ /** * Application driver module - * Reads stimuli from file and drives transactions on HCI interface + * Reads stimuli from file and drives transactions on HCI interface. + * + * Stimulus file format (one line per cycle): + * req(1b) id(IWb) wen(1b) data(Nb) add(Ab) -- active transaction + * PAUSE -- fence synchronization point + * + * Idle entries (req=0) are consumed as issue gaps when the driver is free to + * advance. 
While stalled waiting for a grant, the driver may advance over later + * idle entries, so the stimuli file represents offered traffic order and fence + * structure rather than an exact wall-clock replay under backpressure. + * + * When a PAUSE token is encountered the driver drains all in-flight reads + * (waits in DRAIN_FOR_PAUSE), then enters PAUSED and holds fence_reached_o=1 + * until resume_i is asserted. This allows multi-phase execution on a single + * driver without resetting counters between phases. + * Multiple consecutive PAUSE tokens are legal and represent multiple fence slots + * with no intervening traffic (e.g. free-pass completion fence followed by a + * synthetic blocking fence for the next pattern). */ module application_driver #( parameter int unsigned MASTER_NUMBER = 1, - parameter int unsigned IS_HWPE = 1, parameter int unsigned DATA_WIDTH = 1, - parameter int unsigned ADD_WIDTH = 1, - parameter int unsigned APPL_DELAY = 2, // Delay on the input signals + parameter int unsigned ADDR_WIDTH = 1, parameter int unsigned IW = 1, parameter string STIM_FILE = "" ) ( - hci_core_intf.initiator hci_if, - input logic rst_ni, input logic clk_i, - output logic end_stimuli_o, - output logic end_latency_o, - output int unsigned n_issued_transactions_o, - output int unsigned n_issued_read_transactions_o + input logic rst_ni, + input logic resume_i, // asserted by tb_hci when fence dependencies are met + hci_core_intf.initiator hci_if, + output logic fence_reached_o, // held HIGH while driver is paused at a fence + output logic end_resp_o, // held HIGH after all transactions and responses done + output int unsigned n_issued_tr_o, + output int unsigned n_issued_rd_tr_o, + output int unsigned n_retired_rd_tr_o ); - logic [IW-1:0] id; - string file_path; - int stim; - int scan_status; - logic wen; - logic req; - logic [DATA_WIDTH-1:0] data; - logic [ADD_WIDTH-1:0] add; - int unsigned n_completed_read_transactions; - logic pending_rsp_is_read[$]; - - always_ff 
@(posedge clk_i or negedge rst_ni) begin : proc_read_response_counter - logic retired_is_read; - if (!rst_ni) begin - n_completed_read_transactions <= '0; - pending_rsp_is_read.delete(); - end else begin - if (hci_if.req && hci_if.gnt) begin - pending_rsp_is_read.push_back(hci_if.wen); - end - if (hci_if.r_valid && hci_if.r_ready) begin - if (pending_rsp_is_read.size() != 0) begin - retired_is_read = pending_rsp_is_read.pop_front(); - if (retired_is_read) begin - n_completed_read_transactions <= n_completed_read_transactions + 1; - end - end - end - end - end + int unsigned n_req_issued_q, n_req_issued_d; + int unsigned n_rd_req_issued_q, n_rd_req_issued_d; + int unsigned n_rd_resp_retired_q, n_rd_resp_retired_d; + + // Transaction queue from file. is_pause=1 entries are fence tokens, not real transactions. + typedef struct { + logic is_pause; + logic req; + logic [IW-1:0] id; + logic wen; + logic [DATA_WIDTH-1:0] data; + logic [ADDR_WIDTH-1:0] add; + } transaction_t; + transaction_t transactions[$]; + + // Fill up the queue by reading the stimuli file until the end. + // PAUSE lines are read as fence tokens with is_pause=1. 
+ initial begin + string file_path; + int stim; + string line; - initial begin : proc_application_driver - hci_if.id = '0; - hci_if.add = '0; - hci_if.data = '0; - hci_if.req = 1'b0; - hci_if.wen = 1'b0; - hci_if.ecc = '0; - hci_if.ereq = '0; - hci_if.r_eready = '0; - hci_if.be = '1; - hci_if.r_ready = 1'b1; - hci_if.user = '0; - end_stimuli_o = 1'b0; - end_latency_o = 1'b0; - n_issued_transactions_o = '0; - n_issued_read_transactions_o = '0; - - wait (rst_ni); if (STIM_FILE != "") begin file_path = STIM_FILE; end else begin - if (IS_HWPE) begin - file_path = $sformatf( - "../simvectors/generated/stimuli_processed/master_hwpe_%0d.txt", - MASTER_NUMBER - ); - end else begin - file_path = $sformatf( - "../simvectors/generated/stimuli_processed/master_log_%0d.txt", - MASTER_NUMBER - ); - end + $fatal("ERROR: Specify STIM_FILE path"); end stim = $fopen(file_path, "r"); if (stim == 0) begin - $fatal("ERROR: Could not open stimuli file!"); + $fatal("ERROR: Could not open stimuli file: %s", file_path); end - @(posedge clk_i); while (!$feof(stim)) begin - scan_status = $fscanf(stim, "%b %b %b %b %b\n", req, id, wen, data, add); - if (scan_status != 5) begin - if (!$feof(stim)) begin - $fatal(1, "ERROR: malformed stimuli line in %s", file_path); + transaction_t t; + int scan_status; + void'($fgets(line, stim)); + // Strip trailing newline/CR for comparison + if (line.len() > 0 && (line[line.len()-1] == "\n" || line[line.len()-1] == "\r")) + line = line.substr(0, line.len()-2); + if (line.len() > 1 && line[line.len()-1] == "\r") + line = line.substr(0, line.len()-2); + if (line == "PAUSE") begin + t.is_pause = 1'b1; + t.req = 1'b0; + t.id = '0; + t.wen = 1'b0; + t.data = '0; + t.add = '0; + transactions.push_back(t); + end else if (line.len() > 0) begin + t.is_pause = 1'b0; + scan_status = $sscanf(line, "%b %b %b %b %b", + t.req, t.id, t.wen, t.data, t.add); + if (scan_status != 5) begin + if (!$feof(stim)) begin + $fatal(1, "ERROR: malformed stimuli line in %s: '%s'", 
file_path, line); + end + break; end - break; + transactions.push_back(t); end - #(APPL_DELAY); - hci_if.id = id; - hci_if.data = data; - hci_if.add = add; - hci_if.wen = wen; - hci_if.req = req; - - if (req) begin - @(posedge clk_i iff hci_if.gnt); - n_issued_transactions_o++; - if (wen) begin - n_issued_read_transactions_o++; + end + $fclose(stim); + end + + ////////////////// + // Requests FSM // + ////////////////// + + typedef enum logic [2:0] { + REQ_IDLE, + WAIT_GNT, + REQ_DONE, + DRAIN_FOR_PAUSE, // drain in-flight reads before asserting fence_reached_o + PAUSED, // fence synchronization: hold fence_reached_o until resume_i + RSP_DONE + } req_state_t; + + req_state_t req_state_q, req_state_d; + int unsigned tr_idx_q, tr_idx_d; + int unsigned last_op_issued_q, last_op_issued_d; + + assign n_issued_tr_o = n_req_issued_q; + assign n_issued_rd_tr_o = n_rd_req_issued_q; + + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + req_state_q <= REQ_IDLE; + tr_idx_q <= '0; + n_req_issued_q <= '0; + n_rd_req_issued_q <= '0; + last_op_issued_q <= '0; + end else begin + req_state_q <= req_state_d; + tr_idx_q <= tr_idx_d; + n_req_issued_q <= n_req_issued_d; + n_rd_req_issued_q <= n_rd_req_issued_d; + last_op_issued_q <= last_op_issued_d; + end + end + + always_comb begin + // FSM defaults + req_state_d = req_state_q; + tr_idx_d = tr_idx_q; + n_req_issued_d = n_req_issued_q; + n_rd_req_issued_d = n_rd_req_issued_q; + last_op_issued_d = last_op_issued_q; + // HCI output defaults + hci_if.id = '0; + hci_if.add = '0; + hci_if.data = '0; + hci_if.req = 1'b0; + hci_if.wen = 1'b0; + hci_if.ecc = '0; + hci_if.ereq = '0; + hci_if.r_eready = '0; + hci_if.be = '1; + hci_if.r_ready = 1'b1; + hci_if.user = '0; + // Output defaults + fence_reached_o = 1'b0; + end_resp_o = 1'b0; + + case (req_state_q) + REQ_IDLE: begin + if (tr_idx_q < transactions.size()) begin + if (transactions[tr_idx_q].is_pause) begin + // Consume the PAUSE token and drain any in-flight reads 
before pausing + tr_idx_d = tr_idx_q + 1; + if (n_rd_req_issued_q > n_rd_resp_retired_q) begin + req_state_d = DRAIN_FOR_PAUSE; + end else begin + req_state_d = PAUSED; + end + end else begin + tr_idx_d = tr_idx_q + 1; + if (transactions[tr_idx_q].req) begin + hci_if.req = 1'b1; + hci_if.id = transactions[tr_idx_q].id; + hci_if.wen = transactions[tr_idx_q].wen; + hci_if.data = transactions[tr_idx_q].data; + hci_if.add = transactions[tr_idx_q].add; + n_req_issued_d = n_req_issued_q + 1; + if (transactions[tr_idx_q].wen) begin + n_rd_req_issued_d = n_rd_req_issued_q + 1; + end + last_op_issued_d = tr_idx_q; + req_state_d = hci_if.gnt ? REQ_IDLE : WAIT_GNT; + end + end + end else begin + // No more transactions + if (n_rd_req_issued_q > n_rd_resp_retired_q) begin + req_state_d = REQ_DONE; + end else begin + req_state_d = RSP_DONE; + end end - // Deassert in NBA region so monitors sampling this edge see the handshake. - hci_if.id <= '0; - hci_if.data <= '0; - hci_if.add <= '0; - hci_if.wen <= 1'b0; - hci_if.req <= 1'b0; - wait (hci_if.req == 1'b0); - end else begin - @(posedge clk_i); end + + WAIT_GNT: begin + hci_if.req = 1'b1; + hci_if.id = transactions[last_op_issued_q].id; + hci_if.wen = transactions[last_op_issued_q].wen; + hci_if.data = transactions[last_op_issued_q].data; + hci_if.add = transactions[last_op_issued_q].add; + if (tr_idx_q < transactions.size()) begin + // Consume later idle entries while stalled so the driver can hide memory + // latency/backpressure when the workload permits it. This makes req=0 tokens + // issue-gap hints, not strict simulation-time no-op cycles. + if (!transactions[tr_idx_q].req && !transactions[tr_idx_q].is_pause) begin + tr_idx_d = tr_idx_q + 1; + end + req_state_d = hci_if.gnt ? 
REQ_IDLE : WAIT_GNT; + end else begin + if (hci_if.gnt) begin + if (transactions[last_op_issued_q].req && transactions[last_op_issued_q].wen) begin + req_state_d = REQ_DONE; + end else begin + req_state_d = RSP_DONE; + end + end + end + end + + REQ_DONE: begin + if (n_rd_resp_retired_q >= n_rd_req_issued_q) begin + req_state_d = RSP_DONE; + end + end + + DRAIN_FOR_PAUSE: begin + // Wait for all in-flight reads to retire before asserting the fence + if (n_rd_resp_retired_q >= n_rd_req_issued_q) begin + req_state_d = PAUSED; + end + end + + PAUSED: begin + // Hold fence_reached_o HIGH until tb_hci asserts resume_i. + // If the next token is also a PAUSE (e.g. trailing free-pass followed by + // a blocking synthetic idle), consume it immediately and stay in PAUSED + // to avoid a spurious one-cycle REQ_IDLE bounce between consecutive fences. + fence_reached_o = 1'b1; + if (resume_i) begin + if (tr_idx_q < transactions.size() && transactions[tr_idx_q].is_pause) begin + tr_idx_d = tr_idx_q + 1; + req_state_d = PAUSED; + end else begin + req_state_d = REQ_IDLE; + end + end + end + + RSP_DONE: begin + end_resp_o = 1'b1; + end + + default: begin + req_state_d = REQ_IDLE; + end + endcase + end + + /////////////////////// + // Read response FSM // + /////////////////////// + + typedef enum logic [1:0] { + RESP_IDLE, + RESP_WAIT_RVALID + } resp_state_t; + + resp_state_t resp_state_q, resp_state_d; + int unsigned n_rd_in_flight_q, n_rd_in_flight_d; + + assign n_retired_rd_tr_o = n_rd_resp_retired_q; + + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + resp_state_q <= RESP_IDLE; + n_rd_resp_retired_q <= '0; + n_rd_in_flight_q <= '0; + end else begin + resp_state_q <= resp_state_d; + n_rd_resp_retired_q <= n_rd_resp_retired_d; + n_rd_in_flight_q <= n_rd_in_flight_d; end + end - $fclose(stim); - end_stimuli_o = 1'b1; - wait (n_completed_read_transactions >= n_issued_read_transactions_o); - end_latency_o = 1'b1; + always_comb begin + resp_state_d = 
resp_state_q; + n_rd_resp_retired_d = n_rd_resp_retired_q; + n_rd_in_flight_d = n_rd_in_flight_q; + + case (resp_state_q) + RESP_IDLE: begin + if (hci_if.req && hci_if.wen && hci_if.gnt) begin + n_rd_in_flight_d = n_rd_in_flight_q + 1; + resp_state_d = RESP_WAIT_RVALID; + end + end + + RESP_WAIT_RVALID: begin + if (hci_if.req && hci_if.wen && hci_if.gnt) begin + n_rd_in_flight_d = n_rd_in_flight_q + 1; + end + if (hci_if.r_valid && hci_if.r_ready) begin + n_rd_resp_retired_d = n_rd_resp_retired_q + 1; + if (hci_if.req && hci_if.wen && hci_if.gnt) begin + n_rd_in_flight_d = n_rd_in_flight_q; // +1 grant -1 retire = net 0 + end else begin + n_rd_in_flight_d = n_rd_in_flight_q - 1; + end + if (n_rd_in_flight_q == 1 && !(hci_if.req && hci_if.wen && hci_if.gnt)) begin + resp_state_d = RESP_IDLE; + end + end + end + + default: begin + resp_state_d = RESP_IDLE; + end + endcase end + endmodule diff --git a/target/verif/src/throughput_monitor.sv b/target/verif/src/bandwidth_monitor.sv similarity index 75% rename from target/verif/src/throughput_monitor.sv rename to target/verif/src/bandwidth_monitor.sv index b25d16e..524e1ff 100644 --- a/target/verif/src/throughput_monitor.sv +++ b/target/verif/src/bandwidth_monitor.sv @@ -1,7 +1,10 @@ /* - * throughput_monitor.sv + * bandwidth_monitor.sv * - * Copyright (C) 2019-2020 ETH Zurich, University of Bologna + * Sergio Mazzola + * Luca Codeluppi + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. 
You may obtain a copy of the License at @@ -13,21 +16,20 @@ */ /** - * Throughput monitor - * Measures actual throughput and simulation time for each master + * Bandwidth monitor + * Measures actual bandwidth and completion time for each master */ -module throughput_monitor #( +module bandwidth_monitor #( parameter int unsigned N_MASTER, parameter int unsigned N_HWPE, parameter int unsigned CLK_PERIOD, parameter int unsigned DATA_WIDTH, - parameter int unsigned HWPE_WIDTH + parameter int unsigned HWPE_WIDTH_FACT ) ( input logic clk_i, input logic rst_ni, - input logic [0:N_MASTER-1] end_stimuli_i, - input logic [0:N_MASTER-1] end_latency_i, + input logic [N_MASTER-1:0] end_resp_i, // Read transactions number input int unsigned n_read_complete_log_i[N_MASTER-N_HWPE], input int unsigned n_read_complete_hwpe_i[N_HWPE], @@ -36,28 +38,11 @@ module throughput_monitor #( input int unsigned n_write_granted_hwpe_i[N_HWPE], // Completion-side throughput: accepted writes + completed reads per elapsed completion cycle. output real throughput_complete_o, - // Elapsed cycles from reset release to end_stimuli. - output real stim_latency_o, // Total simulation time (cycles) and simulation time per master (cycles) output real tot_latency_o, output real latency_per_master_o[N_MASTER] ); - // Stimulus duration at stimulus completion. - initial begin - time start_time, end_time; - real stim_time_cycles; - stim_latency_o = -1; - wait (rst_ni); - #(CLK_PERIOD/100); - @(posedge clk_i); - start_time = $time; - wait (&end_stimuli_i); - end_time = $time; - stim_time_cycles = real'(end_time - start_time) / real'(CLK_PERIOD); // cycles - stim_latency_o = stim_time_cycles; - end - // Completion-side throughput at full completion. 
initial begin time start_time, end_time; @@ -69,7 +54,7 @@ module throughput_monitor #( #(CLK_PERIOD/100); @(posedge clk_i); start_time = $time; - wait (&end_latency_i); + wait (&end_resp_i); end_time = $time; completion_time_cycles = real'(end_time - start_time) / real'(CLK_PERIOD); // cycles tot_latency_o = completion_time_cycles; @@ -82,7 +67,7 @@ module throughput_monitor #( for (int i = 0; i < N_HWPE; i++) begin tot_data += real'( n_write_granted_hwpe_i[i] + n_read_complete_hwpe_i[i] - ) * real'(HWPE_WIDTH * DATA_WIDTH); + ) * real'(HWPE_WIDTH_FACT * DATA_WIDTH); end if (completion_time_cycles > 0.0) begin throughput_complete_o = tot_data / completion_time_cycles; // bits per cycle @@ -101,7 +86,7 @@ module throughput_monitor #( #(CLK_PERIOD/100); @(posedge clk_i); start_time = $time; - wait (end_latency_i[ii]); + wait (end_resp_i[ii] == 1'b1); end_time = $time; latency_per_master_o[ii] = real'(end_time - start_time) / real'(CLK_PERIOD); end diff --git a/target/verif/src/latency_monitor.sv b/target/verif/src/req_gnt_monitor.sv similarity index 73% rename from target/verif/src/latency_monitor.sv rename to target/verif/src/req_gnt_monitor.sv index d89cdf5..66fa4cb 100644 --- a/target/verif/src/latency_monitor.sv +++ b/target/verif/src/req_gnt_monitor.sv @@ -1,7 +1,10 @@ /* - * latency_monitor.sv + * req_gnt_monitor.sv * - * Copyright (C) 2019-2020 ETH Zurich, University of Bologna + * Sergio Mazzola + * Luca Codeluppi + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. 
You may obtain a copy of the License at @@ -13,19 +16,19 @@ */ /** - * Latency monitor - * Tracks request-to-grant latency and transaction counters for all masters + * Request-to-grant monitor + * Tracks request-to-grant stall latency and transaction counters for all masters */ -module latency_monitor #( +module req_gnt_monitor #( parameter int unsigned N_MASTER = 4, parameter int unsigned N_HWPE = 1 ) ( input logic clk_i, input logic rst_ni, // Monitored interfaces - hci_core_intf.monitor hci_log_if [0:N_MASTER-N_HWPE-1], - hci_core_intf.monitor hci_hwpe_if [0:N_HWPE-1], + hci_core_intf.monitor hci_driver_log_if [0:N_MASTER-N_HWPE-1], + hci_core_intf.monitor hci_driver_hwpe_if [0:N_HWPE-1], // Accumulated request-to-grant latency. output real sum_req_to_gnt_latency_log_o[N_MASTER-N_HWPE], output real sum_req_to_gnt_latency_hwpe_o[N_HWPE], @@ -74,30 +77,32 @@ module latency_monitor #( req_start_cycle_log <= '0; req_prev_log <= 1'b0; end else begin - if (hci_log_if[gi].req && !req_prev_log) begin + if (hci_driver_log_if[gi].req && !req_prev_log && !hci_driver_log_if[gi].gnt) begin req_start_cycle_log <= cycle_q; + end else if (hci_driver_log_if[gi].gnt) begin + req_start_cycle_log <= cycle_q + 1; end - if (hci_log_if[gi].req && hci_log_if[gi].gnt) begin + if (hci_driver_log_if[gi].req && hci_driver_log_if[gi].gnt) begin if (req_prev_log) begin sum_req_to_gnt_latency_log_o[gi] <= sum_req_to_gnt_latency_log_o[gi] + real'(cycle_q - req_start_cycle_log); end n_gnt_transactions_log_o[gi] <= n_gnt_transactions_log_o[gi] + 1; - pending_rsp_is_read_log.push_back(hci_log_if[gi].wen); + pending_rsp_is_read_log.push_back(hci_driver_log_if[gi].wen); end - req_prev_log <= hci_log_if[gi].req; + req_prev_log <= hci_driver_log_if[gi].req; - if (hci_log_if[gi].req && hci_log_if[gi].gnt && hci_log_if[gi].wen) begin + if (hci_driver_log_if[gi].req && hci_driver_log_if[gi].gnt && hci_driver_log_if[gi].wen) begin n_read_granted_log_o[gi] <= n_read_granted_log_o[gi] + 1; end - if 
(hci_log_if[gi].req && hci_log_if[gi].gnt && !hci_log_if[gi].wen) begin + if (hci_driver_log_if[gi].req && hci_driver_log_if[gi].gnt && !hci_driver_log_if[gi].wen) begin n_write_granted_log_o[gi] <= n_write_granted_log_o[gi] + 1; end - if (hci_log_if[gi].r_valid && hci_log_if[gi].r_ready) begin + if (hci_driver_log_if[gi].r_valid && hci_driver_log_if[gi].r_ready) begin if (pending_rsp_is_read_log.size() != 0) begin retired_is_read_log = pending_rsp_is_read_log.pop_front(); if (retired_is_read_log) begin @@ -130,30 +135,32 @@ module latency_monitor #( req_start_cycle_hwpe <= '0; req_prev_hwpe <= 1'b0; end else begin - if (hci_hwpe_if[gi].req && !req_prev_hwpe) begin + if (hci_driver_hwpe_if[gi].req && !req_prev_hwpe && !hci_driver_hwpe_if[gi].gnt) begin req_start_cycle_hwpe <= cycle_q; + end else if (hci_driver_hwpe_if[gi].gnt) begin + req_start_cycle_hwpe <= cycle_q + 1; end - if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt) begin + if (hci_driver_hwpe_if[gi].req && hci_driver_hwpe_if[gi].gnt) begin if (req_prev_hwpe) begin sum_req_to_gnt_latency_hwpe_o[gi] <= sum_req_to_gnt_latency_hwpe_o[gi] + real'(cycle_q - req_start_cycle_hwpe); end n_gnt_transactions_hwpe_o[gi] <= n_gnt_transactions_hwpe_o[gi] + 1; - pending_rsp_is_read_hwpe.push_back(hci_hwpe_if[gi].wen); + pending_rsp_is_read_hwpe.push_back(hci_driver_hwpe_if[gi].wen); end - req_prev_hwpe <= hci_hwpe_if[gi].req; + req_prev_hwpe <= hci_driver_hwpe_if[gi].req; - if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt && hci_hwpe_if[gi].wen) begin + if (hci_driver_hwpe_if[gi].req && hci_driver_hwpe_if[gi].gnt && hci_driver_hwpe_if[gi].wen) begin n_read_granted_hwpe_o[gi] <= n_read_granted_hwpe_o[gi] + 1; end - if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt && !hci_hwpe_if[gi].wen) begin + if (hci_driver_hwpe_if[gi].req && hci_driver_hwpe_if[gi].gnt && !hci_driver_hwpe_if[gi].wen) begin n_write_granted_hwpe_o[gi] <= n_write_granted_hwpe_o[gi] + 1; end - if (hci_hwpe_if[gi].r_valid && hci_hwpe_if[gi].r_ready) begin + if 
(hci_driver_hwpe_if[gi].r_valid && hci_driver_hwpe_if[gi].r_ready) begin if (pending_rsp_is_read_hwpe.size() != 0) begin retired_is_read_hwpe = pending_rsp_is_read_hwpe.pop_front(); if (retired_is_read_hwpe) begin diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index 4d4e2e4..7f27050 100644 --- a/target/verif/src/simulation_report.sv +++ b/target/verif/src/simulation_report.sv @@ -1,7 +1,10 @@ /* * simulation_report.sv * - * Copyright (C) 2026 ETH Zurich, University of Bologna + * Sergio Mazzola + * Luca Codeluppi + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at @@ -20,21 +23,19 @@ module simulation_report import tb_hci_pkg::*; ( - input logic [0:N_MASTER-1] end_stimuli_i, - input logic [0:N_MASTER-1] end_latency_i, - input real throughput_complete_i, - input real stim_latency_i, - input real tot_latency_i, - input real latency_per_master_i[N_MASTER], - input real sum_req_to_gnt_latency_log_i[N_MASTER-N_HWPE], + input logic [N_DRIVERS-1:0] end_resp_i, + input real throughput_complete_i, + input real tot_latency_i, + input real latency_per_master_i[N_DRIVERS], + input real sum_req_to_gnt_latency_log_i[N_DRIVERS-N_HWPE], input real sum_req_to_gnt_latency_hwpe_i[N_HWPE], - input int unsigned n_gnt_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_gnt_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_gnt_transactions_hwpe_i[N_HWPE], - input int unsigned n_read_granted_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_read_granted_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_read_granted_transactions_hwpe_i[N_HWPE], - input int unsigned n_write_granted_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned 
n_write_granted_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_write_granted_transactions_hwpe_i[N_HWPE], - input int unsigned n_read_complete_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_read_complete_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_read_complete_transactions_hwpe_i[N_HWPE] ); @@ -60,6 +61,16 @@ module simulation_report int unsigned log_masters_with_grants; int unsigned hwpe_masters_with_grants; logic missing_reads; + // Ideal bandwidth: maximum data the memory system can serve per cycle. + // Memory side: N_BANKS narrow ports, each DATA_WIDTH bits wide. + real ideal_bw_mem_side_bpc; // bits per cycle (memory-side ceiling) + // Interconnect side: HCI-facing narrow/wide initiator interfaces. + real ideal_bw_interco_side_bpc; // bits per cycle (interconnect-side ceiling) + real ideal_bw_bpc; // min(mem, interco) = bottleneck ideal BW + real actual_bw_utilization; // throughput_complete / ideal_bw + int unsigned n_narrow_if_total; + int unsigned n_wide_if_total; + string interco_type_str; sum_req_to_gnt_latency_all = 0.0; average_req_to_gnt_latency_weighted = 0.0; @@ -81,12 +92,10 @@ module simulation_report hwpe_masters_with_grants = '0; missing_reads = 1'b0; - wait (&end_stimuli_i); - wait (stim_latency_i >= 0); - wait (&end_latency_i); + wait (&end_resp_i); wait (throughput_complete_i >= 0); wait (tot_latency_i >= 0); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin total_read_granted_transactions += n_read_granted_transactions_log_i[i]; total_write_granted_transactions += n_write_granted_transactions_log_i[i]; total_read_complete_transactions += n_read_complete_transactions_log_i[i]; @@ -104,7 +113,7 @@ module simulation_report log_masters_with_grants++; end end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin total_read_granted_transactions += n_read_granted_transactions_log_i[i]; 
total_write_granted_transactions += n_write_granted_transactions_log_i[i]; total_read_complete_transactions += n_read_complete_transactions_log_i[i]; @@ -122,7 +131,7 @@ module simulation_report log_masters_with_grants++; end end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin total_read_granted_transactions += n_read_granted_transactions_log_i[i]; total_write_granted_transactions += n_write_granted_transactions_log_i[i]; total_read_complete_transactions += n_read_complete_transactions_log_i[i]; @@ -140,7 +149,7 @@ module simulation_report log_masters_with_grants++; end end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin total_read_granted_transactions += n_read_granted_transactions_hwpe_i[i]; total_write_granted_transactions += n_write_granted_transactions_hwpe_i[i]; total_read_complete_transactions += n_read_complete_transactions_hwpe_i[i]; @@ -184,32 +193,74 @@ module simulation_report average_req_to_gnt_latency_hwpe_unweighted / real'(hwpe_masters_with_grants); end + // Ideal bandwidth computation. + // Memory side: each of the N_BANKS banks can serve one DATA_WIDTH word per cycle. + ideal_bw_mem_side_bpc = real'(N_BANKS) * real'(DATA_WIDTH); + // Interconnect side: HCI interface ports (narrow + wide). + // NOTE: for MUX mode N_WIDE_HCI=1, i.e. one shared wide initiator. + n_narrow_if_total = N_NARROW_HCI + N_DMA + N_EXT; + n_wide_if_total = N_WIDE_HCI; + ideal_bw_interco_side_bpc = real'(n_narrow_if_total) * real'(DATA_WIDTH) + + real'(n_wide_if_total) * real'(HWPE_WIDTH_FACT * DATA_WIDTH); + // Bottleneck = minimum of the two sides. + ideal_bw_bpc = (ideal_bw_mem_side_bpc < ideal_bw_interco_side_bpc) + ? ideal_bw_mem_side_bpc : ideal_bw_interco_side_bpc; + // Utilization = actual / ideal. + actual_bw_utilization = (ideal_bw_bpc > 0.0) + ? 
(throughput_complete_i / ideal_bw_bpc * 100.0) : 0.0; + if (INTERCO_TYPE == LOG) begin + interco_type_str = "LOG"; + end else if (INTERCO_TYPE == HCI) begin + interco_type_str = "HCI"; + end else if (INTERCO_TYPE == MUX) begin + interco_type_str = "MUX"; + end else begin + interco_type_str = "UNKNOWN"; + end + $display("------ Simulation Summary ------"); $display("\\\\HW CONFIG\\\\"); $display( "Masters: CORE=%0d DMA=%0d EXT=%0d HWPE=%0d (total=%0d)", - N_CORE_REAL, N_DMA_REAL, N_EXT_REAL, N_HWPE_REAL, N_MASTER_REAL + N_CORE, N_DMA, N_EXT, N_HWPE, N_DRIVERS ); $display( "Memory: banks=%0d total_size=%0d kB data_width=%0d bits hwpe_width=%0d lanes", - N_BANKS, TOT_MEM_SIZE, DATA_WIDTH, HWPE_WIDTH + N_BANKS, TOT_MEM_SIZE, DATA_WIDTH, HWPE_WIDTH_FACT ); $display( "Interconnect: SEL_LIC=%0d TS_BIT=%0d EXPFIFO=%0d", SEL_LIC, TS_BIT, EXPFIFO ); $display( - "ID/address: IW=%0d ADD_WIDTH=%0d AddrMemWidth=%0d", - IW, ADD_WIDTH, AddrMemWidth + "Interconnect-side: TYPE=%s N_NARROW_HCI=%0d N_WIDE_HCI=%0d N_DMA=%0d N_EXT=%0d", + interco_type_str, N_NARROW_HCI, N_WIDE_HCI, N_DMA, N_EXT + ); + $display( + "ID/address: IW=%0d ADDR_WIDTH=%0d ADDR_WIDTH_BANK=%0d", + IW, ADDR_WIDTH, ADDR_WIDTH_BANK ); $display("\n\\\\BANDWIDTH\\\\"); $display( - "Completion bandwidth (writes granted + reads completed): %0.1f bit/cycle", - throughput_complete_i + "Ideal BW (memory side): %0.0f bit/cycle [%0d banks x %0d bits]", + ideal_bw_mem_side_bpc, N_BANKS, DATA_WIDTH + ); + $display( + "Ideal BW (interco side): %0.0f bit/cycle [%0d narrow-if x %0d bits + %0d wide-if x %0d bits]", + ideal_bw_interco_side_bpc, + n_narrow_if_total, DATA_WIDTH, + n_wide_if_total, HWPE_WIDTH_FACT * DATA_WIDTH + ); + $display( + "Ideal BW (bottleneck): %0.0f bit/cycle", + ideal_bw_bpc + ); + $display( + "Actual BW (completion): %0.2f bit/cycle [utilization: %0.1f%%]", + throughput_complete_i, actual_bw_utilization ); - $display("Stimulus phase duration: %0.1f cycles", stim_latency_i); - $display("Completion phase 
duration: %0.1f cycles", tot_latency_i); + $display("Completion phase duration: %0.2f cycles", tot_latency_i); $display( "Granted transactions: reads=%0d writes=%0d total=%0d", total_read_granted_transactions, @@ -222,36 +273,36 @@ module simulation_report ); $display("\n\\\\SIMULATION TIME\\\\"); - $display("Total simulation time: %0.1f cycles", tot_latency_i); - for (int i = 0; i < N_CORE_REAL; i++) begin + $display("Total simulation time: %0.2f cycles", tot_latency_i); + for (int i = 0; i < N_CORE; i++) begin $display( - "Core%0d (master_log_%0d): %0.1f cycles", + "Core%0d (master_log_%0d): %0.2f cycles", i, i, latency_per_master_i[i] ); end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin $display( - "DMA%0d (master_log_%0d): %0.1f cycles", + "DMA%0d (master_log_%0d): %0.2f cycles", i - N_CORE, i, latency_per_master_i[i] ); end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin $display( - "EXT%0d (master_log_%0d): %0.1f cycles", + "EXT%0d (master_log_%0d): %0.2f cycles", i - (N_CORE + N_DMA), i, latency_per_master_i[i] ); end - for (int i = N_MASTER - N_HWPE; i < N_MASTER - N_HWPE + N_HWPE_REAL; i++) begin + for (int i = N_DRIVERS - N_HWPE; i < N_DRIVERS - N_HWPE + N_HWPE; i++) begin $display( - "HWPE%0d (master_hwpe_%0d): %0.1f cycles", - i - (N_MASTER - N_HWPE), - i - (N_MASTER - N_HWPE), + "HWPE%0d (master_hwpe_%0d): %0.2f cycles", + i - (N_DRIVERS - N_HWPE), + i - (N_DRIVERS - N_HWPE), latency_per_master_i[i] ); end $display("\n\\\\READ RESPONSE COVERAGE\\\\"); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin expected_reads = n_read_granted_transactions_log_i[i]; observed_reads = n_read_complete_transactions_log_i[i]; $display( @@ -262,7 +313,7 @@ module simulation_report missing_reads = 1'b1; end end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + 
for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin expected_reads = n_read_granted_transactions_log_i[i]; observed_reads = n_read_complete_transactions_log_i[i]; $display( @@ -273,7 +324,7 @@ module simulation_report missing_reads = 1'b1; end end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin expected_reads = n_read_granted_transactions_log_i[i]; observed_reads = n_read_complete_transactions_log_i[i]; $display( @@ -284,7 +335,7 @@ module simulation_report missing_reads = 1'b1; end end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin expected_reads = n_read_granted_transactions_hwpe_i[i]; observed_reads = n_read_complete_transactions_hwpe_i[i]; $display( @@ -300,7 +351,7 @@ module simulation_report end $display("\n\\\\TRANSACTION COUNTS\\\\"); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin $display( "master_log_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -309,7 +360,7 @@ module simulation_report n_read_complete_transactions_log_i[i] ); end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin $display( "master_log_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -318,7 +369,7 @@ module simulation_report n_read_complete_transactions_log_i[i] ); end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin $display( "master_log_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -327,7 +378,7 @@ module simulation_report n_read_complete_transactions_log_i[i] ); end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin $display( "master_hwpe_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -338,9 +389,9 @@ module simulation_report end $display("\n\\\\REQUEST-TO-GRANT 
LATENCY\\\\"); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin $display( - "master_log_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_log_i[i] != 0) ? (sum_req_to_gnt_latency_log_i[i] / real'(n_gnt_transactions_log_i[i])) : @@ -348,9 +399,9 @@ module simulation_report n_gnt_transactions_log_i[i] ); end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin $display( - "master_log_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_log_i[i] != 0) ? (sum_req_to_gnt_latency_log_i[i] / real'(n_gnt_transactions_log_i[i])) : @@ -358,9 +409,9 @@ module simulation_report n_gnt_transactions_log_i[i] ); end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin $display( - "master_log_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_log_i[i] != 0) ? (sum_req_to_gnt_latency_log_i[i] / real'(n_gnt_transactions_log_i[i])) : @@ -368,9 +419,9 @@ module simulation_report n_gnt_transactions_log_i[i] ); end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin $display( - "master_hwpe_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_hwpe_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_hwpe_i[i] != 0) ? 
(sum_req_to_gnt_latency_hwpe_i[i] / real'(n_gnt_transactions_hwpe_i[i])) : @@ -380,32 +431,32 @@ module simulation_report end $display(""); $display( - "Total accumulated req->gnt latency: %0.1f cycles over %0d grants", + "Total accumulated req->gnt latency: %0d cycles over %0d grants", sum_req_to_gnt_latency_all, total_gnt_transactions_all ); $display( - "LOG avg req->gnt latency (weighted by grant count): %0.1f cycles", + "LOG avg req->gnt stall latency (weighted by grant count): %0.2f cycles", average_req_to_gnt_latency_log_weighted ); $display( - "LOG avg req->gnt latency (mean of per-master averages): %0.1f cycles", + "LOG avg req->gnt stall latency (mean of per-master averages): %0.2f cycles", average_req_to_gnt_latency_log_unweighted ); $display( - "HWPE avg req->gnt latency (weighted by grant count): %0.1f cycles", + "HWPE avg req->gnt stall latency (weighted by grant count): %0.2f cycles", average_req_to_gnt_latency_hwpe_weighted ); $display( - "HWPE avg req->gnt latency (mean of per-master averages): %0.1f cycles", + "HWPE avg req->gnt stall latency (mean of per-master averages): %0.2f cycles", average_req_to_gnt_latency_hwpe_unweighted ); $display( - "Global avg req->gnt latency (weighted by grant count): %0.1f cycles", + "Global avg req->gnt stall latency (weighted by grant count): %0.2f cycles", average_req_to_gnt_latency_weighted ); $display( - "Global avg req->gnt latency (mean of per-master averages): %0.1f cycles", + "Global avg req->gnt stall latency (mean of per-master averages): %0.2f cycles", average_req_to_gnt_latency_unweighted ); $display(""); diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 5fb3411..5ada60b 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -4,8 +4,7 @@ * Sergio Mazzola * Luca Codeluppi * - * - * Copyright (C) 2019-2025 ETH Zurich, University of Bologna + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the 
Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at @@ -27,14 +26,12 @@ module tb_hci (); logic clk, rst_n; - logic s_clear; + logic [N_DRIVERS-1:0] s_end_resp; // end_resp_o from all drivers + logic [N_DRIVERS-1:0] s_fence_reached; // fence_reached_o from all drivers (level, HIGH while PAUSED) + logic [N_DRIVERS-1:0] s_resume; // resume_i to each driver (asserted when fence deps are met) + int unsigned fence_idx [N_DRIVERS]; // number of fences each driver has passed so far hci_interconnect_ctrl_t s_hci_ctrl; - logic [0:N_MASTER-1] s_end_stimuli; - logic [0:N_MASTER-1] s_end_latency; - int unsigned s_issued_transactions[0:N_MASTER-1]; - int unsigned s_issued_read_transactions[0:N_MASTER-1]; - clk_rst_gen #( .ClkPeriod(CLK_PERIOD), .RstClkCycles(RST_CLK_CYCLES) @@ -43,39 +40,80 @@ module tb_hci .rst_no(rst_n) ); - ///////// - // HCI // - ///////// + //////////////////// + // HCI interfaces // + //////////////////// - /* HCI interfaces */ - localparam hci_size_parameter_t `HCI_SIZE_PARAM(cores) = '{ // CORE + DMA + EXT parameters - DW: DEFAULT_DW, - AW: DEFAULT_AW, - BW: DEFAULT_BW, - UW: DEFAULT_UW, - IW: IW, - EW: DEFAULT_EW, - EHW: DEFAULT_EHW + localparam hci_size_parameter_t `HCI_SIZE_PARAM(cores) = '{ + DW: DW_cores, + AW: AW_cores, + BW: BW_cores, + UW: UW_cores, + IW: IW_cores, + EW: EW_cores, + EHW: EHW_cores }; - localparam hci_size_parameter_t `HCI_SIZE_PARAM(mems) = '{ // Bank parameters - DW: DEFAULT_DW, - AW: AddrMemWidth, - BW: DEFAULT_BW, - UW: DEFAULT_UW, - IW: IW, - EW: DEFAULT_EW, - EHW: DEFAULT_EHW + + localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe) = '{ + DW: DW_hwpe, + AW: AW_hwpe, + BW: BW_hwpe, + UW: UW_hwpe, + IW: IW_hwpe, + EW: EW_hwpe, + EHW: EHW_hwpe }; - localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe) = '{ // HWPE parameters - DW: HWPE_WIDTH*DATA_WIDTH, - AW: DEFAULT_AW, - BW: DEFAULT_BW, - UW: DEFAULT_UW, - IW: 
IW, - EW: DEFAULT_EW, - EHW: DEFAULT_EHW + + localparam hci_size_parameter_t `HCI_SIZE_PARAM(mems) = '{ + DW: DW_mems, + AW: AW_mems, + BW: BW_mems, + UW: UW_mems, + IW: IW_mems, + EW: EW_mems, + EHW: EHW_mems }; + /* Application-driver-side interfaces */ + + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_driver_log_if [0:N_LOG_MASTERS-1] ( + .clk(clk) + ); + + hci_core_intf #( + .DW(HCI_SIZE_hwpe.DW), + .AW(HCI_SIZE_hwpe.AW), + .BW(HCI_SIZE_hwpe.BW), + .UW(HCI_SIZE_hwpe.UW), + .IW(HCI_SIZE_hwpe.IW), + .EW(HCI_SIZE_hwpe.EW), + .EHW(HCI_SIZE_hwpe.EHW) + ) hci_driver_hwpe_if [0:N_HWPE-1] ( + .clk(clk) + ); + + /* Interconnect-side interfaces (hci_system-style organization) */ + + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_initiator_narrow [0:N_NARROW_HCI-1] ( + .clk(clk) + ); + hci_core_intf #( .DW(HCI_SIZE_hwpe.DW), .AW(HCI_SIZE_hwpe.AW), @@ -84,7 +122,19 @@ module tb_hci .IW(HCI_SIZE_hwpe.IW), .EW(HCI_SIZE_hwpe.EW), .EHW(HCI_SIZE_hwpe.EHW) - ) hci_hwpe_if [0:N_HWPE-1] ( + ) hci_initiator_wide [0:N_WIDE_HCI-1] ( + .clk(clk) + ); + + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_initiator_dma [0:N_DMA-1] ( .clk(clk) ); @@ -96,7 +146,7 @@ module tb_hci .IW(HCI_SIZE_cores.IW), .EW(HCI_SIZE_cores.EW), .EHW(HCI_SIZE_cores.EHW) - ) hci_log_if [0:N_MASTER-N_HWPE-1] ( + ) hci_initiator_ext [0:N_EXT-1] ( .clk(clk) ); @@ -112,26 +162,187 @@ module tb_hci .WAIVE_RQ4_ASSERT(1'b1), .WAIVE_RSP3_ASSERT(1'b1), .WAIVE_RSP5_ASSERT(1'b1) - ) hci_mem_if [0:N_BANKS-1] ( + ) hci_target_mems [0:N_BANKS-1] ( 
.clk(clk) ); - /* HCI instance */ + /////////////////////////// + // Interface assignments // + /////////////////////////// + + /* Assignments of narrow initiators to LOG branch of HCI */ + + generate + for (genvar ii = 0; ii < N_CORE; ii++) begin : gen_core_to_narrow + hci_core_assign i_core_to_narrow_assign ( + .tcdm_target(hci_driver_log_if[ii]), + .tcdm_initiator(hci_initiator_narrow[ii]) + ); + end + + for (genvar ii = 0; ii < N_DMA; ii++) begin : gen_dma_to_hci + hci_core_assign i_dma_to_hci_assign ( + .tcdm_target(hci_driver_log_if[N_CORE + ii]), + .tcdm_initiator(hci_initiator_dma[ii]) + ); + end + + for (genvar ii = 0; ii < N_EXT; ii++) begin : gen_ext_to_hci + hci_core_assign i_ext_to_hci_assign ( + .tcdm_target(hci_driver_log_if[N_CORE + N_DMA + ii]), + .tcdm_initiator(hci_initiator_ext[ii]) + ); + end + endgenerate + + /* Assignments of wide initiators to HCI (either LOG branch, HCI branch, or static MUX) */ + + generate + if (INTERCO_TYPE == HCI) begin : gen_hwpe_hci + for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_hci_assign + hci_core_assign i_hwpe_hci_assign ( + .tcdm_target(hci_driver_hwpe_if[ii]), + .tcdm_initiator(hci_initiator_wide[ii]) + ); + end + end else if (INTERCO_TYPE == MUX) begin : gen_hwpe_mux + // Phase-ordered MUX arbitration: + // + // The mux is held by whichever HWPE is currently running (not paused, not done). + // In-flight reads are drained before any PAUSE (DRAIN_FOR_PAUSE state in the + // driver FSM), so sel_i is safe to switch as soon as fence_reached_o goes high. + // + // When no HWPE is running (all are either paused or done), the mux is granted + // to the lowest-indexed HWPE that is paused AND whose fence dependencies are + // satisfied (s_resume high). This serializes same-phase jobs by master ID and + // respects cross-phase data dependencies. + logic [$clog2(N_HWPE > 1 ? 
N_HWPE-1 : 1):0] s_mux_sel; + always_comb begin + automatic logic any_running; + any_running = 1'b0; + for (int i = 0; i < N_HWPE; i++) begin + if (!s_end_resp[N_LOG_MASTERS + i] && !s_fence_reached[N_LOG_MASTERS + i]) + any_running = 1'b1; + end + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(N_HWPE - 1); + for (int i = N_HWPE-1; i >= 0; i--) begin + if (any_running) begin + // Active HWPE holds the mux; lowest index wins (descending loop) + if (!s_end_resp[N_LOG_MASTERS + i] && !s_fence_reached[N_LOG_MASTERS + i]) + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); + end else begin + // No HWPE running: grant to lowest-indexed paused+ready HWPE + if (!s_end_resp[N_LOG_MASTERS + i] && s_fence_reached[N_LOG_MASTERS + i] + && s_resume[N_LOG_MASTERS + i]) + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); + end + end + end + hci_core_mux_static #( + .NB_CHAN(N_HWPE), + .`HCI_SIZE_PARAM(in)(HCI_SIZE_hwpe) + ) i_hwpe_mux ( + .clk_i(clk), + .rst_ni(rst_n), + .clear_i(s_clear), + .sel_i(s_mux_sel), + .in(hci_driver_hwpe_if), + .out(hci_initiator_wide[0]) + ); + end else if (INTERCO_TYPE == LOG) begin : gen_hwpe_split + for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_split_per_master + hci_core_split #( + .DW(HWPE_WIDTH_FACT * DATA_WIDTH), + .BW(DATA_WIDTH / 8), + .UW(1), + .NB_OUT_CHAN(HWPE_WIDTH_FACT), + .FIFO_DEPTH(2), + .`HCI_SIZE_PARAM(tcdm_target)(HCI_SIZE_hwpe) + ) i_hwpe_to_log_split ( + .clk_i(clk), + .rst_ni(rst_n), + .clear_i(s_clear), + .tcdm_target(hci_driver_hwpe_if[ii]), + .tcdm_initiator(hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT : N_CORE + (ii + 1) * HWPE_WIDTH_FACT - 1]) + ); + end + end else begin : gen_unsupported_mode + initial $error("Unsupported INTERCO_TYPE"); + end + endgenerate + + ///////////////// + // Fence logic // + ///////////////// - assign s_clear = 0; + logic s_clear; + assign s_clear = 1'b0; + + assign s_hci_ctrl.arb_policy = ARBITER_MODE; assign s_hci_ctrl.invert_prio = INVERT_PRIO; - assign 
s_hci_ctrl.low_prio_max_stall = LOW_PRIO_MAX_STALL; + assign s_hci_ctrl.priority_cnt_numerator = PRIORITY_CNT_NUMERATOR; + assign s_hci_ctrl.priority_cnt_denominator = PRIORITY_CNT_DENOMINATOR; + + // fence_idx[i] = number of PAUSE tokens driver i has passed. + // This counts all fences in file order, including synthetic blocking fences + // and trailing completion fences. + // + // fence_idx increments when resume_i is asserted while fence_reached_o is high, + // i.e. when the PAUSED-state handshake completes and the driver leaves that fence. + always_ff @(posedge clk or negedge rst_n) begin + if (!rst_n) begin + for (int i = 0; i < N_DRIVERS; i++) fence_idx[i] <= '0; + end else begin + for (int i = 0; i < N_DRIVERS; i++) begin + if (s_resume[i] && s_fence_reached[i]) + fence_idx[i] <= fence_idx[i] + 1; + end + end + end + + // s_resume[i] is asserted only while driver i is paused at its current fence. + // Driver i may pass that fence when, for every dependency bit j set in the + // current FENCE_MASKS entry, fence_idx[j] is at least the required level + // encoded in FENCE_REQ_LEVELS_PACKED. + // + // In other words: blocking fences wait for explicit dependency completion; + // trailing zero-mask fences are free passes. + always_comb begin + for (int i = 0; i < N_DRIVERS; i++) begin + automatic logic [N_DRIVERS-1:0] cur_mask; + automatic logic all_satisfied; + cur_mask = (fence_idx[i] < MAX_FENCES) ? FENCE_MASKS[i][fence_idx[i]] : '0; + all_satisfied = 1'b1; + for (int j = 0; j < N_DRIVERS; j++) begin + if (cur_mask[j]) begin + automatic logic [3:0] req; + req = (fence_idx[i] < MAX_FENCES) ? + FENCE_REQ_LEVELS_PACKED[i][fence_idx[i]][j*4+3 -: 4] : 4'h0; + if (fence_idx[j] < req) + all_satisfied = 1'b0; + end + end + // Only assert resume_i while the driver is actually in PAUSED state. + // Gating with fence_reached_o makes the signal a clean pulse. 
+ s_resume[i] = all_satisfied && s_fence_reached[i]; + end + end + + ///////// + // HCI // + ///////// hci_interconnect #( - .N_HWPE(N_HWPE), // Number of HWPEs attached to the port - .N_CORE(N_CORE), // Number of Core ports - .N_DMA(N_DMA), // Number of DMA ports - .N_EXT(N_EXT), // Number of External ports - .N_MEM(N_BANKS), // Number of Memory banks - .TS_BIT(TS_BIT), // TEST_SET_BIT (for Log Interconnect) - .IW(IW), // ID Width - .EXPFIFO(EXPFIFO), // FIFO Depth for HWPE Interconnect - .SEL_LIC(SEL_LIC), // Log interconnect type selector + .N_HWPE(N_WIDE_HCI), + .N_CORE(N_NARROW_HCI), + .N_DMA(N_DMA), + .N_EXT(N_EXT), + .N_MEM(N_BANKS), + .TS_BIT(TS_BIT), + .IW(IW), + .EXPFIFO(EXPFIFO), + .SEL_LIC(SEL_LIC), + .FILTER_WRITE_R_VALID(FILTER_WRITE_R_VALID), .HCI_SIZE_cores(HCI_SIZE_cores), .HCI_SIZE_mems(HCI_SIZE_mems), .HCI_SIZE_hwpe(HCI_SIZE_hwpe), @@ -144,11 +355,11 @@ module tb_hci .rst_ni(rst_n), .clear_i(s_clear), .ctrl_i(s_hci_ctrl), - .cores(hci_log_if[0:N_CORE-1]), - .dma(hci_log_if[N_CORE:N_CORE+N_DMA-1]), - .ext(hci_log_if[N_CORE+N_DMA:N_CORE+N_DMA+N_EXT-1]), - .mems(hci_mem_if), - .hwpe(hci_hwpe_if) + .cores(hci_initiator_narrow), + .dma(hci_initiator_dma), + .ext(hci_initiator_ext), + .mems(hci_target_mems), + .hwpe(hci_initiator_wide) ); ////////// @@ -159,66 +370,67 @@ module tb_hci .BankSize(N_WORDS), .NbBanks(N_BANKS), .DataWidth(DATA_WIDTH), - .AddrWidth(ADD_WIDTH), - .BeWidth(DATA_WIDTH/8), + .AddrWidth(ADDR_WIDTH), + .BeWidth(DATA_WIDTH / 8), .IdWidth(IW) ) i_tb_mem ( .clk_i(clk), .rst_ni(rst_n), .test_mode_i(1'b0), - .tcdm_slave(hci_mem_if) + .tcdm_slave(hci_target_mems) ); ///////////////////////// // Application drivers // ///////////////////////// - /* CORE + DMA + EXT */ + int unsigned s_issued_transactions[0:N_DRIVERS-1]; + int unsigned s_issued_read_transactions[0:N_DRIVERS-1]; + generate - for (genvar ii = 0; ii < N_MASTER - N_HWPE; ii++) begin : gen_app_driver_log + for (genvar ii = 0; ii < N_LOG_MASTERS; ii++) begin : 
gen_app_driver_log localparam string STIM_FILE_LOG = - $sformatf("../simvectors/generated/stimuli_processed/master_log_%0d.txt", ii); + $sformatf("../simvectors/generated/stimuli/master_log_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), - .IS_HWPE(0), .DATA_WIDTH(DATA_WIDTH), - .ADD_WIDTH(ADD_WIDTH), - .APPL_DELAY(APPL_DELAY), // Delay on the input signals - .IW(IW), + .ADDR_WIDTH(ADDR_WIDTH), + .IW(IW_cores), .STIM_FILE(STIM_FILE_LOG) ) i_app_driver_log ( - .hci_if(hci_log_if[ii]), - .rst_ni(rst_n), .clk_i(clk), - .end_stimuli_o(s_end_stimuli[ii]), - .end_latency_o(s_end_latency[ii]), - .n_issued_transactions_o(s_issued_transactions[ii]), - .n_issued_read_transactions_o(s_issued_read_transactions[ii]) + .rst_ni(rst_n), + .resume_i(s_resume[ii]), + .hci_if(hci_driver_log_if[ii]), + .fence_reached_o(s_fence_reached[ii]), + .end_resp_o(s_end_resp[ii]), + .n_issued_tr_o(s_issued_transactions[ii]), + .n_issued_rd_tr_o(s_issued_read_transactions[ii]), + .n_retired_rd_tr_o() ); end endgenerate - /* HWPE */ generate for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_app_driver_hwpe localparam string STIM_FILE_HWPE = - $sformatf("../simvectors/generated/stimuli_processed/master_hwpe_%0d.txt", ii); + $sformatf("../simvectors/generated/stimuli/master_hwpe_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), - .IS_HWPE(1), - .DATA_WIDTH(HWPE_WIDTH * DATA_WIDTH), - .ADD_WIDTH(ADD_WIDTH), - .APPL_DELAY(APPL_DELAY), // Delay on the input signals - .IW(IW), + .DATA_WIDTH(HWPE_WIDTH_FACT * DATA_WIDTH), + .ADDR_WIDTH(ADDR_WIDTH), + .IW(IW_hwpe), .STIM_FILE(STIM_FILE_HWPE) ) i_app_driver_hwpe ( - .hci_if(hci_hwpe_if[ii]), - .rst_ni(rst_n), .clk_i(clk), - .end_stimuli_o(s_end_stimuli[N_MASTER-N_HWPE+ii]), - .end_latency_o(s_end_latency[N_MASTER-N_HWPE+ii]), - .n_issued_transactions_o(s_issued_transactions[N_MASTER-N_HWPE+ii]), - .n_issued_read_transactions_o(s_issued_read_transactions[N_MASTER-N_HWPE+ii]) + .rst_ni(rst_n), + .resume_i(s_resume[N_LOG_MASTERS + ii]), + 
.hci_if(hci_driver_hwpe_if[ii]), + .fence_reached_o(s_fence_reached[N_LOG_MASTERS + ii]), + .end_resp_o(s_end_resp[N_LOG_MASTERS + ii]), + .n_issued_tr_o(s_issued_transactions[N_LOG_MASTERS + ii]), + .n_issued_rd_tr_o(s_issued_read_transactions[N_LOG_MASTERS + ii]), + .n_retired_rd_tr_o() ); end endgenerate @@ -227,54 +439,48 @@ module tb_hci // QoS // ///////// - real SUM_REQ_TO_GNT_LATENCY_LOG[N_MASTER-N_HWPE]; + real SUM_REQ_TO_GNT_LATENCY_LOG[N_LOG_MASTERS]; real SUM_REQ_TO_GNT_LATENCY_HWPE[N_HWPE]; - int unsigned N_GNT_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_GNT_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_GNT_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_READ_GRANTED_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_READ_GRANTED_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_READ_GRANTED_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_WRITE_GRANTED_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_WRITE_GRANTED_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_WRITE_GRANTED_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_READ_COMPLETE_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_READ_COMPLETE_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_READ_COMPLETE_TRANSACTIONS_HWPE[N_HWPE]; - /* REAL THROUGHPUT AND SIMULATION TIME */ - - real latency_per_master[N_MASTER]; + real latency_per_master[N_DRIVERS]; real throughput_completed; - real stim_latency; real tot_latency; - throughput_monitor #( - .N_MASTER(N_MASTER), + bandwidth_monitor #( + .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE), .CLK_PERIOD(CLK_PERIOD), .DATA_WIDTH(DATA_WIDTH), - .HWPE_WIDTH(HWPE_WIDTH) - ) i_throughput_monitor ( + .HWPE_WIDTH_FACT(HWPE_WIDTH_FACT) + ) i_bandwidth_monitor ( .clk_i(clk), .rst_ni(rst_n), - .end_stimuli_i(s_end_stimuli), - .end_latency_i(s_end_latency), + .end_resp_i(s_end_resp), .n_read_complete_log_i(N_READ_COMPLETE_TRANSACTIONS_LOG), .n_read_complete_hwpe_i(N_READ_COMPLETE_TRANSACTIONS_HWPE), .n_write_granted_log_i(N_WRITE_GRANTED_TRANSACTIONS_LOG), 
.n_write_granted_hwpe_i(N_WRITE_GRANTED_TRANSACTIONS_HWPE), .throughput_complete_o(throughput_completed), - .stim_latency_o(stim_latency), .tot_latency_o(tot_latency), .latency_per_master_o(latency_per_master) ); - /* LATENCY MONITOR */ - latency_monitor #( - .N_MASTER(N_MASTER), + req_gnt_monitor #( + .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE) - ) i_latency_monitor ( + ) i_req_gnt_monitor ( .clk_i(clk), .rst_ni(rst_n), - .hci_log_if(hci_log_if), - .hci_hwpe_if(hci_hwpe_if), + .hci_driver_log_if(hci_driver_log_if), + .hci_driver_hwpe_if(hci_driver_hwpe_if), .sum_req_to_gnt_latency_log_o(SUM_REQ_TO_GNT_LATENCY_LOG), .sum_req_to_gnt_latency_hwpe_o(SUM_REQ_TO_GNT_LATENCY_HWPE), .n_gnt_transactions_log_o(N_GNT_TRANSACTIONS_LOG), @@ -287,16 +493,13 @@ module tb_hci .n_read_complete_hwpe_o(N_READ_COMPLETE_TRANSACTIONS_HWPE) ); - /////////// - // Other // - /////////// + /////////////// + // Reporting // + /////////////// - /* SIMULATION REPORT */ simulation_report i_simulation_report ( - .end_stimuli_i(s_end_stimuli), - .end_latency_i(s_end_latency), + .end_resp_i(s_end_resp), .throughput_complete_i(throughput_completed), - .stim_latency_i(stim_latency), .tot_latency_i(tot_latency), .latency_per_master_i(latency_per_master), .sum_req_to_gnt_latency_log_i(SUM_REQ_TO_GNT_LATENCY_LOG), @@ -311,7 +514,10 @@ module tb_hci .n_read_complete_transactions_hwpe_i(N_READ_COMPLETE_TRANSACTIONS_HWPE) ); - /* ASSERTIONS */ + //////////////// + // Assertions // + //////////////// + localparam int unsigned MAX_BANK_LOCAL_ADDR = TOT_MEM_SIZE * 1024 / N_BANKS - WIDTH_OF_MEMORY_BYTE; @@ -319,15 +525,15 @@ module tb_hci for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_assert_hwpe_address a_hwpe_addr_in_bounds: assert property ( @(posedge clk) - get_bank_local_address(hci_hwpe_if[ii].add) <= MAX_BANK_LOCAL_ADDR + get_bank_local_address(hci_driver_hwpe_if[ii].add) <= MAX_BANK_LOCAL_ADDR ) else begin $display("--------------------------------------------"); $display("Time %0t: Test stopped", 
$time); $error( "HWPE%0d generated an out-of-bounds address (raw=0x%0h, bank_local=0x%0h, max=0x%0h).", ii, - hci_hwpe_if[ii].add, - get_bank_local_address(hci_hwpe_if[ii].add), + hci_driver_hwpe_if[ii].add, + get_bank_local_address(hci_driver_hwpe_if[ii].add), MAX_BANK_LOCAL_ADDR ); $display("This workload is invalid; rerun with a different workload configuration."); @@ -336,4 +542,17 @@ module tb_hci end endgenerate + // Advisory check only. The hard overflow guard is in Python generation + // before packing FENCE_REQ_LEVELS into 4-bit fields. + // In case of failure due to this assertion, modify tb_hci_pkg.sv and generation of fence_masks.mk + initial begin + if (MAX_FENCES > 16) begin + $warning( + "MAX_FENCES=%0d exceeds the nominal 4-bit fence-level range; " + "ensure no dependency requires a level > 15.", + MAX_FENCES + ); + end + end + endmodule diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index cdaf055..507565c 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -4,8 +4,7 @@ * Sergio Mazzola * Luca Codeluppi * - * - * Copyright (C) 2019-2025 ETH Zurich, University of Bologna + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. 
You may obtain a copy of the License at @@ -18,77 +17,146 @@ package tb_hci_pkg; + typedef enum logic [1:0] { + LOG = 2'd0, + MUX = 2'd1, + HCI = 2'd2 + } interco_e; + ////////////////////////// // Testbench parameters // ////////////////////////// // from verif/config/testbench.mk - /* Timing parameters */ - - localparam time CLK_PERIOD = `ifdef CLK_PERIOD `CLK_PERIOD `else 6 `endif; - localparam time APPL_DELAY = 0; - localparam unsigned RST_CLK_CYCLES = `ifdef RST_CLK_CYCLES `RST_CLK_CYCLES `else 10 `endif; + localparam time CLK_PERIOD = `ifdef CLK_PERIOD `CLK_PERIOD `else 6 `endif; + localparam time APPL_DELAY = 0; + localparam unsigned RST_CLK_CYCLES = `ifdef RST_CLK_CYCLES `RST_CLK_CYCLES `else 10 `endif; - /* Simulation parameters */ + // TCDM and arbitration parameters + localparam int unsigned RANDOM_GNT = `ifdef RANDOM_GNT `RANDOM_GNT `else 0 `endif; + localparam int unsigned ARBITER_MODE = 0; + localparam int unsigned INVERT_PRIO = `ifdef INVERT_PRIO `INVERT_PRIO `else 0 `endif; + localparam int unsigned PRIORITY_CNT_NUMERATOR = `ifdef PRIORITY_CNT_NUMERATOR `PRIORITY_CNT_NUMERATOR `else 3 `endif; + localparam int unsigned PRIORITY_CNT_DENOMINATOR = `ifdef PRIORITY_CNT_DENOMINATOR `PRIORITY_CNT_DENOMINATOR `else 4 `endif; - // Transaction counts - localparam int unsigned N_TRANSACTION_LOG = `ifdef N_TRANSACTION_LOG `N_TRANSACTION_LOG `else 10 `endif; - localparam int unsigned TRANSACTION_RATIO = `ifdef TRANSACTION_RATIO `TRANSACTION_RATIO `else 1 `endif; - localparam int unsigned N_TRANSACTION_HWPE = int'(N_TRANSACTION_LOG * TRANSACTION_RATIO); - - // TCDM interface parameters - localparam int unsigned MAX_CYCLES_BETWEEN_GNT_RVALID = `ifdef MAX_CYCLES_BETWEEN_GNT_RVALID `MAX_CYCLES_BETWEEN_GNT_RVALID `else 1 `endif; - localparam int unsigned RANDOM_GNT = `ifdef RANDOM_GNT `RANDOM_GNT `else 0 `endif; + ///////////////////////////// + // Configurable parameters // + ///////////////////////////// + // from verif/config/hardware.mk - // Arbiter configuration 
- localparam int unsigned ARBITER_MODE = 0; - localparam int unsigned INVERT_PRIO = `ifdef INVERT_PRIO `INVERT_PRIO `else 0 `endif; - localparam int unsigned LOW_PRIO_MAX_STALL = `ifdef LOW_PRIO_MAX_STALL `LOW_PRIO_MAX_STALL `else 3 `endif; + // Interconnect mode + localparam interco_e INTERCO_TYPE = `ifdef INTERCO_TYPE `INTERCO_TYPE `else HCI `endif; - ///////////////////////// - // Hardware parameters // - ///////////////////////// - // from verif/config/hardware.mk + // Number of initiators + localparam int unsigned N_HWPE = `ifdef N_HWPE `N_HWPE `else 1 `endif; + localparam int unsigned N_CORE = `ifdef N_CORE `N_CORE `else 1 `endif; + localparam int unsigned N_DMA = `ifdef N_DMA `N_DMA `else 1 `endif; + localparam int unsigned N_EXT = `ifdef N_EXT `N_EXT `else 1 `endif; - /* Config */ + // Interconnect configuration + localparam int unsigned TS_BIT = `ifdef TS_BIT `TS_BIT `else 0 `endif; + localparam int unsigned EXPFIFO = `ifdef EXPFIFO `EXPFIFO `else 0 `endif; + localparam int unsigned SEL_LIC = `ifdef SEL_LIC `SEL_LIC `else 0 `endif; - // Master port counts - localparam int unsigned N_HWPE_REAL = `ifdef N_HWPE `N_HWPE `else 1 `endif; // Number of HWPEs attached to the port - localparam int unsigned N_CORE_REAL = `ifdef N_CORE `N_CORE `else 1 `endif; // Number of Core ports - localparam int unsigned N_DMA_REAL = `ifdef N_DMA `N_DMA `else 1 `endif; // Number of DMA ports - localparam int unsigned N_EXT_REAL = `ifdef N_EXT `N_EXT `else 1 `endif; // Number of External ports + // Memory system configuration + localparam int unsigned N_BANKS = `ifdef N_BANKS `N_BANKS `else 16 `endif; + localparam int unsigned TOT_MEM_SIZE = `ifdef TOT_MEM_SIZE `TOT_MEM_SIZE `else 32 `endif; + localparam int unsigned DATA_WIDTH = `ifdef DATA_WIDTH `DATA_WIDTH `else 32 `endif; + localparam int unsigned HWPE_WIDTH_FACT = `ifdef HWPE_WIDTH_FACT `HWPE_WIDTH_FACT `else 4 `endif; - // Normalized master counts (minimum 1 for array sizing) - localparam int unsigned N_HWPE = (N_HWPE_REAL == 
0) ? 1 : N_HWPE_REAL; - localparam int unsigned N_CORE = (N_CORE_REAL == 0) ? 1 : N_CORE_REAL; - localparam int unsigned N_DMA = (N_DMA_REAL == 0) ? 1 : N_DMA_REAL; - localparam int unsigned N_EXT = (N_EXT_REAL == 0) ? 1 : N_EXT_REAL; - localparam int unsigned N_MASTER = N_HWPE + N_CORE + N_DMA + N_EXT; // Total number of masters - localparam int unsigned N_MASTER_REAL = N_HWPE_REAL + N_CORE_REAL + N_DMA_REAL + N_EXT_REAL; // Total number of masters (real) + ////////////////////////// + // Hardcoded parameters // + ////////////////////////// - // Interconnect configuration - localparam int unsigned TS_BIT = `ifdef TS_BIT `TS_BIT `else 0 `endif; // TEST_SET_BIT (for Log Interconnect) - localparam int unsigned EXPFIFO = `ifdef EXPFIFO `EXPFIFO `else 0 `endif; // FIFO Depth for HWPE Interconnect - localparam int unsigned SEL_LIC = `ifdef SEL_LIC `SEL_LIC `else 0 `endif; // Log interconnect type selector + localparam int unsigned WORD_SIZE = DATA_WIDTH / 8; - // Data and memory parameters - localparam int unsigned DATA_WIDTH = `ifdef DATA_WIDTH `DATA_WIDTH `else 32 `endif; // Width of DATA in bits - localparam int unsigned HWPE_WIDTH = `ifdef HWPE_WIDTH `HWPE_WIDTH `else 4 `endif; // Width of an HWPE wide-word - localparam int unsigned TOT_MEM_SIZE = `ifdef TOT_MEM_SIZE `TOT_MEM_SIZE `else 32 `endif; // Memory size (kB) - localparam int unsigned N_BANKS = `ifdef N_BANKS `N_BANKS `else 16 `endif; // Number of memory banks + ////////////////////////// + // Dependent parameters // + ////////////////////////// - /* Derived parameters */ + localparam int unsigned N_LOG_MASTERS = N_CORE + N_DMA + N_EXT; + localparam int unsigned N_DRIVERS = N_LOG_MASTERS + N_HWPE; + +// Fence parameters. +// +// Fence slots enumerate every PAUSE token in the stimulus file, in file order. +// This includes: +// (1) synthetic blocking PAUSEs inserted before patterns that have wait_for_jobs, +// (2) trailing PAUSEs emitted after every pattern. 
+// +// fence_idx[i] counts how many PAUSE tokens driver i has passed so far. +// It is therefore a "passed-fence count", not a "completed-pattern count". +// +// FENCE_MASKS[i][f][j] = 1: +// at fence slot f of driver i, driver j is a dependency. +// +// FENCE_REQ_LEVELS_PACKED[i][f][j]: +// minimum fence_idx[j] required before driver i may pass fence slot f. +// +// For a synthetic pre-pattern fence, the required level corresponds to the +// dependency driver's fence count after the referenced job has completed. +// For a trailing pattern fence, the mask is zero and the fence is a free pass. +// +// Both arrays are generated by main.py and passed via defines. + localparam int unsigned MAX_FENCES = + `ifdef MAX_FENCES_PARAM `MAX_FENCES_PARAM `else 1 `endif; + localparam logic [N_DRIVERS-1:0] FENCE_MASKS [N_DRIVERS][MAX_FENCES] = + `ifdef FENCE_MASKS_PARAM `FENCE_MASKS_PARAM `else '{default: '{default: '0}} `endif; + // FENCE_REQ_LEVELS_PACKED[i][f] is a packed vector of N_DRIVERS × 4 bits. + // Bits [j*4+3:j*4] hold the required fence_idx[j] before driver i can pass fence f. + // 4 bits supports up to 15 (sufficient for up to 7 patterns × 2 fences/pattern). + localparam logic [N_DRIVERS*4-1:0] FENCE_REQ_LEVELS_PACKED [N_DRIVERS][MAX_FENCES] = + `ifdef FENCE_REQ_LEVELS_PARAM `FENCE_REQ_LEVELS_PARAM `else '{default: '{default: '0}} `endif; + + // If fully log interconnect is used, instantiate HWPE_WIDTH_FACT narrow ports for each HWPE. + localparam int unsigned N_NARROW_HCI = + N_CORE + (N_HWPE * HWPE_WIDTH_FACT) * (INTERCO_TYPE == LOG); + + // If full HCI is used, instantiate one wide port per HWPE. + // If static MUX is used, instantiate a single shared wide port. + localparam int unsigned N_WIDE_HCI = + (INTERCO_TYPE == HCI) ? N_HWPE : ((INTERCO_TYPE == MUX) ? 1 : 0); + + // One-hot response ID width used by the interconnect. 
+ localparam int unsigned IW = N_NARROW_HCI + N_WIDE_HCI + N_DMA + N_EXT; + + localparam int unsigned FILTER_WRITE_R_VALID[0:N_WIDE_HCI-1] = '{default: 0}; + + localparam int unsigned ADDR_WIDTH = $clog2(TOT_MEM_SIZE * 1024); + localparam int unsigned WIDTH_OF_MEMORY = DATA_WIDTH; + localparam int unsigned WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8; + localparam int unsigned BIT_BANK_INDEX = $clog2(N_BANKS); + localparam int unsigned ADDR_WIDTH_BANK = ADDR_WIDTH - BIT_BANK_INDEX; + localparam int unsigned N_WORDS = + (TOT_MEM_SIZE * 1024 / N_BANKS) / WIDTH_OF_MEMORY_BYTE; - localparam int unsigned ADD_WIDTH = $clog2(TOT_MEM_SIZE * 1024); // Width of ADDRESS in bits - localparam int unsigned WIDTH_OF_MEMORY = DATA_WIDTH; // Width of a memory bank (bits) - localparam int unsigned WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8; // Width of a memory bank (bytes) - localparam int unsigned BIT_BANK_INDEX = $clog2(N_BANKS); // Bits of the Bank index - localparam int unsigned AddrMemWidth = ADD_WIDTH - BIT_BANK_INDEX; // Number of address bits per TCDM bank - localparam int unsigned N_WORDS = (TOT_MEM_SIZE * 1024 / N_BANKS) / WIDTH_OF_MEMORY_BYTE; // Number of words in a bank - localparam int unsigned FILTER_WRITE_R_VALID = '0; + /////////////// + // Bitwidths // + /////////////// - localparam int unsigned IW = $clog2(N_TRANSACTION_LOG * (N_MASTER_REAL - N_HWPE_REAL) + N_TRANSACTION_HWPE * N_HWPE_REAL); // ID Width - localparam int unsigned TOT_CHECK = N_TRANSACTION_LOG * (N_CORE_REAL + N_DMA_REAL + N_EXT_REAL) + N_HWPE_REAL * N_TRANSACTION_HWPE * HWPE_WIDTH; + localparam int unsigned DW_cores = DATA_WIDTH; + localparam int unsigned AW_cores = 32; + localparam int unsigned BW_cores = 8; + localparam int unsigned UW_cores = 1; + localparam int unsigned IW_cores = 8; + localparam int unsigned EW_cores = 1; + localparam int unsigned EHW_cores = 1; + + localparam int unsigned DW_hwpe = DW_cores * HWPE_WIDTH_FACT; + localparam int unsigned AW_hwpe = AW_cores; + localparam int 
unsigned BW_hwpe = BW_cores; + localparam int unsigned UW_hwpe = UW_cores; + localparam int unsigned IW_hwpe = IW_cores; + localparam int unsigned EW_hwpe = EW_cores; + localparam int unsigned EHW_hwpe = EHW_cores; + + localparam int unsigned DW_mems = DW_cores; + localparam int unsigned AW_mems = ADDR_WIDTH_BANK; + localparam int unsigned BW_mems = BW_cores; + localparam int unsigned UW_mems = UW_cores; + localparam int unsigned IW_mems = IW; + localparam int unsigned EW_mems = EW_cores; + localparam int unsigned EHW_mems = EHW_cores; /////////// // Types // @@ -97,45 +165,42 @@ package tb_hci_pkg; typedef struct packed { logic wen; logic [DATA_WIDTH-1:0] data; - logic [ADD_WIDTH-1:0] add; + logic [ADDR_WIDTH-1:0] add; } stimuli_t; typedef struct packed { - logic [DATA_WIDTH - 1 : 0] data; - logic [AddrMemWidth - 1 : 0] add; + logic [DATA_WIDTH-1:0] data; + logic [ADDR_WIDTH_BANK-1:0] add; } out_intc_to_mem_t; - // Helper return type for HWPE address/data creation typedef struct { - logic [ADD_WIDTH-1:0] address; + logic [ADDR_WIDTH-1:0] address; logic [DATA_WIDTH-1:0] data; - logic rolls_over; + logic rolls_over; } hwpe_addr_data_t; ///////////// // Helpers // ///////////// - // Zero-time pure function returning address/data for an HWPE lane function automatic hwpe_addr_data_t create_address_and_data_hwpe( - input logic [ADD_WIDTH-1:0] address_before, - input logic [HWPE_WIDTH * DATA_WIDTH-1:0] data_before, - input int index, - input logic rolls_over_check_before + input logic [ADDR_WIDTH-1:0] address_before, + input logic [HWPE_WIDTH_FACT * DATA_WIDTH-1:0] data_before, + input int index, + input logic rolls_over_check_before ); hwpe_addr_data_t ret; logic [BIT_BANK_INDEX-1:0] bank_index_before, bank_index_after; begin - bank_index_before = address_before[BIT_BANK_INDEX-1 + 2 : 2]; - // Legacy behavior: add full-width index and let truncation wrap - bank_index_after = index + bank_index_before; - ret.rolls_over = rolls_over_check_before; + bank_index_before = 
address_before[BIT_BANK_INDEX-1 + 2:2]; + bank_index_after = index + bank_index_before; + ret.rolls_over = rolls_over_check_before; if (bank_index_before > bank_index_after) begin ret.rolls_over = 1'b1; end ret.address = { - address_before[ADD_WIDTH-1:BIT_BANK_INDEX + 2] + ret.rolls_over, + address_before[ADDR_WIDTH-1:BIT_BANK_INDEX + 2] + ret.rolls_over, bank_index_after, address_before[1:0] }; @@ -145,43 +210,20 @@ package tb_hci_pkg; endfunction task calculate_bank_index( - input logic [ADD_WIDTH-1:0] address, + input logic [ADDR_WIDTH-1:0] address, output logic [BIT_BANK_INDEX-1:0] index ); index = address[BIT_BANK_INDEX-1 + 2:2]; endtask - /* Metrics helpers */ - - task calculate_theoretical_throughput(output real throughput_theo); - real tot_data, band_memory_limit, tot_time; - if (TRANSACTION_RATIO >= 1) begin - tot_time = N_TRANSACTION_HWPE; - end else begin - tot_time = N_TRANSACTION_LOG; - end - tot_data = (N_TRANSACTION_LOG * DATA_WIDTH) * (N_MASTER_REAL - N_HWPE_REAL) + - (N_TRANSACTION_HWPE * HWPE_WIDTH * DATA_WIDTH) * N_HWPE_REAL; // bit - throughput_theo = tot_data / tot_time; // bit per cycle - band_memory_limit = real'(N_BANKS * DATA_WIDTH); - if (throughput_theo >= band_memory_limit) begin - throughput_theo = band_memory_limit; - end - endtask - - /////////////// - // Functions // - /////////////// - - // Convert a full system address to per-bank local word address. 
- function int unsigned get_bank_local_address(input logic [ADD_WIDTH-1:0] addr_i); - logic [ADD_WIDTH-1:0] mapped_addr; - logic [ADD_WIDTH-BIT_BANK_INDEX-1:0] bank_local_addr; + function int unsigned get_bank_local_address(input logic [ADDR_WIDTH-1:0] addr_i); + logic [ADDR_WIDTH-1:0] mapped_addr; + logic [ADDR_WIDTH-BIT_BANK_INDEX-1:0] bank_local_addr; tb_hci_pkg::hwpe_addr_data_t hwpe_lane_addr_data; - hwpe_lane_addr_data = create_address_and_data_hwpe(addr_i, '0, HWPE_WIDTH, '0); + hwpe_lane_addr_data = create_address_and_data_hwpe(addr_i, '0, HWPE_WIDTH_FACT, '0); mapped_addr = hwpe_lane_addr_data.address; bank_local_addr = { - mapped_addr[ADD_WIDTH-1:BIT_BANK_INDEX + 2], + mapped_addr[ADDR_WIDTH-1:BIT_BANK_INDEX + 2], mapped_addr[1:0] }; return int'(bank_local_addr); diff --git a/target/verif/src/tcdm_banks_wrap.sv b/target/verif/src/tcdm_banks_wrap.sv index 46c251c..7aad9a8 100644 --- a/target/verif/src/tcdm_banks_wrap.sv +++ b/target/verif/src/tcdm_banks_wrap.sv @@ -83,18 +83,20 @@ module tcdm_banks_wrap #( .rdata_o(tcdm_slave[i].r_data ) // read data ); + //NOTE: Commented out. 
r_valid response is handled by interconnect + //r_valid - always_ff @(posedge clk_i or negedge rst_ni) begin : rvalid_gen - if(~rst_ni) begin - tcdm_slave[i].r_valid <= 1'b0; - end else begin - if(tcdm_slave[i].req && tcdm_slave[i].gnt && tcdm_slave[i].wen) begin - tcdm_slave[i].r_valid <= 1'b1; - end else begin - tcdm_slave[i].r_valid <= 1'b0; - end - end - end + // always_ff @(posedge clk_i or negedge rst_ni) begin : rvalid_gen + // if(~rst_ni) begin + // tcdm_slave[i].r_valid <= 1'b0; + // end else begin + // if(tcdm_slave[i].req && tcdm_slave[i].gnt && tcdm_slave[i].wen) begin + // tcdm_slave[i].r_valid <= 1'b1; + // end else begin + // tcdm_slave[i].r_valid <= 1'b0; + // end + // end + // end end endmodule diff --git a/target/verif/verif.mk b/target/verif/verif.mk index dd656b0..5d08eba 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -1,37 +1,76 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. 
# SPDX-License-Identifier: SHL-0.51 # # Sergio Mazzola +.DELETE_ON_ERROR: + HCI_VERIF_DIR = $(HCI_ROOT)/target/verif HCI_VERIF_CFG_DIR = $(HCI_VERIF_DIR)/config HCI_VERIF_CFG_GEN_DIR = $(HCI_VERIF_CFG_DIR)/generated +# Other Makefiles +include $(HCI_VERIF_DIR)/exploration/exploration.mk + # Include generated Makefiles include $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk include $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk +ifeq (,$(filter clean%,$(MAKECMDGOALS))) +-include $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk +endif # Bender targets and defines include $(HCI_VERIF_DIR)/bender.mk # Tooling #NOTE: Only QuestaSim is currently supported by verification framework -SIM_QUESTA ?= questa-2022.3 +ifneq (,$(wildcard /etc/iis.version)) + SIM_QUESTA ?= questa-2022.3 +else + SIM_QUESTA ?= +endif SIM_VLIB ?= $(SIM_QUESTA) vlib SIM_VSIM ?= $(SIM_QUESTA) vsim SIM_VOPT ?= $(SIM_QUESTA) vopt PYTHON ?= python3 +################## +# Simvectors gen # +################## + +GEN_STIM_SCRIPT := $(HCI_VERIF_DIR)/simvectors/main.py +STIM_SRC_FILES := $(shell find $(HCI_VERIF_DIR)/config -type f -not -path '$(HCI_VERIF_CFG_GEN_DIR)/*') \ + $(shell find $(HCI_VERIF_DIR)/simvectors -type f -not -path '$(HCI_VERIF_DIR)/simvectors/generated/*') +SIMVECTORS_GEN_DIR := $(HCI_VERIF_DIR)/simvectors/generated + +.PHONY: stim-verif +stim-verif: $(SIMVECTORS_GEN_DIR)/.stim_stamp +$(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(VERIF_CFG_MK) $(STIM_SRC_FILES) $(GEN_STIM_SCRIPT) | $(HCI_VERIF_CFG_GEN_DIR) + mkdir -p $(SIMVECTORS_GEN_DIR) + $(PYTHON) $(GEN_STIM_SCRIPT) \ + --workload_config $(WORKLOAD_JSON) \ + --testbench_config $(TESTBENCH_JSON) \ + --hardware_config $(HARDWARE_JSON) \ + --emit_phases_mk $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk + date > $@ + +.PHONY: clean-stim-verif +clean-stim-verif: + rm -rf $(SIMVECTORS_GEN_DIR) + ############## # Config gen # ############## -# Source-of-truth JSON configs -VERIF_CFG_JSON := $(HCI_VERIF_CFG_DIR)/hardware.json \ - $(HCI_VERIF_CFG_DIR)/testbench.json \ - 
$(HCI_VERIF_CFG_DIR)/workload.json +# JSON configs are configurable from env var (default to config/) +HARDWARE_JSON ?= $(HCI_VERIF_CFG_DIR)/hardware.json +TESTBENCH_JSON ?= $(HCI_VERIF_CFG_DIR)/testbench.json +WORKLOAD_JSON ?= $(HCI_VERIF_CFG_DIR)/workload.json + +# Source-of-truth JSON configs (used as stim-verif dependencies) +VERIF_CFG_JSON := $(HARDWARE_JSON) $(TESTBENCH_JSON) $(WORKLOAD_JSON) # Makefiles to generate from JSON configs VERIF_CFG_MK := $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk \ @@ -39,45 +78,26 @@ VERIF_CFG_MK := $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk \ .PHONY: config-verif config-verif: $(VERIF_CFG_MK) -# Generate Makefiles from JSON configs -$(HCI_VERIF_CFG_GEN_DIR)/%.mk: $(HCI_VERIF_CFG_DIR)/%.json $(HCI_VERIF_CFG_GEN_DIR)/%.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) - $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py $* $(HCI_VERIF_CFG_DIR) $(HCI_VERIF_CFG_GEN_DIR) > $@ + +$(HCI_VERIF_CFG_GEN_DIR)/hardware.mk: $(HARDWARE_JSON) $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) + $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py hardware $(HARDWARE_JSON) $(HCI_VERIF_CFG_GEN_DIR) > $@ + +$(HCI_VERIF_CFG_GEN_DIR)/testbench.mk: $(TESTBENCH_JSON) $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) + $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py testbench $(TESTBENCH_JSON) $(HCI_VERIF_CFG_GEN_DIR) > $@ $(HCI_VERIF_CFG_GEN_DIR): mkdir -p $@ .PHONY: clean-config-verif clean-config-verif: - rm -f $(VERIF_CFG_MK) - -################## -# Simvectors gen # -################## - -GEN_STIM_SCRIPT := $(HCI_VERIF_DIR)/simvectors/main.py -STIM_SRC_FILES := $(shell find {$(HCI_VERIF_DIR)/config,$(HCI_VERIF_DIR)/simvectors} -type f) -SIMVECTORS_GEN_DIR := $(HCI_VERIF_DIR)/simvectors/generated - -.PHONY: stim-verif -stim-verif: $(SIMVECTORS_GEN_DIR)/.stim_stamp -$(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) 
$(STIM_SRC_FILES) - mkdir -p $(SIMVECTORS_GEN_DIR) - $(PYTHON) $(GEN_STIM_SCRIPT) \ - --workload_config $(HCI_VERIF_CFG_DIR)/workload.json \ - --testbench_config $(HCI_VERIF_CFG_DIR)/testbench.json \ - --hardware_config $(HCI_VERIF_CFG_DIR)/hardware.json - date > $@ - -.PHONY: clean-stim-verif -clean-stim-verif: - rm -rf $(SIMVECTORS_GEN_DIR) + rm -f $(VERIF_CFG_MK) $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk ############## # Simulation # ############## # Parameters -GUI ?= 0 +GUI ?= $(if $(gui),$(gui),0) # Top-level to simulate sim_top_level ?= tb_hci sim_vsim_lib ?= $(HCI_VERIF_DIR)/vsim/work @@ -99,9 +119,15 @@ ifeq ($(GUI),0) SIM_HCI_VSIM_ARGS += -c endif -$(HCI_VERIF_DIR)/vsim/compile.tcl: $(HCI_ROOT)/Bender.lock $(HCI_ROOT)/Bender.yml $(HCI_ROOT)/bender.mk $(HCI_VERIF_DIR)/bender.mk $(SIM_SRC_FILES) $(VERIF_CFG_MK) +FENCE_MASKS_MK := $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk +MAX_FENCES_PARAM = $(shell grep '^MAX_FENCES_PARAM' $(FENCE_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) +FENCE_MASKS_PARAM = $(shell grep '^FENCE_MASKS_PARAM' $(FENCE_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) +FENCE_REQ_LEVELS_PARAM = $(shell grep '^FENCE_REQ_LEVELS_PACKED_PARAM' $(FENCE_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) + +$(HCI_VERIF_DIR)/vsim/compile.tcl: $(HCI_ROOT)/Bender.lock $(HCI_ROOT)/Bender.yml $(HCI_ROOT)/bender.mk $(HCI_VERIF_DIR)/bender.mk $(SIM_SRC_FILES) $(VERIF_CFG_MK) $(SIMVECTORS_GEN_DIR)/.stim_stamp mkdir -p $(HCI_VERIF_DIR)/vsim - $(BENDER) script vsim $(COMMON_DEFS) $(VERIF_DEFS) $(COMMON_TARGS) $(VERIF_TARGS) --vlog-arg="$(SIM_HCI_VLOG_ARGS)" > $@ + $(BENDER) script vsim $(COMMON_DEFS) $(VERIF_DEFS) $(COMMON_TARGS) $(VERIF_TARGS) \ + --vlog-arg="$(SIM_HCI_VLOG_ARGS) \"+define+MAX_FENCES_PARAM=$(MAX_FENCES_PARAM) +define+FENCE_MASKS_PARAM=$(FENCE_MASKS_PARAM) +define+FENCE_REQ_LEVELS_PARAM=$(FENCE_REQ_LEVELS_PARAM)\"" > $@ .PHONY: compile-verif compile-verif: $(sim_vsim_lib)/.hw_compiled @@ -119,11 +145,12 @@ $(sim_vsim_lib)/$(sim_top_level)_optimized/.tb_opt_compiled: 
$(sim_vsim_lib)/.hw date > $@ .PHONY: run-verif -run-verif: $(sim_vsim_lib)/$(sim_top_level)_optimized/.tb_opt_compiled $(SIMVECTORS_GEN_DIR)/.stim_stamp +run-verif: $(HCI_VERIF_DIR)/vsim/$(sim_top_level).tcl $(sim_vsim_lib)/$(sim_top_level)_optimized/.tb_opt_compiled $(SIMVECTORS_GEN_DIR)/.stim_stamp cd $(HCI_VERIF_DIR)/vsim && \ $(SIM_VSIM) $(SIM_HCI_VSIM_ARGS) \ $(sim_top_level)_optimized \ - -do 'set GUI $(GUI); source $(HCI_VERIF_DIR)/vsim/$(sim_top_level).tcl' + -do 'set GUI $(GUI); source $<' + .PHONY: clean-verif clean-sim-verif: diff --git a/target/verif/vsim/tb_hci.tcl b/target/verif/vsim/tb_hci.tcl index 01111cc..e0de90a 100644 --- a/target/verif/vsim/tb_hci.tcl +++ b/target/verif/vsim/tb_hci.tcl @@ -1,5 +1,176 @@ # If GUI is 1, spawn waveforms if {$GUI == 1} { echo "GUI mode enabled" + log -r /* + + set N_CORE [examine -radix dec /tb_hci_pkg/N_CORE] + set N_DMA [examine -radix dec /tb_hci_pkg/N_DMA] + set N_EXT [examine -radix dec /tb_hci_pkg/N_EXT] + set N_HWPE [examine -radix dec /tb_hci_pkg/N_HWPE] + set N_BANKS [examine -radix dec /tb_hci_pkg/N_BANKS] + + set N_LOG_MASTERS [examine -radix dec /tb_hci_pkg/N_LOG_MASTERS] + set N_DRIVERS [examine -radix dec /tb_hci_pkg/N_DRIVERS] + set N_NARROW_HCI [examine -radix dec /tb_hci_pkg/N_NARROW_HCI] + set N_WIDE_HCI [examine -radix dec /tb_hci_pkg/N_WIDE_HCI] + set HWPE_WIDTH_FACT [examine -radix dec /tb_hci_pkg/HWPE_WIDTH_FACT] + set INTERCO_TYPE [examine /tb_hci_pkg/INTERCO_TYPE] + set MAX_FENCES [examine -radix dec /tb_hci_pkg/MAX_FENCES] + + add wave -noupdate /tb_hci/clk + add wave -noupdate /tb_hci/rst_n + + add wave -noupdate -divider Interfaces + # ------------------------------------------------------------------------- + # Application-driver interfaces + # ------------------------------------------------------------------------- + add wave -noupdate -group driver_side -divider narrow_masters + for {set i 0} {$i < $N_LOG_MASTERS} {incr i} { + add wave -noupdate -group driver_side -group log_$i 
/tb_hci/hci_driver_log_if[$i]/* + } + + add wave -noupdate -group driver_side -divider hwpe_masters + for {set i 0} {$i < $N_HWPE} {incr i} { + add wave -noupdate -group driver_side -group hwpe_$i /tb_hci/hci_driver_hwpe_if[$i]/* + } + + # ------------------------------------------------------------------------- + # Interconnect-side interfaces + # ------------------------------------------------------------------------- + add wave -noupdate -group hci_initiator_side -divider narrow_cores + for {set i 0} {$i < $N_CORE} {incr i} { + add wave -noupdate -group hci_initiator_side -group core_$i /tb_hci/hci_initiator_narrow[$i]/* + } + + if {[string first "LOG" $INTERCO_TYPE] != -1} { + add wave -noupdate -group hci_initiator_side -divider narrow_hwpe_split + for {set i 0} {$i < $N_HWPE} {incr i} { + for {set f 0} {$f < $HWPE_WIDTH_FACT} {incr f} { + set idx [expr {$N_CORE + $i * $HWPE_WIDTH_FACT + $f}] + if {$idx < $N_NARROW_HCI} { + add wave -noupdate -group hci_initiator_side -group hwpe_$i -group lane_$f /tb_hci/hci_initiator_narrow[$idx]/* + } + } + } + } + + if {$N_WIDE_HCI > 0} { + add wave -noupdate -group hci_initiator_side -divider wide_hwpe + for {set i 0} {$i < $N_WIDE_HCI} {incr i} { + add wave -noupdate -group hci_initiator_side -group wide_$i /tb_hci/hci_initiator_wide[$i]/* + } + } + + add wave -noupdate -group hci_initiator_side -divider dma + for {set i 0} {$i < $N_DMA} {incr i} { + add wave -noupdate -group hci_initiator_side -group dma_$i /tb_hci/hci_initiator_dma[$i]/* + } + + add wave -noupdate -group hci_initiator_side -divider ext + for {set i 0} {$i < $N_EXT} {incr i} { + add wave -noupdate -group hci_initiator_side -group ext_$i /tb_hci/hci_initiator_ext[$i]/* + } + + # ------------------------------------------------------------------------- + # Memory slaves + # ------------------------------------------------------------------------- + add wave -noupdate -group memory_targets + for {set i 0} {$i < $N_BANKS} {incr i} { + add wave -noupdate 
-group memory_targets -group bank_$i /tb_hci/hci_target_mems[$i]/* + } + + add wave -noupdate -divider "Application drivers" + # ------------------------------------------------------------------------- + # Per-driver driver internals (req/resp FSM states, counters) + # ------------------------------------------------------------------------- + add wave -noupdate -group driver_internals -divider log_masters + for {set i 0} {$i < $N_LOG_MASTERS} {incr i} { + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/req_state_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/resp_state_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/tr_idx_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/n_req_issued_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/n_rd_req_issued_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/n_rd_resp_retired_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/fence_reached_o + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/end_resp_o + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/resume_i + } + + add wave -noupdate -group driver_internals -divider hwpe_masters + for {set i 0} {$i < $N_HWPE} {incr i} { + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/req_state_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/resp_state_q + add wave -noupdate -group driver_internals 
-group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/tr_idx_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/n_req_issued_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/n_rd_req_issued_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/n_rd_resp_retired_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/fence_reached_o + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/end_resp_o + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/resume_i + } + + add wave -noupdate -divider Testbench + # ------------------------------------------------------------------------- + # Fence / synchronization signals + # ------------------------------------------------------------------------- + add wave -noupdate -group fence_sync /tb_hci/s_end_resp + add wave -noupdate -group fence_sync /tb_hci/s_fence_reached + add wave -noupdate -group fence_sync /tb_hci/s_resume + + for {set i 0} {$i < $N_DRIVERS} {incr i} { + add wave -noupdate -group fence_sync -group fence_idx /tb_hci/fence_idx[$i] + } + + # MUX sel (only present when INTERCO_TYPE == MUX) + if {[string first "MUX" $INTERCO_TYPE] != -1} { + add wave -noupdate -group fence_sync /tb_hci/gen_hwpe_mux/s_mux_sel + } + + + # ------------------------------------------------------------------------- + # Metrics + # ------------------------------------------------------------------------- + add wave -noupdate -group metrics /tb_hci/s_issued_transactions + add wave -noupdate -group metrics /tb_hci/s_issued_read_transactions + add wave -noupdate -group metrics /tb_hci/tot_latency + add wave -noupdate -group metrics 
/tb_hci/latency_per_master + add wave -noupdate -group metrics /tb_hci/throughput_completed + add wave -noupdate -group metrics /tb_hci/N_GNT_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_GNT_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG + add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE + + # ------------------------------------------------------------------------- + # HCI control + # ------------------------------------------------------------------------- + add wave -noupdate /tb_hci/s_clear + add wave -noupdate /tb_hci/s_hci_ctrl + + configure wave -signalnamewidth 1 +} else { + run -a } -run -a