diff --git a/.github/workflows/gitlab-ci.yml b/.github/workflows/gitlab-ci.yml new file mode 100644 index 0000000..e89ce0d --- /dev/null +++ b/.github/workflows/gitlab-ci.yml @@ -0,0 +1,29 @@ +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Paul Scheffler + +name: gitlab-ci + +on: [ push, pull_request, workflow_dispatch ] + +permissions: + # deployments permission to deploy GitHub pages website + deployments: write + # contents permission to update benchmark contents in gh-pages branch + contents: write + +jobs: + gitlab-ci: + runs-on: ubuntu-latest + steps: + - name: Check Gitlab CI + uses: pulp-platform/pulp-actions/gitlab-ci@v1 + # Skip on forks or pull requests from forks due to missing secrets. + if: github.repository == 'pulp-platform/hci' && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) + with: + domain: iis-git.ee.ethz.ch + repo: github-mirror/hci + token: ${{ secrets.GITLAB_TOKEN }} + diff --git a/.gitignore b/.gitignore index ada3be7..6212410 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ target/verif/vsim/compile.tcl target/verif/vsim/modelsim.ini target/verif/vsim/transcript target/verif/vsim/vsim.wlf +target/verif/exploration/results diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..a3d926f --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,44 @@ +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Paul Scheffler + +# We initialize the nonfree repo, then spawn a sub-pipeline from it + +variables: + GIT_SUBMODULE_STRATEGY: recursive + # Our reference GCC toolchain for reproducible builds + +before_script: + - python -V # Print out python version for debugging + - python -m pip install --user virtualenv + +.base: + artifacts: + when: always + expire_in: 1 week + +stages: + - build + - test + +build: + stage: build + script: + - make checkout + artifacts: + when: always + expire_in: 3 hours + paths: [ .bender ] + +testset: + extends: .base + needs: [ build ] + stage: test + script: + - regr/full_regression.sh + artifacts: + when: always + expire_in: 1 year + paths: [ regr/hci_tests.xml ] diff --git a/Bender.yml b/Bender.yml index 516fe27..37aa7af 100644 --- a/Bender.yml +++ b/Bender.yml @@ -2,10 +2,11 @@ package: name: hci authors: - "Francesco Conti " - - "Gianna Paulin " - "Tobias Riedener " - "Luigi Ghionda " - "Arpan Suravi Prasad " + - "Sergio Mazzola " dependencies: hwpe-stream: { git: "https://github.com/pulp-platform/hwpe-stream.git", version: 1.9.0 } @@ -70,8 +71,8 @@ sources: # Level 1 - target/verif/src/application_driver.sv - target/verif/src/tcdm_banks_wrap.sv - - target/verif/src/latency_monitor.sv - - target/verif/src/throughput_monitor.sv + - target/verif/src/req_gnt_monitor.sv + - target/verif/src/bandwidth_monitor.sv # Level 2 - target/verif/src/simulation_report.sv # Level 3 diff --git a/README.md b/README.md index 690bc53..fc10244 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,38 @@ The `hci` repository contains the definition of the Heterogeneous Cluster Interc - https://github.com/pulp-platform/neureka - https://github.com/pulp-platform/redmule +# Verification flow +The typical full flow is: + +``` +make checkout # Fetch and check out dependencies via Bender +make config-verif # Generate Makefiles from JSON verification configs +make stim-verif # Generate simulation stimulus vectors (requires 
Python 3) +make compile-verif # Compile RTL and testbench with QuestaSim +make opt-verif # Optimize the compiled design with vopt +make run-verif # Run the simulation (batch mode by default) +``` + +To open the simulation in the QuestaSim GUI with waveforms, pass `GUI=1`: + +``` +make run-verif GUI=1 +``` + +Cleanup targets: + +| Target | Effect | +|----------------------|----------------------------------------------------| +| `clean-config-verif` | Remove generated configuration Makefiles | +| `clean-stim-verif` | Remove generated stimulus vectors | +| `clean-sim-verif` | Remove QuestaSim build artifacts (work lib, logs) | +| `clean-verif` | Run all three clean targets above | + +**Notes:** +- On IIS machines, defaults to QuestaSim (`questa-2022.3`) (can be overridden with `SIM_QUESTA=`). On non-IIS machines, defaults to QuestaSim available in `PATH`. +- Verification configuration is driven by JSON files under `target/verif/config/`. Edit those before running `config-verif` and `stim-verif`. +- `run-verif` depends on `opt-verif` and `stim-verif`, so after `checkout` and `config-verif` you can jump straight to it. + # Style guide These IPs use a slightly different style than other PULP IPs. Refer to `STYLE.md` for some indications. diff --git a/bender.mk b/bender.mk index 5f78e57..788c13a 100644 --- a/bender.mk +++ b/bender.mk @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. # SPDX-License-Identifier: SHL-0.51 # diff --git a/regr/basic.yml b/regr/basic.yml new file mode 100644 index 0000000..9f43cfa --- /dev/null +++ b/regr/basic.yml @@ -0,0 +1,31 @@ +# Copyright (C) 2024 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Francesco Conti (f.conti@unibo.it) +# + +hci_tests: + log_fair: + path: . + command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/fair/testbench.json HARDWARE_JSON=regr/hardware/log/hardware.json + hci_fair: + path: . + command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/fair/testbench.json HARDWARE_JSON=regr/hardware/hci/hardware.json + hci_hwpe_prio: + path: . + command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/hwpe_prio/testbench.json HARDWARE_JSON=regr/hardware/hci/hardware.json + hci_log_prio: + path: . 
+ command: make clean-config-verif clean-stim-verif clean-sim-verif config-verif stim-verif compile-verif run-verif TESTBENCH_JSON=regr/testbench/log_prio/testbench.json HARDWARE_JSON=regr/hardware/hci/hardware.json diff --git a/regr/bwruntests.py b/regr/bwruntests.py new file mode 100755 index 0000000..555a1d4 --- /dev/null +++ b/regr/bwruntests.py @@ -0,0 +1,393 @@ +#!/usr/bin/env python3 + +# Copyright 2020 ETH Zurich and University of Bologna +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# Run shell commands listed in a file separated by newlines in a parallel +# fashion. If requested the results (tuples consisting of command, stdout, +# stderr and returncode) will be gathered in a junit.xml file. There a few +# knobs to tune the number of spawned processes and the junit.xml formatting. 
+ +# Author: Robert Balas (balasr@iis.ee.ethz.ch) + +import argparse +import re +from subprocess import (Popen, TimeoutExpired, + CalledProcessError, PIPE) +from threading import Lock +import shlex +import sys +import signal +import os +import multiprocessing +import errno +import pprint +import time +import random +from collections import OrderedDict +import json + +runtest = argparse.ArgumentParser( + prog='bwruntests', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="""Run PULP tests in parallel""", + epilog=""" +Test_file needs to be either a .yaml file (set the --yaml switch) +which looks like this: + +mytests.yml +[...] +parallel_bare_tests: # name of the test set + parMatrixMul8: # name of the test + path: ./parallel_bare_tests/parMatrixMul8 # path to the test's folder + command: make clean all run # command to run in the test's folder +[...] + +or + +Test_file needs to be a list of commands to be executed. Each line corresponds +to a single command and a test + +commands.f +[...] +make -C ./ml_tests/mlGrad clean all run +make -C ./ml_tests/mlDct clean all run +[...] + +Example: +bwruntests.py --proc-verbose -v \\ + --report_junit -t 3600 --yaml \\ + -o simplified-runtime.xml runtime-tests.yaml + +This Runs a set of tests defined in runtime-tests.yaml and dumps the +resulting junit.xml into simplified-runtime.xml. The --proc-verbose +scripts makes sure to print the stdout of each process to the shell. To +prevent a broken process from running forever, a maximum timeout of 3600 +seconds was set. 
For debugging purposes we enabled -v (--verbose) which +shows the full set of commands being run.""") + +runtest.version = '0.2' + +runtest.add_argument('test_file', type=str, + help='file defining tests to be run') +runtest.add_argument('--version', action='version', + version='%(prog)s ' + runtest.version) +runtest.add_argument('-p', '--max_procs', type=int, + default=multiprocessing.cpu_count(), + help="""Number of parallel + processes used to run test. + Default is number of cpu cores.""") +runtest.add_argument('-t', '--timeout', type=float, + default=None, + help="""Timeout for all processes in seconds""") +runtest.add_argument('-v', '--verbose', action='store_true', + help="""Enable verbose output""") +runtest.add_argument('-s', '--proc_verbose', action='store_true', + help="""Write processes' stdout and stderr to shell stdout + after they terminate""") +runtest.add_argument('--report_junit', action='store_true', + help="""Generate a junit report""") +runtest.add_argument('--disable_junit_pp', action='store_true', + help="""Disable pretty print of junit report""") +runtest.add_argument('--disable_results_pp', action='store_true', + help="""Disable printing test results""") +runtest.add_argument('-y,', '--yaml', action='store_true', + help="""Read tests from yaml file instead of executing + from a list of commands""") +runtest.add_argument('-o,', '--output', type=str, + help="""Write junit.xml to file instead of stdout""") +runtest.add_argument('-P,', '--perf', type=str, default=None, + help="""Write performance results to JSON file""") +stdout_lock = Lock() + +shared_total = 0 +len_total = 0 + +class FinishedProcess(object): + """A process that has finished running. 
+ """ + def __init__(self, name, cwd, runargs, returncode, + stdout=None, stderr=None, time=None): + self.name = name + self.cwd = cwd + self.runargs = runargs + self.returncode = returncode + self.stdout = stdout + self.stderr = stderr + self.time = time + exec_time = 0 + throughput = 0 + workload = 0 + if returncode == 0: + matches = re.findall("# hwpe cycles =\s+(\d+)", stdout) + if matches: + exec_time = int(matches[0]) + self.exec_time = exec_time + + + def __repr__(self): + runargs = ['name={!r}'.format(self.name)] + runargs += ['cwd={!r}'.format(self.cwd)] + runargs += ['args={!r}'.format(self.runargs), + 'returncode={!r}'.format(self.returncode)] + if self.stdout is not None: + runargs.append('stdout={!r}'.format(self.stdout)) + if self.stderr is not None: + runargs.append('stderr={!r}'.format(self.stderr)) + if self.time is not None: + runargs.append('time={!r}'.format(self.time)) + return "{}({})".format(type(self).__name__, ', '.join(runargs)) + +def fork(name, cwd, *popenargs, check=False, shell=True, + **kwargs): + """Run subprocess and return process args, error code, stdout and stderr + """ + + def proc_out(cwd, stdout, stderr): + print('cwd={}'.format(cwd)) + print('stdout=') + print(stdout.decode('utf-8')) + print('stderr=') + print(stderr.decode('utf-8')) + + kwargs['stdout'] = PIPE + kwargs['stderr'] = PIPE + + with Popen(*popenargs, preexec_fn=os.setpgrp, cwd=cwd, + **kwargs) as process: + try: + # Child and parent are racing for setting/using the pgid so we have + # to set it in both processes. See glib manual. + try: + os.setpgid(process.pid, process.pid) + except OSError as e: + if e.errno != errno.EACCES: + raise + # measure runtime + start = time.time() + stdout, stderr = process.communicate(input, timeout=args.timeout) + except TimeoutExpired: + pgid = os.getpgid(process.pid) + os.killpg(pgid, signal.SIGKILL) + # process.kill() will only kill the immediate child but not its + # forks. 
This won't work since our commands will create a few forks + # (make -> vsim -> etc). We need to make a process group and kill + # that + stdout, stderr = process.communicate() + timeoutmsg = 'TIMEOUT after {:f}s'.format(args.timeout) + + if args.proc_verbose: + stdout_lock.acquire() + print(name) + print(timeoutmsg) + proc_out(cwd, stdout, stderr) + stdout_lock.release() + + return FinishedProcess(name, cwd, process.args, 1, + stdout.decode('utf-8'), + timeoutmsg + '\n' + + stderr.decode('utf-8'), + time.time() - start) + # Including KeyboardInterrupt, communicate handled that. + except: # noqa: E722 + pgid = os.getpgid(process.pid) + os.killpg(pgid, signal.SIGKILL) + # We don't call process.wait() as .__exit__ does that for us. + raise + retcode = process.poll() + if check and retcode: + raise CalledProcessError(retcode, process.args, + output=stdout, stderr=stderr) + if args.proc_verbose: + stdout_lock.acquire() + print(name) + proc_out(cwd, stdout, stderr) + stdout_lock.release() + + with lock: + shared_total.value += 1 + print("[%s][%d/%d] %s" % ("\033[1;32m OK \033[0m" if retcode == 0 else "\033[1;31mFAIL\033[0m", shared_total.value, len_total.value, name)) + + return FinishedProcess(name, cwd, process.args, retcode, + stdout.decode('utf-8'), + stderr.decode('utf-8'), + time.time() - start) + +def poolInit(s, t, l): + global shared_total + global len_total + global lock + shared_total = s + len_total = t + lock = l + +if __name__ == '__main__': + args = runtest.parse_args() + pp = pprint.PrettyPrinter(indent=4) + + # lazy importing so that we can work without junit_xml + if args.report_junit: + try: + from junit_xml import TestSuite, TestCase + except ImportError: + print("""Error: The --report_junit option requires +the junit_xml library which is not installed.""", + file=sys.stderr) + exit(1) + + # lazy import PrettyTable for displaying results + if not(args.disable_results_pp): + try: + from prettytable import PrettyTable + except ImportError: + 
print("""Warning: Displaying results requires the PrettyTable +library which is not installed""") + + tests = [] # list of tuple (testname, working dir, command) + + # load tests (yaml or command list) + if args.yaml: + try: + import yaml + except ImportError: + print("""Error: The --yaml option requires +the pyyaml library which is not installed.""", + file=sys.stderr) + exit(1) + with open(args.test_file) as f: + testyaml = yaml.load(f, Loader=yaml.Loader) + for testsetname, testv in testyaml.items(): + for testname, insn in testv.items(): + cmd = shlex.split(insn['command']) + cwd = insn['path'] + tests.append((testsetname + ':' + testname, cwd, cmd)) + if args.verbose: + pp.pprint(tests) + else: # (command list) + with open(args.test_file) as f: + testnames = list(map(str.rstrip, f)) + shellcmds = [shlex.split(e) for e in testnames] + cwds = ['./' for e in testnames] + tests = list(zip(testnames, cwds, shellcmds)) + if args.verbose: + print('Tests which we are running:') + pp.pprint(tests) + pp.pprint(shellcmds) + + # Spawning process pool + # Disable signals to prevent race. Child processes inherit SIGINT handler + original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN) + lock = multiprocessing.Lock() + shared_total = multiprocessing.Value('i', 0) + len_total = multiprocessing.Value('i', len(tests)) + pool = multiprocessing.Pool(processes=args.max_procs, initializer=poolInit, initargs=(shared_total, len_total, lock )) + # Restore SIGINT handler + signal.signal(signal.SIGINT, original_sigint_handler) + # Shuffle tests + random.shuffle(tests) + try: + procresults = pool.starmap(fork, tests) + except KeyboardInterrupt: + print("\nTerminating bwruntest.py") + pool.terminate() + pool.join() + exit(1) + + # pp.pprint(procresults) + pool.close() + pool.join() + + # Generate junit.xml file. Junit.xml differentiates between failure and + # errors but we treat everything as errors. 
+ if args.report_junit: + testcases = [] + for p in procresults: + # we can either expect p.name = testsetname:testname + # or p.name = testname + testcase = TestCase(p.name, + classname=((p.name).split(':'))[0], + stdout=p.stdout, + stderr=p.stderr, + elapsed_sec=p.time) + if p.returncode != 0: + testcase.add_failure_info(p.stderr) + testcases.append(testcase) + + testsuite = TestSuite('bwruntests', testcases) + if args.output: + with open(args.output, 'w') as f: + TestSuite.to_file(f, [testsuite], + prettyprint=not(args.disable_junit_pp)) + else: + print(TestSuite.to_xml_string([testsuite], + prettyprint=(args.disable_junit_pp))) + + # # print JSON for performance regression + # if args.perf is not None: + # # if file does not exist, create new dictionary: + # if not os.path.isfile(args.perf): + # d = OrderedDict([]) + # # else, load the existing dictionary + # else: + # with open(args.perf) as f: + # d = json.load(f, object_pairs_hook=OrderedDict) + # # save the new execution times + # for p in procresults: + # if p.returncode == 0: + # d[p.name] = p.exec_time + # with open(args.perf, 'w', encoding='utf-8') as f: + # json.dump(d, f, ensure_ascii=False, indent=4) + + # print JSON for performance regression + if args.perf is not None: + # if file does not exist, create new dictionary: + if not os.path.isfile(args.perf): + d = list([]) + # else, load the existing dictionary + else: + with open(args.perf) as f: + d = json.load(f) + # save the new execution times + for p in procresults: + if p.returncode == 0: + d.append({ 'name': p.name, 'value': p.exec_time, 'unit': 'cycles'}) + with open(args.perf, 'w', encoding='utf-8') as f: + json.dump(d, f, ensure_ascii=False, indent=4) + + # print summary of test results + if not(args.disable_results_pp): + testcount = sum(1 for x in tests) + testfailcount = sum(1 for p in procresults if p.returncode != 0) + testpassedcount = testcount - testfailcount + resulttable = PrettyTable(['test', 'cycles', 'time', 'passed/total']) + 
resulttable.align['test'] = "l" + for p in procresults: + testpassed = 1 if p.returncode == 0 else 0 + testname = p.name + resulttable.add_row([testname, + p.exec_time, + '{0:.2f}s'.format(p.time), + '{0:d}/{1:d}'.format(testpassed, 1)]) + resulttable.add_row(['total', '', '', '{0:d}/{1:d}'. + format(testpassedcount, testcount)]) + print(resulttable) + if testpassedcount != testcount: + import sys; sys.exit(1) + diff --git a/regr/full_regression.sh b/regr/full_regression.sh new file mode 100755 index 0000000..1e10f4e --- /dev/null +++ b/regr/full_regression.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright (C) 2020-2024 ETH Zurich and University of Bologna +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# +# Author: Francesco Conti (f.conti@unibo.it) +# + +export N_PROC=1 +export P_STALL=0.04 +TIMEOUT=400 + +# Declare a string array with type +declare -a test_list=( + "regr/basic.yml" +) + +# Read the list values with space +for val in "${test_list[@]}"; do + nice -n10 regr/bwruntests.py --report_junit -t ${TIMEOUT} --yaml -o regr/hci_tests.xml -p${N_PROC} $val + if test $? 
-ne 0; then + echo "Error in test $val" + exit 1 + fi +done +unset P_STALL diff --git a/regr/hardware/hci/hardware.json b/regr/hardware/hci/hardware.json new file mode 100644 index 0000000..b349a65 --- /dev/null +++ b/regr/hardware/hci/hardware.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "HCI", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/regr/hardware/log/hardware.json b/regr/hardware/log/hardware.json new file mode 100644 index 0000000..a505fa1 --- /dev/null +++ b/regr/hardware/log/hardware.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "LOG", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/regr/testbench/fair/testbench.json b/regr/testbench/fair/testbench.json new file mode 100644 index 0000000..011e20b --- /dev/null +++ b/regr/testbench/fair/testbench.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 0, + "PRIORITY_CNT_NUMERATOR": 1, + "PRIORITY_CNT_DENOMINATOR": 2 + } +} diff --git a/regr/testbench/hwpe_prio/testbench.json b/regr/testbench/hwpe_prio/testbench.json new file mode 100644 index 0000000..4eefeb8 --- /dev/null +++ b/regr/testbench/hwpe_prio/testbench.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 1, + "PRIORITY_CNT_NUMERATOR": 9, + "PRIORITY_CNT_DENOMINATOR": 10 + } +} diff --git 
a/regr/testbench/log_prio/testbench.json b/regr/testbench/log_prio/testbench.json new file mode 100644 index 0000000..473e3fd --- /dev/null +++ b/regr/testbench/log_prio/testbench.json @@ -0,0 +1,11 @@ +{ + "description": "Testbench configuration parameters", + "parameters": { + "CLK_PERIOD": 50, + "RST_CLK_CYCLES": 10, + "RANDOM_GNT": 0, + "INVERT_PRIO": 0, + "PRIORITY_CNT_NUMERATOR": 9, + "PRIORITY_CNT_DENOMINATOR": 10 + } +} diff --git a/rtl/common/hci_package.sv b/rtl/common/hci_package.sv index 1e3d1e1..15a36b8 100644 --- a/rtl/common/hci_package.sv +++ b/rtl/common/hci_package.sv @@ -51,7 +51,8 @@ package hci_package; typedef struct packed { logic [1:0] arb_policy; // used only in some systems logic invert_prio; - logic [7:0] low_prio_max_stall; + logic [7:0] priority_cnt_numerator; + logic [7:0] priority_cnt_denominator; } hci_interconnect_ctrl_t; typedef struct packed { diff --git a/rtl/core/hci_core_split.sv b/rtl/core/hci_core_split.sv index c0c96fb..88e2521 100644 --- a/rtl/core/hci_core_split.sv +++ b/rtl/core/hci_core_split.sv @@ -224,7 +224,11 @@ module hci_core_split end // r_ready masking - assign tcdm_initiator_lrdy_masked_d = cs_rvalid==NO_RVALID ? tcdm_initiator_lrdy_masked_q | tcdm_initiator_r_valid | ~tcdm_initiator_req : tcdm_initiator_r_valid | ~tcdm_initiator_req; + // Track lanes that have produced a response for the current split transaction. + // Using "~req" here can mark a lane as completed before r_valid is observed. + assign tcdm_initiator_lrdy_masked_d = + cs_rvalid==NO_RVALID ? 
tcdm_initiator_lrdy_masked_q | tcdm_initiator_r_valid + : tcdm_initiator_r_valid; always_ff @(posedge clk_i or negedge rst_ni) begin if(~rst_ni) begin diff --git a/rtl/hci_interconnect.sv b/rtl/hci_interconnect.sv index 9825891..fee4918 100644 --- a/rtl/hci_interconnect.sv +++ b/rtl/hci_interconnect.sv @@ -142,8 +142,22 @@ module hci_interconnect EW: DEFAULT_EW, EHW: DEFAULT_EHW }; - `HCI_INTF_ARRAY(hwpe_mem_muxed, clk_i, 0:N_MEM-1); - + hci_core_intf #( + .DW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).DW ), + .AW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).AW ), + .BW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).BW ), + .UW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).UW ), + .IW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).IW ), + .EW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).EW ), + .EHW ( `HCI_SIZE_PARAM(hwpe_mem_muxed).EHW ) +`ifndef SYNTHESIS + , + .WAIVE_RQ3_ASSERT ( WAIVE_RQ3_ASSERT ), // hwpe_mem_muxed is an internal muxed signal, not a protocol-compliant port + .WAIVE_RQ4_ASSERT ( WAIVE_RQ4_ASSERT ) +`endif + ) hwpe_mem_muxed [0:N_MEM-1] ( + .clk ( clk_i ) + ); localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe_mem) = '{ DW: DEFAULT_DW, @@ -254,15 +268,31 @@ module hci_interconnect end : hwpe_req2mem + // Set arbitration tree to be perfectly fair. It must not + // follow the max stall policy of the HWPE vs LIC arbiter. + // FIXME: it would be interesting to explore what happens + // with an unfair but configurable setting. Probably we need + // a generator to do that, I do not see a way to code it in + // pure SystemVerilog. 
+ hci_interconnect_ctrl_t ctrl_arbiter_tree; + always_comb + begin + ctrl_arbiter_tree = ctrl_i; + ctrl_arbiter_tree.priority_cnt_numerator = 1; + ctrl_arbiter_tree.priority_cnt_denominator = 2; + end + hci_arbiter_tree #( .NB_REQUESTS(N_HWPE), .NB_CHAN ( N_MEM ), + .WAIVE_RQ3_ASSERT ( WAIVE_RQ3_ASSERT ), + .WAIVE_RQ4_ASSERT ( WAIVE_RQ4_ASSERT ), .`HCI_SIZE_PARAM(out)(`HCI_SIZE_PARAM(hwpe_mem_muxed)) ) i_wide_port_arbiter_tree ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), .clear_i ( clear_i ), - .ctrl_i ( ctrl_i ), + .ctrl_i ( ctrl_arbiter_tree ), .in ( hwpe_mem ), .out ( hwpe_mem_muxed ) ); diff --git a/rtl/interco/hci_arbiter.sv b/rtl/interco/hci_arbiter.sv index a291532..da75e51 100644 --- a/rtl/interco/hci_arbiter.sv +++ b/rtl/interco/hci_arbiter.sv @@ -44,13 +44,15 @@ * .. _hci_arbiter_ctrl: * .. table:: **hci_arbiter** input control signals. * - * +----------------------+------------------------+---------------------------------------------------------------+ - * | **Name** | **Type** | **Description** | - * +----------------------+------------------------+---------------------------------------------------------------+ - * | *invert_prio* | `logic` | When 1, invert priorities between `in_high` and `in_low`. | - * +----------------------+------------------------+---------------------------------------------------------------+ - * | *low_prio_max_stall* | `logic[7:0]` | Maximum number of consecutive stalls on low-priority channel. | - * +----------------------+------------------------+---------------------------------------------------------------+ + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | **Name** | **Type** | **Description** | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | *invert_prio* | `logic` | When 1, invert priorities between `in_high` and `in_low`. 
| + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | *priority_cnt_numerator* | `logic[7:0]` | Maximum number of consecutive stalls on low-priority channel. | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ + * | *priority_cnt_denominator* | `logic[7:0]` | Clear condition of priority counter (max low-prio stalls + high-prio stalls). | + * +----------------------------+--------------+-------------------------------------------------------------------------------+ * */ @@ -75,8 +77,9 @@ module hci_arbiter logic [NB_CHAN-1:0] hs_pass_d; logic hs_req_d; logic ls_req_d; + logic hs_req_masked_d; logic switch_channels_d; - logic unsigned [7:0] ls_stall_ctr_d; + logic unsigned [7:0] priority_cnt_q; // priority_req is the OR of all requests coming out of the log interconnect. // it should be simplified to simply an OR of all requests coming *into* the @@ -85,10 +88,11 @@ module hci_arbiter begin hs_req_d = |hs_req_in; ls_req_d = |ls_req_in; - if (ctrl_i.low_prio_max_stall > 0) //Set to 0 to disable this functionality + hs_req_masked_d = hs_req_d; + if (ctrl_i.priority_cnt_numerator > 0) //Set to 0 to disable this functionality begin - if (ls_stall_ctr_d >= ctrl_i.low_prio_max_stall) - hs_req_d = 0; //Let low side through for once + if (priority_cnt_q >= ctrl_i.priority_cnt_numerator && priority_cnt_q < ctrl_i.priority_cnt_denominator) + hs_req_masked_d = 0; //Let low side through for once end end @@ -96,11 +100,11 @@ module hci_arbiter always_ff @(posedge clk_i or negedge rst_ni) begin if (~rst_ni) - ls_stall_ctr_d <= 0; + priority_cnt_q <= 0; + else if(priority_cnt_q == ctrl_i.priority_cnt_denominator-1) + priority_cnt_q <= 0; else if (hs_req_d & ls_req_d) - ls_stall_ctr_d <= ls_stall_ctr_d + 1; - else - ls_stall_ctr_d <= 0; + priority_cnt_q <= priority_cnt_q + 1; end assign switch_channels_d = 
ctrl_i.invert_prio; @@ -129,7 +133,7 @@ module hci_arbiter // Side select generate for(genvar ii=0; ii.json` to `make stim-verif` / `make run-verif` to select an alternative workload. diff --git a/target/verif/bender.mk b/target/verif/bender.mk index 2e1362e..02e640b 100644 --- a/target/verif/bender.mk +++ b/target/verif/bender.mk @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. # SPDX-License-Identifier: SHL-0.51 # @@ -8,25 +8,24 @@ VERIF_DEFS ?= VERIF_DEFS += \ -D N_HWPE=$(N_HWPE) \ - -D HWPE_WIDTH=$(HWPE_WIDTH) \ + -D HWPE_WIDTH_FACT=$(HWPE_WIDTH_FACT) \ -D N_CORE=$(N_CORE) \ -D N_DMA=$(N_DMA) \ -D N_EXT=$(N_EXT) \ - -D TS_BIT=$(TS_BIT) \ - -D EXPFIFO=$(EXPFIFO) \ - -D SEL_LIC=$(SEL_LIC) \ -D DATA_WIDTH=$(DATA_WIDTH) \ -D TOT_MEM_SIZE=$(TOT_MEM_SIZE) \ -D N_BANKS=$(N_BANKS) \ - -D N_TRANSACTION_LOG=$(N_TRANSACTION_LOG) \ - -D TRANSACTION_RATIO=$(TRANSACTION_RATIO) \ + -D TS_BIT=$(TS_BIT) \ + -D EXPFIFO=$(EXPFIFO) \ + -D SEL_LIC=$(SEL_LIC) \ -D CLK_PERIOD=$(CLK_PERIOD) \ -D RST_CLK_CYCLES=$(RST_CLK_CYCLES) \ - -D MAX_CYCLES_BETWEEN_GNT_RVALID=$(MAX_CYCLES_BETWEEN_GNT_RVALID) \ -D RANDOM_GNT=$(RANDOM_GNT) \ + -D INTERCO_TYPE=$(INTERCO_TYPE) \ -D INVERT_PRIO=$(INVERT_PRIO) \ - -D LOW_PRIO_MAX_STALL=$(LOW_PRIO_MAX_STALL) + -D PRIORITY_CNT_NUMERATOR=$(PRIORITY_CNT_NUMERATOR) \ + -D PRIORITY_CNT_DENOMINATOR=$(PRIORITY_CNT_DENOMINATOR) # Common targets for bender VERIF_TARGS ?= -VERIF_TARGS += -t hci_verif \ No newline at end of file +VERIF_TARGS += -t hci_verif diff --git a/target/verif/config/generated/hardware.mk.tpl b/target/verif/config/generated/hardware.mk.tpl index f06a13d..031d817 100644 --- a/target/verif/config/generated/hardware.mk.tpl +++ b/target/verif/config/generated/hardware.mk.tpl @@ -1,4 +1,4 @@ -# Copyright 2025 ETH Zurich and University of Bologna. 
+# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. # SPDX-License-Identifier: SHL-0.51 # @@ -6,10 +6,11 @@ # Hardware configuration parameters (from hardware.json) N_HWPE?=${N_HWPE} -HWPE_WIDTH?=${HWPE_WIDTH} +HWPE_WIDTH_FACT?=${HWPE_WIDTH_FACT} N_CORE?=${N_CORE} N_DMA?=${N_DMA} N_EXT?=${N_EXT} +INTERCO_TYPE?=${INTERCO_TYPE} TS_BIT?=${TS_BIT} EXPFIFO?=${EXPFIFO} SEL_LIC?=${SEL_LIC} diff --git a/target/verif/config/generated/json_to_mk.py b/target/verif/config/generated/json_to_mk.py index fcec54f..1d6ad27 100755 --- a/target/verif/config/generated/json_to_mk.py +++ b/target/verif/config/generated/json_to_mk.py @@ -6,8 +6,8 @@ Templates are automatically discovered based on the config type argument. """ -import json import argparse +import json import sys from pathlib import Path from string import Template @@ -45,6 +45,34 @@ def flatten_dict(d, prefix=''): items.append((new_key, v)) return dict(items) + +def get_parameters(config): + """Return flattened parameters from config JSON.""" + params = config.get("parameters") + if not isinstance(params, dict): + return {} + return flatten_dict(params) + + +def load_all_parameters(config_dir): + """Load flattened parameters from all JSON files in config_dir.""" + merged = {} + for json_path in sorted(config_dir.glob("*.json")): + cfg = load_json_config(json_path) + merged.update(get_parameters(cfg)) + return merged + + +def template_variables(template_content): + """Extract Template variable names used by template content.""" + pattern = Template.pattern + vars_found = set() + for match in pattern.finditer(template_content): + name = match.group("named") or match.group("braced") + if name is not None: + vars_found.add(name) + return vars_found + def parse_args(argv=None): parser = argparse.ArgumentParser( description="Convert JSON configuration to Makefile fragment using templates." 
@@ -55,9 +83,9 @@ def parse_args(argv=None): help="Configuration type to generate.", ) parser.add_argument( - "config_dir", + "config_json", type=Path, - help="Directory containing source-of-truth JSON files.", + help="Path to selected source-of-truth JSON file.", ) parser.add_argument( "generated_dir", @@ -70,11 +98,10 @@ def parse_args(argv=None): def main(): args = parse_args() config_type = args.config_type - config_dir = args.config_dir.resolve() + json_file = args.config_json.resolve() + config_dir = json_file.parent generated_dir = args.generated_dir.resolve() - # Construct file paths based on config_type argument - json_file = config_dir / f"{config_type}.json" template_file = generated_dir / f"{config_type}.mk.tpl" # Load JSON config @@ -83,16 +110,22 @@ def main(): # Load template template_content = load_template(template_file) - # Flatten the parameters dict for template substitution - template_data = flatten_dict(config['parameters']) + # Build substitution dictionary: + # 1. all parameters from all configs (fallback) + # 2. 
parameters from selected config (override) + template_data = load_all_parameters(config_dir) + template_data.update(get_parameters(config)) # Apply template substitution template = Template(template_content) - try: - result = template.substitute(template_data) - except KeyError as e: - print(f"ERROR: Missing template variable: {e}", file=sys.stderr) + missing = sorted(v for v in template_variables(template_content) if v not in template_data) + if missing: + print( + f"ERROR: Missing template variable(s): {', '.join(missing)}", + file=sys.stderr, + ) sys.exit(1) + result = template.substitute(template_data) # Output to stdout print(result) diff --git a/target/verif/config/generated/testbench.mk.tpl b/target/verif/config/generated/testbench.mk.tpl index a1cf812..c36dad7 100644 --- a/target/verif/config/generated/testbench.mk.tpl +++ b/target/verif/config/generated/testbench.mk.tpl @@ -1,15 +1,13 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. 
# SPDX-License-Identifier: SHL-0.51 # # This file is auto-generated from testbench.json - DO NOT EDIT MANUALLY # Testbench parameters (from testbench.json) -N_TRANSACTION_LOG?=${N_TRANSACTION_LOG} -TRANSACTION_RATIO?=${TRANSACTION_RATIO} CLK_PERIOD?=${CLK_PERIOD} RST_CLK_CYCLES?=${RST_CLK_CYCLES} -MAX_CYCLES_BETWEEN_GNT_RVALID?=${MAX_CYCLES_BETWEEN_GNT_RVALID} RANDOM_GNT?=${RANDOM_GNT} INVERT_PRIO?=${INVERT_PRIO} -LOW_PRIO_MAX_STALL?=${LOW_PRIO_MAX_STALL} +PRIORITY_CNT_NUMERATOR?=${PRIORITY_CNT_NUMERATOR} +PRIORITY_CNT_DENOMINATOR?=${PRIORITY_CNT_DENOMINATOR} diff --git a/target/verif/config/hardware.json b/target/verif/config/hardware.json index 678217b..b349a65 100644 --- a/target/verif/config/hardware.json +++ b/target/verif/config/hardware.json @@ -2,15 +2,16 @@ "description": "Hardware configuration parameters for HCI interconnect", "parameters": { "N_HWPE": 2, - "HWPE_WIDTH": 8, + "HWPE_WIDTH_FACT": 8, "N_CORE": 8, - "N_DMA": 1, + "N_DMA": 0, "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "HCI", "TS_BIT": 21, "EXPFIFO": 0, - "SEL_LIC": 0, - "DATA_WIDTH": 32, - "TOT_MEM_SIZE": 32, - "N_BANKS": 64 + "SEL_LIC": 0 } } diff --git a/target/verif/config/testbench.json b/target/verif/config/testbench.json index a4daaf4..924c947 100644 --- a/target/verif/config/testbench.json +++ b/target/verif/config/testbench.json @@ -1,13 +1,11 @@ { "description": "Testbench configuration parameters", "parameters": { - "N_TRANSACTION_LOG": 1000, - "TRANSACTION_RATIO": 1, "CLK_PERIOD": 50, "RST_CLK_CYCLES": 10, - "MAX_CYCLES_BETWEEN_GNT_RVALID": 1, "RANDOM_GNT": 0, "INVERT_PRIO": 0, - "LOW_PRIO_MAX_STALL": 10 + "PRIORITY_CNT_NUMERATOR": 10, + "PRIORITY_CNT_DENOMINATOR": 11 } } diff --git a/target/verif/config/workload.json b/target/verif/config/workload.json index 7da0838..aa2da43 100644 --- a/target/verif/config/workload.json +++ b/target/verif/config/workload.json @@ -1,144 +1,540 @@ { - "description": "Workload configuration for stimuli 
generation", - "simulation_parameters": { - "EXACT_OR_MAX_OFFSET": 0, - "CYCLE_OFFSET_LOG": 1, - "CYCLE_OFFSET_HWPE": 1 - }, + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes. Cores generate linear 50% traffic after each GEMM on dedicated non-overlapping regions.", "log_masters": [ { "id": 0, - "description": "Core 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 0 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 0 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core0_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core0_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core0_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core0_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + } + ] }, { "id": 1, - "description": "Core 1", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 1 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 1 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core1_after_gemm_A0", + 
"wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core1_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core1_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core1_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + } + ] }, { "id": 2, - "description": "Core 2", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 2 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 2 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core2_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core2_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core2_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 
traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core2_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + } + ] }, { "id": 3, - "description": "Core 3", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 3 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 3 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core3_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core3_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core3_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core3_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + } + ] }, { "id": 4, - "description": "Core 4", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 4 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 4 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core4_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", 
+ "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core4_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core4_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core4_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + } + ] }, { "id": 5, - "description": "Core 5", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 5 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 5 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core5_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core5_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core5_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core5_after_gemm_A3", + "wait_for_jobs": 
["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + } + ] }, { "id": 6, - "description": "Core 6", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 6 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 6 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core6_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core6_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core6_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core6_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + } + ] }, { "id": 7, - "description": "Core 7", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Core 7 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 7 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core7_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A1", + 
"mem_access_type": "linear", + "job": "core7_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core7_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core7_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + } + ] }, - { - "id": 8, - "description": "DMA 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - }, - { - "id": 9, - "description": "External 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 - } + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } ], + "hwpe_masters": [ { "id": 0, - "description": "HWPE 0", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "DMA engine (linear setup + ping/pong prefetch)", + "patterns": [ + { + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A1 into buffer #1 (8 KiB)", + "mem_access_type": 
"linear", + "job": "dma_load_A1_buf1", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A2 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A2_buf0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A0", + "mem_access_type": "linear", + "job": "dma_store_C0_buf2", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A3 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A3_buf1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A1", + "mem_access_type": "linear", + "job": "dma_store_C1_buf3", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A2", + "mem_access_type": "linear", + "job": "dma_store_C2_buf2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A3", + "mem_access_type": "linear", + "job": "dma_store_C3_buf3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + } + ] }, { "id": 1, - "description": "HWPE 1", - "mem_access_type": 0, - "start_address": 0, - "stride0": 0, - "len_d0": 0, - "stride1": 0, - "len_d1": 0, - "stride2": 0 + "description": "Single GEMM engine processing 4 tiles with 
ping/pong buffers", + "patterns": [ + { + "description": "GEMM tile A0: A0 x B -> C0", + "mem_access_type": "matmul_phased", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A1: A1 x B -> C1", + "mem_access_type": "matmul_phased", + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", + "mem_access_type": "matmul_phased", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", + "mem_access_type": "matmul_phased", + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + } + ] } ] -} \ No newline at end of file +} diff --git 
a/target/verif/config/workload.schema.json b/target/verif/config/workload.schema.json new file mode 100644 index 0000000..fde1870 --- /dev/null +++ b/target/verif/config/workload.schema.json @@ -0,0 +1,471 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "workload.schema.json", + "title": "HCI Verification Workload Configuration", + "description": "Schema for workload.json consumed by target/verif/simvectors/main.py", + "type": "object", + "required": ["log_masters", "hwpe_masters"], + "additionalProperties": false, + "properties": { + "description": { + "type": "string", + "description": "Human-readable description of this workload." + }, + "log_masters": { + "type": "array", + "description": "Ordered list of LOG masters (CORE, then DMA, then EXT). Length must equal N_CORE + N_DMA + N_EXT in hardware.json.", + "items": { "$ref": "#/$defs/master" } + }, + "hwpe_masters": { + "type": "array", + "description": "Ordered list of HWPE masters. Length must equal N_HWPE in hardware.json.", + "items": { "$ref": "#/$defs/master" } + } + }, + + "$defs": { + + "pattern": { + "type": "object", + "description": "A single traffic pattern segment. Multiple patterns on the same master are executed sequentially, separated by PAUSE fence tokens in the stimulus file.", + "required": ["mem_access_type"], + "unevaluatedProperties": false, + "properties": { + + "description": { + "type": "string", + "description": "Human-readable label for this pattern segment, shown in the memory-map report." + }, + "mem_access_type": { + "type": "string", + "enum": [ + "idle", + "random", + "linear", + "2d", + "3d", + "matmul_phased", + "matmul", + "multi_linear", + "bank_group_linear", + "rw_rowwise", + "gather_scatter", + "matmul_tiled_interleave", + "matmul_tiled", + "hotspot_random" + ], + "description": "Access pattern selector. Aliases: 'matmul' -> 'matmul_phased', 'matmul_tiled' -> 'matmul_tiled_interleave'." 
+ }, + "job": { + "type": "string", + "default": "default", + "description": "Job name for this pattern segment. Used by other patterns' wait_for_jobs to reference this segment." + }, + "wait_for_jobs": { + "type": "array", + "items": { "type": "string" }, + "default": [], + "description": "Job names this pattern waits for before starting. Generates a PAUSE fence before this pattern segment and holds the driver until all referenced jobs have advanced past the same fence level." + }, + + "n_transactions": { + "type": "integer", + "minimum": 0, + "description": "Number of real memory accesses (idles are NOT counted). Mandatory for 'random'. For other patterns can be derived from geometry or from region_size_bytes (transaction-width based)." + }, + + "region_base_address": { + "oneOf": [ + { "type": "integer", "minimum": 0 }, + { "type": "string", "description": "Decimal, hex (0x...) or binary string." } + ], + "description": "[random, linear, matmul_phased] Base byte address of the memory region." + }, + "region_size_bytes": { + "oneOf": [ + { "type": "integer", "minimum": 0 }, + { "type": "string", "description": "Decimal, hex (0x...) or binary string." } + ], + "description": "[random, linear, matmul_phased] Size in bytes of the memory region. For linear/matmul_phased, if n_transactions is omitted this can be used to derive it from transaction width." + }, + + "start_address": { + "type": "string", + "default": "0", + "description": "[linear, 2d, 3d] Start byte address." + }, + "stride0": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[linear, 2d, 3d] Innermost stride in words." + }, + "stride1": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[2d, 3d] Middle stride in words." + }, + "stride2": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[3d] Outermost stride in words." + }, + "length": { + "type": "integer", + "minimum": 0, + "description": "[linear] Alias for n_transactions." 
+ }, + "len_d0": { + "type": "integer", + "minimum": 1, + "description": "[2d, 3d] Innermost dimension length." + }, + "len_d1": { + "type": "integer", + "minimum": 1, + "description": "[2d, 3d] Middle dimension length." + }, + "len_d2": { + "type": "integer", + "minimum": 1, + "description": "[3d] Outermost dimension length." + }, + + "matrix_m": { + "type": "integer", + "minimum": 1, + "description": "[matmul_phased] Rows of A and C." + }, + "matrix_n": { + "type": "integer", + "minimum": 1, + "description": "[matmul_phased] Columns of B and C." + }, + "matrix_k": { + "type": "integer", + "minimum": 1, + "description": "[matmul_phased] Columns of A / rows of B." + }, + "region_base_address_a": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Base address of the read-A region." + }, + "region_size_bytes_a": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Size in bytes of the read-A region." + }, + "region_base_address_b": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Base address of the read-B region." + }, + "region_size_bytes_b": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Size in bytes of the read-B region." + }, + "region_base_address_c": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Base address of the write-C region." + }, + "region_size_bytes_c": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[matmul_phased] Size in bytes of the write-C region." 
+ }, + "regions": { + "type": "array", + "description": "[multi_linear] Subregions to stream in schedule order.", + "items": { + "type": "object", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "stride_words": { "type": "integer", "minimum": 1, "default": 1 }, + "read_pct": { "type": "integer", "minimum": 0, "maximum": 100 } + } + } + }, + "schedule": { + "type": "string", + "description": "[multi_linear, gather_scatter] Access schedule selector (e.g. round_robin, 4read_1write)." + }, + "burst_len": { + "type": "integer", + "minimum": 1, + "default": 1, + "description": "[multi_linear] Number of consecutive accesses per selected region before switching." + }, + "start_bank": { + "type": "integer", + "minimum": 0, + "description": "[bank_group_linear] Starting bank index." + }, + "bank_group_span": { + "type": "integer", + "minimum": 1, + "description": "[bank_group_linear] Number of banks in the active group." + }, + "stride_beats": { + "type": "integer", + "minimum": 1, + "default": 1, + "description": "[bank_group_linear] Stride in beats through the bank-group phase." + }, + "bank_group_hop": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[bank_group_linear] Optional phase hop applied when advancing to the next group window." + }, + "wen": { + "type": "integer", + "enum": [0, 1], + "description": "[bank_group_linear] Fixed direction: 1=read, 0=write. If omitted, reads/writes are mixed." + }, + "row_base_address": { + "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }], + "description": "[rw_rowwise] Base address of row 0." + }, + "row_size_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[rw_rowwise] Bytes touched inside each row." 
+ }, + "n_rows": { + "type": "integer", + "minimum": 1, + "description": "[rw_rowwise] Number of rows." + }, + "row_stride_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[rw_rowwise] Byte stride between consecutive row bases." + }, + "reads_per_row": { + "type": "integer", + "minimum": 0, + "description": "[rw_rowwise] Number of reads emitted per row." + }, + "writes_per_row": { + "type": "integer", + "minimum": 0, + "description": "[rw_rowwise] Number of writes emitted per row." + }, + "idle_cycles_between_rows": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[rw_rowwise] Idle cycles inserted between rows." + }, + "read_regions": { + "type": "array", + "description": "[gather_scatter] Source regions for gather reads.", + "items": { + "type": "object", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }] } + } + } + }, + "write_region": { + "type": "object", + "description": "[gather_scatter] Destination region for scatter writes.", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }] } + } + }, + "chunk_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[gather_scatter] Address increment size in bytes for gather/scatter stepping." + }, + "tile_a_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[matmul_tiled_interleave] Bytes read from A per tile schedule step." 
+ }, + "tile_b_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[matmul_tiled_interleave] Bytes read from B per tile schedule step." + }, + "tile_c_bytes": { + "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }], + "description": "[matmul_tiled_interleave] Bytes written to C per tile schedule step." + }, + "tiles": { + "type": "integer", + "minimum": 1, + "default": 1, + "description": "[matmul_tiled_interleave] Number of tile iterations before the pattern repeats." + }, + "ab_c_schedule": { + "type": "string", + "description": "[matmul_tiled_interleave] Tile phase order string, e.g. A_B_B_C." + }, + "idle_cycles_between_tiles": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[matmul_tiled_interleave] Idle cycles inserted between tile iterations." + }, + "hot_regions": { + "type": "array", + "description": "[hotspot_random] Weighted hot regions used for random accesses.", + "items": { + "type": "object", + "required": ["base", "size_bytes"], + "additionalProperties": false, + "properties": { + "base": { "oneOf": [{ "type": "integer", "minimum": 0 }, { "type": "string" }] }, + "size_bytes": { "oneOf": [{ "type": "integer", "minimum": 1 }, { "type": "string" }] }, + "weight": { "type": "integer", "minimum": 1, "default": 1 } + } + } + }, + "matmul_ratio_a": { + "type": "integer", + "minimum": 0, + "default": 1, + "description": "[matmul_phased] Relative weight of the read-A phase." + }, + "matmul_ratio_b": { + "type": "integer", + "minimum": 0, + "default": 1, + "description": "[matmul_phased] Relative weight of the read-B phase." + }, + "matmul_ratio_c": { + "type": "integer", + "minimum": 0, + "default": 1, + "description": "[matmul_phased] Relative weight of the write-C phase." 
+ }, + + "traffic_pct": { + "type": "integer", + "minimum": 1, + "maximum": 100, + "default": 100, + "description": "[random, linear, matmul_phased, multi_linear, bank_group_linear, rw_rowwise, gather_scatter, matmul_tiled_interleave, hotspot_random] Modeled request utilization percentage (adds req=0 idles after each request when <100)." + }, + "traffic_read_pct": { + "type": "integer", + "minimum": 0, + "maximum": 100, + "description": "[random, linear, hotspot_random] Percentage of accesses that are reads (wen=1)." + }, + "idle_cycles_between_phases": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "[2d, 3d, matmul_phased] Idle cycles inserted at each inner phase boundary." + } + }, + + "allOf": [ + { + "if": { "properties": { "mem_access_type": { "const": "random" } }, "required": ["mem_access_type"] }, + "then": { "required": ["n_transactions"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "linear" } }, "required": ["mem_access_type"] }, + "then": { + "anyOf": [ + { "required": ["n_transactions"] }, + { "required": ["length"] }, + { "required": ["region_size_bytes"] } + ] + } + }, + { + "if": { "properties": { "mem_access_type": { "const": "2d" } }, "required": ["mem_access_type"] }, + "then": { "anyOf": [{ "required": ["n_transactions"] }, { "required": ["len_d0", "len_d1"] }] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "3d" } }, "required": ["mem_access_type"] }, + "then": { "anyOf": [{ "required": ["n_transactions"] }, { "required": ["len_d0", "len_d1", "len_d2"] }] } + }, + { + "if": { + "properties": { "mem_access_type": { "enum": ["matmul_phased", "matmul"] } }, + "required": ["mem_access_type"] + }, + "then": { + "anyOf": [ + { "required": ["n_transactions"] }, + { "required": ["region_size_bytes"] }, + { "required": ["matrix_m", "matrix_n", "matrix_k"] } + ] + } + }, + { + "if": { "properties": { "mem_access_type": { "const": "multi_linear" } }, "required": ["mem_access_type"] }, + 
"then": { "required": ["regions"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "bank_group_linear" } }, "required": ["mem_access_type"] }, + "then": { "required": ["start_bank", "bank_group_span", "n_transactions"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "rw_rowwise" } }, "required": ["mem_access_type"] }, + "then": { "required": ["row_base_address", "row_size_bytes", "n_rows", "row_stride_bytes", "reads_per_row", "writes_per_row"] } + }, + { + "if": { "properties": { "mem_access_type": { "const": "gather_scatter" } }, "required": ["mem_access_type"] }, + "then": { "required": ["read_regions", "write_region"] } + }, + { + "if": { + "properties": { "mem_access_type": { "enum": ["matmul_tiled_interleave", "matmul_tiled"] } }, + "required": ["mem_access_type"] + }, + "then": { + "anyOf": [ + { "required": ["n_transactions"] }, + { "required": ["region_base_address_a", "region_size_bytes_a", "region_base_address_b", "region_size_bytes_b", "region_base_address_c", "region_size_bytes_c"] } + ] + } + }, + { + "if": { "properties": { "mem_access_type": { "const": "hotspot_random" } }, "required": ["mem_access_type"] }, + "then": { "required": ["hot_regions"] } + } + ] + }, + + "master": { + "type": "object", + "description": "Per-master configuration. A master either has a flat single-pattern config or a 'patterns' list for multi-job execution.", + "unevaluatedProperties": false, + "properties": { + "id": { + "type": "integer", + "description": "Optional consistency label. Mapping is always positional." + }, + "description": { + "type": "string", + "description": "Human-readable master label." + }, + "start_delay_cycles": { + "type": "integer", + "minimum": 0, + "default": 0, + "description": "Number of idle cycles prepended to the stimulus file before the first pattern." + }, + "patterns": { + "type": "array", + "minItems": 1, + "description": "Ordered list of pattern segments. 
Executed sequentially; a PAUSE fence token is inserted between each pair of consecutive patterns. The wait_for_jobs of pattern f defines which jobs must have advanced past fence f before this master resumes. If omitted, the master config itself is treated as a single flat pattern.", + "items": { "$ref": "#/$defs/pattern" } + } + }, + "allOf": [ + { "$ref": "#/$defs/pattern" } + ] + } + } +} diff --git a/target/verif/exploration/config/hardware/hardware_hci_2hwpe_8fact.json b/target/verif/exploration/config/hardware/hardware_hci_2hwpe_8fact.json new file mode 100644 index 0000000..b349a65 --- /dev/null +++ b/target/verif/exploration/config/hardware/hardware_hci_2hwpe_8fact.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "HCI", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/target/verif/exploration/config/hardware/hardware_log_2hwpe_8fact.json b/target/verif/exploration/config/hardware/hardware_log_2hwpe_8fact.json new file mode 100644 index 0000000..a505fa1 --- /dev/null +++ b/target/verif/exploration/config/hardware/hardware_log_2hwpe_8fact.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "LOG", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/target/verif/exploration/config/hardware/hardware_mux_2hwpe_8fact.json b/target/verif/exploration/config/hardware/hardware_mux_2hwpe_8fact.json new file mode 100644 index 0000000..84c47a1 --- /dev/null +++ b/target/verif/exploration/config/hardware/hardware_mux_2hwpe_8fact.json @@ -0,0 +1,17 @@ +{ + "description": "Hardware configuration 
parameters for HCI interconnect", + "parameters": { + "N_HWPE": 2, + "HWPE_WIDTH_FACT": 8, + "N_CORE": 8, + "N_DMA": 0, + "N_EXT": 1, + "DATA_WIDTH": 32, + "TOT_MEM_SIZE": 256, + "N_BANKS": 64, + "INTERCO_TYPE": "MUX", + "TS_BIT": 21, + "EXPFIFO": 0, + "SEL_LIC": 0 + } +} diff --git a/target/verif/exploration/config/workloads/workload_dma_gemm_cores.json b/target/verif/exploration/config/workloads/workload_dma_gemm_cores.json new file mode 100644 index 0000000..aa2da43 --- /dev/null +++ b/target/verif/exploration/config/workloads/workload_dma_gemm_cores.json @@ -0,0 +1,540 @@ +{ + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes. Cores generate linear 50% traffic after each GEMM on dedicated non-overlapping regions.", + "log_masters": [ + { + "id": 0, + "description": "Core 0 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 0 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core0_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core0_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core0_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + }, + { + "description": "Core 0 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core0_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x10000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 1, + "description": "Core 1 
post-GEMM background traffic", + "patterns": [ + { + "description": "Core 1 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core1_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core1_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core1_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + }, + { + "description": "Core 1 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core1_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x11000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 2, + "description": "Core 2 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 2 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core2_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core2_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + "description": "Core 2 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core2_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + }, + { + 
"description": "Core 2 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core2_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x12000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 3, + "description": "Core 3 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 3 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core3_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core3_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core3_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + }, + { + "description": "Core 3 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core3_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x13000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 4, + "description": "Core 4 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 4 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core4_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core4_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", 
+ "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core4_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + }, + { + "description": "Core 4 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core4_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x14000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 5, + "description": "Core 5 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 5 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core5_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core5_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core5_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + }, + { + "description": "Core 5 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core5_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x15000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 6, + "description": "Core 6 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 6 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core6_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 
50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core6_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core6_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + }, + { + "description": "Core 6 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core6_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x16000", + "region_size_bytes": 1024 + } + ] + }, + { + "id": 7, + "description": "Core 7 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 7 traffic after gemm_A0", + "mem_access_type": "linear", + "job": "core7_after_gemm_A0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A1", + "mem_access_type": "linear", + "job": "core7_after_gemm_A1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A2", + "mem_access_type": "linear", + "job": "core7_after_gemm_A2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + }, + { + "description": "Core 7 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core7_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + 
"region_size_bytes": 1024 + } + ] + }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ + { + "id": 0, + "description": "DMA engine (linear setup + ping/pong prefetch)", + "patterns": [ + { + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A1 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A1_buf1", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A2 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A2_buf0", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A0", + "mem_access_type": "linear", + "job": "dma_store_C0_buf2", + "wait_for_jobs": ["gemm_A0"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Prefetch tile A3 into buffer #1 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A3_buf1", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x04000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A1", + "mem_access_type": "linear", + "job": "dma_store_C1_buf3", + "wait_for_jobs": ["gemm_A1"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": 
"0x08000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C0 after GEMM tile A2", + "mem_access_type": "linear", + "job": "dma_store_C2_buf2", + "wait_for_jobs": ["gemm_A2"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x06000", + "region_size_bytes": 8192 + }, + { + "description": "Read back C1 after GEMM tile A3", + "mem_access_type": "linear", + "job": "dma_store_C3_buf3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 100, + "traffic_read_pct": 100, + "region_base_address": "0x08000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 1, + "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", + "patterns": [ + { + "description": "GEMM tile A0: A0 x B -> C0", + "mem_access_type": "matmul_phased", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A1: A1 x B -> C1", + "mem_access_type": "matmul_phased", + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", + "mem_access_type": "matmul_phased", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + 
"region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", + "mem_access_type": "matmul_phased", + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + } + ] + } + ] +} diff --git a/target/verif/exploration/config/workloads/workload_dma_gemm_cores_ideal.json b/target/verif/exploration/config/workloads/workload_dma_gemm_cores_ideal.json new file mode 100644 index 0000000..c8ab84c --- /dev/null +++ b/target/verif/exploration/config/workloads/workload_dma_gemm_cores_ideal.json @@ -0,0 +1,126 @@ +{ + "description": "Simple 4-tile double-buffer GEMM. DMA uses linear transfers sized only by region_size_bytes. 
Cores generate linear 50% traffic after each GEMM on dedicated non-overlapping regions.", + "log_masters": [ + {"id": 0, "description": "Core 0 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 1, "description": "Core 1 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 2, "description": "Core 2 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 3, "description": "Core 3 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 4, "description": "Core 4 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 5, "description": "Core 5 post-GEMM background traffic", "mem_access_type": "idle" }, + {"id": 6, "description": "Core 6 post-GEMM background traffic", "mem_access_type": "idle" }, + { + "id": 7, + "description": "Core 7 post-GEMM background traffic", + "patterns": [ + { + "description": "Core 7 traffic after gemm_A3", + "mem_access_type": "linear", + "job": "core7_after_gemm_A3", + "wait_for_jobs": ["gemm_A3"], + "traffic_pct": 50, + "traffic_read_pct": 50, + "region_base_address": "0x17000", + "region_size_bytes": 1024 + } + ] + }, + { "id": 8, "description": "External 0 (idle)", "mem_access_type": "idle" } + ], + + "hwpe_masters": [ + { + "id": 0, + "description": "DMA engine (linear setup + ping/pong prefetch)", + "patterns": [ + { + "description": "Setup preload: B matrix (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_B", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x00000", + "region_size_bytes": 8192 + }, + { + "description": "Setup preload: tile A0 into buffer #0 (8 KiB)", + "mem_access_type": "linear", + "job": "dma_load_A0_buf0", + "traffic_pct": 100, + "traffic_read_pct": 0, + "region_base_address": "0x02000", + "region_size_bytes": 8192 + } + ] + }, + { + "id": 1, + "description": "Single GEMM engine processing 4 tiles with ping/pong buffers", + "patterns": [ + { + "description": "GEMM tile A0: A0 x B -> C0", + "mem_access_type": 
"matmul_phased", + "job": "gemm_A0", + "wait_for_jobs": ["dma_load_B", "dma_load_A0_buf0"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A1: A1 x B -> C1", + "mem_access_type": "matmul_phased", + "job": "gemm_A1", + "wait_for_jobs": ["dma_load_B", "dma_load_A1_buf1"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A2: A0 x B -> C0 (after dma_load_A2_buf0)", + "mem_access_type": "matmul_phased", + "job": "gemm_A2", + "wait_for_jobs": ["dma_load_B", "dma_load_A2_buf0", "dma_store_C0_buf2"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x02000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x06000", + "region_size_bytes_c": 8192 + }, + { + "description": "GEMM tile A3: A1 x B -> C1 (after dma_load_A3_buf1)", + "mem_access_type": "matmul_phased", + "job": "gemm_A3", + "wait_for_jobs": ["dma_load_B", "dma_load_A3_buf1", "dma_store_C1_buf3"], + "traffic_pct": 90, + "matrix_m": 16, + "matrix_n": 16, + "matrix_k": 16, + "region_base_address_a": "0x04000", + "region_size_bytes_a": 8192, + "region_base_address_b": "0x00000", + "region_size_bytes_b": 8192, + "region_base_address_c": "0x08000", + "region_size_bytes_c": 8192 + } + ] + } + ] +} diff --git a/target/verif/exploration/exploration.mk b/target/verif/exploration/exploration.mk new file mode 100644 index 0000000..1a93e7b --- /dev/null +++ 
b/target/verif/exploration/exploration.mk @@ -0,0 +1,17 @@ +# Copyright 2026 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. +# SPDX-License-Identifier: SHL-0.51 +# +# Sergio Mazzola + +HCI_VERIF_EXPL_DIR = $(HCI_ROOT)/target/verif/exploration + +################ +# Benchmarking # +################ + +# Modify this script to configure parameters (e.g., workload to run) +BENCHMARK_SCRIPT := $(HCI_VERIF_EXPL_DIR)/scripts/run_sweep.sh + +benchmarking-sweep: + . $(BENCHMARK_SCRIPT) \ No newline at end of file diff --git a/target/verif/exploration/scripts/parse_vsim.py b/target/verif/exploration/scripts/parse_vsim.py new file mode 100644 index 0000000..9c0f0ad --- /dev/null +++ b/target/verif/exploration/scripts/parse_vsim.py @@ -0,0 +1,359 @@ +#!/usr/bin/env python3 +"""Parse the final 'Simulation Summary' section from one transcript file.""" + +import argparse +import json +import re +import sys +from pathlib import Path +from typing import Dict, List + + +SUMMARY_MARKER = "------ Simulation Summary ------" + + +class ParseError(RuntimeError): + """Raised when the transcript summary cannot be parsed.""" + + +def _as_float(value: str) -> float: + return float(value) + + +def _as_int(value: str) -> int: + return int(value) + + +def _clean_line(raw: str) -> str: + line = raw.strip() + if line.startswith("#"): + line = line[1:].strip() + return line + + +def _summary_lines(transcript_text: str) -> List[str]: + idx = transcript_text.rfind(SUMMARY_MARKER) + if idx < 0: + raise ParseError(f"Summary marker '{SUMMARY_MARKER}' not found.") + return [_clean_line(line) for line in transcript_text[idx:].splitlines()] + + +def _ensure_master(masters: Dict[str, Dict[str, object]], master_name: str) -> Dict[str, object]: + entry = masters.get(master_name) + if entry is None: + entry = {"master_name": master_name} + masters[master_name] = entry + return entry + + +def parse_summary(transcript_text: str) -> Dict[str, 
object]: + lines = _summary_lines(transcript_text) + + result: Dict[str, object] = { + "hw_config": {}, + "bandwidth": {}, + "simulation_time": {"per_master": []}, + "read_response_coverage": {}, + "transaction_counts": {}, + "request_to_grant_latency": { + "per_master": [], + "accumulated": {}, + "averages": {}, + }, + "finish": {}, + } + + masters: Dict[str, Dict[str, object]] = {} + + patterns = { + "masters": re.compile(r"^Masters:\s*CORE=(\d+)\s*DMA=(\d+)\s*EXT=(\d+)\s*HWPE=(\d+)\s*\(total=(\d+)\)$"), + "memory": re.compile( + r"^Memory:\s*banks=(\d+)\s*total_size=(\d+)\s*kB\s*data_width=(\d+)\s*bits\s*hwpe_width=(\d+)\s*lanes$" + ), + "interconnect": re.compile(r"^Interconnect:\s*SEL_LIC=(\d+)\s*TS_BIT=(\d+)\s*EXPFIFO=(\d+)$"), + "interconnect_side": re.compile( + r"^Interconnect-side:\s*TYPE=(LOG|HCI|MUX|UNKNOWN)\s*N_NARROW_HCI=(\d+)\s*N_WIDE_HCI=(\d+)\s*N_DMA=(\d+)\s*N_EXT=(\d+)$" + ), + "id_addr": re.compile(r"^ID/address:\s*IW=(\d+)\s*ADDR_WIDTH=(\d+)\s*ADDR_WIDTH_BANK=(\d+)$"), + "ideal_mem_bw": re.compile(r"^Ideal BW \(memory side\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "ideal_interco_bw": re.compile(r"^Ideal BW \(interco side\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "ideal_master_bw_legacy": re.compile(r"^Ideal BW \(master side\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "ideal_bottleneck_bw": re.compile(r"^Ideal BW \(bottleneck\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle"), + "actual_bw": re.compile( + r"^Actual BW \(completion\):\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle\s*\[utilization:\s*([0-9]+(?:\.[0-9]+)?)%\]$" + ), + "completion_bw_legacy": re.compile(r"^Completion bandwidth .*:\s*([0-9]+(?:\.[0-9]+)?)\s*bit/cycle$"), + "completion_cycles": re.compile(r"^Completion phase duration:\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$"), + "granted": re.compile(r"^Granted transactions:\s*reads=(\d+)\s*writes=(\d+)\s*total=(\d+)$"), + "read_complete": re.compile(r"^Read-complete responses:\s*(\d+)$"), + "total_sim_cycles": re.compile(r"^Total simulation 
time:\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$"), + "per_master_sim_time": re.compile(r"^([A-Za-z0-9_]+)\s*\((master_[^)]+)\):\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$"), + "coverage": re.compile(r"^(master_[^:]+):\s*observed\s*(\d+)\s*/\s*expected\s*(\d+)$"), + "tx_counts": re.compile(r"^(master_[^:]+):\s*granted reads=(\d+)\s*writes=(\d+),\s*read-complete=(\d+)$"), + "req_gnt": re.compile( + r"^(master_[^:]+):\s*avg req->gnt stall latency\s*([0-9]+(?:\.[0-9]+)?)\s*cycles over\s*(\d+)\s*grants$" + ), + "total_accum": re.compile( + r"^Total accumulated req->gnt latency:\s*([0-9]+(?:\.[0-9]+)?)\s*cycles over\s*(\d+)\s*grants$" + ), + "class_avg": re.compile( + r"^(LOG|HWPE|Global) avg req->gnt stall latency " + r"\((weighted by grant count|mean of per-master averages)\):\s*([0-9]+(?:\.[0-9]+)?)\s*cycles$" + ), + "finish_note": re.compile(r"^\*\* Note: \$finish\s*:\s*(.+)\((\d+)\)$"), + "finish_time": re.compile(r"^Time:\s*([0-9]+)\s*ps\s*Iteration:\s*(\d+)\s*Instance:\s*(.+)$"), + } + + for line in lines: + if not line or line == SUMMARY_MARKER: + continue + + match = patterns["masters"].match(line) + if match: + result["hw_config"]["masters"] = { + "core": _as_int(match.group(1)), + "dma": _as_int(match.group(2)), + "ext": _as_int(match.group(3)), + "hwpe": _as_int(match.group(4)), + "total": _as_int(match.group(5)), + } + continue + + match = patterns["memory"].match(line) + if match: + result["hw_config"]["memory"] = { + "banks": _as_int(match.group(1)), + "total_size_kb": _as_int(match.group(2)), + "data_width_bits": _as_int(match.group(3)), + "hwpe_width_lanes": _as_int(match.group(4)), + } + continue + + match = patterns["interconnect"].match(line) + if match: + result["hw_config"]["interconnect"] = { + "sel_lic": _as_int(match.group(1)), + "ts_bit": _as_int(match.group(2)), + "expfifo": _as_int(match.group(3)), + } + continue + + match = patterns["interconnect_side"].match(line) + if match: + narrow_hci = _as_int(match.group(2)) + wide_hci = _as_int(match.group(3)) + n_dma 
= _as_int(match.group(4)) + n_ext = _as_int(match.group(5)) + result["hw_config"]["interconnect_side"] = { + "type": match.group(1), + "n_narrow_hci": narrow_hci, + "n_wide_hci": wide_hci, + "n_dma": n_dma, + "n_ext": n_ext, + "narrow_total_ports": narrow_hci + n_dma + n_ext, + "total_initiator_ports": narrow_hci + wide_hci + n_dma + n_ext, + } + continue + + match = patterns["id_addr"].match(line) + if match: + result["hw_config"]["id_address"] = { + "iw": _as_int(match.group(1)), + "addr_width": _as_int(match.group(2)), + "addr_width_bank": _as_int(match.group(3)), + } + continue + + match = patterns["ideal_mem_bw"].match(line) + if match: + result["bandwidth"]["ideal_memory_side_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["ideal_interco_bw"].match(line) + if match: + result["bandwidth"]["ideal_interconnect_side_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["ideal_master_bw_legacy"].match(line) + if match: + result["bandwidth"]["ideal_interconnect_side_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["ideal_bottleneck_bw"].match(line) + if match: + result["bandwidth"]["ideal_bottleneck_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["actual_bw"].match(line) + if match: + result["bandwidth"]["actual_completion_bit_per_cycle"] = _as_float(match.group(1)) + result["bandwidth"]["actual_completion_utilization_pct"] = _as_float(match.group(2)) + continue + + match = patterns["completion_bw_legacy"].match(line) + if match: + result["bandwidth"]["actual_completion_bit_per_cycle"] = _as_float(match.group(1)) + continue + + match = patterns["completion_cycles"].match(line) + if match: + result["bandwidth"]["completion_phase_duration_cycles"] = _as_float(match.group(1)) + continue + + match = patterns["granted"].match(line) + if match: + result["bandwidth"]["granted_transactions"] = { + "reads": _as_int(match.group(1)), + "writes": _as_int(match.group(2)), + 
"total": _as_int(match.group(3)), + } + continue + + match = patterns["read_complete"].match(line) + if match: + result["bandwidth"]["read_complete_responses"] = _as_int(match.group(1)) + continue + + match = patterns["total_sim_cycles"].match(line) + if match: + result["simulation_time"]["total_cycles"] = _as_float(match.group(1)) + continue + + match = patterns["per_master_sim_time"].match(line) + if match: + role_name = match.group(1) + master_name = match.group(2) + sim_cycles = _as_float(match.group(3)) + entry = _ensure_master(masters, master_name) + entry["role_name"] = role_name + entry["sim_time_cycles"] = sim_cycles + continue + + match = patterns["coverage"].match(line) + if match: + master_name = match.group(1) + observed = _as_int(match.group(2)) + expected = _as_int(match.group(3)) + entry = _ensure_master(masters, master_name) + entry["read_observed"] = observed + entry["read_expected"] = expected + result["read_response_coverage"][master_name] = { + "observed": observed, + "expected": expected, + } + continue + + match = patterns["tx_counts"].match(line) + if match: + master_name = match.group(1) + reads = _as_int(match.group(2)) + writes = _as_int(match.group(3)) + read_complete = _as_int(match.group(4)) + entry = _ensure_master(masters, master_name) + entry["granted_reads"] = reads + entry["granted_writes"] = writes + entry["read_complete"] = read_complete + result["transaction_counts"][master_name] = { + "granted_reads": reads, + "granted_writes": writes, + "read_complete": read_complete, + } + continue + + match = patterns["req_gnt"].match(line) + if match: + master_name = match.group(1) + avg_cycles = _as_float(match.group(2)) + grants = _as_int(match.group(3)) + entry = _ensure_master(masters, master_name) + entry["avg_req_to_gnt_stall_latency_cycles"] = avg_cycles + entry["req_to_gnt_grants"] = grants + continue + + match = patterns["total_accum"].match(line) + if match: + result["request_to_grant_latency"]["accumulated"] = { + "cycles": 
_as_float(match.group(1)), + "grants": _as_int(match.group(2)), + } + continue + + match = patterns["class_avg"].match(line) + if match: + group = match.group(1).lower() + avg_type = match.group(2) + value = _as_float(match.group(3)) + key = "weighted_cycles" if "weighted by grant count" in avg_type else "unweighted_cycles" + averages = result["request_to_grant_latency"]["averages"] + group_entry = averages.get(group, {}) + group_entry[key] = value + averages[group] = group_entry + continue + + match = patterns["finish_note"].match(line) + if match: + result["finish"]["source"] = match.group(1).strip() + result["finish"]["line"] = _as_int(match.group(2)) + continue + + match = patterns["finish_time"].match(line) + if match: + result["finish"]["time_ps"] = _as_int(match.group(1)) + result["finish"]["iteration"] = _as_int(match.group(2)) + result["finish"]["instance"] = match.group(3).strip() + continue + + sorted_masters = [masters[name] for name in sorted(masters.keys())] + result["simulation_time"]["per_master"] = [ + { + "master_name": row["master_name"], + "role_name": row.get("role_name"), + "sim_time_cycles": row.get("sim_time_cycles"), + } + for row in sorted_masters + ] + result["request_to_grant_latency"]["per_master"] = [ + { + "master_name": row["master_name"], + "avg_req_to_gnt_stall_latency_cycles": row.get("avg_req_to_gnt_stall_latency_cycles"), + "req_to_gnt_grants": row.get("req_to_gnt_grants"), + } + for row in sorted_masters + ] + result["masters"] = sorted_masters + + return result + + +def _cli_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Parse final Simulation Summary lines from one transcript.") + parser.add_argument("--transcript", required=True, help="Path to transcript file") + parser.add_argument("--out", default="", help="Optional output JSON file path") + return parser.parse_args() + + +def main() -> int: + args = _cli_args() + transcript_path = Path(args.transcript) + if not transcript_path.exists(): + 
raise ParseError(f"Transcript not found: {transcript_path}") + + text = transcript_path.read_text(encoding="utf-8", errors="replace") + parsed = parse_summary(text) + + output = json.dumps(parsed, indent=2, sort_keys=False) + if args.out: + out_path = Path(args.out) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(output + "\n", encoding="ascii") + else: + print(output) + return 0 + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except ParseError as exc: + print(f"ERROR: {exc}", file=sys.stderr) + raise SystemExit(2) diff --git a/target/verif/exploration/scripts/plot_sweep_results.py b/target/verif/exploration/scripts/plot_sweep_results.py new file mode 100644 index 0000000..786561e --- /dev/null +++ b/target/verif/exploration/scripts/plot_sweep_results.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python3 +"""Plot sweep metrics from parsed transcript JSON files.""" + +import argparse +import json +import math +import re +from pathlib import Path +from typing import Dict, List, Tuple + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.colors import ListedColormap +from matplotlib.lines import Line2D +from matplotlib.patches import Patch + +# Cycles of ideal workload runtime +IDEAL_WORKLOAD_RUNTIME = 4121.0 + +INTERCO_ORDER = {"LOG": 0, "MUX": 1, "HCI": 2} +INTERCO_COLORS = {"LOG": "#1f77b4", "MUX": "#9467bd", "HCI": "#ff7f0e"} +IDEAL_COLOR = "#7f7f7f" + + +def _to_int(value: object, default: int = 0) -> int: + try: + return int(value) + except Exception: + return default + + +def _to_float(value: object, default: float = float("nan")) -> float: + try: + return float(value) + except Exception: + return default + + +def _master_sort_key(master: str) -> Tuple[int, int]: + if master.startswith("master_log_"): + return (0, int(master.rsplit("_", 1)[1])) + if master.startswith("master_hwpe_"): + return (1, int(master.rsplit("_", 1)[1])) + return (9, 0) + + +def 
_parse_cfg_from_filename(path: Path) -> Tuple[str, int, int]: + match = re.match(r"^hardware_([a-zA-Z]+)_([0-9]+)hwpe_([0-9]+)fact\.json$", path.name) + if not match: + return ("UNK", 0, 0) + return (match.group(1).upper(), int(match.group(2)), int(match.group(3))) + + +def _derive_interco_side(hw_cfg: Dict[str, object]) -> Dict[str, int]: + masters = hw_cfg.get("masters", {}) if isinstance(hw_cfg, dict) else {} + memory = hw_cfg.get("memory", {}) if isinstance(hw_cfg, dict) else {} + interco_side = hw_cfg.get("interconnect_side", {}) if isinstance(hw_cfg, dict) else {} + + if isinstance(interco_side, dict) and "narrow_total_ports" in interco_side: + return { + "n_narrow_hci": _to_int(interco_side.get("n_narrow_hci")), + "n_wide_hci": _to_int(interco_side.get("n_wide_hci")), + "n_dma": _to_int(interco_side.get("n_dma")), + "n_ext": _to_int(interco_side.get("n_ext")), + "narrow_total_ports": _to_int(interco_side.get("narrow_total_ports")), + "total_initiator_ports": _to_int(interco_side.get("total_initiator_ports")), + } + + interco_type = str(interco_side.get("type", "UNK")).upper() + if interco_type == "UNK": + interco_type = str(hw_cfg.get("interco_type", "UNK")).upper() + + n_core = _to_int(masters.get("core")) + n_dma = _to_int(masters.get("dma")) + n_ext = _to_int(masters.get("ext")) + n_hwpe = _to_int(masters.get("hwpe")) + hwpe_width = _to_int(memory.get("hwpe_width_lanes"), 1) + + if interco_type == "LOG": + n_narrow_hci = n_core + n_hwpe * hwpe_width + n_wide_hci = 0 + elif interco_type == "MUX": + n_narrow_hci = n_core + n_wide_hci = 1 if n_hwpe > 0 else 0 + else: + n_narrow_hci = n_core + n_wide_hci = n_hwpe + + narrow_total = n_narrow_hci + n_dma + n_ext + return { + "n_narrow_hci": n_narrow_hci, + "n_wide_hci": n_wide_hci, + "n_dma": n_dma, + "n_ext": n_ext, + "narrow_total_ports": narrow_total, + "total_initiator_ports": narrow_total + n_wide_hci, + } + + +def _load_results(results_dir: Path) -> List[Dict[str, object]]: + entries: List[Dict[str, 
object]] = [] + for path in sorted(results_dir.glob("hardware_*.json")): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + continue + + hw_cfg = data.get("hw_config", {}) + masters = hw_cfg.get("masters", {}) if isinstance(hw_cfg, dict) else {} + memory = hw_cfg.get("memory", {}) if isinstance(hw_cfg, dict) else {} + bw = data.get("bandwidth", {}) + + interco_from_name, n_hwpe_name, wf_name = _parse_cfg_from_filename(path) + interco_type = str(hw_cfg.get("interconnect_side", {}).get("type", interco_from_name)).upper() + if interco_type not in INTERCO_ORDER: + interco_type = interco_from_name + + n_hwpe = _to_int(masters.get("hwpe"), n_hwpe_name) + hwpe_wf = _to_int(memory.get("hwpe_width_lanes"), wf_name) + cfg_label = f"{interco_type}_{n_hwpe}x{hwpe_wf}" + + interco_side = _derive_interco_side(hw_cfg if isinstance(hw_cfg, dict) else {}) + banks = _to_int(memory.get("banks")) + data_width = _to_int(memory.get("data_width_bits")) + ideal_mem = float(banks * data_width) + ideal_interco = float( + interco_side["narrow_total_ports"] * data_width + + interco_side["n_wide_hci"] * hwpe_wf * data_width + ) + ideal_bottleneck = min(ideal_mem, ideal_interco) + + actual_bw = _to_float(bw.get("actual_completion_bit_per_cycle")) + util_pct = (actual_bw / ideal_bottleneck * 100.0) if ideal_bottleneck > 0 and not math.isnan(actual_bw) else float("nan") + + entries.append( + { + "path": path, + "label": cfg_label, + "interco_type": interco_type, + "n_hwpe": n_hwpe, + "hwpe_width_fact": hwpe_wf, + "json": data, + "total_sim_cycles": _to_float(data.get("simulation_time", {}).get("total_cycles")), + "avg_req_to_gnt_per_master": data.get("request_to_grant_latency", {}).get("per_master", []), + "ideal_mem_bw": ideal_mem, + "ideal_interco_bw": ideal_interco, + "ideal_bottleneck_bw": ideal_bottleneck, + "actual_bw": actual_bw, + "utilization_pct": util_pct, + } + ) + + entries.sort(key=lambda e: (e["n_hwpe"], e["hwpe_width_fact"], 
INTERCO_ORDER.get(e["interco_type"], 9))) + return entries + + +def _plot_total_sim_time(entries: List[Dict[str, object]], out_path: Path) -> None: + labels = [e["label"] for e in entries] + values = [e["total_sim_cycles"] for e in entries] + colors = [INTERCO_COLORS.get(e["interco_type"], "#333333") for e in entries] + x = np.arange(len(entries), dtype=float) + + fig, ax = plt.subplots(figsize=(max(8, 1.2 * len(entries)), 5.6)) + bars = ax.bar(x, values, color=colors, width=0.68) + ax.set_title("Total simulation time vs ideal workload runtime") + ax.set_ylabel("cycles") + ax.set_xlabel("Configuration") + ax.set_xticks(x) + ax.set_xticklabels(labels, rotation=20, ha="right") + ax.set_axisbelow(True) + ax.grid(axis="y", alpha=0.25) + + for bar, val in zip(bars, values): + if math.isnan(val): + continue + mult_of_ideal = (val / IDEAL_WORKLOAD_RUNTIME) if val > 0 and IDEAL_WORKLOAD_RUNTIME > 0 else float("nan") + pct_txt = "n/a" if math.isnan(mult_of_ideal) else f"{mult_of_ideal:.2f}X of ideal runtime" + ax.text( + bar.get_x() + bar.get_width() / 2.0, + val, + f"{val:.0f}\n({pct_txt})", + ha="center", + va="bottom", + fontsize=8, + ) + + ax.axhline( + y=IDEAL_WORKLOAD_RUNTIME, + color="red", + linestyle="--", + linewidth=1.6, + label=f"Ideal workload runtime ({IDEAL_WORKLOAD_RUNTIME:.0f} cycles)", + ) + + legend = [Patch(facecolor=INTERCO_COLORS[k], label=k) for k in ("LOG", "MUX", "HCI")] + legend.append(Line2D([0], [0], color="red", linestyle="--", linewidth=1.6, label="Ideal workload runtime")) + ax.legend(handles=legend, loc="lower left") + ax.margins(y=0.24) + fig.tight_layout(rect=(0.0, 0.02, 1.0, 0.98)) + fig.savefig(out_path, dpi=150) + plt.close(fig) + + +def _plot_per_master_avg_req_to_gnt(entries: List[Dict[str, object]], out_path: Path) -> None: + labels = [e["label"] for e in entries] + masters = sorted( + { + row.get("master_name", "") + for e in entries + for row in e.get("avg_req_to_gnt_per_master", []) + if isinstance(row, dict) and 
row.get("master_name", "") + }, + key=_master_sort_key, + ) + + if not masters: + fig, ax = plt.subplots(figsize=(8, 3)) + ax.text(0.5, 0.5, "No per-master req->gnt data", ha="center", va="center") + ax.axis("off") + fig.tight_layout() + fig.savefig(out_path, dpi=150) + plt.close(fig) + return + + matrix = np.full((len(masters), len(entries)), np.nan, dtype=float) + master_idx = {m: i for i, m in enumerate(masters)} + for j, entry in enumerate(entries): + for row in entry.get("avg_req_to_gnt_per_master", []): + if not isinstance(row, dict): + continue + m = row.get("master_name", "") + if m not in master_idx: + continue + matrix[master_idx[m], j] = _to_float(row.get("avg_req_to_gnt_stall_latency_cycles")) + + cmap = ListedColormap(plt.cm.get_cmap("viridis")(np.linspace(0.0, 1.0, 256))) + cmap.set_bad(color="white") + + fig, ax = plt.subplots(figsize=(max(10, 1.2 * len(entries)), max(4, 0.35 * len(masters)))) + im = ax.imshow(np.ma.masked_invalid(matrix), aspect="auto", cmap=cmap, interpolation="nearest") + ax.set_title("Avg req->gnt stall latency per master") + ax.set_xlabel("Configuration") + ax.set_ylabel("Master") + ax.set_xticks(np.arange(len(entries))) + ax.set_xticklabels(labels, rotation=20, ha="right") + ax.set_yticks(np.arange(len(masters))) + ax.set_yticklabels(masters) + + vmax = np.nanmax(matrix) if np.any(~np.isnan(matrix)) else 0.0 + thresh = 0.6 * vmax if vmax > 0 else 0.0 + for r in range(matrix.shape[0]): + for c in range(matrix.shape[1]): + val = matrix[r, c] + if math.isnan(val): + continue + color = "white" if val < thresh else "black" + ax.text(c, r, f"{val:.2f}", ha="center", va="center", fontsize=6, color=color) + + fig.colorbar(im, ax=ax, label="cycles") + fig.tight_layout() + fig.savefig(out_path, dpi=150) + plt.close(fig) + + +def _plot_bandwidth(entries: List[Dict[str, object]], out_path: Path) -> None: + labels = [e["label"] for e in entries] + ideal_vals = [e["ideal_bottleneck_bw"] for e in entries] + actual_vals = [e["actual_bw"] for e 
in entries] + util_vals = [e["utilization_pct"] for e in entries] + sim_cycles_vals = [e["total_sim_cycles"] for e in entries] + ideal_app_vals = [] + for actual_bw, sim_cycles in zip(actual_vals, sim_cycles_vals): + if math.isnan(actual_bw) or math.isnan(sim_cycles) or IDEAL_WORKLOAD_RUNTIME <= 0.0: + ideal_app_vals.append(float("nan")) + else: + ideal_app_vals.append(actual_bw * sim_cycles / IDEAL_WORKLOAD_RUNTIME) + actual_colors = [INTERCO_COLORS.get(e["interco_type"], "#333333") for e in entries] + x = np.arange(len(entries), dtype=float) + width = 0.34 + + fig, ax = plt.subplots(figsize=(max(9, 1.25 * len(entries)), 5.0)) + ideal_bars = ax.bar( + x - width / 2.0, + ideal_vals, + width=width, + color=IDEAL_COLOR, + label="Max interco bandwidth", + ) + actual_bars = ax.bar(x + width / 2.0, actual_vals, width=width, color=actual_colors, label="Actual BW (completion)") + ax.set_title("Bandwidth: interconnect-side ideal vs actual") + ax.set_ylabel("bit/cycle") + ax.set_xlabel("Configuration") + ax.set_xticks(x) + ax.set_xticklabels(labels, rotation=20, ha="right") + ax.set_axisbelow(True) + ax.grid(axis="y", alpha=0.25) + + for bar, val in zip(ideal_bars, ideal_vals): + ax.text( + bar.get_x() + bar.get_width() / 2.0, + val, + f"{val:.0f}", + ha="center", + va="bottom", + fontsize=8, + zorder=8, + ) + for bar, val, util in zip(actual_bars, actual_vals, util_vals): + if math.isnan(val): + continue + util_txt = "n/a" if math.isnan(util) else f"{util:.1f}% interco util" + ax.text( + bar.get_x() + bar.get_width() / 2.0, + val, + f"{val:.1f}\n({util_txt})", + ha="center", + va="bottom", + fontsize=8, + zorder=8, + ) + + # Ideal app BW computed from moved data and ideal application duration: + # ideal_app_bw = effective_bw * total_real_sim_time / IDEAL_WORKLOAD_RUNTIME + valid_ideal_app_vals = [v for v in ideal_app_vals if not math.isnan(v)] + if valid_ideal_app_vals: + ideal_workload_bw = sum(valid_ideal_app_vals) / len(valid_ideal_app_vals) + ax.axhline( + 
y=ideal_workload_bw, + color="red", + linestyle="--", + linewidth=1.8, + label="Ideal workload bandwidth", + zorder=4, + ) + ax.text( + x[-1] + 0.35, + ideal_workload_bw, + f"{ideal_workload_bw:.1f}", + color="red", + fontsize=9, + ha="right", + va="bottom", + zorder=8, + ) + + interco_legend = [Patch(facecolor=INTERCO_COLORS[k], label=f"Actual {k}") for k in ("LOG", "MUX", "HCI")] + base_legend = [Patch(facecolor=IDEAL_COLOR, label="Max interco bandwidth")] + extra_legend = [Line2D([0], [0], color="red", linestyle="--", linewidth=1.8, label="Ideal workload bandwidth")] + ax.legend(handles=base_legend + interco_legend + extra_legend, loc="best") + fig.tight_layout() + fig.savefig(out_path, dpi=150) + plt.close(fig) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Plot sweep results from parsed transcript JSON files.") + parser.add_argument( + "--results-dir", + default="target/verif/results", + help="Directory containing parsed sweep JSON files (hardware_*.json).", + ) + parser.add_argument( + "--out-dir", + default="target/verif/results/plots", + help="Output directory for generated plots.", + ) + args = parser.parse_args() + + results_dir = Path(args.results_dir) + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + entries = _load_results(results_dir) + if not entries: + raise SystemExit(f"No sweep JSON files found in: {results_dir}") + + _plot_total_sim_time(entries, out_dir / "total_simulation_time.png") + _plot_per_master_avg_req_to_gnt(entries, out_dir / "avg_req_to_gnt_per_master.png") + _plot_bandwidth(entries, out_dir / "bandwidth_ideal_vs_actual.png") + + print(f"Plots written to: {out_dir}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/target/verif/exploration/scripts/run_sweep.sh b/target/verif/exploration/scripts/run_sweep.sh new file mode 100644 index 0000000..8759a9f --- /dev/null +++ b/target/verif/exploration/scripts/run_sweep.sh @@ -0,0 +1,46 @@ +#!/bin/bash + 
+set -e + +# Make sure we are in hci root +if ! git_root=$(git rev-parse --show-toplevel 2>/dev/null) || [ "$(basename "$git_root")" != "hci" ]; then + echo "Error: This script must be run from within the hci project git repository." >&2 + exit 1 +fi +cd "$git_root" + +# Directories +VERIF_DIR=./target/verif +VERIF_EXPL_DIR=$VERIF_DIR/exploration +RESULTS_DIR=$VERIF_EXPL_DIR/results +PLOTS_DIR=$RESULTS_DIR/plots + +RUN_NAME=gemm_cores_double_buffer + +# Makefile settings (verif.mk) +export GUI=0 +export WORKLOAD_JSON=$VERIF_EXPL_DIR/config/workloads/workload_dma_gemm_cores.json +export TESTBENCH_JSON=$VERIF_DIR/config/testbench.json +# HARDWARE_JSON is swept in the loop below + +mkdir -p "$RESULTS_DIR" + +# For each hardware*.json, run simulation and parse transcript +for hardware_config in $VERIF_EXPL_DIR/config/hardware/hardware_*.json; do + make clean-verif + echo -e "\033[32;1mRunning simulation with hardware config: $hardware_config\033[0m" + export HARDWARE_JSON="$hardware_config" + make run-verif + python3 $VERIF_EXPL_DIR/scripts/parse_vsim.py --transcript $VERIF_DIR/vsim/transcript --out $RESULTS_DIR/$RUN_NAME/$(basename "$hardware_config" .json).json + # Copy html report + cp $VERIF_DIR/simvectors/generated/dataflow.html $RESULTS_DIR/$RUN_NAME/$(basename "$hardware_config" .json).html +done + +python3 $VERIF_EXPL_DIR/scripts/plot_sweep_results.py --results-dir $RESULTS_DIR/$RUN_NAME --out-dir $RESULTS_DIR/$RUN_NAME + +# Ideal run (manual) +# do not forget to change IDEAL_WORKLOAD_RUNTIME in plot_sweep_results.py +python3 $VERIF_EXPL_DIR/scripts/parse_vsim.py --transcript $VERIF_DIR/vsim/transcript --out $RESULTS_DIR/$RUN_NAME/ideal.json +cp $VERIF_DIR/simvectors/generated/dataflow.html $RESULTS_DIR/$RUN_NAME/ideal.html +# Regenerate plots with correct IDEAL_WORKLOAD_RUNTIME +python3 $VERIF_EXPL_DIR/scripts/plot_sweep_results.py --results-dir $RESULTS_DIR/$RUN_NAME --out-dir $RESULTS_DIR/$RUN_NAME diff --git a/target/verif/simvectors/README.md 
b/target/verif/simvectors/README.md index 97be798..472323b 100644 --- a/target/verif/simvectors/README.md +++ b/target/verif/simvectors/README.md @@ -1,23 +1,293 @@ -# HCI Stimuli Generator +# Simvectors Stimuli Generator -Python package for generating test stimuli for the HCI verification environment. +## Scope +`main.py` generates per-master stimuli vectors from: +- `workload.json` +- `hardware.json` +- `testbench.json` -## Usage +This README summarizes: +- available `mem_access_type` patterns and JSON parameters +- every source of `req=0` cycles +- read/write blocked-set behavior +- generated outputs and formats -### Generate Stimuli +## JSON Structure (workload) +Top-level: +- `log_masters`: list of narrow masters +- `hwpe_masters`: list of wide masters -```bash -python target/verif/simvectors/main.py --workload_config --testbench_config --hardware_config -``` +### Field Format Conventions +| Kind | Accepted JSON format | Unit / meaning | +|---|---|---| +| Address fields (`*_address`, `base`) | integer or string (`"1234"`, `"0x4000"`, `"101010"`) | byte address | +| Size fields (`*_size_bytes`, `chunk_bytes`, `tile_*_bytes`) | integer or numeric string | bytes | +| Strides (`stride0/1/2`) | integer | words (word = `DATA_WIDTH/8` for that master) | +| `row_stride_bytes` | integer or numeric string | bytes | +| `stride_beats` | integer | beats (beat = transaction width in bytes for that master) | +| Counters (`n_transactions`, `len_*`, `tiles`, `reads_per_row`, `writes_per_row`) | integer | count | +| Percentages (`traffic_pct`, `traffic_read_pct`, `read_pct`) | integer | percent | +| `wen` | `0` or `1` | `0`=write, `1`=read | -Or use the Makefile: -```bash -make stimuli -``` +### Master-Level Fields +| Field | Required | Default | Notes | +|---|---|---|---| +| `id` | no | positional index | Informational/consistency warning only. | +| `description` | no | empty | Human-readable label. 
| +| `start_delay_cycles` | no | `0` | Prepended `req=0` cycles before first pattern. | +| `patterns` | no | absent | If present, this list drives generation. | -## Configuration +Precedence/exclusivity: +- Master format is effectively either: + - flat single-pattern master (no `patterns`) + - or `patterns` list. +- If `patterns` exists, flat pattern fields at master level are ignored for traffic generation (except master-level fields like `start_delay_cycles`). -Configuration files are located in `target/verif/config/`: -- `hardware.json` - HCI hardware parameters (auto-generates `generated/hardware.mk`) -- `testbench.json` - Testbench parameters (auto-generates `generated/testbench.mk`) -- `workload.json` - Workload configuration with simulation parameters and master-specific settings +### Common Pattern Fields (apply to every pattern type) +| Field | Required | Default | Notes | +|---|---|---|---| +| `mem_access_type` | yes | none | Pattern selector. | +| `description` | no | empty | Label used in reports. | +| `job` | no | `"default"` | Dependency graph node name. | +| `wait_for_jobs` | no | `[]` | Inserts dependency gate before pattern. | +| `n_transactions` | conditional | derivable for many patterns | If omitted, derived when supported. | +| `traffic_pct` | no | `100` | Adds per-request idle shaping (`req=0`) on patterns that implement traffic shaping. | + +## Pattern Catalog +The tables below list pattern-specific fields. +Complete field set for a pattern = **common fields above + pattern-specific fields below**. +`Required = conditional` means: required unless the documented derivation path is present. + +### `idle` +No memory transaction. Emits idle and trailing `PAUSE`. + +Pattern-specific fields: none. + +### `random` +Uniform random over a region. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | yes | none | int | Not derivable for this pattern. 
| +| `region_base_address` | no | evenly partitioned per master | address (bytes) | | +| `region_size_bytes` | no | evenly partitioned per master | bytes | | +| `traffic_read_pct` | no | random R/W mix | % | If set, deterministic read/write split. | + +### `linear` +1D strided stream. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from `length` or `region_size_bytes`. | +| `length` | no | none | int | Alias source for derived `n_transactions`. | +| `start_address` | no | `"0"` | address (bytes) | If absent and `region_base_address` exists, uses `region_base_address`. | +| `stride0` | no | `0` (or `1` if `region_size_bytes` set) | words | | +| `region_base_address` | no | evenly partitioned per master | address (bytes) | Used for region context and start fallback. | +| `region_size_bytes` | no | evenly partitioned per master | bytes | Can derive `n_transactions`. | +| `traffic_read_pct` | no | random R/W mix | % | | + +### `2d` +2D walk. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from `len_d0 * len_d1`. | +| `start_address` | no | `"0"` | address (bytes) | | +| `stride0` | no | `0` | words | | +| `len_d0` | conditional | none | int | | +| `stride1` | no | `0` | words | | +| `len_d1` | conditional | none | int | | +| `idle_cycles_between_phases` | no | `0` | cycles | Inserts explicit boundary idles. | + +### `3d` +3D walk. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from `len_d0 * len_d1 * len_d2`. 
| +| `start_address` | no | `"0"` | address (bytes) | | +| `stride0` | no | `0` | words | | +| `len_d0` | conditional | none | int | | +| `stride1` | no | `0` | words | | +| `len_d1` | conditional | none | int | | +| `stride2` | no | `0` | words | | +| `len_d2` | conditional | none | int | | +| `idle_cycles_between_phases` | no | `0` | cycles | Inserts explicit boundary idles. | + +### `matmul_phased` (alias: `matmul`) +Phased A-read / B-read / C-write traffic. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `n_transactions` | conditional | derived | int | Derivable from region size or matrix dims. | +| `region_base_address`, `region_size_bytes` | conditional | evenly partitioned | bytes | Combined region (auto A/B/C split). | +| `matrix_m`, `matrix_n`, `matrix_k` | no | none | int | Alternative source for derived `n_transactions`. | +| `region_base_address_a/b/c`, `region_size_bytes_a/b/c` | no | none | bytes | Explicit per-phase regions. | +| `matmul_ratio_a/b/c` | no | `1/1/1` | relative weights | | +| `idle_cycles_between_phases` | no | `0` | cycles | Inserts explicit phase-boundary idles. | + +Mutual exclusivity / precedence: +- If explicit `*_a/b/c` regions are provided, they take precedence over combined-region auto-split. + +### `multi_linear` +Multiple subregions, schedule-driven interleave. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `regions` | yes | none | array | Each entry has `base`, `size_bytes`, optional `stride_words`, `read_pct`. | +| `schedule` | no | `round_robin` | string | | +| `burst_len` | no | `1` | int | | +| `n_transactions` | conditional | derived | int | Derivable from sum of region sizes. | + +### `bank_group_linear` +Linear stream constrained by bank group phase controls. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `start_bank` | yes | none | int | 0-based bank index. 
| +| `bank_group_span` | yes | none | int | Number of banks in active group. | +| `stride_beats` | no | `1` | beats | | +| `bank_group_hop` | no | `0` | int | Group-phase hop per wrap. | +| `wen` | no | mixed R/W | `0` or `1` | Fixed direction if set. | +| `n_transactions` | yes | none | int | Required for this pattern. | + +### `rw_rowwise` +Per-row read phase then write phase. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `row_base_address` | yes | none | address (bytes) | | +| `row_size_bytes` | yes | none | bytes | | +| `n_rows` | yes | none | int | | +| `row_stride_bytes` | yes | none | bytes | | +| `reads_per_row` | yes | none | int | | +| `writes_per_row` | yes | none | int | | +| `idle_cycles_between_rows` | no | `0` | cycles | Inserts explicit row-boundary idles. | +| `n_transactions` | conditional | derived | int | Derivable as `n_rows * (reads_per_row + writes_per_row)`. | + +### `gather_scatter` +Gather from multiple read regions, scatter to write region. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `read_regions` | yes | none | array | Each entry: `base`, `size_bytes`. | +| `write_region` | yes | none | object | `base`, `size_bytes`. | +| `chunk_bytes` | no | transaction width | bytes | Address increment granularity. | +| `schedule` | no | `4read_1write` | string | | +| `n_transactions` | conditional | derived | int | Derivable from region sizes and chunk. | + +### `matmul_tiled_interleave` (alias: `matmul_tiled`) +Tile-like interleaving among A/B/C streams. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `region_base_address`, `region_size_bytes` | no | evenly partitioned | bytes | Used as fallback context for auto split when explicit A/B/C are absent. | +| `region_base_address_a/b/c`, `region_size_bytes_a/b/c` | conditional | fallback split | bytes | Preferred explicit mode. 
| +| `tile_a_bytes`, `tile_b_bytes`, `tile_c_bytes` | no | transaction width | bytes | Tile step payloads per stream. | +| `tiles` | no | `1` | int | | +| `ab_c_schedule` | no | `A_B_C` | string | | +| `idle_cycles_between_tiles` | no | `0` | cycles | Inserts explicit tile-boundary idles. | +| `n_transactions` | conditional | derived | int | Can be derived from tile parameters/schedule. | + +Mutual exclusivity / precedence: +- Preferred: explicit A/B/C regions. +- If missing, generator falls back to splitting combined region context. + +### `hotspot_random` +Weighted random traffic across hot regions. + +| Field | Required | Default | Format / unit | Notes | +|---|---|---|---|---| +| `hot_regions` | yes | none | array | Each entry: `base`, `size_bytes`, optional `weight` (default `1`). | +| `n_transactions` | no | weak fallback | int | Prefer setting explicitly. | +| `traffic_read_pct` | no | random R/W mix | % | | + +## All Sources of `req=0` Cycles +`req=0` can be generated by: + +1. `traffic_pct` shaping +- For every emitted request, inserts `idles_per_req = round((100-pct)/pct)` idle lines. +- Applies in random/linear and all new patterns that support `traffic_pct`. + +2. Boundary idle knobs (JSON) +- `idle_cycles_between_phases` in `2d`, `3d`, `matmul_phased` +- `idle_cycles_between_rows` in `rw_rowwise` +- `idle_cycles_between_tiles` in `matmul_tiled_interleave` + +3. `start_delay_cycles` (per master) +- Prepends idle lines before first pattern. + +4. Dependency gate for `wait_for_jobs` +- For each dependent pattern, generator inserts a synthetic idle+`PAUSE` gate before real traffic. + +5. `idle` pattern +- Explicitly emits idle and `PAUSE`. 
+ +## Read/Write Blocked-Set Functionality +Address filtering is implemented inside pattern generators via: +- `_is_allowed(add, wen, read_blocked_set, write_blocked_set)` +- `_record_access(add, wen, read_blocked_set, write_blocked_set)` + +Behavior (within one pattern invocation): +- Read checks (`wen=1`) consult `read_blocked_set`. +- Write checks (`wen=0`) consult `write_blocked_set`. +- On every emitted access, the address is added to `write_blocked_set`. +- On emitted writes only, the address is also added to `read_blocked_set`. + +Effective policy: +- read after read: allowed +- write after read: blocked +- read after write: blocked +- write after write: blocked + +Notes: +- Blocking state is pattern-local (it does not persist across patterns). +- Generators are strict about transaction count: each non-idle pattern must emit exactly `n_transactions` (`N_TEST`). +- If blocking rules make the requested count unreachable for a pattern, generation fails with an explicit error instead of silently under-emitting. + +## Outputs + +### 1. Stimuli vectors +Path: +- `target/verif/simvectors/generated/stimuli/master_log_.txt` +- `target/verif/simvectors/generated/stimuli/master_hwpe_.txt` + +Per-cycle vector line format: +- `req id wen data add` +- `req`: `1` active request, `0` idle +- `id`: request ID (`IW` bits) +- `wen`: `1` read, `0` write +- `data`: payload (`DATA_WIDTH` bits for narrow, `HWPE_WIDTH_FACT*DATA_WIDTH` for wide) +- `add`: byte address (`ADD_WIDTH` bits) + +Fence token: +- A standalone line `PAUSE` is emitted at end of each pattern segment. + +### 2. Memory map report +Path: +- `target/verif/simvectors/generated/memory_map.txt` + +Contains: +- per-pattern region and traffic summary +- dependency/fence map +- temporal schedule summary +- region lifetimes and overlaps context + +### 3. 
Dataflow visualization +Path: +- `target/verif/simvectors/generated/dataflow.html` + +Contains: +- execution timeline (SVG) +- region-map blocks +- per-region usage cards +- overlap table + +### 4. Optional outputs +- `--golden`: emits expected read-data vectors under `generated/golden/` +- `--emit_phases_mk `: emits fence/dependency Makefile fragment + +## Recommended Extra Documentation +- one minimal JSON example per pattern +- exact dependency semantics for `job` / `wait_for_jobs` with 2-3 pattern chain examples +- known caveat: timeline model in report is simplified and may differ from full RTL contention timing diff --git a/target/verif/simvectors/hci_stimuli/__init__.py b/target/verif/simvectors/hci_stimuli/__init__.py index 3f4cd89..64b9b07 100644 --- a/target/verif/simvectors/hci_stimuli/__init__.py +++ b/target/verif/simvectors/hci_stimuli/__init__.py @@ -6,7 +6,5 @@ """ from .generator import StimuliGenerator -from .processor import unfold_raw_txt, pad_txt_files - -__all__ = ['StimuliGenerator', 'unfold_raw_txt', 'pad_txt_files'] +__all__ = ['StimuliGenerator'] diff --git a/target/verif/simvectors/hci_stimuli/generator.py b/target/verif/simvectors/hci_stimuli/generator.py index ca5e2d0..52993f5 100644 --- a/target/verif/simvectors/hci_stimuli/generator.py +++ b/target/verif/simvectors/hci_stimuli/generator.py @@ -1,186 +1,84 @@ -"""Stimuli Generator class and access-pattern generators.""" -import random -import os - -class StimuliGenerator: - def __init__(self,IW,WIDTH_OF_MEMORY,N_BANKS,TOT_MEM_SIZE,DATA_WIDTH,ADD_WIDTH,filepath,N_TEST,EXACT_OR_MAX_OFFSET,CYCLE_OFFSET,MASTER_NUMBER_IDENTIFICATION): - self.WIDTH_OF_MEMORY = WIDTH_OF_MEMORY - self.WIDTH_OF_MEMORY_BYTE = int(WIDTH_OF_MEMORY/8) - self.N_BANKS = N_BANKS - self.TOT_MEM_SIZE = TOT_MEM_SIZE - self.DATA_WIDTH = DATA_WIDTH - self.ADD_WIDTH = int(ADD_WIDTH) - self.filepath = filepath - os.makedirs(os.path.dirname(filepath),exist_ok=True) - self.N_TEST = N_TEST - self.EXACT_OR_MAX_OFFSET = 
EXACT_OR_MAX_OFFSET - self.CYCLE_OFFSET = CYCLE_OFFSET - self.IW = IW - self.MASTER_NUMBER_IDENTIFICATION = MASTER_NUMBER_IDENTIFICATION - - def random_data(self): - data_decimal = random.randint(0, (2**(self.DATA_WIDTH))-1) # generate random data - data = bin(data_decimal)[2:].zfill(self.DATA_WIDTH) - return data - - def data_wen_offset(self): - wen = random.randint(0,1) # write enable signal (1 = read, 0 = write) - if (self.EXACT_OR_MAX_OFFSET): - cycle_offset = random.randint(1,self.CYCLE_OFFSET) - else: - cycle_offset = self.CYCLE_OFFSET - if wen: - data = "0" * self.DATA_WIDTH - else: - data = self.random_data() - return data, wen, cycle_offset - - - def random_gen(self,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - for test in range(self.N_TEST): - data, wen, cycle_offset = self.data_wen_offset() - while True: - add_decimal = int((random.randint(0, int((self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE)/self.WIDTH_OF_MEMORY_BYTE)))*(self.WIDTH_OF_MEMORY_BYTE)) # generate a random word-aligned memory address. 
- if add_decimal > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - add_decimal = add_decimal - self.TOT_MEM_SIZE*1024 #rolls over - add = bin(add_decimal)[2:].zfill(self.ADD_WIDTH) - - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - break - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - return id - - def linear_gen(self,stride0,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - next_address = int(start_address,2) - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - for test in range(self.N_TEST): - data, wen, cycle_offset = self.data_wen_offset() - - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - next_address += (self.WIDTH_OF_MEMORY_BYTE)*stride0 #word-aligned memory address - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - break - file.write(bin(id)[2:].zfill(self.IW) + " " + 
str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - - return id - - def gen_2d(self,stride0,len_d0,stride1,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - start_address = int(start_address,2) - next_address = start_address - j = 0 - STOP = 0 - while True: - for i in range(len_d0): - data, wen, cycle_offset = self.data_wen_offset() - next_address = start_address + i*(self.WIDTH_OF_MEMORY_BYTE)*stride0 + j*(self.WIDTH_OF_MEMORY_BYTE)*stride1 #word-aligned memory address - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - if STOP: - break - j = j + 1 - - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - return id - - def 
gen_3d(self,stride0,len_d0,stride1,len_d1,stride2,start_address,id_start,LIST_OF_FORBIDDEN_ADDRESSES_READ,LIST_OF_FORBIDDEN_ADDRESSES_WRITE): - id = id_start - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW = [] - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW = [] - with open(self.filepath, 'w', encoding="ascii") as file: #write an ascii file for each port of each generator - start_address = int(start_address,2) - next_address = start_address - k = 0 - STOP = 0 - while True: - for j in range(len_d1): - for i in range(len_d0): - data, wen, cycle_offset = self.data_wen_offset() - next_address = start_address + i*(self.WIDTH_OF_MEMORY_BYTE)*stride0 + j*(self.WIDTH_OF_MEMORY_BYTE)*stride1 + k*(self.WIDTH_OF_MEMORY_BYTE)*stride2 #word-aligned memory address - if next_address > self.TOT_MEM_SIZE*1024-self.WIDTH_OF_MEMORY_BYTE : - next_address = next_address - self.TOT_MEM_SIZE*1024 #rolls over - add = bin(next_address)[2:].zfill(self.ADD_WIDTH) - - if wen: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_READ: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - else: - if add not in LIST_OF_FORBIDDEN_ADDRESSES_WRITE: - LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW.append(add) - LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW.append(add) - file.write(bin(id)[2:].zfill(self.IW) + " " + str(cycle_offset) + " " + str(wen) + " " + data + " " + add + "\n") - id = id + 1 - if id - id_start >= self.N_TEST : - STOP = 1 - break - if STOP: - break - if STOP: - break - k = k + 1 - - LIST_OF_FORBIDDEN_ADDRESSES_READ.extend(LIST_OF_FORBIDDEN_ADDRESSES_READ_NEW) - LIST_OF_FORBIDDEN_ADDRESSES_WRITE.extend(LIST_OF_FORBIDDEN_ADDRESSES_WRITE_NEW) - return id - - +"""StimuliGenerator: infrastructure for writing cycle-accurate stimuli files. 
+ +Each output file has one line per simulation cycle in the format: + req(1b) id(IWb) wen(1b) data(Nb) add(Ab) + +req=0 means no transaction that cycle (id/wen/data/add are don't-cares). +req=1 means an active transaction. +""" + +import os +import random + +from .patterns import PatternsMixin + + +class StimuliGenerator(PatternsMixin): + def __init__( + self, + IW, + WIDTH_OF_MEMORY, + N_BANKS, + TOT_MEM_SIZE, + DATA_WIDTH, + ADD_WIDTH, + filepath, + N_TEST, + MASTER_NUMBER_IDENTIFICATION, + ): + self.WIDTH_OF_MEMORY = WIDTH_OF_MEMORY + self.WIDTH_OF_MEMORY_BYTE = int(WIDTH_OF_MEMORY / 8) + self.N_BANKS = N_BANKS + self.TOT_MEM_SIZE = TOT_MEM_SIZE + self.DATA_WIDTH = DATA_WIDTH + self.ADD_WIDTH = int(ADD_WIDTH) + self.filepath = filepath + os.makedirs(os.path.dirname(filepath), exist_ok=True) + self.N_TEST = N_TEST + self.IW = IW + self.MASTER_NUMBER_IDENTIFICATION = MASTER_NUMBER_IDENTIFICATION + + def _format_id(self, id_value): + return bin(id_value % (1 << self.IW))[2:].zfill(self.IW) + + def random_data(self): + data_decimal = random.randint(0, (2 ** self.DATA_WIDTH) - 1) + return bin(data_decimal)[2:].zfill(self.DATA_WIDTH) + + def _write_req(self, file_obj, id_value, wen, data, add): + """Write one active-request line (req=1).""" + file_obj.write( + "1 " + + self._format_id(id_value) + + " " + + str(wen) + + " " + + data + + " " + + add + + "\n" + ) + + def _write_idle(self, file_obj): + """Write one idle line (req=0).""" + file_obj.write( + "0 " + + "0" * self.IW + + " 0 " + + "0" * self.DATA_WIDTH + + " " + + "0" * self.ADD_WIDTH + + "\n" + ) + + def _write_pause(self, file_obj): + """Write a PAUSE fence token line.""" + file_obj.write("PAUSE\n") + + def data_wen(self): + wen = random.randint(0, 1) # 1=read, 0=write + if wen: + data = "0" * self.DATA_WIDTH + else: + data = self.random_data() + return data, wen diff --git a/target/verif/simvectors/hci_stimuli/patterns.py b/target/verif/simvectors/hci_stimuli/patterns.py new file mode 100644 index 
0000000..cf080fe --- /dev/null +++ b/target/verif/simvectors/hci_stimuli/patterns.py @@ -0,0 +1,850 @@ +"""Access-pattern generators for StimuliGenerator. + +Each method writes a cycle-accurate stimuli file directly (one line per cycle): + req(1b) id(IWb) wen(1b) data(Nb) add(Ab) + +req=0 lines are idle cycles. req=1 lines are active transactions. + +Fence semantics (one trailing PAUSE per pattern): + Each pattern ends with a PAUSE. fence_idx[i] increments when resume_i fires while + fence_reached_o is high (i.e. while the driver is sitting at the PAUSE). + fence_idx[i] >= k means driver i has been granted to leave fence k-1, + i.e. pattern k-1 is complete and driver i is about to start pattern k. + + resume_i fires when the dependencies of the NEXT pattern are satisfied. + So resume_i = "your next job's inputs are ready, proceed". + + Trailing PAUSE of the last pattern has mask=0 → resume_i fires in one cycle + → fence_idx advances to N_patterns, signalling final completion to dependents. + +All generators accept append=True to open the file in append mode. 
+""" + +import random + +class PatternsMixin: + + @staticmethod + def _parse_address(addr_str): + s = str(addr_str) + if s.startswith('0x') or s.startswith('0X'): return int(s, 16) + if set(s) <= {'0', '1'}: return int(s, 2) + return int(s, 0) + + @staticmethod + def _align_down(value, alignment): + if alignment <= 0: return value + return (value // alignment) * alignment + + @staticmethod + def _phase_counts(total_ops, ratio_a, ratio_b, ratio_c): + ra = max(0, int(ratio_a)); rb = max(0, int(ratio_b)); rc = max(0, int(ratio_c)) + if ra == 0 and rb == 0 and rc == 0: ra, rb, rc = 1, 1, 1 + s = ra + rb + rc + ca = (total_ops * ra) // s; cb = (total_ops * rb) // s; cc = total_ops - ca - cb + if total_ops >= 3: + if ra > 0 and ca == 0: ca = 1; cc = max(0, cc-1) + if rb > 0 and cb == 0: cb = 1; cc = max(0, cc-1) + if rc > 0 and cc == 0: + if ca > 1: ca -= 1; cc = 1 + elif cb > 1: cb -= 1; cc = 1 + return ca, cb, cc + + @staticmethod + def _idles_per_req(traffic_pct): + traffic_pct = max(1, min(100, int(traffic_pct))) + return 0 if traffic_pct >= 100 else int(round((100 - traffic_pct) / traffic_pct)) + + def _is_allowed(self, add, wen, read_blocked_set, write_blocked_set): + return add not in (read_blocked_set if wen else write_blocked_set) + + def _record_access(self, add, wen, read_blocked_set, write_blocked_set): + write_blocked_set.add(add) + if not wen: + read_blocked_set.add(add) + + @staticmethod + def _init_blocked_sets(read_blocked, write_blocked): + return set(read_blocked or []), set(write_blocked or []) + + @staticmethod + def _extend_unique_sorted(target, values): + if not isinstance(target, list): + return + known = set(target) + for v in sorted(values): + if v in known: + continue + target.append(v) + known.add(v) + + def _commit_blocked_sets(self, read_blocked, write_blocked, read_blocked_set, write_blocked_set): + self._extend_unique_sorted(read_blocked, read_blocked_set) + self._extend_unique_sorted(write_blocked, write_blocked_set) + + def 
_require_exact_emits(self, pattern_name, id_start, id_value): + emitted = int(id_value - id_start) + expected = int(self.N_TEST) + if emitted == expected: + return + raise RuntimeError( + f"{pattern_name}: emitted {emitted} transaction(s), expected {expected}. " + "Adjust region/shape/traffic to satisfy the read/write blocked policy." + ) + + def _open(self, append): + return open(self.filepath, "a" if append else "w", encoding="ascii") + + def _total_mem_bytes(self): + return int(self.TOT_MEM_SIZE * 1024) + + def _normalize_addr(self, addr): + total = self._total_mem_bytes() + if total <= self.WIDTH_OF_MEMORY_BYTE: + return 0 + max_addr = total - self.WIDTH_OF_MEMORY_BYTE + a = int(addr) % total + if a > max_addr: + a = max_addr + return a + + @staticmethod + def _parse_read_write_schedule(schedule, default="4read_1write"): + raw = str(schedule if schedule is not None else default).strip().lower() + if not raw: + raw = default + tokens = [] + for chunk in raw.replace("-", "_").split("_"): + c = chunk.strip() + if not c: + continue + count = 1 + word = c + if c[0].isdigit(): + i = 0 + while i < len(c) and c[i].isdigit(): + i += 1 + count = max(1, int(c[:i])) + word = c[i:] + elif c[-1].isdigit(): + i = len(c) - 1 + while i >= 0 and c[i].isdigit(): + i -= 1 + count = max(1, int(c[i + 1:])) + word = c[:i + 1] + word = word.strip() + if word in {"read", "r"}: + tokens.extend(["R"] * count) + elif word in {"write", "w"}: + tokens.extend(["W"] * count) + if not tokens: + return ["R", "R", "R", "R", "W"] + return tokens + + @staticmethod + def _parse_abc_schedule(schedule, default="A_B_C"): + raw = str(schedule if schedule is not None else default).strip().upper() + if not raw: + raw = default + tokens = [] + for chunk in raw.replace("-", "_").split("_"): + c = chunk.strip() + if not c: + continue + count = 1 + letter = c + if c[0].isdigit(): + i = 0 + while i < len(c) and c[i].isdigit(): + i += 1 + count = max(1, int(c[:i])) + letter = c[i:] + elif c[-1].isdigit(): + i 
= len(c) - 1 + while i >= 0 and c[i].isdigit(): + i -= 1 + count = max(1, int(c[i + 1:])) + letter = c[:i + 1] + letter = letter.strip().upper() + if letter in {"A", "B", "C"}: + tokens.extend([letter] * count) + if not tokens: + return ["A", "B", "C"] + return tokens + + # ------------------------------------------------------------------ # + # Access patterns — each writes: transactions | PAUSE # + # ------------------------------------------------------------------ # + + def random_gen(self, id_start, read_blocked, write_blocked, + region_base=0, region_size=None, traffic_pct=100, + traffic_read_pct=None, append=False): + total = int(self.TOT_MEM_SIZE * 1024) + if region_size is None: region_size = total + region_size = min(region_size, total - region_base) + n_words = max(1, region_size // self.WIDTH_OF_MEMORY_BYTE) + n_idles = self._idles_per_req(traffic_pct) + if traffic_read_pct is not None: + rpct = max(0, min(100, int(traffic_read_pct))) + n_reads = (self.N_TEST * rpct) // 100 + wen_seq = [1]*n_reads + [0]*(self.N_TEST-n_reads) + else: + wen_seq = None + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + max_attempts = max(1, n_words * 4) + with self._open(append) as f: + for i in range(self.N_TEST): + wen = wen_seq[i] if wen_seq is not None else None + if wen is None: data, wen = self.data_wen() + else: data = "0"*self.DATA_WIDTH if wen else self.random_data() + placed = False + for _ in range(max_attempts): + ad = region_base + random.randint(0, int(n_words)-1)*self.WIDTH_OF_MEMORY_BYTE + add = bin(int(ad))[2:].zfill(self.ADD_WIDTH) + if self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + self._record_access(add, wen, read_blocked_set, write_blocked_set) + placed = True + break + if not placed: + continue + self._write_req(f, id_value, wen, data, add); id_value += 1 + for _ in range(n_idles): self._write_idle(f) + self._write_pause(f) + self._commit_blocked_sets(read_blocked, 
write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("random", id_start, id_value) + return id_value + + def linear_gen(self, stride0, start_address, id_start, read_blocked, write_blocked, + traffic_pct=100, traffic_read_pct=None, append=False): + n_idles = self._idles_per_req(traffic_pct) + if traffic_read_pct is not None: + rpct = max(0, min(100, int(traffic_read_pct))) + n_reads = (self.N_TEST * rpct) // 100 + wen_seq = [1]*n_reads + [0]*(self.N_TEST-n_reads) + else: + wen_seq = None + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + with self._open(append) as f: + addr = self._parse_address(start_address) + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + for i in range(self.N_TEST): + wen = wen_seq[i] if wen_seq is not None else None + if wen is None: data, wen = self.data_wen() + else: data = "0"*self.DATA_WIDTH if wen else self.random_data() + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + addr += self.WIDTH_OF_MEMORY_BYTE * stride0 + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add); id_value += 1 + for _ in range(n_idles): self._write_idle(f) + self._write_pause(f) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("linear", id_start, id_value) + return id_value + + def gen_2d(self, stride0, len_d0, stride1, start_address, id_start, + read_blocked, write_blocked, idle_cycles_between_phases=0, append=False): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + with self._open(append) as f: + base = self._parse_address(start_address); j = 0 + while 
id_value - id_start < self.N_TEST: + emitted_before = id_value + for i in range(len_d0): + data, wen = self.data_wen() + addr = base + i*self.WIDTH_OF_MEMORY_BYTE*stride0 + j*self.WIDTH_OF_MEMORY_BYTE*stride1 + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add); id_value += 1 + if id_value - id_start >= self.N_TEST: break + for _ in range(idle_cycles_between_phases): self._write_idle(f) + if id_value == emitted_before: + break + j += 1 + self._write_pause(f) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("2d", id_start, id_value) + return id_value + + def gen_3d(self, stride0, len_d0, stride1, len_d1, stride2, start_address, id_start, + read_blocked, write_blocked, idle_cycles_between_phases=0, append=False): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + with self._open(append) as f: + base = self._parse_address(start_address); k = 0 + while id_value - id_start < self.N_TEST: + emitted_before = id_value + for j in range(len_d1): + for i in range(len_d0): + data, wen = self.data_wen() + addr = base + i*self.WIDTH_OF_MEMORY_BYTE*stride0 + j*self.WIDTH_OF_MEMORY_BYTE*stride1 + k*self.WIDTH_OF_MEMORY_BYTE*stride2 + if addr > self.TOT_MEM_SIZE*1024 - self.WIDTH_OF_MEMORY_BYTE: + addr -= self.TOT_MEM_SIZE*1024 + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add); id_value += 1 + if id_value - id_start >= self.N_TEST: break + if id_value - id_start >= 
self.N_TEST: break + for _ in range(idle_cycles_between_phases): self._write_idle(f) + if id_value == emitted_before: + break + k += 1 + self._write_pause(f) + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("3d", id_start, id_value) + return id_value + + def idle_gen(self, id_start, append=False): + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + return id_start + + def matmul_phased_gen(self, id_start, read_blocked, write_blocked, + region_base_address, region_size_bytes, + matmul_ratio_a=1, matmul_ratio_b=1, matmul_ratio_c=1, + traffic_pct=100, + idle_cycles_between_phases=0, + region_base_address_a=None, region_size_bytes_a=None, + region_base_address_b=None, region_size_bytes_b=None, + region_base_address_c=None, region_size_bytes_c=None, + append=False): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = max(1, self.DATA_WIDTH // 8); tm = int(self.TOT_MEM_SIZE * 1024) + n_idles = self._idles_per_req(traffic_pct) + + def _res(bo, so, fb, fs): + b = self._align_down(int(bo if bo is not None else fb), ab) + s = self._align_down(int(so if so is not None else fs), ab) + if b+s > tm: s = self._align_down(tm-b, ab) + return b, s + + if region_base_address_a is not None and region_size_bytes_a is not None: + a_base, a_size = _res(region_base_address_a, region_size_bytes_a, 0, 0) + b_base, b_size = _res(region_base_address_b, region_size_bytes_b, a_base, a_size) + c_base, c_size = _res(region_base_address_c, region_size_bytes_c, a_base, a_size) + else: + base = self._align_down(int(region_base_address), ab) + size = self._align_down(int(region_size_bytes), ab) + if base+size > tm: size = self._align_down(tm-base, ab) + rw = size // ab + if rw < 3: + with self._open(append) as f: self._write_idle(f); self._write_pause(f) + self._require_exact_emits("matmul_phased", id_start, id_value) + return 
id_value + aw=max(1,rw//3); bw=max(1,rw//3); cw=rw-aw-bw + a_base=base; a_size=aw*ab; b_base=a_base+a_size; b_size=bw*ab + c_base=b_base+b_size; c_size=cw*ab + + ca, cb, cc = self._phase_counts(self.N_TEST, matmul_ratio_a, matmul_ratio_b, matmul_ratio_c) + + def _emit(fobj, count, wen, pb, pe): + nonlocal id_value + addr = pb + for _ in range(count): + data = "0"*self.DATA_WIDTH if wen else self.random_data() + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + addr += ab + if addr >= pe: + addr = pb + continue + self._write_req(fobj, id_value, wen, data, add) + self._record_access(add, wen, read_blocked_set, write_blocked_set) + id_value += 1; addr += ab + if addr >= pe: addr = pb + for _ in range(n_idles): self._write_idle(fobj) + + with self._open(append) as f: + _emit(f, ca, 1, a_base, a_base+a_size) + if ca > 0 and (cb > 0 or cc > 0): + for _ in range(idle_cycles_between_phases): self._write_idle(f) + _emit(f, cb, 1, b_base, b_base+b_size) + if cb > 0 and cc > 0: + for _ in range(idle_cycles_between_phases): self._write_idle(f) + _emit(f, cc, 0, c_base, c_base+c_size) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("matmul_phased", id_start, id_value) + return id_value + + def multi_linear_gen( + self, + id_start, + read_blocked, + write_blocked, + regions, + schedule="round_robin", + burst_len=1, + traffic_pct=100, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + burst = max(1, int(burst_len)) + + norm_regions = [] + for reg in regions or []: + base = self._align_down(int(reg.get("base", 0)), ab) + size = self._align_down(int(reg.get("size_bytes", 0)), ab) + if size <= 0: + continue + if base >= tm: + 
base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + if size <= 0: + continue + stride_words = max(1, int(reg.get("stride_words", 1))) + read_pct = reg.get("read_pct") + if read_pct is not None: + read_pct = max(0, min(100, int(read_pct))) + norm_regions.append({ + "base": base, + "size": size, + "stride_words": stride_words, + "read_pct": read_pct, + "offset": 0, + }) + + if not norm_regions: + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + self._require_exact_emits("multi_linear", id_start, id_value) + return id_value + + rr = 0 + stalled_rounds = 0 + with self._open(append) as f: + while id_value - id_start < self.N_TEST: + emitted_before = id_value + reg = norm_regions[rr % len(norm_regions)] + rr += 1 + chunk = burst if str(schedule).strip().lower() == "round_robin" else max(1, self.N_TEST) + for _ in range(chunk): + if id_value - id_start >= self.N_TEST: + break + addr = reg["base"] + reg["offset"] + add = bin(self._normalize_addr(addr))[2:].zfill(self.ADD_WIDTH) + if reg["read_pct"] is None: + data, wen = self.data_wen() + else: + wen = 1 if random.randint(1, 100) <= reg["read_pct"] else 0 + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + step = reg["stride_words"] * ab + reg["offset"] = (reg["offset"] + step) % reg["size"] + if id_value == emitted_before: + stalled_rounds += 1 + if stalled_rounds >= len(norm_regions): + break + else: + stalled_rounds = 0 + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("multi_linear", id_start, id_value) + return id_value + + def bank_group_linear_gen( + self, + id_start, + read_blocked, + write_blocked, + 
start_bank, + bank_group_span, + stride_beats=1, + bank_group_hop=0, + wen=None, + traffic_pct=100, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + span = max(1, min(int(bank_group_span), int(self.N_BANKS))) + start_bank = int(start_bank) % max(1, int(self.N_BANKS)) + stride = max(1, int(stride_beats)) + hop = max(0, int(bank_group_hop)) + + with self._open(append) as f: + for tx in range(self.N_TEST): + phase = tx * stride + group_idx = phase // span + bank_base = (start_bank + group_idx * hop * span) % self.N_BANKS + bank = (bank_base + (phase % span)) % self.N_BANKS + row = group_idx + word_idx = row * self.N_BANKS + bank + addr = self._normalize_addr(word_idx * ab) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + if wen is None: + data, wen_cur = self.data_wen() + else: + wen_cur = 1 if int(wen) else 0 + data = "0" * self.DATA_WIDTH if wen_cur else self.random_data() + if not self._is_allowed(add, wen_cur, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen_cur, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen_cur, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("bank_group_linear", id_start, id_value) + return id_value + + def rw_rowwise_gen( + self, + id_start, + read_blocked, + write_blocked, + row_base_address, + row_size_bytes, + n_rows, + row_stride_bytes, + reads_per_row, + writes_per_row, + traffic_pct=100, + idle_cycles_between_rows=0, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + n_idles = 
self._idles_per_req(traffic_pct) + base = self._align_down(int(row_base_address), ab) + row_size = max(ab, self._align_down(int(row_size_bytes), ab)) + row_stride = max(ab, self._align_down(int(row_stride_bytes), ab)) + n_rows = max(0, int(n_rows)) + reads_per_row = max(0, int(reads_per_row)) + writes_per_row = max(0, int(writes_per_row)) + + with self._open(append) as f: + for r in range(n_rows): + if id_value - id_start >= self.N_TEST: + break + row_base = self._normalize_addr(base + r * row_stride) + for i in range(reads_per_row): + if id_value - id_start >= self.N_TEST: + break + addr = self._normalize_addr(row_base + (i * ab) % row_size) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + wen = 1 + data = "0" * self.DATA_WIDTH + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + for i in range(writes_per_row): + if id_value - id_start >= self.N_TEST: + break + addr = self._normalize_addr(row_base + (i * ab) % row_size) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + wen = 0 + data = self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + if r < n_rows - 1: + for _ in range(max(0, int(idle_cycles_between_rows))): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("rw_rowwise", id_start, id_value) + return id_value + + def gather_scatter_gen( + self, + id_start, + read_blocked, + write_blocked, + read_regions, + write_region, + chunk_bytes=0, + schedule="4read_1write", + traffic_pct=100, + append=False, + ): + 
id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + chunk_val = ab if chunk_bytes is None else int(chunk_bytes) + step = max(ab, self._align_down(chunk_val if chunk_val > 0 else ab, ab)) + tokens = self._parse_read_write_schedule(schedule) + + reads = [] + for reg in read_regions or []: + base = self._align_down(int(reg.get("base", 0)), ab) + size = self._align_down(int(reg.get("size_bytes", 0)), ab) + if size <= 0: + continue + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + if size <= 0: + continue + reads.append({"base": base, "size": size, "offset": 0}) + + wb = self._align_down(int((write_region or {}).get("base", 0)), ab) + ws = self._align_down(int((write_region or {}).get("size_bytes", 0)), ab) + if wb >= tm: + wb %= tm + if wb + ws > tm: + ws = self._align_down(tm - wb, ab) + + if not reads and ws <= 0: + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + self._require_exact_emits("gather_scatter", id_start, id_value) + return id_value + + read_rr = 0 + token_idx = 0 + write_offset = 0 + max_no_progress = max(32, len(tokens) * max(1, len(reads) + (1 if ws > 0 else 0))) + no_progress_iters = 0 + with self._open(append) as f: + while id_value - id_start < self.N_TEST: + token = tokens[token_idx % len(tokens)] + token_idx += 1 + wen = 1 if token == "R" else 0 + if token == "R" and reads: + reg = reads[read_rr % len(reads)] + read_rr += 1 + addr = self._normalize_addr(reg["base"] + reg["offset"]) + reg["offset"] = (reg["offset"] + step) % reg["size"] + elif ws > 0: + addr = self._normalize_addr(wb + write_offset) + write_offset = (write_offset + step) % ws + elif reads: + reg = reads[read_rr % len(reads)] + read_rr += 1 + wen = 1 + addr = self._normalize_addr(reg["base"] + reg["offset"]) + reg["offset"] = (reg["offset"] 
+ step) % reg["size"] + else: + break + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + no_progress_iters += 1 + if no_progress_iters >= max_no_progress: + break + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + no_progress_iters = 0 + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("gather_scatter", id_start, id_value) + return id_value + + def matmul_tiled_interleave_gen( + self, + id_start, + read_blocked, + write_blocked, + region_base_address_a, + region_size_bytes_a, + region_base_address_b, + region_size_bytes_b, + region_base_address_c, + region_size_bytes_c, + tile_a_bytes=0, + tile_b_bytes=0, + tile_c_bytes=0, + tiles=1, + ab_c_schedule="A_B_C", + traffic_pct=100, + idle_cycles_between_tiles=0, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + tile_idle = max(0, int(idle_cycles_between_tiles)) + tokens = self._parse_abc_schedule(ab_c_schedule) + + def _res(base_raw, size_raw): + base = self._align_down(int(base_raw), ab) + size = self._align_down(int(size_raw), ab) + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + return base, size + + a_base, a_size = _res(region_base_address_a, region_size_bytes_a) + b_base, b_size = _res(region_base_address_b, region_size_bytes_b) + c_base, c_size = _res(region_base_address_c, region_size_bytes_c) + if a_size <= 0 or b_size <= 0 or c_size <= 0: + with self._open(append) as f: + self._write_idle(f) + 
self._write_pause(f) + self._require_exact_emits("matmul_tiled_interleave", id_start, id_value) + return id_value + + ta = ab if tile_a_bytes is None else int(tile_a_bytes) + tb = ab if tile_b_bytes is None else int(tile_b_bytes) + tc = ab if tile_c_bytes is None else int(tile_c_bytes) + cnt_a = max(1, self._align_down(ta if ta > 0 else ab, ab) // ab) + cnt_b = max(1, self._align_down(tb if tb > 0 else ab, ab) // ab) + cnt_c = max(1, self._align_down(tc if tc > 0 else ab, ab) // ab) + counts = {"A": cnt_a, "B": cnt_b, "C": cnt_c} + ptr = {"A": 0, "B": 0, "C": 0} + base = {"A": a_base, "B": b_base, "C": c_base} + size = {"A": a_size, "B": b_size, "C": c_size} + max_tiles = max(1, int(tiles)) + + with self._open(append) as f: + tile_idx = 0 + stalled_tiles = 0 + while id_value - id_start < self.N_TEST: + emitted_before = id_value + for tok in tokens: + for _ in range(counts[tok]): + if id_value - id_start >= self.N_TEST: + break + addr = self._normalize_addr(base[tok] + ptr[tok]) + ptr[tok] = (ptr[tok] + ab) % size[tok] + wen = 0 if tok == "C" else 1 + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + tile_idx += 1 + if tile_idle > 0 and id_value - id_start < self.N_TEST: + for _ in range(tile_idle): + self._write_idle(f) + if tile_idx >= max_tiles: + tile_idx = 0 + if id_value == emitted_before: + stalled_tiles += 1 + if stalled_tiles >= max_tiles: + break + else: + stalled_tiles = 0 + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, read_blocked_set, write_blocked_set) + self._require_exact_emits("matmul_tiled_interleave", id_start, id_value) + return id_value + + def hotspot_random_gen( + self, + id_start, + 
read_blocked, + write_blocked, + hot_regions, + traffic_pct=100, + traffic_read_pct=None, + append=False, + ): + id_value = id_start + read_blocked_set, write_blocked_set = self._init_blocked_sets(read_blocked, write_blocked) + ab = self.WIDTH_OF_MEMORY_BYTE + tm = self._total_mem_bytes() + n_idles = self._idles_per_req(traffic_pct) + + regions = [] + weights = [] + for reg in hot_regions or []: + base = self._align_down(int(reg.get("base", 0)), ab) + size = self._align_down(int(reg.get("size_bytes", 0)), ab) + weight = max(1, int(reg.get("weight", 1))) + if size <= 0: + continue + if base >= tm: + base %= tm + if base + size > tm: + size = self._align_down(tm - base, ab) + if size <= 0: + continue + regions.append({"base": base, "size": size}) + weights.append(weight) + + if not regions: + with self._open(append) as f: + self._write_idle(f) + self._write_pause(f) + self._require_exact_emits("hotspot_random", id_start, id_value) + return id_value + + if traffic_read_pct is not None: + rpct = max(0, min(100, int(traffic_read_pct))) + n_reads = (self.N_TEST * rpct) // 100 + wen_seq = [1] * n_reads + [0] * (self.N_TEST - n_reads) + else: + wen_seq = None + + with self._open(append) as f: + for i in range(self.N_TEST): + reg = random.choices(regions, weights=weights, k=1)[0] + n_words = max(1, reg["size"] // ab) + ad = reg["base"] + random.randint(0, n_words - 1) * ab + addr = self._normalize_addr(ad) + add = bin(addr)[2:].zfill(self.ADD_WIDTH) + wen = wen_seq[i] if wen_seq is not None else None + if wen is None: + data, wen = self.data_wen() + else: + data = "0" * self.DATA_WIDTH if wen else self.random_data() + if not self._is_allowed(add, wen, read_blocked_set, write_blocked_set): + continue + self._record_access(add, wen, read_blocked_set, write_blocked_set) + self._write_req(f, id_value, wen, data, add) + id_value += 1 + for _ in range(n_idles): + self._write_idle(f) + self._write_pause(f) + + self._commit_blocked_sets(read_blocked, write_blocked, 
read_blocked_set, write_blocked_set) + self._require_exact_emits("hotspot_random", id_start, id_value) + return id_value diff --git a/target/verif/simvectors/hci_stimuli/processor.py b/target/verif/simvectors/hci_stimuli/processor.py deleted file mode 100644 index 91b4892..0000000 --- a/target/verif/simvectors/hci_stimuli/processor.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Processor helpers: unfold and pad stimuli text files.""" - -import os - - -# 1) ++UNFOLD++ the transactions in the .txt files into a cycle-level list. -# -folder_path_raw --> String that specifies the path of the folder containing the raw txt files (where the cycle offset is still indicated) -# -folder_path_processed --> String that specifies the path of the folder containing the new txt files created by this function -def unfold_raw_txt(folder_path_raw,folder_path_processed,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH): - file_names = [file for file in os.listdir(folder_path_raw) if file.endswith(".txt")] - for file in file_names: - filepath_read = os.path.join(folder_path_raw,file) - filepath_write = os.path.join(folder_path_processed, file) - os.makedirs(os.path.dirname(filepath_write),exist_ok=True) - with open(filepath_read, 'r', encoding = "ascii") as file_read: - with open(filepath_write, 'w', encoding="ascii") as file_write: - for line in file_read: - if line != 'zero': - values = line.split() - id = values[0] - cycle_offset = values[1] - wen = values[2] - data = values[3] - add = values[4] - if "log" in file: - for _ in range(int(cycle_offset)-1): - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - else: - for _ in range(int(cycle_offset)-1): - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - file_write.write('1 ' + id + " " + wen + " " + data + " " + add + "\n") - else: - if "log" in file: - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + 
"\n") - else: - file_write.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - - -# 2) ++PAD++ txt files to have the same number of lines -# -Folder_path --> path of the folder containing the txt files to be padded -def pad_txt_files(folder_path,IW,DATA_WIDTH,ADD_WIDTH,HWPE_WIDTH): - file_names = [file for file in os.listdir(folder_path) if file.endswith(".txt")] # List of the txt file names in the folder - max_lines = 0 - line_count = {} # Dictionary to store the number of lines in each txt file - # Determining the maximum number of lines among the txt files - for file in file_names: - file_path = os.path.join(folder_path,file) - with open(file_path,'r', encoding = 'ascii') as f: - line_count[file] = sum(1 for _ in f) - max_lines = max(max_lines, line_count[file]) - # Pad files - for file in file_names: - padding_needed = max_lines - line_count[file] - if padding_needed > 0: - file_path = os.path.join(folder_path,file) - with open(file_path, 'a', encoding = 'ascii') as f: - if "log" in file: - for _ in range(padding_needed): - f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") - else: - for _ in range(padding_needed): - f.write("0 " + '0'*IW + " " + '0' + " " + '0'*int(HWPE_WIDTH*DATA_WIDTH) + " " + '0'*ADD_WIDTH + "\n") diff --git a/target/verif/simvectors/html_report.py b/target/verif/simvectors/html_report.py new file mode 100644 index 0000000..5784714 --- /dev/null +++ b/target/verif/simvectors/html_report.py @@ -0,0 +1,394 @@ +"""HTML report generation for memory lifetime visualization.""" + +from pathlib import Path +import html +import math + + +def build_memory_lifetime_html( + *, + pattern_nodes, + driver_windows, + regions_timeline, + total_cycles, + mux_serialization_applied, + mux_phase_order, + schedule_has_cycle, + driver_name_fn, + interco_type, + n_core_cfg, + n_dma_cfg, + n_ext_cfg, + n_hwpe_cfg, + dw_narrow, + dw_wide, + n_narrow_hci_cfg, + n_wide_hci_cfg, 
+ n_banks, +): + palette = [ + "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#17becf", + "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#9467bd", + "#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e", + "#e6ab02", "#a6761d", "#666666", + ] + + def _color_for_driver(drv_idx): + return palette[drv_idx % len(palette)] + + def _tick_step(total): + if total <= 10: + return 1 + raw = max(1, total // 10) + mag = 10 ** int(math.log10(raw)) + return int(math.ceil(raw / mag) * mag) + + total_for_plot = max(1, total_cycles) + chart_width = 1280 + x_left = 220 + x_right = 24 + plot_w = chart_width - x_left - x_right + row_h = 80 + y_top = 36 + exec_rows = [driver_windows[d] for d in sorted(driver_windows.keys())] + exec_h = y_top + row_h * len(exec_rows) + 72 + tick = _tick_step(total_for_plot) + ticks = list(range(0, total_for_plot + 1, tick)) + if ticks[-1] != total_for_plot: + ticks.append(total_for_plot) + + exec_svg = [] + exec_svg.append(f'') + exec_svg.append('') + exec_svg.append(f'Execution Timeline (transaction-count model)') + for t in ticks: + x = x_left + (t / total_for_plot) * plot_w + exec_svg.append(f'') + exec_svg.append(f'{t}') + exec_svg.append( + f'Transaction number' + ) + exec_svg.append( + f'' + 'Issued memory transactions (r/w) and computation cycles (i.e., req = 0) are both modeled here.' 
+ '' + ) + + def _rw_mix_text(node): + rpct = node.get('traffic_read_pct') + if rpct is None: + read_bytes = 0 + write_bytes = 0 + for reg in node.get('regions', []): + label_l = str(reg.get('label', '')).lower() + size_b = max(0, int(reg.get('size', 0))) + if 'read' in label_l and 'write' not in label_l: + read_bytes += size_b + elif 'write' in label_l and 'read' not in label_l: + write_bytes += size_b + if (read_bytes + write_bytes) > 0: + rpct = int(round(100.0 * read_bytes / (read_bytes + write_bytes))) + else: + rpct = 50 + rpct = max(0, min(100, int(rpct))) + if rpct == 0: + return "100% writes" + if rpct == 100: + return "100% reads" + return f"{rpct}% reads / {100 - rpct}% writes" + + def _fmt_kib(bytes_v): + return f"{(max(0, int(bytes_v)) / 1024.0):.1f} KiB" + + def _outside_detail_lines(node, base_addr, size_kib): + regs = list(node.get('regions', [])) + label_map = {str(r.get('label', '')): r for r in regs} + has_matmul_regs = any(k in label_map for k in ('A(read)', 'B(read)', 'C(write)')) + is_matmul = node.get('mem_access_type') in {'matmul', 'matmul_phased', 'matmul_tiled_interleave'} or has_matmul_regs + + if is_matmul: + mat_lines = [] + for lbl, short in [('A(read)', 'A'), ('B(read)', 'B'), ('C(write)', 'C')]: + if lbl in label_map: + r = label_map[lbl] + mat_lines.append(f"{short}@0x{int(r['base']):08x} ({_fmt_kib(r['size'])})") + if not mat_lines: + return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] + return mat_lines + + if regs: + r0 = regs[0] + return [f"@0x{int(r0['base']):08x}", f"{_fmt_kib(r0['size'])}"] + return [f"@0x{base_addr:08x}", f"{size_kib:.1f} KiB"] + + for row_idx, row in enumerate(exec_rows): + y = y_top + row_idx * row_h + name = html.escape(row['name']) + label_color = "#111111" if row['is_hwpe'] else "#555555" + exec_svg.append(f'{name}') + exec_svg.append(f'') + nodes = [n for n in pattern_nodes if n['driver_idx'] == row['driver_idx']] + for node in nodes: + if node['end_cycle'] <= node['start_cycle']: + continue + x = 
x_left + (node['start_cycle'] / total_for_plot) * plot_w + w = max(1.0, ((node['end_cycle'] - node['start_cycle']) / total_for_plot) * plot_w) + color = _color_for_driver(node['driver_idx']) + base_addr = min((int(r.get('base', 0)) for r in node.get('regions', [])), default=0) + size_kib = (int(node['n_transactions']) * int(node.get('txn_bytes', 1))) / 1024.0 + rw_mix = _rw_mix_text(node) + line1_full = f"{node['job']}" + line2_full = f"{node['mem_access_type']}" + line3_full = rw_mix + outside_lines = _outside_detail_lines(node, base_addr, size_kib) + title = ( + f"{node['driver_name']} p{node['pattern_idx']} job={node['job']} " + f"[{node['start_cycle']}, {node['end_cycle']}) " + f"{node['mem_access_type']} n={node['n_transactions']} " + f"base=0x{base_addr:08x} {size_kib:.1f}KiB {rw_mix} " + f"{' | '.join(outside_lines)}" + ) + exec_svg.append('') + exec_svg.append(f'{html.escape(title)}') + exec_svg.append( + f'' + ) + line1 = html.escape(line1_full) + exec_svg.append( + f'{line1}' + ) + line2 = html.escape(line2_full) + exec_svg.append( + f'{line2}' + ) + line3 = html.escape(line3_full) + exec_svg.append( + f'{line3}' + ) + for ext_idx, ext in enumerate(outside_lines): + y_ext = y + 49 + (ext_idx * 9) + ext_txt = html.escape(ext) + exec_svg.append( + f'{ext_txt}' + ) + exec_svg.append('') + exec_svg.append('') + + region_rows = [regions_timeline[k] for k in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2]))] + overlap_rows = [] + overlaps_by_region = {i: [] for i in range(len(region_rows))} + for i in range(len(region_rows)): + a = region_rows[i] + for j in range(i + 1, len(region_rows)): + b = region_rows[j] + ov_base = max(a['base'], b['base']) + ov_end = min(a['end'], b['end']) + if ov_base <= ov_end: + ov_size = ov_end - ov_base + 1 + overlap_rows.append({ + 'a_idx': i, + 'b_idx': j, + 'ov_base': ov_base, + 'ov_end': ov_end, + 'ov_size': ov_size, + }) + overlaps_by_region[i].append((j, ov_base, ov_end, ov_size)) + 
overlaps_by_region[j].append((i, ov_base, ov_end, ov_size)) + + used_min = min((reg['base'] for reg in region_rows), default=0) + used_max = max((reg['end'] for reg in region_rows), default=0) + if used_max < used_min: + used_max = used_min + used_span = max(1, used_max - used_min + 1) + + map_left = 170 + map_right = 24 + map_plot_w = chart_width - map_left - map_right + map_h = 124 + bar_y = 56 + bar_h = 24 + region_map_svg = [] + region_map_svg.append(f'') + region_map_svg.append('') + region_map_svg.append( + f'' + f"Memory Region Blocks (used range 0x{used_min:08x} - 0x{used_max:08x})" + f"" + ) + region_map_svg.append( + f'' + ) + for pct in [0, 25, 50, 75, 100]: + x = map_left + (pct / 100.0) * map_plot_w + addr = used_min + int(((used_span - 1) * pct) / 100.0) + region_map_svg.append(f'') + region_map_svg.append( + f'0x{addr:08x}' + ) + for reg in region_rows: + x = map_left + ((reg['base'] - used_min) / used_span) * map_plot_w + w = max(1.0, (reg['size'] / used_span) * map_plot_w) + color = _color_for_driver(reg['accesses'][0]['driver_idx']) if reg['accesses'] else "#888888" + title = ( + f"{reg['label']} 0x{reg['base']:08x}-0x{reg['end']:08x} " + f"size={reg['size']}B accesses={len(reg['accesses'])}" + ) + region_map_svg.append( + f'' + f'{html.escape(title)}' + ) + if w >= 92: + region_map_svg.append( + f'{html.escape(reg["label"])}' + ) + region_map_svg.append('') + + legend_items = [] + for d in sorted(driver_windows.keys()): + n = driver_name_fn(d) + c = _color_for_driver(d) + legend_items.append( + f'' + f'' + f'{html.escape(n)}' + ) + + region_cards = [] + for idx, reg in enumerate(region_rows): + access_rows = [] + accesses = sorted(reg['accesses'], key=lambda a: (a['driver_idx'], a['pattern_idx'], a['start'])) + for acc in accesses: + desc = acc['description'] if acc['description'] else "-" + access_rows.append( + "" + f"{html.escape(acc['driver_name'])}" + f"p{acc['pattern_idx']}" + f"{html.escape(acc['job'])}" + f"{html.escape(desc)}" + 
f"[{acc['start']}, {acc['end']})" + "" + ) + overlap_refs = overlaps_by_region.get(idx, []) + if overlap_refs: + ov_txt = ", ".join( + [ + f"{html.escape(region_rows[j]['label'])} " + f"(0x{ovb:08x}-0x{ove:08x}, {ovs} B)" + for (j, ovb, ove, ovs) in overlap_refs + ] + ) + else: + ov_txt = "none" + region_cards.append( + "
" + f"
{html.escape(reg['label'])} | base=0x{reg['base']:08x} | end=0x{reg['end']:08x} | size={reg['size']} B
" + f"
Overlaps: {ov_txt}
" + "" + "" + "" + f"{''.join(access_rows)}" + "
Driver/HWPEPatternJobDescriptionModeled interval
" + "
" + ) + + overlap_table_rows = [] + for ov in overlap_rows: + a = region_rows[ov['a_idx']] + b = region_rows[ov['b_idx']] + overlap_table_rows.append( + "" + f"{html.escape(a['label'])} (0x{a['base']:08x}-0x{a['end']:08x})" + f"{html.escape(b['label'])} (0x{b['base']:08x}-0x{b['end']:08x})" + f"0x{ov['ov_base']:08x}" + f"0x{ov['ov_end']:08x}" + f"{ov['ov_size']}" + "" + ) + + note = "Timeline follows declared wait_for_jobs dependencies from workload.json." + note_2 = ( + "Time axis is transaction-count based only " + "(no interconnect conflict/stall/arbitration modeling)." + ) + note_2b = "Per-driver list order is still enforced by stimulus/fence sequencing." + note_3 = "" + if mux_serialization_applied: + note_3 = ( + "MUX mode: HWPE execution is serialized by job order, " + "with lower HWPE ID first inside each job (tb_hci-like)." + ) + if mux_phase_order: + note_3 += f" Job order: {', '.join(mux_phase_order)}." + note_3_html = f"
{html.escape(note_3)}
" if note_3 else "" + cycle_warning_html = ( + "

Warning: dependency cycle detected; fallback scheduling order used.

" + if schedule_has_cycle else "" + ) + overlap_html = ( + "" + f"{''.join(overlap_table_rows)}" + "
Region ARegion BOverlap BaseOverlap EndOverlap Size (B)
" + if overlap_table_rows else + "
No overlaps detected among used regions.
" + ) + + return ( + "" + "Memory Access Region View (Transaction-Count Model)" + "" + "

Memory Access Region View

" + f"
Drivers ({dw_narrow} bit): " + f"CORE={n_core_cfg}, DMA={n_dma_cfg}, EXT={n_ext_cfg}
" + f"
Drivers ({dw_wide} bit): HWPE={n_hwpe_cfg}
" + f"
Interconnect type: {html.escape(interco_type)} | " + f"Narrow master ports ({dw_narrow} bit): {n_narrow_hci_cfg} | " + f"Wide master ports ({dw_wide} bit): {n_wide_hci_cfg} | " + f"Slave ports (banks): {n_banks} | " + f"Total modeled time: {total_cycles} units
" + f"
{html.escape(note)}
" + f"
{html.escape(note_2)}
" + f"
{html.escape(note_2b)}
" + f"{note_3_html}" + f"{cycle_warning_html}" + "

Legend

" + f"{''.join(legend_items)}
" + "
" + f"{''.join(exec_svg)}" + "
" + "
" + f"{''.join(region_map_svg)}" + "
" + "

Region Usage Blocks

" + f"{''.join(region_cards)}" + "

Overlapping Regions

" + f"{overlap_html}" + "" + ) + + +def write_memory_lifetime_html(memory_lifetime_path: Path, **kwargs): + html_doc = build_memory_lifetime_html(**kwargs) + memory_lifetime_path.write_text(html_doc, encoding='utf-8') diff --git a/target/verif/simvectors/main.py b/target/verif/simvectors/main.py index 21c2e28..da740bb 100644 --- a/target/verif/simvectors/main.py +++ b/target/verif/simvectors/main.py @@ -1,29 +1,37 @@ """Stimuli generator (reads JSON configs in `verif/config`). This script is invoked by the top-level Makefile and expects three -JSON config files: workload, testbench and hardware. It produces raw -and processed stimuli in `verif/simvectors/generated`. +JSON config files: workload, testbench and hardware. It produces +cycle-accurate stimuli in `verif/simvectors/generated/stimuli`. + +Each stimuli file encodes an offered per-cycle request stream plus PAUSE fence tokens. + +Idle lines (req=0) represent intended issue gaps in the absence of backpressure. +The application driver may consume some idle entries while stalled on an earlier +request grant, and hide some memory/interconnect latency. So the file is not a +strict wall-clock replay under contention. + +Stimuli line format: + req(1b) id(IWb) wen(1b) data(Nb) add(Ab) """ -### LIBRARIES AND DEPENDENCIES ### import json +import math import sys from pathlib import Path import argparse -import numpy as np code_directory = Path(__file__).resolve().parent - -# Try to import the local package `hci_stimuli`. If the running -# environment doesn't include the `simvectors` directory on `sys.path` -# (for example when invoked from a different working directory), add -# `code_directory` to `sys.path` as a minimal fallback. 
try: - from hci_stimuli import StimuliGenerator, unfold_raw_txt, pad_txt_files + from hci_stimuli import StimuliGenerator + from memory_report import write_memory_map_txt + from html_report import write_memory_lifetime_html except Exception: sys.path.insert(0, str(code_directory)) - from hci_stimuli import StimuliGenerator, unfold_raw_txt, pad_txt_files + from hci_stimuli import StimuliGenerator + from memory_report import write_memory_map_txt + from html_report import write_memory_lifetime_html def parse_args(argv=None): @@ -31,6 +39,8 @@ def parse_args(argv=None): parser.add_argument('--workload_config', required=True, help="Path to JSON workload configuration file") parser.add_argument('--testbench_config', required=True, help="Path to JSON testbench configuration file") parser.add_argument('--hardware_config', required=True, help="Path to JSON hardware configuration file") + parser.add_argument('--emit_phases_mk', default=None, metavar='PATH', + help="Also write the phases.mk Makefile fragment to PATH") parser.add_argument( '--golden', action='store_true', @@ -52,19 +62,17 @@ def load_config(filename, description): except json.JSONDecodeError as e: print(f"ERROR: Invalid JSON in {description} file: {e}") sys.exit(1) + + ### MAIN ENTRYPOINT ### def main(argv=None): - # parse CLI args args = parse_args(argv) - # load configs hardware_config = load_config(args.hardware_config, "Hardware configuration") testbench_config = load_config(args.testbench_config, "Testbench configuration") workload_config = load_config(args.workload_config, "Workload configuration") - # helpers imported at module-level (with a small sys.path fallback) - - # Extract hardware parameters + # Hardware parameters hw_params = hardware_config['parameters'] N_BANKS = hw_params['N_BANKS'] TOT_MEM_SIZE = hw_params['TOT_MEM_SIZE'] @@ -73,152 +81,863 @@ def main(argv=None): N_DMA = hw_params['N_DMA'] N_EXT = hw_params['N_EXT'] N_HWPE = hw_params['N_HWPE'] - HWPE_WIDTH = hw_params['HWPE_WIDTH'] - - # 
Extract testbench parameters - tb_params = testbench_config['parameters'] - TEST_RATIO = tb_params['TRANSACTION_RATIO'] - N_TEST_LOG = tb_params['N_TRANSACTION_LOG'] - - # Extract workload simulation parameters - workload_sim_params = workload_config['simulation_parameters'] - CYCLE_OFFSET_LOG = workload_sim_params['CYCLE_OFFSET_LOG'] - CYCLE_OFFSET_HWPE = workload_sim_params['CYCLE_OFFSET_HWPE'] - EXACT_OR_MAX_OFFSET = workload_sim_params['EXACT_OR_MAX_OFFSET'] + HWPE_WIDTH_FACT = hw_params['HWPE_WIDTH_FACT'] + N_CORE_CFG = N_CORE + N_DMA_CFG = N_DMA + N_EXT_CFG = N_EXT + N_HWPE_CFG = N_HWPE - # Extract workload master parameters log_masters = workload_config['log_masters'] hwpe_masters = workload_config['hwpe_masters'] # Derived parameters - WIDTH_OF_MEMORY = DATA_WIDTH - WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8 - N_WORDS = (TOT_MEM_SIZE * 1000 / N_BANKS) / WIDTH_OF_MEMORY_BYTE - ADD_WIDTH = int(np.ceil(np.log2(TOT_MEM_SIZE * 1000))) - N_TEST_HWPE = int(N_TEST_LOG * TEST_RATIO) + ADD_WIDTH = math.ceil(math.log2(TOT_MEM_SIZE * 1024)) N_LOG = N_CORE + N_DMA + N_EXT - N_MASTER = N_LOG + N_HWPE - IW = int(np.ceil(np.log2(N_TEST_LOG * N_LOG + N_TEST_HWPE * N_HWPE))) - CORE_ZERO_FLAG = False - EXT_ZERO_FLAG = False - DMA_ZERO_FLAG = False - HWPE_ZERO_FLAG = False + N_LOG_CFG = N_LOG + IW = 8 + + def _narrow_driver_name(local_idx: int) -> str: + idx = int(local_idx) + if idx < N_CORE_CFG: + return f"core_{idx}" + idx -= N_CORE_CFG + if idx < N_DMA_CFG: + return f"dma_{idx}" + idx -= N_DMA_CFG + if idx < N_EXT_CFG: + return f"ext_{idx}" + return f"narrow_{local_idx}" # Validations if len(log_masters) != N_LOG: print(f"ERROR: Number of log masters in workload config ({len(log_masters)}) doesn't match hardware config N_LOG ({N_LOG})") sys.exit(1) - if len(hwpe_masters) != N_HWPE: print(f"ERROR: Number of HWPE masters in workload config ({len(hwpe_masters)}) doesn't match hardware config N_HWPE ({N_HWPE})") sys.exit(1) - - if (not N_WORDS.is_integer()): - print("ERROR: 
the number of words is not an integer value") - sys.exit(1) - if (N_MASTER < 1): + if N_LOG + N_HWPE < 1: print("ERROR: the number of masters must be > 0") sys.exit(1) + n_words = (TOT_MEM_SIZE * 1024 / N_BANKS) / (DATA_WIDTH / 8) + if not n_words.is_integer(): + print("ERROR: the number of words is not an integer value") + sys.exit(1) # Prepare output dirs simvectors_dir = code_directory.resolve() generated_dir = (simvectors_dir / 'generated').resolve() - raw_dir = (generated_dir / 'stimuli_raw').resolve() - processed_dir = (generated_dir / 'stimuli_processed').resolve() + stimuli_dir = (generated_dir / 'stimuli').resolve() generated_dir.mkdir(parents=True, exist_ok=True) - raw_dir.mkdir(parents=True, exist_ok=True) - processed_dir.mkdir(parents=True, exist_ok=True) + stimuli_dir.mkdir(parents=True, exist_ok=True) - # Create zero files when a class of masters is absent. We keep the - # original behaviour of creating a single 'zero' file per missing - # class to preserve downstream expectations. 
- def _create_zero_file(path: Path): + def _create_idle_file(path: Path, data_width: int): + """Write a single idle line for a master that is not present in hardware.""" path.parent.mkdir(parents=True, exist_ok=True) - path.write_text('zero', encoding='ascii') + path.write_text( + "0 " + "0" * IW + " 0 " + "0" * data_width + " " + "0" * ADD_WIDTH + "\n", + encoding='ascii', + ) + + CORE_ZERO_FLAG = False + DMA_ZERO_FLAG = False + EXT_ZERO_FLAG = False + HWPE_ZERO_FLAG = False if N_CORE <= 0: CORE_ZERO_FLAG = True N_CORE = 1 - _create_zero_file(raw_dir / 'master_log_0.txt') + _create_idle_file(stimuli_dir / 'master_log_0.txt', DATA_WIDTH) if N_DMA <= 0: DMA_ZERO_FLAG = True N_DMA = 1 - _create_zero_file(raw_dir / f'master_log_{N_CORE}.txt') + _create_idle_file(stimuli_dir / f'master_log_{N_CORE}.txt', DATA_WIDTH) if N_EXT <= 0: EXT_ZERO_FLAG = True N_EXT = 1 - _create_zero_file(raw_dir / f'master_log_{N_CORE + N_DMA}.txt') + _create_idle_file(stimuli_dir / f'master_log_{N_CORE + N_DMA}.txt', DATA_WIDTH) if N_HWPE <= 0: HWPE_ZERO_FLAG = True N_HWPE = 1 - _create_zero_file(raw_dir / 'master_hwpe_0.txt') + _create_idle_file(stimuli_dir / 'master_hwpe_0.txt', HWPE_WIDTH_FACT * DATA_WIDTH) next_start_id = 0 - LIST_OF_FORBIDDEN_ADDRESSES_WRITE = [] - LIST_OF_FORBIDDEN_ADDRESSES_READ = [] - - def _generate_master(filepath: Path, master_config: dict, *, is_hwpe: bool, master_global_idx: int): - """Create StimuliGenerator and run the configured generation method. 
- - Parameters: - - filepath: output raw txt file path - - master_config: dict from workload.json for this master - - is_hwpe: whether this is an HWPE master (affects data width and counts) - - master_global_idx: global master id used by the generator - """ + + # Memory map entries collected during generation, printed at the end + memory_map_entries = [] + + def _bank_of(byte_addr): + return (byte_addr // (DATA_WIDTH // 8)) % N_BANKS + + def _record_memory_map(kind, local_idx, description, config, n_test, data_width, access_bytes, + region_base, region_size, start_address, stride0, len_d0, + stride1, len_d1, stride2, master_config, total_mem_bytes): + if kind == 'master_log': + label_prefix = _narrow_driver_name(local_idx) + elif kind == 'master_hwpe': + label_prefix = f"hwpe_{local_idx}" + else: + label_prefix = f"{kind}_{local_idx}" + label = label_prefix + (f" ({description})" if description else "") + if config == 'idle' or n_test == 0: + memory_map_entries.append({'label': label, 'pattern': config, 'n': 0, + 'info': 'idle - no memory accesses'}) + return + + first_addr = last_addr = None + detail = {} + + if config == 'random': + first_addr = region_base + last_addr = region_base + region_size - access_bytes + detail['region'] = f"0x{region_base:08x} - 0x{region_base + region_size - 1:08x} ({region_size} B)" + tpct = master_config.get('traffic_pct') + if tpct is not None: + rpct = master_config.get('traffic_read_pct', 50) + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + detail['read_pct'] = f"{rpct}%" + elif config == 'matmul_phased': + ra = _parse_maybe_bin_int(master_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(master_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(master_config.get('region_base_address_b'), None) + sb = _parse_maybe_bin_int(master_config.get('region_size_bytes_b'), 
None) + rc = _parse_maybe_bin_int(master_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(master_config.get('region_size_bytes_c'), None) + if ra is not None and sa is not None: + # Explicit per-phase regions + a_base, a_size = ra, sa + b_base, b_size = (rb, sb) if rb is not None and sb is not None else (ra, sa) + c_base, c_size = (rc, sc) if rc is not None and sc is not None else (ra, sa) + detail['matrix_A (read)'] = f"0x{a_base:08x} - 0x{a_base + a_size - access_bytes:08x} ({a_size} B)" + if int(master_config.get('matmul_ratio_b', 1)) > 0: + detail['matrix_B (read)'] = f"0x{b_base:08x} - 0x{b_base + b_size - access_bytes:08x} ({b_size} B)" + detail['matrix_C (write)'] = f"0x{c_base:08x} - 0x{c_base + c_size - access_bytes:08x} ({c_size} B)" + first_addr = a_base + last_addr = c_base + c_size - access_bytes + else: + # Auto-split combined region into thirds + a_words = max(1, (region_size // access_bytes) // 3) + b_words = max(1, (region_size // access_bytes) // 3) + c_words = (region_size // access_bytes) - a_words - b_words + a_base = region_base + b_base = a_base + a_words * access_bytes + c_base = b_base + b_words * access_bytes + detail['region'] = f"0x{region_base:08x} - 0x{region_base + region_size - 1:08x} ({region_size} B) [auto-split]" + detail['matrix_A (read)'] = f"0x{a_base:08x} - 0x{b_base - access_bytes:08x} ({a_words * access_bytes} B)" + detail['matrix_B (read)'] = f"0x{b_base:08x} - 0x{c_base - access_bytes:08x} ({b_words * access_bytes} B)" + detail['matrix_C (write)'] = f"0x{c_base:08x} - 0x{c_base + c_words * access_bytes - access_bytes:08x} ({c_words * access_bytes} B)" + first_addr = a_base + last_addr = c_base + c_words * access_bytes - access_bytes + if all(k in master_config for k in ('matrix_m', 'matrix_n', 'matrix_k')): + m, n, k = int(master_config['matrix_m']), int(master_config['matrix_n']), int(master_config['matrix_k']) + detail['matrix_dims'] = f"M={m} N={n} K={k} (A: {m}x{k}, B: {k}x{n}, C: {m}x{n})" + tpct 
= master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + idle_between = master_config.get('idle_cycles_between_phases', 0) + if idle_between: + detail['idle_between_phases'] = f"{idle_between} cycles" + elif config == 'multi_linear': + regs = master_config.get('regions', []) or [] + detail['schedule'] = str(master_config.get('schedule', 'round_robin')) + detail['burst_len'] = int(master_config.get('burst_len', 1)) + for idx, reg in enumerate(regs): + base = _parse_maybe_bin_int(reg.get('base'), 0) + size = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + stride_w = int(reg.get('stride_words', 1)) + rpct = reg.get('read_pct') + rpct_txt = f", read={int(rpct)}%" if rpct is not None else "" + detail[f"region_{idx}"] = ( + f"0x{base:08x} - 0x{base + max(0, size) - 1:08x} " + f"({size} B, stride={stride_w} words{rpct_txt})" + ) + if regs: + first_addr = _parse_maybe_bin_int(regs[0].get('base'), 0) + last_reg = regs[-1] + lb = _parse_maybe_bin_int(last_reg.get('base'), 0) + ls = _parse_maybe_bin_int(last_reg.get('size_bytes'), 0) + last_addr = lb + max(0, ls) - access_bytes + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'bank_group_linear': + span = max(1, int(master_config.get('bank_group_span', 1))) + start_bank = int(master_config.get('start_bank', 0)) % max(1, int(N_BANKS)) + stride_beats = max(1, int(master_config.get('stride_beats', 1))) + first_addr = start_bank * access_bytes + phase = max(0, n_test - 1) * stride_beats + group_idx = phase // span + bank = (start_bank + (phase % span)) % max(1, int(N_BANKS)) + last_addr = (group_idx * N_BANKS + bank) * access_bytes + 
last_addr = last_addr % total_mem_bytes + detail['start_bank'] = start_bank + detail['bank_group_span'] = span + detail['stride_beats'] = stride_beats + if 'bank_group_hop' in master_config: + detail['bank_group_hop'] = int(master_config.get('bank_group_hop', 0)) + if 'wen' in master_config: + detail['wen'] = int(master_config.get('wen', 1)) + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'rw_rowwise': + row_base = _parse_maybe_bin_int(master_config.get('row_base_address'), region_base) + row_size = _parse_maybe_bin_int(master_config.get('row_size_bytes'), access_bytes) + n_rows = max(0, int(master_config.get('n_rows', 0))) + row_stride = _parse_maybe_bin_int(master_config.get('row_stride_bytes'), row_size) + rpr = max(0, int(master_config.get('reads_per_row', 0))) + wpr = max(0, int(master_config.get('writes_per_row', 0))) + first_addr = row_base + last_addr = row_base + max(0, n_rows - 1) * row_stride + max(0, row_size - access_bytes) + last_addr = last_addr % total_mem_bytes + detail['rows'] = f"n_rows={n_rows}, row_size={row_size} B, row_stride={row_stride} B" + detail['per_row'] = f"reads={rpr}, writes={wpr}" + idle_between = int(master_config.get('idle_cycles_between_rows', 0)) + if idle_between: + detail['idle_between_rows'] = f"{idle_between} cycles" + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'gather_scatter': + rr = master_config.get('read_regions', []) or [] + wr = master_config.get('write_region', {}) or {} + for idx, reg in enumerate(rr): + b = _parse_maybe_bin_int(reg.get('base'), 0) + s = 
_parse_maybe_bin_int(reg.get('size_bytes'), 0) + detail[f"read_region_{idx}"] = f"0x{b:08x} - 0x{b + max(0, s) - 1:08x} ({s} B)" + wb = _parse_maybe_bin_int(wr.get('base'), 0) + ws = _parse_maybe_bin_int(wr.get('size_bytes'), 0) + detail['write_region'] = f"0x{wb:08x} - 0x{wb + max(0, ws) - 1:08x} ({ws} B)" + detail['schedule'] = str(master_config.get('schedule', '4read_1write')) + detail['chunk_bytes'] = int(_parse_maybe_bin_int(master_config.get('chunk_bytes'), access_bytes)) + if rr: + first_addr = _parse_maybe_bin_int(rr[0].get('base'), 0) + else: + first_addr = wb + last_addr = wb + max(0, ws) - access_bytes if ws > 0 else first_addr + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'matmul_tiled_interleave': + ra = _parse_maybe_bin_int(master_config.get('region_base_address_a'), region_base) + sa = _parse_maybe_bin_int(master_config.get('region_size_bytes_a'), region_size // 3) + rb = _parse_maybe_bin_int(master_config.get('region_base_address_b'), ra + sa) + sb = _parse_maybe_bin_int(master_config.get('region_size_bytes_b'), region_size // 3) + rc = _parse_maybe_bin_int(master_config.get('region_base_address_c'), rb + sb) + sc = _parse_maybe_bin_int(master_config.get('region_size_bytes_c'), region_size - max(0, sa) - max(0, sb)) + detail['matrix_A (read)'] = f"0x{ra:08x} - 0x{ra + max(0, sa) - access_bytes:08x} ({sa} B)" + detail['matrix_B (read)'] = f"0x{rb:08x} - 0x{rb + max(0, sb) - access_bytes:08x} ({sb} B)" + detail['matrix_C (write)'] = f"0x{rc:08x} - 0x{rc + max(0, sc) - access_bytes:08x} ({sc} B)" + detail['tile_bytes'] = ( + f"A={int(_parse_maybe_bin_int(master_config.get('tile_a_bytes'), access_bytes))}, " + f"B={int(_parse_maybe_bin_int(master_config.get('tile_b_bytes'), access_bytes))}, " + 
f"C={int(_parse_maybe_bin_int(master_config.get('tile_c_bytes'), access_bytes))}" + ) + detail['tiles'] = int(master_config.get('tiles', 1)) + detail['ab_c_schedule'] = str(master_config.get('ab_c_schedule', 'A_B_C')) + idle_tiles = int(master_config.get('idle_cycles_between_tiles', 0)) + if idle_tiles: + detail['idle_between_tiles'] = f"{idle_tiles} cycles" + first_addr = ra + last_addr = rc + max(0, sc) - access_bytes + tpct = master_config.get('traffic_pct') + if tpct is not None: + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + elif config == 'hotspot_random': + hrs = master_config.get('hot_regions', []) or [] + for idx, reg in enumerate(hrs): + b = _parse_maybe_bin_int(reg.get('base'), 0) + s = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + w = int(reg.get('weight', 1)) + detail[f"hot_region_{idx}"] = f"0x{b:08x} - 0x{b + max(0, s) - 1:08x} ({s} B, weight={w})" + if hrs: + first_addr = _parse_maybe_bin_int(hrs[0].get('base'), 0) + lb = _parse_maybe_bin_int(hrs[-1].get('base'), 0) + ls = _parse_maybe_bin_int(hrs[-1].get('size_bytes'), 0) + last_addr = lb + max(0, ls) - access_bytes + tpct = master_config.get('traffic_pct') + if tpct is not None: + rpct = master_config.get('traffic_read_pct', 50) + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + detail['read_pct'] = f"{rpct}%" + elif config == 'linear': + base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) + first_addr = base + last_addr = base + (n_test - 1) * stride0 * access_bytes + last_addr = last_addr % total_mem_bytes + detail['start'] = f"0x{base:08x}" + detail['stride'] = f"{stride0} words ({stride0 * access_bytes} B)" + tpct = master_config.get('traffic_pct') + if tpct is not None: + rpct = 
master_config.get('traffic_read_pct', 50) + n_idles_per_req = max(0, round((100 - int(tpct)) / int(tpct))) if int(tpct) < 100 else 0 + detail['traffic_pct'] = f"{tpct}% ({n_idles_per_req} idle(s) after each transaction)" + detail['read_pct'] = f"{rpct}%" + elif config == '2d': + base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) + first_addr = base + last_addr = (base + (len_d0 - 1) * stride0 * access_bytes + + (n_test // max(len_d0, 1) - 1) * stride1 * access_bytes) % total_mem_bytes + detail['dims'] = f"{len_d0} x (n_rows) stride0={stride0} stride1={stride1}" + idle_between = master_config.get('idle_cycles_between_phases', 0) + if idle_between: + detail['idle_between_phases'] = f"{idle_between} cycles" + elif config == '3d': + base = int(start_address, 2) if set(start_address) <= {'0','1'} else int(start_address, 0) + first_addr = base + detail['dims'] = f"{len_d0} x {len_d1} x (n_outer) stride0={stride0} stride1={stride1} stride2={stride2}" + last_addr = base # approximate for 3d + idle_between = master_config.get('idle_cycles_between_phases', 0) + if idle_between: + detail['idle_between_phases'] = f"{idle_between} cycles" + + if first_addr is not None: + detail['first_addr'] = f"0x{first_addr:08x} (bank {_bank_of(first_addr)})" + detail['last_addr'] = f"0x{last_addr:08x} (bank {_bank_of(last_addr)})" + detail['transfer'] = f"{n_test} transactions x {data_width // 8} B = {n_test * data_width // 8} B" + + memory_map_entries.append({'label': label, 'pattern': config, 'n': n_test, + 'detail': detail}) + + # (file_path, n_idle_cycles) -- populated during generation, + # prepended as idle lines before padding for static start delays. 
+ pending_start_delays = [] + + def _parse_maybe_bin_int(raw_value, default_value): + """Parse an int or binary/hex/decimal string; return default on failure.""" + if raw_value is None: + return default_value + if isinstance(raw_value, int): + return raw_value + if isinstance(raw_value, str): + v = raw_value.strip() + if not v: + return default_value + if set(v) <= {"0", "1"}: + return int(v, 2) + try: + return int(v, 0) + except ValueError: + return default_value + return default_value + + def _normalize_mem_access_type(raw_value, master_name): + allowed = { + "random", + "linear", + "2d", + "3d", + "idle", + "matmul_phased", + "multi_linear", + "bank_group_linear", + "rw_rowwise", + "gather_scatter", + "matmul_tiled_interleave", + "hotspot_random", + } + aliases = { + "matmul": "matmul_phased", + "matmul_tiled": "matmul_tiled_interleave", + } + + if not isinstance(raw_value, str): + print( + f"ERROR: {master_name} has invalid mem_access_type={raw_value} " + f"(type={type(raw_value).__name__}). Allowed: {', '.join(sorted(allowed))}" + ) + sys.exit(1) + + key = aliases.get(raw_value.strip().lower(), raw_value.strip().lower()) + if key not in allowed: + print( + f"ERROR: {master_name} has invalid mem_access_type='{raw_value}'. 
" + f"Allowed: {', '.join(sorted(allowed))}" + ) + sys.exit(1) + return key + + def _pattern_job_name(pattern_config: dict) -> str: + """Resolve job name from the mandatory 'job' key.""" + return str(pattern_config.get('job', 'default')) + + def _pattern_wait_for_jobs(pattern_config): + """Resolve dependency list from the mandatory 'wait_for_jobs' key.""" + raw = pattern_config.get('wait_for_jobs', []) + if raw is None: + return [] + if isinstance(raw, list): + return [str(x) for x in raw] + return [str(raw)] + + def _warn_if_id_mismatch(master_cfg, expected_idx, master_name): + raw_id = master_cfg.get("id", expected_idx) + try: + cfg_id = int(raw_id) + except (TypeError, ValueError): + print(f"WARNING: {master_name} has non-integer id={raw_id}; positional index {expected_idx} is used.") + return + if cfg_id != expected_idx: + print( + f"WARNING: {master_name} has id={cfg_id} but positional index is {expected_idx}; " + "stimuli-to-driver mapping is positional." + ) + + def _resolve_n_transactions(master_config: dict, mem_access_type: str, data_width: int, kind: str, local_idx: int) -> int: + """Resolve n_transactions from explicit field or geometry, depending on pattern.""" + if 'n_transactions' in master_config: + return int(master_config['n_transactions']) + access_bytes = max(1, int(data_width // 8)) + if mem_access_type == 'linear': + length = master_config.get('length') + if length is not None: + return int(length) + raw_region_size = master_config.get('region_size_bytes') + if raw_region_size is not None: + region_size = _parse_maybe_bin_int(raw_region_size, None) + if region_size is not None: + return max(0, int(region_size) // access_bytes) + elif mem_access_type == '2d': + len_d0 = master_config.get('len_d0') + len_d1 = master_config.get('len_d1') + if len_d0 is not None and len_d1 is not None: + return int(len_d0) * int(len_d1) + elif mem_access_type == '3d': + len_d0 = master_config.get('len_d0') + len_d1 = master_config.get('len_d1') + len_d2 = 
master_config.get('len_d2') + if len_d0 is not None and len_d1 is not None and len_d2 is not None: + return int(len_d0) * int(len_d1) * int(len_d2) + elif mem_access_type == 'matmul_phased': + # For non-random region-based traffic, allow deriving transactions + # from region size and transaction width. + raw_region_size = master_config.get('region_size_bytes') + if raw_region_size is not None: + region_size = _parse_maybe_bin_int(raw_region_size, None) + if region_size is not None: + return max(0, int(region_size) // access_bytes) + m = master_config.get('matrix_m') + n = master_config.get('matrix_n') + k = master_config.get('matrix_k') + if m is not None and n is not None and k is not None: + return int(m) * int(k) + int(k) * int(n) + int(m) * int(n) + elif mem_access_type == 'multi_linear': + total = 0 + for reg in master_config.get('regions', []) or []: + size_v = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + total += max(0, int(size_v)) // access_bytes + if total > 0: + return total + elif mem_access_type == 'bank_group_linear': + print( + f"ERROR: {kind}_{local_idx} mem_access_type='bank_group_linear' " + "requires explicit 'n_transactions'." 
+ ) + sys.exit(1) + elif mem_access_type == 'rw_rowwise': + n_rows = master_config.get('n_rows') + rpr = master_config.get('reads_per_row') + wpr = master_config.get('writes_per_row') + if n_rows is not None and rpr is not None and wpr is not None: + return max(0, int(n_rows)) * (max(0, int(rpr)) + max(0, int(wpr))) + elif mem_access_type == 'gather_scatter': + chunk = _parse_maybe_bin_int(master_config.get('chunk_bytes'), access_bytes) + step = max(access_bytes, int(chunk) if chunk is not None else access_bytes) + total = 0 + for reg in master_config.get('read_regions', []) or []: + total += max(0, int(_parse_maybe_bin_int(reg.get('size_bytes'), 0))) // step + wr = master_config.get('write_region', {}) or {} + total += max(0, int(_parse_maybe_bin_int(wr.get('size_bytes'), 0))) // step + if total > 0: + return total + elif mem_access_type == 'matmul_tiled_interleave': + tiles = max(1, int(master_config.get('tiles', 1))) + sched = str(master_config.get('ab_c_schedule', 'A_B_C')).upper().replace('-', '_') + toks = [t for t in sched.split('_') if t] + if not toks: + toks = ['A', 'B', 'C'] + cnt_a = max(1, int(_parse_maybe_bin_int(master_config.get('tile_a_bytes'), access_bytes)) // access_bytes) + cnt_b = max(1, int(_parse_maybe_bin_int(master_config.get('tile_b_bytes'), access_bytes)) // access_bytes) + cnt_c = max(1, int(_parse_maybe_bin_int(master_config.get('tile_c_bytes'), access_bytes)) // access_bytes) + per_tile = 0 + for t in toks: + if t == 'A': + per_tile += cnt_a + elif t == 'B': + per_tile += cnt_b + elif t == 'C': + per_tile += cnt_c + if per_tile > 0: + return tiles * per_tile + elif mem_access_type == 'hotspot_random': + total = 0 + for reg in master_config.get('hot_regions', []) or []: + total += max(0, int(_parse_maybe_bin_int(reg.get('size_bytes'), 0))) // access_bytes + if total > 0: + return total + elif mem_access_type == 'idle': + return 0 + print(f"ERROR: {kind}_{local_idx} has mem_access_type='{mem_access_type}' but no " + f"'n_transactions' 
and no geometry fields to derive it from.") + sys.exit(1) + + def _generate_pattern( + filepath: Path, + pattern_config: dict, + *, + is_hwpe: bool, + master_global_idx: int, + master_local_idx: int, + n_peers_of_kind: int, + append: bool, + ): + """Generate one pattern segment. append=True opens file in append mode. + Every pattern always writes a trailing PAUSE (handled by the generator).""" nonlocal next_start_id - data_width = HWPE_WIDTH * DATA_WIDTH if is_hwpe else DATA_WIDTH - n_test = N_TEST_HWPE if is_hwpe else N_TEST_LOG - cycle_offset = CYCLE_OFFSET_HWPE if is_hwpe else CYCLE_OFFSET_LOG + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + kind = 'master_hwpe' if is_hwpe else 'master_log' master = StimuliGenerator( - IW, WIDTH_OF_MEMORY, N_BANKS, TOT_MEM_SIZE, data_width, ADD_WIDTH, - str(filepath), n_test, EXACT_OR_MAX_OFFSET, cycle_offset, master_global_idx + IW, DATA_WIDTH, N_BANKS, TOT_MEM_SIZE, data_width, ADD_WIDTH, + str(filepath), 0, master_global_idx ) - config = str(master_config.get('mem_access_type', '0')) - start_address = str(master_config.get('start_address', '0')) - stride0 = int(master_config.get('stride0', 0)) - len_d0 = int(master_config.get('len_d0', 0)) - stride1 = int(master_config.get('stride1', 0)) - len_d1 = int(master_config.get('len_d1', 0)) - stride2 = int(master_config.get('stride2', 0)) - - if config == '0': - next_start_id = master.random_gen(next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '1': - next_start_id = master.linear_gen(stride0, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '2': - next_start_id = master.gen_2d(stride0, len_d0, stride1, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '3': - next_start_id = master.gen_3d(stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, 
LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - - def _gen_hwpe_master(master_idx, master_config, global_idx): - nonlocal next_start_id - filepath = raw_dir / f"master_hwpe_{master_idx}.txt" - master = StimuliGenerator(IW, WIDTH_OF_MEMORY, N_BANKS, TOT_MEM_SIZE, HWPE_WIDTH * DATA_WIDTH, ADD_WIDTH, - str(filepath), N_TEST_HWPE, EXACT_OR_MAX_OFFSET, CYCLE_OFFSET_HWPE, global_idx) - config = str(master_config.get('mem_access_type', '0')) - start_address = str(master_config.get('start_address', '0')) - stride0 = int(master_config.get('stride0', 0)) - len_d0 = int(master_config.get('len_d0', 0)) - stride1 = int(master_config.get('stride1', 0)) - len_d1 = int(master_config.get('len_d1', 0)) - stride2 = int(master_config.get('stride2', 0)) - - if config == '0': - next_start_id = master.random_gen(next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '1': - next_start_id = master.linear_gen(stride0, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '2': - next_start_id = master.gen_2d(stride0, len_d0, stride1, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) - elif config == '3': - next_start_id = master.gen_3d(stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, LIST_OF_FORBIDDEN_ADDRESSES_READ, LIST_OF_FORBIDDEN_ADDRESSES_WRITE) + if 'mem_access_type' not in pattern_config: + print(f"ERROR: {kind}_{master_local_idx} pattern is missing mem_access_type.") + sys.exit(1) + + config = _normalize_mem_access_type( + pattern_config['mem_access_type'], + f"{kind}_{master_local_idx}", + ) + if 'start_address' in pattern_config: + start_address = str(pattern_config['start_address']) + elif config == 'linear' and 'region_base_address' in pattern_config: + start_address = str(pattern_config['region_base_address']) + else: + start_address = '0' + + if 'stride0' in pattern_config: + 
stride0 = int(pattern_config['stride0']) + elif config == 'linear' and 'region_size_bytes' in pattern_config: + stride0 = 1 + else: + stride0 = 0 + len_d0 = int(pattern_config.get('len_d0', 0)) + stride1 = int(pattern_config.get('stride1', 0)) + len_d1 = int(pattern_config.get('len_d1', 0)) + stride2 = int(pattern_config.get('stride2', 0)) + + total_mem_bytes = int(TOT_MEM_SIZE * 1024) + access_bytes = max(1, int(data_width // 8)) + default_region_size = total_mem_bytes // max(1, n_peers_of_kind) + default_region_base = master_local_idx * default_region_size + + region_base = _parse_maybe_bin_int(pattern_config.get('region_base_address'), default_region_base) + + # For non-random region-based patterns, if n_transactions is provided but + # region_size_bytes is omitted, span a full non-wrapping region that can + # hold all transactions once at the current transaction width. + region_size_input = pattern_config.get('region_size_bytes') + if ( + config in {'linear', 'matmul_phased', 'matmul_tiled_interleave'} + and region_size_input is None + and 'n_transactions' in pattern_config + ): + region_size_input = int(pattern_config['n_transactions']) * access_bytes + + region_size = _parse_maybe_bin_int(region_size_input, default_region_size) + + region_base = (region_base // access_bytes) * access_bytes + if region_base >= total_mem_bytes: + region_base = region_base % total_mem_bytes + region_size = (max(0, region_size) // access_bytes) * access_bytes + if region_size <= 0: + region_size = (default_region_size // access_bytes) * access_bytes + if region_base + region_size > total_mem_bytes: + region_size = ((total_mem_bytes - region_base) // access_bytes) * access_bytes + + n_test = _resolve_n_transactions(pattern_config, config, data_width, kind, master_local_idx) + master.N_TEST = n_test + # Read/write blocked filtering is pattern-local only. 
+ read_blocked_local = [] + write_blocked_local = [] + tpct_raw = pattern_config.get('traffic_pct', 100) + tpct = 100 if tpct_raw is None else int(tpct_raw) + + multi_regions_cfg = [] + for reg in pattern_config.get('regions', []) or []: + multi_regions_cfg.append({ + 'base': _parse_maybe_bin_int(reg.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(reg.get('size_bytes'), 0), + 'stride_words': int(reg.get('stride_words', 1)), + 'read_pct': reg.get('read_pct'), + }) + + read_regions_cfg = [] + for reg in pattern_config.get('read_regions', []) or []: + read_regions_cfg.append({ + 'base': _parse_maybe_bin_int(reg.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(reg.get('size_bytes'), 0), + }) + wr_cfg_raw = pattern_config.get('write_region', {}) or {} + write_region_cfg = { + 'base': _parse_maybe_bin_int(wr_cfg_raw.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(wr_cfg_raw.get('size_bytes'), 0), + } + + hot_regions_cfg = [] + for reg in pattern_config.get('hot_regions', []) or []: + hot_regions_cfg.append({ + 'base': _parse_maybe_bin_int(reg.get('base'), 0), + 'size_bytes': _parse_maybe_bin_int(reg.get('size_bytes'), 0), + 'weight': int(reg.get('weight', 1)), + }) + + if config == 'random': + next_start_id = master.random_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + region_base=region_base, + region_size=region_size, + traffic_pct=tpct, + traffic_read_pct=pattern_config.get('traffic_read_pct'), + append=append, + ) + elif config == 'linear': + next_start_id = master.linear_gen( + stride0, start_address, next_start_id, + read_blocked_local, + write_blocked_local, + traffic_pct=tpct, + traffic_read_pct=pattern_config.get('traffic_read_pct'), + append=append, + ) + elif config == '2d': + next_start_id = master.gen_2d( + stride0, len_d0, stride1, start_address, next_start_id, + read_blocked_local, + write_blocked_local, + idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), + append=append, + ) + elif 
config == '3d': + next_start_id = master.gen_3d( + stride0, len_d0, stride1, len_d1, stride2, start_address, next_start_id, + read_blocked_local, + write_blocked_local, + idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), + append=append, + ) + elif config == 'idle': + next_start_id = master.idle_gen(next_start_id, append=append) + elif config == 'matmul_phased': + if not is_hwpe: + print( + f"WARNING: mem_access_type='matmul_phased' is typically used for HWPE masters; " + f"{kind}_{master_local_idx} will still use requested phased behavior." + ) + min_region_size = 3 * access_bytes + if region_size < min_region_size: + print( + f"ERROR: {kind}_{master_local_idx} region_size_bytes=" + f"{region_size} is too small for matmul_phased (minimum {min_region_size})." + ) + sys.exit(1) + next_start_id = master.matmul_phased_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + region_base, + region_size, + int(pattern_config.get('matmul_ratio_a', 1)), + int(pattern_config.get('matmul_ratio_b', 1)), + int(pattern_config.get('matmul_ratio_c', 1)), + traffic_pct=int(pattern_config.get('traffic_pct', 100)), + idle_cycles_between_phases=int(pattern_config.get('idle_cycles_between_phases', 0)), + region_base_address_a=_parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None), + region_size_bytes_a=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None), + region_base_address_b=_parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None), + region_size_bytes_b=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None), + region_base_address_c=_parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None), + region_size_bytes_c=_parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None), + append=append, + ) + elif config == 'multi_linear': + next_start_id = master.multi_linear_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + 
regions=multi_regions_cfg, + schedule=pattern_config.get('schedule', 'round_robin'), + burst_len=int(pattern_config.get('burst_len', 1)), + traffic_pct=tpct, + append=append, + ) + elif config == 'bank_group_linear': + next_start_id = master.bank_group_linear_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + start_bank=int(pattern_config.get('start_bank', 0)), + bank_group_span=int(pattern_config.get('bank_group_span', 1)), + stride_beats=int(pattern_config.get('stride_beats', 1)), + bank_group_hop=int(pattern_config.get('bank_group_hop', 0)), + wen=pattern_config.get('wen'), + traffic_pct=tpct, + append=append, + ) + elif config == 'rw_rowwise': + next_start_id = master.rw_rowwise_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + row_base_address=_parse_maybe_bin_int(pattern_config.get('row_base_address'), region_base), + row_size_bytes=_parse_maybe_bin_int(pattern_config.get('row_size_bytes'), access_bytes), + n_rows=int(pattern_config.get('n_rows', 1)), + row_stride_bytes=_parse_maybe_bin_int(pattern_config.get('row_stride_bytes'), access_bytes), + reads_per_row=int(pattern_config.get('reads_per_row', 0)), + writes_per_row=int(pattern_config.get('writes_per_row', 0)), + traffic_pct=tpct, + idle_cycles_between_rows=int(pattern_config.get('idle_cycles_between_rows', 0)), + append=append, + ) + elif config == 'gather_scatter': + next_start_id = master.gather_scatter_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + read_regions=read_regions_cfg, + write_region=write_region_cfg, + chunk_bytes=_parse_maybe_bin_int(pattern_config.get('chunk_bytes'), access_bytes), + schedule=pattern_config.get('schedule', '4read_1write'), + traffic_pct=tpct, + append=append, + ) + elif config == 'matmul_tiled_interleave': + ra = _parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None) + rb = 
_parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None) + sb = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None) + if ra is None or sa is None or rb is None or sb is None or rc is None or sc is None: + # Fallback to split the combined region into A/B/C thirds. + n_words = max(3, region_size // access_bytes) + a_words = max(1, n_words // 3) + b_words = max(1, n_words // 3) + c_words = max(1, n_words - a_words - b_words) + ra = region_base + sa = a_words * access_bytes + rb = ra + sa + sb = b_words * access_bytes + rc = rb + sb + sc = c_words * access_bytes + next_start_id = master.matmul_tiled_interleave_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + region_base_address_a=ra, + region_size_bytes_a=sa, + region_base_address_b=rb, + region_size_bytes_b=sb, + region_base_address_c=rc, + region_size_bytes_c=sc, + tile_a_bytes=_parse_maybe_bin_int(pattern_config.get('tile_a_bytes'), access_bytes), + tile_b_bytes=_parse_maybe_bin_int(pattern_config.get('tile_b_bytes'), access_bytes), + tile_c_bytes=_parse_maybe_bin_int(pattern_config.get('tile_c_bytes'), access_bytes), + tiles=int(pattern_config.get('tiles', 1)), + ab_c_schedule=pattern_config.get('ab_c_schedule', 'A_B_C'), + traffic_pct=tpct, + idle_cycles_between_tiles=int(pattern_config.get('idle_cycles_between_tiles', 0)), + append=append, + ) + elif config == 'hotspot_random': + next_start_id = master.hotspot_random_gen( + next_start_id, + read_blocked_local, + write_blocked_local, + hot_regions=hot_regions_cfg, + traffic_pct=tpct, + traffic_read_pct=pattern_config.get('traffic_read_pct'), + append=append, + ) + + _record_memory_map( + kind, master_local_idx, + pattern_config.get('description', ''), + config, n_test, data_width, access_bytes, + region_base, region_size, + start_address, stride0, len_d0, 
stride1, len_d1, stride2, + pattern_config, total_mem_bytes, + ) + + def _generate_master( + filepath: Path, + master_config: dict, + *, + is_hwpe: bool, + master_global_idx: int, + master_local_idx: int, + n_peers_of_kind: int, + ): + """Generate stimulus for a master, supporting single flat pattern or patterns list.""" + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + + # Resolve pattern list: either explicit 'patterns' list or a single flat pattern + if 'patterns' in master_config: + patterns = master_config['patterns'] + if not patterns: + kind = 'master_hwpe' if is_hwpe else 'master_log' + print(f"ERROR: {kind}_{master_local_idx} has empty patterns list.") + sys.exit(1) + else: + # Legacy flat format: treat the master config itself as a single pattern + patterns = [master_config] + + # Start delay applies to the whole master (prepended before first pattern) + start_delay = int(master_config.get('start_delay_cycles', 0)) + if start_delay > 0: + pending_start_delays.append((filepath, start_delay, data_width)) + + # For each pattern with wait_for_jobs, prepend a synthetic idle+PAUSE that acts as + # the blocking fence. The pattern's own trailing PAUSE is always mask=0 (free + # pass), so fence_idx advances immediately after the real work is done. + # This separates "I am done" (trailing PAUSE, free) from "I may start" (idle + # gate, blocking), giving resume_i a single clean meaning: start your next job. 
+ dw = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + first_written = False + for p_idx, pattern_config in enumerate(patterns): + if _pattern_wait_for_jobs(pattern_config): + # Synthetic idle+PAUSE gates this pattern + _idle = StimuliGenerator(IW, DATA_WIDTH, N_BANKS, TOT_MEM_SIZE, + dw, ADD_WIDTH, str(filepath), 0, master_global_idx) + _idle.N_TEST = 0 + _idle.idle_gen(next_start_id, append=first_written) + first_written = True + _generate_pattern( + filepath, + pattern_config, + is_hwpe=is_hwpe, + master_global_idx=master_global_idx, + master_local_idx=master_local_idx, + n_peers_of_kind=n_peers_of_kind, + append=first_written, + ) + first_written = True global_idx = 0 - # Generate logarithmic masters (CORE, DMA, EXT) in order + + # Generate LOG masters (CORE, DMA, EXT) in order for i in range(N_LOG): - # determine class of this master (core/dma/ext) if i < N_CORE: if CORE_ZERO_FLAG: global_idx += 1 @@ -233,66 +952,768 @@ def _gen_hwpe_master(master_idx, master_config, global_idx): continue master_cfg = log_masters[i] - _generate_master(raw_dir / f"master_log_{i}.txt", master_cfg, is_hwpe=False, master_global_idx=global_idx) + _warn_if_id_mismatch(master_cfg, i, f"master_log_{i}") + _generate_master( + stimuli_dir / f"master_log_{i}.txt", + master_cfg, + is_hwpe=False, + master_global_idx=global_idx, + master_local_idx=i, + n_peers_of_kind=max(1, N_LOG), + ) global_idx += 1 - # Generate HWPE masters; their global index follows the previous masters + # Generate HWPE masters for hw_idx in range(N_HWPE): if HWPE_ZERO_FLAG: global_idx += 1 continue master_cfg = hwpe_masters[hw_idx] - _generate_master(raw_dir / f"master_hwpe_{hw_idx}.txt", master_cfg, is_hwpe=True, master_global_idx=global_idx) + _warn_if_id_mismatch(master_cfg, hw_idx, f"master_hwpe_{hw_idx}") + _generate_master( + stimuli_dir / f"master_hwpe_{hw_idx}.txt", + master_cfg, + is_hwpe=True, + master_global_idx=global_idx, + master_local_idx=hw_idx, + n_peers_of_kind=max(1, N_HWPE), + ) 
global_idx += 1 - print("STEP 0 COMPLETED: create raw txt files") + print("STEP 0 COMPLETED: generate stimuli files") + + # ----------------------------------------------------------------------- + # Compute FENCE_MASKS and emit fence_masks.mk + # + # Fence slot f corresponds to the PAUSE before pattern f in the stimulus + # file (i.e. between pattern f-1 and pattern f). The mask at slot f holds + # the set of drivers that must have passed fence f before this driver can + # resume from that PAUSE. + # + # For a master with N patterns, there are N fence slots (slot 0 = before + # pattern 0, slot f = before pattern f). The wait_for_jobs of pattern f defines + # the mask at fence slot f. + # + # Legacy flat masters (no 'patterns' key) are treated as single-pattern + # masters: one fence slot (slot 0) from the top-level wait_for_jobs field. + # ----------------------------------------------------------------------- + N_DRIVERS = N_LOG + N_HWPE + + def _patterns_of(master_config): + """Return the list of pattern configs for a master.""" + if 'patterns' in master_config: + return master_config['patterns'] + return [master_config] + + # Build job->driver map: every pattern of every driver registers its job. + # This allows wait_for_jobs to reference any job, not just first patterns. + # A job may be associated with multiple drivers (e.g. 8 cores all in softmax_t0). + job_to_drivers = {} + all_masters = [(m, False) for m in log_masters] + [(m, True) for m in hwpe_masters] + for i, (m, _) in enumerate(all_masters): + for pat in _patterns_of(m): + job = _pattern_job_name(pat) + if i not in job_to_drivers.get(job, []): + job_to_drivers.setdefault(job, []).append(i) + + # Build job->pattern_index map: for each job, which pattern index within + # each driver corresponds to that job. Used to compute FENCE_REQ_LEVELS. 
+ # job_pattern_idx[job][driver] = pattern index of that job in that driver + job_pattern_idx = {} + for i, (m, _) in enumerate(all_masters): + for p_idx, pat in enumerate(_patterns_of(m)): + job = _pattern_job_name(pat) + job_pattern_idx.setdefault(job, {})[i] = p_idx + + def _resolve_wait_mask(wait_for_jobs_list): + mask = 0 + for dep_job in wait_for_jobs_list: + for dep_drv in job_to_drivers.get(str(dep_job), []): + mask |= (1 << dep_drv) + return mask + + # Precompute per-driver fence_idx value after finishing pattern p: + # = number of fences (synthetic idle gates + trailing PAUSEs) passed up to and + # including the trailing PAUSE of pattern p. + def _fence_idx_after_pattern(drv_idx, pat_idx): + pats = _patterns_of(all_masters[drv_idx][0]) + # Count synthetic idle gates for patterns 0..pat_idx (those with wait_for_jobs) + n_gates = sum(1 for k in range(pat_idx + 1) if _pattern_wait_for_jobs(pats[k])) + # Plus trailing PAUSEs for patterns 0..pat_idx + n_trailing = pat_idx + 1 + return n_gates + n_trailing + + def _resolve_req_levels(wait_for_jobs_list): + """Required fence_idx[j] = fence_idx value of j after finishing pattern p_j.""" + levels = [0] * N_DRIVERS + for dep_job in wait_for_jobs_list: + for dep_drv in job_to_drivers.get(str(dep_job), []): + p_idx = job_pattern_idx.get(str(dep_job), {}).get(dep_drv, 0) + levels[dep_drv] = _fence_idx_after_pattern(dep_drv, p_idx) + return levels + + # Build per-driver fence mask and req_level lists. + # Each pattern with wait_for_jobs gets a synthetic idle gate (mask = wait_for_jobs) before it. + # Trailing PAUSEs always have mask=0 (free pass — just advance fence_idx). 
+ # Fences are enumerated in file order: for each pattern p: + # if p has wait_for_jobs: synthetic idle fence (mask = wait_for_jobs of p) + # trailing PAUSE fence (mask = 0) + fence_masks = [] + req_levels = [] + for i, (m, _) in enumerate(all_masters): + patterns = _patterns_of(m) + per_masks = [] + per_levels = [] + for pat in patterns: + wait_for_jobs = _pattern_wait_for_jobs(pat) + if wait_for_jobs: + # Synthetic idle gate: blocking fence + per_masks.append(_resolve_wait_mask(wait_for_jobs)) + per_levels.append(_resolve_req_levels(wait_for_jobs)) + # Trailing PAUSE: free pass, just signals completion + per_masks.append(0) + per_levels.append([0] * N_DRIVERS) + fence_masks.append(per_masks) + req_levels.append(per_levels) + + max_fences = max((len(fm) for fm in fence_masks), default=1) + + # Pad to max_fences + for i in range(N_DRIVERS): + while len(fence_masks[i]) < max_fences: + fence_masks[i].append(0) + while len(req_levels[i]) < max_fences: + req_levels[i].append([0] * N_DRIVERS) + + # Emit SV literals + hex_width = max(1, (N_DRIVERS + 3) // 4) + per_driver_literals = [] + for i in range(N_DRIVERS): + slot_literals = [f"{N_DRIVERS}'h{fence_masks[i][f]:0{hex_width}x}" for f in range(max_fences)] + per_driver_literals.append("'{" + ", ".join(slot_literals) + "}") + fence_masks_param = "'{" + ", ".join(per_driver_literals) + "}" + + # FENCE_REQ_LEVELS[N_DRIVERS][MAX_FENCES][N_DRIVERS] — int unsigned + # Pack FENCE_REQ_LEVELS as FENCE_REQ_LEVELS_PACKED[i][f] = N_DRIVERS*4-bit vector. + # Bits [j*4+3:j*4] = required fence_idx[j] (4 bits, supports 0..15). + max_req_level = 0 + for i in range(N_DRIVERS): + for f in range(max_fences): + for j in range(N_DRIVERS): + max_req_level = max(max_req_level, int(req_levels[i][f][j])) + if max_req_level > 15: + print( + "ERROR: Fence dependency level overflow: " + f"required fence_idx={max_req_level}, but packed format supports only 0..15. " + "Reduce the number of fence crossings per dependent job or widen LEVEL_BITS." 
+ ) + sys.exit(1) + + LEVEL_BITS = 4 + packed_width = N_DRIVERS * LEVEL_BITS + packed_hex_digits = (packed_width + 3) // 4 + req_driver_literals = [] + for i in range(N_DRIVERS): + fence_literals = [] + for f in range(max_fences): + val = 0 + for j in range(N_DRIVERS): + val |= (req_levels[i][f][j] & 0xF) << (j * LEVEL_BITS) + fence_literals.append(f"{packed_width}'h{val:0{packed_hex_digits}x}") + req_driver_literals.append("'{" + ", ".join(fence_literals) + "}") + fence_req_levels_packed_param = "'{" + ", ".join(req_driver_literals) + "}" + + if args.emit_phases_mk: + phases_mk_path = Path(args.emit_phases_mk) + phases_mk_path.parent.mkdir(parents=True, exist_ok=True) + phases_mk_path.write_text( + "# Auto-generated by main.py - DO NOT EDIT MANUALLY\n" + "# Per-driver per-fence dependency data for tb_hci.sv.\n" + f"# Drivers 0..{N_LOG-1} = narrow masters (core/dma/ext), {N_LOG}..{N_DRIVERS-1} = HWPE masters.\n" + f"# fence f = PAUSE after pattern f; fence_idx[i]==k means i completed k patterns.\n" + f"# FENCE_MASKS[i][f][j]=1: j is a dependency of i at fence f.\n" + f"# FENCE_REQ_LEVELS_PACKED[i][f]: packed {N_DRIVERS*4}-bit vector, bits [j*4+3:j*4] = min fence_idx[j].\n" + f"MAX_FENCES_PARAM := {max_fences}\n" + f"FENCE_MASKS_PARAM := {fence_masks_param}\n" + f"FENCE_REQ_LEVELS_PACKED_PARAM := {fence_req_levels_packed_param}\n", + encoding='utf-8', + ) + print(f"FENCE_MASKS.MK written: {phases_mk_path}") + + # ----------------------------------------------------------------------- + # Build and emit memory map report + # ----------------------------------------------------------------------- + INTERCO_TYPE = str(hw_params.get('INTERCO_TYPE', 'HCI')).strip().upper() + if INTERCO_TYPE not in {"LOG", "MUX", "HCI"}: + INTERCO_TYPE = "HCI" + DW_NARROW = int(DATA_WIDTH) + DW_WIDE = int(HWPE_WIDTH_FACT * DATA_WIDTH) + N_NARROW_HCI_CFG = int( + N_CORE_CFG + N_DMA_CFG + N_EXT_CFG + + (N_HWPE_CFG * HWPE_WIDTH_FACT if INTERCO_TYPE == "LOG" else 0) + ) + N_WIDE_HCI_CFG = 
int(N_HWPE_CFG if INTERCO_TYPE == "HCI" else (1 if INTERCO_TYPE == "MUX" else 0)) + N_MASTER_PORTS_CFG = int(N_NARROW_HCI_CFG + N_WIDE_HCI_CFG) + + def _driver_name(driver_idx): + if driver_idx < N_LOG: + return _narrow_driver_name(driver_idx) + return f"hwpe_{driver_idx - N_LOG}" + + def _resolve_regions(pattern_config, mem_access_type, is_hwpe, local_idx, n_peers): + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + access_bytes = max(1, int(data_width // 8)) + total_mem_bytes = int(TOT_MEM_SIZE * 1024) + default_region_size = total_mem_bytes // max(1, n_peers) + default_region_base = local_idx * default_region_size + + region_base = _parse_maybe_bin_int(pattern_config.get('region_base_address'), default_region_base) + region_size = _parse_maybe_bin_int(pattern_config.get('region_size_bytes'), default_region_size) + + region_base = (region_base // access_bytes) * access_bytes + if region_base >= total_mem_bytes: + region_base = region_base % total_mem_bytes + region_size = (max(0, region_size) // access_bytes) * access_bytes + if region_size <= 0: + region_size = (default_region_size // access_bytes) * access_bytes + if region_base + region_size > total_mem_bytes: + region_size = ((total_mem_bytes - region_base) // access_bytes) * access_bytes + + if region_size <= 0: + return [] + + if mem_access_type == 'idle': + return [] + if mem_access_type == 'multi_linear': + regions = [] + for idx, reg in enumerate(pattern_config.get('regions', []) or []): + base = _parse_maybe_bin_int(reg.get('base'), region_base) + size = _parse_maybe_bin_int(reg.get('size_bytes'), region_size) + base = (base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, size) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + continue + rpct = reg.get('read_pct') + if rpct is None: + lbl = f"R{idx}" + else: + lbl = 
f"R{idx}({'read' if int(rpct) >= 50 else 'write'})" + regions.append({ + 'label': lbl, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + if mem_access_type == 'bank_group_linear': + span = max(1, int(pattern_config.get('bank_group_span', 1))) + start_bank = int(pattern_config.get('start_bank', 0)) % max(1, int(N_BANKS)) + n_tx = _parse_maybe_bin_int(pattern_config.get('n_transactions'), 1) + n_tx = max(1, int(n_tx)) + rows = max(1, math.ceil(n_tx / span)) + size = min(total_mem_bytes, rows * span * access_bytes) + base = (start_bank * access_bytes) % max(1, total_mem_bytes) + if base + size > total_mem_bytes: + size = max(access_bytes, total_mem_bytes - base) + return [{ + 'label': 'bank_group', + 'base': base, + 'size': size, + 'end': base + size - 1, + }] + if mem_access_type == 'rw_rowwise': + row_base = _parse_maybe_bin_int(pattern_config.get('row_base_address'), region_base) + row_size = _parse_maybe_bin_int(pattern_config.get('row_size_bytes'), access_bytes) + n_rows = max(1, int(pattern_config.get('n_rows', 1))) + row_stride = _parse_maybe_bin_int(pattern_config.get('row_stride_bytes'), row_size) + base = (row_base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = ((max(0, row_stride) * max(0, n_rows - 1)) + max(0, row_size)) + size = (size // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + size = access_bytes + return [{ + 'label': 'rowwise', + 'base': base, + 'size': size, + 'end': base + size - 1, + }] + if mem_access_type == 'gather_scatter': + regions = [] + for idx, reg in enumerate(pattern_config.get('read_regions', []) or []): + base = _parse_maybe_bin_int(reg.get('base'), region_base) + size = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + base = (base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, 
size) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + continue + regions.append({ + 'label': f"gather_{idx}(read)", + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + wr = pattern_config.get('write_region', {}) or {} + wb = _parse_maybe_bin_int(wr.get('base'), region_base) + ws = _parse_maybe_bin_int(wr.get('size_bytes'), 0) + wb = (wb // access_bytes) * access_bytes + if wb >= total_mem_bytes: + wb = wb % total_mem_bytes + ws = (max(0, ws) // access_bytes) * access_bytes + if wb + ws > total_mem_bytes: + ws = ((total_mem_bytes - wb) // access_bytes) * access_bytes + if ws > 0: + regions.append({ + 'label': 'scatter(write)', + 'base': wb, + 'size': ws, + 'end': wb + ws - 1, + }) + return regions + if mem_access_type == 'hotspot_random': + regions = [] + for idx, reg in enumerate(pattern_config.get('hot_regions', []) or []): + base = _parse_maybe_bin_int(reg.get('base'), region_base) + size = _parse_maybe_bin_int(reg.get('size_bytes'), 0) + base = (base // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, size) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size <= 0: + continue + regions.append({ + 'label': f"hot_{idx}", + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + if mem_access_type == 'matmul_tiled_interleave': + ra = _parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None) + sb = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None) + sc = 
_parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None) + regions = [] + if ra is not None and sa is not None and rb is not None and sb is not None and rc is not None and sc is not None: + sub_defs = [ + ('A(read)', ra, sa), + ('B(read)', rb, sb), + ('C(write)', rc, sc), + ] + else: + n_words = max(3, region_size // access_bytes) + a_words = max(1, n_words // 3) + b_words = max(1, n_words // 3) + c_words = max(1, n_words - a_words - b_words) + sub_defs = [ + ('A(read)', region_base, a_words * access_bytes), + ('B(read)', region_base + a_words * access_bytes, b_words * access_bytes), + ('C(write)', region_base + (a_words + b_words) * access_bytes, c_words * access_bytes), + ] + for label, base_raw, size_raw in sub_defs: + base = (int(base_raw) // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, int(size_raw)) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size > 0: + regions.append({ + 'label': label, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + + if mem_access_type != 'matmul_phased': + return [{ + 'label': 'region', + 'base': region_base, + 'size': region_size, + 'end': region_base + region_size - 1, + }] + + ra = _parse_maybe_bin_int(pattern_config.get('region_base_address_a'), None) + sa = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_a'), None) + rb = _parse_maybe_bin_int(pattern_config.get('region_base_address_b'), None) + sb = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_b'), None) + rc = _parse_maybe_bin_int(pattern_config.get('region_base_address_c'), None) + sc = _parse_maybe_bin_int(pattern_config.get('region_size_bytes_c'), None) + + regions = [] + if ra is not None and sa is not None: + sub_defs = [ + ('A(read)', ra, sa), + ('B(read)', rb if rb is not None else ra, sb if sb is not None else sa), + ('C(write)', rc if rc is not 
None else ra, sc if sc is not None else sa), + ] + for label, base_raw, size_raw in sub_defs: + base = (int(base_raw) // access_bytes) * access_bytes + if base >= total_mem_bytes: + base = base % total_mem_bytes + size = (max(0, int(size_raw)) // access_bytes) * access_bytes + if base + size > total_mem_bytes: + size = ((total_mem_bytes - base) // access_bytes) * access_bytes + if size > 0: + regions.append({ + 'label': label, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + + n_words = region_size // access_bytes + if n_words < 3: + return [{ + 'label': 'region', + 'base': region_base, + 'size': region_size, + 'end': region_base + region_size - 1, + }] + a_words = max(1, n_words // 3) + b_words = max(1, n_words // 3) + c_words = n_words - a_words - b_words + sub_regions = [ + ('A(read)', region_base, a_words * access_bytes), + ('B(read)', region_base + a_words * access_bytes, b_words * access_bytes), + ('C(write)', region_base + (a_words + b_words) * access_bytes, c_words * access_bytes), + ] + for label, base, size in sub_regions: + if size <= 0: + continue + regions.append({ + 'label': label, + 'base': base, + 'size': size, + 'end': base + size - 1, + }) + return regions + + def _estimate_pattern_cycles(pattern_config, _mem_access_type, n_test, _txn_bytes): + # Temporal model intentionally follows emitted traffic only: + # one unit per transaction plus req=0 idles from traffic_pct shaping. + # No absolute/phase/tile/row cycle estimation is applied here. 
+ base = max(0, int(n_test)) + tpct = pattern_config.get('traffic_pct') + n_idles_per_req = 0 + if tpct is not None: + tp = max(1, min(100, int(tpct))) + n_idles_per_req = 0 if tp >= 100 else int(round((100 - tp) / tp)) + return int(base * (1 + n_idles_per_req)) + + pattern_nodes = [] + node_idx_by_driver_pattern = {} + job_to_nodes = {} + driver_last_node = {} + + for drv_idx, (master_cfg, is_hwpe) in enumerate(all_masters): + patterns = _patterns_of(master_cfg) + local_idx = drv_idx - N_LOG if is_hwpe else drv_idx + data_width = HWPE_WIDTH_FACT * DATA_WIDTH if is_hwpe else DATA_WIDTH + kind = 'master_hwpe' if is_hwpe else 'master_log' + n_peers = max(1, N_HWPE if is_hwpe else N_LOG) + start_delay = int(master_cfg.get('start_delay_cycles', 0)) + for p_idx, pat in enumerate(patterns): + raw_type = pat.get('mem_access_type', 'idle') + mem_access_type = _normalize_mem_access_type(raw_type, f"{kind}_{local_idx}") + n_test = _resolve_n_transactions(pat, mem_access_type, data_width, kind, local_idx) + declared_wait_for_jobs = _pattern_wait_for_jobs(pat) + # Timeline view follows declared dependencies from workload.json. 
+ effective_wait_for_jobs = declared_wait_for_jobs + node = { + 'node_idx': len(pattern_nodes), + 'driver_idx': drv_idx, + 'driver_name': _driver_name(drv_idx), + 'is_hwpe': is_hwpe, + 'local_idx': local_idx, + 'pattern_idx': p_idx, + 'description': str(pat.get('description', '')).strip(), + 'job': _pattern_job_name(pat), + 'wait_for_jobs_declared': declared_wait_for_jobs, + 'wait_for_jobs_effective': effective_wait_for_jobs, + 'n_transactions': int(n_test), + 'cycles': int(_estimate_pattern_cycles(pat, mem_access_type, n_test, int(data_width // 8))), + 'mem_access_type': mem_access_type, + 'traffic_read_pct': pat.get('traffic_read_pct'), + 'txn_bytes': int(data_width // 8), + 'start_delay': start_delay if p_idx == 0 else 0, + 'regions': _resolve_regions(pat, mem_access_type, is_hwpe, local_idx, n_peers), + } + pattern_nodes.append(node) + node_idx_by_driver_pattern[(drv_idx, p_idx)] = node['node_idx'] + job_to_nodes.setdefault(node['job'], []).append(node['node_idx']) + driver_last_node[drv_idx] = node['node_idx'] + + n_nodes = len(pattern_nodes) + preds = [set() for _ in range(n_nodes)] + succs = [set() for _ in range(n_nodes)] + mux_serialization_applied = False + mux_phase_order = [] + + def _add_edge(src, dst): + if src == dst or src < 0 or dst < 0: + return + if src not in preds[dst]: + preds[dst].add(src) + succs[src].add(dst) + + for node in pattern_nodes: + n_idx = node['node_idx'] + drv_idx = node['driver_idx'] + p_idx = node['pattern_idx'] + if p_idx > 0: + _add_edge(node_idx_by_driver_pattern[(drv_idx, p_idx - 1)], n_idx) + for dep_job in node['wait_for_jobs_effective']: + for dep_idx in job_to_nodes.get(dep_job, []): + _add_edge(dep_idx, n_idx) + + if INTERCO_TYPE == "MUX": + # Match tb_hci MUX semantics in the temporal model: + # serialize HWPE execution by job order, and by HWPE ID within a job. 
+ hwpe_nodes = [n for n in pattern_nodes if n['is_hwpe']] + if hwpe_nodes: + job_first_seen = {} + for n in sorted(hwpe_nodes, key=lambda x: (x['pattern_idx'], x['local_idx'], x['node_idx'])): + job_first_seen.setdefault(n['job'], len(job_first_seen)) + + job_preds = {jb: set() for jb in job_first_seen} + job_succs = {jb: set() for jb in job_first_seen} + for n in hwpe_nodes: + cur = n['job'] + for dep_job in n['wait_for_jobs_effective']: + dep = str(dep_job) + if dep in job_first_seen and dep != cur: + job_preds[cur].add(dep) + job_succs[dep].add(cur) + + phase_indeg = {jb: len(job_preds[jb]) for jb in job_first_seen} + phase_ready = sorted([jb for jb, deg in phase_indeg.items() if deg == 0], + key=lambda jb: job_first_seen[jb]) + mux_phase_order = [] + while phase_ready: + cur = phase_ready.pop(0) + mux_phase_order.append(cur) + for nxt in sorted(job_succs[cur], key=lambda jb: job_first_seen[jb]): + phase_indeg[nxt] -= 1 + if phase_indeg[nxt] == 0: + phase_ready.append(nxt) + phase_ready.sort(key=lambda jb: job_first_seen[jb]) + if len(mux_phase_order) != len(job_first_seen): + mux_phase_order = sorted(job_first_seen.keys(), key=lambda jb: job_first_seen[jb]) + + phase_rank = {ph: i for i, ph in enumerate(mux_phase_order)} + hwpe_sorted = sorted( + hwpe_nodes, + key=lambda n: ( + phase_rank.get(n['job'], 10 ** 9), + n['local_idx'], + n['pattern_idx'], + n['node_idx'], + ), + ) + for i in range(1, len(hwpe_sorted)): + _add_edge(hwpe_sorted[i - 1]['node_idx'], hwpe_sorted[i]['node_idx']) + mux_serialization_applied = True + + indeg = [len(preds[i]) for i in range(n_nodes)] + ready = [i for i, d in enumerate(indeg) if d == 0] + ready.sort(key=lambda i: (pattern_nodes[i]['driver_idx'], pattern_nodes[i]['pattern_idx'])) + topo_order = [] + while ready: + cur = ready.pop(0) + topo_order.append(cur) + for nxt in sorted(succs[cur]): + indeg[nxt] -= 1 + if indeg[nxt] == 0: + ready.append(nxt) + ready.sort(key=lambda i: (pattern_nodes[i]['driver_idx'], 
pattern_nodes[i]['pattern_idx'])) - # Process raw files - simvector_raw_path = str(raw_dir) - simvector_processed_path = str((raw_dir.parent / 'stimuli_processed').resolve()) - unfold_raw_txt(simvector_raw_path, simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH) - print("STEP 1 COMPLETED: unfold txt files") + schedule_has_cycle = len(topo_order) != n_nodes + if schedule_has_cycle: + topo_order = list(range(n_nodes)) - pad_txt_files(simvector_processed_path, IW, DATA_WIDTH, ADD_WIDTH, HWPE_WIDTH) - print("STEP 2 COMPLETED: pad txt files") + node_start = [0 for _ in range(n_nodes)] + node_end = [0 for _ in range(n_nodes)] + for _ in range(max(1, n_nodes + 1)): + changed = False + for n_idx in topo_order: + dep_end = max((node_end[p] for p in preds[n_idx]), default=0) + start_time = max(int(pattern_nodes[n_idx]['start_delay']), dep_end) + end_time = start_time + max(0, int(pattern_nodes[n_idx]['cycles'])) + if start_time != node_start[n_idx] or end_time != node_end[n_idx]: + node_start[n_idx] = start_time + node_end[n_idx] = end_time + changed = True + if not changed: + break + for n_idx, node in enumerate(pattern_nodes): + node['start_cycle'] = int(node_start[n_idx]) + node['end_cycle'] = int(node_end[n_idx]) + + total_cycles = max((n['end_cycle'] for n in pattern_nodes), default=0) + + driver_windows = {} + for node in pattern_nodes: + w = driver_windows.setdefault(node['driver_idx'], { + 'driver_idx': node['driver_idx'], + 'name': node['driver_name'], + 'is_hwpe': node['is_hwpe'], + 'start': node['start_cycle'], + 'end': node['end_cycle'], + }) + w['start'] = min(w['start'], node['start_cycle']) + w['end'] = max(w['end'], node['end_cycle']) + + regions_timeline = {} + for node in pattern_nodes: + for reg in node['regions']: + reg_key = (reg['base'], reg['size'], reg['label']) + entry = regions_timeline.setdefault(reg_key, { + 'base': reg['base'], + 'size': reg['size'], + 'end': reg['end'], + 'label': reg['label'], + 'accesses': [], + }) + 
entry['accesses'].append({ + 'driver_idx': node['driver_idx'], + 'driver_name': node['driver_name'], + 'job': node['job'], + 'start': node['start_cycle'], + 'end': node['end_cycle'], + 'pattern_idx': node['pattern_idx'], + 'description': node['description'], + }) + for reg in regions_timeline.values(): + reg['lifetime_start'] = min((a['start'] for a in reg['accesses']), default=0) + reg['lifetime_end'] = max((a['end'] for a in reg['accesses']), default=0) + + # ----------------------------------------------------------------------- + # Build memory_map.txt + # ----------------------------------------------------------------------- + memory_map_path = generated_dir / 'memory_map.txt' + write_memory_map_txt( + memory_map_path=memory_map_path, + total_mem_size_kib=TOT_MEM_SIZE, + n_banks=N_BANKS, + data_width=DATA_WIDTH, + hwpe_data_width=HWPE_WIDTH_FACT * DATA_WIDTH, + n_core_cfg=N_CORE_CFG, + n_dma_cfg=N_DMA_CFG, + n_ext_cfg=N_EXT_CFG, + n_log_cfg=N_LOG_CFG, + n_hwpe_cfg=N_HWPE_CFG, + interco_type=INTERCO_TYPE, + dw_narrow=DW_NARROW, + dw_wide=DW_WIDE, + n_narrow_hci_cfg=N_NARROW_HCI_CFG, + n_wide_hci_cfg=N_WIDE_HCI_CFG, + memory_map_entries=memory_map_entries, + job_to_drivers=job_to_drivers, + driver_name_fn=_driver_name, + n_drivers=N_DRIVERS, + fence_masks=fence_masks, + total_cycles=total_cycles, + mux_serialization_applied=mux_serialization_applied, + mux_phase_order=mux_phase_order, + schedule_has_cycle=schedule_has_cycle, + driver_windows=driver_windows, + pattern_nodes=pattern_nodes, + regions_timeline=regions_timeline, + ) + print(f"Memory map written: {memory_map_path}") + + # ----------------------------------------------------------------------- + # Build dataflow.html (simple SVG timeline view) + # ----------------------------------------------------------------------- + dataflow_path = generated_dir / 'dataflow.html' + write_memory_lifetime_html( + memory_lifetime_path=dataflow_path, + pattern_nodes=pattern_nodes, + driver_windows=driver_windows, + 
regions_timeline=regions_timeline, + total_cycles=total_cycles, + mux_serialization_applied=mux_serialization_applied, + mux_phase_order=mux_phase_order, + schedule_has_cycle=schedule_has_cycle, + driver_name_fn=_driver_name, + interco_type=INTERCO_TYPE, + n_core_cfg=N_CORE_CFG, + n_dma_cfg=N_DMA_CFG, + n_ext_cfg=N_EXT_CFG, + n_hwpe_cfg=N_HWPE_CFG, + dw_narrow=DW_NARROW, + dw_wide=DW_WIDE, + n_narrow_hci_cfg=N_NARROW_HCI_CFG, + n_wide_hci_cfg=N_WIDE_HCI_CFG, + n_banks=N_BANKS, + ) + print(f"Dataflow plot written: {dataflow_path}") + + # ----------------------------------------------------------------------- + # Apply per-master start delays + # ----------------------------------------------------------------------- + for fpath, delay, dw in pending_start_delays: + if fpath.exists(): + idle_line = "0 " + "0" * IW + " 0 " + "0" * dw + " " + "0" * ADD_WIDTH + "\n" + original = fpath.read_text(encoding='ascii') + fpath.write_text(idle_line * delay + original, encoding='ascii') + + print("STEP 1 COMPLETED: generate documents and apply start delays to stimuli") + + # ----------------------------------------------------------------------- + # Golden vectors + # ----------------------------------------------------------------------- if args.golden: golden_dir = (generated_dir / 'golden').resolve() golden_dir.mkdir(parents=True, exist_ok=True) - for stim_path in sorted(processed_dir.glob('master_*.txt')): + for stim_path in sorted(stimuli_dir.glob('master_*.txt')): try: text = stim_path.read_text(encoding='ascii') except OSError: continue - if text.strip() == 'zero': - continue - mem = {} out_lines = [] for raw_line in text.splitlines(): line = raw_line.strip() if not line: continue - parts = line.split() if len(parts) != 5: continue - req_s, id_s, wen_s, data_s, add_s = parts if req_s != '1': continue - if wen_s == '0': mem[add_s] = data_s continue - exp_s = mem.get(add_s, '1' * len(data_s)) out_lines.append(f"{id_s} {add_s} {exp_s}") - (golden_dir / 
f"golden_{stim_path.name}").write_text("\n".join(out_lines) + ("\n" if out_lines else ""), encoding='ascii') - print("STEP 3 COMPLETED: golden vectors") + (golden_dir / f"golden_{stim_path.name}").write_text( + "\n".join(out_lines) + ("\n" if out_lines else ""), encoding='ascii' + ) + print("STEP 2 COMPLETED: golden vectors") if __name__ == '__main__': diff --git a/target/verif/simvectors/memory_report.py b/target/verif/simvectors/memory_report.py new file mode 100644 index 0000000..2da9cfb --- /dev/null +++ b/target/verif/simvectors/memory_report.py @@ -0,0 +1,119 @@ +"""Text memory map report generation.""" + +from pathlib import Path + + +def build_memory_map_text( + *, + total_mem_size_kib, + n_banks, + data_width, + hwpe_data_width, + n_core_cfg, + n_dma_cfg, + n_ext_cfg, + n_log_cfg, + n_hwpe_cfg, + interco_type, + dw_narrow, + dw_wide, + n_narrow_hci_cfg, + n_wide_hci_cfg, + memory_map_entries, + job_to_drivers, + driver_name_fn, + n_drivers, + fence_masks, + total_cycles, + mux_serialization_applied, + mux_phase_order, + schedule_has_cycle, + driver_windows, + pattern_nodes, + regions_timeline, +): + word_bytes = data_width // 8 + bank_stride_bytes = n_banks * word_bytes + lines = [] + lines.append("=" * 72) + lines.append("MEMORY MAP REPORT") + lines.append(f" Total memory : {total_mem_size_kib} KiB ({total_mem_size_kib * 1024} B)") + lines.append(f" Banks : {n_banks} x {word_bytes} B/word (interleaved, stride {bank_stride_bytes} B)") + lines.append(f" Data width : {data_width} b LOG / {hwpe_data_width} b HWPE") + lines.append( + f" Drivers({dw_narrow} bit) : " + f"CORE={n_core_cfg}, DMA={n_dma_cfg}, EXT={n_ext_cfg} (LOG total={n_log_cfg})" + ) + lines.append( + f" Drivers({dw_wide} bit) : " + f"HWPE={n_hwpe_cfg}" + ) + lines.append( + f" Interconnect type : {interco_type} | " + f"Narrow master ports ({dw_narrow} bit)={n_narrow_hci_cfg} | " + f"Wide master ports ({dw_wide} bit)={n_wide_hci_cfg} | " + f"Slave ports (banks)={n_banks}" + ) + lines.append("=" 
* 72) + for entry in memory_map_entries: + lines.append(f"\n [{entry['label']}] pattern={entry['pattern']} n_transactions={entry['n']}") + if 'info' in entry: + lines.append(f" {entry['info']}") + for k, v in entry.get('detail', {}).items(): + lines.append(f" {k:<14}: {v}") + lines.append("") + lines.append(" Job / dependency map:") + for job, drivers in sorted(job_to_drivers.items()): + driver_names = [driver_name_fn(d) for d in drivers] + lines.append(f" job '{job}': {', '.join(driver_names)}") + for i in range(n_drivers): + name = driver_name_fn(i) + for f, mask in enumerate(fence_masks[i]): + if mask: + deps = [driver_name_fn(j) for j in range(n_drivers) if mask & (1 << j)] + lines.append(f" {name} after pattern[{f}] (fence {f}) waits for: {', '.join(deps)}") + lines.append("") + lines.append(" Temporal schedule (transaction-count model):") + lines.append(f" Total modeled time: {total_cycles} units (1 unit = 1 transaction)") + lines.append(" Note: Declared wait_for_jobs dependencies are used for scheduling.") + lines.append(" Note: Per-driver list order is also enforced (pattern p[i] -> p[i+1]).") + lines.append(" Note: No interconnect contention/stall timing is modeled.") + if mux_serialization_applied: + lines.append(" Note: MUX mode serializes HWPE execution by job order, then HWPE ID (tb_hci-like).") + lines.append(f" MUX job order: {', '.join(mux_phase_order)}") + if schedule_has_cycle: + lines.append(" WARNING: dependency cycle detected while scheduling; using fallback order.") + for d in range(n_drivers): + if d not in driver_windows: + continue + w = driver_windows[d] + lines.append(f" {w['name']:<8}: [{w['start']:>6}, {w['end']:>6}) dur={w['end'] - w['start']:>6}") + for node in [n for n in pattern_nodes if n['driver_idx'] == d]: + reg_tokens = [] + for reg in node['regions']: + reg_tokens.append(f"{reg['label']}@0x{reg['base']:08x}+{reg['size']}B") + reg_text = ", ".join(reg_tokens) if reg_tokens else "no regions" + lines.append( + f" 
p{node['pattern_idx']} job={node['job']} " + f"[{node['start_cycle']},{node['end_cycle']}) " + f"type={node['mem_access_type']} n={node['n_transactions']} {reg_text}" + ) + lines.append("") + lines.append(" Memory region lifetimes:") + for key in sorted(regions_timeline.keys(), key=lambda k: (k[0], k[1], k[2])): + reg = regions_timeline[key] + users = sorted({a['driver_name'] for a in reg['accesses']}) + lines.append( + f" {reg['label']:<8} 0x{reg['base']:08x}-0x{reg['end']:08x} " + f"({reg['size']:>6} B) lifetime=[{reg['lifetime_start']},{reg['lifetime_end']}) " + f"users={', '.join(users)}" + ) + lines.append("=" * 72) + + return "\n".join(lines) + "\n" + + +def write_memory_map_txt(memory_map_path: Path, **kwargs): + report_text = build_memory_map_text(**kwargs) + memory_map_path.write_text(report_text, encoding='utf-8') + return report_text diff --git a/target/verif/src/application_driver.sv b/target/verif/src/application_driver.sv index e1651a5..a6b7aa5 100644 --- a/target/verif/src/application_driver.sv +++ b/target/verif/src/application_driver.sv @@ -1,7 +1,9 @@ /* * application_driver.sv * - * Copyright (C) 2019-2020 ETH Zurich, University of Bologna + * Sergio Mazzola + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at @@ -14,132 +16,328 @@ /** * Application driver module - * Reads stimuli from file and drives transactions on HCI interface + * Reads stimuli from file and drives transactions on HCI interface. + * + * Stimulus file format (one line per cycle): + * req(1b) id(IWb) wen(1b) data(Nb) add(Ab) -- active transaction + * PAUSE -- fence synchronization point + * + * Idle entries (req=0) are consumed as issue gaps when the driver is free to + * advance. 
While stalled waiting for a grant, the driver may advance over later + * idle entries, so the stimuli file represents offered traffic order and fence + * structure rather than an exact wall-clock replay under backpressure. + * + * When a PAUSE token is encountered the driver drains all in-flight reads + * (waits in DRAIN_FOR_PAUSE), then enters PAUSED and holds fence_reached_o=1 + * until resume_i is asserted. This allows multi-phase execution on a single + * driver without resetting counters between phases. + * Multiple consecutive PAUSE tokens are legal and represent multiple fence slots + * with no intervening traffic (e.g. free-pass completion fence followed by a + * synthetic blocking fence for the next pattern). */ module application_driver #( parameter int unsigned MASTER_NUMBER = 1, - parameter int unsigned IS_HWPE = 1, parameter int unsigned DATA_WIDTH = 1, - parameter int unsigned ADD_WIDTH = 1, - parameter int unsigned APPL_DELAY = 2, // Delay on the input signals + parameter int unsigned ADDR_WIDTH = 1, parameter int unsigned IW = 1, parameter string STIM_FILE = "" ) ( - hci_core_intf.initiator hci_if, - input logic rst_ni, input logic clk_i, - output logic end_stimuli_o, - output logic end_latency_o, - output int unsigned n_issued_transactions_o, - output int unsigned n_issued_read_transactions_o + input logic rst_ni, + input logic resume_i, // asserted by tb_hci when fence dependencies are met + hci_core_intf.initiator hci_if, + output logic fence_reached_o, // held HIGH while driver is paused at a fence + output logic end_resp_o, // held HIGH after all transactions and responses done + output int unsigned n_issued_tr_o, + output int unsigned n_issued_rd_tr_o, + output int unsigned n_retired_rd_tr_o ); - logic [IW-1:0] id; - string file_path; - int stim; - int scan_status; - logic wen; - logic req; - logic [DATA_WIDTH-1:0] data; - logic [ADD_WIDTH-1:0] add; - int unsigned n_completed_read_transactions; - logic pending_rsp_is_read[$]; - - always_ff 
@(posedge clk_i or negedge rst_ni) begin : proc_read_response_counter - logic retired_is_read; - if (!rst_ni) begin - n_completed_read_transactions <= '0; - pending_rsp_is_read.delete(); - end else begin - if (hci_if.req && hci_if.gnt) begin - pending_rsp_is_read.push_back(hci_if.wen); - end - if (hci_if.r_valid && hci_if.r_ready) begin - if (pending_rsp_is_read.size() != 0) begin - retired_is_read = pending_rsp_is_read.pop_front(); - if (retired_is_read) begin - n_completed_read_transactions <= n_completed_read_transactions + 1; - end - end - end - end - end + int unsigned n_req_issued_q, n_req_issued_d; + int unsigned n_rd_req_issued_q, n_rd_req_issued_d; + int unsigned n_rd_resp_retired_q, n_rd_resp_retired_d; + + // Transaction queue from file. is_pause=1 entries are fence tokens, not real transactions. + typedef struct { + logic is_pause; + logic req; + logic [IW-1:0] id; + logic wen; + logic [DATA_WIDTH-1:0] data; + logic [ADDR_WIDTH-1:0] add; + } transaction_t; + transaction_t transactions[$]; + + // Fill up the queue by reading the stimuli file until the end. + // PAUSE lines are read as fence tokens with is_pause=1. 
+ initial begin + string file_path; + int stim; + string line; - initial begin : proc_application_driver - hci_if.id = '0; - hci_if.add = '0; - hci_if.data = '0; - hci_if.req = 1'b0; - hci_if.wen = 1'b0; - hci_if.ecc = '0; - hci_if.ereq = '0; - hci_if.r_eready = '0; - hci_if.be = '1; - hci_if.r_ready = 1'b1; - hci_if.user = '0; - end_stimuli_o = 1'b0; - end_latency_o = 1'b0; - n_issued_transactions_o = '0; - n_issued_read_transactions_o = '0; - - wait (rst_ni); if (STIM_FILE != "") begin file_path = STIM_FILE; end else begin - if (IS_HWPE) begin - file_path = $sformatf( - "../simvectors/generated/stimuli_processed/master_hwpe_%0d.txt", - MASTER_NUMBER - ); - end else begin - file_path = $sformatf( - "../simvectors/generated/stimuli_processed/master_log_%0d.txt", - MASTER_NUMBER - ); - end + $fatal("ERROR: Specify STIM_FILE path"); end stim = $fopen(file_path, "r"); if (stim == 0) begin - $fatal("ERROR: Could not open stimuli file!"); + $fatal("ERROR: Could not open stimuli file: %s", file_path); end - @(posedge clk_i); while (!$feof(stim)) begin - scan_status = $fscanf(stim, "%b %b %b %b %b\n", req, id, wen, data, add); - if (scan_status != 5) begin - if (!$feof(stim)) begin - $fatal(1, "ERROR: malformed stimuli line in %s", file_path); + transaction_t t; + int scan_status; + void'($fgets(line, stim)); + // Strip trailing newline/CR for comparison + if (line.len() > 0 && (line[line.len()-1] == "\n" || line[line.len()-1] == "\r")) + line = line.substr(0, line.len()-2); + if (line.len() > 1 && line[line.len()-1] == "\r") + line = line.substr(0, line.len()-2); + if (line == "PAUSE") begin + t.is_pause = 1'b1; + t.req = 1'b0; + t.id = '0; + t.wen = 1'b0; + t.data = '0; + t.add = '0; + transactions.push_back(t); + end else if (line.len() > 0) begin + t.is_pause = 1'b0; + scan_status = $sscanf(line, "%b %b %b %b %b", + t.req, t.id, t.wen, t.data, t.add); + if (scan_status != 5) begin + if (!$feof(stim)) begin + $fatal(1, "ERROR: malformed stimuli line in %s: '%s'", 
file_path, line); + end + break; end - break; + transactions.push_back(t); end - #(APPL_DELAY); - hci_if.id = id; - hci_if.data = data; - hci_if.add = add; - hci_if.wen = wen; - hci_if.req = req; - - if (req) begin - @(posedge clk_i iff hci_if.gnt); - n_issued_transactions_o++; - if (wen) begin - n_issued_read_transactions_o++; + end + $fclose(stim); + end + + ////////////////// + // Requests FSM // + ////////////////// + + typedef enum logic [2:0] { + REQ_IDLE, + WAIT_GNT, + REQ_DONE, + DRAIN_FOR_PAUSE, // drain in-flight reads before asserting fence_reached_o + PAUSED, // fence synchronization: hold fence_reached_o until resume_i + RSP_DONE + } req_state_t; + + req_state_t req_state_q, req_state_d; + int unsigned tr_idx_q, tr_idx_d; + int unsigned last_op_issued_q, last_op_issued_d; + + assign n_issued_tr_o = n_req_issued_q; + assign n_issued_rd_tr_o = n_rd_req_issued_q; + + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + req_state_q <= REQ_IDLE; + tr_idx_q <= '0; + n_req_issued_q <= '0; + n_rd_req_issued_q <= '0; + last_op_issued_q <= '0; + end else begin + req_state_q <= req_state_d; + tr_idx_q <= tr_idx_d; + n_req_issued_q <= n_req_issued_d; + n_rd_req_issued_q <= n_rd_req_issued_d; + last_op_issued_q <= last_op_issued_d; + end + end + + always_comb begin + // FSM defaults + req_state_d = req_state_q; + tr_idx_d = tr_idx_q; + n_req_issued_d = n_req_issued_q; + n_rd_req_issued_d = n_rd_req_issued_q; + last_op_issued_d = last_op_issued_q; + // HCI output defaults + hci_if.id = '0; + hci_if.add = '0; + hci_if.data = '0; + hci_if.req = 1'b0; + hci_if.wen = 1'b0; + hci_if.ecc = '0; + hci_if.ereq = '0; + hci_if.r_eready = '0; + hci_if.be = '1; + hci_if.r_ready = 1'b1; + hci_if.user = '0; + // Output defaults + fence_reached_o = 1'b0; + end_resp_o = 1'b0; + + case (req_state_q) + REQ_IDLE: begin + if (tr_idx_q < transactions.size()) begin + if (transactions[tr_idx_q].is_pause) begin + // Consume the PAUSE token and drain any in-flight reads 
before pausing + tr_idx_d = tr_idx_q + 1; + if (n_rd_req_issued_q > n_rd_resp_retired_q) begin + req_state_d = DRAIN_FOR_PAUSE; + end else begin + req_state_d = PAUSED; + end + end else begin + tr_idx_d = tr_idx_q + 1; + if (transactions[tr_idx_q].req) begin + hci_if.req = 1'b1; + hci_if.id = transactions[tr_idx_q].id; + hci_if.wen = transactions[tr_idx_q].wen; + hci_if.data = transactions[tr_idx_q].data; + hci_if.add = transactions[tr_idx_q].add; + n_req_issued_d = n_req_issued_q + 1; + if (transactions[tr_idx_q].wen) begin + n_rd_req_issued_d = n_rd_req_issued_q + 1; + end + last_op_issued_d = tr_idx_q; + req_state_d = hci_if.gnt ? REQ_IDLE : WAIT_GNT; + end + end + end else begin + // No more transactions + if (n_rd_req_issued_q > n_rd_resp_retired_q) begin + req_state_d = REQ_DONE; + end else begin + req_state_d = RSP_DONE; + end end - // Deassert in NBA region so monitors sampling this edge see the handshake. - hci_if.id <= '0; - hci_if.data <= '0; - hci_if.add <= '0; - hci_if.wen <= 1'b0; - hci_if.req <= 1'b0; - wait (hci_if.req == 1'b0); - end else begin - @(posedge clk_i); end + + WAIT_GNT: begin + hci_if.req = 1'b1; + hci_if.id = transactions[last_op_issued_q].id; + hci_if.wen = transactions[last_op_issued_q].wen; + hci_if.data = transactions[last_op_issued_q].data; + hci_if.add = transactions[last_op_issued_q].add; + if (tr_idx_q < transactions.size()) begin + // Consume later idle entries while stalled so the driver can hide memory + // latency/backpressure when the workload permits it. This makes req=0 tokens + // issue-gap hints, not strict simulation-time no-op cycles. + if (!transactions[tr_idx_q].req && !transactions[tr_idx_q].is_pause) begin + tr_idx_d = tr_idx_q + 1; + end + req_state_d = hci_if.gnt ? 
REQ_IDLE : WAIT_GNT; + end else begin + if (hci_if.gnt) begin + if (transactions[last_op_issued_q].req && transactions[last_op_issued_q].wen) begin + req_state_d = REQ_DONE; + end else begin + req_state_d = RSP_DONE; + end + end + end + end + + REQ_DONE: begin + if (n_rd_resp_retired_q >= n_rd_req_issued_q) begin + req_state_d = RSP_DONE; + end + end + + DRAIN_FOR_PAUSE: begin + // Wait for all in-flight reads to retire before asserting the fence + if (n_rd_resp_retired_q >= n_rd_req_issued_q) begin + req_state_d = PAUSED; + end + end + + PAUSED: begin + // Hold fence_reached_o HIGH until tb_hci asserts resume_i. + // If the next token is also a PAUSE (e.g. trailing free-pass followed by + // a blocking synthetic idle), consume it immediately and stay in PAUSED + // to avoid a spurious one-cycle REQ_IDLE bounce between consecutive fences. + fence_reached_o = 1'b1; + if (resume_i) begin + if (tr_idx_q < transactions.size() && transactions[tr_idx_q].is_pause) begin + tr_idx_d = tr_idx_q + 1; + req_state_d = PAUSED; + end else begin + req_state_d = REQ_IDLE; + end + end + end + + RSP_DONE: begin + end_resp_o = 1'b1; + end + + default: begin + req_state_d = REQ_IDLE; + end + endcase + end + + /////////////////////// + // Read response FSM // + /////////////////////// + + typedef enum logic [1:0] { + RESP_IDLE, + RESP_WAIT_RVALID + } resp_state_t; + + resp_state_t resp_state_q, resp_state_d; + int unsigned n_rd_in_flight_q, n_rd_in_flight_d; + + assign n_retired_rd_tr_o = n_rd_resp_retired_q; + + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + resp_state_q <= RESP_IDLE; + n_rd_resp_retired_q <= '0; + n_rd_in_flight_q <= '0; + end else begin + resp_state_q <= resp_state_d; + n_rd_resp_retired_q <= n_rd_resp_retired_d; + n_rd_in_flight_q <= n_rd_in_flight_d; end + end - $fclose(stim); - end_stimuli_o = 1'b1; - wait (n_completed_read_transactions >= n_issued_read_transactions_o); - end_latency_o = 1'b1; + always_comb begin + resp_state_d = 
resp_state_q; + n_rd_resp_retired_d = n_rd_resp_retired_q; + n_rd_in_flight_d = n_rd_in_flight_q; + + case (resp_state_q) + RESP_IDLE: begin + if (hci_if.req && hci_if.wen && hci_if.gnt) begin + n_rd_in_flight_d = n_rd_in_flight_q + 1; + resp_state_d = RESP_WAIT_RVALID; + end + end + + RESP_WAIT_RVALID: begin + if (hci_if.req && hci_if.wen && hci_if.gnt) begin + n_rd_in_flight_d = n_rd_in_flight_q + 1; + end + if (hci_if.r_valid && hci_if.r_ready) begin + n_rd_resp_retired_d = n_rd_resp_retired_q + 1; + if (hci_if.req && hci_if.wen && hci_if.gnt) begin + n_rd_in_flight_d = n_rd_in_flight_q; // +1 grant -1 retire = net 0 + end else begin + n_rd_in_flight_d = n_rd_in_flight_q - 1; + end + if (n_rd_in_flight_q == 1 && !(hci_if.req && hci_if.wen && hci_if.gnt)) begin + resp_state_d = RESP_IDLE; + end + end + end + + default: begin + resp_state_d = RESP_IDLE; + end + endcase end + endmodule diff --git a/target/verif/src/throughput_monitor.sv b/target/verif/src/bandwidth_monitor.sv similarity index 75% rename from target/verif/src/throughput_monitor.sv rename to target/verif/src/bandwidth_monitor.sv index b25d16e..524e1ff 100644 --- a/target/verif/src/throughput_monitor.sv +++ b/target/verif/src/bandwidth_monitor.sv @@ -1,7 +1,10 @@ /* - * throughput_monitor.sv + * bandwidth_monitor.sv * - * Copyright (C) 2019-2020 ETH Zurich, University of Bologna + * Sergio Mazzola + * Luca Codeluppi + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. 
You may obtain a copy of the License at @@ -13,21 +16,20 @@ */ /** - * Throughput monitor - * Measures actual throughput and simulation time for each master + * Bandwidth monitor + * Measures actual bandwidth and completion time for each master */ -module throughput_monitor #( +module bandwidth_monitor #( parameter int unsigned N_MASTER, parameter int unsigned N_HWPE, parameter int unsigned CLK_PERIOD, parameter int unsigned DATA_WIDTH, - parameter int unsigned HWPE_WIDTH + parameter int unsigned HWPE_WIDTH_FACT ) ( input logic clk_i, input logic rst_ni, - input logic [0:N_MASTER-1] end_stimuli_i, - input logic [0:N_MASTER-1] end_latency_i, + input logic [N_MASTER-1:0] end_resp_i, // Read transactions number input int unsigned n_read_complete_log_i[N_MASTER-N_HWPE], input int unsigned n_read_complete_hwpe_i[N_HWPE], @@ -36,28 +38,11 @@ module throughput_monitor #( input int unsigned n_write_granted_hwpe_i[N_HWPE], // Completion-side throughput: accepted writes + completed reads per elapsed completion cycle. output real throughput_complete_o, - // Elapsed cycles from reset release to end_stimuli. - output real stim_latency_o, // Total simulation time (cycles) and simulation time per master (cycles) output real tot_latency_o, output real latency_per_master_o[N_MASTER] ); - // Stimulus duration at stimulus completion. - initial begin - time start_time, end_time; - real stim_time_cycles; - stim_latency_o = -1; - wait (rst_ni); - #(CLK_PERIOD/100); - @(posedge clk_i); - start_time = $time; - wait (&end_stimuli_i); - end_time = $time; - stim_time_cycles = real'(end_time - start_time) / real'(CLK_PERIOD); // cycles - stim_latency_o = stim_time_cycles; - end - // Completion-side throughput at full completion. 
initial begin time start_time, end_time; @@ -69,7 +54,7 @@ module throughput_monitor #( #(CLK_PERIOD/100); @(posedge clk_i); start_time = $time; - wait (&end_latency_i); + wait (&end_resp_i); end_time = $time; completion_time_cycles = real'(end_time - start_time) / real'(CLK_PERIOD); // cycles tot_latency_o = completion_time_cycles; @@ -82,7 +67,7 @@ module throughput_monitor #( for (int i = 0; i < N_HWPE; i++) begin tot_data += real'( n_write_granted_hwpe_i[i] + n_read_complete_hwpe_i[i] - ) * real'(HWPE_WIDTH * DATA_WIDTH); + ) * real'(HWPE_WIDTH_FACT * DATA_WIDTH); end if (completion_time_cycles > 0.0) begin throughput_complete_o = tot_data / completion_time_cycles; // bits per cycle @@ -101,7 +86,7 @@ module throughput_monitor #( #(CLK_PERIOD/100); @(posedge clk_i); start_time = $time; - wait (end_latency_i[ii]); + wait (end_resp_i[ii] == 1'b1); end_time = $time; latency_per_master_o[ii] = real'(end_time - start_time) / real'(CLK_PERIOD); end diff --git a/target/verif/src/latency_monitor.sv b/target/verif/src/req_gnt_monitor.sv similarity index 73% rename from target/verif/src/latency_monitor.sv rename to target/verif/src/req_gnt_monitor.sv index d89cdf5..66fa4cb 100644 --- a/target/verif/src/latency_monitor.sv +++ b/target/verif/src/req_gnt_monitor.sv @@ -1,7 +1,10 @@ /* - * latency_monitor.sv + * req_gnt_monitor.sv * - * Copyright (C) 2019-2020 ETH Zurich, University of Bologna + * Sergio Mazzola + * Luca Codeluppi + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. 
You may obtain a copy of the License at @@ -13,19 +16,19 @@ */ /** - * Latency monitor - * Tracks request-to-grant latency and transaction counters for all masters + * Request-to-grant monitor + * Tracks request-to-grant stall latency and transaction counters for all masters */ -module latency_monitor #( +module req_gnt_monitor #( parameter int unsigned N_MASTER = 4, parameter int unsigned N_HWPE = 1 ) ( input logic clk_i, input logic rst_ni, // Monitored interfaces - hci_core_intf.monitor hci_log_if [0:N_MASTER-N_HWPE-1], - hci_core_intf.monitor hci_hwpe_if [0:N_HWPE-1], + hci_core_intf.monitor hci_driver_log_if [0:N_MASTER-N_HWPE-1], + hci_core_intf.monitor hci_driver_hwpe_if [0:N_HWPE-1], // Accumulated request-to-grant latency. output real sum_req_to_gnt_latency_log_o[N_MASTER-N_HWPE], output real sum_req_to_gnt_latency_hwpe_o[N_HWPE], @@ -74,30 +77,32 @@ module latency_monitor #( req_start_cycle_log <= '0; req_prev_log <= 1'b0; end else begin - if (hci_log_if[gi].req && !req_prev_log) begin + if (hci_driver_log_if[gi].req && !req_prev_log && !hci_driver_log_if[gi].gnt) begin req_start_cycle_log <= cycle_q; + end else if (hci_driver_log_if[gi].gnt) begin + req_start_cycle_log <= cycle_q + 1; end - if (hci_log_if[gi].req && hci_log_if[gi].gnt) begin + if (hci_driver_log_if[gi].req && hci_driver_log_if[gi].gnt) begin if (req_prev_log) begin sum_req_to_gnt_latency_log_o[gi] <= sum_req_to_gnt_latency_log_o[gi] + real'(cycle_q - req_start_cycle_log); end n_gnt_transactions_log_o[gi] <= n_gnt_transactions_log_o[gi] + 1; - pending_rsp_is_read_log.push_back(hci_log_if[gi].wen); + pending_rsp_is_read_log.push_back(hci_driver_log_if[gi].wen); end - req_prev_log <= hci_log_if[gi].req; + req_prev_log <= hci_driver_log_if[gi].req; - if (hci_log_if[gi].req && hci_log_if[gi].gnt && hci_log_if[gi].wen) begin + if (hci_driver_log_if[gi].req && hci_driver_log_if[gi].gnt && hci_driver_log_if[gi].wen) begin n_read_granted_log_o[gi] <= n_read_granted_log_o[gi] + 1; end - if 
(hci_log_if[gi].req && hci_log_if[gi].gnt && !hci_log_if[gi].wen) begin + if (hci_driver_log_if[gi].req && hci_driver_log_if[gi].gnt && !hci_driver_log_if[gi].wen) begin n_write_granted_log_o[gi] <= n_write_granted_log_o[gi] + 1; end - if (hci_log_if[gi].r_valid && hci_log_if[gi].r_ready) begin + if (hci_driver_log_if[gi].r_valid && hci_driver_log_if[gi].r_ready) begin if (pending_rsp_is_read_log.size() != 0) begin retired_is_read_log = pending_rsp_is_read_log.pop_front(); if (retired_is_read_log) begin @@ -130,30 +135,32 @@ module latency_monitor #( req_start_cycle_hwpe <= '0; req_prev_hwpe <= 1'b0; end else begin - if (hci_hwpe_if[gi].req && !req_prev_hwpe) begin + if (hci_driver_hwpe_if[gi].req && !req_prev_hwpe && !hci_driver_hwpe_if[gi].gnt) begin req_start_cycle_hwpe <= cycle_q; + end else if (hci_driver_hwpe_if[gi].gnt) begin + req_start_cycle_hwpe <= cycle_q + 1; end - if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt) begin + if (hci_driver_hwpe_if[gi].req && hci_driver_hwpe_if[gi].gnt) begin if (req_prev_hwpe) begin sum_req_to_gnt_latency_hwpe_o[gi] <= sum_req_to_gnt_latency_hwpe_o[gi] + real'(cycle_q - req_start_cycle_hwpe); end n_gnt_transactions_hwpe_o[gi] <= n_gnt_transactions_hwpe_o[gi] + 1; - pending_rsp_is_read_hwpe.push_back(hci_hwpe_if[gi].wen); + pending_rsp_is_read_hwpe.push_back(hci_driver_hwpe_if[gi].wen); end - req_prev_hwpe <= hci_hwpe_if[gi].req; + req_prev_hwpe <= hci_driver_hwpe_if[gi].req; - if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt && hci_hwpe_if[gi].wen) begin + if (hci_driver_hwpe_if[gi].req && hci_driver_hwpe_if[gi].gnt && hci_driver_hwpe_if[gi].wen) begin n_read_granted_hwpe_o[gi] <= n_read_granted_hwpe_o[gi] + 1; end - if (hci_hwpe_if[gi].req && hci_hwpe_if[gi].gnt && !hci_hwpe_if[gi].wen) begin + if (hci_driver_hwpe_if[gi].req && hci_driver_hwpe_if[gi].gnt && !hci_driver_hwpe_if[gi].wen) begin n_write_granted_hwpe_o[gi] <= n_write_granted_hwpe_o[gi] + 1; end - if (hci_hwpe_if[gi].r_valid && hci_hwpe_if[gi].r_ready) begin + if 
(hci_driver_hwpe_if[gi].r_valid && hci_driver_hwpe_if[gi].r_ready) begin if (pending_rsp_is_read_hwpe.size() != 0) begin retired_is_read_hwpe = pending_rsp_is_read_hwpe.pop_front(); if (retired_is_read_hwpe) begin diff --git a/target/verif/src/simulation_report.sv b/target/verif/src/simulation_report.sv index 4d4e2e4..7f27050 100644 --- a/target/verif/src/simulation_report.sv +++ b/target/verif/src/simulation_report.sv @@ -1,7 +1,10 @@ /* * simulation_report.sv * - * Copyright (C) 2026 ETH Zurich, University of Bologna + * Sergio Mazzola + * Luca Codeluppi + * + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at @@ -20,21 +23,19 @@ module simulation_report import tb_hci_pkg::*; ( - input logic [0:N_MASTER-1] end_stimuli_i, - input logic [0:N_MASTER-1] end_latency_i, - input real throughput_complete_i, - input real stim_latency_i, - input real tot_latency_i, - input real latency_per_master_i[N_MASTER], - input real sum_req_to_gnt_latency_log_i[N_MASTER-N_HWPE], + input logic [N_DRIVERS-1:0] end_resp_i, + input real throughput_complete_i, + input real tot_latency_i, + input real latency_per_master_i[N_DRIVERS], + input real sum_req_to_gnt_latency_log_i[N_DRIVERS-N_HWPE], input real sum_req_to_gnt_latency_hwpe_i[N_HWPE], - input int unsigned n_gnt_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_gnt_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_gnt_transactions_hwpe_i[N_HWPE], - input int unsigned n_read_granted_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_read_granted_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_read_granted_transactions_hwpe_i[N_HWPE], - input int unsigned n_write_granted_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned 
n_write_granted_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_write_granted_transactions_hwpe_i[N_HWPE], - input int unsigned n_read_complete_transactions_log_i[N_MASTER-N_HWPE], + input int unsigned n_read_complete_transactions_log_i[N_DRIVERS-N_HWPE], input int unsigned n_read_complete_transactions_hwpe_i[N_HWPE] ); @@ -60,6 +61,16 @@ module simulation_report int unsigned log_masters_with_grants; int unsigned hwpe_masters_with_grants; logic missing_reads; + // Ideal bandwidth: maximum data the memory system can serve per cycle. + // Memory side: N_BANKS narrow ports, each DATA_WIDTH bits wide. + real ideal_bw_mem_side_bpc; // bits per cycle (memory-side ceiling) + // Interconnect side: HCI-facing narrow/wide initiator interfaces. + real ideal_bw_interco_side_bpc; // bits per cycle (interconnect-side ceiling) + real ideal_bw_bpc; // min(mem, interco) = bottleneck ideal BW + real actual_bw_utilization; // throughput_complete / ideal_bw + int unsigned n_narrow_if_total; + int unsigned n_wide_if_total; + string interco_type_str; sum_req_to_gnt_latency_all = 0.0; average_req_to_gnt_latency_weighted = 0.0; @@ -81,12 +92,10 @@ module simulation_report hwpe_masters_with_grants = '0; missing_reads = 1'b0; - wait (&end_stimuli_i); - wait (stim_latency_i >= 0); - wait (&end_latency_i); + wait (&end_resp_i); wait (throughput_complete_i >= 0); wait (tot_latency_i >= 0); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin total_read_granted_transactions += n_read_granted_transactions_log_i[i]; total_write_granted_transactions += n_write_granted_transactions_log_i[i]; total_read_complete_transactions += n_read_complete_transactions_log_i[i]; @@ -104,7 +113,7 @@ module simulation_report log_masters_with_grants++; end end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin total_read_granted_transactions += n_read_granted_transactions_log_i[i]; 
total_write_granted_transactions += n_write_granted_transactions_log_i[i]; total_read_complete_transactions += n_read_complete_transactions_log_i[i]; @@ -122,7 +131,7 @@ module simulation_report log_masters_with_grants++; end end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin total_read_granted_transactions += n_read_granted_transactions_log_i[i]; total_write_granted_transactions += n_write_granted_transactions_log_i[i]; total_read_complete_transactions += n_read_complete_transactions_log_i[i]; @@ -140,7 +149,7 @@ module simulation_report log_masters_with_grants++; end end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin total_read_granted_transactions += n_read_granted_transactions_hwpe_i[i]; total_write_granted_transactions += n_write_granted_transactions_hwpe_i[i]; total_read_complete_transactions += n_read_complete_transactions_hwpe_i[i]; @@ -184,32 +193,74 @@ module simulation_report average_req_to_gnt_latency_hwpe_unweighted / real'(hwpe_masters_with_grants); end + // Ideal bandwidth computation. + // Memory side: each of the N_BANKS banks can serve one DATA_WIDTH word per cycle. + ideal_bw_mem_side_bpc = real'(N_BANKS) * real'(DATA_WIDTH); + // Interconnect side: HCI interface ports (narrow + wide). + // NOTE: for MUX mode N_WIDE_HCI=1, i.e. one shared wide initiator. + n_narrow_if_total = N_NARROW_HCI + N_DMA + N_EXT; + n_wide_if_total = N_WIDE_HCI; + ideal_bw_interco_side_bpc = real'(n_narrow_if_total) * real'(DATA_WIDTH) + + real'(n_wide_if_total) * real'(HWPE_WIDTH_FACT * DATA_WIDTH); + // Bottleneck = minimum of the two sides. + ideal_bw_bpc = (ideal_bw_mem_side_bpc < ideal_bw_interco_side_bpc) + ? ideal_bw_mem_side_bpc : ideal_bw_interco_side_bpc; + // Utilization = actual / ideal. + actual_bw_utilization = (ideal_bw_bpc > 0.0) + ? 
(throughput_complete_i / ideal_bw_bpc * 100.0) : 0.0; + if (INTERCO_TYPE == LOG) begin + interco_type_str = "LOG"; + end else if (INTERCO_TYPE == HCI) begin + interco_type_str = "HCI"; + end else if (INTERCO_TYPE == MUX) begin + interco_type_str = "MUX"; + end else begin + interco_type_str = "UNKNOWN"; + end + $display("------ Simulation Summary ------"); $display("\\\\HW CONFIG\\\\"); $display( "Masters: CORE=%0d DMA=%0d EXT=%0d HWPE=%0d (total=%0d)", - N_CORE_REAL, N_DMA_REAL, N_EXT_REAL, N_HWPE_REAL, N_MASTER_REAL + N_CORE, N_DMA, N_EXT, N_HWPE, N_DRIVERS ); $display( "Memory: banks=%0d total_size=%0d kB data_width=%0d bits hwpe_width=%0d lanes", - N_BANKS, TOT_MEM_SIZE, DATA_WIDTH, HWPE_WIDTH + N_BANKS, TOT_MEM_SIZE, DATA_WIDTH, HWPE_WIDTH_FACT ); $display( "Interconnect: SEL_LIC=%0d TS_BIT=%0d EXPFIFO=%0d", SEL_LIC, TS_BIT, EXPFIFO ); $display( - "ID/address: IW=%0d ADD_WIDTH=%0d AddrMemWidth=%0d", - IW, ADD_WIDTH, AddrMemWidth + "Interconnect-side: TYPE=%s N_NARROW_HCI=%0d N_WIDE_HCI=%0d N_DMA=%0d N_EXT=%0d", + interco_type_str, N_NARROW_HCI, N_WIDE_HCI, N_DMA, N_EXT + ); + $display( + "ID/address: IW=%0d ADDR_WIDTH=%0d ADDR_WIDTH_BANK=%0d", + IW, ADDR_WIDTH, ADDR_WIDTH_BANK ); $display("\n\\\\BANDWIDTH\\\\"); $display( - "Completion bandwidth (writes granted + reads completed): %0.1f bit/cycle", - throughput_complete_i + "Ideal BW (memory side): %0.0f bit/cycle [%0d banks x %0d bits]", + ideal_bw_mem_side_bpc, N_BANKS, DATA_WIDTH + ); + $display( + "Ideal BW (interco side): %0.0f bit/cycle [%0d narrow-if x %0d bits + %0d wide-if x %0d bits]", + ideal_bw_interco_side_bpc, + n_narrow_if_total, DATA_WIDTH, + n_wide_if_total, HWPE_WIDTH_FACT * DATA_WIDTH + ); + $display( + "Ideal BW (bottleneck): %0.0f bit/cycle", + ideal_bw_bpc + ); + $display( + "Actual BW (completion): %0.2f bit/cycle [utilization: %0.1f%%]", + throughput_complete_i, actual_bw_utilization ); - $display("Stimulus phase duration: %0.1f cycles", stim_latency_i); - $display("Completion phase 
duration: %0.1f cycles", tot_latency_i); + $display("Completion phase duration: %0.2f cycles", tot_latency_i); $display( "Granted transactions: reads=%0d writes=%0d total=%0d", total_read_granted_transactions, @@ -222,36 +273,36 @@ module simulation_report ); $display("\n\\\\SIMULATION TIME\\\\"); - $display("Total simulation time: %0.1f cycles", tot_latency_i); - for (int i = 0; i < N_CORE_REAL; i++) begin + $display("Total simulation time: %0.2f cycles", tot_latency_i); + for (int i = 0; i < N_CORE; i++) begin $display( - "Core%0d (master_log_%0d): %0.1f cycles", + "Core%0d (master_log_%0d): %0.2f cycles", i, i, latency_per_master_i[i] ); end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin $display( - "DMA%0d (master_log_%0d): %0.1f cycles", + "DMA%0d (master_log_%0d): %0.2f cycles", i - N_CORE, i, latency_per_master_i[i] ); end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin $display( - "EXT%0d (master_log_%0d): %0.1f cycles", + "EXT%0d (master_log_%0d): %0.2f cycles", i - (N_CORE + N_DMA), i, latency_per_master_i[i] ); end - for (int i = N_MASTER - N_HWPE; i < N_MASTER - N_HWPE + N_HWPE_REAL; i++) begin + for (int i = N_DRIVERS - N_HWPE; i < N_DRIVERS - N_HWPE + N_HWPE; i++) begin $display( - "HWPE%0d (master_hwpe_%0d): %0.1f cycles", - i - (N_MASTER - N_HWPE), - i - (N_MASTER - N_HWPE), + "HWPE%0d (master_hwpe_%0d): %0.2f cycles", + i - (N_DRIVERS - N_HWPE), + i - (N_DRIVERS - N_HWPE), latency_per_master_i[i] ); end $display("\n\\\\READ RESPONSE COVERAGE\\\\"); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin expected_reads = n_read_granted_transactions_log_i[i]; observed_reads = n_read_complete_transactions_log_i[i]; $display( @@ -262,7 +313,7 @@ module simulation_report missing_reads = 1'b1; end end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + 
for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin expected_reads = n_read_granted_transactions_log_i[i]; observed_reads = n_read_complete_transactions_log_i[i]; $display( @@ -273,7 +324,7 @@ module simulation_report missing_reads = 1'b1; end end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin expected_reads = n_read_granted_transactions_log_i[i]; observed_reads = n_read_complete_transactions_log_i[i]; $display( @@ -284,7 +335,7 @@ module simulation_report missing_reads = 1'b1; end end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin expected_reads = n_read_granted_transactions_hwpe_i[i]; observed_reads = n_read_complete_transactions_hwpe_i[i]; $display( @@ -300,7 +351,7 @@ module simulation_report end $display("\n\\\\TRANSACTION COUNTS\\\\"); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin $display( "master_log_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -309,7 +360,7 @@ module simulation_report n_read_complete_transactions_log_i[i] ); end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin $display( "master_log_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -318,7 +369,7 @@ module simulation_report n_read_complete_transactions_log_i[i] ); end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin $display( "master_log_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -327,7 +378,7 @@ module simulation_report n_read_complete_transactions_log_i[i] ); end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin $display( "master_hwpe_%0d: granted reads=%0d writes=%0d, read-complete=%0d", i, @@ -338,9 +389,9 @@ module simulation_report end $display("\n\\\\REQUEST-TO-GRANT 
LATENCY\\\\"); - for (int i = 0; i < N_CORE_REAL; i++) begin + for (int i = 0; i < N_CORE; i++) begin $display( - "master_log_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_log_i[i] != 0) ? (sum_req_to_gnt_latency_log_i[i] / real'(n_gnt_transactions_log_i[i])) : @@ -348,9 +399,9 @@ module simulation_report n_gnt_transactions_log_i[i] ); end - for (int i = N_CORE; i < N_CORE + N_DMA_REAL; i++) begin + for (int i = N_CORE; i < N_CORE + N_DMA; i++) begin $display( - "master_log_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_log_i[i] != 0) ? (sum_req_to_gnt_latency_log_i[i] / real'(n_gnt_transactions_log_i[i])) : @@ -358,9 +409,9 @@ module simulation_report n_gnt_transactions_log_i[i] ); end - for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT_REAL; i++) begin + for (int i = N_CORE + N_DMA; i < N_CORE + N_DMA + N_EXT; i++) begin $display( - "master_log_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_log_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_log_i[i] != 0) ? (sum_req_to_gnt_latency_log_i[i] / real'(n_gnt_transactions_log_i[i])) : @@ -368,9 +419,9 @@ module simulation_report n_gnt_transactions_log_i[i] ); end - for (int i = 0; i < N_HWPE_REAL; i++) begin + for (int i = 0; i < N_HWPE; i++) begin $display( - "master_hwpe_%0d: avg req->gnt latency %0.1f cycles over %0d grants", + "master_hwpe_%0d: avg req->gnt stall latency %0.2f cycles over %0d grants", i, (n_gnt_transactions_hwpe_i[i] != 0) ? 
(sum_req_to_gnt_latency_hwpe_i[i] / real'(n_gnt_transactions_hwpe_i[i])) : @@ -380,32 +431,32 @@ module simulation_report end $display(""); $display( - "Total accumulated req->gnt latency: %0.1f cycles over %0d grants", + "Total accumulated req->gnt latency: %0d cycles over %0d grants", sum_req_to_gnt_latency_all, total_gnt_transactions_all ); $display( - "LOG avg req->gnt latency (weighted by grant count): %0.1f cycles", + "LOG avg req->gnt stall latency (weighted by grant count): %0.2f cycles", average_req_to_gnt_latency_log_weighted ); $display( - "LOG avg req->gnt latency (mean of per-master averages): %0.1f cycles", + "LOG avg req->gnt stall latency (mean of per-master averages): %0.2f cycles", average_req_to_gnt_latency_log_unweighted ); $display( - "HWPE avg req->gnt latency (weighted by grant count): %0.1f cycles", + "HWPE avg req->gnt stall latency (weighted by grant count): %0.2f cycles", average_req_to_gnt_latency_hwpe_weighted ); $display( - "HWPE avg req->gnt latency (mean of per-master averages): %0.1f cycles", + "HWPE avg req->gnt stall latency (mean of per-master averages): %0.2f cycles", average_req_to_gnt_latency_hwpe_unweighted ); $display( - "Global avg req->gnt latency (weighted by grant count): %0.1f cycles", + "Global avg req->gnt stall latency (weighted by grant count): %0.2f cycles", average_req_to_gnt_latency_weighted ); $display( - "Global avg req->gnt latency (mean of per-master averages): %0.1f cycles", + "Global avg req->gnt stall latency (mean of per-master averages): %0.2f cycles", average_req_to_gnt_latency_unweighted ); $display(""); diff --git a/target/verif/src/tb_hci.sv b/target/verif/src/tb_hci.sv index 5fb3411..5ada60b 100644 --- a/target/verif/src/tb_hci.sv +++ b/target/verif/src/tb_hci.sv @@ -4,8 +4,7 @@ * Sergio Mazzola * Luca Codeluppi * - * - * Copyright (C) 2019-2025 ETH Zurich, University of Bologna + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the 
Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at @@ -27,14 +26,12 @@ module tb_hci (); logic clk, rst_n; - logic s_clear; + logic [N_DRIVERS-1:0] s_end_resp; // end_resp_o from all drivers + logic [N_DRIVERS-1:0] s_fence_reached; // fence_reached_o from all drivers (level, HIGH while PAUSED) + logic [N_DRIVERS-1:0] s_resume; // resume_i to each driver (asserted when fence deps are met) + int unsigned fence_idx [N_DRIVERS]; // number of fences each driver has passed so far hci_interconnect_ctrl_t s_hci_ctrl; - logic [0:N_MASTER-1] s_end_stimuli; - logic [0:N_MASTER-1] s_end_latency; - int unsigned s_issued_transactions[0:N_MASTER-1]; - int unsigned s_issued_read_transactions[0:N_MASTER-1]; - clk_rst_gen #( .ClkPeriod(CLK_PERIOD), .RstClkCycles(RST_CLK_CYCLES) @@ -43,39 +40,80 @@ module tb_hci .rst_no(rst_n) ); - ///////// - // HCI // - ///////// + //////////////////// + // HCI interfaces // + //////////////////// - /* HCI interfaces */ - localparam hci_size_parameter_t `HCI_SIZE_PARAM(cores) = '{ // CORE + DMA + EXT parameters - DW: DEFAULT_DW, - AW: DEFAULT_AW, - BW: DEFAULT_BW, - UW: DEFAULT_UW, - IW: IW, - EW: DEFAULT_EW, - EHW: DEFAULT_EHW + localparam hci_size_parameter_t `HCI_SIZE_PARAM(cores) = '{ + DW: DW_cores, + AW: AW_cores, + BW: BW_cores, + UW: UW_cores, + IW: IW_cores, + EW: EW_cores, + EHW: EHW_cores }; - localparam hci_size_parameter_t `HCI_SIZE_PARAM(mems) = '{ // Bank parameters - DW: DEFAULT_DW, - AW: AddrMemWidth, - BW: DEFAULT_BW, - UW: DEFAULT_UW, - IW: IW, - EW: DEFAULT_EW, - EHW: DEFAULT_EHW + + localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe) = '{ + DW: DW_hwpe, + AW: AW_hwpe, + BW: BW_hwpe, + UW: UW_hwpe, + IW: IW_hwpe, + EW: EW_hwpe, + EHW: EHW_hwpe }; - localparam hci_size_parameter_t `HCI_SIZE_PARAM(hwpe) = '{ // HWPE parameters - DW: HWPE_WIDTH*DATA_WIDTH, - AW: DEFAULT_AW, - BW: DEFAULT_BW, - UW: DEFAULT_UW, - IW: 
IW, - EW: DEFAULT_EW, - EHW: DEFAULT_EHW + + localparam hci_size_parameter_t `HCI_SIZE_PARAM(mems) = '{ + DW: DW_mems, + AW: AW_mems, + BW: BW_mems, + UW: UW_mems, + IW: IW_mems, + EW: EW_mems, + EHW: EHW_mems }; + /* Application-driver-side interfaces */ + + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_driver_log_if [0:N_LOG_MASTERS-1] ( + .clk(clk) + ); + + hci_core_intf #( + .DW(HCI_SIZE_hwpe.DW), + .AW(HCI_SIZE_hwpe.AW), + .BW(HCI_SIZE_hwpe.BW), + .UW(HCI_SIZE_hwpe.UW), + .IW(HCI_SIZE_hwpe.IW), + .EW(HCI_SIZE_hwpe.EW), + .EHW(HCI_SIZE_hwpe.EHW) + ) hci_driver_hwpe_if [0:N_HWPE-1] ( + .clk(clk) + ); + + /* Interconnect-side interfaces (hci_system-style organization) */ + + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_initiator_narrow [0:N_NARROW_HCI-1] ( + .clk(clk) + ); + hci_core_intf #( .DW(HCI_SIZE_hwpe.DW), .AW(HCI_SIZE_hwpe.AW), @@ -84,7 +122,19 @@ module tb_hci .IW(HCI_SIZE_hwpe.IW), .EW(HCI_SIZE_hwpe.EW), .EHW(HCI_SIZE_hwpe.EHW) - ) hci_hwpe_if [0:N_HWPE-1] ( + ) hci_initiator_wide [0:N_WIDE_HCI-1] ( + .clk(clk) + ); + + hci_core_intf #( + .DW(HCI_SIZE_cores.DW), + .AW(HCI_SIZE_cores.AW), + .BW(HCI_SIZE_cores.BW), + .UW(HCI_SIZE_cores.UW), + .IW(HCI_SIZE_cores.IW), + .EW(HCI_SIZE_cores.EW), + .EHW(HCI_SIZE_cores.EHW) + ) hci_initiator_dma [0:N_DMA-1] ( .clk(clk) ); @@ -96,7 +146,7 @@ module tb_hci .IW(HCI_SIZE_cores.IW), .EW(HCI_SIZE_cores.EW), .EHW(HCI_SIZE_cores.EHW) - ) hci_log_if [0:N_MASTER-N_HWPE-1] ( + ) hci_initiator_ext [0:N_EXT-1] ( .clk(clk) ); @@ -112,26 +162,187 @@ module tb_hci .WAIVE_RQ4_ASSERT(1'b1), .WAIVE_RSP3_ASSERT(1'b1), .WAIVE_RSP5_ASSERT(1'b1) - ) hci_mem_if [0:N_BANKS-1] ( + ) hci_target_mems [0:N_BANKS-1] ( 
.clk(clk) ); - /* HCI instance */ + /////////////////////////// + // Interface assignments // + /////////////////////////// + + /* Assignments of narrow initiators to LOG branch of HCI */ + + generate + for (genvar ii = 0; ii < N_CORE; ii++) begin : gen_core_to_narrow + hci_core_assign i_core_to_narrow_assign ( + .tcdm_target(hci_driver_log_if[ii]), + .tcdm_initiator(hci_initiator_narrow[ii]) + ); + end + + for (genvar ii = 0; ii < N_DMA; ii++) begin : gen_dma_to_hci + hci_core_assign i_dma_to_hci_assign ( + .tcdm_target(hci_driver_log_if[N_CORE + ii]), + .tcdm_initiator(hci_initiator_dma[ii]) + ); + end + + for (genvar ii = 0; ii < N_EXT; ii++) begin : gen_ext_to_hci + hci_core_assign i_ext_to_hci_assign ( + .tcdm_target(hci_driver_log_if[N_CORE + N_DMA + ii]), + .tcdm_initiator(hci_initiator_ext[ii]) + ); + end + endgenerate + + /* Assignments of wide initiators to HCI (either LOG branch, HCI branch, or static MUX) */ + + generate + if (INTERCO_TYPE == HCI) begin : gen_hwpe_hci + for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_hci_assign + hci_core_assign i_hwpe_hci_assign ( + .tcdm_target(hci_driver_hwpe_if[ii]), + .tcdm_initiator(hci_initiator_wide[ii]) + ); + end + end else if (INTERCO_TYPE == MUX) begin : gen_hwpe_mux + // Phase-ordered MUX arbitration: + // + // The mux is held by whichever HWPE is currently running (not paused, not done). + // In-flight reads are drained before any PAUSE (DRAIN_FOR_PAUSE state in the + // driver FSM), so sel_i is safe to switch as soon as fence_reached_o goes high. + // + // When no HWPE is running (all are either paused or done), the mux is granted + // to the lowest-indexed HWPE that is paused AND whose fence dependencies are + // satisfied (s_resume high). This serializes same-phase jobs by master ID and + // respects cross-phase data dependencies. + logic [$clog2(N_HWPE > 1 ? 
N_HWPE-1 : 1):0] s_mux_sel; + always_comb begin + automatic logic any_running; + any_running = 1'b0; + for (int i = 0; i < N_HWPE; i++) begin + if (!s_end_resp[N_LOG_MASTERS + i] && !s_fence_reached[N_LOG_MASTERS + i]) + any_running = 1'b1; + end + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(N_HWPE - 1); + for (int i = N_HWPE-1; i >= 0; i--) begin + if (any_running) begin + // Active HWPE holds the mux; lowest index wins (descending loop) + if (!s_end_resp[N_LOG_MASTERS + i] && !s_fence_reached[N_LOG_MASTERS + i]) + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); + end else begin + // No HWPE running: grant to lowest-indexed paused+ready HWPE + if (!s_end_resp[N_LOG_MASTERS + i] && s_fence_reached[N_LOG_MASTERS + i] + && s_resume[N_LOG_MASTERS + i]) + s_mux_sel = ($clog2(N_HWPE > 1 ? N_HWPE-1 : 1) + 1)'(i); + end + end + end + hci_core_mux_static #( + .NB_CHAN(N_HWPE), + .`HCI_SIZE_PARAM(in)(HCI_SIZE_hwpe) + ) i_hwpe_mux ( + .clk_i(clk), + .rst_ni(rst_n), + .clear_i(s_clear), + .sel_i(s_mux_sel), + .in(hci_driver_hwpe_if), + .out(hci_initiator_wide[0]) + ); + end else if (INTERCO_TYPE == LOG) begin : gen_hwpe_split + for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_hwpe_split_per_master + hci_core_split #( + .DW(HWPE_WIDTH_FACT * DATA_WIDTH), + .BW(DATA_WIDTH / 8), + .UW(1), + .NB_OUT_CHAN(HWPE_WIDTH_FACT), + .FIFO_DEPTH(2), + .`HCI_SIZE_PARAM(tcdm_target)(HCI_SIZE_hwpe) + ) i_hwpe_to_log_split ( + .clk_i(clk), + .rst_ni(rst_n), + .clear_i(s_clear), + .tcdm_target(hci_driver_hwpe_if[ii]), + .tcdm_initiator(hci_initiator_narrow[N_CORE + ii * HWPE_WIDTH_FACT : N_CORE + (ii + 1) * HWPE_WIDTH_FACT - 1]) + ); + end + end else begin : gen_unsupported_mode + initial $error("Unsupported INTERCO_TYPE"); + end + endgenerate + + ///////////////// + // Fence logic // + ///////////////// - assign s_clear = 0; + logic s_clear; + assign s_clear = 1'b0; + + assign s_hci_ctrl.arb_policy = ARBITER_MODE; assign s_hci_ctrl.invert_prio = INVERT_PRIO; - assign 
s_hci_ctrl.low_prio_max_stall = LOW_PRIO_MAX_STALL; + assign s_hci_ctrl.priority_cnt_numerator = PRIORITY_CNT_NUMERATOR; + assign s_hci_ctrl.priority_cnt_denominator = PRIORITY_CNT_DENOMINATOR; + + // fence_idx[i] = number of PAUSE tokens driver i has passed. + // This counts all fences in file order, including synthetic blocking fences + // and trailing completion fences. + // + // fence_idx increments when resume_i is asserted while fence_reached_o is high, + // i.e. when the PAUSED-state handshake completes and the driver leaves that fence. + always_ff @(posedge clk or negedge rst_n) begin + if (!rst_n) begin + for (int i = 0; i < N_DRIVERS; i++) fence_idx[i] <= '0; + end else begin + for (int i = 0; i < N_DRIVERS; i++) begin + if (s_resume[i] && s_fence_reached[i]) + fence_idx[i] <= fence_idx[i] + 1; + end + end + end + + // s_resume[i] is asserted only while driver i is paused at its current fence. + // Driver i may pass that fence when, for every dependency bit j set in the + // current FENCE_MASKS entry, fence_idx[j] is at least the required level + // encoded in FENCE_REQ_LEVELS_PACKED. + // + // In other words: blocking fences wait for explicit dependency completion; + // trailing zero-mask fences are free passes. + always_comb begin + for (int i = 0; i < N_DRIVERS; i++) begin + automatic logic [N_DRIVERS-1:0] cur_mask; + automatic logic all_satisfied; + cur_mask = (fence_idx[i] < MAX_FENCES) ? FENCE_MASKS[i][fence_idx[i]] : '0; + all_satisfied = 1'b1; + for (int j = 0; j < N_DRIVERS; j++) begin + if (cur_mask[j]) begin + automatic logic [3:0] req; + req = (fence_idx[i] < MAX_FENCES) ? + FENCE_REQ_LEVELS_PACKED[i][fence_idx[i]][j*4+3 -: 4] : 4'h0; + if (fence_idx[j] < req) + all_satisfied = 1'b0; + end + end + // Only assert resume_i while the driver is actually in PAUSED state. + // Gating with fence_reached_o makes the signal a clean pulse. 
+ s_resume[i] = all_satisfied && s_fence_reached[i]; + end + end + + ///////// + // HCI // + ///////// hci_interconnect #( - .N_HWPE(N_HWPE), // Number of HWPEs attached to the port - .N_CORE(N_CORE), // Number of Core ports - .N_DMA(N_DMA), // Number of DMA ports - .N_EXT(N_EXT), // Number of External ports - .N_MEM(N_BANKS), // Number of Memory banks - .TS_BIT(TS_BIT), // TEST_SET_BIT (for Log Interconnect) - .IW(IW), // ID Width - .EXPFIFO(EXPFIFO), // FIFO Depth for HWPE Interconnect - .SEL_LIC(SEL_LIC), // Log interconnect type selector + .N_HWPE(N_WIDE_HCI), + .N_CORE(N_NARROW_HCI), + .N_DMA(N_DMA), + .N_EXT(N_EXT), + .N_MEM(N_BANKS), + .TS_BIT(TS_BIT), + .IW(IW), + .EXPFIFO(EXPFIFO), + .SEL_LIC(SEL_LIC), + .FILTER_WRITE_R_VALID(FILTER_WRITE_R_VALID), .HCI_SIZE_cores(HCI_SIZE_cores), .HCI_SIZE_mems(HCI_SIZE_mems), .HCI_SIZE_hwpe(HCI_SIZE_hwpe), @@ -144,11 +355,11 @@ module tb_hci .rst_ni(rst_n), .clear_i(s_clear), .ctrl_i(s_hci_ctrl), - .cores(hci_log_if[0:N_CORE-1]), - .dma(hci_log_if[N_CORE:N_CORE+N_DMA-1]), - .ext(hci_log_if[N_CORE+N_DMA:N_CORE+N_DMA+N_EXT-1]), - .mems(hci_mem_if), - .hwpe(hci_hwpe_if) + .cores(hci_initiator_narrow), + .dma(hci_initiator_dma), + .ext(hci_initiator_ext), + .mems(hci_target_mems), + .hwpe(hci_initiator_wide) ); ////////// @@ -159,66 +370,67 @@ module tb_hci .BankSize(N_WORDS), .NbBanks(N_BANKS), .DataWidth(DATA_WIDTH), - .AddrWidth(ADD_WIDTH), - .BeWidth(DATA_WIDTH/8), + .AddrWidth(ADDR_WIDTH), + .BeWidth(DATA_WIDTH / 8), .IdWidth(IW) ) i_tb_mem ( .clk_i(clk), .rst_ni(rst_n), .test_mode_i(1'b0), - .tcdm_slave(hci_mem_if) + .tcdm_slave(hci_target_mems) ); ///////////////////////// // Application drivers // ///////////////////////// - /* CORE + DMA + EXT */ + int unsigned s_issued_transactions[0:N_DRIVERS-1]; + int unsigned s_issued_read_transactions[0:N_DRIVERS-1]; + generate - for (genvar ii = 0; ii < N_MASTER - N_HWPE; ii++) begin : gen_app_driver_log + for (genvar ii = 0; ii < N_LOG_MASTERS; ii++) begin : 
gen_app_driver_log localparam string STIM_FILE_LOG = - $sformatf("../simvectors/generated/stimuli_processed/master_log_%0d.txt", ii); + $sformatf("../simvectors/generated/stimuli/master_log_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), - .IS_HWPE(0), .DATA_WIDTH(DATA_WIDTH), - .ADD_WIDTH(ADD_WIDTH), - .APPL_DELAY(APPL_DELAY), // Delay on the input signals - .IW(IW), + .ADDR_WIDTH(ADDR_WIDTH), + .IW(IW_cores), .STIM_FILE(STIM_FILE_LOG) ) i_app_driver_log ( - .hci_if(hci_log_if[ii]), - .rst_ni(rst_n), .clk_i(clk), - .end_stimuli_o(s_end_stimuli[ii]), - .end_latency_o(s_end_latency[ii]), - .n_issued_transactions_o(s_issued_transactions[ii]), - .n_issued_read_transactions_o(s_issued_read_transactions[ii]) + .rst_ni(rst_n), + .resume_i(s_resume[ii]), + .hci_if(hci_driver_log_if[ii]), + .fence_reached_o(s_fence_reached[ii]), + .end_resp_o(s_end_resp[ii]), + .n_issued_tr_o(s_issued_transactions[ii]), + .n_issued_rd_tr_o(s_issued_read_transactions[ii]), + .n_retired_rd_tr_o() ); end endgenerate - /* HWPE */ generate for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_app_driver_hwpe localparam string STIM_FILE_HWPE = - $sformatf("../simvectors/generated/stimuli_processed/master_hwpe_%0d.txt", ii); + $sformatf("../simvectors/generated/stimuli/master_hwpe_%0d.txt", ii); application_driver #( .MASTER_NUMBER(ii), - .IS_HWPE(1), - .DATA_WIDTH(HWPE_WIDTH * DATA_WIDTH), - .ADD_WIDTH(ADD_WIDTH), - .APPL_DELAY(APPL_DELAY), // Delay on the input signals - .IW(IW), + .DATA_WIDTH(HWPE_WIDTH_FACT * DATA_WIDTH), + .ADDR_WIDTH(ADDR_WIDTH), + .IW(IW_hwpe), .STIM_FILE(STIM_FILE_HWPE) ) i_app_driver_hwpe ( - .hci_if(hci_hwpe_if[ii]), - .rst_ni(rst_n), .clk_i(clk), - .end_stimuli_o(s_end_stimuli[N_MASTER-N_HWPE+ii]), - .end_latency_o(s_end_latency[N_MASTER-N_HWPE+ii]), - .n_issued_transactions_o(s_issued_transactions[N_MASTER-N_HWPE+ii]), - .n_issued_read_transactions_o(s_issued_read_transactions[N_MASTER-N_HWPE+ii]) + .rst_ni(rst_n), + .resume_i(s_resume[N_LOG_MASTERS + ii]), + 
.hci_if(hci_driver_hwpe_if[ii]), + .fence_reached_o(s_fence_reached[N_LOG_MASTERS + ii]), + .end_resp_o(s_end_resp[N_LOG_MASTERS + ii]), + .n_issued_tr_o(s_issued_transactions[N_LOG_MASTERS + ii]), + .n_issued_rd_tr_o(s_issued_read_transactions[N_LOG_MASTERS + ii]), + .n_retired_rd_tr_o() ); end endgenerate @@ -227,54 +439,48 @@ module tb_hci // QoS // ///////// - real SUM_REQ_TO_GNT_LATENCY_LOG[N_MASTER-N_HWPE]; + real SUM_REQ_TO_GNT_LATENCY_LOG[N_LOG_MASTERS]; real SUM_REQ_TO_GNT_LATENCY_HWPE[N_HWPE]; - int unsigned N_GNT_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_GNT_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_GNT_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_READ_GRANTED_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_READ_GRANTED_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_READ_GRANTED_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_WRITE_GRANTED_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_WRITE_GRANTED_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_WRITE_GRANTED_TRANSACTIONS_HWPE[N_HWPE]; - int unsigned N_READ_COMPLETE_TRANSACTIONS_LOG[N_MASTER-N_HWPE]; + int unsigned N_READ_COMPLETE_TRANSACTIONS_LOG[N_LOG_MASTERS]; int unsigned N_READ_COMPLETE_TRANSACTIONS_HWPE[N_HWPE]; - /* REAL THROUGHPUT AND SIMULATION TIME */ - - real latency_per_master[N_MASTER]; + real latency_per_master[N_DRIVERS]; real throughput_completed; - real stim_latency; real tot_latency; - throughput_monitor #( - .N_MASTER(N_MASTER), + bandwidth_monitor #( + .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE), .CLK_PERIOD(CLK_PERIOD), .DATA_WIDTH(DATA_WIDTH), - .HWPE_WIDTH(HWPE_WIDTH) - ) i_throughput_monitor ( + .HWPE_WIDTH_FACT(HWPE_WIDTH_FACT) + ) i_bandwidth_monitor ( .clk_i(clk), .rst_ni(rst_n), - .end_stimuli_i(s_end_stimuli), - .end_latency_i(s_end_latency), + .end_resp_i(s_end_resp), .n_read_complete_log_i(N_READ_COMPLETE_TRANSACTIONS_LOG), .n_read_complete_hwpe_i(N_READ_COMPLETE_TRANSACTIONS_HWPE), .n_write_granted_log_i(N_WRITE_GRANTED_TRANSACTIONS_LOG), 
.n_write_granted_hwpe_i(N_WRITE_GRANTED_TRANSACTIONS_HWPE), .throughput_complete_o(throughput_completed), - .stim_latency_o(stim_latency), .tot_latency_o(tot_latency), .latency_per_master_o(latency_per_master) ); - /* LATENCY MONITOR */ - latency_monitor #( - .N_MASTER(N_MASTER), + req_gnt_monitor #( + .N_MASTER(N_DRIVERS), .N_HWPE(N_HWPE) - ) i_latency_monitor ( + ) i_req_gnt_monitor ( .clk_i(clk), .rst_ni(rst_n), - .hci_log_if(hci_log_if), - .hci_hwpe_if(hci_hwpe_if), + .hci_driver_log_if(hci_driver_log_if), + .hci_driver_hwpe_if(hci_driver_hwpe_if), .sum_req_to_gnt_latency_log_o(SUM_REQ_TO_GNT_LATENCY_LOG), .sum_req_to_gnt_latency_hwpe_o(SUM_REQ_TO_GNT_LATENCY_HWPE), .n_gnt_transactions_log_o(N_GNT_TRANSACTIONS_LOG), @@ -287,16 +493,13 @@ module tb_hci .n_read_complete_hwpe_o(N_READ_COMPLETE_TRANSACTIONS_HWPE) ); - /////////// - // Other // - /////////// + /////////////// + // Reporting // + /////////////// - /* SIMULATION REPORT */ simulation_report i_simulation_report ( - .end_stimuli_i(s_end_stimuli), - .end_latency_i(s_end_latency), + .end_resp_i(s_end_resp), .throughput_complete_i(throughput_completed), - .stim_latency_i(stim_latency), .tot_latency_i(tot_latency), .latency_per_master_i(latency_per_master), .sum_req_to_gnt_latency_log_i(SUM_REQ_TO_GNT_LATENCY_LOG), @@ -311,7 +514,10 @@ module tb_hci .n_read_complete_transactions_hwpe_i(N_READ_COMPLETE_TRANSACTIONS_HWPE) ); - /* ASSERTIONS */ + //////////////// + // Assertions // + //////////////// + localparam int unsigned MAX_BANK_LOCAL_ADDR = TOT_MEM_SIZE * 1024 / N_BANKS - WIDTH_OF_MEMORY_BYTE; @@ -319,15 +525,15 @@ module tb_hci for (genvar ii = 0; ii < N_HWPE; ii++) begin : gen_assert_hwpe_address a_hwpe_addr_in_bounds: assert property ( @(posedge clk) - get_bank_local_address(hci_hwpe_if[ii].add) <= MAX_BANK_LOCAL_ADDR + get_bank_local_address(hci_driver_hwpe_if[ii].add) <= MAX_BANK_LOCAL_ADDR ) else begin $display("--------------------------------------------"); $display("Time %0t: Test stopped", 
$time); $error( "HWPE%0d generated an out-of-bounds address (raw=0x%0h, bank_local=0x%0h, max=0x%0h).", ii, - hci_hwpe_if[ii].add, - get_bank_local_address(hci_hwpe_if[ii].add), + hci_driver_hwpe_if[ii].add, + get_bank_local_address(hci_driver_hwpe_if[ii].add), MAX_BANK_LOCAL_ADDR ); $display("This workload is invalid; rerun with a different workload configuration."); @@ -336,4 +542,17 @@ module tb_hci end endgenerate + // Advisory check only. The hard overflow guard is in Python generation + // before packing FENCE_REQ_LEVELS into 4-bit fields. + // In case of failure due to this assertion, modify tb_hci_pkg.sv and generation of fence_masks.mk + initial begin + if (MAX_FENCES > 16) begin + $warning( + "MAX_FENCES=%0d exceeds the nominal 4-bit fence-level range; " + "ensure no dependency requires a level > 15.", + MAX_FENCES + ); + end + end + endmodule diff --git a/target/verif/src/tb_hci_pkg.sv b/target/verif/src/tb_hci_pkg.sv index cdaf055..507565c 100644 --- a/target/verif/src/tb_hci_pkg.sv +++ b/target/verif/src/tb_hci_pkg.sv @@ -4,8 +4,7 @@ * Sergio Mazzola * Luca Codeluppi * - * - * Copyright (C) 2019-2025 ETH Zurich, University of Bologna + * Copyright (C) 2019-2026 ETH Zurich, University of Bologna * Copyright and related rights are licensed under the Solderpad Hardware * License, Version 0.51 (the "License"); you may not use this file except in * compliance with the License. 
You may obtain a copy of the License at @@ -18,77 +17,146 @@ package tb_hci_pkg; + typedef enum logic [1:0] { + LOG = 2'd0, + MUX = 2'd1, + HCI = 2'd2 + } interco_e; + ////////////////////////// // Testbench parameters // ////////////////////////// // from verif/config/testbench.mk - /* Timing parameters */ - - localparam time CLK_PERIOD = `ifdef CLK_PERIOD `CLK_PERIOD `else 6 `endif; - localparam time APPL_DELAY = 0; - localparam unsigned RST_CLK_CYCLES = `ifdef RST_CLK_CYCLES `RST_CLK_CYCLES `else 10 `endif; + localparam time CLK_PERIOD = `ifdef CLK_PERIOD `CLK_PERIOD `else 6 `endif; + localparam time APPL_DELAY = 0; + localparam unsigned RST_CLK_CYCLES = `ifdef RST_CLK_CYCLES `RST_CLK_CYCLES `else 10 `endif; - /* Simulation parameters */ + // TCDM and arbitration parameters + localparam int unsigned RANDOM_GNT = `ifdef RANDOM_GNT `RANDOM_GNT `else 0 `endif; + localparam int unsigned ARBITER_MODE = 0; + localparam int unsigned INVERT_PRIO = `ifdef INVERT_PRIO `INVERT_PRIO `else 0 `endif; + localparam int unsigned PRIORITY_CNT_NUMERATOR = `ifdef PRIORITY_CNT_NUMERATOR `PRIORITY_CNT_NUMERATOR `else 3 `endif; + localparam int unsigned PRIORITY_CNT_DENOMINATOR = `ifdef PRIORITY_CNT_DENOMINATOR `PRIORITY_CNT_DENOMINATOR `else 4 `endif; - // Transaction counts - localparam int unsigned N_TRANSACTION_LOG = `ifdef N_TRANSACTION_LOG `N_TRANSACTION_LOG `else 10 `endif; - localparam int unsigned TRANSACTION_RATIO = `ifdef TRANSACTION_RATIO `TRANSACTION_RATIO `else 1 `endif; - localparam int unsigned N_TRANSACTION_HWPE = int'(N_TRANSACTION_LOG * TRANSACTION_RATIO); - - // TCDM interface parameters - localparam int unsigned MAX_CYCLES_BETWEEN_GNT_RVALID = `ifdef MAX_CYCLES_BETWEEN_GNT_RVALID `MAX_CYCLES_BETWEEN_GNT_RVALID `else 1 `endif; - localparam int unsigned RANDOM_GNT = `ifdef RANDOM_GNT `RANDOM_GNT `else 0 `endif; + ///////////////////////////// + // Configurable parameters // + ///////////////////////////// + // from verif/config/hardware.mk - // Arbiter configuration 
- localparam int unsigned ARBITER_MODE = 0; - localparam int unsigned INVERT_PRIO = `ifdef INVERT_PRIO `INVERT_PRIO `else 0 `endif; - localparam int unsigned LOW_PRIO_MAX_STALL = `ifdef LOW_PRIO_MAX_STALL `LOW_PRIO_MAX_STALL `else 3 `endif; + // Interconnect mode + localparam interco_e INTERCO_TYPE = `ifdef INTERCO_TYPE `INTERCO_TYPE `else HCI `endif; - ///////////////////////// - // Hardware parameters // - ///////////////////////// - // from verif/config/hardware.mk + // Number of initiators + localparam int unsigned N_HWPE = `ifdef N_HWPE `N_HWPE `else 1 `endif; + localparam int unsigned N_CORE = `ifdef N_CORE `N_CORE `else 1 `endif; + localparam int unsigned N_DMA = `ifdef N_DMA `N_DMA `else 1 `endif; + localparam int unsigned N_EXT = `ifdef N_EXT `N_EXT `else 1 `endif; - /* Config */ + // Interconnect configuration + localparam int unsigned TS_BIT = `ifdef TS_BIT `TS_BIT `else 0 `endif; + localparam int unsigned EXPFIFO = `ifdef EXPFIFO `EXPFIFO `else 0 `endif; + localparam int unsigned SEL_LIC = `ifdef SEL_LIC `SEL_LIC `else 0 `endif; - // Master port counts - localparam int unsigned N_HWPE_REAL = `ifdef N_HWPE `N_HWPE `else 1 `endif; // Number of HWPEs attached to the port - localparam int unsigned N_CORE_REAL = `ifdef N_CORE `N_CORE `else 1 `endif; // Number of Core ports - localparam int unsigned N_DMA_REAL = `ifdef N_DMA `N_DMA `else 1 `endif; // Number of DMA ports - localparam int unsigned N_EXT_REAL = `ifdef N_EXT `N_EXT `else 1 `endif; // Number of External ports + // Memory system configuration + localparam int unsigned N_BANKS = `ifdef N_BANKS `N_BANKS `else 16 `endif; + localparam int unsigned TOT_MEM_SIZE = `ifdef TOT_MEM_SIZE `TOT_MEM_SIZE `else 32 `endif; + localparam int unsigned DATA_WIDTH = `ifdef DATA_WIDTH `DATA_WIDTH `else 32 `endif; + localparam int unsigned HWPE_WIDTH_FACT = `ifdef HWPE_WIDTH_FACT `HWPE_WIDTH_FACT `else 4 `endif; - // Normalized master counts (minimum 1 for array sizing) - localparam int unsigned N_HWPE = (N_HWPE_REAL == 
0) ? 1 : N_HWPE_REAL; - localparam int unsigned N_CORE = (N_CORE_REAL == 0) ? 1 : N_CORE_REAL; - localparam int unsigned N_DMA = (N_DMA_REAL == 0) ? 1 : N_DMA_REAL; - localparam int unsigned N_EXT = (N_EXT_REAL == 0) ? 1 : N_EXT_REAL; - localparam int unsigned N_MASTER = N_HWPE + N_CORE + N_DMA + N_EXT; // Total number of masters - localparam int unsigned N_MASTER_REAL = N_HWPE_REAL + N_CORE_REAL + N_DMA_REAL + N_EXT_REAL; // Total number of masters (real) + ////////////////////////// + // Hardcoded parameters // + ////////////////////////// - // Interconnect configuration - localparam int unsigned TS_BIT = `ifdef TS_BIT `TS_BIT `else 0 `endif; // TEST_SET_BIT (for Log Interconnect) - localparam int unsigned EXPFIFO = `ifdef EXPFIFO `EXPFIFO `else 0 `endif; // FIFO Depth for HWPE Interconnect - localparam int unsigned SEL_LIC = `ifdef SEL_LIC `SEL_LIC `else 0 `endif; // Log interconnect type selector + localparam int unsigned WORD_SIZE = DATA_WIDTH / 8; - // Data and memory parameters - localparam int unsigned DATA_WIDTH = `ifdef DATA_WIDTH `DATA_WIDTH `else 32 `endif; // Width of DATA in bits - localparam int unsigned HWPE_WIDTH = `ifdef HWPE_WIDTH `HWPE_WIDTH `else 4 `endif; // Width of an HWPE wide-word - localparam int unsigned TOT_MEM_SIZE = `ifdef TOT_MEM_SIZE `TOT_MEM_SIZE `else 32 `endif; // Memory size (kB) - localparam int unsigned N_BANKS = `ifdef N_BANKS `N_BANKS `else 16 `endif; // Number of memory banks + ////////////////////////// + // Dependent parameters // + ////////////////////////// - /* Derived parameters */ + localparam int unsigned N_LOG_MASTERS = N_CORE + N_DMA + N_EXT; + localparam int unsigned N_DRIVERS = N_LOG_MASTERS + N_HWPE; + +// Fence parameters. +// +// Fence slots enumerate every PAUSE token in the stimulus file, in file order. +// This includes: +// (1) synthetic blocking PAUSEs inserted before patterns that have wait_for_jobs, +// (2) trailing PAUSEs emitted after every pattern. 
+// +// fence_idx[i] counts how many PAUSE tokens driver i has passed so far. +// It is therefore a "passed-fence count", not a "completed-pattern count". +// +// FENCE_MASKS[i][f][j] = 1: +// at fence slot f of driver i, driver j is a dependency. +// +// FENCE_REQ_LEVELS_PACKED[i][f][j]: +// minimum fence_idx[j] required before driver i may pass fence slot f. +// +// For a synthetic pre-pattern fence, the required level corresponds to the +// dependency driver's fence count after the referenced job has completed. +// For a trailing pattern fence, the mask is zero and the fence is a free pass. +// +// Both arrays are generated by main.py and passed via defines. + localparam int unsigned MAX_FENCES = + `ifdef MAX_FENCES_PARAM `MAX_FENCES_PARAM `else 1 `endif; + localparam logic [N_DRIVERS-1:0] FENCE_MASKS [N_DRIVERS][MAX_FENCES] = + `ifdef FENCE_MASKS_PARAM `FENCE_MASKS_PARAM `else '{default: '{default: '0}} `endif; + // FENCE_REQ_LEVELS_PACKED[i][f] is a packed vector of N_DRIVERS × 4 bits. + // Bits [j*4+3:j*4] hold the required fence_idx[j] before driver i can pass fence f. + // 4 bits supports up to 15 (sufficient for up to 7 patterns × 2 fences/pattern). + localparam logic [N_DRIVERS*4-1:0] FENCE_REQ_LEVELS_PACKED [N_DRIVERS][MAX_FENCES] = + `ifdef FENCE_REQ_LEVELS_PARAM `FENCE_REQ_LEVELS_PARAM `else '{default: '{default: '0}} `endif; + + // If fully log interconnect is used, instantiate HWPE_WIDTH_FACT narrow ports for each HWPE. + localparam int unsigned N_NARROW_HCI = + N_CORE + (N_HWPE * HWPE_WIDTH_FACT) * (INTERCO_TYPE == LOG); + + // If full HCI is used, instantiate one wide port per HWPE. + // If static MUX is used, instantiate a single shared wide port. + localparam int unsigned N_WIDE_HCI = + (INTERCO_TYPE == HCI) ? N_HWPE : ((INTERCO_TYPE == MUX) ? 1 : 0); + + // One-hot response ID width used by the interconnect. 
+ localparam int unsigned IW = N_NARROW_HCI + N_WIDE_HCI + N_DMA + N_EXT; + + localparam int unsigned FILTER_WRITE_R_VALID[0:N_WIDE_HCI-1] = '{default: 0}; + + localparam int unsigned ADDR_WIDTH = $clog2(TOT_MEM_SIZE * 1024); + localparam int unsigned WIDTH_OF_MEMORY = DATA_WIDTH; + localparam int unsigned WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8; + localparam int unsigned BIT_BANK_INDEX = $clog2(N_BANKS); + localparam int unsigned ADDR_WIDTH_BANK = ADDR_WIDTH - BIT_BANK_INDEX; + localparam int unsigned N_WORDS = + (TOT_MEM_SIZE * 1024 / N_BANKS) / WIDTH_OF_MEMORY_BYTE; - localparam int unsigned ADD_WIDTH = $clog2(TOT_MEM_SIZE * 1024); // Width of ADDRESS in bits - localparam int unsigned WIDTH_OF_MEMORY = DATA_WIDTH; // Width of a memory bank (bits) - localparam int unsigned WIDTH_OF_MEMORY_BYTE = WIDTH_OF_MEMORY / 8; // Width of a memory bank (bytes) - localparam int unsigned BIT_BANK_INDEX = $clog2(N_BANKS); // Bits of the Bank index - localparam int unsigned AddrMemWidth = ADD_WIDTH - BIT_BANK_INDEX; // Number of address bits per TCDM bank - localparam int unsigned N_WORDS = (TOT_MEM_SIZE * 1024 / N_BANKS) / WIDTH_OF_MEMORY_BYTE; // Number of words in a bank - localparam int unsigned FILTER_WRITE_R_VALID = '0; + /////////////// + // Bitwidths // + /////////////// - localparam int unsigned IW = $clog2(N_TRANSACTION_LOG * (N_MASTER_REAL - N_HWPE_REAL) + N_TRANSACTION_HWPE * N_HWPE_REAL); // ID Width - localparam int unsigned TOT_CHECK = N_TRANSACTION_LOG * (N_CORE_REAL + N_DMA_REAL + N_EXT_REAL) + N_HWPE_REAL * N_TRANSACTION_HWPE * HWPE_WIDTH; + localparam int unsigned DW_cores = DATA_WIDTH; + localparam int unsigned AW_cores = 32; + localparam int unsigned BW_cores = 8; + localparam int unsigned UW_cores = 1; + localparam int unsigned IW_cores = 8; + localparam int unsigned EW_cores = 1; + localparam int unsigned EHW_cores = 1; + + localparam int unsigned DW_hwpe = DW_cores * HWPE_WIDTH_FACT; + localparam int unsigned AW_hwpe = AW_cores; + localparam int 
unsigned BW_hwpe = BW_cores; + localparam int unsigned UW_hwpe = UW_cores; + localparam int unsigned IW_hwpe = IW_cores; + localparam int unsigned EW_hwpe = EW_cores; + localparam int unsigned EHW_hwpe = EHW_cores; + + localparam int unsigned DW_mems = DW_cores; + localparam int unsigned AW_mems = ADDR_WIDTH_BANK; + localparam int unsigned BW_mems = BW_cores; + localparam int unsigned UW_mems = UW_cores; + localparam int unsigned IW_mems = IW; + localparam int unsigned EW_mems = EW_cores; + localparam int unsigned EHW_mems = EHW_cores; /////////// // Types // @@ -97,45 +165,42 @@ package tb_hci_pkg; typedef struct packed { logic wen; logic [DATA_WIDTH-1:0] data; - logic [ADD_WIDTH-1:0] add; + logic [ADDR_WIDTH-1:0] add; } stimuli_t; typedef struct packed { - logic [DATA_WIDTH - 1 : 0] data; - logic [AddrMemWidth - 1 : 0] add; + logic [DATA_WIDTH-1:0] data; + logic [ADDR_WIDTH_BANK-1:0] add; } out_intc_to_mem_t; - // Helper return type for HWPE address/data creation typedef struct { - logic [ADD_WIDTH-1:0] address; + logic [ADDR_WIDTH-1:0] address; logic [DATA_WIDTH-1:0] data; - logic rolls_over; + logic rolls_over; } hwpe_addr_data_t; ///////////// // Helpers // ///////////// - // Zero-time pure function returning address/data for an HWPE lane function automatic hwpe_addr_data_t create_address_and_data_hwpe( - input logic [ADD_WIDTH-1:0] address_before, - input logic [HWPE_WIDTH * DATA_WIDTH-1:0] data_before, - input int index, - input logic rolls_over_check_before + input logic [ADDR_WIDTH-1:0] address_before, + input logic [HWPE_WIDTH_FACT * DATA_WIDTH-1:0] data_before, + input int index, + input logic rolls_over_check_before ); hwpe_addr_data_t ret; logic [BIT_BANK_INDEX-1:0] bank_index_before, bank_index_after; begin - bank_index_before = address_before[BIT_BANK_INDEX-1 + 2 : 2]; - // Legacy behavior: add full-width index and let truncation wrap - bank_index_after = index + bank_index_before; - ret.rolls_over = rolls_over_check_before; + bank_index_before = 
address_before[BIT_BANK_INDEX-1 + 2:2]; + bank_index_after = index + bank_index_before; + ret.rolls_over = rolls_over_check_before; if (bank_index_before > bank_index_after) begin ret.rolls_over = 1'b1; end ret.address = { - address_before[ADD_WIDTH-1:BIT_BANK_INDEX + 2] + ret.rolls_over, + address_before[ADDR_WIDTH-1:BIT_BANK_INDEX + 2] + ret.rolls_over, bank_index_after, address_before[1:0] }; @@ -145,43 +210,20 @@ package tb_hci_pkg; endfunction task calculate_bank_index( - input logic [ADD_WIDTH-1:0] address, + input logic [ADDR_WIDTH-1:0] address, output logic [BIT_BANK_INDEX-1:0] index ); index = address[BIT_BANK_INDEX-1 + 2:2]; endtask - /* Metrics helpers */ - - task calculate_theoretical_throughput(output real throughput_theo); - real tot_data, band_memory_limit, tot_time; - if (TRANSACTION_RATIO >= 1) begin - tot_time = N_TRANSACTION_HWPE; - end else begin - tot_time = N_TRANSACTION_LOG; - end - tot_data = (N_TRANSACTION_LOG * DATA_WIDTH) * (N_MASTER_REAL - N_HWPE_REAL) + - (N_TRANSACTION_HWPE * HWPE_WIDTH * DATA_WIDTH) * N_HWPE_REAL; // bit - throughput_theo = tot_data / tot_time; // bit per cycle - band_memory_limit = real'(N_BANKS * DATA_WIDTH); - if (throughput_theo >= band_memory_limit) begin - throughput_theo = band_memory_limit; - end - endtask - - /////////////// - // Functions // - /////////////// - - // Convert a full system address to per-bank local word address. 
- function int unsigned get_bank_local_address(input logic [ADD_WIDTH-1:0] addr_i); - logic [ADD_WIDTH-1:0] mapped_addr; - logic [ADD_WIDTH-BIT_BANK_INDEX-1:0] bank_local_addr; + function int unsigned get_bank_local_address(input logic [ADDR_WIDTH-1:0] addr_i); + logic [ADDR_WIDTH-1:0] mapped_addr; + logic [ADDR_WIDTH-BIT_BANK_INDEX-1:0] bank_local_addr; tb_hci_pkg::hwpe_addr_data_t hwpe_lane_addr_data; - hwpe_lane_addr_data = create_address_and_data_hwpe(addr_i, '0, HWPE_WIDTH, '0); + hwpe_lane_addr_data = create_address_and_data_hwpe(addr_i, '0, HWPE_WIDTH_FACT, '0); mapped_addr = hwpe_lane_addr_data.address; bank_local_addr = { - mapped_addr[ADD_WIDTH-1:BIT_BANK_INDEX + 2], + mapped_addr[ADDR_WIDTH-1:BIT_BANK_INDEX + 2], mapped_addr[1:0] }; return int'(bank_local_addr); diff --git a/target/verif/src/tcdm_banks_wrap.sv b/target/verif/src/tcdm_banks_wrap.sv index 46c251c..7aad9a8 100644 --- a/target/verif/src/tcdm_banks_wrap.sv +++ b/target/verif/src/tcdm_banks_wrap.sv @@ -83,18 +83,20 @@ module tcdm_banks_wrap #( .rdata_o(tcdm_slave[i].r_data ) // read data ); + //NOTE: Commented out. 
r_valid response is handled by interconnect + //r_valid - always_ff @(posedge clk_i or negedge rst_ni) begin : rvalid_gen - if(~rst_ni) begin - tcdm_slave[i].r_valid <= 1'b0; - end else begin - if(tcdm_slave[i].req && tcdm_slave[i].gnt && tcdm_slave[i].wen) begin - tcdm_slave[i].r_valid <= 1'b1; - end else begin - tcdm_slave[i].r_valid <= 1'b0; - end - end - end + // always_ff @(posedge clk_i or negedge rst_ni) begin : rvalid_gen + // if(~rst_ni) begin + // tcdm_slave[i].r_valid <= 1'b0; + // end else begin + // if(tcdm_slave[i].req && tcdm_slave[i].gnt && tcdm_slave[i].wen) begin + // tcdm_slave[i].r_valid <= 1'b1; + // end else begin + // tcdm_slave[i].r_valid <= 1'b0; + // end + // end + // end end endmodule diff --git a/target/verif/verif.mk b/target/verif/verif.mk index dd656b0..5d08eba 100644 --- a/target/verif/verif.mk +++ b/target/verif/verif.mk @@ -1,37 +1,76 @@ -# Copyright 2025 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE.solderpad for details. 
# SPDX-License-Identifier: SHL-0.51 # # Sergio Mazzola +.DELETE_ON_ERROR: + HCI_VERIF_DIR = $(HCI_ROOT)/target/verif HCI_VERIF_CFG_DIR = $(HCI_VERIF_DIR)/config HCI_VERIF_CFG_GEN_DIR = $(HCI_VERIF_CFG_DIR)/generated +# Other Makefiles +include $(HCI_VERIF_DIR)/exploration/exploration.mk + # Include generated Makefiles include $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk include $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk +ifeq (,$(filter clean%,$(MAKECMDGOALS))) +-include $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk +endif # Bender targets and defines include $(HCI_VERIF_DIR)/bender.mk # Tooling #NOTE: Only QuestaSim is currently supported by verification framework -SIM_QUESTA ?= questa-2022.3 +ifneq (,$(wildcard /etc/iis.version)) + SIM_QUESTA ?= questa-2022.3 +else + SIM_QUESTA ?= +endif SIM_VLIB ?= $(SIM_QUESTA) vlib SIM_VSIM ?= $(SIM_QUESTA) vsim SIM_VOPT ?= $(SIM_QUESTA) vopt PYTHON ?= python3 +################## +# Simvectors gen # +################## + +GEN_STIM_SCRIPT := $(HCI_VERIF_DIR)/simvectors/main.py +STIM_SRC_FILES := $(shell find $(HCI_VERIF_DIR)/config -type f -not -path '$(HCI_VERIF_CFG_GEN_DIR)/*') \ + $(shell find $(HCI_VERIF_DIR)/simvectors -type f -not -path '$(HCI_VERIF_DIR)/simvectors/generated/*') +SIMVECTORS_GEN_DIR := $(HCI_VERIF_DIR)/simvectors/generated + +.PHONY: stim-verif +stim-verif: $(SIMVECTORS_GEN_DIR)/.stim_stamp +$(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) $(VERIF_CFG_MK) $(STIM_SRC_FILES) $(GEN_STIM_SCRIPT) | $(HCI_VERIF_CFG_GEN_DIR) + mkdir -p $(SIMVECTORS_GEN_DIR) + $(PYTHON) $(GEN_STIM_SCRIPT) \ + --workload_config $(WORKLOAD_JSON) \ + --testbench_config $(TESTBENCH_JSON) \ + --hardware_config $(HARDWARE_JSON) \ + --emit_phases_mk $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk + date > $@ + +.PHONY: clean-stim-verif +clean-stim-verif: + rm -rf $(SIMVECTORS_GEN_DIR) + ############## # Config gen # ############## -# Source-of-truth JSON configs -VERIF_CFG_JSON := $(HCI_VERIF_CFG_DIR)/hardware.json \ - $(HCI_VERIF_CFG_DIR)/testbench.json \ - 
$(HCI_VERIF_CFG_DIR)/workload.json +# JSON configs are configurable from env var (default to config/) +HARDWARE_JSON ?= $(HCI_VERIF_CFG_DIR)/hardware.json +TESTBENCH_JSON ?= $(HCI_VERIF_CFG_DIR)/testbench.json +WORKLOAD_JSON ?= $(HCI_VERIF_CFG_DIR)/workload.json + +# Source-of-truth JSON configs (used as stim-verif dependencies) +VERIF_CFG_JSON := $(HARDWARE_JSON) $(TESTBENCH_JSON) $(WORKLOAD_JSON) # Makefiles to generate from JSON configs VERIF_CFG_MK := $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk \ @@ -39,45 +78,26 @@ VERIF_CFG_MK := $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk \ .PHONY: config-verif config-verif: $(VERIF_CFG_MK) -# Generate Makefiles from JSON configs -$(HCI_VERIF_CFG_GEN_DIR)/%.mk: $(HCI_VERIF_CFG_DIR)/%.json $(HCI_VERIF_CFG_GEN_DIR)/%.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) - $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py $* $(HCI_VERIF_CFG_DIR) $(HCI_VERIF_CFG_GEN_DIR) > $@ + +$(HCI_VERIF_CFG_GEN_DIR)/hardware.mk: $(HARDWARE_JSON) $(HCI_VERIF_CFG_GEN_DIR)/hardware.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) + $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py hardware $(HARDWARE_JSON) $(HCI_VERIF_CFG_GEN_DIR) > $@ + +$(HCI_VERIF_CFG_GEN_DIR)/testbench.mk: $(TESTBENCH_JSON) $(HCI_VERIF_CFG_GEN_DIR)/testbench.mk.tpl $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py | $(HCI_VERIF_CFG_GEN_DIR) + $(PYTHON) $(HCI_VERIF_CFG_GEN_DIR)/json_to_mk.py testbench $(TESTBENCH_JSON) $(HCI_VERIF_CFG_GEN_DIR) > $@ $(HCI_VERIF_CFG_GEN_DIR): mkdir -p $@ .PHONY: clean-config-verif clean-config-verif: - rm -f $(VERIF_CFG_MK) - -################## -# Simvectors gen # -################## - -GEN_STIM_SCRIPT := $(HCI_VERIF_DIR)/simvectors/main.py -STIM_SRC_FILES := $(shell find {$(HCI_VERIF_DIR)/config,$(HCI_VERIF_DIR)/simvectors} -type f) -SIMVECTORS_GEN_DIR := $(HCI_VERIF_DIR)/simvectors/generated - -.PHONY: stim-verif -stim-verif: $(SIMVECTORS_GEN_DIR)/.stim_stamp -$(SIMVECTORS_GEN_DIR)/.stim_stamp: $(VERIF_CFG_JSON) 
$(STIM_SRC_FILES) - mkdir -p $(SIMVECTORS_GEN_DIR) - $(PYTHON) $(GEN_STIM_SCRIPT) \ - --workload_config $(HCI_VERIF_CFG_DIR)/workload.json \ - --testbench_config $(HCI_VERIF_CFG_DIR)/testbench.json \ - --hardware_config $(HCI_VERIF_CFG_DIR)/hardware.json - date > $@ - -.PHONY: clean-stim-verif -clean-stim-verif: - rm -rf $(SIMVECTORS_GEN_DIR) + rm -f $(VERIF_CFG_MK) $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk ############## # Simulation # ############## # Parameters -GUI ?= 0 +GUI ?= $(if $(gui),$(gui),0) # Top-level to simulate sim_top_level ?= tb_hci sim_vsim_lib ?= $(HCI_VERIF_DIR)/vsim/work @@ -99,9 +119,15 @@ ifeq ($(GUI),0) SIM_HCI_VSIM_ARGS += -c endif -$(HCI_VERIF_DIR)/vsim/compile.tcl: $(HCI_ROOT)/Bender.lock $(HCI_ROOT)/Bender.yml $(HCI_ROOT)/bender.mk $(HCI_VERIF_DIR)/bender.mk $(SIM_SRC_FILES) $(VERIF_CFG_MK) +FENCE_MASKS_MK := $(HCI_VERIF_CFG_GEN_DIR)/fence_masks.mk +MAX_FENCES_PARAM = $(shell grep '^MAX_FENCES_PARAM' $(FENCE_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) +FENCE_MASKS_PARAM = $(shell grep '^FENCE_MASKS_PARAM' $(FENCE_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) +FENCE_REQ_LEVELS_PARAM = $(shell grep '^FENCE_REQ_LEVELS_PACKED_PARAM' $(FENCE_MASKS_MK) 2>/dev/null | cut -d' ' -f3-) + +$(HCI_VERIF_DIR)/vsim/compile.tcl: $(HCI_ROOT)/Bender.lock $(HCI_ROOT)/Bender.yml $(HCI_ROOT)/bender.mk $(HCI_VERIF_DIR)/bender.mk $(SIM_SRC_FILES) $(VERIF_CFG_MK) $(SIMVECTORS_GEN_DIR)/.stim_stamp mkdir -p $(HCI_VERIF_DIR)/vsim - $(BENDER) script vsim $(COMMON_DEFS) $(VERIF_DEFS) $(COMMON_TARGS) $(VERIF_TARGS) --vlog-arg="$(SIM_HCI_VLOG_ARGS)" > $@ + $(BENDER) script vsim $(COMMON_DEFS) $(VERIF_DEFS) $(COMMON_TARGS) $(VERIF_TARGS) \ + --vlog-arg="$(SIM_HCI_VLOG_ARGS) \"+define+MAX_FENCES_PARAM=$(MAX_FENCES_PARAM) +define+FENCE_MASKS_PARAM=$(FENCE_MASKS_PARAM) +define+FENCE_REQ_LEVELS_PARAM=$(FENCE_REQ_LEVELS_PARAM)\"" > $@ .PHONY: compile-verif compile-verif: $(sim_vsim_lib)/.hw_compiled @@ -119,11 +145,12 @@ $(sim_vsim_lib)/$(sim_top_level)_optimized/.tb_opt_compiled: 
$(sim_vsim_lib)/.hw date > $@ .PHONY: run-verif -run-verif: $(sim_vsim_lib)/$(sim_top_level)_optimized/.tb_opt_compiled $(SIMVECTORS_GEN_DIR)/.stim_stamp +run-verif: $(HCI_VERIF_DIR)/vsim/$(sim_top_level).tcl $(sim_vsim_lib)/$(sim_top_level)_optimized/.tb_opt_compiled $(SIMVECTORS_GEN_DIR)/.stim_stamp cd $(HCI_VERIF_DIR)/vsim && \ $(SIM_VSIM) $(SIM_HCI_VSIM_ARGS) \ $(sim_top_level)_optimized \ - -do 'set GUI $(GUI); source $(HCI_VERIF_DIR)/vsim/$(sim_top_level).tcl' + -do 'set GUI $(GUI); source $<' + .PHONY: clean-verif clean-sim-verif: diff --git a/target/verif/vsim/tb_hci.tcl b/target/verif/vsim/tb_hci.tcl index 01111cc..e0de90a 100644 --- a/target/verif/vsim/tb_hci.tcl +++ b/target/verif/vsim/tb_hci.tcl @@ -1,5 +1,176 @@ # If GUI is 1, spawn waveforms if {$GUI == 1} { echo "GUI mode enabled" + log -r /* + + set N_CORE [examine -radix dec /tb_hci_pkg/N_CORE] + set N_DMA [examine -radix dec /tb_hci_pkg/N_DMA] + set N_EXT [examine -radix dec /tb_hci_pkg/N_EXT] + set N_HWPE [examine -radix dec /tb_hci_pkg/N_HWPE] + set N_BANKS [examine -radix dec /tb_hci_pkg/N_BANKS] + + set N_LOG_MASTERS [examine -radix dec /tb_hci_pkg/N_LOG_MASTERS] + set N_DRIVERS [examine -radix dec /tb_hci_pkg/N_DRIVERS] + set N_NARROW_HCI [examine -radix dec /tb_hci_pkg/N_NARROW_HCI] + set N_WIDE_HCI [examine -radix dec /tb_hci_pkg/N_WIDE_HCI] + set HWPE_WIDTH_FACT [examine -radix dec /tb_hci_pkg/HWPE_WIDTH_FACT] + set INTERCO_TYPE [examine /tb_hci_pkg/INTERCO_TYPE] + set MAX_FENCES [examine -radix dec /tb_hci_pkg/MAX_FENCES] + + add wave -noupdate /tb_hci/clk + add wave -noupdate /tb_hci/rst_n + + add wave -noupdate -divider Interfaces + # ------------------------------------------------------------------------- + # Application-driver interfaces + # ------------------------------------------------------------------------- + add wave -noupdate -group driver_side -divider narrow_masters + for {set i 0} {$i < $N_LOG_MASTERS} {incr i} { + add wave -noupdate -group driver_side -group log_$i 
/tb_hci/hci_driver_log_if[$i]/* + } + + add wave -noupdate -group driver_side -divider hwpe_masters + for {set i 0} {$i < $N_HWPE} {incr i} { + add wave -noupdate -group driver_side -group hwpe_$i /tb_hci/hci_driver_hwpe_if[$i]/* + } + + # ------------------------------------------------------------------------- + # Interconnect-side interfaces + # ------------------------------------------------------------------------- + add wave -noupdate -group hci_initiator_side -divider narrow_cores + for {set i 0} {$i < $N_CORE} {incr i} { + add wave -noupdate -group hci_initiator_side -group core_$i /tb_hci/hci_initiator_narrow[$i]/* + } + + if {[string first "LOG" $INTERCO_TYPE] != -1} { + add wave -noupdate -group hci_initiator_side -divider narrow_hwpe_split + for {set i 0} {$i < $N_HWPE} {incr i} { + for {set f 0} {$f < $HWPE_WIDTH_FACT} {incr f} { + set idx [expr {$N_CORE + $i * $HWPE_WIDTH_FACT + $f}] + if {$idx < $N_NARROW_HCI} { + add wave -noupdate -group hci_initiator_side -group hwpe_$i -group lane_$f /tb_hci/hci_initiator_narrow[$idx]/* + } + } + } + } + + if {$N_WIDE_HCI > 0} { + add wave -noupdate -group hci_initiator_side -divider wide_hwpe + for {set i 0} {$i < $N_WIDE_HCI} {incr i} { + add wave -noupdate -group hci_initiator_side -group wide_$i /tb_hci/hci_initiator_wide[$i]/* + } + } + + add wave -noupdate -group hci_initiator_side -divider dma + for {set i 0} {$i < $N_DMA} {incr i} { + add wave -noupdate -group hci_initiator_side -group dma_$i /tb_hci/hci_initiator_dma[$i]/* + } + + add wave -noupdate -group hci_initiator_side -divider ext + for {set i 0} {$i < $N_EXT} {incr i} { + add wave -noupdate -group hci_initiator_side -group ext_$i /tb_hci/hci_initiator_ext[$i]/* + } + + # ------------------------------------------------------------------------- + # Memory slaves + # ------------------------------------------------------------------------- + add wave -noupdate -group memory_targets + for {set i 0} {$i < $N_BANKS} {incr i} { + add wave -noupdate 
-group memory_targets -group bank_$i /tb_hci/hci_target_mems[$i]/* + } + + add wave -noupdate -divider "Application drivers" + # ------------------------------------------------------------------------- + # Per-driver driver internals (req/resp FSM states, counters) + # ------------------------------------------------------------------------- + add wave -noupdate -group driver_internals -divider log_masters + for {set i 0} {$i < $N_LOG_MASTERS} {incr i} { + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/req_state_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/resp_state_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/tr_idx_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/n_req_issued_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/n_rd_req_issued_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/n_rd_resp_retired_q + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/fence_reached_o + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/end_resp_o + add wave -noupdate -group driver_internals -group log_$i \ + /tb_hci/gen_app_driver_log[$i]/i_app_driver_log/resume_i + } + + add wave -noupdate -group driver_internals -divider hwpe_masters + for {set i 0} {$i < $N_HWPE} {incr i} { + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/req_state_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/resp_state_q + add wave -noupdate -group driver_internals 
-group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/tr_idx_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/n_req_issued_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/n_rd_req_issued_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/n_rd_resp_retired_q + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/fence_reached_o + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/end_resp_o + add wave -noupdate -group driver_internals -group hwpe_$i \ + /tb_hci/gen_app_driver_hwpe[$i]/i_app_driver_hwpe/resume_i + } + + add wave -noupdate -divider Testbench + # ------------------------------------------------------------------------- + # Fence / synchronization signals + # ------------------------------------------------------------------------- + add wave -noupdate -group fence_sync /tb_hci/s_end_resp + add wave -noupdate -group fence_sync /tb_hci/s_fence_reached + add wave -noupdate -group fence_sync /tb_hci/s_resume + + for {set i 0} {$i < $N_DRIVERS} {incr i} { + add wave -noupdate -group fence_sync -group fence_idx /tb_hci/fence_idx[$i] + } + + # MUX sel (only present when INTERCO_TYPE == MUX) + if {[string first "MUX" $INTERCO_TYPE] != -1} { + add wave -noupdate -group fence_sync /tb_hci/gen_hwpe_mux/s_mux_sel + } + + + # ------------------------------------------------------------------------- + # Metrics + # ------------------------------------------------------------------------- + add wave -noupdate -group metrics /tb_hci/s_issued_transactions + add wave -noupdate -group metrics /tb_hci/s_issued_read_transactions + add wave -noupdate -group metrics /tb_hci/tot_latency + add wave -noupdate -group metrics 
/tb_hci/latency_per_master + add wave -noupdate -group metrics /tb_hci/throughput_completed + add wave -noupdate -group metrics /tb_hci/N_GNT_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_GNT_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/N_READ_GRANTED_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_READ_GRANTED_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/N_READ_COMPLETE_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_READ_COMPLETE_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_LOG + add wave -noupdate -group metrics /tb_hci/N_WRITE_GRANTED_TRANSACTIONS_HWPE + add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_LOG + add wave -noupdate -group metrics /tb_hci/SUM_REQ_TO_GNT_LATENCY_HWPE + + # ------------------------------------------------------------------------- + # HCI control + # ------------------------------------------------------------------------- + add wave -noupdate /tb_hci/s_clear + add wave -noupdate /tb_hci/s_hci_ctrl + + configure wave -signalnamewidth 1 +} else { + run -a } -run -a