diff --git a/.gitignore b/.gitignore index 43cd241c..8128d3ca 100644 --- a/.gitignore +++ b/.gitignore @@ -8,9 +8,10 @@ .bender working_dir/ -# Python virtual environment +# Python virtual environment and caches .cache/pip .venv +__pycache__ # Generated source files .generated diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9a98fdb8..fe12d745 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -57,6 +57,9 @@ chs-sw: vsim-compile: stage: build + extends: + - .cache-deps + - .init-env needs: - generate-rtl variables: @@ -101,6 +104,29 @@ sim-vsim: paths: - transcript +##################### +# Run Experiments # +##################### + +sim-exps: + stage: run + extends: + - .cache-deps + - .init-env + needs: + - chs-sw + - sn-sw + - vsim-compile + parallel: + matrix: + - EXPERIMENT: multicast + - EXPERIMENT: reduction + - EXPERIMENT: barrier + script: + - cd experiments/$EXPERIMENT && ./experiments.py --ci --actions sw run -j + artifacts: + expire_in: 7 days + ################### # Physical Design # ################### diff --git a/.gitlab/common.yml b/.gitlab/common.yml index ac4634c7..708c5cc5 100644 --- a/.gitlab/common.yml +++ b/.gitlab/common.yml @@ -68,7 +68,6 @@ variables: script: - make sn-tests DEBUG=ON - make sn-apps DEBUG=ON - - make pb-sn-tests artifacts: paths: - sw/snitch/tests/build/*.elf diff --git a/.gitlab/sw-tests.yml b/.gitlab/sw-tests.yml index 42cc5c1d..d1cc08eb 100644 --- a/.gitlab/sw-tests.yml +++ b/.gitlab/sw-tests.yml @@ -23,7 +23,7 @@ - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/non_null_exitcode.elf, NZ_EXIT_CODE: 896 } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/multicluster_atomics.elf } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/mcast_barrier.elf } - - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/dma_multicast.elf } + - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: 
$SN_BUILD_DIR/multicast_benchmark.elf } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/multi_mcast.elf } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/row_col_mcast.elf } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/access_spm.elf } diff --git a/Bender.lock b/Bender.lock index 55c39087..10fa0b86 100644 --- a/Bender.lock +++ b/Bender.lock @@ -25,7 +25,7 @@ packages: - obi_peripherals - register_interface axi: - revision: bec548fa2a9b18cbd7531105bb1fdf481ea8ad49 + revision: 06410c36819924e32db2afa428d244dbdbcd5d4e version: null source: Git: https://github.com/colluca/axi.git @@ -129,8 +129,8 @@ packages: - common_cells - register_interface cluster_icache: - revision: 64e21ae455bbdde850c4df13bef86ea55ac42537 - version: 0.2.0 + revision: 1642961b9561513f283674efc2c90f24855e8d39 + version: 0.3.1 source: Git: https://github.com/pulp-platform/cluster_icache.git dependencies: @@ -146,7 +146,7 @@ packages: dependencies: - common_cells common_cells: - revision: e1c09c75775c5f03eb45906d5145dbd2f5bcfb95 + revision: ca9d577f2fbc45ec557ab4e0905bbfc154441540 version: null source: Git: https://github.com/pulp-platform/common_cells.git @@ -207,7 +207,7 @@ packages: Path: .deps/fhg_spu_cluster dependencies: [] floo_noc: - revision: ea35a9909d60d552bbdafc7e898590f326b1898a + revision: 8d19d926fbc9174984a67d76f349d82ea117c443 version: null source: Git: https://github.com/pulp-platform/FlooNoC.git @@ -216,7 +216,15 @@ packages: - axi_riscv_atomics - common_cells - common_verification + - floo_noc_pd + - fpnew - idma + floo_noc_pd: + revision: null + version: null + source: + Path: working_dir/floo_noc/./pd + dependencies: [] fpnew: revision: a8e0cba6dd50f357ece73c2c955d96efc3c6c315 version: null @@ -266,8 +274,8 @@ packages: dependencies: - tech_cells_generic idma: - revision: 7829f71691a62c1e2e5e3230f370f222c7a83087 - version: null + revision: 28a36e5e07705549e59fc33db96ab681bc1ca88e 
+ version: 0.6.5 source: Git: https://github.com/pulp-platform/iDMA.git dependencies: @@ -383,7 +391,7 @@ packages: - common_cells - register_interface snitch_cluster: - revision: 5d6c957c70e3824e2330d8308eb904724cd41cbe + revision: ecf876f5d8726c6424970c99c5e91f83aaedeacd version: null source: Git: https://github.com/pulp-platform/snitch_cluster.git diff --git a/Bender.yml b/Bender.yml index a1628a36..88dc00a7 100644 --- a/Bender.yml +++ b/Bender.yml @@ -9,11 +9,11 @@ package: dependencies: register_interface: { git: "https://github.com/pulp-platform/register_interface.git", version: "0.4.5" } - axi: { git: "https://github.com/pulp-platform/axi.git", version: "0.39.6" } + axi: { git: "https://github.com/colluca/axi.git", rev: "multicast" } common_cells: { git: "https://github.com/pulp-platform/common_cells.git", rev: "snitch" } cheshire: { git: "https://github.com/pulp-platform/cheshire.git", rev: "picobello" } - snitch_cluster: { git: "https://github.com/pulp-platform/snitch_cluster.git", rev: "5d6c957c70e3824e2330d8308eb904724cd41cbe" } - floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: "develop" } + snitch_cluster: { git: "https://github.com/pulp-platform/snitch_cluster.git", rev: "main" } + floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: "reduction-vc-rebase" } obi: { git: "https://github.com/pulp-platform/obi.git", rev: "acfcd0f80c7539aa8da7821a66d9acf2074a5b4e" } redmule: { git: "https://github.com/pulp-platform/redmule.git", rev: "picobello" } hci: { git: "https://github.com/pulp-platform/hci.git", rev: "06fcba671e060f2e1b03b7ebe2d3e719f1557099" } @@ -36,7 +36,7 @@ sources: - target: pb_gen_rtl files: # Level 0.0 - - .generated/floo_picobello_noc_pkg.sv + - floo_picobello_noc_pkg_REDUCTION.sv - .generated/snitch_cluster_pkg.sv - .generated/pb_soc_regs_pkg.sv # Level 0.1 diff --git a/Makefile b/Makefile index 0419dce0..cd4839c8 100644 --- a/Makefile +++ b/Makefile @@ -20,20 +20,23 @@ SN_ROOT = $(shell $(BENDER) path 
snitch_cluster) FLOO_ROOT = $(shell $(BENDER) path floo_noc) # Executables -BENDER ?= bender -d $(PB_ROOT) +BENDER ?= bender --suppress W22 -d $(PB_ROOT) FLOO_GEN ?= floogen VERIBLE_FMT ?= verible-verilog-format VERIBLE_FMT_ARGS ?= --flagfile .verilog_format --inplace --verbose PEAKRDL ?= peakrdl # Tiles configuration -SN_CLUSTERS = $(shell $(FLOO_GEN) -c $(FLOO_CFG) --query endpoints.cluster.num 2>/dev/null) -L2_TILES = $(shell $(FLOO_GEN) -c $(FLOO_CFG) --query endpoints.l2_spm.num 2>/dev/null) +SN_CLUSTERS = $(shell $(FLOO_GEN) query -c $(FLOO_CFG) endpoints.cluster.num 2>/dev/null) +L2_TILES = $(shell $(FLOO_GEN) query -c $(FLOO_CFG) endpoints.l2_spm.num 2>/dev/null) # Bender prerequisites BENDER_YML = $(PB_ROOT)/Bender.yml BENDER_LOCK = $(PB_ROOT)/Bender.lock +$(PB_GEN_DIR): + mkdir -p $@ + ################ # Bender flags # ################ @@ -45,7 +48,7 @@ SIM_TARGS += -t simulation -t test -t idma_test # systemRDL # ############# -PB_RDL_ALL += $(PB_GEN_DIR)/picobello.rdl +PB_RDL_ALL += $(PB_GEN_DIR)/picobello_addrmap.rdl PB_RDL_ALL += $(PB_GEN_DIR)/fll.rdl $(PB_GEN_DIR)/pb_chip_regs.rdl PB_RDL_ALL += $(PB_GEN_DIR)/snitch_cluster.rdl PB_RDL_ALL += $(wildcard $(PB_ROOT)/cfg/rdl/*.rdl) @@ -58,14 +61,14 @@ $(PB_GEN_DIR)/pb_soc_regs.sv: $(PB_GEN_DIR)/pb_soc_regs_pkg.sv $(PB_GEN_DIR)/pb_soc_regs_pkg.sv: $(PB_ROOT)/cfg/rdl/pb_soc_regs.rdl $(PEAKRDL) regblock $< -o $(PB_GEN_DIR) --cpuif apb4-flat --default-reset arst_n -P Num_Clusters=$(SN_CLUSTERS) -P Num_Mem_Tiles=$(L2_TILES) -$(PB_GEN_DIR)/picobello.rdl: $(FLOO_CFG) - $(FLOO_GEN) -c $(FLOO_CFG) -o $(PB_GEN_DIR) --rdl --rdl-as-mem --rdl-memwidth=32 +$(PB_GEN_DIR)/picobello_addrmap.rdl: $(FLOO_CFG) + $(FLOO_GEN) rdl -c $(FLOO_CFG) -o $(PB_GEN_DIR) --as-mem --memwidth=32 # Those are dummy RDL files, for generation without access to the PD repository. 
-$(PB_GEN_DIR)/fll.rdl $(PB_GEN_DIR)/pb_chip_regs.rdl: +$(PB_GEN_DIR)/fll.rdl $(PB_GEN_DIR)/pb_chip_regs.rdl: | $(PB_GEN_DIR) @touch $@ -$(PB_GEN_DIR)/pb_addrmap.h: $(PB_GEN_DIR)/picobello.rdl $(PB_RDL_ALL) +$(PB_GEN_DIR)/pb_addrmap.h: $(PB_GEN_DIR)/picobello_addrmap.rdl $(PB_RDL_ALL) $(PEAKRDL) c-header $< $(PEAKRDL_INCLUDES) $(PEAKRDL_DEFINES) -o $@ -i -b ltoh $(PB_GEN_DIR)/pb_addrmap.svh: $(PB_RDL_ALL) @@ -130,12 +133,12 @@ endif floo-hw-all: $(PB_GEN_DIR)/floo_picobello_noc_pkg.sv $(PB_GEN_DIR)/floo_picobello_noc_pkg.sv: $(FLOO_CFG) | $(PB_GEN_DIR) - $(FLOO_GEN) -c $(FLOO_CFG) -o $(PB_GEN_DIR) --only-pkg $(FLOO_GEN_FLAGS) + $(FLOO_GEN) pkg -c $(FLOO_CFG) -o $(PB_GEN_DIR) $(FLOO_GEN_FLAGS) floo-clean: rm -f $(PB_GEN_DIR)/floo_picobello_noc_pkg.sv - rm -f $(PB_GEN_DIR)/picobello.rdl + rm -f $(PB_GEN_DIR)/picobello_addrmap.rdl ################### # Physical Design # @@ -172,16 +175,16 @@ clean-pd: PB_HW_ALL += $(CHS_HW_ALL) PB_HW_ALL += $(CHS_SIM_ALL) -PB_HW_ALL += $(PB_GEN_DIR)/floo_picobello_noc_pkg.sv PB_HW_ALL += $(PB_RDL_HW_ALL) -PB_HW_ALL += update-sn-cfg -.PHONY: picobello-hw-all picobello-clean clean +.PHONY: picobello-hw-all picobello-hw-clean clean + +picobello-hw-all all: $(PB_HW_ALL) update-sn-cfg sn-hw-all floo-hw-all -picobello-hw-all all: $(PB_HW_ALL) sn-hw-all - $(MAKE) $(PB_HW_ALL) +picobello-hw-clean: sn-hw-clean floo-clean + rm -rf $(PB_HW_ALL) -picobello-hw-clean clean: sn-hw-clean floo-clean +clean: picobello-hw-clean rm -rf $(BENDER_ROOT) ############ @@ -195,6 +198,7 @@ include $(PB_ROOT)/sw/sw.mk ############## TB_DUT = tb_picobello_top +SIM_DIR = $(PB_ROOT) include $(PB_ROOT)/target/sim/vsim/vsim.mk include $(PB_ROOT)/target/sim/traces.mk @@ -221,13 +225,14 @@ python-venv: .venv .venv: $(BASE_PYTHON) -m venv $@ . 
$@/bin/activate && \ - python -m pip install --upgrade pip setuptools && \ - python -m pip install --cache-dir $(PIP_CACHE_DIR) -r requirements.txt && \ + python -m pip install --upgrade pip "setuptools<81" && \ + python -m pip install --no-build-isolation --cache-dir $(PIP_CACHE_DIR) -r requirements.txt && \ python -m pip install --cache-dir $(PIP_CACHE_DIR) $(shell $(BENDER) path floo_noc) --no-deps && \ - python -m pip install --cache-dir $(PIP_CACHE_DIR) "$(shell $(BENDER) path snitch_cluster)[kernels]" + python -m pip install --cache-dir $(PIP_CACHE_DIR) -e "$(shell $(BENDER) path snitch_cluster)[kernels]" python-venv-clean: rm -rf .venv + rm -rf $(PIP_CACHE_DIR) verible-fmt: $(VERIBLE_FMT) $(VERIBLE_FMT_ARGS) $(shell $(BENDER) script flist $(SIM_TARGS) --no-deps) diff --git a/cfg/snitch_cluster.json b/cfg/snitch_cluster.json index 948bd2a8..7930bc7f 100644 --- a/cfg/snitch_cluster.json +++ b/cfg/snitch_cluster.json @@ -12,7 +12,6 @@ addr_width: 48, data_width: 64, atomic_id_width: 5, // clog2(#clusters + 2) - user_width: 53, // addr_width + atomic_id_width tcdm: { size: 128, banks: 32, @@ -26,13 +25,16 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - dma_user_width: 48, - enable_multicast: true, + enable_narrow_collectives: true, + enable_wide_collectives: true, + enable_dca: true, + dca_data_width: 512, cluster_base_expose: true, alias_region_enable: true, alias_region_base: 0x30000000, num_exposed_wide_tcdm_ports: 1, narrow_axi_port_expose: true, + collective_width: 4, enable_external_interrupts: true, vm_support: false, // Timing parameters diff --git a/experiments/.gitignore b/experiments/.gitignore new file mode 100644 index 00000000..15806eca --- /dev/null +++ b/experiments/.gitignore @@ -0,0 +1,6 @@ +build/ +runs/ +plots/ +results/ +data/ +hw/ \ No newline at end of file diff --git a/experiments/barrier/__init__.py b/experiments/barrier/__init__.py new file mode 100644 index 00000000..d01b5a8f --- /dev/null +++ 
b/experiments/barrier/__init__.py @@ -0,0 +1,9 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from . import plot, experiments, fit + +__all__ = ["plot", "experiments", "fit"] diff --git a/experiments/barrier/experiments.py b/experiments/barrier/experiments.py new file mode 100755 index 00000000..8775c6bf --- /dev/null +++ b/experiments/barrier/experiments.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import math +from pathlib import Path +import picobello as pb +import snitch.util.experiments.experiment_utils as eu +from snitch.util.experiments.SimResults import SimRegion + +TCK = 5 +DIR = Path(__file__).parent + + +class ExperimentManager(pb.ExperimentManager): + + def derive_axes(self, experiment): + return eu.derive_axes_from_keys(experiment, ['impl', 'n_clusters']) + + def derive_cdefines(self, experiment): + cdefs = { + 'IMPL': experiment['impl'].upper(), + 'N_ROWS': experiment['n_rows'], + 'N_COLS': experiment['n_cols'], + } + return cdefs + + +def gen_experiments(ci=False): + experiments = [] + # for impl in ['hw']: + # for n_clusters in [8]: + impls = ['sw', 'hw'] + n_clusters_list = [2, 4, 8, 16] + if ci: + impls = ['hw'] + n_clusters_list = [16] + for impl in impls: + for n_clusters in n_clusters_list: + experiments.append({ + 'app': 'barrier_benchmark', + 'cmd': pb.sim_cmd(), + 'impl': impl, + 'n_clusters': n_clusters, + 'n_rows': int(math.ceil(n_clusters / 4)), + 'n_cols': n_clusters % 4 if n_clusters % 4 != 0 else 4, + }) + return experiments + + +def dma_core(row_idx, col_idx): + cluster_idx = row_idx + col_idx * 4 + return 1 + cluster_idx * 9 + 8 + + +def get_total_cycles(sim_results): + roi = 
SimRegion(f'hart_{dma_core(0, 1)}', 'barrier', 1) + return sim_results.get_timespan(roi) // TCK + + +def results(manager=None): + if manager is None: + manager = ExperimentManager(gen_experiments(), dir=DIR, parse_args=False) + return manager.get_results() + + +def main(): + parser = ExperimentManager.parser() + parser.add_argument('--ci', action='store_true', + help='Reduce experiment space for CI runs') + args = parser.parse_args() + manager = ExperimentManager(gen_experiments(ci=args.ci), dir=DIR, args=args, parse_args=False) + manager.run() + df = results(manager) + print(df) + + +if __name__ == '__main__': + main() diff --git a/experiments/barrier/fit.py b/experiments/barrier/fit.py new file mode 100755 index 00000000..9a2f66c9 --- /dev/null +++ b/experiments/barrier/fit.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import matplotlib.pyplot as plt +import numpy as np +from scipy import stats +from barrier import experiments + +BEAT_BYTES = 64 + + +def fit(x, y, quiet=True): + # Perform linear regression + res = stats.linregress(x, y) + alpha = res.intercept # α + beta = res.slope # β + r_value = res.rvalue + stderr_beta = res.stderr + stderr_alpha = res.intercept_stderr + + if not quiet: + # Print results + print(f"\talpha (intercept): {alpha:.6g}") + print(f"\tbeta (slope) : {beta:.6g}") + print(f"\tR^2 : {r_value**2:.6g}") + print(f"\tSE(beta) : {stderr_beta:.6g}") + print(f"\tSE(alpha) : {stderr_alpha:.6g}") + + # Plot results + plt.figure() + plt.scatter(x, y, s=12) + xline = np.linspace(x.min(), x.max(), 200) + yline = alpha + beta * xline + plt.plot(xline, yline, linewidth=2) + plt.xlabel('size [B]') + plt.ylabel('cycles') + plt.title("Linear fit: cycles = alpha + beta * X") + plt.show() + + # Return results + return alpha, beta + + +def get_results(): + df = 
experiments.results() + df['cycles'] = df['results'].apply(experiments.get_total_cycles) + return df + + +def fit_hw(quiet=True): + if not quiet: + print("Hardware barrier:") + df = get_results() + df = df[df['impl'] == 'hw'] + x = df['n_clusters'].to_numpy() + y = df['cycles'].to_numpy() + return fit(x, y, quiet) + + +def fit_sw(quiet=True): + if not quiet: + print("Software barrier:") + df = get_results() + df = df[df['impl'] == 'sw'] + x = df['n_clusters'].to_numpy() + y = df['cycles'].to_numpy() + return fit(x, y, quiet) + + +def main(): + fit_hw(quiet=False) + fit_sw(quiet=False) + + +if __name__ == '__main__': + main() diff --git a/experiments/barrier/plot.py b/experiments/barrier/plot.py new file mode 100755 index 00000000..dfd8341c --- /dev/null +++ b/experiments/barrier/plot.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import argparse +import matplotlib.pyplot as plt +from barrier import experiments + + +def plot1(show=True): + """Plot runtime vs number of clusters.""" + + # Load results + df = experiments.results() + + # Get actual runtimes + df['cycles'] = df['results'].apply(experiments.get_total_cycles) + + # Plot + ax = df.pivot(index="n_clusters", columns="impl", values="cycles").plot(kind="bar", width=0.65) + ax.set_xlabel("Nr. 
clusters") + ax.set_ylabel("Runtime [cycles]") + ax.tick_params(axis='x', labelrotation=0) + ax.set_axisbelow(True) + ax.grid(axis="y", color="gainsboro") + ax.legend(handlelength=1, ncol=2, columnspacing=0.5, handletextpad=0.3) + plt.tight_layout() + if show: + plt.show() + + +def main(): + + # Parse arguments + functions = [plot1] + parser = argparse.ArgumentParser(description='Plot wide multicast results.') + parser.add_argument( + 'plots', + nargs='*', + default='all', + choices=[f.__name__ for f in functions] + ['all'], + help='Plots to generate.' + ) + args = parser.parse_args() + + # Helper function to check if a plot was requested on the command line + def requested(plot): + return plot in args.plots or 'all' in args.plots + + # Generate plots + if requested('plot1'): + plot1() + + +if __name__ == "__main__": + main() diff --git a/experiments/barrier/roi.json.tpl b/experiments/barrier/roi.json.tpl new file mode 100644 index 00000000..ea0dfcf1 --- /dev/null +++ b/experiments/barrier/roi.json.tpl @@ -0,0 +1,27 @@ +<% +if experiment['n_clusters'] == 2: + clusters = [0, 4] +elif experiment['n_clusters'] == 4: + clusters = [0, 4, 8, 12] +elif experiment['n_clusters'] == 8: + clusters = [0, 4, 8, 12, 1, 5, 9, 13] +elif experiment['n_clusters'] == 16: + clusters = list(range(16)) +%> + +[ +% for cluster in clusters: + { + "thread": "${f'hart_{cluster * 9 + 8 + 1}'}", + "roi": [ + % if experiment['impl'] == 'sw': + {"idx": 1, "label": "barrier"}, + {"idx": 3, "label": "barrier"}, + % elif experiment['impl'] == 'hw': + {"idx": 3, "label": "barrier"}, + {"idx": 5, "label": "barrier"}, + % endif + ] + }, +% endfor +] diff --git a/experiments/multicast/README.md b/experiments/multicast/README.md new file mode 100644 index 00000000..5e615395 --- /dev/null +++ b/experiments/multicast/README.md @@ -0,0 +1,32 @@ +All commands are assumed to be run from this folder. 
+ +To run the experiments: +```shell +./experiments.py --actions sw run visual-trace -j +``` + +To manually verify a simulation (replace `<run_dir>` with the run directory of that simulation): +```shell +./verify.py placeholder <run_dir>/multicast_benchmark.elf --no-ipc --memdump <run_dir>/l2mem.bin --memaddr 0x70000000 +``` + +To manually generate the traces for a simulation: +```shell +make -C ../../ annotate -j DEBUG=ON SIM_DIR=<run_dir> +``` + +To manually build the visual trace for a simulation: +```shell +make -C ../../ sn-visual-trace -j DEBUG=ON SIM_DIR=<run_dir> SN_ROI_SPEC=<run_dir>/roi_spec.json +``` + +To fit the multicast runtime models to the data: +```shell +./fit.py +``` +Verify that `beta=1`, then plug the fitted `alpha` values into `model.py`, which contains the runtime models for the hardware and software multicast. + +To plot the results (uses `model.py` behind the scenes): +```shell +./plot.py +``` diff --git a/experiments/multicast/__init__.py b/experiments/multicast/__init__.py new file mode 100644 index 00000000..73945a38 --- /dev/null +++ b/experiments/multicast/__init__.py @@ -0,0 +1,9 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from . import plot, experiments, fit, model + +__all__ = ["plot", "experiments", "fit", "model"] diff --git a/experiments/multicast/experiments.py b/experiments/multicast/experiments.py new file mode 100755 index 00000000..9611cfd5 --- /dev/null +++ b/experiments/multicast/experiments.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import math +from pathlib import Path +import picobello as pb +import snitch.util.experiments.experiment_utils as eu +from snitch.util.experiments.SimResults import SimRegion + +N_CLUSTERS = 4 +N_CLUSTERS_PER_ROW = 4 +WIDE_DATA_WIDTH = 64 # bytes +TCK = 5 +DIR = Path(__file__).parent + + +############### +# Experiments # +############### + +class ExperimentManager(pb.ExperimentManager): + + def derive_axes(self, experiment): + keys = ['impl', 'n_rows', 'size'] + if experiment['impl'] == 'seq': + keys.append('batch') + return eu.derive_axes_from_keys(experiment, keys) + + def derive_cdefines(self, experiment): + cdefs = { + 'IMPL': experiment['impl'].upper(), + 'LOG2_N_ROWS': int(math.log2(experiment['n_rows'])), + 'SIZE': experiment['size'], + 'BATCH': experiment['batch'], + } + return cdefs + + +def gen_experiments(ci=False): + experiments = [] + impls = ['seq', 'tree', 'hw'] + n_rows_list = [1, 2, 4] + sizes = [1024, 2048, 4096, 8192, 16384, 32768] + n_batches_list = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] + if ci: + impls = ['hw'] + n_rows_list = [4] + sizes = [32768] + n_batches_list = [1024, 2048] + for impl in impls: + # for n_rows in [1]: + for n_rows in n_rows_list: + # for size in [1024]: + for size in sizes: + # for n_batches in [1]: + for n_batches in n_batches_list: + + # Only sequential implementation supports batching, for all other + # implementations we only accept n_batches = 1 + batch_size = int(size // n_batches) + batch_beats = int(batch_size // WIDE_DATA_WIDTH) + if impl == 'seq': + valid_n_batches = batch_beats > 8 and batch_beats < 1024 + else: + valid_n_batches = n_batches == 1 if not ci else n_batches in n_batches_list + + # If the current setting for n_batches is valid, add the experiment + if valid_n_batches: + experiments.append({ + 'impl': impl, + 'n_rows': n_rows, + 'size': size, + 'batch': batch_size, + 'app': 'multicast_benchmark', + 'cmd': 
pb.sim_and_verify_cmd(Path.cwd() / 'verify.py'), + 'roi': Path.cwd() / f'roi/{impl}.json.tpl', + }) + return experiments + + +########### +# Results # +########### + +def dma_core(cluster_idx): + return f'hart_{1 + cluster_idx * 9 + 8}' + + +def tree_cycles(sim_results, n_rows): + n_levels = int(math.log2(N_CLUSTERS_PER_ROW * n_rows) + 1) + start = SimRegion(dma_core(0 * N_CLUSTERS_PER_ROW), 'level 0', 0) + end = SimRegion(dma_core(2 * N_CLUSTERS_PER_ROW), f'level {n_levels-1}', 0) + return sim_results.get_timespan(start, end) // TCK + + +# Cluster to cluster (C2C) transfer cycles +def tree_c2c_cycles(sim_results): + roi = SimRegion(dma_core(0), 'level 1', 0) + return sim_results.get_timespan(roi) // TCK + + +# Memory to cluster (M2C) transfer cycles +def tree_m2c_cycles(sim_results): + roi = SimRegion(dma_core(0), 'level 0', 0) + return sim_results.get_timespan(roi) // TCK + + +def seq_cycles(sim_results, n_batches, n_rows): + start = SimRegion(dma_core(0), 'batch 0', 1) + end = SimRegion(dma_core(12 + n_rows - 1), f'batch {n_batches-1}', 1) + return sim_results.get_timespan(start, end) // TCK + + +def seq_batch_cycles(sim_results, batch_idx, cluster_idx=0): + start = SimRegion(dma_core(cluster_idx), f'batch {batch_idx}', 1) + return sim_results.get_timespan(start) // TCK + + +def hw_cycles(sim_results): + roi = SimRegion(dma_core(0), 'transfer', 1) + return sim_results.get_timespan(roi) // TCK + + +def results(manager=None): + if manager is None: + manager = ExperimentManager(gen_experiments(), dir=DIR, parse_args=False) + return manager.get_results() + + +######## +# Main # +######## + +def main(): + + parser = ExperimentManager.parser() + parser.add_argument('--ci', action='store_true', + help='Reduce experiment space for CI runs') + args = parser.parse_args() + manager = ExperimentManager(gen_experiments(ci=args.ci), dir=DIR, args=args, parse_args=False) + manager.run() + df = results(manager) + print(df) + + +if __name__ == '__main__': + main() diff --git 
a/experiments/multicast/fit.py b/experiments/multicast/fit.py new file mode 100755 index 00000000..6a1a64dc --- /dev/null +++ b/experiments/multicast/fit.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from scipy import stats +from multicast import experiments + +pd.options.mode.copy_on_write = True +BEAT_BYTES = 64 + + +def fit(x, y, quiet=True): + # Perform linear regression + res = stats.linregress(x, y) + alpha = res.intercept # α + beta = res.slope # β + r_value = res.rvalue + stderr_beta = res.stderr + stderr_alpha = res.intercept_stderr + + if not quiet: + # Print results + print(f"\talpha (intercept): {alpha:.6g}") + print(f"\tbeta (slope) : {beta:.6g}") + print(f"\tR^2 : {r_value**2:.6g}") + print(f"\tSE(beta) : {stderr_beta:.6g}") + print(f"\tSE(alpha) : {stderr_alpha:.6g}") + + # Plot results + plt.figure() + plt.scatter(x, y, s=12) + xline = np.linspace(x.min(), x.max(), 200) + yline = alpha + beta * xline + plt.plot(xline, yline, linewidth=2) + plt.xlabel('size [B]') + plt.ylabel('cycles') + plt.title("Linear fit: cycles = alpha + beta * X") + plt.show() + + # Return results + return alpha, beta + + +def fit_hw(df, quiet=True): + # Retrieve only single-row experiments (fitted model should generalize to multi-row) + df = df[df['n_rows'] == 1] + df['cycles'] = df['results'].apply(experiments.hw_cycles) + + # Fit data + x = df['size'].to_numpy() / BEAT_BYTES + y = df['cycles'].to_numpy() + print("Fit transfer time for HW multicast:") + fit(x, y, quiet=quiet) + + +def fit_seq(df, quiet=True): + # Retrieve 1-batch experiments + df['n_batches'] = df['size'] // df['batch'] + df_no = df[df['n_batches'] == 1] + + # Get cycles for non-overlapped batch (batch 0, cluster 0) + df_no['no_batch_cycles'] = 
df_no.apply( + lambda row: experiments.seq_batch_cycles( + row['results'], batch_idx=0, cluster_idx=0), + axis=1 + ) + + # Fit non-overlapped batch data + x = df_no['batch'].to_numpy() / BEAT_BYTES + y = df_no['no_batch_cycles'].to_numpy() + print("Fit non-overlapped batch time for sequential multicast:") + fit(x, y, quiet=quiet) + + # Retrieve 2-batch experiments + df_o = df[df['n_batches'] == 2] + + # Get cycles for overlapped batch (batch 1, cluster 0) + df_o['o_batch_cycles'] = df_o.apply( + lambda row: experiments.seq_batch_cycles( + row['results'], batch_idx=1, cluster_idx=0), + axis=1 + ) + + # Fit overlapped batch data + x = df_o['batch'].to_numpy() / BEAT_BYTES + y = df_o['o_batch_cycles'].to_numpy() + print("Fit overlapped batch time for sequential multicast:") + fit(x, y, quiet=quiet) + + +def fit_tree(df, quiet=True): + # Get cycles of cluster to cluster and memory to cluster transfers + df['c2c_cycles'] = df['results'].apply(experiments.tree_c2c_cycles) + df['m2c_cycles'] = df['results'].apply(experiments.tree_m2c_cycles) + + # Fit cluster to cluster data + x = df['size'].to_numpy() / BEAT_BYTES + y = df['c2c_cycles'].to_numpy() + print("Fit cluster-to-cluster transfer time for tree multicast:") + fit(x, y, quiet=quiet) + + # Fit memory to cluster data + x = df['size'].to_numpy() / BEAT_BYTES + y = df['m2c_cycles'].to_numpy() + print("Fit memory-to-cluster transfer time for tree multicast:") + fit(x, y, quiet=quiet) + + +def main(): + df = experiments.results() + + seq = df[df['impl'] == 'seq'] + fit_seq(seq, quiet=False) + + tree = df[df['impl'] == 'tree'] + fit_tree(tree, quiet=False) + + hw = df[df['impl'] == 'hw'] + fit_hw(hw, quiet=False) + + +if __name__ == '__main__': + main() diff --git a/experiments/multicast/model.py b/experiments/multicast/model.py new file mode 100755 index 00000000..b67ddbaf --- /dev/null +++ b/experiments/multicast/model.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. 
+# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from math import isqrt, sqrt, log2, ceil + +# N: num clusters +# L: num beats in transfer +# B: num beats in batch +# delta: num synchronization overhead cycles + +BEAT_BYTES = 64 +SEQ_ALPHA = 30 +DELTA = 36 +HW_ALPHA = 30 +TREE_M2C_ALPHA = 30 +TREE_C2C_ALPHA = 17 + + +def nearest_divisors(divisor, dividend): + from bisect import bisect_left + + # 1) Enumerate all positive divisors of dividend + divs = set() + r = int(isqrt(int(dividend))) + for d in range(1, r + 1): + if dividend % d == 0: # d divides dividend + divs.add(d) # add the small factor + divs.add(dividend // d) # add the paired large factor + divs = sorted(divs) # sort them ascending + + # 2) Binary search to locate where divisor would be inserted + i = bisect_left(divs, divisor) # first index with divs[i] >= divisor + + # 3) Pick neighbors around divisor + lower = divs[i - 1] if i > 0 and divs[i - 1] <= divisor else None + upper = divs[i] if i < len(divs) else None + return lower, upper + + +# N1: num cols, N2: num rows +def seq_runtime(N1, N2, L, B, delta=DELTA, alpha=SEQ_ALPHA, alpha1=SEQ_ALPHA): + n_batches = L // B + n_iters_row = n_batches - 1 + (N1 - 1) + n_iters_col = n_batches - 1 + (N2 - 1) if N2 > 1 else 0 + n_iters = n_iters_row + n_iters_col + return (alpha1 + B) + n_iters * (alpha + B + delta) + + +def optimal_batch_size(N1, N2, L, delta=DELTA, alpha=SEQ_ALPHA): + if N2 > 1: + real_B = sqrt(L * (alpha + delta) / (N1 + N2 - 1)) + else: + real_B = sqrt(L * (alpha + delta) / (N1 - 1)) + lower_B, upper_B = nearest_divisors(real_B, L) + assert (lower_B is None) or (lower_B > 0) + assert (upper_B is None) or (upper_B <= L) + if lower_B is None: + return upper_B + elif upper_B is None: + return lower_B + else: + T_lower_B = seq_runtime(N1, N2, L, lower_B, delta, alpha) + T_upper_B = seq_runtime(N1, N2, L, upper_B, delta, alpha) + if T_lower_B < T_upper_B: + return 
lower_B + else: + return upper_B + + +def optimal_seq_runtime(N1, N2, L, delta=DELTA, alpha=SEQ_ALPHA, alpha1=SEQ_ALPHA): + B_opt = optimal_batch_size(N1, N2, L, delta, alpha) + return seq_runtime(N1, N2, L, B_opt, delta, alpha, alpha1) + + +def hw_runtime(N1, N2, L): + if N2 > 1: + return HW_ALPHA + L + N1 - 1 + N2 - 1 + else: + return HW_ALPHA + L + N1 - 1 + + +def tree_runtime(N1, N2, L, delta=DELTA): + m2c_transfer_cycles = TREE_M2C_ALPHA + L + + # C2C alpha depends on the distance between the clusters. + # On our system we have two cycles latency per hop. + c2c_transfer_cycles = 0 + for i in range(ceil(log2(N1))): + dist = N1 / (2 ** (i + 1)) + c2c_transfer_cycles += TREE_C2C_ALPHA + 2 * dist + L + for i in range(ceil(log2(N2))): + dist = N2 / (2 ** (i + 1)) + c2c_transfer_cycles += TREE_C2C_ALPHA + 2 * dist + L + + delta_cycles = delta * (log2(N1 * N2) - 1) + return m2c_transfer_cycles + c2c_transfer_cycles + delta_cycles + + +def optimal_sw_runtime(N1, N2, L, delta=DELTA, alpha=SEQ_ALPHA, alpha1=SEQ_ALPHA): + T_seq = optimal_seq_runtime(N1, N2, L, delta, alpha, alpha1) + T_tree = tree_runtime(N1, N2, L, delta) + return min(T_seq, T_tree) diff --git a/experiments/multicast/plot.py b/experiments/multicast/plot.py new file mode 100755 index 00000000..d591a6b2 --- /dev/null +++ b/experiments/multicast/plot.py @@ -0,0 +1,442 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
def get_expected_cycles(row):
    """Return the model-predicted runtime, in whole cycles, for one experiment.

    Args:
        row: mapping (e.g. a DataFrame row) with keys 'impl', 'n_rows' and
            'size', plus 'batch' when 'impl' is 'seq'.

    Returns:
        The runtime predicted by `model` for the row, truncated to int.

    Raises:
        ValueError: if row['impl'] is not one of 'seq', 'tree' or 'hw'.
            Previously this case surfaced as an UnboundLocalError.
    """
    impl = row['impl']
    # The literal 4 is the model's first extent (presumably the number of
    # columns) and 64 converts bytes to beats -- assumes it matches
    # model.BEAT_BYTES; TODO confirm.
    if impl == 'seq':
        cycles = model.seq_runtime(4, row['n_rows'], row['size'] // 64, row['batch'] // 64)
    elif impl == 'tree':
        cycles = model.tree_runtime(4, row['n_rows'], row['size'] // 64)
    elif impl == 'hw':
        cycles = model.hw_runtime(4, row['n_rows'], row['size'] // 64)
    else:
        raise ValueError(f"unknown implementation: {impl!r}")
    return int(cycles)
def monotone_seq_runtime_curve(xmin, xmax, n_rows):
    """Return the optimal sequential-runtime curve reduced to its monotone lower fit."""
    x, y = seq_runtime_curve(xmin, xmax, n_rows)
    return find_monotone_lower_fit(x, y)


def find_monotone_lower_fit(x, y):
    """Find the monotone non-decreasing lower fit of an oscillating curve.

    A sample is kept when no later sample has a strictly smaller y value.

    Args:
        x: sequence of abscissae, at least as long as `y`.
        y: sequence of ordinates.

    Returns:
        A `(x_monotone, y_monotone)` pair of lists holding the kept samples in
        their original order.
    """
    kept = []
    suffix_min = None  # smallest y seen so far, scanning right to left
    for i in range(len(y) - 1, -1, -1):
        # Keep the sample unless something to its right is strictly smaller.
        if suffix_min is None or not suffix_min < y[i]:
            kept.append(i)
        # `suffix_min != suffix_min` replaces a NaN minimum, so NaN samples
        # never hide smaller values (same outcome as a pairwise comparison).
        if suffix_min is None or suffix_min != suffix_min or y[i] < suffix_min:
            suffix_min = y[i]
    kept.reverse()
    return [x[i] for i in kept], [y[i] for i in kept]


def tree_runtime_curve(xmin, xmax, n_rows):
    """Sample the tree runtime model every 64 bytes over [xmin, xmax]."""
    x = np.arange(xmin, xmax + 64, 64)
    y = [model.tree_runtime(4, n_rows, e // 64) for e in x]
    return x, y


def hw_runtime_curve(xmin, xmax, n_rows):
    """Sample the hardware runtime model every 64 bytes over [xmin, xmax].

    The upper bound is inclusive, like the sequential and tree curves;
    previously the sample at `xmax` was dropped.
    """
    x = np.arange(xmin, xmax + 64, 64)
    y = [model.hw_runtime(4, n_rows, e // 64) for e in x]
    return x, y


def sw_runtime_curve(xmin, xmax, n_rows):
    """Return the monotone lower fit of the best software runtime.

    At every size the smaller of the optimal sequential and tree model
    runtimes is taken, then the result is reduced to its monotone lower fit.
    """
    x, y_seq = seq_runtime_curve(xmin, xmax, n_rows)
    _, y_tree = tree_runtime_curve(xmin, xmax, n_rows)
    y_best = [min(a, b) for a, b in zip(y_seq, y_tree)]
    return find_monotone_lower_fit(x, y_best)
def plot2(ax=None, y_label=None, hide_x_axis=False, show=True):
    """Plot actual runtimes against the model curves for the single-row case.

    Args:
        ax: matplotlib axes to draw on; a new figure is created when None.
        y_label: y-axis label; defaults to 'Runtime [cycles]' when None.
        hide_x_axis: when True, x tick marks and labels are hidden and no
            x-axis label is set.
        show: when True, `plt.show()` is called before returning.

    Returns:
        The results dataframe with the added 'cycles' and 'exp_cycles' columns
        and without the 'results' column.
    """

    # Load results
    df = experiments.results()

    # Get expected and (best) actual runtimes
    df['cycles'] = df.apply(get_actual_cycles, axis=1)
    df = select_best_sequential_configuration(df)
    df['exp_cycles'] = df.apply(get_expected_cycles, axis=1)

    # Drop results column for cleaner output
    df = df.drop(columns=['results'])

    # Choose consistent colors and markers for the plots
    colors = {"seq": "C0", "tree": "C1", "hw": "C2"}
    markers = {"expected": "o", "actual": "x"}

    # Plot measured runtimes
    if ax is None:
        _, ax = plt.subplots()

    # NOTE(review): despite its name this helper plots runtime versus transfer
    # size, not versus speedup. `y_label` is bound as a default argument so the
    # helper can rebind it locally without touching the outer variable.
    def plot_runtime_vs_speedup(ax, n_rows, y_label=y_label):
        res = df[df['n_rows'] == n_rows]
        sizes = res['size'].unique()
        # Each scatter pairs `sizes` with one implementation's cycles, which
        # assumes every implementation has exactly one row per size, in the
        # same order -- TODO confirm against select_best_sequential_configuration.
        # All three series share the legend label 'Actual'.
        ax.scatter(
            sizes, res[res['impl'] == 'seq']['cycles'], label='Actual',
            marker=markers['actual'], color=colors['seq']
        )
        ax.scatter(
            sizes, res[res['impl'] == 'tree']['cycles'], label='Actual',
            marker=markers['actual'], color=colors['tree']
        )
        ax.scatter(
            sizes, res[res['impl'] == 'hw']['cycles'], label='Actual',
            marker=markers['actual'], color=colors['hw']
        )

        # # Plot expected runtimes
        # ax.scatter(
        #     sizes, res[res['impl'] == 'seq']['exp_cycles'], label='Expected (seq)',
        #     marker=markers['expected'], color=colors['seq']
        # )
        # ax.scatter(
        #     sizes, res[res['impl'] == 'tree']['exp_cycles'], label='Expected (tree)',
        #     marker=markers['expected'], color=colors['tree']
        # )
        # ax.scatter(
        #     sizes, res[res['impl'] == 'hw']['exp_cycles'], label='Expected (hw)',
        #     marker=markers['expected'], color=colors['hw']
        # )

        # Plot model line for sequential runtime. Find monotone non-decreasing lower fit.
        x, y = monotone_seq_runtime_curve(sizes.min(), sizes.max(), n_rows)
        ax.plot(x, y, label='Model (seq)', linestyle='--', color=colors['seq'])

        # Plot model line for tree runtime
        x, y = tree_runtime_curve(sizes.min(), sizes.max(), n_rows)
        ax.plot(x, y, label='Model (tree)', linestyle='--', color=colors['tree'])

        # Plot model line for hardware runtime
        x, y = hw_runtime_curve(sizes.min(), sizes.max(), n_rows)
        ax.plot(x, y, label='Model (hw)', linestyle='--', color=colors['hw'])

        if hide_x_axis:
            ax.set_xlabel('')
            ax.set_xticks(sizes)
            ax.tick_params(
                axis='x',
                which='both',
                bottom=False, top=False,  # hide tick marks
                labelbottom=False  # hide labels
            )
        else:
            # The 2048 tick label is blanked, presumably to avoid overlapping
            # neighbouring labels -- TODO confirm.
            ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes])
            ax.set_xlabel('Size [B]')
        # Leave 8% of headroom to the right of the largest size.
        ax.set_xlim(0, sizes.max() * 1.08)
        if y_label is None:
            y_label = 'Runtime [cycles]'
        ax.set_ylabel(y_label)
        ax.set_axisbelow(True)
        ax.grid(True, linestyle='-', color='gainsboro')
        ax.legend(ncol=2, handlelength=1.2, columnspacing=0.5, handletextpad=0.3)

    # Only the single-row configuration is drawn.
    plot_runtime_vs_speedup(ax, n_rows=1)
    # plot_runtime_vs_speedup(ax[0][0], n_rows=1)
    # plot_runtime_vs_speedup(ax[0][1], n_rows=2)
    # plot_runtime_vs_speedup(ax[1][0], n_rows=4)
    plt.tight_layout()
    if show:
        plt.show()

    return df
def plot4(ax=None, y_label=None, show=True):
    """Plot actual and expected runtimes, for multiple number of rows.

    For every row count in the results, the best measured software runtime
    (minimum of 'seq' and 'tree') and the measured hardware runtime are drawn
    as scatter points, together with the matching model curves.

    Args:
        ax: matplotlib axes to draw on; a new figure is created when None.
        y_label: y-axis label; defaults to 'Runtime [cycles]' when None.
        show: when True, `plt.show()` is called before returning.

    Returns:
        The results dataframe with the added 'cycles' and 'exp_cycles' columns
        and without the 'results' column.
    """

    # Load results
    df = experiments.results()

    # Get expected and (best) actual runtimes
    df['cycles'] = df.apply(get_actual_cycles, axis=1)
    df = select_best_sequential_configuration(df)
    df['exp_cycles'] = df.apply(get_expected_cycles, axis=1)

    # Drop results column for cleaner output
    df = df.drop(columns=['results'])

    # Choose consistent colors for the plots
    colors = {"sw": "C0", "hw": "C2"}

    # Plot measured runtimes
    if ax is None:
        _, ax = plt.subplots()
    sizes = df['size'].unique()

    # NOTE(review): despite its name this helper plots runtime versus transfer
    # size, not versus speedup.
    def plot_runtime_vs_speedup(ax, n_rows, show_label=False):
        res = df[df['n_rows'] == n_rows]
        hw_res = res[res['impl'] == 'hw']
        best_sw_cycles = res[res['impl'].isin(['seq', 'tree'])].groupby('size')['cycles'].min()
        ax.scatter(
            sizes, best_sw_cycles, label='Actual' if show_label else None,
            marker='x', color=colors['sw']
        )
        ax.scatter(
            sizes, hw_res['cycles'], label='Actual' if show_label else None,
            marker='x', color=colors['hw']
        )

        # Plot model line for best software implementation
        x, y = sw_runtime_curve(sizes.min(), sizes.max(), n_rows)
        ax.plot(
            x, y, label='Model (sw)' if show_label else None,
            linestyle='--', color=colors['sw'])

        # Annotate sw model lines with number of rows, in correspondence with actual runtime
        ax.annotate(
            f'r={n_rows}',
            xy=(x[-1], best_sw_cycles[sizes.max()]),
            xytext=(sizes.max() * 0.0001, 0), textcoords='offset points',  # nudge right
            ha='left', va='center', clip_on=False)

        # Plot model line for hardware runtime
        x, y = hw_runtime_curve(sizes.min(), sizes.max(), n_rows)
        ax.plot(
            x, y, label='Model (hw)' if show_label else None, linestyle='--', color=colors['hw'])
        if n_rows == 1:
            # Extract the scalar explicitly: passing the one-element Series as
            # a coordinate relies on the deprecated implicit float(Series).
            hw_cycles_at_max = hw_res[hw_res['size'] == sizes.max()]['cycles'].iloc[0]
            ax.annotate(
                'r=1,2,4',
                xy=(x[-1], hw_cycles_at_max),
                xytext=(0, 5), textcoords='offset points',  # nudge up
                ha='center', va='center', clip_on=False)

    for i, n_rows in enumerate(df['n_rows'].unique()):
        plot_runtime_vs_speedup(ax, n_rows=n_rows, show_label=(i == 0))

    # Configure plot (`sizes` is unchanged since it was computed above)
    ax.set_xlim(0, sizes.max() * 1.08)
    ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes])
    ax.set_xlabel('Size [B]')
    if y_label is None:
        y_label = 'Runtime [cycles]'
    ax.set_ylabel(y_label)
    ax.set_axisbelow(True)
    ax.grid(True, linestyle='-', color='gainsboro')
    ax.legend(ncol=2, handlelength=1.2, columnspacing=0.5, handletextpad=0.3)

    plt.tight_layout()
    if show:
        plt.show()

    return df
batch * 2 + 1}, "label": "${f'batch {batch}'}"}, + // {"idx": ${1 + batch * 2 + 2}, "label": "${f'sync {batch}'}"}, + % endfor + {"idx": ${1 + num_batches * 2 + 1}, "label": "barrier"}, + // Second iteration + {"idx": ${3 + num_batches * 2}, "label": "init"}, + % for batch in range(num_batches): + {"idx": ${3 + num_batches * 2 + batch * 2 + 1}, "label": "${f'batch {batch}'}"}, + // {"idx": ${3 + num_batches * 2 + batch * 2 + 2}, "label": "${f'sync {batch}'}"}, + % endfor + ] + }, + % endfor +% endfor +] diff --git a/experiments/multicast/roi/tree.json.tpl b/experiments/multicast/roi/tree.json.tpl new file mode 100644 index 00000000..7095181c --- /dev/null +++ b/experiments/multicast/roi/tree.json.tpl @@ -0,0 +1,162 @@ +% if experiment['n_rows'] == 1: +[ + { + "thread": "${f'hart_{0 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from memory tile to cluster 0 + {"idx": 10, "label": "level 0"}, + + // Transfer from cluster 0 to cluster 8 + {"idx": 12, "label": "level 1"}, + + // Transfer from cluster 0 to cluster 4 + {"idx": 14, "label": "level 2"}, + + ] + }, + + { + "thread": "${f'hart_{8 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 8 to cluster 12 + {"idx": 6, "label": "level 2"} + ] + }, +] +% elif experiment['n_rows'] == 2: +[ + { + "thread": "${f'hart_{0 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from memory tile to cluster 0 + {"idx": 12, "label": "level 0"}, + + // Transfer from cluster 0 to cluster 8 + {"idx": 14, "label": "level 1"}, + + // Transfer from cluster 0 to cluster 4 + {"idx": 16, "label": "level 2"}, + + // Transfer from cluster 0 to cluster 1 + {"idx": 18, "label": "level 3"}, + ] + }, + + { + "thread": "${f'hart_{8 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 8 to cluster 12 + {"idx": 8, "label": "level 2"}, + + // Transfer from cluster 8 to cluster 9 + {"idx": 10, "label": "level 3"} + ] + }, + + { + "thread": "${f'hart_{4 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 4 to cluster 5 + {"idx": 6, "label": "level 3"} + ] + 
}, + + { + "thread": "${f'hart_{12 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 12 to cluster 13 + {"idx": 6, "label": "level 3"} + ] + }, +] +% elif experiment['n_rows'] == 4: +[ + { + "thread": "${f'hart_{0 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from memory tile to cluster 0 + {"idx": 14, "label": "level 0"}, + + // Transfer from cluster 0 to cluster 8 + {"idx": 16, "label": "level 1"}, + + // Transfer from cluster 0 to cluster 4 + {"idx": 18, "label": "level 2"}, + + // Transfer from cluster 0 to cluster 2 + {"idx": 20, "label": "level 3"}, + + // Transfer from cluster 0 to cluster 1 + {"idx": 22, "label": "level 4"}, + ] + }, + + { + "thread": "${f'hart_{8 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 8 to cluster 12 + {"idx": 10, "label": "level 2"}, + + // Transfer from cluster 8 to cluster 10 + {"idx": 12, "label": "level 3"}, + + // Transfer from cluster 8 to cluster 9 + {"idx": 14, "label": "level 4"} + ] + }, + + { + "thread": "${f'hart_{4 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 4 to cluster 6 + {"idx": 8, "label": "level 3"}, + + // Transfer from cluster 4 to cluster 5 + {"idx": 10, "label": "level 4"}, + ] + }, + + { + "thread": "${f'hart_{12 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 12 to cluster 14 + {"idx": 8, "label": "level 3"}, + + // Transfer from cluster 12 to cluster 13 + {"idx": 10, "label": "level 4"}, + ] + }, + + { + "thread": "${f'hart_{2 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 2 to cluster 3 + {"idx": 6, "label": "level 4"}, + ] + }, + + { + "thread": "${f'hart_{6 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 6 to cluster 7 + {"idx": 6, "label": "level 4"} + ] + }, + + { + "thread": "${f'hart_{10 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 10 to cluster 11 + {"idx": 6, "label": "level 4"}, + ] + }, + + { + "thread": "${f'hart_{14 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 14 to cluster 15 + {"idx": 6, "label": "level 4"}, + ] + }, +] +% endif 
class Verifier(vu.Verifier):
    """Checks the 'output' symbol of `multicast_benchmark.c` against a ramp.

    Construction is inherited unchanged from `vu.Verifier`.
    """

    OUTPUT_UIDS = ['output']

    def get_actual_results(self):
        """Read the first output symbol as uint32_t values."""
        symbol = self.OUTPUT_UIDS[0]
        return self.get_output_from_symbol(symbol, 'uint32_t')

    def get_expected_results(self):
        """Build the expected output: the ramp 1..length // n_clusters, repeated n_clusters times."""
        length = int(self.get_input_from_symbol('length', 'uint32_t')[0])
        n_clusters = int(self.get_input_from_symbol('n_clusters', 'uint32_t')[0])
        ramp = list(range(1, length // n_clusters + 1))
        return ramp * n_clusters

    def check_results(self, *args):
        """Compare results exactly (absolute tolerance of zero)."""
        return super().check_results(*args, atol=0)
def hw_callback(work_dir=None, hw_cfg=None, dry_run=False,
                sync=False, **kwargs):
    """Install a NoC package configuration and compile the simulation model.

    Args:
        work_dir: path object of the simulation work directory; created if
            missing and passed to make as VSIM_DIR.
        hw_cfg: path of the package file copied over the generated
            `floo_picobello_noc_pkg.sv`.
        dry_run: forwarded to `common.make`.
        sync: forwarded to `common.make`.
        **kwargs: accepted and ignored.

    Returns:
        Whatever `common.make` returns for the 'vsim-compile' target.
    """
    # NOTE(review): the copy and the mkdir below happen even when dry_run is
    # True -- confirm that modifying the tree during a dry run is intended.
    generated_pkg = root / '.generated/floo_picobello_noc_pkg.sv'
    shutil.copy2(hw_cfg, generated_pkg)
    work_dir.mkdir(parents=True, exist_ok=True)

    make_vars = {'VSIM_DIR': work_dir}
    return common.make(
        'vsim-compile', dir=root, vars=make_vars, flags=['-j'],
        dry_run=dry_run, sync=sync,
    )
def gen_experiments(ci=False):
    """Enumerate the reduction experiments to run.

    Args:
        ci: when True, restrict the sweep to a reduced space for CI runs.

    Returns:
        A list of experiment dicts with the keys 'impl', 'n_rows', 'size',
        'batch', 'app', 'roi' and 'cmd'.
    """
    experiments = []
    impls = ['seq', 'tree', 'hw']
    n_rows_list = [1, 2, 4]
    sizes = [1024, 2048, 4096, 8192, 16384, 32768]
    n_batches_list = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
    if ci:
        impls = ['hw']
        n_rows_list = [4]
        sizes = [32768]
        n_batches_list = [1024, 2048]

    # Resolve the ROI template and the verification script relative to this
    # module (like the `dir=DIR` passed to the ExperimentManager) rather than
    # the current working directory, so experiment generation does not depend
    # on where the process was started.
    verify_script = DIR / 'verify.py'

    for impl in impls:
        for n_rows in n_rows_list:
            for size in sizes:
                for n_batches in n_batches_list:

                    # Only sequential and tree implementations support batching, for all
                    # other implementations we only accept n_batches = 1
                    batch_size = int(size // n_batches)
                    batch_beats = int(batch_size // BEAT_BYTES)
                    if impl in ['seq', 'tree']:
                        valid_n_batches = batch_beats > 8 and batch_beats < 256
                    else:
                        # NOTE(review): in CI mode every n_batches value is accepted, so
                        # one 'hw' experiment is generated per entry of n_batches_list;
                        # these differ only in 'batch', which is not an axis for 'hw'.
                        # Confirm the duplicates are intended.
                        valid_n_batches = ci or n_batches == 1

                    # If the current setting for n_batches is valid, add the experiment
                    if valid_n_batches:
                        experiment = {
                            'impl': impl,
                            'n_rows': n_rows,
                            'size': size,
                            'batch': batch_size,
                            'app': 'reduction_benchmark',
                            'roi': DIR / f'roi/{impl}.json.tpl',
                        }
                        experiment['cmd'] = pb.sim_and_verify_cmd(verify_script)
                        experiments.append(experiment)
    return experiments
def seq_cycles(sim_results, c, r, n_batches):
    """Return the end-to-end cycle count of a sequential reduction run.

    The span runs from cluster 12's DMA region 'd_0' to the last compute
    region on core 0 of cluster 0. That region is labelled 'c_<last>' when
    `r == 1` and 'c2_<last>' otherwise.

    Args:
        sim_results: object providing `get_timespan(start, end)`.
        c: unused here.
        r: number of rows; selects the compute region label.
        n_batches: number of batches; the last one has index `n_batches - 1`.

    Returns:
        The timespan divided (floor) by the clock period `TCK`.
    """
    stage = "" if r == 1 else 2
    last_compute_label = f'c{stage}_{n_batches-1}'

    first_dma = SimRegion(dma_core(12), 'd_0', 1)
    last_compute = SimRegion(compute_core(0, 0), last_compute_label, 1)
    elapsed = sim_results.get_timespan(first_dma, last_compute)
    return elapsed // TCK
def fit(x, y, quiet=True):
    """Fit `y = alpha + beta * x` by linear regression.

    Args:
        x: numpy array of abscissae.
        y: numpy array of ordinates.
        quiet: when False, print the fit statistics and show a plot of the
            data with the fitted line.

    Returns:
        The `(alpha, beta)` tuple: intercept and slope of the fit.
    """
    # Perform linear regression
    res = stats.linregress(x, y)
    alpha = res.intercept  # α
    beta = res.slope  # β

    if not quiet:
        # Print results
        print(f"\talpha (intercept): {alpha:.6g}")
        print(f"\tbeta (slope) : {beta:.6g}")
        print(f"\tR^2 : {res.rvalue**2:.6g}")
        print(f"\tSE(beta) : {res.stderr:.6g}")
        print(f"\tSE(alpha) : {res.intercept_stderr:.6g}")

        # Plot results
        # NOTE(review): callers in this module pass x in beats (size divided
        # by BEAT_BYTES) while the axis label says bytes -- confirm the label.
        plt.figure()
        plt.scatter(x, y, s=12)
        xline = np.linspace(x.min(), x.max(), 200)
        yline = alpha + beta * xline
        plt.plot(xline, yline, linewidth=2)
        plt.xlabel('size [B]')
        plt.ylabel('cycles')
        plt.title("Linear fit: cycles = alpha + beta * X")
        plt.show()

    # Return results
    return alpha, beta


def _cache_default_df(func):
    """Memoize `func(df=None, quiet)` calls; never cache explicit dataframes.

    `functools.cache` hashes every argument, so decorating the fit functions
    with it directly made any call with a DataFrame raise
    `TypeError: unhashable type`. Only the default path, which loads the
    experiment results itself, is cached here. `cache_clear` and `cache_info`
    of the underlying cache remain available on the wrapper.
    """
    from functools import wraps

    @cache
    def cached_default(quiet):
        return func(None, quiet)

    @wraps(func)
    def wrapper(df=None, quiet=True):
        if df is None:
            return cached_default(quiet)
        return func(df, quiet)

    wrapper.cache_clear = cached_default.cache_clear
    wrapper.cache_info = cached_default.cache_info
    return wrapper


def _fit_cycles(impl, cycles_fn, title, df, quiet, single_batch=True):
    """Fit cycles versus beats for the single-row experiments of one implementation.

    Args:
        impl: value of the 'impl' column to select.
        cycles_fn: function mapping a 'results' entry to a cycle count.
        title: line printed before the fit statistics when not quiet.
        df: results dataframe, or None to load `experiments.results()`.
        quiet: forwarded to `fit`.
        single_batch: when True, keep only experiments with size // batch == 1.
    """
    # Get experiment data
    if df is None:
        df = experiments.results()
    df = df[df['impl'] == impl]

    # Retrieve only single-row experiments (fitted model should generalize)
    df = df[df['n_rows'] == 1]
    if single_batch:
        df = df[(df['size'] // df['batch']) == 1]

    # Fit data
    x = df['size'].to_numpy() / BEAT_BYTES
    y = df['results'].apply(cycles_fn).to_numpy()
    if not quiet:
        print(title)
    return fit(x, y, quiet=quiet)


@_cache_default_df
def fit_seq_dma(df=None, quiet=True):
    """Fit the DMA transfer time of the sequential reduction; returns (alpha, beta)."""
    return _fit_cycles('seq', experiments.seq_dma_cycles,
                       "Fit DMA transfer time for seq reduction:", df, quiet)


@_cache_default_df
def fit_seq_compute(df=None, quiet=True):
    """Fit the compute time of the sequential reduction; returns (alpha, beta)."""
    return _fit_cycles('seq', experiments.seq_compute_cycles,
                       "Fit compute time for seq reduction:", df, quiet)


@_cache_default_df
def fit_tree_dma(df=None, quiet=True):
    """Fit the DMA transfer time of the tree reduction; returns (alpha, beta)."""
    return _fit_cycles('tree', experiments.tree_dma_cycles,
                       "Fit DMA transfer time for tree reduction:", df, quiet)


@_cache_default_df
def fit_tree_compute(df=None, quiet=True):
    """Fit the compute time of the tree reduction; returns (alpha, beta)."""
    return _fit_cycles('tree', experiments.tree_compute_cycles,
                       "Fit compute time for tree reduction:", df, quiet)


@_cache_default_df
def fit_hw(df=None, quiet=True):
    """Fit the runtime of the hardware reduction; returns (alpha, beta).

    Unlike the software fits, no single-batch filter is applied.
    """
    return _fit_cycles('hw', experiments.hw_dma_cycles,
                       "Fit runtime for hw reduction:", df, quiet,
                       single_batch=False)


def main():
    """Run every fit verbosely, printing statistics and showing the plots."""
    fit_tree_dma(quiet=False)
    fit_tree_compute(quiet=False)

    fit_seq_dma(quiet=False)
    fit_seq_compute(quiet=False)

    fit_hw(quiet=False)
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from math import log2, sqrt, isqrt +from reduction import fit + +BEAT_BYTES = 64 +DELTA = 30 + + +def nearest_divisors(divisor, dividend): + from bisect import bisect_left + + # 1) Enumerate all positive divisors of dividend + divs = set() + r = int(isqrt(int(dividend))) + for d in range(1, r + 1): + if dividend % d == 0: # d divides dividend + divs.add(d) # add the small factor + divs.add(dividend // d) # add the paired large factor + divs = sorted(divs) # sort them ascending + + # 2) Binary search to locate where x would be inserted + i = bisect_left(divs, divisor) # first index with divs[i] >= divisor + + # 3) Pick neighbors around divisor + lower = divs[i - 1] if i > 0 and divs[i - 1] <= divisor else None + upper = divs[i] if i < len(divs) else None + return lower, upper + + +def hw_runtime(c, r, n): + hw_alpha, hw_beta = fit.fit_hw() + t_hw = hw_alpha + n * hw_beta + if r > 1: + return 2 * t_hw + else: + return t_hw + + +def seq_runtime(c, r, n, k, delta=DELTA): + batch = int(n // k) + dma_alpha, _ = fit.fit_seq_dma() + comp_alpha, _ = fit.fit_seq_compute() + t_comp = comp_alpha + batch * 1.16 + t_dma = dma_alpha + batch * 1 + t_max = max(t_comp, t_dma) + # print(t_dma * 5, t_comp * 5, t_max * 5) + n_iters = 1 + 2 * (c - 2) + k + if r > 1: + n_iters += 1 + 2 * (r - 2) + k + # First approximation model assumes compute and dma take roughly the same time + delta = 4 * r + 28 + return n_iters * (t_max + delta) - delta + + +def optimal_seq_k(c, r, n, delta=DELTA): + comp_alpha, _ = fit.fit_seq_compute() + if r > 1: + real_k = sqrt(n * (2 * (c + r) - 7) / (2 * (delta + comp_alpha))) + else: + real_k = sqrt(n * (2 * c - 3) / (delta + comp_alpha)) + lower_k, upper_k = nearest_divisors(real_k, n) + assert (lower_k is None) or (lower_k > 0) + assert (upper_k is None) or (upper_k <= n) + if lower_k is None: + return upper_k + elif upper_k is None: + return lower_k + else: + T_lower_k = seq_runtime(c, r, n, 
lower_k, delta) + T_upper_k = seq_runtime(c, r, n, upper_k, delta) + if T_lower_k < T_upper_k: + return lower_k + else: + return upper_k + + +def optimal_seq_runtime(c, r, n, delta=DELTA): + k_opt = optimal_seq_k(c, r, n, delta) + return seq_runtime(c, r, n, k_opt, delta) + + +def tree_runtime(c, r, n, k, delta=DELTA): + batch = int(n // k) + dma_alpha, _ = fit.fit_tree_dma() + comp_alpha, _ = fit.fit_tree_compute() + t_comp = comp_alpha + batch * 1.16 + t_dma = dma_alpha + batch * 1 + t_max = max(t_comp, t_dma) + # print(t_dma * 5, t_comp * 5, t_max * 5) + n_levels = log2(c * r) + return n_levels * (t_dma + (k - 1) * (delta + t_max) + delta + t_comp) + + +def optimal_tree_k(c, r, n, delta=DELTA): + dma_alpha, _ = fit.fit_tree_dma() + comp_alpha, _ = fit.fit_tree_compute() + real_k = sqrt(n / (delta + max(dma_alpha, comp_alpha))) + lower_k, upper_k = nearest_divisors(real_k, n) + assert (lower_k is None) or (lower_k > 0) + assert (upper_k is None) or (upper_k <= n) + if lower_k is None: + return upper_k + elif upper_k is None: + return lower_k + else: + T_lower_k = tree_runtime(c, r, n, lower_k, delta) + T_upper_k = tree_runtime(c, r, n, upper_k, delta) + if T_lower_k < T_upper_k: + return lower_k + else: + return upper_k + + +def optimal_tree_runtime(c, r, n, delta=DELTA): + k_opt = optimal_tree_k(c, r, n, delta) + return tree_runtime(c, r, n, k_opt, delta) + + +def optimal_sw_runtime(c, r, n, delta=DELTA): + T_seq = optimal_seq_runtime(c, r, n, delta) + T_tree = optimal_tree_runtime(c, r, n, delta) + return min(T_seq, T_tree) + + +# print(5 * tree_runtime(4, 1, 8192 // 64, 1)) +# print(5 * tree_runtime(4, 1, 32768 // 64, 4)) diff --git a/experiments/reduction/plot.py b/experiments/reduction/plot.py new file mode 100755 index 00000000..b473e2cb --- /dev/null +++ b/experiments/reduction/plot.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import argparse +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from reduction import model, experiments + +pd.options.mode.copy_on_write = True +BEAT_BYTES = 64 + + +# Find the monotone non-decreasing lower fit of an oscillating curve +def find_monotone_lower_fit(x, y): + y_monotone = [] + x_monotone = [] + for i, e in enumerate(y): + accept = True + for j in range(i, len(y)): + if y[j] < e: + accept = False + break + if accept: + y_monotone.append(y[i]) + x_monotone.append(x[i]) + return x_monotone, y_monotone + + +def get_actual_cycles(row): + if row['impl'] == 'seq': + n_batches = int(row['size'] // row['batch']) + return experiments.seq_cycles(row['results'], 4, row['n_rows'], n_batches) + elif row['impl'] == 'tree': + n_batches = int(row['size'] // row['batch']) + return experiments.tree_cycles(row['results'], 4, row['n_rows'], n_batches) + elif row['impl'] == 'hw': + return experiments.hw_cycles(row['results']) + + +def get_expected_cycles(row): + if row['impl'] == 'seq': + cycles = model.optimal_seq_runtime(4, row['n_rows'], row['size'] // BEAT_BYTES) + elif row['impl'] == 'tree': + cycles = model.optimal_tree_runtime(4, row['n_rows'], row['size'] // BEAT_BYTES) + elif row['impl'] == 'hw': + cycles = model.hw_runtime(4, row['n_rows'], row['size'] // BEAT_BYTES) + return int(cycles) + + +def get_expected_batch_size(row): + if row['impl'] == 'tree': + optimal_k = model.optimal_tree_k(4, row['n_rows'], row['size'] // BEAT_BYTES) + return int(row['size'] // optimal_k) + elif row['impl'] == 'seq': + optimal_k = model.optimal_seq_k(4, row['n_rows'], row['size'] // BEAT_BYTES) + return int(row['size'] // optimal_k) + elif row['impl'] == 'hw': + return None + + +# Select optimal configuration (optimal k) for seq and tree runs +def select_best_configs(df): + return df.loc[df.groupby(['impl', 'size', 'n_rows'])['cycles'].idxmin()] \ + .sort_values(['impl', 'n_rows', 'size']) \ 
+ .reset_index(drop=True) + + +def seq_runtime_curve(xmin, xmax, c, r): + x = np.arange(xmin, xmax + 64, 64) + y = [model.optimal_seq_runtime(c, r, e // BEAT_BYTES) for e in x] + return x, y + + +# Filter optimal sequential runtime, by finding the monotone non-decreasing lower fit. +def monotone_seq_runtime_curve(xmin, xmax, c, r): + x, y = seq_runtime_curve(xmin, xmax, c, r) + return find_monotone_lower_fit(x, y) + + +def tree_runtime_curve(xmin, xmax, c, r): + x = np.arange(xmin, xmax + 64, 64) + y = [model.optimal_tree_runtime(c, r, e // BEAT_BYTES) for e in x] + return x, y + + +# Filter optimal tree runtime, by finding the monotone non-decreasing lower fit. +def monotone_tree_runtime_curve(xmin, xmax, c, r): + x, y = tree_runtime_curve(xmin, xmax, c, r) + return find_monotone_lower_fit(x, y) + + +def hw_runtime_curve(xmin, xmax, c, r): + x = np.arange(xmin, xmax + 64, 64) + y = [model.hw_runtime(c, r, e // BEAT_BYTES) for e in x] + return x, y + + +def sw_runtime_curve(xmin, xmax, c, n_rows): + x = np.arange(xmin, xmax, 64) + x, y_seq = seq_runtime_curve(xmin, xmax, c, n_rows) + _, y_tree = tree_runtime_curve(xmin, xmax, c, n_rows) + x, y_min = find_monotone_lower_fit(x, [min(a, b) for a, b in zip(y_seq, y_tree)]) + return x, y_min + + +def plot1(y_label=None, hide_x_axis=False, show=True): + """Plot actual and expected runtimes.""" + + # Load single-row results + c = 4 + r = 1 + df = experiments.results() + df = df[df['n_rows'] == r] + + # Get expected and actual runtimes + df['cycles'] = df.apply(get_actual_cycles, axis=1) + df = select_best_configs(df) + df['exp_cycles'] = df.apply(get_expected_cycles, axis=1) + # df['exp_batch'] = df.apply(get_expected_batch_size, axis=1) + + # Drop results column for cleaner output + df = df.drop(columns=['results']) + + # Choose consistent colors and markers for the plots + colors = {"seq": "C0", "tree": "C1", "hw": "C2"} + markers = {"expected": "o", "actual": "x"} + + # Create plot + _, ax = plt.subplots() + + # Plot 
actual runtimes + sizes = df['size'].unique() + ax.scatter( + sizes, df[df['impl'] == 'seq']['cycles'], label='Actual', + marker=markers['actual'], color=colors['seq'] + ) + ax.scatter( + sizes, df[df['impl'] == 'tree']['cycles'], label='Actual', + marker=markers['actual'], color=colors['tree'] + ) + ax.scatter( + sizes, df[df['impl'] == 'hw']['cycles'], label='Actual', + marker=markers['actual'], color=colors['hw'] + ) + + # # Plot expected runtimes + # ax.scatter( + # sizes, df[df['impl'] == 'tree']['exp_cycles'], label='Expected (tree)', + # marker=markers['expected'], color=colors['tree'] + # ) + # ax.scatter( + # sizes, df[df['impl'] == 'hw']['exp_cycles'], label='Expected (hw)', + # marker=markers['expected'], color=colors['hw'] + # ) + + # Plot model line for seq runtime + x, y = monotone_seq_runtime_curve(sizes.min(), sizes.max(), c, r) + ax.plot(x, y, label='Model (seq)', linestyle='--', color=colors['seq']) + + # Plot model line for tree runtime + x, y = monotone_tree_runtime_curve(sizes.min(), sizes.max(), c, r) + ax.plot(x, y, label='Model (tree)', linestyle='--', color=colors['tree']) + + # Plot model line for hw runtime + x, y = hw_runtime_curve(sizes.min(), sizes.max(), c, r) + ax.plot(x, y, label='Model (hw)', linestyle='--', color=colors['hw']) + + # Plot formatting + if hide_x_axis: + ax.set_xlabel('') + ax.set_xticks(sizes) + ax.tick_params( + axis='x', + which='both', + bottom=False, top=False, # hide tick marks + labelbottom=False # hide labels + ) + else: + ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes]) + ax.set_xlabel('Size [B]') + if y_label is None: + y_label = 'Runtime [cycles]' + ax.set_ylabel(y_label) + ax.set_xlim(0, sizes.max() * 1.1) + ax.set_axisbelow(True) + ax.grid(True, color='gainsboro') + ax.legend(ncol=2, handlelength=1.2, columnspacing=0.5, handletextpad=0.3) + plt.tight_layout() + if show: + plt.show() + + return df + + +def plot2(y_label=None, show=True): + """Plot actual and expected runtimes, 
for multiple number of rows.""" + + # Load results + c = 4 + df = experiments.results() + + # Get expected and actual runtimes + df['cycles'] = df.apply(get_actual_cycles, axis=1) + df = select_best_configs(df) + df['exp_cycles'] = df.apply(get_expected_cycles, axis=1) + df['exp_batch'] = df.apply(get_expected_batch_size, axis=1) + + # Drop results column for cleaner output + df = df.drop(columns=['results']) + + # Choose consistent colors for the plots + colors = {"sw": "C0", "hw": "C1"} + + # Create plot + _, ax = plt.subplots() + sizes = df['size'].unique() + + # Create function to plot runtimes for a given number of rows + def plot_runtime_vs_speedup(ax, n_rows, show_label=False): + res = df[df['n_rows'] == n_rows] + print(res) + best_sw_cycles = res[res['impl'].isin(['seq', 'tree'])].groupby('size')['cycles'].min() + print(best_sw_cycles) + ax.scatter( + sizes, best_sw_cycles, label='Actual' if show_label else None, + marker='x', color=colors['sw'] + ) + ax.scatter( + sizes, res[res['impl'] == 'hw']['cycles'], label='Actual' if show_label else None, + marker='x', color=colors['hw'] + ) + + # Plot model line for best software implementation + x, y = sw_runtime_curve(sizes.min(), sizes.max(), c, n_rows) + ax.plot( + x, y, label='Model (sw)' if show_label else None, + linestyle='--', color=colors['sw']) + + # Annotate sw model lines with number of rows, in correspondence with actual runtime + ax.annotate( + f'r={n_rows}', + xy=(x[-1], best_sw_cycles[sizes.max()]), + xytext=(sizes.max() * 0.0001, 0), textcoords='offset points', # nudge right + ha='left', va='center', clip_on=False) + + # Plot model line for hardware runtime + x, y = hw_runtime_curve(sizes.min(), sizes.max(), c, n_rows) + ax.plot( + x, y, label='Model (hw)' if show_label else None, linestyle='--', color=colors['hw']) + if n_rows == 1: + label = f'r={n_rows}' + else: + label = 'r=2,4' + if n_rows != 4: + ax.annotate( + label, + xy=(x[-1], res[(res['impl'] == 'hw') & (res['size'] == 
sizes.max())]['cycles']), + xytext=(sizes.max() * 0.0001, 0), textcoords='offset points', # nudge right + ha='left', va='center', clip_on=False) + + for i, n_rows in enumerate(df['n_rows'].unique()): + plot_runtime_vs_speedup(ax, n_rows=n_rows, show_label=(i == 0)) + + ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes]) + ax.set_xlim(0, sizes.max() * 1.1) + ax.set_xlabel('Size [B]') + if y_label is None: + y_label = 'Runtime [cycles]' + ax.set_ylabel(y_label) + ax.set_axisbelow(True) + ax.grid(True, color='gainsboro') + ax.legend(ncol=2, handlelength=1.2, columnspacing=0.5, handletextpad=0.3) + + plt.tight_layout() + if show: + plt.show() + + return df + + +def plot3(show=True): + """Plot runtime dependency with delta.""" + + # Load results + df = experiments.results() + + # Get only hardware implementation results + df = df[df['impl'] == 'hw'] + + # Retrieve single-row experiments + n_rows = 1 + df = df[df['n_rows'] == n_rows] + + # Get actual runtimes + df['cycles'] = df['results'].apply(experiments.hw_cycles) + + # Choose consistent colors and markers for the plots + colors = {"seq": "C1", "hw": "C2"} + + # Plot measured runtimes + _, ax = plt.subplots() + + # Plot model lines for sequential runtime (varying delta). 
+ x = np.arange(df['size'].min(), df['size'].max(), 64) + for k, alpha_delta in enumerate([*range(model.DELTA + model.SEQ_ALPHA, -1, -12), 0]): + alpha = alpha_delta // 2 + delta = alpha_delta - alpha + x_ext = np.arange(df['size'].min(), df['size'].max() + 1024, 64) + y = [model.optimal_seq_runtime(4, n_rows, e // 64, delta, alpha, model.SEQ_ALPHA) + for e in x_ext] + y_monotone = [] + x_monotone = [] + for i, e in enumerate(y): + accept = True + for j in range(i, len(y)): + if y[j] < e: + accept = False + break + if accept and i < len(x): + y_monotone.append(y[i]) + x_monotone.append(x[i]) + ax.plot(x_monotone, y_monotone, label='seq$(\\alpha_i+\\delta)$' if k == 0 else None, + color=colors['seq']) + ax.annotate( + f'${alpha_delta}$', + xy=(x[-1], y_monotone[-1]), + xytext=(1, 0), textcoords='offset points', # nudge right + ha='left', va='center', clip_on=False) + + # Plot model line for hardware runtime + y = [model.hw_runtime(4, n_rows, e // 64) for e in x] + ax.plot(x, y, label='hw', linestyle='--', color=colors['hw']) + + ax.set_xticks(df['size'], [str(size) if size != 2048 else "" for size in df['size']]) + ax.set_xlabel('Size [B]') + ax.set_ylabel('Runtime [cycles]') + ax.set_axisbelow(True) + ax.grid(True, linestyle='-', color='gainsboro') + ax.legend() + + plt.tight_layout() + if show: + plt.show() + + +def main(): + # Parse arguments + functions = [plot1, plot2] + parser = argparse.ArgumentParser(description='Plot wide multicast results.') + parser.add_argument( + 'plots', + nargs='*', + default='all', + choices=[f.__name__ for f in functions] + ['all'], + help='Plots to generate.' 
+ ) + args = parser.parse_args() + + # Helper function to check if a plot was requested on the command line + def requested(plot): + return plot in args.plots or 'all' in args.plots + + # Generate plots + if requested('plot1'): + plot1() + if requested('plot2'): + plot2() + + +if __name__ == "__main__": + main() diff --git a/experiments/reduction/roi/hw.json.tpl b/experiments/reduction/roi/hw.json.tpl new file mode 100644 index 00000000..c9301cc5 --- /dev/null +++ b/experiments/reduction/roi/hw.json.tpl @@ -0,0 +1,23 @@ +<% + def pb_cluster_idx(c, r): + return c * 4 + r + + n_rows = experiment['n_rows'] +%> +[ +% for r in range(n_rows): + % for c in range(4): + { + "thread": "${f'hart_{1 + pb_cluster_idx(c, r) * 9 + 8}'}", + "roi": [ + // First iteration + {"idx": 2, "label": "row reduction"}, + {"idx": 4, "label": "column reduction"}, + // Second iteration + {"idx": 8, "label": "row reduction"}, + {"idx": 10, "label": "column reduction"}, + ] + }, + % endfor +% endfor +] diff --git a/experiments/reduction/roi/seq.json.tpl b/experiments/reduction/roi/seq.json.tpl new file mode 100644 index 00000000..f14ede55 --- /dev/null +++ b/experiments/reduction/roi/seq.json.tpl @@ -0,0 +1,87 @@ +<% +n_rows = experiment['n_rows'] +n_cols = 4 +n_batches = int(experiment['size'] // experiment['batch']) +n_repetitions = 2 + +def pb_cluster_idx(r, c): + return c * n_cols + r +%> +[ + <% n_repetitions = 2 %> + + % for row in range(0, n_rows): + ## for easternmost column, only DMA core + { + "thread": "${f'hart_{1 + pb_cluster_idx(row, n_cols-1)*9 + 8}'}", + "roi": [ + % for iter in range(0, n_repetitions): + % for i in range(0, n_batches): + {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'd_{i}'}"}, + % endfor + % endfor + ] + }, + + ## for middle columns + % for col in range(n_cols-2, 0, -1): + ## Compute cores + % for core in range(0, 8): + { + "thread": "${f'hart_{1 + pb_cluster_idx(row, col)*9 + core}'}", + "roi": [ + % for iter in range(0, n_repetitions): + % 
for i in range(0, n_batches): + {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'c_{i}'}"}, + % endfor + % endfor + ] + }, + % endfor + ## DMA core + { + "thread": "${f'hart_{1 + pb_cluster_idx(row, col)*9 + 8}'}", + "roi": [ + % for iter in range(0, n_repetitions): + % for i in range(0, n_batches): + {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'd_{i}'}"}, + % endfor + % endfor + ] + }, + % endfor + + ## for westernmost column + % if row != 0 and n_rows > 1: + ## DMA core + { + "thread": "${f'hart_{1 + pb_cluster_idx(row, 0)*9 + 8}'}", + "roi": [ + % for iter in range(0, n_repetitions): + % for i in range(0, n_batches): + {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'd2_{i}'}"}, + % endfor + % endfor + ] + }, + % endif + ## Compute cores + % for core in range(0, 8): + { + "thread": "${f'hart_{1 + pb_cluster_idx(row, 0)*9 + core}'}", + "roi": [ + % for iter in range(0, n_repetitions): + % for i in range(0, n_batches): + % if row == (n_rows - 1): + {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'c_{i}'}"}, + % else: + {"idx": ${(4 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'c_{i}'}"}, + {"idx": ${(4 * n_batches + 2) * iter + 2 * n_batches + 2 * i + 2}, "label": "${f'c2_{i}'}"}, + % endif + % endfor + % endfor + ] + }, + % endfor + % endfor +] \ No newline at end of file diff --git a/experiments/reduction/roi/tree.json.tpl b/experiments/reduction/roi/tree.json.tpl new file mode 100644 index 00000000..a6e3e541 --- /dev/null +++ b/experiments/reduction/roi/tree.json.tpl @@ -0,0 +1,123 @@ +<% +import math +n_rows = experiment['n_rows'] +n_batches = int(experiment['size'] // experiment['batch']) +n_levels_in_row = int(math.log2(4)) +def n_levels_in_col(row_idx): + if row_idx == 0: + return int(math.log2(n_rows)) + elif row_idx == 2: + return 1 + else: + return 0 + +def pb_cluster_idx(row, col): + return row + col * 4 +%> + +[ +% for r in range(n_rows): + { + "thread": "${f'hart_{1 + 
pb_cluster_idx(r, 0) * 9 + 0}'}", + "roi": [ + ## Reductions in row + % for level_in_row in range(n_levels_in_row): + % for batch in range(n_batches): + { + "idx": ${4 + n_batches * (n_levels_in_col(r) + n_levels_in_row) * 2 + (level_in_row * n_batches + batch) * 2}, + "label": "${f'comp l{level_in_row}'}" + }, + % endfor + % endfor + ## Reductions in column 0 + % for level_in_col in range(n_levels_in_col(r)): + % for batch in range(n_batches): + { + "idx": ${4 + n_batches * (n_levels_in_col(r) + n_levels_in_row) * 2 + ((n_levels_in_row + level_in_col) * n_batches + batch) * 2}, + "label": "${f'comp l{n_levels_in_row + level_in_col}'}" + }, + % endfor + % endfor + ] + }, + + % if r > 0: + { + "thread": "${f'hart_{1 + pb_cluster_idx(r, 0) * 9 + 8}'}", + "roi": [ + % for batch in range(n_batches): + ## Transfer from cluster 2 to cluster 0 + { + "idx": ${4 + n_batches * 2 + batch * 2}, + "label": "${f'{r} > {0 if r in [1, 2] else 2}'}" + }, + % endfor + ] + }, + %endif + + { + "thread": "${f'hart_{1 + pb_cluster_idx(r, 1) * 9 + 8}'}", + "roi": [ + % for batch in range(n_batches): + ## Transfer from cluster 4 to cluster 0 + { + "idx": ${4 + n_batches * 2 + batch * 2}, + "label": "4 > 0" + }, + % endfor + ] + }, + + { + "thread": "${f'hart_{1 + pb_cluster_idx(r, 1) * 9 + 8}'}", + "roi": [ + % for batch in range(n_batches): + ## Transfer from cluster 4 to cluster 0 + { + "idx": ${4 + n_batches * 2 + batch * 2}, + "label": "4 > 0" + }, + % endfor + ] + }, + + { + "thread": "${f'hart_{1 + pb_cluster_idx(r, 2) * 9 + 0}'}", + "roi": [ + % for batch in range(n_batches): + ## Stage 0 reduction + { + "idx": ${4 + n_batches * 2 + batch * 2}, + "label": "comp l0" + }, + % endfor + ] + }, + { + "thread": "${f'hart_{1 + pb_cluster_idx(r, 2) * 9 + 8}'}", + "roi": [ + % for batch in range(n_batches): + ## Transfer from cluster 8 to cluster 0 + { + "idx": ${4 + n_batches * 2 + batch * 2}, + "label": "8 > 0" + }, + % endfor + ] + }, + + { + "thread": "${f'hart_{1 + pb_cluster_idx(r, 3) 
* 9 + 8}'}", + "roi": [ + % for batch in range(n_batches): + ## Transfer from cluster 12 to cluster 8 + { + "idx": ${4 + n_batches * 2 + batch * 2}, + "label": "12 > 8" + }, + % endfor + ] + }, +% endfor +] diff --git a/experiments/reduction/verify.py b/experiments/reduction/verify.py new file mode 100755 index 00000000..e6a17e78 --- /dev/null +++ b/experiments/reduction/verify.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande +# +# Verification script for `reduction_benchmark.c` + +import sys +import snitch.util.sim.verif_utils as vu + + +class Verifier(vu.Verifier): + + OUTPUT_UIDS = ['output'] + + def __init__(self): + super().__init__() + + def get_actual_results(self): + return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double') + + def get_expected_results(self): + length = int(self.get_input_from_symbol('length', 'uint32_t')[0]) + n_clusters = int(self.get_input_from_symbol('n_clusters', 'uint32_t')[0]) + return [i * n_clusters + sum(range(n_clusters)) for i in range(length)] + + def check_results(self, *args): + return super().check_results(*args, atol=0) + + +if __name__ == "__main__": + sys.exit(Verifier().main()) diff --git a/experiments/summa_gemm/__init__.py b/experiments/summa_gemm/__init__.py new file mode 100644 index 00000000..7858b657 --- /dev/null +++ b/experiments/summa_gemm/__init__.py @@ -0,0 +1,9 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from . 
import plot, experiments, model + +__all__ = ["plot", "experiments", "model"] diff --git a/experiments/summa_gemm/cfg.json.tpl b/experiments/summa_gemm/cfg.json.tpl new file mode 100644 index 00000000..e2ebd182 --- /dev/null +++ b/experiments/summa_gemm/cfg.json.tpl @@ -0,0 +1,25 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + setup_ssr: 1, + parallelize_m: 1, + parallelize_k: 0, + m_tiles: 4, // number of tiles in M dimension + n_tiles: ${4 * experiment['n_tiles']}, // number of tiles in N dimension + k_tiles: 1, // number of tiles in K dimension + load_a: 1, + load_b: 1, + load_c: 1, + double_buffer: 1, + partition_banks: 0, + transa: false, + transb: false, // must be true for SIMD + m: 64, + n: ${64 * experiment['n_tiles']}, + k: 128, + alpha: 1, + beta: 0, + gemm_fp: "gemm_fp64_opt" +} diff --git a/experiments/summa_gemm/experiments.py b/experiments/summa_gemm/experiments.py new file mode 100755 index 00000000..ca333764 --- /dev/null +++ b/experiments/summa_gemm/experiments.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from pathlib import Path +import picobello as pb +import snitch.util.experiments.experiment_utils as eu + +TCK = 5 +VERIFY_PY = Path(__file__).parent / '../../.deps/snitch_cluster/sw/kernels/blas/gemm/scripts/verify.py' + + +class ExperimentManager(pb.ExperimentManager): + + def derive_axes(self, experiment): + return eu.derive_axes_from_keys(experiment, ['mode', 'n_tiles']) + + def derive_cdefines(self, experiment): + cdefs = { + 'MODE': experiment['mode'].upper(), + } + return cdefs + + def derive_data_cfg(self, experiment): + return eu.derive_data_cfg_from_template(experiment) + + +def gen_experiments(): + experiments = [] + # for mode in ['hw']: + for mode in ['sw_tree']: + for n_tiles in [4]: + # for mode in ['sw_naive', 'sw_tree', 'hw']: + experiments.append({ + 'app': 'summa_gemm', + 'cmd': pb.sim_and_verify_cmd(VERIFY_PY), + 'mode': mode, + 'n_tiles': n_tiles, + }) + return experiments + + +def main(): + manager = ExperimentManager(gen_experiments()) + manager.run() + + +if __name__ == '__main__': + main() diff --git a/experiments/summa_gemm/model.py b/experiments/summa_gemm/model.py new file mode 100755 index 00000000..8d08bfc4 --- /dev/null +++ b/experiments/summa_gemm/model.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import math +import dma_multicast_v2 as multicast +import reduction + +PREC = 8 # in bytes +BEAT_BYTES = 64 # in bytes +UTIL = 0.981 # median utilization from https://arxiv.org/pdf/2506.10921 +PEAKPERF = 16 # DPflop/cycle on a single cluster +L1SIZE = 16 * 1024 # in bytes + + +def beats(bytes): + return math.ceil(bytes / BEAT_BYTES) + + +def max_square_problem_size(): + return math.floor(math.sqrt(L1SIZE / (6 * PREC))) + + +def t_mcast(dim, bytes, impl='sw'): + n = beats(bytes) + if impl == 'sw': + return multicast.model.optimal_sw_runtime(dim, 1, n) + elif impl == 'seq': + return multicast.model.optimal_seq_runtime(dim, 1, n) + elif impl == 'tree': + return multicast.model.tree_runtime(dim, 1, n) + elif impl == 'hw': + return multicast.model.hw_runtime(dim, 1, n) + else: + raise ValueError(f"Unknown multicast implementation: {impl}") + + +def t_mcast_a(c, Mt, Kt, impl='sw'): + return t_mcast(c, Mt * Kt * PREC, impl) + + +def t_mcast_b(r, Nt, Kt, impl='sw'): + return t_mcast(r, Nt * Kt * PREC, impl) + + +def t_summa_comm(r, c, Mt, Nt, Kt, impl='sw'): + return t_mcast_a(c, Mt, Kt, impl=impl) + t_mcast_b(r, Nt, Kt, impl=impl) + + +def t_comp(Mt, Nt, Kt): + return (2 * Mt * Nt * Kt) / (UTIL * PEAKPERF) + + +def t_summa_gemm(r, c, Mt, Nt, Kt, impl='sw'): + return max(t_summa_comm(r, c, Mt, Nt, Kt, impl=impl), t_comp(Mt, Nt, Kt)) + + +def t_fcl_comm(r, c, Mt, Nt, Kt, impl='sw'): + # Time to load c*r submatrices of B (each of size Nt x Kt) + # from r memory tiles + return c * r * beats(Nt * Kt * PREC) / r + + +def t_reduction(r, c, bytes, impl='sw'): + n = beats(bytes) + if impl == 'sw': + return reduction.model.optimal_sw_runtime(c, r, n) + elif impl == 'hw': + return reduction.model.hw_runtime(c, r, n) + else: + raise ValueError(f"Unknown reduction implementation: {impl}") + + +def t_fcl_gemm(r, c, Mt, Nt, Kt, impl='sw'): + t_partial_result = max(t_fcl_comm(r, c, Mt, Nt, Kt, impl=impl), t_comp(Mt, Nt, Kt)) 
+ t_redu = t_reduction(r, c, Mt * Nt * PREC, impl=impl) + return t_partial_result + t_redu diff --git a/experiments/summa_gemm/plot.py b/experiments/summa_gemm/plot.py new file mode 100755 index 00000000..6d77bc6c --- /dev/null +++ b/experiments/summa_gemm/plot.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import argparse +import matplotlib.pyplot as plt +from matplotlib.ticker import FuncFormatter +import numpy as np +import pandas as pd +from summa_gemm import model + + +def plot1(y_label=None, hide_x_axis=False, show=True): + # Get data + mesh_sizes = [4, 8, 16, 32, 64, 128, 256] + t_comm_sw = [] + t_comm_hw = [] + t_compute = [] + Mt = model.max_square_problem_size() + for c in mesh_sizes: + r = c + t_comm_sw.append(model.t_summa_comm(r, c, Mt, Mt, Mt, impl='sw')) + t_comm_hw.append(model.t_summa_comm(r, c, Mt, Mt, Mt, impl='hw')) + t_compute.append(model.t_comp(Mt, Mt, Mt)) + + df = pd.DataFrame({ + 'size': mesh_sizes, + 't_comm_sw': t_comm_sw, + 't_comm_hw': t_comm_hw, + 't_compute': t_compute, + }) + + # Prepare critical paths and speedups + df['t_sw'] = df[['t_comm_sw', 't_compute']].max(axis=1) + df['t_hw'] = df[['t_comm_hw', 't_compute']].max(axis=1) + df['speedup'] = df['t_sw'] / df['t_hw'] + + # Plot curves + fig, ax = plt.subplots() + ax.plot(mesh_sizes, t_comm_sw, marker='o', label='$T_{comm}$ (sw)') + ax.plot(mesh_sizes, t_comm_hw, marker='s', label='$T_{comm}$ (hw)') + ax.plot(mesh_sizes, t_compute, marker='^', label='$T_{comp}$') + ax.set_xscale('log', base=2) + ax.set_xticks(mesh_sizes) + ax.set_xlim(2.8, 370) + if hide_x_axis: + ax.set_xlabel('') + ax.tick_params( + axis='x', + which='both', + bottom=False, top=False, # hide tick marks + labelbottom=False # hide labels + ) + else: + ax.set_xlabel('Mesh size') + 
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: f"{int(x)}x{int(x)}")) + if y_label is None: + y_label = 'Runtime [cycles]' + ax.set_ylabel(y_label) + ax.set_axisbelow(True) + ax.grid(True, which="both", color='gainsboro') + ax.legend() + fig.tight_layout() + + # Annotate plot with speedup arrows (when speedup > 1) + keys = ['size', 't_comm_sw', 't_comm_hw', 't_compute', 'speedup'] + for x, sw, hw, t, sp in df[keys].itertuples(index=False, name=None): + if sp > 1.0: + y0 = sw + y1 = max(hw, t) + + # Double-headed arrow from t_comm_sw to max(t_comm_hw, t_compute) + ax.annotate( + "", + xy=(x, y1), + xytext=(x, y0), + arrowprops=dict(arrowstyle="<->", lw=1.2), + annotation_clip=False, + ) + + # Arithmetic midpoint for linear scale + y_mid = 0.5 * (y0 + y1) + + offset = 3 + if sp > 1.2: + ha = "right" + xytext = (-offset, 0) + else: + ha = "left" + xytext = (offset, 0) + ax.annotate( + f"{sp:.2f}$\\times$", + xy=(x, y_mid), + xytext=xytext, + textcoords="offset points", + ha=ha, + va="center", + rotation=90 if ax.get_yscale() == "log" else 0, # optional + annotation_clip=True, + ) + + if show: + plt.show() + + return df + + +def plot2(y_label=None, show=True): + # Get data + mesh_sizes = [4, 8, 16, 32, 64, 128, 256] + Mt = model.max_square_problem_size() + t_sw = [] + t_hw = [] + for size in mesh_sizes: + t_sw.append(model.t_fcl_gemm(size, size, Mt, Mt, Mt, impl='sw')) + t_hw.append(model.t_fcl_gemm(size, size, Mt, Mt, Mt, impl='hw')) + df = pd.DataFrame({ + 'size': mesh_sizes, + 't_sw': t_sw, + 't_hw': t_hw, + }) + + # Calculate speedup + df['speedup'] = df['t_sw'] / df['t_hw'] + + # Bar widths (to make them of equal width on a log scale) + logx = np.log2(df['size'].to_numpy(dtype=float)) + mid = 0.5 * (logx[:-1] + logx[1:]) # midpoints between neighbors + log_left_edges = np.r_[logx[0] - (mid[0] - logx[0]), mid] # extrapolate first + log_right_edges = np.r_[mid, logx[-1] + (logx[-1] - mid[-1])]# extrapolate last + fill = 0.6 + log_span = (log_right_edges - 
log_left_edges) * fill + logL = logx - 0.5 * log_span + logR = logx + 0.5 * log_span + left = 2.0**logL + right = 2.0**logR + width = right - left + + # Plot + fig, ax = plt.subplots() + ax.axhline(y=1, color='black', linewidth=1, zorder=5) + ax.bar(left, df['speedup'], width=width, align='edge', zorder=10) + ax.set_xscale('log', base=2) + ax.set_xlim(2.8, 370) + ax.set_ylim(0.5, None) + ax.set_xticks(mesh_sizes) + ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: f"{int(x)}x{int(x)}")) + ax.set_xlabel('Mesh size') + if y_label is None: + y_label = 'Runtime [cycles]' + ax.set_ylabel(y_label) + ax.set_axisbelow(True) + ax.grid(axis='both', color='gainsboro') + fig.tight_layout() + + if show: + plt.show() + + return df + + +def main(): + # Parse arguments + functions = [plot1, plot2] + parser = argparse.ArgumentParser(description='Plot wide multicast results.') + parser.add_argument( + 'plots', + nargs='*', + default='all', + choices=[f.__name__ for f in functions] + ['all'], + help='Plots to generate.' 
+ ) + args = parser.parse_args() + + # Helper function to check if a plot was requested on the command line + def requested(plot): + return plot in args.plots or 'all' in args.plots + + # Generate plots + if requested('plot1'): + plot1() + if requested('plot2'): + plot2() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/experiments/summa_gemm/roi.json.tpl b/experiments/summa_gemm/roi.json.tpl new file mode 100644 index 00000000..4ed347bd --- /dev/null +++ b/experiments/summa_gemm/roi.json.tpl @@ -0,0 +1,29 @@ +<% n_tiles = experiment['n_tiles'] %> +[ + % for cluster in range(0,16): + // Compute cores + % for j in range(0, 8): + { + "thread": "${f'hart_{cluster * 9 + j + 1}'}", + "roi": [ + % for i in range(0, n_tiles): + {"idx": ${2 * i + 1}, "label": "${f'tile_{i}'}"}, + % endfor + ] + }, + % endfor + + // DMA core + { + "thread": "${f'hart_{cluster * 9 + 8 + 1}'}", + "roi": [ + {"idx": 1, "label": "${f'tile_in_0'}"}, + % for i in range(0, n_tiles - 1): + {"idx": ${4*i + 3}, "label": "${f'tile_in_{i+1}'}"}, + {"idx": ${4*i + 5}, "label": "${f'tile_out_{i}'}"}, + % endfor + {"idx": ${n_tiles * 4 - 1}, "label": "${f'tile_out_{n_tiles-1}'}"}, + ] + }, + % endfor +] diff --git a/floo_picobello_noc_pkg_REDUCTION.sv b/floo_picobello_noc_pkg_REDUCTION.sv new file mode 100644 index 00000000..cf82fdde --- /dev/null +++ b/floo_picobello_noc_pkg_REDUCTION.sv @@ -0,0 +1,349 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// AUTOMATICALLY GENERATED! DO NOT EDIT! 
+ +`include "axi/typedef.svh" +`include "floo_noc/typedef.svh" +`include "reduction/typedef.svh" + +package floo_picobello_noc_pkg; + + import floo_pkg::*; + + ///////////////////// + // Address Map // + ///////////////////// + + typedef enum logic [4:0] { + ClusterX0Y0 = 0, + ClusterX0Y1 = 1, + ClusterX0Y2 = 2, + ClusterX0Y3 = 3, + ClusterX1Y0 = 4, + ClusterX1Y1 = 5, + ClusterX1Y2 = 6, + ClusterX1Y3 = 7, + ClusterX2Y0 = 8, + ClusterX2Y1 = 9, + ClusterX2Y2 = 10, + ClusterX2Y3 = 11, + ClusterX3Y0 = 12, + ClusterX3Y1 = 13, + ClusterX3Y2 = 14, + ClusterX3Y3 = 15, + Cheshire = 16, + FhgSpu = 17, + TopSpmNarrow = 18, + TopSpmWide = 19, + L2Spm0 = 20, + L2Spm1 = 21, + L2Spm2 = 22, + L2Spm3 = 23, + L2Spm4 = 24, + L2Spm5 = 25, + L2Spm6 = 26, + L2Spm7 = 27, + NumEndpoints = 28 + } ep_id_e; + + + + typedef enum logic [4:0] { + ClusterX0Y0SamIdx = 0, + ClusterX0Y1SamIdx = 1, + ClusterX0Y2SamIdx = 2, + ClusterX0Y3SamIdx = 3, + ClusterX1Y0SamIdx = 4, + ClusterX1Y1SamIdx = 5, + ClusterX1Y2SamIdx = 6, + ClusterX1Y3SamIdx = 7, + ClusterX2Y0SamIdx = 8, + ClusterX2Y1SamIdx = 9, + ClusterX2Y2SamIdx = 10, + ClusterX2Y3SamIdx = 11, + ClusterX3Y0SamIdx = 12, + ClusterX3Y1SamIdx = 13, + ClusterX3Y2SamIdx = 14, + ClusterX3Y3SamIdx = 15, + CheshireExternalSamIdx = 16, + CheshireInternalSamIdx = 17, + FhgSpuSamIdx = 18, + TopSpmNarrowSamIdx = 19, + TopSpmWideSamIdx = 20, + L2Spm0SamIdx = 21, + L2Spm1SamIdx = 22, + L2Spm2SamIdx = 23, + L2Spm3SamIdx = 24, + L2Spm4SamIdx = 25, + L2Spm5SamIdx = 26, + L2Spm6SamIdx = 27, + L2Spm7SamIdx = 28 + } sam_idx_e; + + + + typedef logic [0:0] rob_idx_t; + typedef logic [0:0] port_id_t; + typedef logic [3:0] x_bits_t; + typedef logic [1:0] y_bits_t; + typedef struct packed { + x_bits_t x; + y_bits_t y; + port_id_t port_id; + } id_t; + + typedef logic route_t; + + + localparam int unsigned SamNumRules = 29; + + typedef struct packed { + id_t idx; + logic [47:0] start_addr; + logic [47:0] end_addr; + } sam_rule_t; + + localparam sam_rule_t [SamNumRules-1:0] 
Sam = '{ + '{ + idx: '{x: 8, y: 3, port_id: 0}, + start_addr: 48'h000070700000, + end_addr: 48'h000070800000 + }, // L2Spm7 + '{ + idx: '{x: 8, y: 2, port_id: 0}, + start_addr: 48'h000070600000, + end_addr: 48'h000070700000 + }, // L2Spm6 + '{ + idx: '{x: 8, y: 1, port_id: 0}, + start_addr: 48'h000070500000, + end_addr: 48'h000070600000 + }, // L2Spm5 + '{ + idx: '{x: 8, y: 0, port_id: 0}, + start_addr: 48'h000070400000, + end_addr: 48'h000070500000 + }, // L2Spm4 + '{ + idx: '{x: 0, y: 3, port_id: 0}, + start_addr: 48'h000070300000, + end_addr: 48'h000070400000 + }, // L2Spm3 + '{ + idx: '{x: 0, y: 2, port_id: 0}, + start_addr: 48'h000070200000, + end_addr: 48'h000070300000 + }, // L2Spm2 + '{ + idx: '{x: 0, y: 1, port_id: 0}, + start_addr: 48'h000070100000, + end_addr: 48'h000070200000 + }, // L2Spm1 + '{ + idx: '{x: 0, y: 0, port_id: 0}, + start_addr: 48'h000070000000, + end_addr: 48'h000070100000 + }, // L2Spm0 + '{ + idx: '{x: 9, y: 1, port_id: 0}, + start_addr: 48'h000060040000, + end_addr: 48'h000060080000 + }, // TopSpmWide + '{ + idx: '{x: 9, y: 2, port_id: 0}, + start_addr: 48'h000060000000, + end_addr: 48'h000060040000 + }, // TopSpmNarrow + '{ + idx: '{x: 9, y: 0, port_id: 0}, + start_addr: 48'h000040000000, + end_addr: 48'h000040040000 + }, // FhgSpu + '{ + idx: '{x: 9, y: 3, port_id: 0}, + start_addr: 48'h000000000000, + end_addr: 48'h000020000000 + }, // CheshireInternal + '{ + idx: '{x: 9, y: 3, port_id: 0}, + start_addr: 48'h000080000000, + end_addr: 48'h020000000000 + }, // CheshireExternal + '{ + idx: '{x: 7, y: 3, port_id: 0}, + start_addr: 48'h0000203c0000, + end_addr: 48'h000020400000 + }, // ClusterX3Y3 + '{ + idx: '{x: 7, y: 2, port_id: 0}, + start_addr: 48'h000020380000, + end_addr: 48'h0000203c0000 + }, // ClusterX3Y2 + '{ + idx: '{x: 7, y: 1, port_id: 0}, + start_addr: 48'h000020340000, + end_addr: 48'h000020380000 + }, // ClusterX3Y1 + '{ + idx: '{x: 7, y: 0, port_id: 0}, + start_addr: 48'h000020300000, + end_addr: 48'h000020340000 + }, 
// ClusterX3Y0 + '{ + idx: '{x: 6, y: 3, port_id: 0}, + start_addr: 48'h0000202c0000, + end_addr: 48'h000020300000 + }, // ClusterX2Y3 + '{ + idx: '{x: 6, y: 2, port_id: 0}, + start_addr: 48'h000020280000, + end_addr: 48'h0000202c0000 + }, // ClusterX2Y2 + '{ + idx: '{x: 6, y: 1, port_id: 0}, + start_addr: 48'h000020240000, + end_addr: 48'h000020280000 + }, // ClusterX2Y1 + '{ + idx: '{x: 6, y: 0, port_id: 0}, + start_addr: 48'h000020200000, + end_addr: 48'h000020240000 + }, // ClusterX2Y0 + '{ + idx: '{x: 5, y: 3, port_id: 0}, + start_addr: 48'h0000201c0000, + end_addr: 48'h000020200000 + }, // ClusterX1Y3 + '{ + idx: '{x: 5, y: 2, port_id: 0}, + start_addr: 48'h000020180000, + end_addr: 48'h0000201c0000 + }, // ClusterX1Y2 + '{ + idx: '{x: 5, y: 1, port_id: 0}, + start_addr: 48'h000020140000, + end_addr: 48'h000020180000 + }, // ClusterX1Y1 + '{ + idx: '{x: 5, y: 0, port_id: 0}, + start_addr: 48'h000020100000, + end_addr: 48'h000020140000 + }, // ClusterX1Y0 + '{ + idx: '{x: 4, y: 3, port_id: 0}, + start_addr: 48'h0000200c0000, + end_addr: 48'h000020100000 + }, // ClusterX0Y3 + '{ + idx: '{x: 4, y: 2, port_id: 0}, + start_addr: 48'h000020080000, + end_addr: 48'h0000200c0000 + }, // ClusterX0Y2 + '{ + idx: '{x: 4, y: 1, port_id: 0}, + start_addr: 48'h000020040000, + end_addr: 48'h000020080000 + }, // ClusterX0Y1 + '{ + idx: '{x: 4, y: 0, port_id: 0}, + start_addr: 48'h000020000000, + end_addr: 48'h000020040000 + } // ClusterX0Y0 + + }; + + + + localparam route_cfg_t RouteCfg = '{ + RouteAlgo: XYRouting, + UseIdTable: 1'b1, + XYAddrOffsetX: 41, + XYAddrOffsetY: 45, + IdAddrOffset: 0, + NumSamRules: 29, + NumRoutes: 0, + CollectiveCfg: '{ + OpCfg: '{ + EnNarrowMulticast: 1'b1, + EnWideMulticast: 1'b1, + EnLSBAnd: 1'b1, + EnF_Add: 1'b1, + EnF_Mul: 1'b1, + EnF_Min: 1'b1, + EnF_Max: 1'b1, + default: '0 + }, + RedCfg: ReductionDefaultCfg + } + }; + + + typedef logic [47:0] axi_narrow_in_addr_t; + typedef logic [63:0] axi_narrow_in_data_t; + typedef logic [7:0] 
axi_narrow_in_strb_t; + typedef logic [4:0] axi_narrow_in_id_t; + typedef logic [2:0] axi_narrow_in_user_t; + `AXI_TYPEDEF_ALL_CT(axi_narrow_in, axi_narrow_in_req_t, axi_narrow_in_rsp_t, axi_narrow_in_addr_t, + axi_narrow_in_id_t, axi_narrow_in_data_t, axi_narrow_in_strb_t, + axi_narrow_in_user_t) + + + typedef logic [47:0] axi_narrow_out_addr_t; + typedef logic [63:0] axi_narrow_out_data_t; + typedef logic [7:0] axi_narrow_out_strb_t; + typedef logic [1:0] axi_narrow_out_id_t; + typedef logic [2:0] axi_narrow_out_user_t; + `AXI_TYPEDEF_ALL_CT(axi_narrow_out, axi_narrow_out_req_t, axi_narrow_out_rsp_t, + axi_narrow_out_addr_t, axi_narrow_out_id_t, axi_narrow_out_data_t, + axi_narrow_out_strb_t, axi_narrow_out_user_t) + + + typedef logic [47:0] axi_wide_in_addr_t; + typedef logic [511:0] axi_wide_in_data_t; + typedef logic [63:0] axi_wide_in_strb_t; + typedef logic [2:0] axi_wide_in_id_t; + typedef logic [0:0] axi_wide_in_user_t; + `AXI_TYPEDEF_ALL_CT(axi_wide_in, axi_wide_in_req_t, axi_wide_in_rsp_t, axi_wide_in_addr_t, + axi_wide_in_id_t, axi_wide_in_data_t, axi_wide_in_strb_t, axi_wide_in_user_t) + + + typedef logic [47:0] axi_wide_out_addr_t; + typedef logic [511:0] axi_wide_out_data_t; + typedef logic [63:0] axi_wide_out_strb_t; + typedef logic [0:0] axi_wide_out_id_t; + typedef logic [0:0] axi_wide_out_user_t; + `AXI_TYPEDEF_ALL_CT(axi_wide_out, axi_wide_out_req_t, axi_wide_out_rsp_t, axi_wide_out_addr_t, + axi_wide_out_id_t, axi_wide_out_data_t, axi_wide_out_strb_t, + axi_wide_out_user_t) + + + + `FLOO_TYPEDEF_HDR_T(hdr_t, id_t, id_t, nw_ch_e, rob_idx_t, id_t, collect_op_e) + localparam axi_cfg_t AxiCfgN = '{ + AddrWidth: 48, + DataWidth: 64, + UserWidth: 3, + InIdWidth: 5, + OutIdWidth: 2 + }; + localparam axi_cfg_t AxiCfgW = '{ + AddrWidth: 48, + DataWidth: 512, + UserWidth: 1, + InIdWidth: 3, + OutIdWidth: 1 + }; + `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_in, axi_wide_in, AxiCfgN, AxiCfgW, + hdr_t) + + //TODO (lleone): all this must be 
generated + `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 2, 2) + typedef logic [AxiCfgW.DataWidth-1:0] data_wide_width; + typedef logic [AxiCfgN.DataWidth-1:0] data_narrow_width; + `RED_TYPEDEF_REQ_RSP_LINK(wide, data_wide_width, wide_req, wide_rsp) + `RED_TYPEDEF_REQ_RSP_LINK(narrow, data_narrow_width, narrow_req, narrow_rsp) + + +endpackage diff --git a/hw/cheshire_tile.sv b/hw/cheshire_tile.sv index ed406db1..e2af0086 100644 --- a/hw/cheshire_tile.sv +++ b/hw/cheshire_tile.sv @@ -8,6 +8,8 @@ `include "cheshire/typedef.svh" `include "floo_noc/typedef.svh" `include "axi/assign.svh" +`include "common_cells/assertions.svh" + module cheshire_tile import cheshire_pkg::*; @@ -104,7 +106,8 @@ module cheshire_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; - floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_out; floo_nw_router #( .AxiCfgN (AxiCfgN), @@ -117,7 +120,10 @@ module cheshire_tile .hdr_t (hdr_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + // .floo_wide_out_t (floo_wide_double_t), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation) ) i_router ( .clk_i, .rst_ni, @@ -129,7 +135,13 @@ module cheshire_tile .floo_req_o (router_floo_req_out), .floo_rsp_i (router_floo_rsp_in), .floo_wide_i (router_floo_wide_in), - .floo_wide_o (router_floo_wide_out) + .floo_wide_o (router_floo_wide_out), + // Wide Reduction offload port + .offload_wide_req_o ( ), + .offload_wide_rsp_i ( '0 ), + // Narrow Reduction offload port + .offload_narrow_req_o ( ), + .offload_narrow_rsp_i ('0 ) ); assign floo_req_west_o = router_floo_req_out[West]; @@ -144,6 +156,13 @@ module cheshire_tile assign router_floo_rsp_in[North] = '0; // No North port in this tile assign router_floo_rsp_in[East] = 
'0; // No East port in this tile assign router_floo_rsp_in[South] = floo_rsp_south_i; + //TODO(colluca): Merge with floo_wide + // assign floo_wide_west_o.valid = router_floo_wide_out[West].valid; + // assign floo_wide_west_o.ready = router_floo_wide_out[West].ready; + // assign floo_wide_west_o.wide = router_floo_wide_out[West].wide[0]; + // assign floo_wide_south_o.valid = router_floo_wide_out[South].valid; + // assign floo_wide_south_o.ready = router_floo_wide_out[South].ready; + // assign floo_wide_south_o.wide = router_floo_wide_out[South].wide[0]; assign floo_wide_west_o = router_floo_wide_out[West]; assign floo_wide_south_o = router_floo_wide_out[South]; assign router_floo_wide_in[West] = floo_wide_west_i; @@ -172,6 +191,8 @@ module cheshire_tile .ChimneyCfgW (ChimneyCfgW), .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation), .MaxAtomicTxns (AxiCfgN.OutIdWidth - 1), .Sam (Sam), .id_t (id_t), @@ -188,7 +209,10 @@ module cheshire_tile .axi_wide_out_rsp_t (axi_wide_out_rsp_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + // .floo_wide_in_t (floo_wide_double_t), + .user_narrow_struct_t (collective_narrow_user_t), + .user_wide_struct_t (collective_wide_user_t) ) i_chimney ( .clk_i, .rst_ni, @@ -472,4 +496,14 @@ module cheshire_tile assign fhg_spu_rst_no = control_reg.fhg_spu_rsts.rst.value; assign fhg_spu_clk_en_o = control_reg.fhg_spu_clk_enables.clk_en.value; + // Add Assertion that no multicast / reduction can enter this tile! 
+ for (genvar r = 0; r < 4; r++) begin : gen_virt + `ASSERT(NoCollectivOperation_NReq_In, (!router_floo_req_in[r].valid | (router_floo_req_in[r].req[0].generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NRsp_In, (!router_floo_rsp_in[r].valid | (router_floo_rsp_in[r].rsp[0].generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_In, (!router_floo_wide_in[r].valid | (router_floo_wide_in[r].wide[0].generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NReq_Out, (!router_floo_req_out[r].valid | (router_floo_req_out[r].req[0].generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NRsp_Out, (!router_floo_rsp_out[r].valid | (router_floo_rsp_out[r].rsp[0].generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_Out, (!router_floo_wide_out[r].valid | (router_floo_wide_out[r].wide[0].generic.hdr.collective_op == Unicast))) + end + endmodule diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 1f8cf413..af101f30 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -27,6 +27,7 @@ module cluster_tile input logic [NrCores-1:0] msip_i, input logic [ 9:0] hart_base_id_i, input snitch_cluster_pkg::addr_t cluster_base_addr_i, + input snitch_cluster_pkg::addr_t cluster_base_offset_i, // Chimney ports input id_t id_i, // Router ports @@ -70,7 +71,7 @@ module cluster_tile `AXI_TYPEDEF_ALL(cluster_narrow_out_dw_conv, snitch_cluster_pkg::addr_t, snitch_cluster_pkg::narrow_out_id_t, data_hwpe_ctrl_t, strb_hwpe_ctrl_t, - snitch_cluster_pkg::user_t) + snitch_cluster_pkg::user_narrow_t) cluster_narrow_out_dw_conv_req_t cluster_narrow_out_dw_conv_req, cluster_narrow_out_cut_req; cluster_narrow_out_dw_conv_resp_t cluster_narrow_out_dw_conv_rsp, cluster_narrow_out_cut_rsp; @@ -82,6 +83,112 @@ module cluster_tile logic [NrCores-1:0] mxip; + + //////////////////////// + // Wide FPU Reduction // + //////////////////////// + + // Snitch cluster DCA interface + snitch_cluster_pkg::dca_req_t 
offload_dca_req, offload_dca_req_cut; + snitch_cluster_pkg::dca_rsp_t offload_dca_rsp, offload_dca_rsp_cut; + + // Signals to connect the NW router to the wide parser + red_wide_req_t offload_wide_req; + red_wide_rsp_t offload_wide_rsp; + + // Parse the Wide request from the router to the one from the snitch cluster! + // TODO(raroth): possible to remove this decode from the picobello repo and move it inside the + // FlooNoC repo. Currently the Decode used for the ALU is directly inside the floo_alu.sv + // file. Maybe do the same for the FPU + if(is_en_wide_reduction(RouteCfg.CollectiveCfg.OpCfg)) begin : gen_wide_offload_reduction + // Connect the DCA Request + assign offload_dca_req.q_valid = offload_wide_req.valid; + assign offload_wide_rsp.ready = offload_dca_rsp.q_ready; + + // Parse the FPU Request + always_comb begin + // Init default values + offload_dca_req.q.operands = '0; + + // Set default Values + offload_dca_req.q.src_fmt = fpnew_pkg::FP64; + offload_dca_req.q.dst_fmt = fpnew_pkg::FP64; + offload_dca_req.q.int_fmt = fpnew_pkg::INT64; + offload_dca_req.q.vectorial_op = 1'b0; + offload_dca_req.q.op_mod = 1'b0; + offload_dca_req.q.rnd_mode = fpnew_pkg::RNE; + offload_dca_req.q.op = fpnew_pkg::ADD; + + // Define the operation we want to execute on the FPU + unique casez (offload_wide_req.req.op) + (floo_pkg::F_Add) : begin + offload_dca_req.q.op = fpnew_pkg::ADD; + offload_dca_req.q.operands[0] = '0; + offload_dca_req.q.operands[1] = offload_wide_req.req.operand1; + offload_dca_req.q.operands[2] = offload_wide_req.req.operand2; + end + (floo_pkg::F_Mul) : begin + offload_dca_req.q.op = fpnew_pkg::MUL; + offload_dca_req.q.operands[0] = offload_wide_req.req.operand1; + offload_dca_req.q.operands[1] = offload_wide_req.req.operand2; + offload_dca_req.q.operands[2] = '0; + end + (floo_pkg::F_Max) : begin + offload_dca_req.q.op = fpnew_pkg::MINMAX; + offload_dca_req.q.rnd_mode = fpnew_pkg::RTZ; // fpnew MINMAX: RTZ selects the maximum + offload_dca_req.q.operands[0] = offload_wide_req.req.operand1; 
+ offload_dca_req.q.operands[1] = offload_wide_req.req.operand2; + offload_dca_req.q.operands[2] = '0; + end + (floo_pkg::F_Min) : begin + offload_dca_req.q.op = fpnew_pkg::MINMAX; + offload_dca_req.q.rnd_mode = fpnew_pkg::RNE; // fpnew MINMAX: RNE selects the minimum + offload_dca_req.q.operands[0] = offload_wide_req.req.operand1; + offload_dca_req.q.operands[1] = offload_wide_req.req.operand2; + offload_dca_req.q.operands[2] = '0; + end + default : begin + offload_dca_req.q.op = fpnew_pkg::ADD; + offload_dca_req.q.operands[0] = '0; + offload_dca_req.q.operands[1] = '0; + offload_dca_req.q.operands[2] = '0; + end + endcase + end + + // TODO(raroth): move these spill register inside FlooNoC and make them configurable. + // Insert a reqrsp-cut to avoid timing violations + //If the CutOffloadIntf is enabled, the cut is already in the offload controller, you can bypass the one below + generic_reqrsp_cut #( + .req_chan_t (snitch_cluster_pkg::dca_req_chan_t), + .rsp_chan_t (snitch_cluster_pkg::dca_rsp_chan_t), + .BypassReq (WideReductionCfg.CutOffloadIntf), + .BypassRsp (WideReductionCfg.CutOffloadIntf) + ) i_dca_router_cut ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .slv_req_i (offload_dca_req), + .slv_rsp_o (offload_dca_rsp), + .mst_req_o (offload_dca_req_cut), + .mst_rsp_i (offload_dca_rsp_cut) + ); + // Connect the Response + assign offload_wide_rsp.valid = offload_dca_rsp.p_valid; + assign offload_dca_req.p_ready = offload_wide_req.ready; + assign offload_wide_rsp.rsp.result = offload_dca_rsp.p.result; + + // No Wide Reduction supported + end else begin : gen_no_wide_reduction + assign offload_dca_req_cut = '0; + assign offload_dca_rsp = '0; + assign offload_wide_rsp.ready = '0; + assign offload_wide_rsp.rsp.result = '0; + assign offload_wide_rsp.valid = '0; + end + + // TODO(lleone): Add the narrow ALU reduction unit and connections here + + snitch_cluster_wrapper i_cluster ( .clk_i (tile_clk), .rst_ni (tile_rst_n), @@ -91,21 +198,36 @@ module cluster_tile .msip_i, .hart_base_id_i, .cluster_base_addr_i, - 
.mxip_i (mxip), - .clk_d2_bypass_i ('0), - .sram_cfgs_i ('0), - .narrow_in_req_i (cluster_narrow_in_req), - .narrow_in_resp_o (cluster_narrow_in_rsp), - .narrow_out_req_o (cluster_narrow_out_req), - .narrow_out_resp_i(cluster_narrow_out_rsp), - .wide_out_req_o (cluster_wide_out_req), - .wide_out_resp_i (cluster_wide_out_rsp), - .wide_in_req_i (cluster_wide_in_req), - .wide_in_resp_o (cluster_wide_in_rsp), - .narrow_ext_req_o (cluster_narrow_ext_req), - .narrow_ext_resp_i(cluster_narrow_ext_rsp), - .tcdm_ext_req_i (cluster_tcdm_ext_req_aligned), - .tcdm_ext_resp_o (cluster_tcdm_ext_rsp_aligned) + .cluster_base_offset_i, + .mxip_i (mxip), + .clk_d2_bypass_i ('0), + .sram_cfgs_i ('0), + .narrow_in_req_i (cluster_narrow_in_req), + .narrow_in_resp_o (cluster_narrow_in_rsp), + .narrow_out_req_o (cluster_narrow_out_req), + .narrow_out_resp_i (cluster_narrow_out_rsp), + .wide_out_req_o (cluster_wide_out_req), + .wide_out_resp_i (cluster_wide_out_rsp), + .wide_in_req_i (cluster_wide_in_req), + .wide_in_resp_o (cluster_wide_in_rsp), + .narrow_ext_req_o (cluster_narrow_ext_req), + .narrow_ext_resp_i (cluster_narrow_ext_rsp), + .tcdm_ext_req_i (cluster_tcdm_ext_req_aligned), + .tcdm_ext_resp_o (cluster_tcdm_ext_rsp_aligned), + .dca_req_i (offload_dca_req_cut), + .dca_rsp_o (offload_dca_rsp_cut), + .x_issue_req_o (), + .x_issue_resp_i ('0), + .x_issue_valid_o (), + .x_issue_ready_i ('0), + .x_register_o (), + .x_register_valid_o (), + .x_register_ready_i ('0), + .x_commit_o (), + .x_commit_valid_o (), + .x_result_i ('0), + .x_result_valid_i ('0), + .x_result_ready_o () ); if (UseHWPE) begin : gen_hwpe @@ -218,13 +340,17 @@ module cluster_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; - floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_out; + floo_nw_router #( .AxiCfgN (AxiCfgN), 
.AxiCfgW (AxiCfgW), - .EnMultiCast (RouteCfg.EnMultiCast), .RouteAlgo (RouteCfg.RouteAlgo), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation), + .NoLoopback (1'b0), .NumRoutes (5), .InFifoDepth (2), .OutFifoDepth(2), @@ -232,7 +358,14 @@ module cluster_tile .hdr_t (hdr_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + .red_wide_req_t (red_wide_req_t), + .red_wide_rsp_t (red_wide_rsp_t), + .red_narrow_req_t (red_narrow_req_t), + .red_narrow_rsp_t (red_narrow_rsp_t), + .CollectiveOpCfg (RouteCfg.CollectiveCfg.OpCfg), + .RdWideCfg (WideReductionCfg), + .RdNarrowCfg (NarrowReductionCfg) ) i_router ( .clk_i, .rst_ni, @@ -244,15 +377,21 @@ module cluster_tile .floo_req_o (router_floo_req_out), .floo_rsp_i (router_floo_rsp_in), .floo_wide_i (router_floo_wide_in), - .floo_wide_o (router_floo_wide_out) + .floo_wide_o (router_floo_wide_out), + // Wide Reduction offload port + .offload_wide_req_o ( offload_wide_req ), + .offload_wide_rsp_i ( offload_wide_rsp ), + // Narrow Reduction offload port + .offload_narrow_req_o ( ), + .offload_narrow_rsp_i ('0 ) ); assign floo_req_o = router_floo_req_out[West:North]; assign router_floo_req_in[West:North] = floo_req_i; assign floo_rsp_o = router_floo_rsp_out[West:North]; assign router_floo_rsp_in[West:North] = floo_rsp_i; - assign floo_wide_o = router_floo_wide_out[West:North]; assign router_floo_wide_in[West:North] = floo_wide_i; + assign floo_wide_o[West:North] =router_floo_wide_out[West:North]; ///////////// // Chimney // @@ -265,7 +404,9 @@ module cluster_tile .ChimneyCfgW (floo_pkg::ChimneyDefaultCfg), .RouteCfg (floo_picobello_noc_pkg::RouteCfg), .AtopSupport (1'b1), - .MaxAtomicTxns (1), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation), + .MaxAtomicTxns (3), .Sam (picobello_pkg::SamMcast), .id_t (floo_picobello_noc_pkg::id_t), .rob_idx_t (floo_picobello_noc_pkg::rob_idx_t), @@ -285,7 +426,8 @@ module cluster_tile .floo_rsp_t 
(floo_picobello_noc_pkg::floo_rsp_t), .floo_wide_t (floo_picobello_noc_pkg::floo_wide_t), .sram_cfg_t (snitch_cluster_pkg::sram_cfg_t), - .user_struct_t (picobello_pkg::mcast_user_t) + .user_narrow_struct_t (picobello_pkg::collective_narrow_user_t), + .user_wide_struct_t (picobello_pkg::collective_wide_user_t) ) i_chimney ( .clk_i (tile_clk), .rst_ni (tile_rst_n), diff --git a/hw/dummy_tile.sv b/hw/dummy_tile.sv index c42f1d38..2dafd6a9 100644 --- a/hw/dummy_tile.sv +++ b/hw/dummy_tile.sv @@ -29,7 +29,8 @@ module dummy_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; - floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_out; floo_nw_router #( .AxiCfgN (AxiCfgN), @@ -42,7 +43,10 @@ module dummy_tile .hdr_t (hdr_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + // .floo_wide_out_t (floo_wide_double_t), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation) ) i_router ( .clk_i, .rst_ni, @@ -54,15 +58,27 @@ module dummy_tile .floo_req_o (router_floo_req_out), .floo_rsp_i (router_floo_rsp_in), .floo_wide_i (router_floo_wide_in), - .floo_wide_o (router_floo_wide_out) + .floo_wide_o (router_floo_wide_out), + // Wide Reduction offload port + .offload_wide_req_o ( ), + .offload_wide_rsp_i ( '0 ), + // Narrow Reduction offload port + .offload_narrow_req_o ( ), + .offload_narrow_rsp_i ('0 ) ); assign floo_req_o = router_floo_req_out[West:North]; assign router_floo_req_in[West:North] = floo_req_i; assign floo_rsp_o = router_floo_rsp_out[West:North]; assign router_floo_rsp_in[West:North] = floo_rsp_i; - assign floo_wide_o = router_floo_wide_out[West:North]; + // Only the local port uses both physical channels. Other outputs use only the lower. 
+ // for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o + // assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; + // assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; + // assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; + // end assign router_floo_wide_in[West:North] = floo_wide_i; + assign floo_wide_o[West:North] =router_floo_wide_out[West:North]; // Tie the router’s Eject input ports to 0 assign router_floo_req_in[Eject] = '0; diff --git a/hw/fhg_spu_tile.sv b/hw/fhg_spu_tile.sv index ad0636e7..5db4fa91 100644 --- a/hw/fhg_spu_tile.sv +++ b/hw/fhg_spu_tile.sv @@ -47,7 +47,8 @@ module fhg_spu_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; - floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_out; floo_nw_router #( .AxiCfgN (AxiCfgN), @@ -60,7 +61,10 @@ module fhg_spu_tile .hdr_t (hdr_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + // .floo_wide_out_t (floo_wide_double_t), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation) ) i_router ( .clk_i, .rst_ni, @@ -72,7 +76,13 @@ module fhg_spu_tile .floo_req_o (router_floo_req_out), .floo_rsp_i (router_floo_rsp_in), .floo_wide_i (router_floo_wide_in), - .floo_wide_o (router_floo_wide_out) + .floo_wide_o (router_floo_wide_out), + // Wide Reduction offload port + .offload_wide_req_o ( ), + .offload_wide_rsp_i ( '0 ), + // Narrow Reduction offload port + .offload_narrow_req_o ( ), + .offload_narrow_rsp_i ('0 ) ); assign floo_req_west_o = router_floo_req_out[West]; @@ -87,6 +97,12 @@ module fhg_spu_tile assign router_floo_rsp_in[South] = '0; // No South port in this tile assign router_floo_rsp_in[East] = '0; // No East port in this tile assign router_floo_rsp_in[North] = floo_rsp_north_i; + // assign 
floo_wide_west_o.valid = router_floo_wide_out[West].valid; + // assign floo_wide_west_o.ready = router_floo_wide_out[West].ready; + // assign floo_wide_west_o.wide = router_floo_wide_out[West].wide[0]; + // assign floo_wide_north_o.valid = router_floo_wide_out[North].valid; + // assign floo_wide_north_o.ready = router_floo_wide_out[North].ready; + // assign floo_wide_north_o.wide = router_floo_wide_out[North].wide[0]; assign floo_wide_west_o = router_floo_wide_out[West]; assign floo_wide_north_o = router_floo_wide_out[North]; assign router_floo_wide_in[West] = floo_wide_west_i; diff --git a/hw/mem_tile.sv b/hw/mem_tile.sv index faf4249a..f615891f 100644 --- a/hw/mem_tile.sv +++ b/hw/mem_tile.sv @@ -7,6 +7,7 @@ `include "common_cells/registers.svh" `include "axi/typedef.svh" `include "obi/typedef.svh" +`include "common_cells/assertions.svh" module mem_tile import floo_pkg::*; @@ -45,12 +46,12 @@ module mem_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; - floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_out; floo_nw_router #( .AxiCfgN (AxiCfgN), .AxiCfgW (AxiCfgW), - .EnMultiCast (RouteCfgNoMcast.EnMultiCast), .RouteAlgo (RouteCfgNoMcast.RouteAlgo), .NumRoutes (5), .InFifoDepth (2), @@ -59,7 +60,10 @@ module mem_tile .hdr_t (hdr_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + // .floo_wide_out_t (floo_wide_double_t), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation) ) i_router ( .clk_i, .rst_ni, @@ -71,15 +75,27 @@ module mem_tile .floo_req_o (router_floo_req_out), .floo_rsp_i (router_floo_rsp_in), .floo_wide_i (router_floo_wide_in), - .floo_wide_o (router_floo_wide_out) + .floo_wide_o (router_floo_wide_out), + // Wide Reduction offload port + .offload_wide_req_o ( ), + 
.offload_wide_rsp_i ( '0 ), + // Narrow Reduction offload port + .offload_narrow_req_o ( ), + .offload_narrow_rsp_i ('0 ) ); assign floo_req_o = router_floo_req_out[West:North]; assign router_floo_req_in[West:North] = floo_req_i; assign floo_rsp_o = router_floo_rsp_out[West:North]; assign router_floo_rsp_in[West:North] = floo_rsp_i; - assign floo_wide_o = router_floo_wide_out[West:North]; + // Only the local port uses both physical channels. Other outputs use only the lower. + // for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o + // assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; + // assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; + // assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; + // end assign router_floo_wide_in[West:North] = floo_wide_i; + assign floo_wide_o[West:North] =router_floo_wide_out[West:North]; ///////////// // Chimney // @@ -97,6 +113,8 @@ module mem_tile .ChimneyCfgW (set_ports(ChimneyDefaultCfg, 1'b1, 1'b0)), .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation), .MaxAtomicTxns (1), .Sam (Sam), .id_t (id_t), @@ -113,7 +131,10 @@ module mem_tile .axi_wide_out_rsp_t (axi_wide_out_rsp_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + // .floo_wide_in_t (floo_wide_double_t), + .user_narrow_struct_t (collective_narrow_user_t), + .user_wide_struct_t (collective_wide_user_t) ) i_chimney ( .clk_i (tile_clk), .rst_ni (tile_rst_n), @@ -473,5 +494,22 @@ module mem_tile .clk_o (tile_rst_n) ); `endif + // Add Assertion that no multicast / reduction can enter this tile! 
+ for (genvar r = 0; r < 4; r++) begin : gen_route_assertions + `ASSERT(NoCollectivOperation_NReq_In, (!floo_req_i[r].valid | (floo_req_i[r].req[0].generic.hdr.collective_op == Unicast)), + clk_i, !rst_ni, + $sformatf("Unsupported collective attempted with destination: %h", floo_req_i[r].req[0].narrow_aw.payload.addr)) + `ASSERT(NoCollectivOperation_NRsp_In, (!floo_rsp_i[r].valid | (floo_rsp_i[r].rsp[0].generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_In, (!floo_wide_i[r].valid | (floo_wide_i[r].wide[0].generic.hdr.collective_op == Unicast)), + clk_i, !rst_ni, + $sformatf("Unsupported collective attempted with destination: %h", floo_wide_i[r].wide[0].wide_aw.payload.addr)) + `ASSERT(NoCollectivOperation_NReq_Out, (!floo_req_o[r].valid | (floo_req_o[r].req[0].generic.hdr.collective_op == Unicast)), + clk_i, !rst_ni, + $sformatf("Unsupported collective attempted with destination: %h", floo_req_o[r].req[0].narrow_aw.payload.addr)) + `ASSERT(NoCollectivOperation_NRsp_Out, (!floo_rsp_o[r].valid | (floo_rsp_o[r].rsp[0].generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_Out, (!floo_wide_o[r].valid | (floo_wide_o[r].wide[0].generic.hdr.collective_op == Unicast)), + clk_i, !rst_ni, + $sformatf("Unsupported collective attempted with destination: %h", floo_wide_o[r].wide[0].wide_aw.payload.addr)) + end endmodule diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index dcbfa060..909f2ee2 100644 --- a/hw/picobello_pkg.sv +++ b/hw/picobello_pkg.sv @@ -270,7 +270,7 @@ package picobello_pkg; typedef struct packed { logic [5:0] offset; logic [2:0] len; - logic [2:0] grp_base_id; + logic [2:0] base_id; } mask_sel_t; typedef struct packed { @@ -325,16 +325,16 @@ package picobello_pkg; sam_multicast[rule].idx.mask_x = '{ offset: offset_id_x, len: len_id_x, - grp_base_id: empty_cols + base_id: empty_cols }; sam_multicast[rule].idx.mask_y = '{ offset: offset_id_y, len: len_id_y, - grp_base_id: empty_rows + base_id: empty_rows }; 
end else begin - sam_multicast[rule].idx.mask_x = '{offset: '0, len: '0, grp_base_id: 0}; - sam_multicast[rule].idx.mask_y = '{offset: '0, len: '0, grp_base_id: 0}; + sam_multicast[rule].idx.mask_x = '{offset: '0, len: '0, base_id: 0}; + sam_multicast[rule].idx.mask_y = '{offset: '0, len: '0, base_id: 0}; end end @@ -346,7 +346,7 @@ package picobello_pkg; function automatic floo_pkg::route_cfg_t gen_nomcast_route_cfg(); floo_pkg::route_cfg_t ret = floo_picobello_noc_pkg::RouteCfg; // Disable multicast for non-cluster tiles - ret.EnMultiCast = 1'b0; + ret.CollectiveCfg = CollectiveDefaultCfg; return ret; endfunction @@ -362,9 +362,9 @@ package picobello_pkg; $write(" { idx: { id: {x: %0d, y: %0d, port: %0d},", SamMcast[i].idx.id.x, SamMcast[i].idx.id.y, SamMcast[i].idx.id.port_id); $write(" mask_x: {offset: %0d, len: %0d, base_id: %0d},", SamMcast[i].idx.mask_x.offset, - SamMcast[i].idx.mask_x.len, SamMcast[i].idx.mask_x.grp_base_id); + SamMcast[i].idx.mask_x.len, SamMcast[i].idx.mask_x.base_id); $write(" mask_y: {offset: %0d, len: %0d, base_id: %0d} },", SamMcast[i].idx.mask_y.offset, - SamMcast[i].idx.mask_y.len, SamMcast[i].idx.mask_y.grp_base_id); + SamMcast[i].idx.mask_y.len, SamMcast[i].idx.mask_y.base_id); $write("start: 0x%0h, end: 0x%0h }\n", SamMcast[i].start_addr, SamMcast[i].end_addr); end $display("]"); @@ -391,6 +391,45 @@ package picobello_pkg; $display("----------------------------------------------------------"); endfunction + ///////////////////// + // REDUCTION // + ///////////////////// + + // TODO (lleone): Add generation of the following types and structs into Floogen + typedef struct packed { + user_mask_t collective_mask; + floo_pkg::collect_op_e collective_op; + logic [snitch_cluster_pkg::AtomicIdWidth-1:0] atomic; + } collective_narrow_user_t; + + typedef struct packed { + user_mask_t collective_mask; + floo_pkg::collect_op_e collective_op; + } collective_wide_user_t; + + // // TODO (colluca): Merge with floo_wide + // typedef struct packed 
{ + // logic [1:0] valid; + // logic [1:0] ready; + // floo_wide_chan_t [1:0] wide; + // } floo_wide_double_t; + + localparam int unsigned NumWideVirtChannels = 2; + localparam int unsigned NumWidePhysChannels = 1; + localparam floo_pkg::vc_impl_e VcImplementation = floo_pkg::VcPreemptValid; + localparam floo_pkg::wide_rw_decouple_e WideRwDecouple = floo_pkg::Phys; + + // Configurations for the Reductions +localparam reduction_cfg_t WideReductionCfg = '{ + RdPipelineDepth: 5, + CutOffloadIntf: 1'b1 + }; + +localparam reduction_cfg_t NarrowReductionCfg = '{ + RdPipelineDepth: 1, + CutOffloadIntf: 1'b0 + }; + //////////////// // Cheshire // //////////////// diff --git a/hw/picobello_top.sv b/hw/picobello_top.sv index 201020c7..170372c8 100644 --- a/hw/picobello_top.sv +++ b/hw/picobello_top.sv @@ -97,27 +97,29 @@ module picobello_top localparam int Y = int'(ClusterPhysicalId.y); localparam int unsigned HartBaseId = c * NrCores + 1; // Cheshire is hart 0 localparam axi_wide_in_addr_t ClusterBaseAddr = Sam[ClusterSamIdx].start_addr; + localparam axi_wide_in_addr_t ClusterBaseOffset = Sam[ClusterSamIdx].end_addr - ClusterBaseAddr; cluster_tile i_cluster_tile ( .clk_i, .rst_ni, - .test_enable_i (test_mode_i), - .tile_clk_en_i (cluster_clk_en[c]), - .tile_rst_ni (cluster_rst_n[c]), - .clk_rst_bypass_i (clk_rst_bypass_i), - .debug_req_i (debug_req[c]), - .meip_i (meip[c]), - .mtip_i (mtip[c]), - .msip_i (msip[c]), - .hart_base_id_i (HartBaseId[9:0]), - .cluster_base_addr_i(ClusterBaseAddr), - .id_i (ClusterId), - .floo_req_o (floo_req_out[X][Y]), - .floo_rsp_i (floo_rsp_in[X][Y]), - .floo_wide_o (floo_wide_out[X][Y]), - .floo_req_i (floo_req_in[X][Y]), - .floo_rsp_o (floo_rsp_out[X][Y]), - .floo_wide_i (floo_wide_in[X][Y]) + .test_enable_i (test_mode_i), + .tile_clk_en_i (cluster_clk_en[c]), + .tile_rst_ni (cluster_rst_n[c]), + .clk_rst_bypass_i (clk_rst_bypass_i), + .debug_req_i (debug_req[c]), + .meip_i (meip[c]), + .mtip_i (mtip[c]), + .msip_i (msip[c]), + 
.hart_base_id_i (HartBaseId[9:0]), + .cluster_base_addr_i (ClusterBaseAddr), + .cluster_base_offset_i(ClusterBaseOffset), + .id_i (ClusterId), + .floo_req_o (floo_req_out[X][Y]), + .floo_rsp_i (floo_rsp_in[X][Y]), + .floo_wide_o (floo_wide_out[X][Y]), + .floo_req_i (floo_req_in[X][Y]), + .floo_rsp_o (floo_rsp_out[X][Y]), + .floo_wide_i (floo_wide_in[X][Y]) ); end diff --git a/hw/spm_tile.sv b/hw/spm_tile.sv index e4b3b5ca..a0980e8a 100644 --- a/hw/spm_tile.sv +++ b/hw/spm_tile.sv @@ -81,7 +81,9 @@ module spm_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; - floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_out; + floo_nw_router #( .AxiCfgN (AxiCfgN), @@ -94,7 +96,10 @@ module spm_tile .hdr_t (hdr_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + // .floo_wide_out_t (floo_wide_double_t), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation) ) i_router ( .clk_i, .rst_ni, @@ -106,15 +111,27 @@ module spm_tile .floo_req_o (router_floo_req_out), .floo_rsp_i (router_floo_rsp_in), .floo_wide_i (router_floo_wide_in), - .floo_wide_o (router_floo_wide_out) + .floo_wide_o (router_floo_wide_out), + // Wide Reduction offload port + .offload_wide_req_o ( ), + .offload_wide_rsp_i ( '0 ), + // Narrow Reduction offload port + .offload_narrow_req_o ( ), + .offload_narrow_rsp_i ('0 ) ); assign floo_req_o = router_floo_req_out[West:North]; assign router_floo_req_in[West:North] = floo_req_i; assign floo_rsp_o = router_floo_rsp_out[West:North]; assign router_floo_rsp_in[West:North] = floo_rsp_i; - assign floo_wide_o = router_floo_wide_out[West:North]; + // Only the local port uses both physical channels. Other outputs use only the lower. 
+ // for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o + // assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; + // assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; + // assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; + // end assign router_floo_wide_in[West:North] = floo_wide_i; + assign floo_wide_o[West:North] =router_floo_wide_out[West:North]; ///////////// // Chimney // @@ -135,6 +152,8 @@ module spm_tile .ChimneyCfgW (set_ports(ChimneyDefaultCfg, bit'(!IsNarrow), 1'b0)), .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation), .MaxAtomicTxns (1), .Sam (Sam), .id_t (id_t), @@ -151,7 +170,10 @@ module spm_tile .axi_wide_out_rsp_t (axi_wide_out_rsp_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + // .floo_wide_in_t (floo_wide_double_t), + .user_narrow_struct_t (collective_narrow_user_t), + .user_wide_struct_t (collective_wide_user_t) ) i_chimney ( .clk_i, .rst_ni, diff --git a/iis-env.sh b/iis-env.sh index 87dffd81..2a65cc08 100644 --- a/iis-env.sh +++ b/iis-env.sh @@ -9,7 +9,7 @@ export VLIB="questa-2023.4 vlib" export BASE_PYTHON=/usr/local/anaconda3/bin/python3.11 export CHS_SW_GCC_BINROOT=/usr/pack/riscv-1.0-kgf/riscv64-gcc-12.2.0/bin export VERIBLE_FMT="oseda -2025.03 verible-verilog-format" -export SN_LLVM_BINROOT=/usr/scratch2/vulcano/colluca/tools/riscv32-snitch-llvm-almalinux8-15.0.0-snitch-0.2.0/bin +export SN_LLVM_BINROOT=/usr/scratch2/vulcano/colluca/tools/riscv32-snitch-llvm-almalinux8-15.0.0-snitch-0.5.0/bin # Create the python venv if [ ! 
-d ".venv" ]; then @@ -20,3 +20,5 @@ fi if [ -z "$VIRTUAL_ENV" ] || [ "$VIRTUAL_ENV" != "$(realpath .venv)" ]; then source .venv/bin/activate fi + +export PYTHONPATH=$PWD/experiments/:$PYTHONPATH diff --git a/requirements.txt b/requirements.txt index 78565385..1b65da0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,5 +25,5 @@ hjson # for reggen # For peakrdl peakrdl -peakrdl-rawheader @ git+https://github.com/colluca/PeakRDL-rawheader.git@7b8dbc9ad5854dc1cdaf36d4ea024c29ffb00a4c +peakrdl-rawheader==0.1.3 peakrdl-markdown diff --git a/sw/snitch/apps/summa_gemm/app.mk b/sw/snitch/apps/summa_gemm/app.mk new file mode 100644 index 00000000..163ae610 --- /dev/null +++ b/sw/snitch/apps/summa_gemm/app.mk @@ -0,0 +1,17 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := summa_gemm +$(APP)_BUILD_DIR ?= $(PB_SNITCH_SW_DIR)/apps/$(APP)/build +SRC_DIR := $(PB_SNITCH_SW_DIR)/apps/$(APP)/src +SRCS := $(SRC_DIR)/summa_gemm.c +$(APP)_INCDIRS := $(SN_ROOT)/sw/kernels/blas $(SN_ROOT)/sw/kernels/blas/gemm/src + +# Refer to Snitch scripts +$(APP)_SCRIPT_DIR := $(SN_ROOT)/sw/kernels/blas/gemm/scripts + +include $(SN_ROOT)/sw/kernels/datagen.mk +include $(SN_ROOT)/sw/kernels/common.mk diff --git a/sw/snitch/apps/summa_gemm/data/params.json b/sw/snitch/apps/summa_gemm/data/params.json new file mode 100644 index 00000000..2b332462 --- /dev/null +++ b/sw/snitch/apps/summa_gemm/data/params.json @@ -0,0 +1,25 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +{ + setup_ssr: 1, + parallelize_m: 1, + parallelize_k: 0, + m_tiles: 4, // number of tiles in M dimension + n_tiles: 4, // number of tiles in N dimension + k_tiles: 1, // number of tiles in K dimension + load_a: 1, + load_b: 1, + load_c: 1, + double_buffer: 1, + partition_banks: 0, + transa: false, + transb: false, // must be true for SIMD + m: 32, + n: 32, + k: 8, + alpha: 1, + beta: 0, + gemm_fp: "gemm_fp64_opt" +} diff --git a/sw/snitch/apps/summa_gemm/src/summa_gemm.c b/sw/snitch/apps/summa_gemm/src/summa_gemm.c new file mode 100644 index 00000000..bdecff61 --- /dev/null +++ b/sw/snitch/apps/summa_gemm/src/summa_gemm.c @@ -0,0 +1,462 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Tim Fischer +// Luca Bertaccini +// Luca Colagrande +// Viviane Potocnik +// Lorenzo Leone + +#include "snrt.h" + +#include "blas.h" + +typedef enum { + SW_NAIVE, + SW_TREE, + HW +} mode_t; + +#ifndef MODE +#define MODE SW_NAIVE +#endif + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreorder-init-list" +#include "data.h" +#pragma clang diagnostic pop + +static inline void snrt_dma_load_2d_tile_mcast_sw( + void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, + size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, + uint32_t prec, snrt_comm_t comm) { + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. 
+ snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Iteration 0: cluster in row 0 fetches data from memory tile + if (pb_cluster_row_idx() == 0) { + snrt_dma_load_2d_tile( + dst, src, tile_x1_idx, tile_x0_idx, + tile_x1_size, tile_x0_size, full_x0_size, + prec); + snrt_dma_wait_all(); + } + + // Iterations to cover all transfers in each column + uint32_t n_levels_in_col = PB_LOG2_CLUSTER_PER_COL; + uint32_t size = tile_x1_size * tile_x0_size * prec; + for (uint32_t i = 0; i < n_levels_in_col; i++) { + + // Determine which clusters are senders at every level of the tree + char row_idx_inv = pb_cluster_num_in_col() - pb_cluster_row_idx(); + char num_active_rows = 1 << i; + char sender_stride = pb_cluster_num_in_col() / num_active_rows; + char is_sender = (row_idx_inv % sender_stride) == 0; + + // Every active cluster sends the data to a cluster above it + if (is_sender) { + + // Calculate destination + char receiver_offset = sender_stride / 2; + uintptr_t dst_cluster = pb_calculate_cluster_idx( + pb_cluster_row_idx() + receiver_offset, pb_cluster_col_idx()); + void *remote_dst = snrt_remote_l1_ptr((void *)dst, + snrt_cluster_idx(), dst_cluster); + + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(remote_dst, dst, size); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. + snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } +} + +/** + * @brief Performs a General Matrix Multiplication (GEMM) operation on a + * Snitch-based multiple-cluster architecture with a 2D mesh topology + * using the SUMMA dataflow. 
+ * + * @param args Pointer to a `gemm_args_t` structure containing arguments + * for the GEMM operation. + * + * @details + * The function performs the following steps: + * 1. Copies the input arguments to local memory for faster access. + * 2. Calculates tile sizes based on the input dimensions and number of tiles. + * 3. Allocates space in TCDM for local copies of matrix tiles, unless + * matrix tiles are already stored in TCDM (see `load_* arguments`). + * 4. Distributes tiles to clusters for parallel processing. + * 5. Iterates over the tiles, performing the following: + * - Copies data for the current tile into local memory. + * - Performs the tile computation using the `sc_st_gemm` function. + * - Writes the result back to global memory. + */ +static inline int gemm_picobello(const gemm_args_t *args) { +#ifndef JOB_ARGS_PRELOADED + // Copy the arguments to local memory + gemm_args_t *largs = (gemm_args_t *)snrt_l1_alloc_cluster_local( + sizeof(gemm_args_t), alignof(gemm_args_t)); + if (snrt_is_dm_core()) { + snrt_dma_start_1d((void *)largs, (void *)args, sizeof(gemm_args_t)); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); +#else + const gemm_args_t *largs = args; +#endif + + // Create a communicator for each column to share the B tiles + snrt_comm_t col_comm[pb_cluster_num_in_row()]; + for (uint32_t c = 0; c < pb_cluster_num_in_row(); c++) + pb_create_mesh_comm(&col_comm[c], pb_cluster_num_in_col(), 1, + 0, c); + + // Calculate tile sizes + uint32_t tile_m = largs->m / largs->m_tiles; + uint32_t tile_n = largs->n / largs->n_tiles; + uint32_t tile_k = largs->k / largs->k_tiles; + uint32_t tile_a_size = tile_m * tile_k * largs->prec; + uint32_t tile_b_size = tile_k * tile_n * largs->prec; + uint32_t tile_c_size = tile_m * tile_n * largs->prec; + + // Allocate space for local tile buffers in TCDM, unless preloaded + void *a0, *a1, *b0, *b1, *c0, *c1; + void *la[2], *lb[2], *lc[2], *lcr; + int banks_per_buffer = snrt_cluster_compute_core_num(); + 
allocate_buffers(tile_a_size, tile_b_size, tile_c_size, largs, + banks_per_buffer, la, lb, lc, &lcr); + if (snrt_cluster_core_idx() == 0) { + DUMP(la[0]); + DUMP(la[1]); + DUMP(lb[0]); + DUMP(lb[1]); + DUMP(lc[0]); + DUMP(lc[1]); + } + snrt_cluster_hw_barrier(); + + // NoC layout (6 columns x 4 rows) + /* + // + |------| |------| |------| |------| |------| |------| + | M3 |---| C3 |---| C7 |---| C11 |---| C15 |---| M7 | + |------| |------| |------| |------| |------| |------| + | | | | | | + | | | | | | + |------| |------| |------| |------| |------| |------| + | M2 |---| C2 |---| C6 |---| C10 |---| C14 |---| M6 | + |------| |------| |------| |------| |------| |------| + | | | | | | + | | | | | | + |------| |------| |------| |------| |------| |------| + | M1 |---| C1 |---| C5 |---| C9 |---| C13 |---| M5 | + |------| |------| |------| |------| |------| |------| + | | | | | | + | | | | | | + |------| |------| |------| |------| |------| |------| + | M0 |---| C0 |---| C4 |---| C8 |---| C12 |---| M4 | + |------| |------| |------| |------| |------| |------| + // + */ + + // SUMMA dataflow: + // - Clusters in the same row calculate different tiles of the same + // row block of C, reusing the same row block of A but different column + // blocks of B. + // - Clusters in the same column calculate different tiles of the same + // column block of C, reusing the same column block of B but different + // row blocks of A. + // + // Notes: + // - K tiling not supported. 
+ + // Distribute m tiles to cluster rows and n tiles to cluster columns + uint32_t cluster_m_tiles = largs->m_tiles / pb_cluster_num_in_col(); + uint32_t cluster_n_tiles = largs->n_tiles / pb_cluster_num_in_row(); + uint32_t cluster_k_tiles = 1; + + // Calculate number of iterations + uint32_t num_tiles = cluster_m_tiles * cluster_n_tiles * cluster_k_tiles; + uint32_t num_iters = num_tiles; + if (largs->double_buffer) + num_iters += 2; + else + num_iters += 1; + + // Iterate over all tiles + for (uint32_t i = 0; i < num_iters; i++) { + // Calculate tile indices (we iterate in n->m order) + int dma_in_i = i; + int comp_i = largs->double_buffer ? i - 1 : i; + int dma_out_i = largs->double_buffer ? i - 2 : i - 1; + int dma_in_k = dma_in_i % cluster_k_tiles; + int dma_in_mn = dma_in_i / cluster_k_tiles; + int dma_in_n = dma_in_mn % cluster_n_tiles; + int dma_in_m = dma_in_mn / cluster_n_tiles; + int comp_k = comp_i % cluster_k_tiles; + int comp_mn = comp_i / cluster_k_tiles; + int comp_n = comp_mn % cluster_n_tiles; + int comp_m = comp_mn / cluster_n_tiles; + int dma_out_k = dma_out_i % cluster_k_tiles; + int dma_out_mn = dma_out_i / cluster_k_tiles; + int dma_out_n = dma_out_mn % cluster_n_tiles; + int dma_out_m = dma_out_mn / cluster_n_tiles; + + // Calculate the absolute m, n and k indices for each cluster + int dma_in_m_abs = dma_in_m + pb_cluster_row_idx() * cluster_m_tiles; + int comp_m_abs = comp_m + pb_cluster_row_idx() * cluster_m_tiles; + int dma_out_m_abs = dma_out_m + pb_cluster_row_idx() * cluster_m_tiles; + int dma_in_n_abs = dma_in_n + pb_cluster_col_idx() * cluster_n_tiles; + int comp_n_abs = comp_n + pb_cluster_col_idx() * cluster_n_tiles; + int dma_out_n_abs = dma_out_n + pb_cluster_col_idx() * cluster_n_tiles; + int dma_in_k_abs = dma_in_k; + int comp_k_abs = comp_k; + int dma_out_k_abs = dma_out_k; + + // DMA out phase + if (snrt_is_dm_core()) { + if (dma_out_i >= 0) { + snrt_mcycle(); + // Switch buffers + int buff_idx = largs->double_buffer ? 
dma_out_mn % 2 : 0; + + // Store C + // If parallelize_k, then only cluster 0 must writeback + if ((snrt_cluster_idx() == 0) || !(largs->parallelize_k)) { + if (largs->partition_banks) { + snrt_dma_2d_to_1d( + (void *)((uintptr_t)largs->c + + dma_out_m_abs * tile_c_size), + lc[buff_idx], tile_c_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_store_2d_tile(largs->c, lc[buff_idx], + dma_out_m_abs, dma_out_n_abs, tile_m, + tile_n, largs->ldc, largs->prec); + } + snrt_dma_wait_all(); + } + snrt_mcycle(); + } + } + + // DMA in phase + if (snrt_is_dm_core()) { + if (dma_in_i < num_tiles) { + snrt_mcycle(); + // Switch buffers + // When tiling on N, a row block of A is reused with multiple + // column blocks of B. There is no need to reload A on every + // iteration. The A buffer must be switched only when reloading + // A. On the other hand, the B buffer is switched every + // iteration, and the C buffer only needs to be switched after + // fully accumulating the result, i.e. after finishing the K loop. + int a_buff_idx = largs->double_buffer ? dma_in_m % 2 : 0; + int b_buff_idx = largs->double_buffer ? dma_in_i % 2 : 0; + int c_buff_idx = largs->double_buffer ? 
dma_in_mn % 2 : 0; + + // Load A + if (largs->load_a && (dma_in_n == 0)) { + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + la[a_buff_idx], + (void *)((uintptr_t)largs->a + + dma_in_m_abs * tile_a_size), + tile_a_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_load_2d_tile( + la[a_buff_idx], largs->a, dma_in_m_abs, dma_in_k_abs, + tile_m, tile_k, largs->lda, largs->prec); + } + } + + // Load B + if (largs->load_b) { + if (largs->transb) { + snrt_dma_load_2d_tile(lb[b_buff_idx], largs->b, + dma_in_n_abs, dma_in_k_abs, + tile_n, tile_k, + largs->ldb, largs->prec); + } else { + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + lb[b_buff_idx], + (void *)((uintptr_t)largs->b + + dma_in_k_abs * tile_b_size), + tile_b_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + // In SW_NAIVE mode, every cluster fetches the B + // tile it requires, independently. In all other + // modes, the clusters in row 0 multicast the B + // tile to all clusters in the same column. 
+ if (MODE == HW) { + if (pb_cluster_row_idx() == 0) { + snrt_dma_load_2d_tile_mcast( + lb[b_buff_idx], largs->b, dma_in_k_abs, + dma_in_n_abs, tile_k, tile_n, + largs->ldb, largs->prec, + col_comm[pb_cluster_col_idx()]); + } + } else if (MODE == SW_TREE) { + snrt_dma_load_2d_tile_mcast_sw( + lb[b_buff_idx], largs->b, dma_in_k_abs, + dma_in_n_abs, tile_k, tile_n, + largs->ldb, largs->prec, + col_comm[pb_cluster_col_idx()]); + } else { + snrt_dma_load_2d_tile( + lb[b_buff_idx], largs->b, dma_in_k_abs, + dma_in_n_abs, tile_k, tile_n, largs->ldb, + largs->prec); + } + } + } + } + + // Load C + // C tile is loaded only upon the first k iteration, then + // the C array will contain the partial results from the + // previous iteration + if (largs->load_c) { + if (dma_in_k_abs == 0) { + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + lc[c_buff_idx], + (void *)((uintptr_t)largs->c + + dma_in_m_abs * tile_c_size), + tile_c_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_load_2d_tile(lc[c_buff_idx], largs->c, + dma_in_m_abs, dma_in_n_abs, + tile_m, tile_n, largs->ldc, + largs->prec); + } + } else if (dma_in_k == 0) { + // Clusters other than the first need to initialize + // the C array to zero in their first iteration + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + lc[c_buff_idx], snrt_cluster()->zeromem.mem, + tile_c_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_start_1d(lc[c_buff_idx], + snrt_cluster()->zeromem.mem, + tile_c_size); + } + } + } + snrt_dma_wait_all(); + snrt_mcycle(); + } + } + + // Additional barrier required when not double buffering + if (!largs->double_buffer) snrt_global_barrier(); + + // Compute phase + if (comp_i >= 0 && comp_i < num_tiles) { + // Switch buffers + int a_buff_idx = largs->double_buffer ? comp_m % 2 : 0; + int b_buff_idx = largs->double_buffer ? comp_i % 2 : 0; + int c_buff_idx = largs->double_buffer ? 
comp_mn % 2 : 0; + + // Only compute cores participate in the tile computation + if (!snrt_is_dm_core()) { + // uint32_t start_cycle = snrt_mcycle(); + + // In the first k iteration we accumulate with the C matrix + // scaled by beta, in successive iterations we accumulate + // the previous partial result. The tile-level beta is thus + // a function of k: beta(k). + uint32_t beta_k = comp_k_abs == 0 ? largs->beta : 1; + + // Tile computation + sc_st_gemm_args_t sc_st_args; + sc_st_args.prec = largs->prec; + sc_st_args.setup_ssr = largs->setup_ssr; + sc_st_args.partition_banks = largs->partition_banks; + sc_st_args.transa = largs->transa; + sc_st_args.transb = largs->transb; + sc_st_args.a = la[a_buff_idx]; + if (largs->transa) { + sc_st_args.lda = tile_m; + } else if (largs->partition_banks) { + sc_st_args.lda = calculate_partitioned_banks_stride( + banks_per_buffer, tile_k, largs->prec); + } else { + sc_st_args.lda = tile_k; + } + sc_st_args.b = lb[b_buff_idx]; + if (largs->transb) { + sc_st_args.ldb = tile_k; + } else if (largs->partition_banks) { + sc_st_args.ldb = calculate_partitioned_banks_stride( + banks_per_buffer, tile_n, largs->prec); + } else { + sc_st_args.ldb = tile_n; + } + sc_st_args.beta = beta_k; + sc_st_args.c = lc[c_buff_idx]; + if (largs->partition_banks) { + sc_st_args.ldc = calculate_partitioned_banks_stride( + banks_per_buffer, tile_n, largs->prec); + } else { + sc_st_args.ldc = tile_n; + } + sc_st_args.m = tile_m; + sc_st_args.n = tile_n; + sc_st_args.k = tile_k; + sc_st_gemm(largs->gemm_fp, &sc_st_args); + + // uint32_t end_cycle = snrt_mcycle(); + } + + // Add the partial result tiles from the various clusters together + // in a logarithmic reduction fashion. + // Note: both compute and DMA cores participate in this step. 
+ if (largs->parallelize_k && (comp_k == (cluster_k_tiles - 1))) { + snrt_global_reduction_dma( + (double *)lcr, (double *)lc[c_buff_idx], tile_m * tile_n); + } + } + + // Synchronize cores after every iteration + snrt_global_barrier(); + } + + return 0; +} + + +int main () { + gemm_picobello(&args); + return 0; +} diff --git a/sw/snitch/runtime/src/.gitignore b/sw/snitch/runtime/impl/.gitignore similarity index 100% rename from sw/snitch/runtime/src/.gitignore rename to sw/snitch/runtime/impl/.gitignore diff --git a/sw/snitch/runtime/src/memory.ld b/sw/snitch/runtime/impl/memory.ld similarity index 100% rename from sw/snitch/runtime/src/memory.ld rename to sw/snitch/runtime/impl/memory.ld diff --git a/sw/snitch/runtime/src/pb_memory.h b/sw/snitch/runtime/impl/pb_memory.h similarity index 100% rename from sw/snitch/runtime/src/pb_memory.h rename to sw/snitch/runtime/impl/pb_memory.h diff --git a/sw/snitch/runtime/src/pb_noc_cfg.h b/sw/snitch/runtime/impl/pb_noc_cfg.h similarity index 69% rename from sw/snitch/runtime/src/pb_noc_cfg.h rename to sw/snitch/runtime/impl/pb_noc_cfg.h index bc285ba6..266b2191 100644 --- a/sw/snitch/runtime/src/pb_noc_cfg.h +++ b/sw/snitch/runtime/impl/pb_noc_cfg.h @@ -6,3 +6,6 @@ #define PB_CLUSTER_PER_ROW 4 #define PB_CLUSTER_PER_COL 4 +// TODO(colluca): derive from previous or viceversa +#define PB_LOG2_CLUSTER_PER_ROW 2 +#define PB_LOG2_CLUSTER_PER_COL 2 diff --git a/sw/snitch/runtime/src/putchar.c b/sw/snitch/runtime/impl/putchar.c similarity index 100% rename from sw/snitch/runtime/src/putchar.c rename to sw/snitch/runtime/impl/putchar.c diff --git a/sw/snitch/runtime/src/snitch_cluster_addrmap.rdl.tpl b/sw/snitch/runtime/impl/snitch_cluster_addrmap.rdl.tpl similarity index 100% rename from sw/snitch/runtime/src/snitch_cluster_addrmap.rdl.tpl rename to sw/snitch/runtime/impl/snitch_cluster_addrmap.rdl.tpl diff --git a/sw/snitch/runtime/src/snitch_cluster_cfg.h.tpl b/sw/snitch/runtime/impl/snitch_cluster_cfg.h.tpl similarity index 
100% rename from sw/snitch/runtime/src/snitch_cluster_cfg.h.tpl rename to sw/snitch/runtime/impl/snitch_cluster_cfg.h.tpl diff --git a/sw/snitch/runtime/src/snitch_cluster_memory.c b/sw/snitch/runtime/impl/snitch_cluster_memory.c similarity index 100% rename from sw/snitch/runtime/src/snitch_cluster_memory.c rename to sw/snitch/runtime/impl/snitch_cluster_memory.c diff --git a/sw/snitch/runtime/src/snitch_cluster_start.c b/sw/snitch/runtime/impl/snitch_cluster_start.c similarity index 100% rename from sw/snitch/runtime/src/snitch_cluster_start.c rename to sw/snitch/runtime/impl/snitch_cluster_start.c diff --git a/sw/snitch/runtime/src/snitch_cluster_start.h b/sw/snitch/runtime/impl/snitch_cluster_start.h similarity index 100% rename from sw/snitch/runtime/src/snitch_cluster_start.h rename to sw/snitch/runtime/impl/snitch_cluster_start.h diff --git a/sw/snitch/runtime/src/snrt.S b/sw/snitch/runtime/impl/snrt.S similarity index 100% rename from sw/snitch/runtime/src/snrt.S rename to sw/snitch/runtime/impl/snrt.S diff --git a/sw/snitch/runtime/impl/snrt.cc b/sw/snitch/runtime/impl/snrt.cc new file mode 100644 index 00000000..74f12312 --- /dev/null +++ b/sw/snitch/runtime/impl/snrt.cc @@ -0,0 +1,26 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "snrt.h" + +#include "alloc.c" +#include "alloc_v2.c" +#include "cls.c" +#include "cluster_interrupts.c" +#include "dm.c" +#include "dma.c" +#include "eu.c" +#include "kmp.c" +#include "omp.c" +#include "printf.c" +#include "putchar.c" +#include "riscv.c" +#include "snitch_cluster_memory.c" +#include "snitch_cluster_start.c" +#include "ssr.c" +#include "sync.c" +#include "team.c" + +#include "pb_team.c" +#include "pb_sync.c" diff --git a/sw/snitch/runtime/src/snrt.h b/sw/snitch/runtime/impl/snrt.h similarity index 91% rename from sw/snitch/runtime/src/snrt.h rename to sw/snitch/runtime/impl/snrt.h index 8959c80a..fd8be02e 100644 --- a/sw/snitch/runtime/src/snrt.h +++ b/sw/snitch/runtime/impl/snrt.h @@ -7,6 +7,9 @@ #include #include +// Additional optional configurations to change the runtime behaviour +// #define SNRT_ENABLE_NARROW_REDUCTION + // Configuration- and system-specific definitions (HAL) #include "pb_addrmap.h" #include "snitch_cluster_cfg.h" @@ -52,7 +55,6 @@ typedef snitch_cluster__stride40000_t snitch_cluster_t; #include "sync.h" #include "team.h" #include "types.h" -#include "pb_team.h" // Accelerators #include "datamover/archi_datamover.h" @@ -61,3 +63,7 @@ typedef snitch_cluster__stride40000_t snitch_cluster_t; #include "redmule/archi_redmule.h" #include "redmule/hal_redmule.h" #include "redmule/redmule_utils.h" + +// Picobello specific +#include "pb_team.h" +#include "pb_sync.h" diff --git a/sw/snitch/runtime/src/pb_sync.c b/sw/snitch/runtime/src/pb_sync.c new file mode 100644 index 00000000..bba8376d --- /dev/null +++ b/sw/snitch/runtime/src/pb_sync.c @@ -0,0 +1,6 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +extern inline void pb_create_mesh_comm(snrt_comm_t *comm, uint32_t n_rows, + uint32_t n_cols, uint32_t start_row, uint32_t start_col, snrt_comm_t parent_comm); diff --git a/sw/snitch/runtime/src/pb_sync.h b/sw/snitch/runtime/src/pb_sync.h new file mode 100644 index 00000000..782a9271 --- /dev/null +++ b/sw/snitch/runtime/src/pb_sync.h @@ -0,0 +1,44 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +inline void pb_create_mesh_comm(snrt_comm_t *comm, uint32_t n_rows, + uint32_t n_cols, uint32_t start_row = 0, uint32_t start_col = 0, + snrt_comm_t parent_comm = NULL) { + + // If no communicator is given, world communicator is used as default. + if (parent_comm == NULL) parent_comm = snrt_comm_world; + + // Allocate communicator struct in L1 and point to it. + *comm = + (snrt_comm_t)snrt_l1_alloc_cluster_local(sizeof(snrt_comm_info_t)); + + // Allocate barrier counter in L1. Only the first cluster's is actually + // used, but we want to keep all clusters' L1 allocators aligned. Thus, + // only the first cluster initializes its barrier counter. A global barrier + // is then used to ensure all cores "see" the initialized value. + uint32_t first_cluster = start_col * pb_cluster_num_in_col() + start_row; + void *barrier_ptr = snrt_l1_alloc_cluster_local(sizeof(uint32_t)); + barrier_ptr = snrt_remote_l1_ptr(barrier_ptr, snrt_cluster_idx(), + first_cluster); + if ((snrt_cluster_idx() == first_cluster) && snrt_is_dm_core()) { + *(uint32_t *)barrier_ptr = 0; + snrt_fence(); + } + snrt_global_barrier(parent_comm); + + // Initialize communicator, pointing to the newly-allocated barrier + // counter in L1. 
+ (*comm)->size = n_rows * n_cols; + (*comm)->mask = ((n_cols - 1) << PB_LOG2_CLUSTER_PER_COL) | + (n_rows - 1); + (*comm)->base = (start_col << PB_LOG2_CLUSTER_PER_COL) | start_row; + (*comm)->barrier_ptr = (uint32_t *)barrier_ptr; + uint32_t in_row_range = (pb_cluster_row_idx() >= start_row) && + (pb_cluster_row_idx() < (start_row + n_rows)); + uint32_t in_col_range = (pb_cluster_col_idx() >= start_col) && + (pb_cluster_col_idx() < (start_col + n_cols)); + (*comm)->is_participant = in_row_range && in_col_range; +} diff --git a/sw/snitch/runtime/src/pb_team.c b/sw/snitch/runtime/src/pb_team.c index cd280f61..0a67b710 100644 --- a/sw/snitch/runtime/src/pb_team.c +++ b/sw/snitch/runtime/src/pb_team.c @@ -1,4 +1,4 @@ -// Copyright 2023 ETH Zurich and University of Bologna. +// Copyright 2025 ETH Zurich and University of Bologna. // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 @@ -6,14 +6,40 @@ extern inline uintptr_t pb_l2_tile_address(uint32_t tile_idx); extern inline uintptr_t pb_l2_tile_offset(uintptr_t src_addr); -extern inline uint32_t pb_cluster_row(uint32_t cidx); +extern inline constexpr uint32_t pb_log2_cluster_num_in_col(); -extern inline uint32_t pb_cluster_row(); +extern inline constexpr uint32_t pb_log2_cluster_num_in_row(); -extern inline uint32_t pb_cluster_col(uint32_t cidx); +extern inline constexpr uint32_t pb_cluster_num_in_row(); -extern inline uint32_t pb_cluster_col(); +extern inline constexpr uint32_t pb_cluster_num_in_col(); -extern inline uint32_t pb_closest_mem_tile(uint32_t cidx); +extern inline uint32_t pb_cluster_row_idx(uint32_t cluster_idx); + +extern inline uint32_t pb_cluster_row_idx(); + +extern inline uint32_t pb_cluster_col_idx(uint32_t cluster_idx); + +extern inline uint32_t pb_cluster_col_idx(); + +extern inline uint32_t pb_calculate_cluster_idx(uint32_t row, uint32_t col); + +extern inline uint32_t pb_cluster_in_row(uint32_t row); + +extern inline uint32_t 
pb_cluster_in_col(uint32_t col); + +extern inline uint32_t pb_cluster_is_easternmost(); + +extern inline uint32_t pb_cluster_is_northernmost(); + +extern inline uint32_t pb_cluster_north_neighbour(); + +extern inline uint32_t pb_cluster_east_neighbour(); + +extern inline uint32_t pb_cluster_south_neighbour(); + +extern inline uint32_t pb_cluster_west_neighbour(); + +extern inline uint32_t pb_closest_mem_tile(uint32_t cluster_idx); extern inline uint32_t pb_closest_mem_tile(); diff --git a/sw/snitch/runtime/src/pb_team.h b/sw/snitch/runtime/src/pb_team.h index 1fa8d8d0..fa41866c 100644 --- a/sw/snitch/runtime/src/pb_team.h +++ b/sw/snitch/runtime/src/pb_team.h @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // // Lorenzo Leone - +// Luca Colagrande /** * @file @@ -30,60 +30,153 @@ inline uintptr_t pb_l2_tile_offset(uintptr_t src_addr) { PICOBELLO_ADDRMAP_L2_SPM_0_SIZE; } +/** + * @brief Get the log2 of the number of clusters in a row + */ + inline constexpr uint32_t pb_log2_cluster_num_in_row() { + // TODO(colluca): derive this from pb_cluster_num_in_row() or viceversa + return 2; +} /** - * @brief Get the NoC row index - * @param cidx The cluster index - * @return The Row index + * @brief Get the log2 of the number of clusters in a column */ -inline uint32_t pb_cluster_row(uint32_t cidx) -{ - return cidx % PB_CLUSTER_PER_ROW; +inline constexpr uint32_t pb_log2_cluster_num_in_col() { + // TODO(colluca): derive this from pb_cluster_num_in_col() or viceversa + return 2; +} + +/** + * @brief Get the number of clusters in a row + */ +inline constexpr uint32_t pb_cluster_num_in_row() { + return PB_CLUSTER_PER_ROW; +} + +/** + * @brief Get the number of clusters in a column + */ +inline constexpr uint32_t pb_cluster_num_in_col() { + return PB_CLUSTER_PER_COL; } /** - * @brief Get the NoC row index - * This is a convenience orload of pb_cluster_row() - * @return The Row index + * @brief Get the row index of a cluster + * @param cluster_idx The cluster index + * 
@return The row index relative to the first row of cluster tiles */ -inline uint32_t pb_cluster_row() +inline uint32_t pb_cluster_row_idx(uint32_t cluster_idx) { - return pb_cluster_row(snrt_cluster_idx()); + return cluster_idx % pb_cluster_num_in_col(); } +/** + * @brief Get the row index of the invoking cluster + * @return The row index relative to the first row of cluster tiles + */ +inline uint32_t pb_cluster_row_idx() +{ + return pb_cluster_row_idx(snrt_cluster_idx()); +} /** - * @brief Get the NoC column index - * @param cidx The cluster index - * @return The Column index + * @brief Get the column index of a cluster + * @param cluster_idx The cluster index + * @return The column index relative to the first column of cluster tiles */ -inline uint32_t pb_cluster_col(uint32_t cidx) +inline uint32_t pb_cluster_col_idx(uint32_t cluster_idx) { - return cidx / PB_CLUSTER_PER_COL; + return cluster_idx / pb_cluster_num_in_col(); } /** - * @brief Get the NoC column index - * This is a convenience orload of pb_cluster_row() - * @return The Column index + * @brief Get the column index of the invoking cluster + * @return The column index relative to the first column of cluster tiles */ -inline uint32_t pb_cluster_col() +inline uint32_t pb_cluster_col_idx() { - return pb_cluster_col(snrt_cluster_idx()); + return pb_cluster_col_idx(snrt_cluster_idx()); +} + +/** + * @brief Calculate the cluster index from its (row, col) coordinates + * @param row Row index relative to the first row of cluster tiles + * @param col Column index relative to the first column of cluster tiles + */ +inline uint32_t pb_calculate_cluster_idx(uint32_t row, uint32_t col) { + return col * pb_cluster_num_in_col() + row; +} + +/** + * @brief Test if cluster is in a given row + * @param row Row index relative to the first row of cluster tiles + */ +inline uint32_t pb_cluster_in_row(uint32_t row) { + return pb_cluster_row_idx() == row; +} + +/** + * @brief Test if cluster is in a given column + * @param 
col Column index relative to the first column of cluster tiles + */ +inline uint32_t pb_cluster_in_col(uint32_t col) { + return pb_cluster_col_idx() == col; +} + +/** + * @brief Test if cluster is in the easternmost column + */ +inline uint32_t pb_cluster_is_easternmost() { + return pb_cluster_in_col(pb_cluster_num_in_row() - 1); +} + +/** + * @brief Test if cluster is in the northernmost row + */ +inline uint32_t pb_cluster_is_northernmost() { + return pb_cluster_in_row(pb_cluster_num_in_col() - 1); +} + +/** + * @brief Get cluster index of north neighbour (same column, next row) + */ +inline uint32_t pb_cluster_north_neighbour() { + return snrt_cluster_idx() + 1; } +/** + * @brief Get cluster index of east neighbour (same row, next column) + */ +inline uint32_t pb_cluster_east_neighbour() { + return snrt_cluster_idx() + pb_cluster_num_in_col(); +} + +/** + * @brief Get cluster index of south neighbour (same column, previous row) + */ +inline uint32_t pb_cluster_south_neighbour() { + return snrt_cluster_idx() - 1; +} + +/** + * @brief Get cluster index of west neighbour (same row, previous column) + */ +inline uint32_t pb_cluster_west_neighbour() { + return snrt_cluster_idx() - pb_cluster_num_in_col(); +} /** * @brief Get the index of the closest memory tile - * @param cidx The cluster index - * @return Index of the closest memory tile to cidx + * @param cluster_idx The cluster index + * @return Index of the closest memory tile to cluster_idx */ -inline uint32_t pb_closest_mem_tile(uint32_t cidx) { - uint32_t row = pb_cluster_row(cidx); +inline uint32_t pb_closest_mem_tile(uint32_t cluster_idx) { + uint32_t row = pb_cluster_row_idx(cluster_idx); // e.g. with 4x4 matrix // first 8 clusters -> left column tiles 0..3 // clusters >= 8 -> right column tiles 4..7 - return (cidx < (snrt_cluster_num() / 2)) ? row : (row + PB_CLUSTER_PER_COL); + return (cluster_idx < (snrt_cluster_num() / 2)) ?
+ row : (row + PB_CLUSTER_PER_COL); } /** diff --git a/sw/snitch/runtime/src/snrt.cc b/sw/snitch/runtime/src/snrt.cc deleted file mode 120000 index 686f3cf5..00000000 --- a/sw/snitch/runtime/src/snrt.cc +++ /dev/null @@ -1 +0,0 @@ -../../../../.deps/snitch_cluster/sw/runtime/impl/snrt.cc \ No newline at end of file diff --git a/sw/snitch/tests/barrier_benchmark.c b/sw/snitch/tests/barrier_benchmark.c new file mode 100644 index 00000000..c14e57f3 --- /dev/null +++ b/sw/snitch/tests/barrier_benchmark.c @@ -0,0 +1,112 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#include "snrt.h" + +#ifndef N_ROWS +#define N_ROWS (pb_cluster_num_in_col()) +#endif + +#ifndef N_COLS +#define N_COLS (pb_cluster_num_in_row()) +#endif + +typedef enum { + SW, + HW +} impl_t; + +#ifndef IMPL +#define IMPL SW +#endif + +static inline void sw_barrier(snrt_comm_t comm) { + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. + snrt_collective_t op; + op.f.opcode = SNRT_COLLECTIVE_MULTICAST; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + volatile uint32_t *mcip_set = (uint32_t *)&(snrt_cluster()->peripheral_reg.cl_clint_set.w); + volatile uint32_t *mcip_clr = (uint32_t *)&(snrt_cluster()->peripheral_reg.cl_clint_clear.w); + uint32_t size = comm->size; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Execute barrier in submesh (first iteration to preheat I$) + for (volatile uint32_t i = 0; i < 2; i++) { + // Align start times using hardware barrier + snrt_inter_cluster_barrier(comm); + + snrt_mcycle(); + + // Perform software inter-cluster barrier. 
+ uint32_t cnt = __atomic_add_fetch(barrier_ptr, 1, __ATOMIC_RELAXED); + + // Cluster 0 polls the barrier counter, resets it for the next barrier + // and multicasts an interrupt to wake up the other clusters. Compared + // to the implementation in sync.h, this version is cache-friendly, + // i.e. successive invocations of the barrier will find a hot cache. + if (snrt_cluster_idx() == 0) { + while (*barrier_ptr != size); + *barrier_ptr = 0; + snrt_fence(); + snrt_set_awuser_low(user); + *mcip_set = 1 << snrt_cluster_compute_core_num(); + snrt_set_awuser_low(0); + } else { + snrt_wfi(); + } + // Clear interrupt for next barrier (also the sending cluster) + *mcip_clr = 1 << snrt_cluster_compute_core_num(); + snrt_mcycle(); + } +} + +static inline void hw_barrier(snrt_comm_t comm) { + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. + snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Execute barrier in submesh (first iteration to preheat I$, second to + // align start times for third iteration) + for (volatile uint32_t i = 0; i < 3; i++) { + snrt_mcycle(); + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. 
+ snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + snrt_mcycle(); + } +} + +int main (void) { + + // Create communicator for submesh + snrt_comm_t comm; + pb_create_mesh_comm(&comm, N_ROWS, N_COLS); + + if (snrt_is_dm_core() && comm->is_participant) { + // Execute barrier in submesh + if (IMPL == SW) { + sw_barrier(comm); + } else { + hw_barrier(comm); + } + } + + // Non-participating clusters wait on a global barrier + snrt_global_barrier(); + + return 0; +} diff --git a/sw/snitch/tests/dma_multicast.c b/sw/snitch/tests/dma_multicast.c deleted file mode 100644 index 5583de48..00000000 --- a/sw/snitch/tests/dma_multicast.c +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright 2023 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -// -// Luca Colagrande -// -// This code tests the multicast feature of the wide interconnect. -// It uses the DMA to broadcast chunks of data to different clusters -// in the Picobello system. - -#include -#include "pb_addrmap.h" -#include "snrt.h" - -#define INITIALIZER 0xAAAAAAAA - -#ifndef LENGTH -#define LENGTH 32 -#endif - -#define LENGTH_TO_CHECK 32 - -#ifndef N_CLUSTERS_TO_USE -#define N_CLUSTERS_TO_USE snrt_cluster_num() -#endif - -// TODO(lleone): Check if is okay to shift by 18. 
Probably you need to adapt it to picobello -#define BCAST_MASK_ACTIVE ((N_CLUSTERS_TO_USE - 1) << 18) -// #define BCAST_MASK_ALL ((snrt_cluster_num() - 1) << 18) - - -static inline void dma_broadcast_to_clusters(void* dst, void* src, size_t size) { - // snrt_enable_multicast(BCAST_MASK_ACTIVE); - if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { - snrt_dma_start_1d_mcast(snrt_remote_l1_ptr(dst, 0, 1), src, size, BCAST_MASK_ACTIVE); - snrt_dma_wait_all(); - } - // snrt_disable_multicast(); -} - -static inline int cluster_participates_in_bcast(int i) { - return (i < N_CLUSTERS_TO_USE); -} - -static inline void broadcast_wrapper(void* dst, void* src, size_t size) { - snrt_global_barrier(); - dma_broadcast_to_clusters(dst, src, size); - // Put clusters who don't participate in the broadcast to sleep, as if - // they proceed directly to the global barrier, they will interfere with - // the other clusters, by sending their atomics on the narrow interconnect. - if (!cluster_participates_in_bcast(snrt_cluster_idx())) { - snrt_wfi(); - snrt_int_clr_mcip(); - } - // Wake these up when cluster 0 is done - else if ((snrt_cluster_idx() == 0) && snrt_is_dm_core()) { - for (int i = 0; i < snrt_cluster_num(); i++) { - if (!cluster_participates_in_bcast(i)) - snrt_cluster()->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; - } - } -} - -int main() { - snrt_interrupt_enable(IRQ_M_CLUSTER); - // snrt_int_clr_mcip(); - - // Allocate destination buffer - uint32_t *buffer_dst = (uint32_t *)snrt_l1_next_v2(); - uint32_t *buffer_src = buffer_dst + LENGTH; - - // First cluster initializes the source buffer and multicast- - // copies it to the destination buffer in every cluster's TCDM. 
- if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { - for (uint32_t i = 0; i < LENGTH; i++) { - buffer_src[i] = INITIALIZER; - } - } - - // Initiate DMA transfer (twice to preheat the cache) - for (volatile int i = 0; i < 2; i++) { - broadcast_wrapper(buffer_dst, buffer_src, LENGTH * sizeof(uint32_t)); - } - - // All other clusters wait on a global barrier to signal the transfer - // completion. - snrt_global_barrier(); - - // Every cluster except cluster 0 checks that the data in the destination - // buffer is correct. To speed this up we only check the first 32 elements. - if (snrt_is_dm_core() && (snrt_cluster_idx() < N_CLUSTERS_TO_USE) && (snrt_cluster_idx() != 0)) { - uint32_t n_errs = LENGTH_TO_CHECK; - for (uint32_t i = 0; i < LENGTH_TO_CHECK; i++) { - if (buffer_dst[i] == INITIALIZER) n_errs--; - } - return n_errs; - } else - return 0; - } diff --git a/sw/snitch/tests/multicast_benchmark.c b/sw/snitch/tests/multicast_benchmark.c new file mode 100644 index 00000000..cc09c50f --- /dev/null +++ b/sw/snitch/tests/multicast_benchmark.c @@ -0,0 +1,329 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande +// Lorenzo Leone +// +// This code implements a function to multicast data from one memory tile +// to all clusters in the same row, or multiple rows. 
+ +#include +#include "snrt.h" + +// Transfer size in bytes +#ifndef SIZE +#define SIZE 1024 +#endif + +// Batch size in bytes +#ifndef BATCH +#define BATCH 64 +#endif + +typedef enum { + SEQ, + TREE, + HW +} impl_t; + +#ifndef IMPL +#define IMPL SEQ +#endif + +#define N_BATCHES (SIZE / BATCH) +#define N_ELEMS (SIZE / sizeof(uint32_t)) + +#ifndef LOG2_N_ROWS +#define LOG2_N_ROWS 2 +#endif + +#define N_ROWS (1 << LOG2_N_ROWS) + +static inline void dma_multicast_sequential(uintptr_t l1_buffer, + uintptr_t l3_buffer, snrt_comm_t comm) { + // Only DMA cores of clusters in the communicator continue from here + if (!snrt_is_dm_core() || !comm->is_participant) return; + + // Compute address of source buffer: + // - in memory tile for cluster 0 + // - in west neighbour for other clusters in row 0 + // - in south neighbour for clusters in other rows + uintptr_t src; + if (snrt_cluster_idx() == 0) { + src = l3_buffer; + } else if (pb_cluster_in_row(0)) { + src = (uintptr_t)snrt_remote_l1_ptr((void *)l1_buffer, + snrt_cluster_idx(), pb_cluster_west_neighbour()); + } else { + src = (uintptr_t)snrt_remote_l1_ptr((void *)l1_buffer, + snrt_cluster_idx(), pb_cluster_south_neighbour()); + } + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block.
+ snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Iterations to cover all transfers in a row + uint32_t n_iters = N_BATCHES - 1 + pb_cluster_num_in_row(); + for (uint32_t i = 0; i < n_iters; i++) { + + // Every cluster is active for N_BATCHES iterations + // starting from the iteration with i == snrt_cluster_idx() + char cluster_active = i >= pb_cluster_col_idx(); + cluster_active &= i < (pb_cluster_col_idx() + N_BATCHES); + cluster_active &= pb_cluster_in_row(0); + + // Every active cluster copies data from the left tile + if (cluster_active) { + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(l1_buffer, src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + l1_buffer += BATCH; + src += BATCH; + asm volatile ("" : "+r"(l1_buffer), "+r"(src) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. 
+ snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } + + // Iterations to cover all transfers in each column + n_iters = N_BATCHES - 1 + N_ROWS - 1; + for (uint32_t i = 0; i < n_iters; i++) { + + // Every cluster is active for N_BATCHES iterations + // starting from the iteration with (i + 1) == pb_cluster_row_idx() + char cluster_active = (i + 1) >= pb_cluster_row_idx(); + cluster_active &= (i + 1) < (pb_cluster_row_idx() + N_BATCHES); + cluster_active &= !pb_cluster_in_row(0); + + // Every active cluster copies data from the bottom tile + if (cluster_active) { + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(l1_buffer, src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + l1_buffer += BATCH; + src += BATCH; + asm volatile ("" : "+r"(l1_buffer), "+r"(src) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. + snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } +} + +static inline void dma_multicast_tree(uintptr_t l1_buffer, + uintptr_t l3_buffer, snrt_comm_t comm) { + + // Only DMA cores of clusters in the communicator continue from here + if (!snrt_is_dm_core() || !comm->is_participant) return; + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. 
+ snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Iteration 0: cluster 0 fetches data from memory tile + if (snrt_cluster_idx() == 0) { + snrt_mcycle(); + snrt_dma_start_1d(l1_buffer, l3_buffer, SIZE); + snrt_dma_wait_all(); + snrt_mcycle(); + } + + // Iterations to cover all transfers in a row + uint32_t n_levels_in_row = pb_log2_cluster_num_in_row(); + for (uint32_t i = 0; i < n_levels_in_row; i++) { + + // Determine which clusters are senders at every level of the tree + char cluster_idx_inv = pb_cluster_num_in_row() - pb_cluster_col_idx(); + char num_active_clusters = 1 << i; + char sender_stride = pb_cluster_num_in_row() / num_active_clusters; + char is_sender = ((cluster_idx_inv % sender_stride) == 0) + && pb_cluster_in_row(0); + + // Every active cluster sends the data to a cluster on the right + if (is_sender) { + + // Calculate destination + char receiver_offset = sender_stride / 2; + uintptr_t dst_cluster = pb_calculate_cluster_idx(0, + pb_cluster_col_idx() + receiver_offset); + uintptr_t dst = (uintptr_t)snrt_remote_l1_ptr((void *)l1_buffer, + snrt_cluster_idx(), dst_cluster); + + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(dst, l1_buffer, SIZE); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. 
+ snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } + + // Iterations to cover all transfers in each column + uint32_t n_levels_in_col = LOG2_N_ROWS; + for (uint32_t i = 0; i < n_levels_in_col; i++) { + + // Determine which clusters are senders at every level of the tree + char row_idx_inv = N_ROWS - pb_cluster_row_idx(); + char num_active_rows = 1 << i; + char sender_stride = N_ROWS / num_active_rows; + char is_sender = (row_idx_inv % sender_stride) == 0; + + // Every active cluster sends the data to a cluster above it + if (is_sender) { + + // Calculate destination + char receiver_offset = sender_stride / 2; + uintptr_t dst_cluster = pb_calculate_cluster_idx( + pb_cluster_row_idx() + receiver_offset, pb_cluster_col_idx()); + uintptr_t dst = (uintptr_t)snrt_remote_l1_ptr((void *)l1_buffer, + snrt_cluster_idx(), dst_cluster); + + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(dst, l1_buffer, SIZE); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. 
+ snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } +} + +static inline void dma_multicast_hw(uintptr_t l1_buffer, + uintptr_t l3_buffer, snrt_comm_t comm) { + // Only DMA core of cluster 0 continues past this point + if (!snrt_is_dm_core() || !(snrt_cluster_idx() == 0)) return; + + // Hardware multicast transfer + uint64_t mask = snrt_get_collective_mask(comm); + snrt_dma_enable_multicast(mask); + snrt_mcycle(); + snrt_dma_start_1d(l1_buffer, l3_buffer, SIZE); + snrt_dma_disable_multicast(); + snrt_dma_wait_all(); +} + +// L1 buffer of every cluster invoking this function should be +// at the same offset in the TCDM +static inline void dma_multicast(uintptr_t l1_buffer, uintptr_t l3_buffer, + snrt_comm_t comm) { + if (IMPL == SEQ) + dma_multicast_sequential(l1_buffer, l3_buffer, comm); + else if (IMPL == TREE) + dma_multicast_tree(l1_buffer, l3_buffer, comm); + else if (IMPL == HW) + dma_multicast_hw(l1_buffer, l3_buffer, comm); +} + +// Global variables for verification script +uint32_t output[N_ELEMS * N_ROWS * pb_cluster_num_in_row()]; +extern const uint32_t n_clusters = N_ROWS * pb_cluster_num_in_row(); +extern const uint32_t length = N_ELEMS * N_ROWS * pb_cluster_num_in_row(); + +int main() { + + // Allocate 4KiB-aligned buffer in every cluster + uint32_t *buffer = (uint32_t *)snrt_l1_alloc_cluster_local(SIZE, 4096); + + // Allocate 4KiB-aligned buffer in memory tile + uint32_t *l3_buffer = (uint32_t *)snrt_l3_alloc_v2(SIZE, 4096); + + // First cluster initializes the L3 buffer. + if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { + for (uint32_t i = 0; i < N_ELEMS; i++) { + l3_buffer[i] = i + 1; + } + } + + // TODO(colluca): is this needed? 
+ // Every cluster in row 0 initializes its destination buffer + if (snrt_is_dm_core() && pb_cluster_in_row(0)) { + snrt_dma_start_1d( + (uintptr_t)buffer, (uintptr_t)(snrt_cluster()->zeromem.mem), SIZE); + snrt_dma_wait_all(); + } + + // Create communicator for first N rows (all other clusters are inactive) + snrt_comm_t comm; + pb_create_mesh_comm(&comm, N_ROWS, pb_cluster_num_in_row()); + + // Only DMA cores of clusters in the first N rows continue from here + if (!snrt_is_dm_core() || !comm->is_participant) return 0; + + // Synchronize all clusters. + snrt_inter_cluster_barrier(comm); + + // Initiate multicast transfer (twice to preheat the cache) + for (volatile int i = 0; i < 2; i++) { + snrt_mcycle(); + dma_multicast((uintptr_t)buffer, (uintptr_t)l3_buffer, comm); + snrt_mcycle(); + snrt_inter_cluster_barrier(comm); + } + + // Writeback to L3 + if (snrt_is_dm_core() && (pb_cluster_row_idx() < N_ROWS)) { + uint32_t cluster_idx = pb_cluster_col_idx() + + pb_cluster_row_idx() * pb_cluster_num_in_row(); + uint32_t offset = cluster_idx * SIZE; + snrt_dma_start_1d( + (uintptr_t)output + offset, (uintptr_t)buffer, SIZE); + snrt_dma_wait_all(); + } + + return 0; +} diff --git a/sw/snitch/tests/overlapping_barriers.c b/sw/snitch/tests/overlapping_barriers.c new file mode 100644 index 00000000..d74c9bce --- /dev/null +++ b/sw/snitch/tests/overlapping_barriers.c @@ -0,0 +1,34 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande +// +// Test that multiple outstanding barriers with intersecting (or overlapping) +// participant sets do not deadlock the system. The "participant set" is +// defined as the set of clusters that participate in the barrier. 
+ +#include "snrt.h" + +int main (void) { + + // Create communicator for row 0 + snrt_comm_t comm; + pb_create_mesh_comm(&comm, 1, pb_cluster_num_in_row()); + + // Make sure row-0 clusters arrive on the row-0 barrier only after the + // other clusters have arrived on the global barrier. This ensures that + // the global barrier, arriving first, takes ownership of the router, + // preventing the row-0 clusters from ever reaching the global barrier + // and consequently deadlocking the system, if multiple outstanding + // reductions are not properly supported. + if (pb_cluster_row_idx() == 0) { + for (int j = 0; j < 100; j++) snrt_nop(); + snrt_global_barrier(comm); + snrt_global_barrier(); + } else { + snrt_global_barrier(); + } + + return 0; +} diff --git a/sw/snitch/tests/parallel_row_col_barriers.c b/sw/snitch/tests/parallel_row_col_barriers.c new file mode 100644 index 00000000..acb1bd41 --- /dev/null +++ b/sw/snitch/tests/parallel_row_col_barriers.c @@ -0,0 +1,33 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande +// +// Stress test row, column and global barriers happening in rapid succession. + +#include "snrt.h" + +int main (void) { + + // Create communicators for each row and column. 
+ snrt_comm_t row_comm[pb_cluster_num_in_col()]; + snrt_comm_t col_comm[pb_cluster_num_in_row()]; + for (uint32_t r = 0; r < pb_cluster_num_in_col(); r++) { + pb_create_mesh_comm(&row_comm[r], 1, pb_cluster_num_in_row(), + r, 0); + } + for (uint32_t c = 0; c < pb_cluster_num_in_row(); c++) { + pb_create_mesh_comm(&col_comm[c], pb_cluster_num_in_col(), 1, + 0, c); + } + + // Test multiple barriers in succession (row, col and global) + for (int i = 0; i < 10; i++) { + snrt_global_barrier(row_comm[pb_cluster_row_idx()]); + snrt_global_barrier(col_comm[pb_cluster_col_idx()]); + snrt_global_barrier(); + } + + return 0; +} diff --git a/sw/snitch/tests/reduction_benchmark.c b/sw/snitch/tests/reduction_benchmark.c new file mode 100644 index 00000000..45d75a1c --- /dev/null +++ b/sw/snitch/tests/reduction_benchmark.c @@ -0,0 +1,502 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande +// Chen Wu + +#include "snrt.h" +#include "blas.h" + +// Transfer size in bytes +#ifndef SIZE +#define SIZE 1024 +#endif + +// Batch size in bytes +#ifndef BATCH +#define BATCH 64 +#endif + +typedef enum { + SEQ, + TREE, + HW +} impl_t; + +#ifndef IMPL +#define IMPL SEQ +#endif + +#define N_BATCHES (SIZE / BATCH) +#define N_ELEMS (SIZE / sizeof(double)) + +#ifndef LOG2_N_ROWS +#define LOG2_N_ROWS 0 +#endif + +#define N_ROWS (1 << LOG2_N_ROWS) + +// Replace `1` literals with this thread-local variable to prevent it from +// being allocated by the compiler in L2. +__thread double one = 1; + +static inline void global_hw_barrier(volatile uint32_t *barrier_ptr, + uint32_t user) { + + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. 
+ if (snrt_is_dm_core()) { + snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } + snrt_cluster_hw_barrier(); +} + +static inline void swap_buffers(uintptr_t *a, uintptr_t *b) { + uintptr_t temp = *a; + *a = *b; + *b = temp; +} + +static inline void dma_reduction_hw(uintptr_t src, uintptr_t dst, + snrt_comm_t comm) { + + // Create a communicator per row + snrt_comm_t row_comm[N_ROWS]; + for (int i = 0; i < N_ROWS; i++) + pb_create_mesh_comm(&row_comm[i], 1, pb_cluster_num_in_row(), i, 0, + comm); + + // Create a communicator for the first column + snrt_comm_t col_comm; + pb_create_mesh_comm(&col_comm, N_ROWS, 1, 0, 0, comm); + + if (snrt_is_dm_core() && comm->is_participant) { + uint32_t remote_cluster; + uintptr_t remote_dst; + snrt_collective_opcode_t op = SNRT_REDUCTION_FADD; + + // Reduction across rows (destination: first cluster in row) + snrt_mcycle(); + remote_cluster = pb_calculate_cluster_idx(pb_cluster_row_idx(), 0); + remote_dst = (uintptr_t)snrt_remote_l1_ptr( + (void *)dst, snrt_cluster_idx(), remote_cluster); + snrt_dma_start_1d_reduction( + remote_dst, src, SIZE, row_comm[pb_cluster_row_idx()], op); + snrt_dma_wait_all(); + snrt_mcycle(); + + // Barrier to ensure there is only one outstanding reduction in every + // router + snrt_inter_cluster_barrier(comm); + + // Reduction across first column (destination: cluster 0) + snrt_mcycle(); + if (N_ROWS > 1 && col_comm->is_participant) { + // Switch source and destination buffer + remote_dst = (uintptr_t)snrt_remote_l1_ptr( + (void *)src, snrt_cluster_idx(), 0); + snrt_dma_start_1d_reduction(remote_dst, dst, SIZE, col_comm, op); + snrt_dma_wait_all(); + } + snrt_mcycle(); + } +} + +// We need four buffers. Upon function invocation, the data must be in a. 
+static inline void dma_reduction_seq(uintptr_t a, uintptr_t b, uintptr_t c, + uintptr_t d, snrt_comm_t comm) { + if (!comm->is_participant) return; + + uint32_t col_idx = pb_cluster_col_idx(); + uint32_t row_idx = pb_cluster_row_idx(); + uintptr_t is_northernmost = pb_cluster_in_row(N_ROWS - 1); + + // Group to simplify rotating buffers + uintptr_t dst[2] = {b, d}; + + // Compute addresses of source and destination data for DMA cores. + // All clusters in col > 0 participate in the row reduction. For this + // reduction the source is in `a` and the result in `c`. + // Only the easternmost clusters send their input data (in `a`) to their + // west neighbours, all other clusters send the data they reduced in the + // previous step (in `c`). + // Clusters in col == 0 participate in the column reduction. For this + // reduction the source is in `c` and the result in `a`. + // Only the northernmost cluster sends the source data (in `c`) to its + // south neighbour, all other clusters send the data they reduced in the + // previous step (in `a`). + uintptr_t dma_src, dma_dst; + if (pb_cluster_is_easternmost() || (col_idx == 0 && !is_northernmost)) { + dma_src = a; + } else { + dma_src = c; + } + // Clusters in col > 0, participating in row reduction, send to west + // neighbours. Clusters in col == 0, participating in column reduction, + // send to south neighbours. + uint32_t dst_cluster = col_idx > 0 ? pb_cluster_west_neighbour() : + pb_cluster_south_neighbour(); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), dst_cluster); + + // Compute addresses of source and result data for compute cores + uintptr_t comp_src1, comp_src2, comp_result; + comp_src1 = a; + comp_src2 = dst[0]; + comp_result = c; + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. 
+ snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Iterations to cover all reductions in a row + uint32_t n_iters = 1 + 2 * (pb_cluster_num_in_row() - 2) + N_BATCHES; + for (uint32_t i = 0; i < n_iters; i++) { + + uint32_t col_idx_inv, base_iter; + char dma_active, compute_active; + + // Determine which clusters need to send data at every iteration + col_idx_inv = pb_cluster_num_in_row() - col_idx - 1; + base_iter = 2 * col_idx_inv; + dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + dma_active &= !pb_cluster_in_col(0); + + // Determine which clusters need to reduce data at every iteration + base_iter = 1 + 2 * (col_idx_inv - 1); + compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + compute_active &= !pb_cluster_is_easternmost(); + + if (compute_active && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1, + (double *)comp_src2, (double *)comp_result); + comp_src1 += BATCH; + swap_buffers(&dst[0], &dst[1]); + comp_src2 = dst[0]; + comp_result += BATCH; + snrt_mcycle(); + } + + if (dma_active && snrt_is_dm_core()) { + + // Start DMA + snrt_mcycle(); + snrt_dma_start_1d(dma_dst, dma_src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + dma_src += BATCH; + swap_buffers(&dst[0], &dst[1]); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), pb_cluster_west_neighbour()); + asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + global_hw_barrier(barrier_ptr, user); + } + + // Final column reduction is performed only if there is more than one row + if (N_ROWS > 1) { + + dst[0] = b; + dst[1] = d; + 
comp_src1 = c; + comp_src2 = dst[0]; + comp_result = a; + + // Iterations to cover all reductions in the first column + n_iters = 1 + 2 * (N_ROWS - 2) + N_BATCHES; + for (uint32_t i = 0; i < n_iters; i++) { + + uint32_t row_idx_inv, base_iter; + char dma_active, compute_active; + + // Determine which clusters need to send data at every iteration + row_idx_inv = N_ROWS - row_idx - 1; + base_iter = 2 * row_idx_inv; + dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + dma_active &= pb_cluster_in_col(0); + dma_active &= snrt_cluster_idx() != 0; + + // Determine which clusters need to reduce data at every iteration + base_iter = 1 + 2 * (row_idx_inv - 1); + compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + compute_active &= pb_cluster_in_col(0); + compute_active &= !is_northernmost; + + if (compute_active && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1, + (double *)comp_src2, (double *)comp_result); + comp_src1 += BATCH; + swap_buffers(&dst[0], &dst[1]); + comp_src2 = dst[0]; + comp_result += BATCH; + snrt_mcycle(); + } + + if (dma_active && snrt_is_dm_core()) { + // Start DMA + snrt_mcycle(); + snrt_dma_start_1d(dma_dst, dma_src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + dma_src += BATCH; + swap_buffers(&dst[0], &dst[1]); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), pb_cluster_south_neighbour()); + asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + global_hw_barrier(barrier_ptr, user); + } + } +} + +static inline void dma_reduction_tree_transfer_left_phase(char is_sender, char dist, + uintptr_t src, uintptr_t dst) { + + // Every sending cluster sends the data to a cluster on the left + if (is_sender && snrt_is_dm_core()) { + + // Calculate destination + uintptr_t 
dst_cluster = pb_calculate_cluster_idx( + pb_cluster_row_idx(), pb_cluster_col_idx() - dist); + uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst, + snrt_cluster_idx(), dst_cluster); + + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(remote_dst, src, BATCH); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } +} + +static inline void dma_reduction_tree_transfer_south_phase(char is_sender, char dist, + uintptr_t src, uintptr_t dst) { + + // Every sending cluster sends the data to a cluster to the south + if (is_sender && snrt_is_dm_core()) { + + // Calculate destination + uintptr_t dst_cluster = pb_calculate_cluster_idx( + pb_cluster_row_idx() - dist, pb_cluster_col_idx()); + uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst, + snrt_cluster_idx(), dst_cluster); + + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(remote_dst, src, BATCH); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } +} + +static inline void dma_reduction_tree_compute_phase(char is_receiver, + uintptr_t src, uintptr_t dst, uintptr_t result) { + + // Every receiving cluster reduces the data from the sender + if (is_receiver && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)src, (double *)dst, + (double *)result); + snrt_mcycle(); + } + snrt_cluster_hw_barrier(); +} + +// We need four buffers. Upon function invocation, the data must be in a. +static inline void dma_reduction_tree(uintptr_t a, uintptr_t b, uintptr_t c, + uintptr_t d, snrt_comm_t comm) { + + // Only clusters in the communicator continue from here + if (!comm->is_participant) return; + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block.
+ snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Initially a is the source buffer and c is the result buffer + uintptr_t src = a; + uintptr_t dst[2] = {b, d}; + uintptr_t result = c; + + // Iterations to cover all reductions in a row + uint32_t n_levels_in_row = pb_log2_cluster_num_in_row(); + for (uint32_t i = 0; i < n_levels_in_row; i++) { + + // Determine which clusters are senders and receivers at every level + // of the tree + char dist = 1 << i; // Distance between clusters in a pair + char is_sender = (pb_cluster_col_idx() % (2*dist)) == dist; + char is_receiver = (pb_cluster_col_idx() % (2*dist)) == 0; + + // Transfer phase for first batch + dma_reduction_tree_transfer_left_phase(is_sender, dist, src, dst[0]); + global_hw_barrier(barrier_ptr, user); + + // If more than one batch is present, there will be N_BATCHES - 1 + // iterations in which we perform both transfer and compute phases + uint32_t j = 0; + for (; j < N_BATCHES - 1; j++) { + // Alternate destination buffers for every batch + uintptr_t dma_dst = dst[(j+1)%2]; + uintptr_t comp_dst = dst[j%2]; + + // Calculate pointers for current batch + uintptr_t batch_dma_src = src + (j+1) * BATCH; + uintptr_t batch_comp_src = src + j * BATCH; + uintptr_t batch_result = result + j * BATCH; + + dma_reduction_tree_transfer_left_phase(is_sender, dist, batch_dma_src, + dma_dst); + dma_reduction_tree_compute_phase(is_receiver, batch_comp_src, + comp_dst, batch_result); + global_hw_barrier(barrier_ptr, user); + } + + // Compute phase for last batch + dma_reduction_tree_compute_phase(is_receiver, src + j * BATCH, + dst[j%2], result + j * BATCH); + + // Swap source and result buffers for next iteration + swap_buffers(&src, &result); + } + + // Iterations to cover all reductions in the first column + uint32_t n_levels_in_col = 
LOG2_N_ROWS; + for (uint32_t i = 0; i < n_levels_in_col; i++) { + + // Determine which clusters are senders at every level of the tree + char dist = 1 << i; // Distance between clusters in a pair + char is_sender = (pb_cluster_row_idx() % (2*dist)) == dist && pb_cluster_in_col(0); + char is_receiver = (pb_cluster_row_idx() % (2*dist)) == 0 && pb_cluster_in_col(0); + + // Transfer phase for first batch + dma_reduction_tree_transfer_south_phase(is_sender, dist, src, dst[0]); + global_hw_barrier(barrier_ptr, user); + + // If more than one batch is present, there will be N_BATCHES - 1 + // iterations in which we perform both transfer and compute phases + uint32_t j = 0; + for (; j < N_BATCHES - 1; j++) { + // Alternate destination buffers for every batch + uintptr_t dma_dst = dst[(j+1)%2]; + uintptr_t comp_dst = dst[j%2]; + + // Calculate pointers for current batch + uintptr_t batch_dma_src = src + (j+1) * BATCH; + uintptr_t batch_comp_src = src + j * BATCH; + uintptr_t batch_result = result + j * BATCH; + + dma_reduction_tree_transfer_south_phase(is_sender, dist, batch_dma_src, + dma_dst); + dma_reduction_tree_compute_phase(is_receiver, batch_comp_src, + comp_dst, batch_result); + global_hw_barrier(barrier_ptr, user); + } + + // Compute phase for last batch + dma_reduction_tree_compute_phase(is_receiver, src + j * BATCH, + dst[j%2], result + j * BATCH); + + // Swap source and result buffers for next iteration + swap_buffers(&src, &result); + } +} + +// L1 buffer of every cluster invoking this function should be +// at the same offset in the TCDM +static inline void dma_reduction(uintptr_t a, uintptr_t b, uintptr_t c, + uintptr_t d, snrt_comm_t comm) { + if (IMPL == SEQ) + dma_reduction_seq(a, b, c, d, comm); + else if (IMPL == TREE) + dma_reduction_tree(a, b, c, d, comm); + else if (IMPL == HW) + dma_reduction_hw(a, c, comm); +} + +// Global variables for verification script +double output[N_ELEMS]; +extern const uint32_t n_clusters = N_ROWS * pb_cluster_num_in_row(); 
+extern const uint32_t length = N_ELEMS; + +int main (void){ + + // Allocate 4KiB-aligned buffers in every cluster + uintptr_t a_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE, 4096); + uintptr_t c_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE, 4096); + uintptr_t b_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096); + uintptr_t d_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096); + + // Create communicator for first N rows (all other clusters are inactive) + snrt_comm_t comm; + pb_create_mesh_comm(&comm, N_ROWS, pb_cluster_num_in_row()); + + // Only clusters in the first N rows continue from here + if (!comm->is_participant) return 0; + + // Two iterations to preheat the cache + for (volatile int i = 0; i < 2; i++){ + + // Initialize source buffer + if (snrt_is_dm_core()) { + for (uint32_t i = 0; i < N_ELEMS; i++) { + uint32_t row_major_cluster_idx = pb_cluster_col_idx() + + pb_cluster_row_idx() * pb_cluster_num_in_row(); + ((double *)a_buffer)[i] = row_major_cluster_idx + i; + } + } + snrt_global_barrier(comm); + + // Perform reduction + snrt_mcycle(); + dma_reduction(a_buffer, b_buffer, c_buffer, d_buffer, comm); + snrt_mcycle(); + snrt_global_barrier(comm); + } + + // Writeback to L3 + uintptr_t result_buffer = c_buffer; + uint32_t total_tree_levels = pb_log2_cluster_num_in_row() + LOG2_N_ROWS; + if ((IMPL == HW && N_ROWS > 1) || + (IMPL == SEQ && N_ROWS > 1) || + (IMPL == TREE && (total_tree_levels % 2) == 0)) + result_buffer = a_buffer; + if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { + snrt_dma_start_1d((uintptr_t)output, result_buffer, SIZE); + snrt_dma_wait_all(); + } +} diff --git a/sw/snitch/tests/reduction_benchmark_hyperbank.c b/sw/snitch/tests/reduction_benchmark_hyperbank.c new file mode 100644 index 00000000..747532b2 --- /dev/null +++ b/sw/snitch/tests/reduction_benchmark_hyperbank.c @@ -0,0 +1,776 @@ +// Copyright 2025 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande +// Chen Wu + +#include "snrt.h" +#include "blas.h" + +// Transfer size in bytes +#ifndef SIZE +#define SIZE 8192 +#endif + +// Batch size in bytes +#ifndef BATCH +#define BATCH 1024 +#endif + +typedef enum { + SEQ, + TREE, + HW_GENERIC, + HW_SIMPLE, +} impl_t; + +#ifndef IMPL +#define IMPL SEQ +#endif + +#define N_BATCHES (SIZE / BATCH) +#define N_ELEMS (SIZE / sizeof(double)) + +#ifndef LOG2_N_ROWS +#define LOG2_N_ROWS 2 +#endif + +#define N_ROWS (1 << LOG2_N_ROWS) + +// Replace `1` literals with this thread-local variable to prevent it from +// being allocated by the compiler in L2. +__thread double one = 1; + +static inline void global_hw_barrier(volatile uint32_t *barrier_ptr, + uint32_t user) { + + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. + if (snrt_is_dm_core()) { + snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } + snrt_cluster_hw_barrier(); +} + +static inline void swap_buffers(uintptr_t *a, uintptr_t *b) { + uintptr_t temp = *a; + *a = *b; + *b = temp; +} + +static inline void dma_reduction_hw_generic(uintptr_t src, uintptr_t dst, + snrt_comm_t comm) { + if (snrt_is_dm_core() && comm->is_participant) { + uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr( + (void *)dst, snrt_cluster_idx(), 0); + snrt_collective_opcode_t op = SNRT_REDUCTION_FADD; + snrt_dma_start_1d_reduction(remote_dst, src, SIZE, comm, op); + snrt_dma_wait_all(); + } +} + +static inline void dma_reduction_hw_simple(uintptr_t src, uintptr_t dst, + snrt_comm_t comm) { + + // Create a communicator per row + snrt_comm_t row_comm[N_ROWS]; + for (int i = 0; i < N_ROWS; i++) + pb_create_mesh_comm(&row_comm[i], 1, pb_cluster_num_in_row(), i, 0, + comm); + + // Create a communicator for the first column + 
snrt_comm_t col_comm; + pb_create_mesh_comm(&col_comm, N_ROWS, 1, 0, 0, comm); + + if (snrt_is_dm_core() && comm->is_participant) { + uint32_t remote_cluster; + uintptr_t remote_dst; + snrt_collective_opcode_t op = SNRT_REDUCTION_FADD; + + // Reduction across rows (destination: first cluster in row) + snrt_mcycle(); + remote_cluster = pb_calculate_cluster_idx(pb_cluster_row_idx(), 0); + remote_dst = (uintptr_t)snrt_remote_l1_ptr( + (void *)dst, snrt_cluster_idx(), remote_cluster); + snrt_dma_start_1d_reduction( + remote_dst, src, SIZE, row_comm[pb_cluster_row_idx()], op); + snrt_dma_wait_all(); + snrt_mcycle(); + + // Barrier to ensure there is only one outstanding reduction in every + // router + snrt_inter_cluster_barrier(comm); + + // Reduction across first column (destination: cluster 0) + snrt_mcycle(); + if (N_ROWS > 1 && col_comm->is_participant) { + // Switch source and destination buffer + remote_dst = (uintptr_t)snrt_remote_l1_ptr( + (void *)src, snrt_cluster_idx(), 0); + snrt_dma_start_1d_reduction(remote_dst, dst, SIZE, col_comm, op); + snrt_dma_wait_all(); + } + snrt_mcycle(); + } +} + +static inline void dma_reduction_seq_normal(uintptr_t a, uintptr_t b, uintptr_t c, + uintptr_t d, snrt_comm_t comm) { + if (!comm->is_participant) return; + + uint32_t col_idx = pb_cluster_col_idx(); + uint32_t row_idx = pb_cluster_row_idx(); + uintptr_t is_northernmost = pb_cluster_in_row(N_ROWS - 1); + + // Group to simplify rotating buffers + uintptr_t dst[2] = {b, d}; + + // Compute addresses of source and destination data for DMA cores. + // All clusters in col > 0 participate in the row reduction. For this + // reduction the source is in `a` and the result in `c`. + // Only the easternmost clusters send their input data (in `a`) to their + // west neighbours, all other clusters send the data they reduced in the + // previous step (in `c`). + // Clusters in col == 0 participate in the column reduction. 
For this + // reduction the source is in `c` and the result in `a`. + // Only the northernmost cluster sends the source data (in `c`) to its + // south neighbour, all other clusters send the data they reduced in the + // previous step (in `a`). + uintptr_t dma_src, dma_dst; + if (pb_cluster_is_easternmost() || (col_idx == 0 && !is_northernmost)) { + dma_src = a; + } else { + dma_src = c; + } + // Clusters in col > 0, participating in row reduction, send to west + // neighbours. Clusters in col == 0, participating in column reduction, + // send to south neighbours. + uint32_t dst_cluster = col_idx > 0 ? pb_cluster_west_neighbour() : + pb_cluster_south_neighbour(); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), dst_cluster); + + // Compute addresses of source and result data for compute cores + uintptr_t comp_src1, comp_src2, comp_result; + comp_src1 = a; + comp_src2 = dst[0]; + comp_result = c; + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. 
+ snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Iterations to cover all reductions in a row + uint32_t n_iters = 1 + 2 * (pb_cluster_num_in_row() - 2) + N_BATCHES; + for (uint32_t i = 0; i < n_iters; i++) { + + uint32_t col_idx_inv, base_iter; + char dma_active, compute_active; + + // Determine which clusters need to send data at every iteration + col_idx_inv = pb_cluster_num_in_row() - col_idx - 1; + base_iter = 2 * col_idx_inv; + dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + dma_active &= !pb_cluster_in_col(0); + + // Determine which clusters need to reduce data at every iteration + base_iter = 1 + 2 * (col_idx_inv - 1); + compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + compute_active &= !pb_cluster_is_easternmost(); + + if (compute_active && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1, + (double *)comp_src2, (double *)comp_result); + comp_src1 += BATCH; + swap_buffers(&dst[0], &dst[1]); + comp_src2 = dst[0]; + comp_result += BATCH; + snrt_mcycle(); + } + + if (dma_active && snrt_is_dm_core()) { + + // Start DMA + snrt_mcycle(); + snrt_dma_start_1d(dma_dst, dma_src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + dma_src += BATCH; + swap_buffers(&dst[0], &dst[1]); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), pb_cluster_west_neighbour()); + asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + global_hw_barrier(barrier_ptr, user); + } + + // Final column reduction is performed only if there is more than one row + if (N_ROWS > 1) { + + dst[0] = b; + dst[1] = d; + 
comp_src1 = c; + comp_src2 = dst[0]; + comp_result = a; + + // Iterations to cover all reductions in the first column + n_iters = 1 + 2 * (N_ROWS - 2) + N_BATCHES; + for (uint32_t i = 0; i < n_iters; i++) { + + uint32_t row_idx_inv, base_iter; + char dma_active, compute_active; + + // Determine which clusters need to send data at every iteration + row_idx_inv = N_ROWS - row_idx - 1; + base_iter = 2 * row_idx_inv; + dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + dma_active &= pb_cluster_in_col(0); + dma_active &= snrt_cluster_idx() != 0; + + // Determine which clusters need to reduce data at every iteration + base_iter = 1 + 2 * (row_idx_inv - 1); + compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + compute_active &= pb_cluster_in_col(0); + compute_active &= !is_northernmost; + + if (compute_active && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1, + (double *)comp_src2, (double *)comp_result); + comp_src1 += BATCH; + swap_buffers(&dst[0], &dst[1]); + comp_src2 = dst[0]; + comp_result += BATCH; + snrt_mcycle(); + } + + if (dma_active && snrt_is_dm_core()) { + // Start DMA + snrt_mcycle(); + snrt_dma_start_1d(dma_dst, dma_src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + dma_src += BATCH; + swap_buffers(&dst[0], &dst[1]); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), pb_cluster_south_neighbour()); + asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + global_hw_barrier(barrier_ptr, user); + } + } +} + + +// We need six buffers (a and c are each split in two). Upon function invocation, the data must be in a0 and a1.
+static inline void dma_reduction_seq_hyperbank(uintptr_t a0, uintptr_t a1, + uintptr_t b, + uintptr_t c0, uintptr_t c1, + uintptr_t d, + snrt_comm_t comm) { + if (!comm->is_participant) return; + + uint32_t col_idx = pb_cluster_col_idx(); + uint32_t row_idx = pb_cluster_row_idx(); + uintptr_t is_northernmost = pb_cluster_in_row(N_ROWS - 1); + + // Group to simplify rotating buffers + uintptr_t dst[2] = {b, d}; + uintptr_t a_ptr[2] = {a0, a1}; + uintptr_t c_ptr[2] = {c0, c1}; + + // Compute addresses of source and destination data for DMA cores. + // All clusters in col > 0 participate in the row reduction. For this + // reduction the source is in `a` and the result in `c`. + // Only the easternmost clusters send their input data (in `a`) to their + // west neighbours, all other clusters send the data they reduced in the + // previous step (in `c`). + // Clusters in col == 0 participate in the column reduction. For this + // reduction the source is in `c` and the result in `a`. + // Only the northernmost cluster sends the source data (in `c`) to its + // south neighbour, all other clusters send the data they reduced in the + // previous step (in `a`). + uintptr_t dma_src, dma_dst; + if (pb_cluster_is_easternmost() || (col_idx == 0 && !is_northernmost)) { + dma_src = a_ptr[0]; + } else { + dma_src = c_ptr[0]; + } + // Clusters in col > 0, participating in row reduction, send to west + // neighbours. Clusters in col == 0, participating in column reduction, + // send to south neighbours. + uint32_t dst_cluster = col_idx > 0 ? 
pb_cluster_west_neighbour() : + pb_cluster_south_neighbour(); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), dst_cluster); + + // Compute addresses of source and result data for compute cores + uintptr_t comp_src1, comp_src2, comp_result; + comp_src1 = a_ptr[0]; + comp_src2 = dst[0]; + comp_result = c_ptr[0]; + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. + snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Iterations to cover all reductions in a row + uint32_t n_iters = 1 + 2 * (pb_cluster_num_in_row() - 2) + N_BATCHES; + for (uint32_t i = 0; i < n_iters; i++) { + + uint32_t col_idx_inv, base_iter; + char dma_active, compute_active; + + // Determine which clusters need to send data at every iteration + col_idx_inv = pb_cluster_num_in_row() - col_idx - 1; + base_iter = 2 * col_idx_inv; + dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + dma_active &= !pb_cluster_in_col(0); + + // Determine which clusters need to reduce data at every iteration + base_iter = 1 + 2 * (col_idx_inv - 1); + compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + compute_active &= !pb_cluster_is_easternmost(); + + if (compute_active && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1, + (double *)comp_src2, (double *)comp_result); + + // comp_src1 += BATCH; + a_ptr[0] += BATCH; + swap_buffers(&a_ptr[0], &a_ptr[1]); + comp_src1 = a_ptr[0]; + + swap_buffers(&dst[0], &dst[1]); + comp_src2 = dst[0]; + + // comp_result += BATCH; + c_ptr[0] += BATCH; + swap_buffers(&c_ptr[0], &c_ptr[1]); + comp_result = c_ptr[0]; + + snrt_mcycle(); + } + + if (dma_active && snrt_is_dm_core()) { + + // Start DMA + snrt_mcycle(); + 
snrt_dma_start_1d(dma_dst, dma_src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + // dma_src += BATCH; + if (pb_cluster_is_easternmost() || (col_idx == 0 && !is_northernmost)) { + a_ptr[0] += BATCH; + swap_buffers(&a_ptr[0], &a_ptr[1]); + dma_src = a_ptr[0]; + } else { + c_ptr[0] += BATCH; + swap_buffers(&c_ptr[0], &c_ptr[1]); + dma_src = c_ptr[0]; + } + + swap_buffers(&dst[0], &dst[1]); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), pb_cluster_west_neighbour()); + asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + global_hw_barrier(barrier_ptr, user); + } + + // Final column reduction is performed only if there is more than one row + if (N_ROWS > 1) { + + dst[0] = b; + dst[1] = d; + a_ptr[0] = a0; + a_ptr[1] = a1; + c_ptr[0] = c0; + c_ptr[1] = c1; + + comp_src1 = c_ptr[0]; + comp_src2 = dst[0]; + comp_result = a_ptr[0]; + + // Iterations to cover all reductions in the first column + n_iters = 1 + 2 * (N_ROWS - 2) + N_BATCHES; + for (uint32_t i = 0; i < n_iters; i++) { + + uint32_t row_idx_inv, base_iter; + char dma_active, compute_active; + + // Determine which clusters need to send data at every iteration + row_idx_inv = N_ROWS - row_idx - 1; + base_iter = 2 * row_idx_inv; + dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + dma_active &= pb_cluster_in_col(0); + dma_active &= snrt_cluster_idx() != 0; + + // Determine which clusters need to reduce data at every iteration + base_iter = 1 + 2 * (row_idx_inv - 1); + compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + compute_active &= pb_cluster_in_col(0); + compute_active &= !is_northernmost; + + if (compute_active && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1, + (double *)comp_src2, (double *)comp_result); + // 
comp_src1 += BATCH; + c_ptr[0] += BATCH; + swap_buffers(&c_ptr[0], &c_ptr[1]); + comp_src1 = c_ptr[0]; + + swap_buffers(&dst[0], &dst[1]); + comp_src2 = dst[0]; + + // comp_result += BATCH; + a_ptr[0] += BATCH; + swap_buffers(&a_ptr[0], &a_ptr[1]); + comp_result = a_ptr[0]; + snrt_mcycle(); + } + + if (dma_active && snrt_is_dm_core()) { + // Start DMA + snrt_mcycle(); + snrt_dma_start_1d(dma_dst, dma_src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + // dma_src += BATCH; + if (!is_northernmost) { + a_ptr[0] += BATCH; + swap_buffers(&a_ptr[0], &a_ptr[1]); + dma_src = a_ptr[0]; + } else { + c_ptr[0] += BATCH; + swap_buffers(&c_ptr[0], &c_ptr[1]); + dma_src = c_ptr[0]; + } + + swap_buffers(&dst[0], &dst[1]); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), pb_cluster_south_neighbour()); + asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + global_hw_barrier(barrier_ptr, user); + } + } +} + +static inline void dma_reduction_tree_transfer_left_phase(char is_sender, char dist, + uintptr_t src, uintptr_t dst) { + + // Every sending cluster sends the data to a cluster on the left + if (is_sender && snrt_is_dm_core()) { + + // Calculate destination + uintptr_t dst_cluster = pb_calculate_cluster_idx( + pb_cluster_row_idx(), pb_cluster_col_idx() - dist); + uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst, + snrt_cluster_idx(), dst_cluster); + + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(remote_dst, src, BATCH); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } +} + +static inline void dma_reduction_tree_transfer_south_phase(char is_sender, char dist, + uintptr_t src, uintptr_t dst) { + + // Every sending cluster sends the data to a cluster to the south + if (is_sender && snrt_is_dm_core()) { + + // Calculate
destination + uintptr_t dst_cluster = pb_calculate_cluster_idx( + pb_cluster_row_idx() - dist, pb_cluster_col_idx()); + uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst, + snrt_cluster_idx(), dst_cluster); + + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(remote_dst, src, BATCH); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } +} + +static inline void dma_reduction_tree_compute_phase(char is_receiver, + uintptr_t src, uintptr_t dst, uintptr_t result) { + + // Every receiving cluster reduces the data from the sender + if (is_receiver && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)src, (double *)dst, + (double *)result); + snrt_mcycle(); + } + snrt_cluster_hw_barrier(); +} + +// We need four buffers. Upon function invocation, the data must be in a. +static inline void dma_reduction_tree(uintptr_t a, uintptr_t b, uintptr_t c, + uintptr_t d, snrt_comm_t comm) { + + // Only clusters in the communicator continue from here + if (!comm->is_participant) return; + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. 
+ snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Initially a is the source buffer and c is the result buffer + uintptr_t src = a; + uintptr_t dst[2] = {b, d}; + uintptr_t result = c; + + // Iterations to cover all reductions in a row + uint32_t n_levels_in_row = pb_log2_cluster_num_in_row(); + for (uint32_t i = 0; i < n_levels_in_row; i++) { + + // Determine which clusters are senders and receivers at every level + // of the tree + char dist = 1 << i; // Distance between clusters in a pair + char is_sender = (pb_cluster_col_idx() % (2*dist)) == dist; + char is_receiver = (pb_cluster_col_idx() % (2*dist)) == 0; + + // Transfer phase for first batch + dma_reduction_tree_transfer_left_phase(is_sender, dist, src, dst[0]); + global_hw_barrier(barrier_ptr, user); + + // If more than one batch is present, there will be N_BATCHES - 1 + // iterations in which we perform both transfer and compute phases + uint32_t j = 0; + for (; j < N_BATCHES - 1; j++) { + // Alternate destination buffers for every batch + uintptr_t dma_dst = dst[(j+1)%2]; + uintptr_t comp_dst = dst[j%2]; + + // Calculate pointers for current batch + uintptr_t batch_dma_src = src + (j+1) * BATCH; + uintptr_t batch_comp_src = src + j * BATCH; + uintptr_t batch_result = result + j * BATCH; + + dma_reduction_tree_transfer_left_phase(is_sender, dist, batch_dma_src, + dma_dst); + dma_reduction_tree_compute_phase(is_receiver, batch_comp_src, + comp_dst, batch_result); + global_hw_barrier(barrier_ptr, user); + } + + // Compute phase for last batch + dma_reduction_tree_compute_phase(is_receiver, src + j * BATCH, + dst[j%2], result + j * BATCH); + + // Swap source and result buffers for next iteration + swap_buffers(&src, &result); + } + + // Iterations to cover all reductions in the first column + uint32_t n_levels_in_col = 
LOG2_N_ROWS; + for (uint32_t i = 0; i < n_levels_in_col; i++) { + + // Determine which clusters are senders at every level of the tree + char dist = 1 << i; // Distance between clusters in a pair + char is_sender = (pb_cluster_row_idx() % (2*dist)) == dist && pb_cluster_in_col(0); + char is_receiver = (pb_cluster_row_idx() % (2*dist)) == 0 && pb_cluster_in_col(0); + + // Transfer phase for first batch + dma_reduction_tree_transfer_south_phase(is_sender, dist, src, dst[0]); + global_hw_barrier(barrier_ptr, user); + + // If more than one batch is present, there will be N_BATCHES - 1 + // iterations in which we perform both transfer and compute phases + uint32_t j = 0; + for (; j < N_BATCHES - 1; j++) { + // Alternate destination buffers for every batch + uintptr_t dma_dst = dst[(j+1)%2]; + uintptr_t comp_dst = dst[j%2]; + + // Calculate pointers for current batch + uintptr_t batch_dma_src = src + (j+1) * BATCH; + uintptr_t batch_comp_src = src + j * BATCH; + uintptr_t batch_result = result + j * BATCH; + + dma_reduction_tree_transfer_south_phase(is_sender, dist, batch_dma_src, + dma_dst); + dma_reduction_tree_compute_phase(is_receiver, batch_comp_src, + comp_dst, batch_result); + global_hw_barrier(barrier_ptr, user); + } + + // Compute phase for last batch + dma_reduction_tree_compute_phase(is_receiver, src + j * BATCH, + dst[j%2], result + j * BATCH); + + // Swap source and result buffers for next iteration + swap_buffers(&src, &result); + } +} + +// L1 buffer of every cluster invoking this function should be +// at the same offset in the TCDM +static inline void dma_reduction(uintptr_t a0, uintptr_t a1, + uintptr_t b, + uintptr_t c0, uintptr_t c1, + uintptr_t d, + snrt_comm_t comm) { + if (IMPL == SEQ){ + if (SNRT_TCDM_HYPERBANK_NUM!=1 && SIZE != BATCH) { + dma_reduction_seq_hyperbank(a0, a1, b, c0, c1, d, comm); + } else { + dma_reduction_seq_normal(a0, b, c0, d, comm); + } + } + // else if (IMPL == TREE) + // dma_reduction_tree(a, b, c, d, comm); + else if (IMPL 
== HW_GENERIC) + dma_reduction_hw_generic(a0, c0, comm); + else if (IMPL == HW_SIMPLE) + dma_reduction_hw_simple(a0, c0, comm); +} + +// Global variables for verification script +double output[N_ELEMS]; +extern const uint32_t n_clusters = N_ROWS * pb_cluster_num_in_row(); +extern const uint32_t length = N_ELEMS; + +int main (void){ + // split a and c into two hyperbanks, even*BATCH in hyperbank 0 and odd*BATCH in hyperbank 1 + // b in hyperbank 0, d in hyperbank 1 + // Allocate 4KiB-aligned buffers in every cluster (a/c slot 1 stays 0 if unsplit) + uintptr_t a_buffer[2] = {0, 0}, c_buffer[2] = {0, 0}, b_buffer, d_buffer; + if (SNRT_TCDM_HYPERBANK_NUM!=1 && SIZE != BATCH && (IMPL == SEQ || IMPL == TREE)) { + // With more than one hyperbank and SIZE != BATCH, only half of a and c is + // allocated here; the other half sits SNRT_TCDM_HYPERBANK_SIZE bytes above + a_buffer[0] = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE / 2, 4096); + a_buffer[1] = a_buffer[0] + SNRT_TCDM_HYPERBANK_SIZE; + c_buffer[0] = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE / 2, 4096); + c_buffer[1] = c_buffer[0] + SNRT_TCDM_HYPERBANK_SIZE; + b_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096); + d_buffer = b_buffer + SNRT_TCDM_HYPERBANK_SIZE; + } + else { + a_buffer[0] = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE, 4096); + c_buffer[0] = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE, 4096); + b_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096); + d_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096); + } + + // Create communicator for first N rows (all other clusters are inactive) + snrt_comm_t comm; + pb_create_mesh_comm(&comm, N_ROWS, pb_cluster_num_in_row()); + + // Only clusters in the first N rows continue from here + if (!comm->is_participant) return 0; + + // Two iterations to preheat the cache + for (volatile int i = 0; i < 2; i++){ + + // Initialize source buffer + if (snrt_is_dm_core()) { + for (uint32_t i = 0; i < N_ELEMS; i++) { + uint32_t row_major_cluster_idx = pb_cluster_col_idx() + + pb_cluster_row_idx() *
pb_cluster_num_in_row(); + if (SNRT_TCDM_HYPERBANK_NUM!=1 && SIZE != BATCH && (IMPL == SEQ || IMPL == TREE)){ + uint32_t batch_id = i / (BATCH / sizeof(double)); + uint32_t buffer_idx = batch_id % 2; + uint32_t buffer_offset = (batch_id / 2) * (BATCH / sizeof(double)) + (i % (BATCH / sizeof(double))); + *((double *)(a_buffer[buffer_idx]) + buffer_offset) = row_major_cluster_idx + i; + + } + else { + ((double *)a_buffer[0])[i] = row_major_cluster_idx + i; + } + // ((double *)a_buffer)[i] = row_major_cluster_idx + i; + } + } + snrt_global_barrier(comm); + + // Perform reduction + snrt_mcycle(); + dma_reduction(a_buffer[0], a_buffer[1], b_buffer, c_buffer[0], c_buffer[1], d_buffer, comm); + snrt_mcycle(); + snrt_global_barrier(comm); + } + + // Writeback to L3 + uintptr_t result_buffer = c_buffer[0]; + uint32_t total_tree_levels = pb_log2_cluster_num_in_row() + LOG2_N_ROWS; + if ((IMPL == HW_SIMPLE && N_ROWS > 1) || + (IMPL == SEQ && N_ROWS > 1) || + (IMPL == TREE && (total_tree_levels % 2) == 0)) + result_buffer = a_buffer[0]; + if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { + if (SNRT_TCDM_HYPERBANK_NUM!=1 && SIZE != BATCH && (IMPL == SEQ || IMPL == TREE)){ + snrt_dma_1d_to_2d((void *)output, + (void *)result_buffer, + SIZE / 2, + BATCH, + BATCH * SNRT_TCDM_HYPERBANK_NUM); + snrt_dma_1d_to_2d((void *)((uintptr_t)output + BATCH), + (void *)(result_buffer + SNRT_TCDM_HYPERBANK_SIZE), + SIZE / 2, + BATCH, + BATCH * SNRT_TCDM_HYPERBANK_NUM); + } + else { + snrt_dma_start_1d((uintptr_t)output, result_buffer, SIZE); + } + snrt_dma_wait_all(); + } +} diff --git a/sw/sw.mk b/sw/sw.mk index 012435e1..a743aebe 100644 --- a/sw/sw.mk +++ b/sw/sw.mk @@ -22,15 +22,20 @@ PB_GEN_DIR = $(PB_ROOT)/.generated ## Snitch Cluster ## #################### -SN_RUNTIME_SRCDIR = $(PB_SNITCH_SW_DIR)/runtime/src +SN_RUNTIME_SRCDIR = $(PB_SNITCH_SW_DIR)/runtime/impl SN_RUNTIME_BUILDDIR = $(PB_SNITCH_SW_DIR)/runtime/build -SN_TESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/tests/build 
-SN_RVTESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/riscv-tests/build
 SN_RUNTIME_INCDIRS = $(PB_INCDIR)
 SN_RUNTIME_INCDIRS += $(PB_GEN_DIR)
+SN_RUNTIME_INCDIRS += $(PB_SNITCH_SW_DIR)/runtime/src
 SN_RUNTIME_HAL_HDRS = $(PB_GEN_DIR)/pb_addrmap.h
 SN_RUNTIME_HAL_HDRS += $(PB_GEN_DIR)/pb_raw_addrmap.h
-SN_BUILD_APPS = OFF
+
+SN_RVTESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/riscv-tests/build
+
+SN_TESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/tests/build
+SN_TESTS_INCDIRS = $(SN_ROOT)/sw/kernels/blas
+
+SN_BUILD_APPS = OFF
 
 SN_APPS = $(PB_SNITCH_SW_DIR)/apps/gemm_2d
 SN_APPS += $(PB_SNITCH_SW_DIR)/apps/gemm
@@ -38,37 +43,15 @@ SN_APPS += $(PB_SNITCH_SW_DIR)/apps/axpy
 SN_APPS += $(SN_ROOT)/sw/kernels/dnn/flashattention_2
 SN_APPS += $(PB_SNITCH_SW_DIR)/apps/fused_concat_linear
 SN_APPS += $(PB_SNITCH_SW_DIR)/apps/mha
+SN_APPS += $(PB_SNITCH_SW_DIR)/apps/summa_gemm
+
+SN_TESTS = $(wildcard $(PB_SNITCH_SW_DIR)/tests/*.c)
 
 include $(SN_ROOT)/make/sw.mk
 
 $(PB_GEN_DIR)/pb_raw_addrmap.h: $(PB_RDL_ALL)
 	$(PEAKRDL) raw-header $< -o $@ $(PEAKRDL_INCLUDES) $(PEAKRDL_DEFINES) --base_name $(notdir $(basename $@)) --format c
 
-# Collect Snitch tests which should be built
-PB_SN_TESTS_DIR = $(PB_SNITCH_SW_DIR)/tests
-PB_SN_TESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/tests/build
-PB_SN_TEST_NAMES = $(basename $(notdir $(wildcard $(PB_SN_TESTS_DIR)/*.c)))
-PB_SN_TEST_ELFS = $(abspath $(addprefix $(PB_SN_TESTS_BUILDDIR)/,$(addsuffix .elf,$(PB_SN_TEST_NAMES))))
-PB_SN_TEST_DUMP = $(abspath $(addprefix $(PB_SN_TESTS_BUILDDIR)/,$(addsuffix .dump,$(PB_SN_TEST_NAMES))))
-
-.PHONY: pb-sn-tests clean-pb-sn-tests
-
-pb-sn-tests: $(PB_SN_TEST_ELFS) $(PB_SN_TEST_DUMP)
-
-clean-pb-sn-tests:
-	rm -rf $(PB_SN_TEST_ELFS)
-
-$(PB_SN_TEST_ELFS): $(PB_GEN_DIR)/pb_addrmap.h
-
-$(PB_SN_TESTS_BUILDDIR)/%.d: $(PB_SN_TESTS_DIR)/%.c | $(PB_SN_TESTS_BUILDDIR)
-	$(SN_RISCV_CXX) $(SN_TESTS_RISCV_CFLAGS) -MM -MT '$(@:.d=.elf)' -x c++ $< > $@
-
-$(PB_SN_TESTS_BUILDDIR)/%.elf: $(PB_SN_TESTS_DIR)/%.c $(SN_RUNTIME_LIB) | $(PB_SN_TESTS_BUILDDIR)
-	$(SN_RISCV_CXX) $(SN_TESTS_RISCV_CFLAGS) $(SN_TESTS_RISCV_LDFLAGS) -x c++ $< -o $@
-
-$(PB_SN_TESTS_BUILDDIR)/%.dump: $(PB_SN_TESTS_BUILDDIR)/%.elf | $(PB_SN_TESTS_BUILDDIR)
-	$(SN_RISCV_OBJDUMP) $(SN_RISCV_OBJDUMP_FLAGS) $< > $@
-
 ##############
 ## Cheshire ##
 ##############
@@ -108,6 +91,6 @@ sn-runtime-clean: sn-clean-runtime
 sn-apps-clean: sn-clean-apps
 
 .PHONY: sw sw-tests sw-clean sw-tests-clean
-sw sw-tests: chs-sw-tests sn-tests pb-sn-tests sn-apps
+sw sw-tests: chs-sw-tests sn-tests sn-apps
 
-sw-clean sw-tests-clean: chs-sw-tests-clean sn-tests-clean sn-runtime-clean clean-pb-sn-tests sn-apps-clean
+sw-clean sw-tests-clean: chs-sw-tests-clean sn-tests-clean sn-runtime-clean sn-apps-clean
diff --git a/target/sim/traces.mk b/target/sim/traces.mk
index f4773b33..4cc627c1 100644
--- a/target/sim/traces.mk
+++ b/target/sim/traces.mk
@@ -9,6 +9,10 @@ LOGS_DIR = $(SIM_DIR)/logs
 SN_SIM_DIR = $(SIM_DIR)
 include $(SN_ROOT)/make/traces.mk
 
+ifdef ROI_SPEC
+  SN_ROI_SPEC = $(ROI_SPEC)
+endif
+
 CHS_ADDR2LINE ?= $(CHS_SW_GCC_BINROOT)/riscv64-unknown-elf-addr2line
 CHS_TXT_TRACE = $(LOGS_DIR)/trace_hart_00000.txt
 CHS_ANNOTATED_TRACE = $(LOGS_DIR)/trace_hart_00000.s
@@ -24,6 +28,10 @@ traces: sn-traces chs-trace
 annotate: sn-annotate chs-annotate
 traces-clean: sn-clean-traces chs-trace-clean
 annotate-clean: sn-clean-annotate chs-annotate-clean
+# Recipe-less aliases that forward to the sn-visual-trace / sn-clean-visual-trace targets
+.PHONY: visual-trace clean-visual-trace
+visual-trace: sn-visual-trace
+clean-visual-trace: sn-clean-visual-trace
 
 chs-trace: $(CHS_TXT_TRACE)
 chs-annotate: $(CHS_ANNOTATED_TRACE)
diff --git a/target/sim/vsim/vsim.mk b/target/sim/vsim/vsim.mk
index 40e0ad30..b57e28a5 100644
--- a/target/sim/vsim/vsim.mk
+++ b/target/sim/vsim/vsim.mk
@@ -45,16 +45,16 @@ vsim-compile: $(VSIM_DIR)/compile.tcl $(PB_HW_ALL)
 	$(VSIM) -c $(VSIM_FLAGS) -do "source $<; quit"
 
 $(VSIM_DIR)/compile.tcl: $(BENDER_YML) $(BENDER_LOCK)
-	bender script vsim --compilation-mode common $(COMMON_TARGS) $(SIM_TARGS) --vlog-arg="$(VLOG_ARGS)"> $@
+	$(BENDER) script vsim --compilation-mode common $(COMMON_TARGS) $(SIM_TARGS) --vlog-arg="$(VLOG_ARGS)"> $@
 	echo 'vlog -work $(VSIM_WORK) "$(realpath $(CHS_ROOT))/target/sim/src/elfloader.cpp" -ccflags "-std=c++11"' >> $@
 
 vsim-run:
-	$(VSIM) $(VSIM_FLAGS) $(VSIM_FLAGS_GUI) $(TB_DUT) -do "log -r /*"
+	cd $(SIM_DIR) && $(VSIM) $(VSIM_FLAGS) $(VSIM_FLAGS_GUI) $(TB_DUT) -do "log -r /*"
 
 vsim-run-batch:
-	$(VSIM) -c $(VSIM_FLAGS) $(TB_DUT) -do "run -all; quit"
+	cd $(SIM_DIR) && $(VSIM) -c $(VSIM_FLAGS) $(TB_DUT) -do "run -all; quit"
 
 vsim-run-batch-verify: vsim-run-batch
 ifdef VERIFY_PY
-	$(VERIFY_PY) placeholder $(SN_BINARY) --no-ipc --memdump l2mem.bin --memaddr 0x70000000
+	cd $(SIM_DIR) && $(VERIFY_PY) placeholder $(SN_BINARY) --no-ipc --memdump l2mem.bin --memaddr 0x70000000
 endif
\ No newline at end of file