diff --git a/.gitignore b/.gitignore
index 43cd241c..8128d3ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,9 +8,10 @@
 .bender
 working_dir/
 
-# Python virtual environment
+# Python virtual environment and caches
 .cache/pip
 .venv
+__pycache__
 
 # Generated source files
 .generated
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9a98fdb8..fe12d745 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -57,6 +57,9 @@ chs-sw:
 
 vsim-compile:
   stage: build
+  extends:
+    - .cache-deps
+    - .init-env
   needs:
     - generate-rtl
   variables:
@@ -101,6 +104,29 @@ sim-vsim:
     paths:
       - transcript
 
+#####################
+# Run Experiments   #
+#####################
+
+sim-exps:
+  stage: run
+  extends:
+    - .cache-deps
+    - .init-env
+  needs:
+    - chs-sw
+    - sn-sw
+    - vsim-compile
+  parallel:
+    matrix:
+      - EXPERIMENT: multicast
+      - EXPERIMENT: reduction
+      - EXPERIMENT: barrier
+  script:
+    - cd experiments/$EXPERIMENT && ./experiments.py --ci --actions sw run -j
+  artifacts:
+    expire_in: 7 days
+
 ###################
 # Physical Design #
 ###################
diff --git a/.gitlab/common.yml b/.gitlab/common.yml
index ac4634c7..708c5cc5 100644
--- a/.gitlab/common.yml
+++ b/.gitlab/common.yml
@@ -68,7 +68,6 @@ variables:
   script:
     - make sn-tests DEBUG=ON
     - make sn-apps DEBUG=ON
-    - make pb-sn-tests
   artifacts:
     paths:
       - sw/snitch/tests/build/*.elf
diff --git a/.gitlab/sw-tests.yml b/.gitlab/sw-tests.yml
index 42cc5c1d..d1cc08eb 100644
--- a/.gitlab/sw-tests.yml
+++ b/.gitlab/sw-tests.yml
@@ -23,7 +23,7 @@
       - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/non_null_exitcode.elf, NZ_EXIT_CODE: 896 }
       - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/multicluster_atomics.elf }
       - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/mcast_barrier.elf }
-      - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/dma_multicast.elf }
+      - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/multicast_benchmark.elf }
       - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/multi_mcast.elf }
       - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/row_col_mcast.elf }
       - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/access_spm.elf }
diff --git a/Bender.lock b/Bender.lock
index 55c39087..10fa0b86 100644
--- a/Bender.lock
+++ b/Bender.lock
@@ -25,7 +25,7 @@ packages:
     - obi_peripherals
     - register_interface
   axi:
-    revision: bec548fa2a9b18cbd7531105bb1fdf481ea8ad49
+    revision: 06410c36819924e32db2afa428d244dbdbcd5d4e
     version: null
     source:
       Git: https://github.com/colluca/axi.git
@@ -129,8 +129,8 @@ packages:
     - common_cells
     - register_interface
   cluster_icache:
-    revision: 64e21ae455bbdde850c4df13bef86ea55ac42537
-    version: 0.2.0
+    revision: 1642961b9561513f283674efc2c90f24855e8d39
+    version: 0.3.1
     source:
       Git: https://github.com/pulp-platform/cluster_icache.git
     dependencies:
@@ -146,7 +146,7 @@ packages:
     dependencies:
     - common_cells
   common_cells:
-    revision: e1c09c75775c5f03eb45906d5145dbd2f5bcfb95
+    revision: ca9d577f2fbc45ec557ab4e0905bbfc154441540
     version: null
     source:
       Git: https://github.com/pulp-platform/common_cells.git
@@ -207,7 +207,7 @@ packages:
       Path: .deps/fhg_spu_cluster
     dependencies: []
   floo_noc:
-    revision: ea35a9909d60d552bbdafc7e898590f326b1898a
+    revision: 8d19d926fbc9174984a67d76f349d82ea117c443
     version: null
     source:
       Git: https://github.com/pulp-platform/FlooNoC.git
@@ -216,7 +216,15 @@ packages:
     - axi_riscv_atomics
     - common_cells
     - common_verification
+    - floo_noc_pd
+    - fpnew
     - idma
+  floo_noc_pd:
+    revision: null
+    version: null
+    source:
+      Path: working_dir/floo_noc/./pd
+    dependencies: []
   fpnew:
     revision: a8e0cba6dd50f357ece73c2c955d96efc3c6c315
     version: null
@@ -266,8 +274,8 @@ packages:
     dependencies:
     - tech_cells_generic
   idma:
-    revision: 7829f71691a62c1e2e5e3230f370f222c7a83087
-    version: null
+    revision: 28a36e5e07705549e59fc33db96ab681bc1ca88e
+    version: 0.6.5
     source:
       Git: https://github.com/pulp-platform/iDMA.git
     dependencies:
@@ -383,7 +391,7 @@ packages:
     - common_cells
     - register_interface
   snitch_cluster:
-    revision: 5d6c957c70e3824e2330d8308eb904724cd41cbe
+    revision: ecf876f5d8726c6424970c99c5e91f83aaedeacd
     version: null
     source:
       Git: https://github.com/pulp-platform/snitch_cluster.git
diff --git a/Bender.yml b/Bender.yml
index a1628a36..88dc00a7 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -9,11 +9,11 @@ package:
 
 dependencies:
   register_interface: { git: "https://github.com/pulp-platform/register_interface.git", version: "0.4.5"                                }
-  axi:                { git: "https://github.com/pulp-platform/axi.git",                version: "0.39.6"                               }
+  axi:                { git: "https://github.com/colluca/axi.git",                      rev: "multicast"                                }
   common_cells:       { git: "https://github.com/pulp-platform/common_cells.git",       rev: "snitch"                                   }
   cheshire:           { git: "https://github.com/pulp-platform/cheshire.git",           rev: "picobello"                                }
-  snitch_cluster:     { git: "https://github.com/pulp-platform/snitch_cluster.git",     rev: "5d6c957c70e3824e2330d8308eb904724cd41cbe" }
-  floo_noc:           { git: "https://github.com/pulp-platform/FlooNoC.git",            rev: "develop"                                  }
+  snitch_cluster:     { git: "https://github.com/pulp-platform/snitch_cluster.git",     rev: "main"                         }
+  floo_noc:           { git: "https://github.com/pulp-platform/FlooNoC.git",            rev: "reduction-vc-rebase"                        }
   obi:                { git: "https://github.com/pulp-platform/obi.git",                rev: "acfcd0f80c7539aa8da7821a66d9acf2074a5b4e" }
   redmule:            { git: "https://github.com/pulp-platform/redmule.git",            rev: "picobello"                                }
   hci:                { git: "https://github.com/pulp-platform/hci.git",                rev: "06fcba671e060f2e1b03b7ebe2d3e719f1557099" }
@@ -36,7 +36,7 @@ sources:
   - target: pb_gen_rtl
     files:
       # Level 0.0
-      - .generated/floo_picobello_noc_pkg.sv
+      - floo_picobello_noc_pkg_REDUCTION.sv
       - .generated/snitch_cluster_pkg.sv
       - .generated/pb_soc_regs_pkg.sv
       # Level 0.1
diff --git a/Makefile b/Makefile
index 0419dce0..cd4839c8 100644
--- a/Makefile
+++ b/Makefile
@@ -20,20 +20,23 @@ SN_ROOT   = $(shell $(BENDER) path snitch_cluster)
 FLOO_ROOT = $(shell $(BENDER) path floo_noc)
 
 # Executables
-BENDER           ?= bender -d $(PB_ROOT)
+BENDER           ?= bender --suppress W22 -d $(PB_ROOT)
 FLOO_GEN         ?= floogen
 VERIBLE_FMT      ?= verible-verilog-format
 VERIBLE_FMT_ARGS ?= --flagfile .verilog_format --inplace --verbose
 PEAKRDL          ?= peakrdl
 
 # Tiles configuration
-SN_CLUSTERS = $(shell $(FLOO_GEN) -c $(FLOO_CFG) --query endpoints.cluster.num 2>/dev/null)
-L2_TILES = $(shell $(FLOO_GEN) -c $(FLOO_CFG) --query endpoints.l2_spm.num 2>/dev/null)
+SN_CLUSTERS = $(shell $(FLOO_GEN) query -c $(FLOO_CFG) endpoints.cluster.num 2>/dev/null)
+L2_TILES = $(shell $(FLOO_GEN) query -c $(FLOO_CFG) endpoints.l2_spm.num 2>/dev/null)
 
 # Bender prerequisites
 BENDER_YML = $(PB_ROOT)/Bender.yml
 BENDER_LOCK = $(PB_ROOT)/Bender.lock
 
+$(PB_GEN_DIR):
+	mkdir -p $@
+
 ################
 # Bender flags #
 ################
@@ -45,7 +48,7 @@ SIM_TARGS += -t simulation -t test -t idma_test
 # systemRDL #
 #############
 
-PB_RDL_ALL += $(PB_GEN_DIR)/picobello.rdl
+PB_RDL_ALL += $(PB_GEN_DIR)/picobello_addrmap.rdl
 PB_RDL_ALL += $(PB_GEN_DIR)/fll.rdl $(PB_GEN_DIR)/pb_chip_regs.rdl
 PB_RDL_ALL += $(PB_GEN_DIR)/snitch_cluster.rdl
 PB_RDL_ALL += $(wildcard $(PB_ROOT)/cfg/rdl/*.rdl)
@@ -58,14 +61,14 @@ $(PB_GEN_DIR)/pb_soc_regs.sv: $(PB_GEN_DIR)/pb_soc_regs_pkg.sv
 $(PB_GEN_DIR)/pb_soc_regs_pkg.sv: $(PB_ROOT)/cfg/rdl/pb_soc_regs.rdl
 	$(PEAKRDL) regblock $< -o $(PB_GEN_DIR) --cpuif apb4-flat --default-reset arst_n -P Num_Clusters=$(SN_CLUSTERS) -P Num_Mem_Tiles=$(L2_TILES)
 
-$(PB_GEN_DIR)/picobello.rdl: $(FLOO_CFG)
-	$(FLOO_GEN) -c $(FLOO_CFG) -o $(PB_GEN_DIR) --rdl --rdl-as-mem --rdl-memwidth=32
+$(PB_GEN_DIR)/picobello_addrmap.rdl: $(FLOO_CFG) | $(PB_GEN_DIR)  # order-only: output dir must exist first
+	$(FLOO_GEN) rdl -c $(FLOO_CFG) -o $(PB_GEN_DIR) --as-mem --memwidth=32
 
 # Those are dummy RDL files, for generation without access to the PD repository.
-$(PB_GEN_DIR)/fll.rdl $(PB_GEN_DIR)/pb_chip_regs.rdl:
+$(PB_GEN_DIR)/fll.rdl $(PB_GEN_DIR)/pb_chip_regs.rdl: | $(PB_GEN_DIR)
 	@touch $@
 
-$(PB_GEN_DIR)/pb_addrmap.h: $(PB_GEN_DIR)/picobello.rdl $(PB_RDL_ALL)
+$(PB_GEN_DIR)/pb_addrmap.h: $(PB_GEN_DIR)/picobello_addrmap.rdl $(PB_RDL_ALL)
 	$(PEAKRDL) c-header $< $(PEAKRDL_INCLUDES) $(PEAKRDL_DEFINES) -o $@ -i -b ltoh
 
 $(PB_GEN_DIR)/pb_addrmap.svh: $(PB_RDL_ALL)
@@ -130,12 +133,12 @@ endif
 
 floo-hw-all: $(PB_GEN_DIR)/floo_picobello_noc_pkg.sv
 $(PB_GEN_DIR)/floo_picobello_noc_pkg.sv: $(FLOO_CFG) | $(PB_GEN_DIR)
-	$(FLOO_GEN) -c $(FLOO_CFG) -o $(PB_GEN_DIR) --only-pkg $(FLOO_GEN_FLAGS)
+	$(FLOO_GEN) pkg -c $(FLOO_CFG) -o $(PB_GEN_DIR) $(FLOO_GEN_FLAGS)
 
 
 floo-clean:
 	rm -f $(PB_GEN_DIR)/floo_picobello_noc_pkg.sv
-	rm -f $(PB_GEN_DIR)/picobello.rdl
+	rm -f $(PB_GEN_DIR)/picobello_addrmap.rdl
 
 ###################
 # Physical Design #
@@ -172,16 +175,16 @@ clean-pd:
 
 PB_HW_ALL += $(CHS_HW_ALL)
 PB_HW_ALL += $(CHS_SIM_ALL)
-PB_HW_ALL += $(PB_GEN_DIR)/floo_picobello_noc_pkg.sv
 PB_HW_ALL += $(PB_RDL_HW_ALL)
-PB_HW_ALL += update-sn-cfg
 
-.PHONY: picobello-hw-all picobello-clean clean
+.PHONY: picobello-hw-all picobello-hw-clean all clean
+# 'all' is an alias of picobello-hw-all; both are commands, not files, hence phony.
+picobello-hw-all all: $(PB_HW_ALL) update-sn-cfg sn-hw-all floo-hw-all
 
-picobello-hw-all all: $(PB_HW_ALL) sn-hw-all
-	$(MAKE) $(PB_HW_ALL)
+picobello-hw-clean: sn-hw-clean floo-clean
+	rm -rf $(PB_HW_ALL)
 
-picobello-hw-clean clean: sn-hw-clean floo-clean
+clean: picobello-hw-clean
 	rm -rf $(BENDER_ROOT)
 
 ############
@@ -195,6 +198,7 @@ include $(PB_ROOT)/sw/sw.mk
 ##############
 
 TB_DUT = tb_picobello_top
+SIM_DIR = $(PB_ROOT)
 
 include $(PB_ROOT)/target/sim/vsim/vsim.mk
 include $(PB_ROOT)/target/sim/traces.mk
@@ -221,13 +225,14 @@ python-venv: .venv
 .venv:
 	$(BASE_PYTHON) -m venv $@
 	. $@/bin/activate && \
-	python -m pip install --upgrade pip setuptools && \
-	python -m pip install --cache-dir $(PIP_CACHE_DIR) -r requirements.txt && \
+	python -m pip install --upgrade pip "setuptools<81" && \
+	python -m pip install --no-build-isolation --cache-dir $(PIP_CACHE_DIR) -r requirements.txt && \
 	python -m pip install --cache-dir $(PIP_CACHE_DIR) $(shell $(BENDER) path floo_noc) --no-deps && \
-	python -m pip install --cache-dir $(PIP_CACHE_DIR) "$(shell $(BENDER) path snitch_cluster)[kernels]"
+	python -m pip install --cache-dir $(PIP_CACHE_DIR) -e "$(shell $(BENDER) path snitch_cluster)[kernels]"
 
 python-venv-clean:
 	rm -rf .venv
+	rm -rf $(PIP_CACHE_DIR)
 
 verible-fmt:
 	$(VERIBLE_FMT) $(VERIBLE_FMT_ARGS) $(shell $(BENDER) script flist $(SIM_TARGS) --no-deps)
diff --git a/cfg/snitch_cluster.json b/cfg/snitch_cluster.json
index 948bd2a8..7930bc7f 100644
--- a/cfg/snitch_cluster.json
+++ b/cfg/snitch_cluster.json
@@ -12,7 +12,6 @@
         addr_width: 48,
         data_width: 64,
         atomic_id_width: 5, // clog2(#clusters + 2)
-        user_width: 53, // addr_width + atomic_id_width
         tcdm: {
             size: 128,
             banks: 32,
@@ -26,13 +25,16 @@
         dma_req_fifo_depth: 8,
         narrow_trans: 4,
         wide_trans: 32,
-        dma_user_width: 48,
-        enable_multicast: true,
+        enable_narrow_collectives: true,
+        enable_wide_collectives: true,
+        enable_dca: true,
+        dca_data_width: 512,
         cluster_base_expose: true,
         alias_region_enable: true,
         alias_region_base: 0x30000000,
         num_exposed_wide_tcdm_ports: 1,
         narrow_axi_port_expose: true,
+        collective_width: 4,
         enable_external_interrupts: true,
         vm_support: false,
         // Timing parameters
diff --git a/experiments/.gitignore b/experiments/.gitignore
new file mode 100644
index 00000000..15806eca
--- /dev/null
+++ b/experiments/.gitignore
@@ -0,0 +1,6 @@
+build/
+runs/
+plots/
+results/
+data/
+hw/
\ No newline at end of file
diff --git a/experiments/barrier/__init__.py b/experiments/barrier/__init__.py
new file mode 100644
index 00000000..d01b5a8f
--- /dev/null
+++ b/experiments/barrier/__init__.py
@@ -0,0 +1,9 @@
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+from . import plot, experiments, fit
+
+__all__ = ["plot", "experiments", "fit"]
diff --git a/experiments/barrier/experiments.py b/experiments/barrier/experiments.py
new file mode 100755
index 00000000..8775c6bf
--- /dev/null
+++ b/experiments/barrier/experiments.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import math
+from pathlib import Path
+import picobello as pb
+import snitch.util.experiments.experiment_utils as eu
+from snitch.util.experiments.SimResults import SimRegion
+
+TCK = 5
+DIR = Path(__file__).parent
+
+
+class ExperimentManager(pb.ExperimentManager):
+    """Experiment manager specialization used for the barrier benchmark sweep."""
+
+    def derive_axes(self, experiment):
+        return eu.derive_axes_from_keys(experiment, ['impl', 'n_clusters'])
+
+    def derive_cdefines(self, experiment):
+        return {
+            'IMPL': experiment['impl'].upper(),
+            'N_ROWS': experiment['n_rows'],
+            'N_COLS': experiment['n_cols'],
+        }
+
+
+def gen_experiments(ci=False):
+    """Return one experiment dict per (impl, n_clusters) combination.
+    With ci=True only the 16-cluster 'hw' configuration is generated."""
+
+    if ci:
+        impls, cluster_counts = ['hw'], [16]
+    else:
+        impls, cluster_counts = ['sw', 'hw'], [2, 4, 8, 16]
+    return [
+        {
+            'app': 'barrier_benchmark',
+            'cmd': pb.sim_cmd(),
+            'impl': impl,
+            'n_clusters': n,
+            'n_rows': int(math.ceil(n / 4)),
+            'n_cols': n % 4 or 4,
+        }
+        for impl in impls
+        for n in cluster_counts
+    ]
+
+
+def dma_core(row_idx, col_idx):
+    """Hart index 1 + 9 * cluster_idx + 8, where cluster_idx = row_idx + 4 * col_idx."""
+    return 9 * (row_idx + 4 * col_idx) + 9
+
+
+def get_total_cycles(sim_results):
+    roi = SimRegion(f'hart_{dma_core(0, 1)}', 'barrier', 1)
+    return sim_results.get_timespan(roi) // TCK
+
+
+def results(manager=None):
+    if manager is None:
+        manager = ExperimentManager(gen_experiments(), dir=DIR, parse_args=False)
+    return manager.get_results()
+
+
+def main():
+    parser = ExperimentManager.parser()
+    parser.add_argument('--ci', action='store_true',
+                        help='Reduce experiment space for CI runs')
+    args = parser.parse_args()
+    manager = ExperimentManager(gen_experiments(ci=args.ci), dir=DIR, args=args, parse_args=False)
+    manager.run()
+    df = results(manager)
+    print(df)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/experiments/barrier/fit.py b/experiments/barrier/fit.py
new file mode 100755
index 00000000..9a2f66c9
--- /dev/null
+++ b/experiments/barrier/fit.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy import stats
+from barrier import experiments
+
+BEAT_BYTES = 64
+
+
+def fit(x, y, quiet=True):
+    """Fit y = alpha + beta * x with scipy.stats.linregress and return (alpha, beta).
+
+    If quiet is False, also print the fit statistics and plot data and fitted line."""
+    res = stats.linregress(x, y)
+    alpha = res.intercept      # α
+    beta = res.slope           # β
+    r_value = res.rvalue
+    stderr_beta = res.stderr
+    stderr_alpha = res.intercept_stderr
+
+    if not quiet:
+        print(f"\talpha (intercept): {alpha:.6g}")
+        print(f"\tbeta  (slope)    : {beta:.6g}")
+        print(f"\tR^2              : {r_value**2:.6g}")
+        print(f"\tSE(beta)         : {stderr_beta:.6g}")
+        print(f"\tSE(alpha)        : {stderr_alpha:.6g}")
+
+        # fit_hw/fit_sw pass the number of clusters as x, so label the axis accordingly
+        plt.figure()
+        plt.scatter(x, y, s=12)
+        xline = np.linspace(x.min(), x.max(), 200)
+        yline = alpha + beta * xline
+        plt.plot(xline, yline, linewidth=2)
+        plt.xlabel('Nr. clusters')
+        plt.ylabel('cycles')
+        plt.title("Linear fit: cycles = alpha + beta * X")
+        plt.show()
+
+    return alpha, beta
+
+
+def get_results():
+    df = experiments.results()
+    df['cycles'] = df['results'].apply(experiments.get_total_cycles)
+    return df
+
+
+def fit_hw(quiet=True):
+    if not quiet:
+        print("Hardware barrier:")
+    df = get_results()
+    df = df[df['impl'] == 'hw']
+    x = df['n_clusters'].to_numpy()
+    y = df['cycles'].to_numpy()
+    return fit(x, y, quiet)
+
+
+def fit_sw(quiet=True):
+    if not quiet:
+        print("Software barrier:")
+    df = get_results()
+    df = df[df['impl'] == 'sw']
+    x = df['n_clusters'].to_numpy()
+    y = df['cycles'].to_numpy()
+    return fit(x, y, quiet)
+
+
+def main():
+    fit_hw(quiet=False)
+    fit_sw(quiet=False)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/experiments/barrier/plot.py b/experiments/barrier/plot.py
new file mode 100755
index 00000000..dfd8341c
--- /dev/null
+++ b/experiments/barrier/plot.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import argparse
+import matplotlib.pyplot as plt
+from barrier import experiments
+
+
+def plot1(show=True):
+    """Plot runtime vs number of clusters."""
+
+    # Load results
+    df = experiments.results()
+
+    # Get actual runtimes
+    df['cycles'] = df['results'].apply(experiments.get_total_cycles)
+
+    # Plot
+    ax = df.pivot(index="n_clusters", columns="impl", values="cycles").plot(kind="bar", width=0.65)
+    ax.set_xlabel("Nr. clusters")
+    ax.set_ylabel("Runtime [cycles]")
+    ax.tick_params(axis='x', labelrotation=0)
+    ax.set_axisbelow(True)
+    ax.grid(axis="y", color="gainsboro")
+    ax.legend(handlelength=1, ncol=2, columnspacing=0.5, handletextpad=0.3)
+    plt.tight_layout()
+    if show:
+        plt.show()
+
+
+def main():
+    """Parse the plot names given on the command line and generate those plots."""
+    # Parse arguments
+    functions = [plot1]
+    parser = argparse.ArgumentParser(description='Plot barrier results.')
+    parser.add_argument(
+        'plots',
+        nargs='*',
+        default='all',  # a plain str: requested() below then matches it by substring test
+        choices=[f.__name__ for f in functions] + ['all'],
+        help='Plots to generate.'
+    )
+    args = parser.parse_args()
+
+    # Helper function to check if a plot was requested on the command line
+    def requested(plot):
+        return plot in args.plots or 'all' in args.plots
+
+    # Generate plots
+    if requested('plot1'):
+        plot1()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/experiments/barrier/roi.json.tpl b/experiments/barrier/roi.json.tpl
new file mode 100644
index 00000000..ea0dfcf1
--- /dev/null
+++ b/experiments/barrier/roi.json.tpl
@@ -0,0 +1,27 @@
+<%
+if experiment['n_clusters'] == 2:
+    clusters = [0, 4]
+elif experiment['n_clusters'] == 4:
+    clusters = [0, 4, 8, 12]
+elif experiment['n_clusters'] == 8:
+    clusters = [0, 4, 8, 12, 1, 5, 9, 13]
+elif experiment['n_clusters'] == 16:
+    clusters = list(range(16))
+%>
+
+[
+% for cluster in clusters:
+    {
+        "thread": "${f'hart_{cluster * 9 + 8 + 1}'}",
+        "roi": [
+    % if experiment['impl'] == 'sw':
+            {"idx": 1, "label": "barrier"},
+            {"idx": 3, "label": "barrier"},
+    % elif experiment['impl'] == 'hw':
+            {"idx": 3, "label": "barrier"},
+            {"idx": 5, "label": "barrier"},
+    % endif
+        ]
+    },
+% endfor
+]
diff --git a/experiments/multicast/README.md b/experiments/multicast/README.md
new file mode 100644
index 00000000..5e615395
--- /dev/null
+++ b/experiments/multicast/README.md
@@ -0,0 +1,32 @@
+All commands are assumed to be run from this folder.
+
+To run the experiments:
+```shell
+./experiments.py --actions sw run visual-trace -j
+```
+
+To manually verify a simulation:
+```shell
+./verify.py placeholder <build_dir>/multicast_benchmark.elf --no-ipc --memdump <sim_dir>/l2mem.bin --memaddr 0x70000000
+```
+
+To manually generate the traces for a simulation:
+```shell
+make -C ../../ annotate -j DEBUG=ON SIM_DIR=<sim_dir>
+```
+
+To manually build the visual trace for a simulation:
+```shell
+make -C ../../ sn-visual-trace -j DEBUG=ON SIM_DIR=<sim_dir> SN_ROI_SPEC=<sim_dir>/roi_spec.json
+```
+
+To fit the HW multicast model to the data:
+```shell
+./fit.py
+```
+Verify that `beta=1`, then plug the fitted `alpha` values into `model.py`, which contains the runtime models for the hardware and software multicast.
+
+To plot the results (uses `model.py` behind the scenes):
+```shell
+./plot.py
+```
diff --git a/experiments/multicast/__init__.py b/experiments/multicast/__init__.py
new file mode 100644
index 00000000..73945a38
--- /dev/null
+++ b/experiments/multicast/__init__.py
@@ -0,0 +1,9 @@
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+from . import plot, experiments, fit, model
+
+__all__ = ["plot", "experiments", "fit", "model"]
diff --git a/experiments/multicast/experiments.py b/experiments/multicast/experiments.py
new file mode 100755
index 00000000..9611cfd5
--- /dev/null
+++ b/experiments/multicast/experiments.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import math
+from pathlib import Path
+import picobello as pb
+import snitch.util.experiments.experiment_utils as eu
+from snitch.util.experiments.SimResults import SimRegion
+
+N_CLUSTERS = 4
+N_CLUSTERS_PER_ROW = 4
+WIDE_DATA_WIDTH = 64  # bytes
+TCK = 5
+DIR = Path(__file__).parent
+
+
+###############
+# Experiments #
+###############
+
+class ExperimentManager(pb.ExperimentManager):
+
+    def derive_axes(self, experiment):
+        keys = ['impl', 'n_rows', 'size']
+        if experiment['impl'] == 'seq':
+            keys.append('batch')
+        return eu.derive_axes_from_keys(experiment, keys)
+
+    def derive_cdefines(self, experiment):
+        cdefs = {
+            'IMPL': experiment['impl'].upper(),
+            'LOG2_N_ROWS': int(math.log2(experiment['n_rows'])),
+            'SIZE': experiment['size'],
+            'BATCH': experiment['batch'],
+        }
+        return cdefs
+
+
+def gen_experiments(ci=False):
+    """Build the list of multicast experiment dicts; ci=True shrinks the sweep.
+    The verify script and ROI templates are located relative to this file."""
+    here = DIR.resolve()  # absolute, so the paths work from any working directory
+    experiments = []
+    impls = ['seq', 'tree', 'hw']
+    n_rows_list = [1, 2, 4]
+    sizes = [1024, 2048, 4096, 8192, 16384, 32768]
+    n_batches_list = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+    if ci:
+        impls = ['hw']
+        n_rows_list = [4]
+        sizes = [32768]
+        n_batches_list = [1024, 2048]
+    for impl in impls:
+        for n_rows in n_rows_list:
+            for size in sizes:
+                for n_batches in n_batches_list:
+
+                    # Only sequential implementation supports batching, for all other
+                    # implementations we only accept n_batches = 1 (in CI mode: any listed value)
+                    batch_size = int(size // n_batches)
+                    batch_beats = int(batch_size // WIDE_DATA_WIDTH)
+                    if impl == 'seq':
+                        valid_n_batches = batch_beats > 8 and batch_beats < 1024
+                    else:
+                        valid_n_batches = n_batches == 1 if not ci else n_batches in n_batches_list
+
+                    # If the current setting for n_batches is valid, add the experiment
+                    if valid_n_batches:
+                        experiments.append({
+                            'impl': impl,
+                            'n_rows': n_rows,
+                            'size': size,
+                            'batch': batch_size,
+                            'app': 'multicast_benchmark',
+                            'cmd': pb.sim_and_verify_cmd(here / 'verify.py'),
+                            'roi': here / f'roi/{impl}.json.tpl',
+                        })
+    return experiments
+
+
+###########
+# Results #
+###########
+
+def dma_core(cluster_idx):
+    return f'hart_{1 + cluster_idx * 9 + 8}'
+
+
+def tree_cycles(sim_results, n_rows):
+    n_levels = int(math.log2(N_CLUSTERS_PER_ROW * n_rows) + 1)
+    start = SimRegion(dma_core(0 * N_CLUSTERS_PER_ROW), 'level 0', 0)
+    end = SimRegion(dma_core(2 * N_CLUSTERS_PER_ROW), f'level {n_levels-1}', 0)
+    return sim_results.get_timespan(start, end) // TCK
+
+
+# Cluster to cluster (C2C) transfer cycles
+def tree_c2c_cycles(sim_results):
+    roi = SimRegion(dma_core(0), 'level 1', 0)
+    return sim_results.get_timespan(roi) // TCK
+
+
+# Memory to cluster (M2C) transfer cycles
+def tree_m2c_cycles(sim_results):
+    roi = SimRegion(dma_core(0), 'level 0', 0)
+    return sim_results.get_timespan(roi) // TCK
+
+
+def seq_cycles(sim_results, n_batches, n_rows):
+    start = SimRegion(dma_core(0), 'batch 0', 1)
+    end = SimRegion(dma_core(12 + n_rows - 1), f'batch {n_batches-1}', 1)
+    return sim_results.get_timespan(start, end) // TCK
+
+
+def seq_batch_cycles(sim_results, batch_idx, cluster_idx=0):
+    start = SimRegion(dma_core(cluster_idx), f'batch {batch_idx}', 1)
+    return sim_results.get_timespan(start) // TCK
+
+
+def hw_cycles(sim_results):
+    roi = SimRegion(dma_core(0), 'transfer', 1)
+    return sim_results.get_timespan(roi) // TCK
+
+
+def results(manager=None):
+    if manager is None:
+        manager = ExperimentManager(gen_experiments(), dir=DIR, parse_args=False)
+    return manager.get_results()
+
+
+########
+# Main #
+########
+
+def main():
+
+    parser = ExperimentManager.parser()
+    parser.add_argument('--ci', action='store_true',
+                        help='Reduce experiment space for CI runs')
+    args = parser.parse_args()
+    manager = ExperimentManager(gen_experiments(ci=args.ci), dir=DIR, args=args, parse_args=False)
+    manager.run()
+    df = results(manager)
+    print(df)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/experiments/multicast/fit.py b/experiments/multicast/fit.py
new file mode 100755
index 00000000..6a1a64dc
--- /dev/null
+++ b/experiments/multicast/fit.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from scipy import stats
+from multicast import experiments
+
+pd.options.mode.copy_on_write = True
+BEAT_BYTES = 64
+
+
+def fit(x, y, quiet=True):
+    # Perform linear regression
+    res = stats.linregress(x, y)
+    alpha = res.intercept      # α
+    beta = res.slope           # β
+    r_value = res.rvalue
+    stderr_beta = res.stderr
+    stderr_alpha = res.intercept_stderr
+
+    if not quiet:
+        # Print results
+        print(f"\talpha (intercept): {alpha:.6g}")
+        print(f"\tbeta  (slope)    : {beta:.6g}")
+        print(f"\tR^2              : {r_value**2:.6g}")
+        print(f"\tSE(beta)         : {stderr_beta:.6g}")
+        print(f"\tSE(alpha)        : {stderr_alpha:.6g}")
+
+        # Plot results
+        plt.figure()
+        plt.scatter(x, y, s=12)
+        xline = np.linspace(x.min(), x.max(), 200)
+        yline = alpha + beta * xline
+        plt.plot(xline, yline, linewidth=2)
+        plt.xlabel('size [B]')
+        plt.ylabel('cycles')
+        plt.title("Linear fit: cycles = alpha + beta * X")
+        plt.show()
+
+    # Return results
+    return alpha, beta
+
+
+def fit_hw(df, quiet=True):
+    # Retrieve only single-row experiments (fitted model should generalize to multi-row)
+    df = df[df['n_rows'] == 1]
+    df['cycles'] = df['results'].apply(experiments.hw_cycles)
+
+    # Fit data
+    x = df['size'].to_numpy() / BEAT_BYTES
+    y = df['cycles'].to_numpy()
+    print("Fit transfer time for HW multicast:")
+    fit(x, y, quiet=quiet)
+
+
+def fit_seq(df, quiet=True):
+    # Retrieve 1-batch experiments
+    df['n_batches'] = df['size'] // df['batch']
+    df_no = df[df['n_batches'] == 1]
+
+    # Get cycles for non-overlapped batch (batch 0, cluster 0)
+    df_no['no_batch_cycles'] = df_no.apply(
+        lambda row: experiments.seq_batch_cycles(
+            row['results'], batch_idx=0, cluster_idx=0),
+        axis=1
+    )
+
+    # Fit non-overlapped batch data
+    x = df_no['batch'].to_numpy() / BEAT_BYTES
+    y = df_no['no_batch_cycles'].to_numpy()
+    print("Fit non-overlapped batch time for sequential multicast:")
+    fit(x, y, quiet=quiet)
+
+    # Retrieve 2-batch experiments
+    df_o = df[df['n_batches'] == 2]
+
+    # Get cycles for overlapped batch (batch 1, cluster 0)
+    df_o['o_batch_cycles'] = df_o.apply(
+        lambda row: experiments.seq_batch_cycles(
+            row['results'], batch_idx=1, cluster_idx=0),
+        axis=1
+    )
+
+    # Fit overlapped batch data
+    x = df_o['batch'].to_numpy() / BEAT_BYTES
+    y = df_o['o_batch_cycles'].to_numpy()
+    print("Fit overlapped batch time for sequential multicast:")
+    fit(x, y, quiet=quiet)
+
+
+def fit_tree(df, quiet=True):
+    # Get cycles of cluster to cluster and memory to cluster transfers
+    df['c2c_cycles'] = df['results'].apply(experiments.tree_c2c_cycles)
+    df['m2c_cycles'] = df['results'].apply(experiments.tree_m2c_cycles)
+
+    # Fit cluster to cluster data
+    x = df['size'].to_numpy() / BEAT_BYTES
+    y = df['c2c_cycles'].to_numpy()
+    print("Fit cluster-to-cluster transfer time for tree multicast:")
+    fit(x, y, quiet=quiet)
+
+    # Fit memory to cluster data
+    x = df['size'].to_numpy() / BEAT_BYTES
+    y = df['m2c_cycles'].to_numpy()
+    print("Fit memory-to-cluster transfer time for tree multicast:")
+    fit(x, y, quiet=quiet)
+
+
+def main():
+    df = experiments.results()
+
+    seq = df[df['impl'] == 'seq']
+    fit_seq(seq, quiet=False)
+
+    tree = df[df['impl'] == 'tree']
+    fit_tree(tree, quiet=False)
+
+    hw = df[df['impl'] == 'hw']
+    fit_hw(hw, quiet=False)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/experiments/multicast/model.py b/experiments/multicast/model.py
new file mode 100755
index 00000000..b67ddbaf
--- /dev/null
+++ b/experiments/multicast/model.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+from math import isqrt, sqrt, log2, ceil
+
+# N: num clusters
+# L: num beats in transfer
+# B: num beats in batch
+# delta: num synchronization overhead cycles
+
+BEAT_BYTES = 64
+SEQ_ALPHA = 30  # default `alpha` and `alpha1` of the sequential model
+DELTA = 36  # default `delta` of the sequential and tree models
+HW_ALPHA = 30  # constant term of `hw_runtime`
+TREE_M2C_ALPHA = 30  # constant term of the memory-to-cluster transfer in `tree_runtime`
+TREE_C2C_ALPHA = 17  # constant term of each cluster-to-cluster transfer in `tree_runtime`
+
+
+def nearest_divisors(divisor, dividend):
+    """Return the divisors of `dividend` nearest to `divisor`, as (lower, upper).
+
+    `lower` is the largest divisor strictly below `divisor`, `upper` the smallest
+    divisor at or above it; either is None when no such divisor exists.
+    """
+    from bisect import bisect_left
+
+    # Collect divisors in pairs (d, dividend // d), scanning d up to sqrt(dividend)
+    limit = int(isqrt(int(dividend)))
+    small = [d for d in range(1, limit + 1) if dividend % d == 0]
+    candidates = sorted(set(small) | {dividend // d for d in small})
+
+    # Index of the first candidate >= divisor (all earlier ones are < divisor)
+    pos = bisect_left(candidates, divisor)
+
+    below = candidates[pos - 1] if pos > 0 else None
+    above = candidates[pos] if pos < len(candidates) else None
+    return below, above
+
+
+# N1: num cols, N2: num rows; L and B are the transfer and batch sizes in beats
+def seq_runtime(N1, N2, L, B, delta=DELTA, alpha=SEQ_ALPHA, alpha1=SEQ_ALPHA):
+    n_batches = L // B
+    # Iterations along the row, plus along the column when there is more than one row
+    row_iters = (n_batches - 1) + (N1 - 1)
+    col_iters = (n_batches - 1) + (N2 - 1) if N2 > 1 else 0
+    return (alpha1 + B) + (row_iters + col_iters) * (alpha + B + delta)
+
+
+def optimal_batch_size(N1, N2, L, delta=DELTA, alpha=SEQ_ALPHA):
+    """Return the divisor of L (in beats) that minimizes `seq_runtime`.
+
+    The real-valued optimum is rounded to its two neighbouring divisors of L
+    and the one with the lower modelled runtime is returned.
+    """
+    n_stages = (N1 + N2 - 1) if N2 > 1 else (N1 - 1)
+    if n_stages <= 0:
+        # Single cluster: batching cannot help; also avoids dividing by zero below
+        return L
+    real_B = sqrt(L * (alpha + delta) / n_stages)
+    lower_B, upper_B = nearest_divisors(real_B, L)
+    assert (lower_B is None) or (lower_B > 0)
+    assert (upper_B is None) or (upper_B <= L)
+    if lower_B is None or upper_B is None:
+        return upper_B if lower_B is None else lower_B
+    T_lower_B = seq_runtime(N1, N2, L, lower_B, delta, alpha)
+    T_upper_B = seq_runtime(N1, N2, L, upper_B, delta, alpha)
+    return lower_B if T_lower_B < T_upper_B else upper_B
+
+
+def optimal_seq_runtime(N1, N2, L, delta=DELTA, alpha=SEQ_ALPHA, alpha1=SEQ_ALPHA):
+    B_opt = optimal_batch_size(N1, N2, L, delta, alpha)  # best batch size, a divisor of L
+    return seq_runtime(N1, N2, L, B_opt, delta, alpha, alpha1)
+
+
+def hw_runtime(N1, N2, L):
+    # Model: HW_ALPHA + L + (N1 - 1), plus (N2 - 1) only when there is
+    # more than one row
+    col_term = N2 - 1 if N2 > 1 else 0
+    return HW_ALPHA + L + N1 - 1 + col_term
+
+
+def tree_runtime(N1, N2, L, delta=DELTA):
+    m2c_cycles = TREE_M2C_ALPHA + L
+
+    # C2C alpha depends on the distance between the clusters.
+    # On our system we have two cycles latency per hop.
+    # One C2C transfer is accounted per tree level, first over N1 and then
+    # over N2, with the distance halving at every level.
+    c2c_cycles = 0
+    for n in (N1, N2):
+        for level in range(ceil(log2(n))):
+            dist = n / (2 ** (level + 1))
+            c2c_cycles += TREE_C2C_ALPHA + 2 * dist + L
+
+    sync_cycles = delta * (log2(N1 * N2) - 1)
+    return m2c_cycles + c2c_cycles + sync_cycles
+
+
+def optimal_sw_runtime(N1, N2, L, delta=DELTA, alpha=SEQ_ALPHA, alpha1=SEQ_ALPHA):
+    seq_cycles = optimal_seq_runtime(N1, N2, L, delta, alpha, alpha1)
+    tree_cycles = tree_runtime(N1, N2, L, delta)
+    return tree_cycles if tree_cycles < seq_cycles else seq_cycles
diff --git a/experiments/multicast/plot.py b/experiments/multicast/plot.py
new file mode 100755
index 00000000..d591a6b2
--- /dev/null
+++ b/experiments/multicast/plot.py
@@ -0,0 +1,442 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import argparse
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from multicast import model
+from multicast import experiments
+
+pd.options.mode.copy_on_write = True
+
+
+# Measured runtime of an experiment (None if the implementation is unknown)
+def get_actual_cycles(row):
+    if row['impl'] == 'seq':
+        n_batches = int(row['size'] // row['batch'])
+        return experiments.seq_cycles(row['results'], n_batches, row['n_rows'])
+    if row['impl'] == 'tree':
+        return experiments.tree_cycles(row['results'], row['n_rows'])
+    if row['impl'] == 'hw':
+        return experiments.hw_cycles(row['results'])
+
+
+# Get expected runtimes (model with 4 columns); raises ValueError on an unknown impl
+def get_expected_cycles(row):
+    n_rows, beats = row['n_rows'], row['size'] // model.BEAT_BYTES
+    if row['impl'] == 'seq':
+        return int(model.seq_runtime(4, n_rows, beats, row['batch'] // model.BEAT_BYTES))
+    runtime_models = {'tree': model.tree_runtime, 'hw': model.hw_runtime}
+    if row['impl'] not in runtime_models:
+        raise ValueError(f"Unknown implementation: {row['impl']!r}")
+    return int(runtime_models[row['impl']](4, n_rows, beats))
+
+
+# Keep, for every (impl, size, n_rows) group, the row with the fewest cycles
+def select_best_sequential_configuration(df):
+    best_rows = df.groupby(['impl', 'size', 'n_rows'])['cycles'].idxmin()
+    best = df.loc[best_rows].sort_values(['impl', 'n_rows', 'size'])
+    return best.reset_index(drop=True)
+
+
+def seq_runtime_curve(xmin, xmax, n_rows):
+    # Optimal sequential model runtime at every 64-byte step in [xmin, xmax + 64)
+    sizes = np.arange(xmin, xmax + 64, 64)
+    return sizes, [model.optimal_seq_runtime(4, n_rows, size // 64) for size in sizes]
+
+
+# Optimal sequential runtime curve, reduced to its monotone non-decreasing lower fit.
+def monotone_seq_runtime_curve(xmin, xmax, n_rows):
+    sizes, cycles = seq_runtime_curve(xmin, xmax, n_rows)
+    return find_monotone_lower_fit(sizes, cycles)
+
+
+# Find the monotone non-decreasing lower fit of an oscillating curve
+def find_monotone_lower_fit(x, y):
+    """Keep the points of (x, y) that no later point undercuts (y's suffix minima)."""
+    # Scan right to left with a running minimum: O(n) instead of comparing all pairs
+    kept = []
+    lowest = float('inf')
+    for i in range(len(y) - 1, -1, -1):
+        if not lowest < y[i]:
+            kept.append(i)
+        lowest = min(lowest, y[i])
+    kept.reverse()
+    x_monotone = [x[i] for i in kept]
+    y_monotone = [y[i] for i in kept]
+    return x_monotone, y_monotone
+
+
+def tree_runtime_curve(xmin, xmax, n_rows):
+    # Tree model runtime at every 64-byte step in [xmin, xmax + 64)
+    sizes = np.arange(xmin, xmax + 64, 64)
+    return sizes, [model.tree_runtime(4, n_rows, size // 64) for size in sizes]
+
+
+def hw_runtime_curve(xmin, xmax, n_rows):
+    x = np.arange(xmin, xmax + 64, 64)  # include xmax, like the seq and tree curves
+    y = [model.hw_runtime(4, n_rows, e // 64) for e in x]
+    return x, y
+
+
+def sw_runtime_curve(xmin, xmax, n_rows):
+    # Best software runtime: pointwise minimum of the seq and tree model curves,
+    # reduced to its monotone non-decreasing lower fit
+    x, y_seq = seq_runtime_curve(xmin, xmax, n_rows)
+    _, y_tree = tree_runtime_curve(xmin, xmax, n_rows)
+    return find_monotone_lower_fit(x, [min(a, b) for a, b in zip(y_seq, y_tree)])
+
+
+def plot1():
+    """Plot actual vs. expected runtime heatmaps."""
+
+    # Load results
+    df = experiments.results()
+
+    # Get only single-row sequential implementation results
+    df = df[(df['impl'] == 'seq') & (df['n_rows'] == 1)]
+
+    # Add actual and expected runtimes to the dataframe
+    df['act'] = df.apply(
+        lambda row: experiments.seq_cycles(
+            row['results'], int(row['size'] // row['batch']), row['n_rows']),
+        axis=1
+    )
+    df['exp'] = df.apply(
+        lambda row: model.seq_runtime(4, 1, row['size'] // 64, row['batch'] // 64),
+        axis=1
+    )
+
+    # Create dataframe for actual runtime heatmap (rows: size, columns: batch)
+    df_act = df.drop(columns=['exp'])
+    df_act = df_act.pivot(index='size', columns='batch', values='act')
+    sizes = df_act.index
+    batches = df_act.columns
+
+    # Create dataframe for expected runtime heatmap (rows: size, columns: batch)
+    df_exp = df.drop(columns=['act'])
+    df_exp = df_exp.pivot(index='size', columns='batch', values='exp')
+    sizes = df_exp.index  # overwrites the labels taken from df_act above
+    batches = df_exp.columns  # these labels are the ones used for both heatmaps
+
+    # Plot actual and expected heatmaps
+    fig, ax = plt.subplots(1, 2)  # ax[0]: actual, ax[1]: expected
+    cmap = mpl.colormaps['plasma']  # resampled over its 0.15-0.85 range below
+    cmap = mpl.colors.ListedColormap(cmap(np.linspace(0.15, 0.85, 128)))
+    sns.heatmap(
+        df_act,
+        cmap=cmap,
+        annot=True,
+        fmt=".0f",
+        cbar=True,
+        xticklabels=batches,
+        yticklabels=sizes,
+        ax=ax[0]
+    )
+    sns.heatmap(
+        df_exp,
+        cmap=cmap,
+        annot=True,
+        fmt=".0f",
+        cbar=True,
+        xticklabels=batches,
+        yticklabels=sizes,
+        ax=ax[1]
+    )
+    plt.tight_layout()
+    plt.show()
+
+
+def plot2(ax=None, y_label=None, hide_x_axis=False, show=True):
+    """Plot actual and expected runtimes."""
+
+    # Load results
+    df = experiments.results()
+
+    # Get expected and (best) actual runtimes
+    df['cycles'] = df.apply(get_actual_cycles, axis=1)
+    df = select_best_sequential_configuration(df)
+    df['exp_cycles'] = df.apply(get_expected_cycles, axis=1)
+
+    # Drop results column for cleaner output
+    df = df.drop(columns=['results'])
+
+    # Choose consistent colors and markers for the plots
+    colors = {"seq": "C0", "tree": "C1", "hw": "C2"}
+    markers = {"expected": "o", "actual": "x"}
+
+    # Create new axes unless the caller provided some
+    if ax is None:
+        _, ax = plt.subplots()
+
+    def plot_runtime_vs_speedup(ax, n_rows, y_label=y_label):
+        res = df[df['n_rows'] == n_rows]
+        sizes = res['size'].unique()  # assumes all impls were run on the same sizes
+        ax.scatter(
+            sizes, res[res['impl'] == 'seq']['cycles'], label='Actual',
+            marker=markers['actual'], color=colors['seq']
+        )
+        ax.scatter(
+            sizes, res[res['impl'] == 'tree']['cycles'], label='Actual',
+            marker=markers['actual'], color=colors['tree']
+        )
+        ax.scatter(
+            sizes, res[res['impl'] == 'hw']['cycles'], label='Actual',
+            marker=markers['actual'], color=colors['hw']
+        )
+
+        # # Plot expected runtimes (currently disabled)
+        # ax.scatter(
+        #     sizes, res[res['impl'] == 'seq']['exp_cycles'], label='Expected (seq)',
+        #     marker=markers['expected'], color=colors['seq']
+        # )
+        # ax.scatter(
+        #     sizes, res[res['impl'] == 'tree']['exp_cycles'], label='Expected (tree)',
+        #     marker=markers['expected'], color=colors['tree']
+        # )
+        # ax.scatter(
+        #     sizes, res[res['impl'] == 'hw']['exp_cycles'], label='Expected (hw)',
+        #     marker=markers['expected'], color=colors['hw']
+        # )
+
+        # Plot model line for sequential runtime. Find monotone non-decreasing lower fit.
+        x, y = monotone_seq_runtime_curve(sizes.min(), sizes.max(), n_rows)
+        ax.plot(x, y, label='Model  (seq)', linestyle='--', color=colors['seq'])
+
+        # Plot model line for tree runtime
+        x, y = tree_runtime_curve(sizes.min(), sizes.max(), n_rows)
+        ax.plot(x, y, label='Model  (tree)', linestyle='--', color=colors['tree'])
+
+        # Plot model line for hardware runtime
+        x, y = hw_runtime_curve(sizes.min(), sizes.max(), n_rows)
+        ax.plot(x, y, label='Model  (hw)', linestyle='--', color=colors['hw'])
+
+        if hide_x_axis:
+            ax.set_xlabel('')
+            ax.set_xticks(sizes)
+            ax.tick_params(
+                axis='x',
+                which='both',
+                bottom=False, top=False,  # hide tick marks
+                labelbottom=False         # hide labels
+            )
+        else:
+            ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes])
+            ax.set_xlabel('Size [B]')
+        ax.set_xlim(0, sizes.max() * 1.08)
+        if y_label is None:
+            y_label = 'Runtime [cycles]'
+        ax.set_ylabel(y_label)
+        ax.set_axisbelow(True)
+        ax.grid(True, linestyle='-', color='gainsboro')
+        ax.legend(ncol=2, handlelength=1.2, columnspacing=0.5, handletextpad=0.3)
+
+    plot_runtime_vs_speedup(ax, n_rows=1)  # only the single-row case is plotted
+    # plot_runtime_vs_speedup(ax[0][0], n_rows=1)
+    # plot_runtime_vs_speedup(ax[0][1], n_rows=2)
+    # plot_runtime_vs_speedup(ax[1][0], n_rows=4)
+    plt.tight_layout()
+    if show:
+        plt.show()
+
+    return df
+
+
+def plot3(ax=None, y_label=None, hide_x_axis=False, show=True):
+    """Plot runtime dependency with delta."""
+
+    # Load results
+    df = experiments.results()
+
+    # Get only hardware implementation results
+    df = df[df['impl'] == 'hw']
+
+    # Retrieve single-row experiments
+    n_rows = 1
+    df = df[df['n_rows'] == n_rows]
+
+    # Get actual runtimes
+    df['cycles'] = df['results'].apply(experiments.hw_cycles)  # NOTE(review): unused below
+
+    # Choose consistent colors and markers for the plots
+    colors = {"seq": "C1", "hw": "C2"}
+
+    # Create new axes unless the caller provided some
+    if ax is None:
+        _, ax = plt.subplots()
+
+    # Plot model lines for sequential runtime (varying the sum alpha + delta).
+    x = np.arange(df['size'].min(), df['size'].max() + 64, 64)
+    for k, alpha_delta in enumerate([*range(model.DELTA + model.SEQ_ALPHA, -1, -14), 0]):
+        alpha = alpha_delta // 2
+        delta = alpha_delta - alpha
+        x_ext = np.arange(df['size'].min(), df['size'].max() + 1024, 64)  # wider than x
+        y = [model.optimal_seq_runtime(4, n_rows, e // 64, delta, alpha, model.SEQ_ALPHA)
+             for e in x_ext]
+        y_monotone = []  # monotone non-decreasing lower fit of y
+        x_monotone = []
+        for i, e in enumerate(y):
+            accept = True
+            for j in range(i, len(y)):
+                if y[j] < e:
+                    accept = False
+                    break
+            if accept and i < len(x):  # keep only points within x
+                y_monotone.append(y[i])
+                x_monotone.append(x[i])
+        ax.plot(x_monotone, y_monotone, label='seq$(\\alpha_i+\\delta)$' if k == 0 else None,
+                color=colors['seq'])
+        ax.annotate(
+            f'${alpha_delta}$',
+            xy=(x[-1], y_monotone[-1]),
+            xytext=(1, 0), textcoords='offset points',  # nudge right
+            ha='left', va='center', clip_on=False)
+
+    # Plot model line for hardware runtime
+    y = [model.hw_runtime(4, n_rows, e // 64) for e in x]
+    ax.plot(x, y, label='hw', linestyle='--', color=colors['hw'])
+
+    # Configure plot
+    sizes = df['size']
+    if hide_x_axis:
+        ax.set_xlabel('')
+        ax.set_xticks(sizes)
+        ax.tick_params(
+            axis='x',
+            which='both',
+            bottom=False, top=False,  # hide tick marks
+            labelbottom=False         # hide labels
+        )
+    else:
+        ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes])
+        ax.set_xlabel('Size [B]')
+    ax.set_xlim(0, sizes.max() * 1.08)
+    if y_label is None:
+        y_label = 'Runtime [cycles]'
+    ax.set_ylabel(y_label)
+    ax.set_axisbelow(True)
+    ax.grid(True, linestyle='-', color='gainsboro')
+    ax.legend()
+
+    plt.tight_layout()
+    if show:
+        plt.show()
+
+
+def plot4(ax=None, y_label=None, show=True):
+    """Plot actual and expected runtimes, for multiple number of rows."""
+
+    # Load results
+    df = experiments.results()
+
+    # Get expected and (best) actual runtimes
+    df['cycles'] = df.apply(get_actual_cycles, axis=1)
+    df = select_best_sequential_configuration(df)
+    df['exp_cycles'] = df.apply(get_expected_cycles, axis=1)
+
+    # Drop results column for cleaner output
+    df = df.drop(columns=['results'])
+
+    # Choose consistent colors for the plots
+    colors = {"sw": "C0", "hw": "C2"}
+
+    # Create new axes unless the caller provided some
+    if ax is None:
+        _, ax = plt.subplots()
+    sizes = df['size'].unique()
+
+    def plot_runtime_vs_speedup(ax, n_rows, show_label=False):
+        res = df[df['n_rows'] == n_rows]
+        best_sw_cycles = res[res['impl'].isin(['seq', 'tree'])].groupby('size')['cycles'].min()
+        ax.scatter(
+            sizes, best_sw_cycles, label='Actual' if show_label else None,
+            marker='x', color=colors['sw']
+        )
+        ax.scatter(
+            sizes, res[res['impl'] == 'hw']['cycles'], label='Actual' if show_label else None,
+            marker='x', color=colors['hw']
+        )
+
+        # Plot model line for best software implementation
+        x, y = sw_runtime_curve(sizes.min(), sizes.max(), n_rows)
+        ax.plot(
+            x, y, label='Model  (sw)' if show_label else None,
+            linestyle='--', color=colors['sw'])
+
+        # Annotate sw model lines with number of rows, in correspondence with actual runtime
+        ax.annotate(
+            f'r={n_rows}',
+            xy=(x[-1], best_sw_cycles[sizes.max()]),
+            xytext=(sizes.max() * 0.0001, 0), textcoords='offset points',  # nudge right
+            ha='left', va='center', clip_on=False)
+
+        # Plot model line for hardware runtime
+        x, y = hw_runtime_curve(sizes.min(), sizes.max(), n_rows)
+        ax.plot(
+            x, y, label='Model  (hw)' if show_label else None, linestyle='--', color=colors['hw'])
+        if n_rows == 1:
+            ax.annotate(
+                'r=1,2,4',
+                xy=(x[-1], res[(res['impl'] == 'hw') & (res['size'] == sizes.max())]['cycles']),
+                xytext=(0, 5), textcoords='offset points',  # nudge up
+                ha='center', va='center', clip_on=False)
+
+    for i, n_rows in enumerate(df['n_rows'].unique()):
+        plot_runtime_vs_speedup(ax, n_rows=n_rows, show_label=(i == 0))
+
+    # Configure plot
+    sizes = df['size'].unique()
+    ax.set_xlim(0, sizes.max() * 1.08)
+    ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes])
+    ax.set_xlabel('Size [B]')
+    if y_label is None:
+        y_label = 'Runtime [cycles]'
+    ax.set_ylabel(y_label)
+    ax.set_axisbelow(True)
+    ax.grid(True, linestyle='-', color='gainsboro')
+    ax.legend(ncol=2, handlelength=1.2, columnspacing=0.5, handletextpad=0.3)
+
+    plt.tight_layout()
+    if show:
+        plt.show()
+
+    return df
+
+
+def main():
+    """Generate the plots named on the command line (all of them by default)."""
+
+    plotters = [plot1, plot2, plot3, plot4]
+    names = [plotter.__name__ for plotter in plotters]
+
+    # Parse arguments
+    parser = argparse.ArgumentParser(description='Plot wide multicast results.')
+    parser.add_argument(
+        'plots',
+        nargs='*',
+        default='all',
+        choices=names + ['all'],
+        help='Plots to generate.'
+    )
+    args = parser.parse_args()
+
+    # With no positional argument, args.plots is the default string 'all' rather
+    # than a list of names; the membership tests below give the right answer in
+    # both cases.
+    plot_all = 'all' in args.plots
+
+    # Generate the requested plots, in definition order
+    for plotter in plotters:
+        if plot_all or plotter.__name__ in args.plots:
+            plotter()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/experiments/multicast/roi/hw.json.tpl b/experiments/multicast/roi/hw.json.tpl
new file mode 100644
index 00000000..b1cd5567
--- /dev/null
+++ b/experiments/multicast/roi/hw.json.tpl
@@ -0,0 +1,14 @@
+[
+    {
+        // DMA core of cluster 0
+        "thread": "${f'hart_9'}",
+        "roi": [
+            // First iteration
+            {"idx": 1, "label": "init"},
+            {"idx": 2, "label": "transfer"},
+            // Second iteration
+            {"idx": 4, "label": "init"},
+            {"idx": 5, "label": "transfer"},
+        ]
+    },
+]
diff --git a/experiments/multicast/roi/seq.json.tpl b/experiments/multicast/roi/seq.json.tpl
new file mode 100644
index 00000000..876752fd
--- /dev/null
+++ b/experiments/multicast/roi/seq.json.tpl
@@ -0,0 +1,29 @@
+<%
+num_batches = experiment['size'] // experiment['batch']
+%>
+
+[
+% for row in range(0, experiment['n_rows']):
+    % for col in range(0, 4):
+    {
+        // DMA cores
+        "thread": "${f'hart_{(col * 4 + row) * 9 + 8 + 1}'}",
+        "roi": [
+            // First iteration
+            {"idx": 1, "label": "init"},
+        % for batch in range(num_batches):
+            {"idx": ${1 + batch * 2 + 1}, "label": "${f'batch {batch}'}"},
+            // {"idx": ${1 + batch * 2 + 2}, "label": "${f'sync {batch}'}"},
+        % endfor
+            {"idx": ${1 + num_batches * 2 + 1}, "label": "barrier"},
+            // Second iteration
+            {"idx": ${3 + num_batches * 2}, "label": "init"},
+        % for batch in range(num_batches):
+            {"idx": ${3 + num_batches * 2 + batch * 2 + 1}, "label": "${f'batch {batch}'}"},
+            // {"idx": ${3 + num_batches * 2 + batch * 2 + 2}, "label": "${f'sync {batch}'}"},
+        % endfor
+        ]
+    },
+    % endfor
+% endfor
+]
diff --git a/experiments/multicast/roi/tree.json.tpl b/experiments/multicast/roi/tree.json.tpl
new file mode 100644
index 00000000..7095181c
--- /dev/null
+++ b/experiments/multicast/roi/tree.json.tpl
@@ -0,0 +1,162 @@
+% if experiment['n_rows'] == 1:
+[
+    {
+        "thread": "${f'hart_{0 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from memory tile to cluster 0
+            {"idx": 10, "label": "level 0"},
+
+            // Transfer from cluster 0 to cluster 8
+            {"idx": 12, "label": "level 1"},
+
+            // Transfer from cluster 0 to cluster 4
+            {"idx": 14, "label": "level 2"},
+
+        ]
+    },
+
+    {
+        "thread": "${f'hart_{8 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from cluster 8 to cluster 12
+            {"idx": 6, "label": "level 2"}
+        ]
+    },
+]
+% elif experiment['n_rows'] == 2:
+[
+    {
+        "thread": "${f'hart_{0 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from memory tile to cluster 0
+            {"idx": 12, "label": "level 0"},
+
+            // Transfer from cluster 0 to cluster 8
+            {"idx": 14, "label": "level 1"},
+
+            // Transfer from cluster 0 to cluster 4
+            {"idx": 16, "label": "level 2"},
+
+            // Transfer from cluster 0 to cluster 1
+            {"idx": 18, "label": "level 3"},
+        ]
+    },
+
+    {
+        "thread": "${f'hart_{8 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from cluster 8 to cluster 12
+            {"idx": 8, "label": "level 2"},
+
+            // Transfer from cluster 8 to cluster 9
+            {"idx": 10, "label": "level 3"}
+        ]
+    },
+
+    {
+        "thread": "${f'hart_{4 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from cluster 4 to cluster 5
+            {"idx": 6, "label": "level 3"}
+        ]
+    },
+
+    {
+        "thread": "${f'hart_{12 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from cluster 12 to cluster 13
+            {"idx": 6, "label": "level 3"}
+        ]
+    },
+]
+% elif experiment['n_rows'] == 4:
+[
+    {
+        "thread": "${f'hart_{0 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from memory tile to cluster 0
+            {"idx": 14, "label": "level 0"},
+
+            // Transfer from cluster 0 to cluster 8
+            {"idx": 16, "label": "level 1"},
+
+            // Transfer from cluster 0 to cluster 4
+            {"idx": 18, "label": "level 2"},
+
+            // Transfer from cluster 0 to cluster 2
+            {"idx": 20, "label": "level 3"},
+
+            // Transfer from cluster 0 to cluster 1
+            {"idx": 22, "label": "level 4"},
+        ]
+    },
+
+    {
+        "thread": "${f'hart_{8 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from cluster 8 to cluster 12
+            {"idx": 10, "label": "level 2"},
+
+            // Transfer from cluster 8 to cluster 10
+            {"idx": 12, "label": "level 3"},
+
+            // Transfer from cluster 8 to cluster 9
+            {"idx": 14, "label": "level 4"}
+        ]
+    },
+
+    {
+        "thread": "${f'hart_{4 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from cluster 4 to cluster 6
+            {"idx": 8, "label": "level 3"},
+
+            // Transfer from cluster 4 to cluster 5
+            {"idx": 10, "label": "level 4"},
+        ]
+    },
+
+    {
+        "thread": "${f'hart_{12 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from cluster 12 to cluster 14
+            {"idx": 8, "label": "level 3"},
+
+            // Transfer from cluster 12 to cluster 13
+            {"idx": 10, "label": "level 4"},
+        ]
+    },
+
+    {
+        "thread": "${f'hart_{2 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from cluster 2 to cluster 3
+            {"idx": 6, "label": "level 4"},
+        ]
+    },
+
+    {
+        "thread": "${f'hart_{6 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from cluster 6 to cluster 7
+            {"idx": 6, "label": "level 4"}
+        ]
+    },
+
+    {
+        "thread": "${f'hart_{10 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from cluster 10 to cluster 11
+            {"idx": 6, "label": "level 4"},
+        ]
+    },
+
+    {
+        "thread": "${f'hart_{14 * 9 + 8 + 1}'}",
+        "roi": [
+            // Transfer from cluster 14 to cluster 15
+            {"idx": 6, "label": "level 4"},
+        ]
+    },
+]
+% endif
diff --git a/experiments/multicast/verify.py b/experiments/multicast/verify.py
new file mode 100755
index 00000000..bc5a7b0a
--- /dev/null
+++ b/experiments/multicast/verify.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+#
+# Verification script for `multicast_benchmark.c`
+
+import sys
+import snitch.util.sim.verif_utils as vu
+
+
+class Verifier(vu.Verifier):
+    # Compares the 'output' symbol against n_clusters repetitions of 1, 2, 3, ...
+
+    OUTPUT_UIDS = ['output']
+
+    def get_actual_results(self):
+        output_symbol = self.OUTPUT_UIDS[0]
+        return self.get_output_from_symbol(output_symbol, 'uint32_t')
+
+    def get_expected_results(self):
+        length = int(self.get_input_from_symbol('length', 'uint32_t')[0])
+        n_clusters = int(self.get_input_from_symbol('n_clusters', 'uint32_t')[0])
+        per_cluster = list(range(1, length // n_clusters + 1))
+        return n_clusters * per_cluster
+
+    def check_results(self, *args):
+        return super().check_results(*args, atol=0)
+
+
+if __name__ == "__main__":
+    sys.exit(Verifier().main())
diff --git a/experiments/picobello.py b/experiments/picobello.py
new file mode 100644
index 00000000..7327b7f6
--- /dev/null
+++ b/experiments/picobello.py
@@ -0,0 +1,74 @@
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+from pathlib import Path
+import shutil
+import snitch.util.experiments.experiment_utils as eu
+from snitch.util.experiments import common
+
+root = Path(__file__).resolve().parents[1]
+default_work_dir = root / 'target/sim/vsim'
+
+
+def sim_cmd(work_dir=default_work_dir):  # builds the argv of the simulation command
+    return [
+        'bash', '-c',  # ${run_dir} and ${elf} stay literal in the returned string
+        f'make vsim-run-batch -C {str(root)} SIM_DIR=${{run_dir}} '
+        f'CHS_BINARY={str(root)}/sw/cheshire/tests/simple_offload.spm.elf '
+        f'SN_BINARY=${{elf}} PRELMODE=3 VSIM_DIR={str(work_dir)}; '
+        f'grep -q "] SUCCESS" ${{run_dir}}/transcript;'  # exit status comes from this grep
+    ]
+
+
+def sim_and_verify_cmd(verify_script, work_dir=default_work_dir):
+    return [
+        'bash', '-c',  # ${run_dir} and ${elf} stay literal in the returned string
+        f'make vsim-run-batch-verify -C {str(root)} SIM_DIR=${{run_dir}} '
+        f'CHS_BINARY={str(root)}/sw/cheshire/tests/simple_offload.spm.elf '
+        f'SN_BINARY=${{elf}} VERIFY_PY={verify_script} PRELMODE=3 VSIM_DIR={str(work_dir)};'
+    ]
+
+
+def sw_callback(target=None, build_dir=None, defines=None, data_cfg=None, dry_run=False,
+                sync=False, **kwargs):
+    # The C defines reach the build through environment variables
+    cflags_env = {
+        name: common.join_cdefines(defines)
+        for name in ('SN_TESTS_RISCV_CFLAGS', f'{target}_RISCV_CFLAGS')
+    }
+    # Build directories and the optional data configuration are make variables
+    make_vars = {
+        f'{target}_BUILD_DIR': build_dir,
+        'SN_TESTS_BUILDDIR': build_dir,
+        'DEBUG': 'ON',
+    }
+    if data_cfg is not None:
+        make_vars[f'{target}_DATA_CFG'] = data_cfg
+    return common.make(
+        target, flags=['-j'], dir=root, vars=make_vars,
+        env=common.extend_environment(cflags_env), sync=sync, dry_run=dry_run)
+
+
+def hw_callback(work_dir=None, hw_cfg=None, dry_run=False,
+                sync=False, **kwargs):
+    # Copy the given hardware configuration in place, then compile into work_dir
+    shutil.copy2(hw_cfg, root / '.generated/floo_picobello_noc_pkg.sv')
+    work_dir.mkdir(exist_ok=True, parents=True)
+    make_vars = {'VSIM_DIR': work_dir}
+    return common.make(
+        'vsim-compile', dir=root, vars=make_vars, flags=['-j'], dry_run=dry_run, sync=sync)
+
+
+callbacks = {  # handed to eu.ExperimentManager by ExperimentManager below
+    'sw': sw_callback,
+    'hw': hw_callback,
+}
+
+
+class ExperimentManager(eu.ExperimentManager):
+
+    def __init__(self, *args, **kwargs):  # same arguments as the base class
+        super().__init__(*args, callbacks=callbacks, **kwargs)  # pre-binds the callbacks
diff --git a/experiments/reduction/__init__.py b/experiments/reduction/__init__.py
new file mode 100644
index 00000000..73945a38
--- /dev/null
+++ b/experiments/reduction/__init__.py
@@ -0,0 +1,9 @@
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+from . import plot, experiments, fit, model
+
+__all__ = ["plot", "experiments", "fit", "model"]
diff --git a/experiments/reduction/experiments.py b/experiments/reduction/experiments.py
new file mode 100755
index 00000000..4e3662b5
--- /dev/null
+++ b/experiments/reduction/experiments.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+from functools import cache
+import math
+from pathlib import Path
+import picobello as pb
+import snitch.util.experiments.experiment_utils as eu
+from snitch.util.experiments.SimResults import SimRegion
+
+# Bytes per beat; used to convert batch sizes from bytes to beats
+BEAT_BYTES = 64
+# Divisor converting simulation timespans to cycles
+# NOTE(review): presumably the clock period in the timespan's unit -- confirm
+TCK = 5
+# Directory containing this file
+DIR = Path(__file__).parent
+
+
+###############
+# Experiments #
+###############
+
+
+class ExperimentManager(pb.ExperimentManager):
+    """Experiment manager for the reduction benchmark."""
+
+    def derive_axes(self, experiment):
+        """Return the axes of `experiment`; `batch` is an axis only for seq and tree."""
+        batched = experiment['impl'] in ('seq', 'tree')
+        keys = ['impl', 'n_rows', 'size'] + (['batch'] if batched else [])
+        return eu.derive_axes_from_keys(experiment, keys)
+
+    def derive_cdefines(self, experiment):
+        """Return the C defines for `experiment`; `BATCH` is set only for seq and tree."""
+        cdefs = dict(
+            IMPL=experiment['impl'].upper(),
+            LOG2_N_ROWS=int(math.log2(experiment['n_rows'])),
+            SIZE=experiment['size'],
+        )
+        if experiment['impl'] in ('seq', 'tree'):
+            cdefs['BATCH'] = experiment['batch']
+        return cdefs
+
+
+def gen_experiments(ci=False):
+    """Generate the list of experiments to run.
+
+    Sweeps implementations, numbers of rows, sizes and numbers of batches, keeping only
+    the valid combinations: seq and tree require a batch size strictly between 8 and 256
+    beats; the other implementations accept only `n_batches == 1`, or every value of the
+    (reduced) batch list when `ci` is set.
+
+    Args:
+        ci: When True, restrict the sweep to the reduced experiment space used in CI.
+
+    Returns:
+        A list of experiment dictionaries with keys `impl`, `n_rows`, `size`, `batch`,
+        `app`, `roi` and `cmd`.
+    """
+    impls = ['seq', 'tree', 'hw']
+    n_rows_list = [1, 2, 4]
+    sizes = [1024, 2048, 4096, 8192, 16384, 32768]
+    n_batches_list = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
+    if ci:
+        impls = ['hw']
+        n_rows_list = [4]
+        sizes = [32768]
+        n_batches_list = [1024, 2048]
+
+    experiments = []
+    for impl in impls:
+        for n_rows in n_rows_list:
+            for size in sizes:
+                for n_batches in n_batches_list:
+
+                    # Only sequential and tree implementations support batching, for all
+                    # other implementations we only accept n_batches = 1
+                    batch_size = size // n_batches
+                    batch_beats = batch_size // BEAT_BYTES
+                    if impl in ('seq', 'tree'):
+                        valid_n_batches = 8 < batch_beats < 256
+                    elif ci:
+                        # NOTE(review): in CI every entry of n_batches_list is accepted, so
+                        # the resulting experiments differ only in 'batch', which is neither
+                        # an axis nor a C define for these implementations -- confirm the
+                        # duplicates are intended.
+                        valid_n_batches = True
+                    else:
+                        valid_n_batches = n_batches == 1
+
+                    # If the current setting for n_batches is valid, add the experiment
+                    if valid_n_batches:
+                        # Resolve files relative to this script (like the manager's
+                        # `dir=DIR`), not to the current working directory
+                        experiments.append({
+                            'impl': impl,
+                            'n_rows': n_rows,
+                            'size': size,
+                            'batch': batch_size,
+                            'app': 'reduction_benchmark',
+                            'roi': DIR / f'roi/{impl}.json.tpl',
+                            'cmd': pb.sim_and_verify_cmd(DIR / 'verify.py'),
+                        })
+    return experiments
+
+
+###########
+# Results #
+###########
+
+def dma_core(cluster_idx):
+    """Return the thread name `hart_<id>` with `id = 1 + 9 * cluster_idx + 8`.
+
+    NOTE(review): presumably 9 harts per cluster with the DMA core last, after hart 0 --
+    confirm against the hardware configuration.
+    """
+    hart_id = 1 + 9 * cluster_idx + 8
+    return f'hart_{hart_id}'
+
+
+def compute_core(cluster_idx, core_idx):
+    """Return the thread name `hart_<id>` with `id = 1 + 9 * cluster_idx + core_idx`."""
+    hart_id = 1 + 9 * cluster_idx + core_idx
+    return f'hart_{hart_id}'
+
+
+# Assumes DMA transfers on all clusters, and for all batches to be equal.
+# Therefore we only sample cluster 12's first transfer.
+def seq_dma_cycles(sim_results):
+    """Return the timespan of region `d_0` on cluster 12's DMA core, divided by `TCK`.
+
+    NOTE(review): the last `SimRegion` argument (1) presumably selects the second
+    repetition of the region -- confirm against `SimRegion`.
+    """
+    region = SimRegion(dma_core(12), 'd_0', 1)
+    timespan = sim_results.get_timespan(region)
+    return timespan // TCK
+
+
+# Assumes computation on all clusters, and for all batches to be equal.
+# Therefore we only sample cluster 0's first computation.
+def seq_compute_cycles(sim_results):
+    """Return the timespan of region `c_0` on core 0 of cluster 0, divided by `TCK`."""
+    region = SimRegion(compute_core(0, 0), 'c_0', 1)
+    timespan = sim_results.get_timespan(region)
+    return timespan // TCK
+
+
+def seq_cycles(sim_results, c, r, n_batches):
+    """Return the span of a whole sequential reduction, divided by `TCK`.
+
+    The span runs from region `d_0` on cluster 12's DMA core to the last batch's compute
+    region on core 0 of cluster 0 (`c_<n_batches-1>` when `r == 1`, `c2_<n_batches-1>`
+    otherwise). `c` is unused.
+    """
+    label_prefix = 'c' if r == 1 else 'c2'
+    first = SimRegion(dma_core(12), 'd_0', 1)
+    last = SimRegion(compute_core(0, 0), f'{label_prefix}_{n_batches-1}', 1)
+    return sim_results.get_timespan(first, last) // TCK
+
+
+# Assumes DMA transfers on all clusters, and all levels of the tree to be equal.
+# Therefore we only sample cluster 4's first transfer.
+def tree_dma_cycles(sim_results):
+    """Return the timespan of region `4 > 0` on cluster 4's DMA core, divided by `TCK`."""
+    region = SimRegion(dma_core(4), '4 > 0', 0)
+    timespan = sim_results.get_timespan(region)
+    return timespan // TCK
+
+
+# Assumes computation on all clusters, and all levels of the tree to be equal.
+# Therefore we only sample cluster 0's first computation.
+def tree_compute_cycles(sim_results):
+    """Return the timespan of region `comp l0` on core 0 of cluster 0, divided by `TCK`."""
+    region = SimRegion(compute_core(0, 0), 'comp l0', 0)
+    timespan = sim_results.get_timespan(region)
+    return timespan // TCK
+
+
+def tree_cycles(sim_results, c, r, n_batches):
+    """Return the span of a whole tree reduction, divided by `TCK`.
+
+    The span runs from region `4 > 0` on cluster 4's DMA core to the compute region of
+    the last tree level (`log2(c * r)` levels) for the last batch on core 0 of cluster 0.
+    """
+    last_level = int(math.log2(c * r)) - 1
+    first = SimRegion(dma_core(4), '4 > 0', 0)
+    last = SimRegion(compute_core(0, 0), f'comp l{last_level}', n_batches - 1)
+    return sim_results.get_timespan(first, last) // TCK
+
+
+def hw_dma_cycles(sim_results):
+    """Return the timespan of region `row reduction` on cluster 0's DMA core, over `TCK`."""
+    region = SimRegion(dma_core(0), 'row reduction', 1)
+    timespan = sim_results.get_timespan(region)
+    return timespan // TCK
+
+
+def hw_cycles(sim_results):
+    """Return the span from `row reduction` to `column reduction` on cluster 0's DMA core.
+
+    The timespan is divided by `TCK`.
+    """
+    dma = dma_core(0)
+    first = SimRegion(dma, 'row reduction', 1)
+    last = SimRegion(dma, 'column reduction', 1)
+    return sim_results.get_timespan(first, last) // TCK
+
+
+@cache
+def results(manager=None):
+    """Return `manager.get_results()`, memoized per `manager` argument.
+
+    When `manager` is None, a manager over the full (non-CI) experiment space is created
+    without parsing command-line arguments.
+    """
+    if manager is not None:
+        return manager.get_results()
+    default_manager = ExperimentManager(gen_experiments(), dir=DIR, parse_args=False)
+    return default_manager.get_results()
+
+
+########
+# Main #
+########
+
+def main():
+    """Parse the command line, run the selected experiments and print their results."""
+    cli = ExperimentManager.parser()
+    cli.add_argument('--ci', action='store_true',
+                     help='Reduce experiment space for CI runs')
+    args = cli.parse_args()
+
+    experiment_list = gen_experiments(ci=args.ci)
+    manager = ExperimentManager(experiment_list, dir=DIR, args=args, parse_args=False)
+    manager.run()
+    print(results(manager))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/experiments/reduction/fit.py b/experiments/reduction/fit.py
new file mode 100755
index 00000000..cb896b5d
--- /dev/null
+++ b/experiments/reduction/fit.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+from functools import cache
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from scipy import stats
+from reduction import experiments
+
+# Enable pandas copy-on-write, so the filtered frames used below can be extended freely
+pd.options.mode.copy_on_write = True
+# Bytes per beat; the fits below use the size in beats as the independent variable
+BEAT_BYTES = 64
+
+
+def fit(x, y, quiet=True):
+    """Fit `y = alpha + beta * x` by linear regression.
+
+    Args:
+        x: Independent variable (array with `min`/`max` methods when `quiet` is False).
+        y: Dependent variable.
+        quiet: When False, also print the fit statistics and show a plot of the fit.
+
+    Returns:
+        The tuple `(alpha, beta)` of intercept and slope.
+    """
+    regression = stats.linregress(x, y)
+    alpha, beta = regression.intercept, regression.slope
+    if quiet:
+        return alpha, beta
+
+    # Print results
+    print(f"\talpha (intercept): {alpha:.6g}")
+    print(f"\tbeta  (slope)    : {beta:.6g}")
+    print(f"\tR^2              : {regression.rvalue**2:.6g}")
+    print(f"\tSE(beta)         : {regression.stderr:.6g}")
+    print(f"\tSE(alpha)        : {regression.intercept_stderr:.6g}")
+
+    # Plot the samples together with the fitted line
+    xline = np.linspace(x.min(), x.max(), 200)
+    plt.figure()
+    plt.scatter(x, y, s=12)
+    plt.plot(xline, alpha + beta * xline, linewidth=2)
+    plt.xlabel('size [B]')
+    plt.ylabel('cycles')
+    plt.title("Linear fit: cycles = alpha + beta * X")
+    plt.show()
+
+    return alpha, beta
+
+
+def _fit_filtered(df, cycles_fn, title, quiet, single_batch):
+    """Fit cycles against size in beats, on the single-row experiments of `df`.
+
+    `cycles_fn` extracts the cycle count from each entry of the `results` column. With
+    `single_batch`, only experiments where `size // batch == 1` are used.
+    """
+    df = df[df['n_rows'] == 1]
+    if single_batch:
+        df = df[df['size'] // df['batch'] == 1]
+    x = df['size'].to_numpy() / BEAT_BYTES
+    y = df['results'].apply(cycles_fn).to_numpy()
+    if not quiet:
+        print(title)
+    return fit(x, y, quiet=quiet)
+
+
+@cache
+def _fit_measured(impl, cycles_fn, title, quiet, single_batch):
+    """Memoized fit on the `impl` experiments of `experiments.results()`."""
+    df = experiments.results()
+    return _fit_filtered(df[df['impl'] == impl], cycles_fn, title, quiet, single_batch)
+
+
+def _fit_cycles(impl, cycles_fn, title, df, quiet, single_batch=True):
+    """Dispatch to the memoized default-data fit, or fit the caller-provided `df`.
+
+    Only the default path is memoized: a DataFrame is unhashable and cannot be part of a
+    `functools.cache` key.
+    """
+    if df is None:
+        return _fit_measured(impl, cycles_fn, title, quiet, single_batch)
+    return _fit_filtered(df, cycles_fn, title, quiet, single_batch)
+
+
+def fit_seq_dma(df=None, quiet=True):
+    """Fit the DMA transfer time of the seq reduction; returns `(alpha, beta)`.
+
+    Uses single-row, single-batch experiments (the fitted model should generalize).
+    `df` defaults to the seq experiments of `experiments.results()`; a provided `df` is
+    used as is, without filtering on `impl`.
+    """
+    return _fit_cycles('seq', experiments.seq_dma_cycles,
+                       "Fit DMA transfer time for seq reduction:", df, quiet)
+
+
+def fit_seq_compute(df=None, quiet=True):
+    """Fit the compute time of the seq reduction; returns `(alpha, beta)`.
+
+    Uses single-row, single-batch experiments (the fitted model should generalize).
+    `df` defaults to the seq experiments of `experiments.results()`; a provided `df` is
+    used as is, without filtering on `impl`.
+    """
+    return _fit_cycles('seq', experiments.seq_compute_cycles,
+                       "Fit compute time for seq reduction:", df, quiet)
+
+
+def fit_tree_dma(df=None, quiet=True):
+    """Fit the DMA transfer time of the tree reduction; returns `(alpha, beta)`.
+
+    Uses single-row, single-batch experiments (the fitted model should generalize).
+    `df` defaults to the tree experiments of `experiments.results()`; a provided `df` is
+    used as is, without filtering on `impl`.
+    """
+    return _fit_cycles('tree', experiments.tree_dma_cycles,
+                       "Fit DMA transfer time for tree reduction:", df, quiet)
+
+
+def fit_tree_compute(df=None, quiet=True):
+    """Fit the compute time of the tree reduction; returns `(alpha, beta)`.
+
+    Uses single-row, single-batch experiments (the fitted model should generalize).
+    `df` defaults to the tree experiments of `experiments.results()`; a provided `df` is
+    used as is, without filtering on `impl`.
+    """
+    return _fit_cycles('tree', experiments.tree_compute_cycles,
+                       "Fit compute time for tree reduction:", df, quiet)
+
+
+def fit_hw(df=None, quiet=True):
+    """Fit the runtime of the hw reduction; returns `(alpha, beta)`.
+
+    Uses single-row experiments (the fitted model should generalize to multi-row).
+    `df` defaults to the hw experiments of `experiments.results()`; a provided `df` is
+    used as is, without filtering on `impl`.
+    """
+    return _fit_cycles('hw', experiments.hw_dma_cycles,
+                       "Fit runtime for hw reduction:", df, quiet, single_batch=False)
+
+
+def main():
+    """Run every fit verbosely: tree DMA and compute, seq DMA and compute, then hw."""
+    verbose_fits = (fit_tree_dma, fit_tree_compute, fit_seq_dma, fit_seq_compute, fit_hw)
+    for fit_fn in verbose_fits:
+        fit_fn(quiet=False)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/experiments/reduction/model.py b/experiments/reduction/model.py
new file mode 100755
index 00000000..fda790d1
--- /dev/null
+++ b/experiments/reduction/model.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+from math import log2, sqrt, isqrt
+from reduction import fit
+
+# Bytes per beat
+BEAT_BYTES = 64
+# Default value of the `delta` parameter of the runtime models below
+# NOTE(review): presumably a per-iteration overhead in cycles -- confirm
+DELTA = 30
+
+
+def nearest_divisors(divisor, dividend):
+    """Return the divisors of `dividend` closest to `divisor` from below and above.
+
+    Args:
+        divisor: Reference value; need not divide `dividend`, nor be an integer.
+        dividend: Number whose positive divisors are searched.
+
+    Returns:
+        The tuple `(lower, upper)`: the largest divisor strictly smaller than `divisor`
+        and the smallest divisor greater than or equal to it. Either is None when no such
+        divisor exists.
+    """
+    from bisect import bisect_left
+
+    # Divisors come in pairs (d, dividend // d) with d up to the integer square root
+    candidates = range(1, isqrt(int(dividend)) + 1)
+    small = [d for d in candidates if dividend % d == 0]
+    divs = sorted({*small, *(dividend // d for d in small)})
+
+    # Position of the first divisor >= `divisor`; its predecessor is strictly smaller
+    pos = bisect_left(divs, divisor)
+    lower = divs[pos - 1] if pos > 0 else None
+    upper = divs[pos] if pos < len(divs) else None
+    return lower, upper
+
+
+def hw_runtime(c, r, n):
+    """Model the hw reduction runtime for `n` beats on `r` rows (`c` is unused).
+
+    One reduction takes `alpha + n * beta`, with the coefficients from `fit.fit_hw()`;
+    the time is doubled when `r > 1`.
+    """
+    alpha, beta = fit.fit_hw()
+    t_single = alpha + n * beta
+    return 2 * t_single if r > 1 else t_single
+
+
+def seq_runtime(c, r, n, k, delta=DELTA):
+    """Model the seq reduction runtime for `n` beats split in `k` batches.
+
+    Args:
+        c: Number of columns.
+        r: Number of rows.
+        n: Size in beats.
+        k: Number of batches; each batch has `int(n // k)` beats.
+        delta: NOTE(review): this argument is overwritten with `4 * r + 28` below, so the
+            value passed by the caller has no effect -- confirm this is intended.
+
+    Returns:
+        `n_iters * (t_max + delta) - delta`, where `t_max` is the larger of the per-batch
+        compute and DMA times.
+    """
+    batch = int(n // k)
+    # Only the fitted intercepts are used; the slopes are hard-coded to 1 and 1.16
+    dma_alpha, _ = fit.fit_seq_dma()
+    comp_alpha, _ = fit.fit_seq_compute()
+    t_max = max(comp_alpha + batch * 1.16, dma_alpha + batch * 1)
+
+    n_iters = 1 + 2 * (c - 2) + k
+    if r > 1:
+        n_iters += 1 + 2 * (r - 2) + k
+
+    # First approximation model assumes compute and dma take roughly the same time
+    delta = 4 * r + 28
+    return n_iters * (t_max + delta) - delta
+
+
+def optimal_seq_k(c, r, n, delta=DELTA):
+    """Return the number of batches, among the divisors of `n`, minimizing `seq_runtime`.
+
+    A real-valued optimum is computed in closed form, then the nearest divisors of `n`
+    below and above it are compared using the runtime model. Returns None when `n` has
+    no divisors (e.g. `n == 0`).
+    """
+    comp_alpha, _ = fit.fit_seq_compute()
+    if r > 1:
+        real_k = sqrt(n * (2 * (c + r) - 7) / (2 * (delta + comp_alpha)))
+    else:
+        real_k = sqrt(n * (2 * c - 3) / (delta + comp_alpha))
+
+    lower_k, upper_k = nearest_divisors(real_k, n)
+    assert (lower_k is None) or (lower_k > 0)
+    assert (upper_k is None) or (upper_k <= n)
+
+    # With a single candidate there is nothing to compare
+    if lower_k is None:
+        return upper_k
+    if upper_k is None:
+        return lower_k
+
+    t_lower = seq_runtime(c, r, n, lower_k, delta)
+    t_upper = seq_runtime(c, r, n, upper_k, delta)
+    return lower_k if t_lower < t_upper else upper_k
+
+
+def optimal_seq_runtime(c, r, n, delta=DELTA):
+    """Return `seq_runtime` evaluated at the batch count chosen by `optimal_seq_k`."""
+    best_k = optimal_seq_k(c, r, n, delta)
+    return seq_runtime(c, r, n, best_k, delta)
+
+
+def tree_runtime(c, r, n, k, delta=DELTA):
+    """Model the tree reduction runtime for `n` beats split in `k` batches.
+
+    Each of the `log2(c * r)` tree levels takes one DMA transfer, `k - 1` iterations of
+    `delta + max(t_comp, t_dma)`, and a final `delta + t_comp`.
+    """
+    batch = int(n // k)
+    # Only the fitted intercepts are used; the slopes are hard-coded to 1 and 1.16
+    dma_alpha, _ = fit.fit_tree_dma()
+    comp_alpha, _ = fit.fit_tree_compute()
+    t_comp = comp_alpha + batch * 1.16
+    t_dma = dma_alpha + batch * 1
+    t_level = t_dma + (k - 1) * (delta + max(t_comp, t_dma)) + delta + t_comp
+    return log2(c * r) * t_level
+
+
+def optimal_tree_k(c, r, n, delta=DELTA):
+    """Return the number of batches, among the divisors of `n`, minimizing `tree_runtime`.
+
+    A real-valued optimum is computed in closed form, then the nearest divisors of `n`
+    below and above it are compared using the runtime model. Returns None when `n` has
+    no divisors (e.g. `n == 0`).
+    """
+    dma_alpha, _ = fit.fit_tree_dma()
+    comp_alpha, _ = fit.fit_tree_compute()
+    real_k = sqrt(n / (delta + max(dma_alpha, comp_alpha)))
+
+    lower_k, upper_k = nearest_divisors(real_k, n)
+    assert (lower_k is None) or (lower_k > 0)
+    assert (upper_k is None) or (upper_k <= n)
+
+    # With a single candidate there is nothing to compare
+    if lower_k is None:
+        return upper_k
+    if upper_k is None:
+        return lower_k
+
+    t_lower = tree_runtime(c, r, n, lower_k, delta)
+    t_upper = tree_runtime(c, r, n, upper_k, delta)
+    return lower_k if t_lower < t_upper else upper_k
+
+
+def optimal_tree_runtime(c, r, n, delta=DELTA):
+    """Return `tree_runtime` evaluated at the batch count chosen by `optimal_tree_k`."""
+    best_k = optimal_tree_k(c, r, n, delta)
+    return tree_runtime(c, r, n, best_k, delta)
+
+
+def optimal_sw_runtime(c, r, n, delta=DELTA):
+    """Return the smaller of the optimal seq and optimal tree modeled runtimes."""
+    return min(
+        optimal_seq_runtime(c, r, n, delta),
+        optimal_tree_runtime(c, r, n, delta),
+    )
+
+
+# print(5 * tree_runtime(4, 1, 8192 // 64, 1))
+# print(5 * tree_runtime(4, 1, 32768 // 64, 4))
diff --git a/experiments/reduction/plot.py b/experiments/reduction/plot.py
new file mode 100755
index 00000000..b473e2cb
--- /dev/null
+++ b/experiments/reduction/plot.py
@@ -0,0 +1,376 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import argparse
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from reduction import model, experiments
+
+# Enable pandas copy-on-write, so the filtered frames used below can be extended freely
+pd.options.mode.copy_on_write = True
+# Bytes per beat; sizes are divided by this value before being passed to the model
+BEAT_BYTES = 64
+
+
+# Find the monotone non-decreasing lower fit of an oscillating curve
+def find_monotone_lower_fit(x, y):
+    """Keep the points of a curve that no later point undercuts.
+
+    Point `i` is kept when no `y[j]` with `j >= i` is smaller than `y[i]`, which yields a
+    non-decreasing subsequence of `y`.
+
+    Args:
+        x: Abscissas, indexable and at least as long as `y`.
+        y: Ordinates, indexable, with a length.
+
+    Returns:
+        The tuple `(x_monotone, y_monotone)` of lists with the kept points, in order.
+    """
+    # Scan backwards, tracking the minimum of the points already visited: a single pass
+    # replaces the look-ahead over all later points for every point.
+    kept = []
+    lowest_after = None
+    for i in range(len(y) - 1, -1, -1):
+        if lowest_after is None or not (lowest_after < y[i]):
+            kept.append(i)
+            # A kept point is not above any later one, hence it is the new minimum
+            lowest_after = y[i]
+    kept.reverse()
+    return [x[i] for i in kept], [y[i] for i in kept]
+
+
+def get_actual_cycles(row):
+    """Return the measured cycles of the experiment in `row`, assuming 4 columns.
+
+    Returns None when `row['impl']` is not one of seq, tree and hw.
+    """
+    impl = row['impl']
+    if impl == 'hw':
+        return experiments.hw_cycles(row['results'])
+    if impl in ('seq', 'tree'):
+        n_batches = int(row['size'] // row['batch'])
+        cycles_fn = experiments.seq_cycles if impl == 'seq' else experiments.tree_cycles
+        return cycles_fn(row['results'], 4, row['n_rows'], n_batches)
+    return None
+
+
+def get_expected_cycles(row):
+    """Return the modeled cycles of the experiment in `row`, assuming 4 columns.
+
+    Raises:
+        ValueError: If `row['impl']` is not one of seq, tree and hw.
+    """
+    n_beats = row['size'] // BEAT_BYTES
+    if row['impl'] == 'seq':
+        cycles = model.optimal_seq_runtime(4, row['n_rows'], n_beats)
+    elif row['impl'] == 'tree':
+        cycles = model.optimal_tree_runtime(4, row['n_rows'], n_beats)
+    elif row['impl'] == 'hw':
+        cycles = model.hw_runtime(4, row['n_rows'], n_beats)
+    else:
+        # Fail with a clear message instead of an unbound local variable error
+        raise ValueError(f"Unknown implementation: {row['impl']!r}")
+    return int(cycles)
+
+
+def get_expected_batch_size(row):
+    """Return the batch size in bytes for the modeled optimal batch count.
+
+    Assumes 4 columns. Returns None for hw and for any other unknown implementation.
+    """
+    impl = row['impl']
+    if impl not in ('tree', 'seq'):
+        return None
+    optimal_k_fn = model.optimal_tree_k if impl == 'tree' else model.optimal_seq_k
+    optimal_k = optimal_k_fn(4, row['n_rows'], row['size'] // BEAT_BYTES)
+    return int(row['size'] // optimal_k)
+
+
+# Select optimal configuration (optimal k) for seq and tree runs
+def select_best_configs(df):
+    """Keep, for every (impl, size, n_rows) group, the row with the fewest cycles.
+
+    The result is sorted by impl, n_rows and size, with a fresh index.
+    """
+    best_idx = df.groupby(['impl', 'size', 'n_rows'])['cycles'].idxmin()
+    best = df.loc[best_idx]
+    best = best.sort_values(['impl', 'n_rows', 'size'])
+    return best.reset_index(drop=True)
+
+
+def seq_runtime_curve(xmin, xmax, c, r):
+    """Sample the optimal seq runtime model every 64 B from `xmin` up to `xmax + 64`."""
+    x = np.arange(xmin, xmax + 64, 64)
+    y = []
+    for size in x:
+        y.append(model.optimal_seq_runtime(c, r, size // BEAT_BYTES))
+    return x, y
+
+
+# Filter optimal sequential runtime, by finding the monotone non-decreasing lower fit.
+def monotone_seq_runtime_curve(xmin, xmax, c, r):
+    """Return the monotone non-decreasing lower fit of `seq_runtime_curve`."""
+    sizes, cycles = seq_runtime_curve(xmin, xmax, c, r)
+    return find_monotone_lower_fit(sizes, cycles)
+
+
+def tree_runtime_curve(xmin, xmax, c, r):
+    """Sample the optimal tree runtime model every 64 B from `xmin` up to `xmax + 64`."""
+    x = np.arange(xmin, xmax + 64, 64)
+    y = []
+    for size in x:
+        y.append(model.optimal_tree_runtime(c, r, size // BEAT_BYTES))
+    return x, y
+
+
+# Filter optimal tree runtime, by finding the monotone non-decreasing lower fit.
+def monotone_tree_runtime_curve(xmin, xmax, c, r):
+    """Return the monotone non-decreasing lower fit of `tree_runtime_curve`."""
+    sizes, cycles = tree_runtime_curve(xmin, xmax, c, r)
+    return find_monotone_lower_fit(sizes, cycles)
+
+
+def hw_runtime_curve(xmin, xmax, c, r):
+    """Sample the hw runtime model every 64 B from `xmin` up to `xmax + 64`."""
+    x = np.arange(xmin, xmax + 64, 64)
+    y = []
+    for size in x:
+        y.append(model.hw_runtime(c, r, size // BEAT_BYTES))
+    return x, y
+
+
+def sw_runtime_curve(xmin, xmax, c, n_rows):
+    """Return the monotone lower fit of the best software (seq or tree) runtime model.
+
+    Both models are sampled on the same sizes; the pointwise minimum of the two is then
+    filtered with `find_monotone_lower_fit`.
+    """
+    x, y_seq = seq_runtime_curve(xmin, xmax, c, n_rows)
+    _, y_tree = tree_runtime_curve(xmin, xmax, c, n_rows)
+    y_best = [min(a, b) for a, b in zip(y_seq, y_tree)]
+    return find_monotone_lower_fit(x, y_best)
+
+
+def plot1(y_label=None, hide_x_axis=False, show=True):
+    """Plot actual and expected runtimes.
+
+    Uses the single-row (`n_rows == 1`) results with 4 columns. Measured runtimes are
+    drawn as markers and the model curves as dashed lines, per implementation.
+
+    Args:
+        y_label: Label of the y axis; defaults to 'Runtime [cycles]'.
+        hide_x_axis: When True, hide the x axis label, tick marks and tick labels.
+        show: When True, display the figure with `plt.show()`.
+
+    Returns:
+        The DataFrame of best configurations, with the `cycles` and `exp_cycles` columns
+        and without the `results` column.
+    """
+
+    # Load single-row results
+    c = 4
+    r = 1
+    df = experiments.results()
+    df = df[df['n_rows'] == r]
+
+    # Get expected and actual runtimes
+    df['cycles'] = df.apply(get_actual_cycles, axis=1)
+    df = select_best_configs(df)
+    df['exp_cycles'] = df.apply(get_expected_cycles, axis=1)
+    # df['exp_batch'] = df.apply(get_expected_batch_size, axis=1)
+
+    # Drop results column for cleaner output
+    df = df.drop(columns=['results'])
+
+    # Choose consistent colors and markers for the plots
+    colors = {"seq": "C0", "tree": "C1", "hw": "C2"}
+    markers = {"expected": "o", "actual": "x"}
+
+    # Create plot
+    _, ax = plt.subplots()
+
+    # Plot actual runtimes
+    # NOTE(review): assumes every implementation has exactly one row per entry of
+    # `sizes`, in the same order -- confirm against the experiment space.
+    sizes = df['size'].unique()
+    ax.scatter(
+        sizes, df[df['impl'] == 'seq']['cycles'], label='Actual',
+        marker=markers['actual'], color=colors['seq']
+    )
+    ax.scatter(
+        sizes, df[df['impl'] == 'tree']['cycles'], label='Actual',
+        marker=markers['actual'], color=colors['tree']
+    )
+    ax.scatter(
+        sizes, df[df['impl'] == 'hw']['cycles'], label='Actual',
+        marker=markers['actual'], color=colors['hw']
+    )
+
+    # # Plot expected runtimes
+    # ax.scatter(
+    #     sizes, df[df['impl'] == 'tree']['exp_cycles'], label='Expected (tree)',
+    #     marker=markers['expected'], color=colors['tree']
+    # )
+    # ax.scatter(
+    #     sizes, df[df['impl'] == 'hw']['exp_cycles'], label='Expected (hw)',
+    #     marker=markers['expected'], color=colors['hw']
+    # )
+
+    # Plot model line for seq runtime
+    x, y = monotone_seq_runtime_curve(sizes.min(), sizes.max(), c, r)
+    ax.plot(x, y, label='Model  (seq)', linestyle='--', color=colors['seq'])
+
+    # Plot model line for tree runtime
+    x, y = monotone_tree_runtime_curve(sizes.min(), sizes.max(), c, r)
+    ax.plot(x, y, label='Model  (tree)', linestyle='--', color=colors['tree'])
+
+    # Plot model line for hw runtime
+    x, y = hw_runtime_curve(sizes.min(), sizes.max(), c, r)
+    ax.plot(x, y, label='Model  (hw)', linestyle='--', color=colors['hw'])
+
+    # Plot formatting
+    if hide_x_axis:
+        ax.set_xlabel('')
+        ax.set_xticks(sizes)
+        ax.tick_params(
+            axis='x',
+            which='both',
+            bottom=False, top=False,  # hide tick marks
+            labelbottom=False         # hide labels
+        )
+    else:
+        # The tick at 2048 is kept but left unlabeled
+        ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes])
+        ax.set_xlabel('Size [B]')
+    if y_label is None:
+        y_label = 'Runtime [cycles]'
+    ax.set_ylabel(y_label)
+    ax.set_xlim(0, sizes.max() * 1.1)
+    ax.set_axisbelow(True)
+    ax.grid(True, color='gainsboro')
+    ax.legend(ncol=2, handlelength=1.2, columnspacing=0.5, handletextpad=0.3)
+    plt.tight_layout()
+    if show:
+        plt.show()
+
+    return df
+
+
+def plot2(y_label=None, show=True):
+    """Plot actual and expected runtimes, for multiple number of rows.
+
+    For every number of rows in the results, the best measured software runtime (seq or
+    tree) and the measured hw runtime are drawn as markers, and the corresponding model
+    curves as dashed lines. 4 columns are assumed.
+
+    Args:
+        y_label: Label of the y axis; defaults to 'Runtime [cycles]'.
+        show: When True, display the figure with `plt.show()`.
+
+    Returns:
+        The DataFrame of best configurations, with the `cycles`, `exp_cycles` and
+        `exp_batch` columns and without the `results` column.
+    """
+
+    # Load results. Work on a copy: `experiments.results()` is memoized, and adding the
+    # 'cycles' column below would otherwise modify the shared DataFrame in place.
+    c = 4
+    df = experiments.results().copy()
+
+    # Get expected and actual runtimes
+    df['cycles'] = df.apply(get_actual_cycles, axis=1)
+    df = select_best_configs(df)
+    df['exp_cycles'] = df.apply(get_expected_cycles, axis=1)
+    df['exp_batch'] = df.apply(get_expected_batch_size, axis=1)
+
+    # Drop results column for cleaner output
+    df = df.drop(columns=['results'])
+
+    # Choose consistent colors for the plots
+    colors = {"sw": "C0", "hw": "C1"}
+
+    # Create plot
+    _, ax = plt.subplots()
+    sizes = df['size'].unique()
+
+    # Create function to plot runtimes for a given number of rows
+    def plot_runtime_vs_speedup(ax, n_rows, show_label=False):
+        res = df[df['n_rows'] == n_rows]
+        print(res)
+        best_sw_cycles = res[res['impl'].isin(['seq', 'tree'])].groupby('size')['cycles'].min()
+        print(best_sw_cycles)
+        ax.scatter(
+            sizes, best_sw_cycles, label='Actual' if show_label else None,
+            marker='x', color=colors['sw']
+        )
+        ax.scatter(
+            sizes, res[res['impl'] == 'hw']['cycles'], label='Actual' if show_label else None,
+            marker='x', color=colors['hw']
+        )
+
+        # Plot model line for best software implementation
+        x, y = sw_runtime_curve(sizes.min(), sizes.max(), c, n_rows)
+        ax.plot(
+            x, y, label='Model  (sw)' if show_label else None,
+            linestyle='--', color=colors['sw'])
+
+        # Annotate sw model lines with number of rows, in correspondence with actual runtime
+        ax.annotate(
+            f'r={n_rows}',
+            xy=(x[-1], best_sw_cycles[sizes.max()]),
+            xytext=(sizes.max() * 0.0001, 0), textcoords='offset points',  # nudge right
+            ha='left', va='center', clip_on=False)
+
+        # Plot model line for hardware runtime
+        x, y = hw_runtime_curve(sizes.min(), sizes.max(), c, n_rows)
+        ax.plot(
+            x, y, label='Model  (hw)' if show_label else None, linestyle='--', color=colors['hw'])
+        # The hw curves are annotated once for r=1 and once (at r=2) for r=2 and r=4
+        if n_rows == 1:
+            label = f'r={n_rows}'
+        else:
+            label = 'r=2,4'
+        if n_rows != 4:
+            ax.annotate(
+                label,
+                xy=(x[-1], res[(res['impl'] == 'hw') & (res['size'] == sizes.max())]['cycles']),
+                xytext=(sizes.max() * 0.0001, 0), textcoords='offset points',  # nudge right
+                ha='left', va='center', clip_on=False)
+
+    # Only the first number of rows contributes legend entries
+    for i, n_rows in enumerate(df['n_rows'].unique()):
+        plot_runtime_vs_speedup(ax, n_rows=n_rows, show_label=(i == 0))
+
+    # The tick at 2048 is kept but left unlabeled
+    ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes])
+    ax.set_xlim(0, sizes.max() * 1.1)
+    ax.set_xlabel('Size [B]')
+    if y_label is None:
+        y_label = 'Runtime [cycles]'
+    ax.set_ylabel(y_label)
+    ax.set_axisbelow(True)
+    ax.grid(True, color='gainsboro')
+    ax.legend(ncol=2, handlelength=1.2, columnspacing=0.5, handletextpad=0.3)
+
+    plt.tight_layout()
+    if show:
+        plt.show()
+
+    return df
+
+
+def plot3(show=True):
+    """Plot runtime dependency with delta.
+
+    NOTE(review): this function cannot run against `reduction/model.py` as it stands:
+    `model.SEQ_ALPHA` is not defined there, and `model.optimal_seq_runtime` accepts
+    `(c, r, n, delta)` while six arguments are passed below. It is also not listed among
+    the plots selectable in `main()`.
+
+    Args:
+        show: When True, display the figure with `plt.show()`.
+    """
+
+    # Load results
+    df = experiments.results()
+
+    # Get only hardware implementation results
+    df = df[df['impl'] == 'hw']
+
+    # Retrieve single-row experiments
+    n_rows = 1
+    df = df[df['n_rows'] == n_rows]
+
+    # Get actual runtimes
+    df['cycles'] = df['results'].apply(experiments.hw_cycles)
+
+    # Choose consistent colors and markers for the plots
+    colors = {"seq": "C1", "hw": "C2"}
+
+    # Plot measured runtimes
+    _, ax = plt.subplots()
+
+    # Plot model lines for sequential runtime (varying delta).
+    x = np.arange(df['size'].min(), df['size'].max(), 64)
+    # NOTE(review): `model.SEQ_ALPHA` is undefined in model.py -- see the docstring
+    for k, alpha_delta in enumerate([*range(model.DELTA + model.SEQ_ALPHA, -1, -12), 0]):
+        # Split the swept sum into an alpha half and a delta remainder
+        alpha = alpha_delta // 2
+        delta = alpha_delta - alpha
+        x_ext = np.arange(df['size'].min(), df['size'].max() + 1024, 64)
+        y = [model.optimal_seq_runtime(4, n_rows, e // 64, delta, alpha, model.SEQ_ALPHA)
+             for e in x_ext]
+        # Keep the points that no later point undercuts (same filter as
+        # `find_monotone_lower_fit`), restricted to the indices available in `x`
+        y_monotone = []
+        x_monotone = []
+        for i, e in enumerate(y):
+            accept = True
+            for j in range(i, len(y)):
+                if y[j] < e:
+                    accept = False
+                    break
+            if accept and i < len(x):
+                y_monotone.append(y[i])
+                x_monotone.append(x[i])
+        ax.plot(x_monotone, y_monotone, label='seq$(\\alpha_i+\\delta)$' if k == 0 else None,
+                color=colors['seq'])
+        ax.annotate(
+            f'${alpha_delta}$',
+            xy=(x[-1], y_monotone[-1]),
+            xytext=(1, 0), textcoords='offset points',  # nudge right
+            ha='left', va='center', clip_on=False)
+
+    # Plot model line for hardware runtime
+    y = [model.hw_runtime(4, n_rows, e // 64) for e in x]
+    ax.plot(x, y, label='hw', linestyle='--', color=colors['hw'])
+
+    # The tick at 2048 is kept but left unlabeled
+    ax.set_xticks(df['size'], [str(size) if size != 2048 else "" for size in df['size']])
+    ax.set_xlabel('Size [B]')
+    ax.set_ylabel('Runtime [cycles]')
+    ax.set_axisbelow(True)
+    ax.grid(True, linestyle='-', color='gainsboro')
+    ax.legend()
+
+    plt.tight_layout()
+    if show:
+        plt.show()
+
+
+def main():
+    """Parse the command line and generate the requested plots.
+
+    Any subset of the plot names, or 'all', can be given; with no argument all plots are
+    generated.
+    """
+    # Parse arguments
+    functions = [plot1, plot2]
+    parser = argparse.ArgumentParser(description='Plot reduction results.')
+    parser.add_argument(
+        'plots',
+        nargs='*',
+        default='all',
+        choices=[f.__name__ for f in functions] + ['all'],
+        help='Plots to generate.'
+    )
+    args = parser.parse_args()
+
+    # Helper function to check if a plot was requested on the command line.
+    # `args.plots` is a list of names, or the plain string 'all' when left to default.
+    def requested(plot):
+        return plot in args.plots or 'all' in args.plots
+
+    # Generate plots, in the order in which they are listed in `functions`
+    for function in functions:
+        if requested(function.__name__):
+            function()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/experiments/reduction/roi/hw.json.tpl b/experiments/reduction/roi/hw.json.tpl
new file mode 100644
index 00000000..c9301cc5
--- /dev/null
+++ b/experiments/reduction/roi/hw.json.tpl
@@ -0,0 +1,23 @@
+<%
+    # Cluster index of the cluster in column `c` and row `r`
+    def pb_cluster_idx(c, r):
+        return c * 4 + r
+
+    n_rows = experiment['n_rows']
+%>
+## One entry per cluster in the first `n_rows` rows of the 4 columns, on the thread
+## `hart_<1 + 9 * cluster + 8>`. Each entry labels two repetitions of the
+## "row reduction" and "column reduction" regions.
+[
+% for r in range(n_rows):
+    % for c in range(4):
+    {
+        "thread": "${f'hart_{1 + pb_cluster_idx(c, r) * 9 + 8}'}",
+        "roi": [
+            // First iteration
+            {"idx": 2, "label": "row reduction"},
+            {"idx": 4, "label": "column reduction"},
+            // Second iteration
+            {"idx": 8, "label": "row reduction"},
+            {"idx": 10, "label": "column reduction"},
+        ]
+    },
+    % endfor
+% endfor
+]
diff --git a/experiments/reduction/roi/seq.json.tpl b/experiments/reduction/roi/seq.json.tpl
new file mode 100644
index 00000000..f14ede55
--- /dev/null
+++ b/experiments/reduction/roi/seq.json.tpl
@@ -0,0 +1,87 @@
+<%
+# Experiment parameters
+n_rows = experiment['n_rows']
+n_cols = 4
+n_batches = int(experiment['size'] // experiment['batch'])
+# Number of times each batch region is listed per thread
+n_repetitions = 2
+
+# Cluster index of the cluster in row `r` and column `c`
+def pb_cluster_idx(r, c):
+    return c * n_cols + r
+%>
+[
+    % for row in range(0, n_rows):
+        ## for easternmost column, only DMA core
+        {
+            "thread": "${f'hart_{1 + pb_cluster_idx(row, n_cols-1)*9 + 8}'}",
+            "roi": [
+                % for iter in range(0, n_repetitions):
+                    % for i in range(0, n_batches):
+                        {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'd_{i}'}"},
+                    % endfor
+                % endfor
+            ]
+        },
+
+        ## for middle columns
+        % for col in range(n_cols-2, 0, -1):
+            ## Compute cores
+            % for core in range(0, 8):
+            {
+                "thread": "${f'hart_{1 + pb_cluster_idx(row, col)*9 + core}'}",
+                "roi": [
+                    % for iter in range(0, n_repetitions):
+                        % for i in range(0, n_batches):
+                            {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'c_{i}'}"},
+                        % endfor
+                    % endfor
+                ]
+            },
+            % endfor
+            ## DMA core
+            {
+                "thread": "${f'hart_{1 + pb_cluster_idx(row, col)*9 + 8}'}",
+                "roi": [
+                    % for iter in range(0, n_repetitions):
+                        % for i in range(0, n_batches):
+                            {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'd_{i}'}"},
+                        % endfor
+                    % endfor
+                ]
+            },
+        % endfor
+
+        ## for westernmost column
+        % if row != 0 and n_rows > 1:
+            ## DMA core
+            {
+                "thread": "${f'hart_{1 + pb_cluster_idx(row, 0)*9 + 8}'}",
+                "roi": [
+                    % for iter in range(0, n_repetitions):
+                        % for i in range(0, n_batches):
+                            {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'd2_{i}'}"},
+                        % endfor
+                    % endfor
+                ]
+            },
+        % endif
+        ## Compute cores; all rows but the last also list the `c2_<i>` regions
+        % for core in range(0, 8):
+        {
+            "thread": "${f'hart_{1 + pb_cluster_idx(row, 0)*9 + core}'}",
+            "roi": [
+                % for iter in range(0, n_repetitions):
+                    % for i in range(0, n_batches):
+                        % if row == (n_rows - 1):
+                            {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'c_{i}'}"},
+                        % else:
+                            {"idx": ${(4 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'c_{i}'}"},
+                            {"idx": ${(4 * n_batches + 2) * iter + 2 * n_batches + 2 * i + 2}, "label": "${f'c2_{i}'}"},
+                        % endif
+                    % endfor
+                % endfor
+            ]
+        },
+        % endfor
+    % endfor
+]
\ No newline at end of file
diff --git a/experiments/reduction/roi/tree.json.tpl b/experiments/reduction/roi/tree.json.tpl
new file mode 100644
index 00000000..a6e3e541
--- /dev/null
+++ b/experiments/reduction/roi/tree.json.tpl
@@ -0,0 +1,114 @@
+<%
+# ROI specification for the tree reduction experiment: one entry per hart
+# ("thread"), listing the region indices ("idx") to label and their labels.
+import math
+n_rows = experiment['n_rows']
+n_batches = int(experiment['size'] // experiment['batch'])
+n_levels_in_row = int(math.log2(4))  # the template covers columns 0 to 3
+# Number of column-0 reduction levels computed by the column-0 cluster of a row.
+def n_levels_in_col(row_idx):
+    if row_idx == 0:
+        return int(math.log2(n_rows))
+    elif row_idx == 2:
+        return 1
+    else:
+        return 0
+
+# Cluster index of the cluster at (row, col): 4 clusters per column.
+def pb_cluster_idx(row, col):
+    return row + col * 4
+%>
+
+[
+% for r in range(n_rows):
+    {
+        "thread": "${f'hart_{1 + pb_cluster_idx(r, 0) * 9 + 0}'}",
+        "roi": [
+    ## Reductions in row
+    % for level_in_row in range(n_levels_in_row):
+        % for batch in range(n_batches):
+            {
+                "idx": ${4 + n_batches * (n_levels_in_col(r) + n_levels_in_row) * 2 + (level_in_row * n_batches + batch) * 2},
+                "label": "${f'comp l{level_in_row}'}"
+            },
+        % endfor
+    % endfor
+    ## Reductions in column 0
+    % for level_in_col in range(n_levels_in_col(r)):
+        % for batch in range(n_batches):
+            {
+                "idx": ${4 + n_batches * (n_levels_in_col(r) + n_levels_in_row) * 2 + ((n_levels_in_row + level_in_col) * n_batches + batch) * 2},
+                "label": "${f'comp l{n_levels_in_row + level_in_col}'}"
+            },
+        % endfor
+    % endfor
+        ]
+    },
+
+    % if r > 0:
+    {
+        "thread": "${f'hart_{1 + pb_cluster_idx(r, 0) * 9 + 8}'}",
+        "roi": [
+        % for batch in range(n_batches):
+            ## Transfer along column 0: rows 1 and 2 send to row 0, other rows to row 2
+            {
+                "idx": ${4 + n_batches * 2 + batch * 2},
+                "label": "${f'{r} > {0 if r in [1, 2] else 2}'}"
+            },
+        % endfor
+        ]
+    },
+    % endif
+
+    {
+        "thread": "${f'hart_{1 + pb_cluster_idx(r, 1) * 9 + 8}'}",
+        "roi": [
+        % for batch in range(n_batches):
+            ## Transfer from column 1 to column 0 (label uses the row-0 cluster indices)
+            {
+                "idx": ${4 + n_batches * 2 + batch * 2},
+                "label": "4 > 0"
+            },
+        % endfor
+        ]
+    },
+
+    {
+        "thread": "${f'hart_{1 + pb_cluster_idx(r, 2) * 9 + 0}'}",
+        "roi": [
+        % for batch in range(n_batches):
+            ## Stage 0 reduction
+            {
+                "idx": ${4 + n_batches * 2 + batch * 2},
+                "label": "comp l0"
+            },
+        % endfor
+        ]
+    },
+    {
+        "thread": "${f'hart_{1 + pb_cluster_idx(r, 2) * 9 + 8}'}",
+        "roi": [
+        % for batch in range(n_batches):
+            ## Transfer from column 2 to column 0 (label uses the row-0 cluster indices)
+            {
+                "idx": ${4 + n_batches * 2 + batch * 2},
+                "label": "8 > 0"
+            },
+        % endfor
+        ]
+    },
+
+    {
+        "thread": "${f'hart_{1 + pb_cluster_idx(r, 3) * 9 + 8}'}",
+        "roi": [
+        % for batch in range(n_batches):
+            ## Transfer from column 3 to column 2 (label uses the row-0 cluster indices)
+            {
+                "idx": ${4 + n_batches * 2 + batch * 2},
+                "label": "12 > 8"
+            },
+        % endfor
+        ]
+    },
+% endfor
+]
diff --git a/experiments/reduction/verify.py b/experiments/reduction/verify.py
new file mode 100755
index 00000000..e6a17e78
--- /dev/null
+++ b/experiments/reduction/verify.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+# 
+# Verification script for `reduction_benchmark.c`
+
+import sys
+import snitch.util.sim.verif_utils as vu
+
+
+class Verifier(vu.Verifier):
+
+    OUTPUT_UIDS = ['output']
+
+    def __init__(self):
+        super().__init__()
+
+    def get_actual_results(self):
+        return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double')
+
+    def get_expected_results(self):
+        length = int(self.get_input_from_symbol('length', 'uint32_t')[0])
+        n_clusters = int(self.get_input_from_symbol('n_clusters', 'uint32_t')[0])
+        return [i * n_clusters + sum(range(n_clusters)) for i in range(length)]
+
+    def check_results(self, *args):
+        return super().check_results(*args, atol=0)
+
+
+if __name__ == "__main__":
+    sys.exit(Verifier().main())
diff --git a/experiments/summa_gemm/__init__.py b/experiments/summa_gemm/__init__.py
new file mode 100644
index 00000000..7858b657
--- /dev/null
+++ b/experiments/summa_gemm/__init__.py
@@ -0,0 +1,9 @@
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+from . import plot, experiments, model
+
+__all__ = ["plot", "experiments", "model"]
diff --git a/experiments/summa_gemm/cfg.json.tpl b/experiments/summa_gemm/cfg.json.tpl
new file mode 100644
index 00000000..e2ebd182
--- /dev/null
+++ b/experiments/summa_gemm/cfg.json.tpl
@@ -0,0 +1,25 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    setup_ssr: 1,
+    parallelize_m: 1,
+    parallelize_k: 0,
+    m_tiles: 4, // number of tiles in M dimension
+    n_tiles: ${4 * experiment['n_tiles']}, // number of tiles in N dimension
+    k_tiles: 1, // number of tiles in K dimension
+    load_a: 1,
+    load_b: 1,
+    load_c: 1,
+    double_buffer: 1,
+    partition_banks: 0,
+    transa: false,
+    transb: false, // must be true for SIMD
+    m: 64,
+    n: ${64 * experiment['n_tiles']},
+    k: 128,
+    alpha: 1,
+    beta: 0,
+    gemm_fp: "gemm_fp64_opt"
+}
diff --git a/experiments/summa_gemm/experiments.py b/experiments/summa_gemm/experiments.py
new file mode 100755
index 00000000..ca333764
--- /dev/null
+++ b/experiments/summa_gemm/experiments.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+from pathlib import Path
+import picobello as pb
+import snitch.util.experiments.experiment_utils as eu
+
+TCK = 5
+VERIFY_PY = Path(__file__).parent / '../../.deps/snitch_cluster/sw/kernels/blas/gemm/scripts/verify.py'
+
+
+class ExperimentManager(pb.ExperimentManager):
+
+    def derive_axes(self, experiment):
+        return eu.derive_axes_from_keys(experiment, ['mode', 'n_tiles'])
+
+    def derive_cdefines(self, experiment):
+        cdefs = {
+            'MODE': experiment['mode'].upper(),
+        }
+        return cdefs
+
+    def derive_data_cfg(self, experiment):
+        return eu.derive_data_cfg_from_template(experiment)
+
+
+def gen_experiments():
+    experiments = []
+    # for mode in ['hw']:
+    for mode in ['sw_tree']:
+        for n_tiles in [4]:
+        # for mode in ['sw_naive', 'sw_tree', 'hw']:
+            experiments.append({
+                'app': 'summa_gemm',
+                'cmd': pb.sim_and_verify_cmd(VERIFY_PY),
+                'mode': mode,
+                'n_tiles': n_tiles,
+            })
+    return experiments
+
+
+def main():
+    manager = ExperimentManager(gen_experiments())
+    manager.run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/experiments/summa_gemm/model.py b/experiments/summa_gemm/model.py
new file mode 100755
index 00000000..8d08bfc4
--- /dev/null
+++ b/experiments/summa_gemm/model.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import math
+import dma_multicast_v2 as multicast
+import reduction
+
+PREC = 8  # in bytes
+BEAT_BYTES = 64  # in bytes
+UTIL = 0.981  # median utilization from https://arxiv.org/pdf/2506.10921
+PEAKPERF = 16  # DPflop/cycle on a single cluster
+L1SIZE = 16 * 1024  # in bytes
+
+
+def beats(bytes):  # number of BEAT_BYTES-sized beats covering `bytes` bytes, rounded up
+    return math.ceil(bytes / BEAT_BYTES)
+
+
+def max_square_problem_size():  # largest integer s with 6 * s**2 * PREC <= L1SIZE
+    return math.floor(math.sqrt(L1SIZE / (6 * PREC)))
+
+
+def t_mcast(dim, bytes, impl='sw'):
+    n = beats(bytes)
+    if impl == 'sw':
+        return multicast.model.optimal_sw_runtime(dim, 1, n)
+    elif impl == 'seq':
+        return multicast.model.optimal_seq_runtime(dim, 1, n)
+    elif impl == 'tree':
+        return multicast.model.tree_runtime(dim, 1, n)
+    elif impl == 'hw':
+        return multicast.model.hw_runtime(dim, 1, n)
+    else:
+        raise ValueError(f"Unknown multicast implementation: {impl}")
+
+
+def t_mcast_a(c, Mt, Kt, impl='sw'):  # multicast of Mt * Kt elements of PREC bytes, dim=c
+    return t_mcast(c, Mt * Kt * PREC, impl)
+
+
+def t_mcast_b(r, Nt, Kt, impl='sw'):  # multicast of Nt * Kt elements of PREC bytes, dim=r
+    return t_mcast(r, Nt * Kt * PREC, impl)
+
+
+def t_summa_comm(r, c, Mt, Nt, Kt, impl='sw'):  # sum of the A and B multicast times
+    return t_mcast_a(c, Mt, Kt, impl=impl) + t_mcast_b(r, Nt, Kt, impl=impl)
+
+
+def t_comp(Mt, Nt, Kt):  # 2 * Mt * Nt * Kt flops at UTIL * PEAKPERF flops per cycle
+    return (2 * Mt * Nt * Kt) / (UTIL * PEAKPERF)
+
+
+def t_summa_gemm(r, c, Mt, Nt, Kt, impl='sw'):  # slower of communication and compute
+    return max(t_summa_comm(r, c, Mt, Nt, Kt, impl=impl), t_comp(Mt, Nt, Kt))
+
+
+def t_fcl_comm(r, c, Mt, Nt, Kt, impl='sw'):
+    # Time to load c*r submatrices of B (each of size Nt x Kt) from r memory
+    # tiles: total beats divided by r. `Mt` and `impl` are not used here.
+    return c * r * beats(Nt * Kt * PREC) / r
+
+
+def t_reduction(r, c, bytes, impl='sw'):
+    n = beats(bytes)
+    if impl == 'sw':
+        return reduction.model.optimal_sw_runtime(c, r, n)
+    elif impl == 'hw':
+        return reduction.model.hw_runtime(c, r, n)
+    else:
+        raise ValueError(f"Unknown reduction implementation: {impl}")
+
+
+def t_fcl_gemm(r, c, Mt, Nt, Kt, impl='sw'):
+    t_partial_result = max(t_fcl_comm(r, c, Mt, Nt, Kt, impl=impl), t_comp(Mt, Nt, Kt))
+    t_redu = t_reduction(r, c, Mt * Nt * PREC, impl=impl)
+    return t_partial_result + t_redu
diff --git a/experiments/summa_gemm/plot.py b/experiments/summa_gemm/plot.py
new file mode 100755
index 00000000..6d77bc6c
--- /dev/null
+++ b/experiments/summa_gemm/plot.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+import argparse
+import matplotlib.pyplot as plt
+from matplotlib.ticker import FuncFormatter
+import numpy as np
+import pandas as pd
+from summa_gemm import model
+
+
+def plot1(y_label=None, hide_x_axis=False, show=True):
+    # Get data
+    mesh_sizes = [4, 8, 16, 32, 64, 128, 256]
+    t_comm_sw = []
+    t_comm_hw = []
+    t_compute = []
+    Mt = model.max_square_problem_size()
+    for c in mesh_sizes:
+        r = c
+        t_comm_sw.append(model.t_summa_comm(r, c, Mt, Mt, Mt, impl='sw'))
+        t_comm_hw.append(model.t_summa_comm(r, c, Mt, Mt, Mt, impl='hw'))
+        t_compute.append(model.t_comp(Mt, Mt, Mt))
+
+    df = pd.DataFrame({
+        'size': mesh_sizes,
+        't_comm_sw': t_comm_sw,
+        't_comm_hw': t_comm_hw,
+        't_compute': t_compute,
+    })
+
+    # Prepare critical paths and speedups
+    df['t_sw'] = df[['t_comm_sw', 't_compute']].max(axis=1)
+    df['t_hw'] = df[['t_comm_hw', 't_compute']].max(axis=1)
+    df['speedup'] = df['t_sw'] / df['t_hw']
+
+    # Plot curves
+    fig, ax = plt.subplots()
+    ax.plot(mesh_sizes, t_comm_sw, marker='o', label='$T_{comm}$ (sw)')
+    ax.plot(mesh_sizes, t_comm_hw, marker='s', label='$T_{comm}$ (hw)')
+    ax.plot(mesh_sizes, t_compute, marker='^', label='$T_{comp}$')
+    ax.set_xscale('log', base=2)
+    ax.set_xticks(mesh_sizes)
+    ax.set_xlim(2.8, 370)
+    if hide_x_axis:
+        ax.set_xlabel('')
+        ax.tick_params(
+            axis='x',
+            which='both',
+            bottom=False, top=False,  # hide tick marks
+            labelbottom=False         # hide labels
+        )
+    else:
+        ax.set_xlabel('Mesh size')
+        ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: f"{int(x)}x{int(x)}"))
+    if y_label is None:
+        y_label = 'Runtime [cycles]'
+    ax.set_ylabel(y_label)
+    ax.set_axisbelow(True)
+    ax.grid(True, which="both", color='gainsboro')
+    ax.legend()
+    fig.tight_layout()
+
+    # Annotate plot with speedup arrows (when speedup > 1)
+    keys = ['size', 't_comm_sw', 't_comm_hw', 't_compute', 'speedup']
+    for x, sw, hw, t, sp in df[keys].itertuples(index=False, name=None):
+        if sp > 1.0:
+            y0 = sw
+            y1 = max(hw, t)
+
+            # Double-headed arrow from t_comm_sw to max(t_comm_hw, t_compute)
+            ax.annotate(
+                "",
+                xy=(x, y1),
+                xytext=(x, y0),
+                arrowprops=dict(arrowstyle="<->", lw=1.2),
+                annotation_clip=False,
+            )
+
+            # Arithmetic midpoint for linear scale
+            y_mid = 0.5 * (y0 + y1)
+
+            offset = 3
+            if sp > 1.2:
+                ha = "right"
+                xytext = (-offset, 0)
+            else:
+                ha = "left"
+                xytext = (offset, 0)
+            ax.annotate(
+                f"{sp:.2f}$\\times$",
+                xy=(x, y_mid),
+                xytext=xytext,
+                textcoords="offset points",
+                ha=ha,
+                va="center",
+                rotation=90 if ax.get_yscale() == "log" else 0,  # optional
+                annotation_clip=True,
+            )
+
+    if show:
+        plt.show()
+
+    return df
+
+
+def plot2(y_label=None, show=True):
+    # Get data
+    mesh_sizes = [4, 8, 16, 32, 64, 128, 256]
+    Mt = model.max_square_problem_size()
+    t_sw = []
+    t_hw = []
+    for size in mesh_sizes:
+        t_sw.append(model.t_fcl_gemm(size, size, Mt, Mt, Mt, impl='sw'))
+        t_hw.append(model.t_fcl_gemm(size, size, Mt, Mt, Mt, impl='hw'))
+    df = pd.DataFrame({
+        'size': mesh_sizes,
+        't_sw': t_sw,
+        't_hw': t_hw,
+    })
+
+    # Calculate speedup
+    df['speedup'] = df['t_sw'] / df['t_hw']
+
+    # Bar widths (to make them of equal width on a log scale)
+    logx = np.log2(df['size'].to_numpy(dtype=float))
+    mid = 0.5 * (logx[:-1] + logx[1:])                 # midpoints between neighbors
+    log_left_edges  = np.r_[logx[0] - (mid[0] - logx[0]), mid]   # extrapolate first
+    log_right_edges = np.r_[mid, logx[-1] + (logx[-1] - mid[-1])]# extrapolate last
+    fill = 0.6
+    log_span = (log_right_edges - log_left_edges) * fill
+    logL = logx - 0.5 * log_span
+    logR = logx + 0.5 * log_span
+    left  = 2.0**logL
+    right = 2.0**logR
+    width = right - left
+
+    # Plot
+    fig, ax = plt.subplots()
+    ax.axhline(y=1, color='black', linewidth=1, zorder=5)
+    ax.bar(left, df['speedup'], width=width, align='edge', zorder=10)
+    ax.set_xscale('log', base=2)
+    ax.set_xlim(2.8, 370)
+    ax.set_ylim(0.5, None)
+    ax.set_xticks(mesh_sizes)
+    ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: f"{int(x)}x{int(x)}"))
+    ax.set_xlabel('Mesh size')
+    if y_label is None:
+        y_label = 'Runtime [cycles]'
+    ax.set_ylabel(y_label)
+    ax.set_axisbelow(True)
+    ax.grid(axis='both', color='gainsboro')
+    fig.tight_layout()
+
+    if show:
+        plt.show()
+
+    return df
+
+
+def main():
+    # Parse arguments
+    functions = [plot1, plot2]
+    parser = argparse.ArgumentParser(description='Plot wide multicast results.')
+    parser.add_argument(
+        'plots',
+        nargs='*',
+        default='all',
+        choices=[f.__name__ for f in functions] + ['all'],
+        help='Plots to generate.'
+    )
+    args = parser.parse_args()
+
+    # Helper function to check if a plot was requested on the command line
+    def requested(plot):
+        return plot in args.plots or 'all' in args.plots
+
+    # Generate plots
+    if requested('plot1'):
+        plot1()
+    if requested('plot2'):
+        plot2()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/experiments/summa_gemm/roi.json.tpl b/experiments/summa_gemm/roi.json.tpl
new file mode 100644
index 00000000..4ed347bd
--- /dev/null
+++ b/experiments/summa_gemm/roi.json.tpl
@@ -0,0 +1,33 @@
+<%
+# ROI specification with one entry per hart of each of the 16 clusters
+# (9 harts per cluster, hart IDs starting at 1).
+n_tiles = experiment['n_tiles']
+%>
+[
+    % for cluster in range(16):
+        // Compute cores
+        % for core in range(8):
+        {
+            "thread": "hart_${cluster * 9 + core + 1}",
+            "roi": [
+            % for tile in range(n_tiles):
+                {"idx": ${2 * tile + 1}, "label": "tile_${tile}"},
+            % endfor
+            ]
+        },
+        % endfor
+
+        // DMA core
+        {
+            "thread": "hart_${cluster * 9 + 8 + 1}",
+            "roi": [
+                {"idx": 1, "label": "tile_in_0"},
+                % for tile in range(n_tiles - 1):
+                    {"idx": ${4 * tile + 3}, "label": "tile_in_${tile + 1}"},
+                    {"idx": ${4 * tile + 5}, "label": "tile_out_${tile}"},
+                % endfor
+                {"idx": ${n_tiles * 4 - 1}, "label": "tile_out_${n_tiles - 1}"},
+            ]
+        },
+    % endfor
+]
diff --git a/floo_picobello_noc_pkg_REDUCTION.sv b/floo_picobello_noc_pkg_REDUCTION.sv
new file mode 100644
index 00000000..cf82fdde
--- /dev/null
+++ b/floo_picobello_noc_pkg_REDUCTION.sv
@@ -0,0 +1,349 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// AUTOMATICALLY GENERATED! DO NOT EDIT!
+
+`include "axi/typedef.svh"
+`include "floo_noc/typedef.svh"
+`include "reduction/typedef.svh"
+
+package floo_picobello_noc_pkg;
+
+  import floo_pkg::*;
+
+  /////////////////////
+  //   Address Map   //
+  /////////////////////
+
+  typedef enum logic [4:0] {
+    ClusterX0Y0  = 0,
+    ClusterX0Y1  = 1,
+    ClusterX0Y2  = 2,
+    ClusterX0Y3  = 3,
+    ClusterX1Y0  = 4,
+    ClusterX1Y1  = 5,
+    ClusterX1Y2  = 6,
+    ClusterX1Y3  = 7,
+    ClusterX2Y0  = 8,
+    ClusterX2Y1  = 9,
+    ClusterX2Y2  = 10,
+    ClusterX2Y3  = 11,
+    ClusterX3Y0  = 12,
+    ClusterX3Y1  = 13,
+    ClusterX3Y2  = 14,
+    ClusterX3Y3  = 15,
+    Cheshire     = 16,
+    FhgSpu       = 17,
+    TopSpmNarrow = 18,
+    TopSpmWide   = 19,
+    L2Spm0       = 20,
+    L2Spm1       = 21,
+    L2Spm2       = 22,
+    L2Spm3       = 23,
+    L2Spm4       = 24,
+    L2Spm5       = 25,
+    L2Spm6       = 26,
+    L2Spm7       = 27,
+    NumEndpoints = 28
+  } ep_id_e;
+
+
+
+  typedef enum logic [4:0] {
+    ClusterX0Y0SamIdx      = 0,
+    ClusterX0Y1SamIdx      = 1,
+    ClusterX0Y2SamIdx      = 2,
+    ClusterX0Y3SamIdx      = 3,
+    ClusterX1Y0SamIdx      = 4,
+    ClusterX1Y1SamIdx      = 5,
+    ClusterX1Y2SamIdx      = 6,
+    ClusterX1Y3SamIdx      = 7,
+    ClusterX2Y0SamIdx      = 8,
+    ClusterX2Y1SamIdx      = 9,
+    ClusterX2Y2SamIdx      = 10,
+    ClusterX2Y3SamIdx      = 11,
+    ClusterX3Y0SamIdx      = 12,
+    ClusterX3Y1SamIdx      = 13,
+    ClusterX3Y2SamIdx      = 14,
+    ClusterX3Y3SamIdx      = 15,
+    CheshireExternalSamIdx = 16,
+    CheshireInternalSamIdx = 17,
+    FhgSpuSamIdx           = 18,
+    TopSpmNarrowSamIdx     = 19,
+    TopSpmWideSamIdx       = 20,
+    L2Spm0SamIdx           = 21,
+    L2Spm1SamIdx           = 22,
+    L2Spm2SamIdx           = 23,
+    L2Spm3SamIdx           = 24,
+    L2Spm4SamIdx           = 25,
+    L2Spm5SamIdx           = 26,
+    L2Spm6SamIdx           = 27,
+    L2Spm7SamIdx           = 28
+  } sam_idx_e;
+
+
+
+  typedef logic [0:0] rob_idx_t;
+  typedef logic [0:0] port_id_t;
+  typedef logic [3:0] x_bits_t;
+  typedef logic [1:0] y_bits_t;
+  typedef struct packed {
+    x_bits_t  x;
+    y_bits_t  y;
+    port_id_t port_id;
+  } id_t;
+
+  typedef logic route_t;
+
+
+  localparam int unsigned SamNumRules = 29;
+
+  typedef struct packed {
+    id_t         idx;
+    logic [47:0] start_addr;
+    logic [47:0] end_addr;
+  } sam_rule_t;
+
+  localparam sam_rule_t [SamNumRules-1:0] Sam = '{
+      '{
+          idx: '{x: 8, y: 3, port_id: 0},
+          start_addr: 48'h000070700000,
+          end_addr: 48'h000070800000
+      },  // L2Spm7
+      '{
+          idx: '{x: 8, y: 2, port_id: 0},
+          start_addr: 48'h000070600000,
+          end_addr: 48'h000070700000
+      },  // L2Spm6
+      '{
+          idx: '{x: 8, y: 1, port_id: 0},
+          start_addr: 48'h000070500000,
+          end_addr: 48'h000070600000
+      },  // L2Spm5
+      '{
+          idx: '{x: 8, y: 0, port_id: 0},
+          start_addr: 48'h000070400000,
+          end_addr: 48'h000070500000
+      },  // L2Spm4
+      '{
+          idx: '{x: 0, y: 3, port_id: 0},
+          start_addr: 48'h000070300000,
+          end_addr: 48'h000070400000
+      },  // L2Spm3
+      '{
+          idx: '{x: 0, y: 2, port_id: 0},
+          start_addr: 48'h000070200000,
+          end_addr: 48'h000070300000
+      },  // L2Spm2
+      '{
+          idx: '{x: 0, y: 1, port_id: 0},
+          start_addr: 48'h000070100000,
+          end_addr: 48'h000070200000
+      },  // L2Spm1
+      '{
+          idx: '{x: 0, y: 0, port_id: 0},
+          start_addr: 48'h000070000000,
+          end_addr: 48'h000070100000
+      },  // L2Spm0
+      '{
+          idx: '{x: 9, y: 1, port_id: 0},
+          start_addr: 48'h000060040000,
+          end_addr: 48'h000060080000
+      },  // TopSpmWide
+      '{
+          idx: '{x: 9, y: 2, port_id: 0},
+          start_addr: 48'h000060000000,
+          end_addr: 48'h000060040000
+      },  // TopSpmNarrow
+      '{
+          idx: '{x: 9, y: 0, port_id: 0},
+          start_addr: 48'h000040000000,
+          end_addr: 48'h000040040000
+      },  // FhgSpu
+      '{
+          idx: '{x: 9, y: 3, port_id: 0},
+          start_addr: 48'h000000000000,
+          end_addr: 48'h000020000000
+      },  // CheshireInternal
+      '{
+          idx: '{x: 9, y: 3, port_id: 0},
+          start_addr: 48'h000080000000,
+          end_addr: 48'h020000000000
+      },  // CheshireExternal
+      '{
+          idx: '{x: 7, y: 3, port_id: 0},
+          start_addr: 48'h0000203c0000,
+          end_addr: 48'h000020400000
+      },  // ClusterX3Y3
+      '{
+          idx: '{x: 7, y: 2, port_id: 0},
+          start_addr: 48'h000020380000,
+          end_addr: 48'h0000203c0000
+      },  // ClusterX3Y2
+      '{
+          idx: '{x: 7, y: 1, port_id: 0},
+          start_addr: 48'h000020340000,
+          end_addr: 48'h000020380000
+      },  // ClusterX3Y1
+      '{
+          idx: '{x: 7, y: 0, port_id: 0},
+          start_addr: 48'h000020300000,
+          end_addr: 48'h000020340000
+      },  // ClusterX3Y0
+      '{
+          idx: '{x: 6, y: 3, port_id: 0},
+          start_addr: 48'h0000202c0000,
+          end_addr: 48'h000020300000
+      },  // ClusterX2Y3
+      '{
+          idx: '{x: 6, y: 2, port_id: 0},
+          start_addr: 48'h000020280000,
+          end_addr: 48'h0000202c0000
+      },  // ClusterX2Y2
+      '{
+          idx: '{x: 6, y: 1, port_id: 0},
+          start_addr: 48'h000020240000,
+          end_addr: 48'h000020280000
+      },  // ClusterX2Y1
+      '{
+          idx: '{x: 6, y: 0, port_id: 0},
+          start_addr: 48'h000020200000,
+          end_addr: 48'h000020240000
+      },  // ClusterX2Y0
+      '{
+          idx: '{x: 5, y: 3, port_id: 0},
+          start_addr: 48'h0000201c0000,
+          end_addr: 48'h000020200000
+      },  // ClusterX1Y3
+      '{
+          idx: '{x: 5, y: 2, port_id: 0},
+          start_addr: 48'h000020180000,
+          end_addr: 48'h0000201c0000
+      },  // ClusterX1Y2
+      '{
+          idx: '{x: 5, y: 1, port_id: 0},
+          start_addr: 48'h000020140000,
+          end_addr: 48'h000020180000
+      },  // ClusterX1Y1
+      '{
+          idx: '{x: 5, y: 0, port_id: 0},
+          start_addr: 48'h000020100000,
+          end_addr: 48'h000020140000
+      },  // ClusterX1Y0
+      '{
+          idx: '{x: 4, y: 3, port_id: 0},
+          start_addr: 48'h0000200c0000,
+          end_addr: 48'h000020100000
+      },  // ClusterX0Y3
+      '{
+          idx: '{x: 4, y: 2, port_id: 0},
+          start_addr: 48'h000020080000,
+          end_addr: 48'h0000200c0000
+      },  // ClusterX0Y2
+      '{
+          idx: '{x: 4, y: 1, port_id: 0},
+          start_addr: 48'h000020040000,
+          end_addr: 48'h000020080000
+      },  // ClusterX0Y1
+      '{
+          idx: '{x: 4, y: 0, port_id: 0},
+          start_addr: 48'h000020000000,
+          end_addr: 48'h000020040000
+      }  // ClusterX0Y0
+
+  };
+
+
+
+  localparam route_cfg_t RouteCfg = '{
+      RouteAlgo: XYRouting,
+      UseIdTable: 1'b1,
+      XYAddrOffsetX: 41,
+      XYAddrOffsetY: 45,
+      IdAddrOffset: 0,
+      NumSamRules: 29,
+      NumRoutes: 0,
+      CollectiveCfg: '{
+        OpCfg: '{
+            EnNarrowMulticast:  1'b1,
+            EnWideMulticast:    1'b1,
+            EnLSBAnd:           1'b1,
+            EnF_Add:            1'b1,
+            EnF_Mul:            1'b1,
+            EnF_Min:            1'b1,
+            EnF_Max:            1'b1,
+            default:            '0
+        },
+        RedCfg: ReductionDefaultCfg
+      }
+  };
+
+
+  typedef logic [47:0] axi_narrow_in_addr_t;
+  typedef logic [63:0] axi_narrow_in_data_t;
+  typedef logic [7:0] axi_narrow_in_strb_t;
+  typedef logic [4:0] axi_narrow_in_id_t;
+  typedef logic [2:0] axi_narrow_in_user_t;
+  `AXI_TYPEDEF_ALL_CT(axi_narrow_in, axi_narrow_in_req_t, axi_narrow_in_rsp_t, axi_narrow_in_addr_t,
+                      axi_narrow_in_id_t, axi_narrow_in_data_t, axi_narrow_in_strb_t,
+                      axi_narrow_in_user_t)
+
+
+  typedef logic [47:0] axi_narrow_out_addr_t;
+  typedef logic [63:0] axi_narrow_out_data_t;
+  typedef logic [7:0] axi_narrow_out_strb_t;
+  typedef logic [1:0] axi_narrow_out_id_t;
+  typedef logic [2:0] axi_narrow_out_user_t;
+  `AXI_TYPEDEF_ALL_CT(axi_narrow_out, axi_narrow_out_req_t, axi_narrow_out_rsp_t,
+                      axi_narrow_out_addr_t, axi_narrow_out_id_t, axi_narrow_out_data_t,
+                      axi_narrow_out_strb_t, axi_narrow_out_user_t)
+
+
+  typedef logic [47:0] axi_wide_in_addr_t;
+  typedef logic [511:0] axi_wide_in_data_t;
+  typedef logic [63:0] axi_wide_in_strb_t;
+  typedef logic [2:0] axi_wide_in_id_t;
+  typedef logic [0:0] axi_wide_in_user_t;
+  `AXI_TYPEDEF_ALL_CT(axi_wide_in, axi_wide_in_req_t, axi_wide_in_rsp_t, axi_wide_in_addr_t,
+                      axi_wide_in_id_t, axi_wide_in_data_t, axi_wide_in_strb_t, axi_wide_in_user_t)
+
+
+  typedef logic [47:0] axi_wide_out_addr_t;
+  typedef logic [511:0] axi_wide_out_data_t;
+  typedef logic [63:0] axi_wide_out_strb_t;
+  typedef logic [0:0] axi_wide_out_id_t;
+  typedef logic [0:0] axi_wide_out_user_t;
+  `AXI_TYPEDEF_ALL_CT(axi_wide_out, axi_wide_out_req_t, axi_wide_out_rsp_t, axi_wide_out_addr_t,
+                      axi_wide_out_id_t, axi_wide_out_data_t, axi_wide_out_strb_t,
+                      axi_wide_out_user_t)
+
+
+
+  `FLOO_TYPEDEF_HDR_T(hdr_t, id_t, id_t, nw_ch_e, rob_idx_t, id_t, collect_op_e)
+  localparam axi_cfg_t AxiCfgN = '{
+      AddrWidth: 48,
+      DataWidth: 64,
+      UserWidth: 3,
+      InIdWidth: 5,
+      OutIdWidth: 2
+  };
+  localparam axi_cfg_t AxiCfgW = '{
+      AddrWidth: 48,
+      DataWidth: 512,
+      UserWidth: 1,
+      InIdWidth: 3,
+      OutIdWidth: 1
+  };
+  `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_in, axi_wide_in, AxiCfgN, AxiCfgW,
+                            hdr_t)
+
+  //TODO (lleone): all this must be generated
+  `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 2, 2)
+   typedef logic [AxiCfgW.DataWidth-1:0] data_wide_width;
+   typedef logic [AxiCfgN.DataWidth-1:0] data_narrow_width;
+  `RED_TYPEDEF_REQ_RSP_LINK(wide, data_wide_width, wide_req, wide_rsp)
+  `RED_TYPEDEF_REQ_RSP_LINK(narrow, data_narrow_width, narrow_req, narrow_rsp)
+
+
+endpackage
diff --git a/hw/cheshire_tile.sv b/hw/cheshire_tile.sv
index ed406db1..e2af0086 100644
--- a/hw/cheshire_tile.sv
+++ b/hw/cheshire_tile.sv
@@ -8,6 +8,8 @@
 `include "cheshire/typedef.svh"
 `include "floo_noc/typedef.svh"
 `include "axi/assign.svh"
+`include "common_cells/assertions.svh"
+
 
 module cheshire_tile
   import cheshire_pkg::*;
@@ -104,7 +106,8 @@ module cheshire_tile
 
   floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in;
   floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in;
-  floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in;
+  floo_wide_t [Eject:North] router_floo_wide_in;
+  floo_wide_t [Eject:North] router_floo_wide_out;
 
   floo_nw_router #(
     .AxiCfgN     (AxiCfgN),
@@ -117,7 +120,10 @@ module cheshire_tile
     .hdr_t       (hdr_t),
     .floo_req_t  (floo_req_t),
     .floo_rsp_t  (floo_rsp_t),
-    .floo_wide_t (floo_wide_t)
+    .floo_wide_t (floo_wide_t),
+    // .floo_wide_out_t (floo_wide_double_t),
+    .WideRwDecouple (WideRwDecouple),
+    .VcImpl    (VcImplementation)
   ) i_router (
     .clk_i,
     .rst_ni,
@@ -129,7 +135,13 @@ module cheshire_tile
     .floo_req_o    (router_floo_req_out),
     .floo_rsp_i    (router_floo_rsp_in),
     .floo_wide_i   (router_floo_wide_in),
-    .floo_wide_o   (router_floo_wide_out)
+    .floo_wide_o   (router_floo_wide_out),
+    // Wide Reduction offload port
+    .offload_wide_req_o   (    ),
+    .offload_wide_rsp_i   ( '0 ),
+    // Narrow Reduction offload port
+    .offload_narrow_req_o (   ),
+    .offload_narrow_rsp_i ('0 )
   );
 
   assign floo_req_west_o            = router_floo_req_out[West];
@@ -144,6 +156,13 @@ module cheshire_tile
   assign router_floo_rsp_in[North]  = '0;  // No North port in this tile
   assign router_floo_rsp_in[East]   = '0;  // No East port in this tile
   assign router_floo_rsp_in[South]  = floo_rsp_south_i;
+  //TODO(colluca): Merge with floo_wide
+  // assign floo_wide_west_o.valid           = router_floo_wide_out[West].valid;
+  // assign floo_wide_west_o.ready           = router_floo_wide_out[West].ready;
+  // assign floo_wide_west_o.wide           = router_floo_wide_out[West].wide[0];
+  // assign floo_wide_south_o.valid          = router_floo_wide_out[South].valid;
+  // assign floo_wide_south_o.ready          = router_floo_wide_out[South].ready;
+  // assign floo_wide_south_o.wide           = router_floo_wide_out[South].wide[0];
   assign floo_wide_west_o           = router_floo_wide_out[West];
   assign floo_wide_south_o          = router_floo_wide_out[South];
   assign router_floo_wide_in[West]  = floo_wide_west_i;
@@ -172,6 +191,8 @@ module cheshire_tile
     .ChimneyCfgW         (ChimneyCfgW),
     .RouteCfg            (RouteCfgNoMcast),
     .AtopSupport         (1'b1),
+    .WideRwDecouple      (WideRwDecouple),
+    .VcImpl              (VcImplementation),
     .MaxAtomicTxns       (AxiCfgN.OutIdWidth - 1),
     .Sam                 (Sam),
     .id_t                (id_t),
@@ -188,7 +209,10 @@ module cheshire_tile
     .axi_wide_out_rsp_t  (axi_wide_out_rsp_t),
     .floo_req_t          (floo_req_t),
     .floo_rsp_t          (floo_rsp_t),
-    .floo_wide_t         (floo_wide_t)
+    .floo_wide_t         (floo_wide_t),
+    // .floo_wide_in_t         (floo_wide_double_t),
+    .user_narrow_struct_t             (collective_narrow_user_t),
+    .user_wide_struct_t               (collective_wide_user_t)
   ) i_chimney (
     .clk_i,
     .rst_ni,
@@ -472,4 +496,14 @@ module cheshire_tile
   assign fhg_spu_rst_no   = control_reg.fhg_spu_rsts.rst.value;
   assign fhg_spu_clk_en_o = control_reg.fhg_spu_clk_enables.clk_en.value;
 
+  // Add Assertion that no multicast / reduction can enter this tile!
+  for (genvar r = 0; r < 4; r++) begin : gen_virt
+    `ASSERT(NoCollectivOperation_NReq_In, (!router_floo_req_in[r].valid | (router_floo_req_in[r].req[0].generic.hdr.collective_op == Unicast)))
+    `ASSERT(NoCollectivOperation_NRsp_In, (!router_floo_rsp_in[r].valid | (router_floo_rsp_in[r].rsp[0].generic.hdr.collective_op == Unicast)))
+    `ASSERT(NoCollectivOperation_NWide_In, (!router_floo_wide_in[r].valid | (router_floo_wide_in[r].wide[0].generic.hdr.collective_op == Unicast)))
+    `ASSERT(NoCollectivOperation_NReq_Out, (!router_floo_req_out[r].valid | (router_floo_req_out[r].req[0].generic.hdr.collective_op == Unicast)))
+    `ASSERT(NoCollectivOperation_NRsp_Out, (!router_floo_rsp_out[r].valid | (router_floo_rsp_out[r].rsp[0].generic.hdr.collective_op == Unicast)))
+    `ASSERT(NoCollectivOperation_NWide_Out, (!router_floo_wide_out[r].valid | (router_floo_wide_out[r].wide[0].generic.hdr.collective_op == Unicast)))
+  end
+
 endmodule
diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv
index 1f8cf413..af101f30 100644
--- a/hw/cluster_tile.sv
+++ b/hw/cluster_tile.sv
@@ -27,6 +27,7 @@ module cluster_tile
   input  logic                      [NrCores-1:0] msip_i,
   input  logic                      [        9:0] hart_base_id_i,
   input  snitch_cluster_pkg::addr_t               cluster_base_addr_i,
+  input  snitch_cluster_pkg::addr_t               cluster_base_offset_i,
   // Chimney ports
   input  id_t                                     id_i,
   // Router ports
@@ -70,7 +71,7 @@ module cluster_tile
 
   `AXI_TYPEDEF_ALL(cluster_narrow_out_dw_conv, snitch_cluster_pkg::addr_t,
                    snitch_cluster_pkg::narrow_out_id_t, data_hwpe_ctrl_t, strb_hwpe_ctrl_t,
-                   snitch_cluster_pkg::user_t)
+                   snitch_cluster_pkg::user_narrow_t)
 
   cluster_narrow_out_dw_conv_req_t cluster_narrow_out_dw_conv_req, cluster_narrow_out_cut_req;
   cluster_narrow_out_dw_conv_resp_t cluster_narrow_out_dw_conv_rsp, cluster_narrow_out_cut_rsp;
@@ -82,6 +83,112 @@ module cluster_tile
 
   logic          [NrCores-1:0] mxip;
 
+
+  ////////////////////////
+  // Wide FPU Reduction //
+  ////////////////////////
+
+  // Snitch cluster DCA interface
+  snitch_cluster_pkg::dca_req_t offload_dca_req, offload_dca_req_cut;
+  snitch_cluster_pkg::dca_rsp_t offload_dca_rsp, offload_dca_rsp_cut;
+
+  // Signals to connect the NW router to the wide parser
+  red_wide_req_t offload_wide_req;
+  red_wide_rsp_t offload_wide_rsp;
+
+  // Translate the wide offload request from the router into a Snitch cluster DCA request.
+  // TODO(raroth): possible to remove this decode from the picobello repo and move it inside the
+  //       FlooNoC repo. Currently the Decode used for the ALU is directly inside the floo_alu.sv
+  //       file. Maybe do the same for the FPU
+  if(is_en_wide_reduction(RouteCfg.CollectiveCfg.OpCfg)) begin : gen_wide_offload_reduction
+    // Connect the DCA Request
+    assign offload_dca_req.q_valid = offload_wide_req.valid;
+    assign offload_wide_rsp.ready  = offload_dca_rsp.q_ready;
+
+    // Parse the FPU Request
+    always_comb begin
+      // Init default values
+      offload_dca_req.q.operands = '0;
+
+      // Set default Values
+      offload_dca_req.q.src_fmt = fpnew_pkg::FP64;
+      offload_dca_req.q.dst_fmt = fpnew_pkg::FP64;
+      offload_dca_req.q.int_fmt = fpnew_pkg::INT64;
+      offload_dca_req.q.vectorial_op = 1'b0;
+      offload_dca_req.q.op_mod = 1'b0;
+      offload_dca_req.q.rnd_mode = fpnew_pkg::RNE;
+      offload_dca_req.q.op = fpnew_pkg::ADD;
+
+      // Define the operation we want to execute on the FPU
+      unique casez (offload_wide_req.req.op)
+        (floo_pkg::F_Add) : begin
+          offload_dca_req.q.op = fpnew_pkg::ADD;
+          offload_dca_req.q.operands[0] = '0;
+          offload_dca_req.q.operands[1] = offload_wide_req.req.operand1;
+          offload_dca_req.q.operands[2] = offload_wide_req.req.operand2;
+        end
+        (floo_pkg::F_Mul) : begin
+          offload_dca_req.q.op = fpnew_pkg::MUL;
+          offload_dca_req.q.operands[0] = offload_wide_req.req.operand1;
+          offload_dca_req.q.operands[1] = offload_wide_req.req.operand2;
+          offload_dca_req.q.operands[2] = '0;
+        end
+        (floo_pkg::F_Max) : begin
+          offload_dca_req.q.op = fpnew_pkg::MINMAX;
+          offload_dca_req.q.rnd_mode = fpnew_pkg::RTZ;  // FPnew MINMAX: RTZ selects maximum
+          offload_dca_req.q.operands[0] = offload_wide_req.req.operand1;
+          offload_dca_req.q.operands[1] = offload_wide_req.req.operand2;
+          offload_dca_req.q.operands[2] = '0;
+        end
+        (floo_pkg::F_Min) : begin
+          offload_dca_req.q.op = fpnew_pkg::MINMAX;
+          offload_dca_req.q.rnd_mode = fpnew_pkg::RNE;  // FPnew MINMAX: RNE selects minimum
+          offload_dca_req.q.operands[0] = offload_wide_req.req.operand1;
+          offload_dca_req.q.operands[1] = offload_wide_req.req.operand2;
+          offload_dca_req.q.operands[2] = '0;
+        end
+        default : begin
+          offload_dca_req.q.op = fpnew_pkg::ADD;
+          offload_dca_req.q.operands[0] = '0;
+          offload_dca_req.q.operands[1] = '0;
+          offload_dca_req.q.operands[2] = '0;
+        end
+      endcase
+    end
+
+    // TODO(raroth): move these spill register inside FlooNoC and make them configurable.
+    // Insert a reqrsp-cut to avoid timing violations
+    // If CutOffloadIntf is set, the cut is already in the offload controller: bypass this one.
+    generic_reqrsp_cut #(
+      .req_chan_t (snitch_cluster_pkg::dca_req_chan_t),
+      .rsp_chan_t (snitch_cluster_pkg::dca_rsp_chan_t),
+      .BypassReq  (WideReductionCfg.CutOffloadIntf),
+      .BypassRsp  (WideReductionCfg.CutOffloadIntf)
+    ) i_dca_router_cut (
+      .clk_i          (clk_i),
+      .rst_ni         (rst_ni),
+      .slv_req_i      (offload_dca_req),
+      .slv_rsp_o      (offload_dca_rsp),
+      .mst_req_o      (offload_dca_req_cut),
+      .mst_rsp_i      (offload_dca_rsp_cut)
+    );
+    // Connect the Response
+    assign offload_wide_rsp.valid = offload_dca_rsp.p_valid;
+    assign offload_dca_req.p_ready = offload_wide_req.ready;
+    assign offload_wide_rsp.rsp.result = offload_dca_rsp.p.result;
+
+  // No Wide Reduction supported
+  end else begin : gen_no_wide_reduction
+    assign offload_dca_req_cut = '0;
+    assign offload_dca_rsp = '0;
+    assign offload_wide_rsp.ready = '0;
+    assign offload_wide_rsp.rsp.result = '0;
+    assign offload_wide_rsp.valid = '0;
+  end
+
+  // TODO(lleone): Add the narrow ALU reduction unit and connections here
+
+
   snitch_cluster_wrapper i_cluster (
     .clk_i            (tile_clk),
     .rst_ni           (tile_rst_n),
@@ -91,21 +198,36 @@ module cluster_tile
     .msip_i,
     .hart_base_id_i,
     .cluster_base_addr_i,
-    .mxip_i           (mxip),
-    .clk_d2_bypass_i  ('0),
-    .sram_cfgs_i      ('0),
-    .narrow_in_req_i  (cluster_narrow_in_req),
-    .narrow_in_resp_o (cluster_narrow_in_rsp),
-    .narrow_out_req_o (cluster_narrow_out_req),
-    .narrow_out_resp_i(cluster_narrow_out_rsp),
-    .wide_out_req_o   (cluster_wide_out_req),
-    .wide_out_resp_i  (cluster_wide_out_rsp),
-    .wide_in_req_i    (cluster_wide_in_req),
-    .wide_in_resp_o   (cluster_wide_in_rsp),
-    .narrow_ext_req_o (cluster_narrow_ext_req),
-    .narrow_ext_resp_i(cluster_narrow_ext_rsp),
-    .tcdm_ext_req_i   (cluster_tcdm_ext_req_aligned),
-    .tcdm_ext_resp_o  (cluster_tcdm_ext_rsp_aligned)
+    .cluster_base_offset_i,
+    .mxip_i             (mxip),
+    .clk_d2_bypass_i    ('0),
+    .sram_cfgs_i        ('0),
+    .narrow_in_req_i    (cluster_narrow_in_req),
+    .narrow_in_resp_o   (cluster_narrow_in_rsp),
+    .narrow_out_req_o   (cluster_narrow_out_req),
+    .narrow_out_resp_i  (cluster_narrow_out_rsp),
+    .wide_out_req_o     (cluster_wide_out_req),
+    .wide_out_resp_i    (cluster_wide_out_rsp),
+    .wide_in_req_i      (cluster_wide_in_req),
+    .wide_in_resp_o     (cluster_wide_in_rsp),
+    .narrow_ext_req_o   (cluster_narrow_ext_req),
+    .narrow_ext_resp_i  (cluster_narrow_ext_rsp),
+    .tcdm_ext_req_i     (cluster_tcdm_ext_req_aligned),
+    .tcdm_ext_resp_o    (cluster_tcdm_ext_rsp_aligned),
+    .dca_req_i          (offload_dca_req_cut),
+    .dca_rsp_o          (offload_dca_rsp_cut),
+    .x_issue_req_o       (),
+    .x_issue_resp_i      ('0),
+    .x_issue_valid_o     (),
+    .x_issue_ready_i     ('0),
+    .x_register_o        (),
+    .x_register_valid_o  (),
+    .x_register_ready_i  ('0),
+    .x_commit_o          (),
+    .x_commit_valid_o    (),
+    .x_result_i          ('0),
+    .x_result_valid_i    ('0),
+    .x_result_ready_o    ()
   );
 
   if (UseHWPE) begin : gen_hwpe
@@ -218,13 +340,17 @@ module cluster_tile
 
   floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in;
   floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in;
-  floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in;
+  floo_wide_t [Eject:North] router_floo_wide_in;
+  floo_wide_t [Eject:North] router_floo_wide_out;
+
 
   floo_nw_router #(
     .AxiCfgN     (AxiCfgN),
     .AxiCfgW     (AxiCfgW),
-    .EnMultiCast (RouteCfg.EnMultiCast),
     .RouteAlgo   (RouteCfg.RouteAlgo),
+    .WideRwDecouple (WideRwDecouple),
+    .VcImpl      (VcImplementation),
+    .NoLoopback  (1'b0),
     .NumRoutes   (5),
     .InFifoDepth (2),
     .OutFifoDepth(2),
@@ -232,7 +358,14 @@ module cluster_tile
     .hdr_t       (hdr_t),
     .floo_req_t  (floo_req_t),
     .floo_rsp_t  (floo_rsp_t),
-    .floo_wide_t (floo_wide_t)
+    .floo_wide_t (floo_wide_t),
+    .red_wide_req_t   (red_wide_req_t),
+    .red_wide_rsp_t   (red_wide_rsp_t),
+    .red_narrow_req_t (red_narrow_req_t),
+    .red_narrow_rsp_t (red_narrow_rsp_t),
+    .CollectiveOpCfg          (RouteCfg.CollectiveCfg.OpCfg),
+    .RdWideCfg                (WideReductionCfg),
+    .RdNarrowCfg              (NarrowReductionCfg)
   ) i_router (
     .clk_i,
     .rst_ni,
@@ -244,15 +377,21 @@ module cluster_tile
     .floo_req_o    (router_floo_req_out),
     .floo_rsp_i    (router_floo_rsp_in),
     .floo_wide_i   (router_floo_wide_in),
-    .floo_wide_o   (router_floo_wide_out)
+    .floo_wide_o   (router_floo_wide_out),
+    // Wide Reduction offload port
+    .offload_wide_req_o   ( offload_wide_req ),
+    .offload_wide_rsp_i   ( offload_wide_rsp ),
+    // Narrow Reduction offload port
+    .offload_narrow_req_o (   ),
+    .offload_narrow_rsp_i ('0 )
   );
 
   assign floo_req_o                      = router_floo_req_out[West:North];
   assign router_floo_req_in[West:North]  = floo_req_i;
   assign floo_rsp_o                      = router_floo_rsp_out[West:North];
   assign router_floo_rsp_in[West:North]  = floo_rsp_i;
-  assign floo_wide_o                     = router_floo_wide_out[West:North];
   assign router_floo_wide_in[West:North] = floo_wide_i;
+  assign floo_wide_o[West:North] =router_floo_wide_out[West:North];
 
   /////////////
   // Chimney //
@@ -265,7 +404,9 @@ module cluster_tile
     .ChimneyCfgW         (floo_pkg::ChimneyDefaultCfg),
     .RouteCfg            (floo_picobello_noc_pkg::RouteCfg),
     .AtopSupport         (1'b1),
-    .MaxAtomicTxns       (1),
+    .WideRwDecouple      (WideRwDecouple),
+    .VcImpl              (VcImplementation),
+    .MaxAtomicTxns       (3),
     .Sam                 (picobello_pkg::SamMcast),
     .id_t                (floo_picobello_noc_pkg::id_t),
     .rob_idx_t           (floo_picobello_noc_pkg::rob_idx_t),
@@ -285,7 +426,8 @@ module cluster_tile
     .floo_rsp_t          (floo_picobello_noc_pkg::floo_rsp_t),
     .floo_wide_t         (floo_picobello_noc_pkg::floo_wide_t),
     .sram_cfg_t          (snitch_cluster_pkg::sram_cfg_t),
-    .user_struct_t       (picobello_pkg::mcast_user_t)
+    .user_narrow_struct_t         (picobello_pkg::collective_narrow_user_t),
+    .user_wide_struct_t           (picobello_pkg::collective_wide_user_t)
   ) i_chimney (
     .clk_i               (tile_clk),
     .rst_ni              (tile_rst_n),
diff --git a/hw/dummy_tile.sv b/hw/dummy_tile.sv
index c42f1d38..2dafd6a9 100644
--- a/hw/dummy_tile.sv
+++ b/hw/dummy_tile.sv
@@ -29,7 +29,8 @@ module dummy_tile
 
   floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in;
   floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in;
-  floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in;
+  floo_wide_t [Eject:North] router_floo_wide_in;
+  floo_wide_t [Eject:North] router_floo_wide_out;
 
   floo_nw_router #(
     .AxiCfgN     (AxiCfgN),
@@ -42,7 +43,10 @@ module dummy_tile
     .hdr_t       (hdr_t),
     .floo_req_t  (floo_req_t),
     .floo_rsp_t  (floo_rsp_t),
-    .floo_wide_t (floo_wide_t)
+    .floo_wide_t (floo_wide_t),
+    // .floo_wide_out_t (floo_wide_double_t),
+    .WideRwDecouple      (WideRwDecouple),
+    .VcImpl              (VcImplementation)
   ) i_router (
     .clk_i,
     .rst_ni,
@@ -54,15 +58,27 @@ module dummy_tile
     .floo_req_o    (router_floo_req_out),
     .floo_rsp_i    (router_floo_rsp_in),
     .floo_wide_i   (router_floo_wide_in),
-    .floo_wide_o   (router_floo_wide_out)
+    .floo_wide_o   (router_floo_wide_out),
+    // Wide Reduction offload port
+    .offload_wide_req_o   (    ),
+    .offload_wide_rsp_i   ( '0 ),
+    // Narrow Reduction offload port
+    .offload_narrow_req_o (   ),
+    .offload_narrow_rsp_i ('0 )
   );
 
   assign floo_req_o                      = router_floo_req_out[West:North];
   assign router_floo_req_in[West:North]  = floo_req_i;
   assign floo_rsp_o                      = router_floo_rsp_out[West:North];
   assign router_floo_rsp_in[West:North]  = floo_rsp_i;
-  assign floo_wide_o                     = router_floo_wide_out[West:North];
+  // Only the local port uses both physical channels. Other outputs use only the lower.
+  // for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o
+  //   assign floo_wide_o[i].valid = router_floo_wide_out[i].valid;
+  //   assign floo_wide_o[i].ready = router_floo_wide_out[i].ready;
+  //   assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0];
+  // end
   assign router_floo_wide_in[West:North] = floo_wide_i;
+  assign floo_wide_o[West:North] =router_floo_wide_out[West:North];
 
   // Tie the router’s Eject input ports to 0
   assign router_floo_req_in[Eject]       = '0;
diff --git a/hw/fhg_spu_tile.sv b/hw/fhg_spu_tile.sv
index ad0636e7..5db4fa91 100644
--- a/hw/fhg_spu_tile.sv
+++ b/hw/fhg_spu_tile.sv
@@ -47,7 +47,8 @@ module fhg_spu_tile
 
   floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in;
   floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in;
-  floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in;
+  floo_wide_t [Eject:North] router_floo_wide_in;
+  floo_wide_t [Eject:North] router_floo_wide_out;
 
   floo_nw_router #(
     .AxiCfgN     (AxiCfgN),
@@ -60,7 +61,10 @@ module fhg_spu_tile
     .hdr_t       (hdr_t),
     .floo_req_t  (floo_req_t),
     .floo_rsp_t  (floo_rsp_t),
-    .floo_wide_t (floo_wide_t)
+    .floo_wide_t (floo_wide_t),
+    // .floo_wide_out_t (floo_wide_double_t),
+    .WideRwDecouple      (WideRwDecouple),
+    .VcImpl    (VcImplementation)
   ) i_router (
     .clk_i,
     .rst_ni,
@@ -72,7 +76,13 @@ module fhg_spu_tile
     .floo_req_o    (router_floo_req_out),
     .floo_rsp_i    (router_floo_rsp_in),
     .floo_wide_i   (router_floo_wide_in),
-    .floo_wide_o   (router_floo_wide_out)
+    .floo_wide_o   (router_floo_wide_out),
+    // Wide Reduction offload port
+    .offload_wide_req_o   (    ),
+    .offload_wide_rsp_i   ( '0 ),
+    // Narrow Reduction offload port
+    .offload_narrow_req_o (   ),
+    .offload_narrow_rsp_i ('0 )
   );
 
   assign floo_req_west_o            = router_floo_req_out[West];
@@ -87,6 +97,12 @@ module fhg_spu_tile
   assign router_floo_rsp_in[South]  = '0;  // No South port in this tile
   assign router_floo_rsp_in[East]   = '0;  // No East port in this tile
   assign router_floo_rsp_in[North]  = floo_rsp_north_i;
+  // assign floo_wide_west_o.valid     = router_floo_wide_out[West].valid;
+  // assign floo_wide_west_o.ready     = router_floo_wide_out[West].ready;
+  // assign floo_wide_west_o.wide      = router_floo_wide_out[West].wide[0];
+  // assign floo_wide_north_o.valid    = router_floo_wide_out[North].valid;
+  // assign floo_wide_north_o.ready    = router_floo_wide_out[North].ready;
+  // assign floo_wide_north_o.wide     = router_floo_wide_out[North].wide[0];
   assign floo_wide_west_o           = router_floo_wide_out[West];
   assign floo_wide_north_o          = router_floo_wide_out[North];
   assign router_floo_wide_in[West]  = floo_wide_west_i;
diff --git a/hw/mem_tile.sv b/hw/mem_tile.sv
index faf4249a..f615891f 100644
--- a/hw/mem_tile.sv
+++ b/hw/mem_tile.sv
@@ -7,6 +7,7 @@
 `include "common_cells/registers.svh"
 `include "axi/typedef.svh"
 `include "obi/typedef.svh"
+`include "common_cells/assertions.svh"
 
 module mem_tile
   import floo_pkg::*;
@@ -45,12 +46,12 @@ module mem_tile
 
   floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in;
   floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in;
-  floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in;
+  floo_wide_t [Eject:North] router_floo_wide_in;
+  floo_wide_t [Eject:North] router_floo_wide_out;
 
   floo_nw_router #(
     .AxiCfgN     (AxiCfgN),
     .AxiCfgW     (AxiCfgW),
-    .EnMultiCast (RouteCfgNoMcast.EnMultiCast),
     .RouteAlgo   (RouteCfgNoMcast.RouteAlgo),
     .NumRoutes   (5),
     .InFifoDepth (2),
@@ -59,7 +60,10 @@ module mem_tile
     .hdr_t       (hdr_t),
     .floo_req_t  (floo_req_t),
     .floo_rsp_t  (floo_rsp_t),
-    .floo_wide_t (floo_wide_t)
+    .floo_wide_t (floo_wide_t),
+    // .floo_wide_out_t (floo_wide_double_t),
+    .WideRwDecouple  (WideRwDecouple),
+    .VcImpl    (VcImplementation)
   ) i_router (
     .clk_i,
     .rst_ni,
@@ -71,15 +75,27 @@ module mem_tile
     .floo_req_o    (router_floo_req_out),
     .floo_rsp_i    (router_floo_rsp_in),
     .floo_wide_i   (router_floo_wide_in),
-    .floo_wide_o   (router_floo_wide_out)
+    .floo_wide_o   (router_floo_wide_out),
+    // Wide Reduction offload port
+    .offload_wide_req_o   (    ),
+    .offload_wide_rsp_i   ( '0 ),
+    // Narrow Reduction offload port
+    .offload_narrow_req_o (   ),
+    .offload_narrow_rsp_i ('0 )
   );
 
   assign floo_req_o                      = router_floo_req_out[West:North];
   assign router_floo_req_in[West:North]  = floo_req_i;
   assign floo_rsp_o                      = router_floo_rsp_out[West:North];
   assign router_floo_rsp_in[West:North]  = floo_rsp_i;
-  assign floo_wide_o                     = router_floo_wide_out[West:North];
+  // Only the local port uses both physical channels. Other outputs use only the lower.
+  // for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o
+  //   assign floo_wide_o[i].valid = router_floo_wide_out[i].valid;
+  //   assign floo_wide_o[i].ready = router_floo_wide_out[i].ready;
+  //   assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0];
+  // end
   assign router_floo_wide_in[West:North] = floo_wide_i;
+  assign floo_wide_o[West:North] =router_floo_wide_out[West:North];
 
   /////////////
   // Chimney //
@@ -97,6 +113,8 @@ module mem_tile
     .ChimneyCfgW         (set_ports(ChimneyDefaultCfg, 1'b1, 1'b0)),
     .RouteCfg            (RouteCfgNoMcast),
     .AtopSupport         (1'b1),
+    .WideRwDecouple      (WideRwDecouple),
+    .VcImpl              (VcImplementation),
     .MaxAtomicTxns       (1),
     .Sam                 (Sam),
     .id_t                (id_t),
@@ -113,7 +131,10 @@ module mem_tile
     .axi_wide_out_rsp_t  (axi_wide_out_rsp_t),
     .floo_req_t          (floo_req_t),
     .floo_rsp_t          (floo_rsp_t),
-    .floo_wide_t         (floo_wide_t)
+    .floo_wide_t         (floo_wide_t),
+    // .floo_wide_in_t     (floo_wide_double_t),
+    .user_narrow_struct_t             (collective_narrow_user_t),
+    .user_wide_struct_t               (collective_wide_user_t)
   ) i_chimney (
     .clk_i               (tile_clk),
     .rst_ni              (tile_rst_n),
@@ -473,5 +494,22 @@ module mem_tile
     .clk_o    (tile_rst_n)
   );
 `endif
+  // Add Assertion that no multicast / reduction can enter this tile!
+  for (genvar r = 0; r < 4; r++) begin : gen_route_assertions
+    `ASSERT(NoCollectivOperation_NReq_In, (!floo_req_i[r].valid | (floo_req_i[r].req[0].generic.hdr.collective_op == Unicast)),
+            clk_i, !rst_ni,
+            $sformatf("Unsupported collective attempted with destination: %h", floo_req_i[r].req[0].narrow_aw.payload.addr))
+    `ASSERT(NoCollectivOperation_NRsp_In, (!floo_rsp_i[r].valid | (floo_rsp_i[r].rsp[0].generic.hdr.collective_op == Unicast)))
+    `ASSERT(NoCollectivOperation_NWide_In, (!floo_wide_i[r].valid | (floo_wide_i[r].wide[0].generic.hdr.collective_op == Unicast)),
+            clk_i, !rst_ni,
+            $sformatf("Unsupported collective attempted with destination: %h", floo_wide_i[r].wide[0].wide_aw.payload.addr))
+    `ASSERT(NoCollectivOperation_NReq_Out, (!floo_req_o[r].valid | (floo_req_o[r].req[0].generic.hdr.collective_op == Unicast)),
+            clk_i, !rst_ni,
+            $sformatf("Unsupported collective attempted with destination: %h", floo_req_o[r].req[0].narrow_aw.payload.addr))
+    `ASSERT(NoCollectivOperation_NRsp_Out, (!floo_rsp_o[r].valid | (floo_rsp_o[r].rsp[0].generic.hdr.collective_op == Unicast)))
+    `ASSERT(NoCollectivOperation_NWide_Out, (!floo_wide_o[r].valid | (floo_wide_o[r].wide[0].generic.hdr.collective_op == Unicast)),
+            clk_i, !rst_ni,
+            $sformatf("Unsupported collective attempted with destination: %h", floo_wide_o[r].wide[0].wide_aw.payload.addr))
+  end
 
 endmodule
diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv
index dcbfa060..909f2ee2 100644
--- a/hw/picobello_pkg.sv
+++ b/hw/picobello_pkg.sv
@@ -270,7 +270,7 @@ package picobello_pkg;
   typedef struct packed {
     logic [5:0] offset;
     logic [2:0] len;
-    logic [2:0] grp_base_id;
+    logic [2:0] base_id;
   } mask_sel_t;
 
   typedef struct packed {
@@ -325,16 +325,16 @@ package picobello_pkg;
         sam_multicast[rule].idx.mask_x = '{
             offset: offset_id_x,
             len: len_id_x,
-            grp_base_id: empty_cols
+            base_id: empty_cols
         };
         sam_multicast[rule].idx.mask_y = '{
             offset: offset_id_y,
             len: len_id_y,
-            grp_base_id: empty_rows
+            base_id: empty_rows
         };
       end else begin
-        sam_multicast[rule].idx.mask_x = '{offset: '0, len: '0, grp_base_id: 0};
-        sam_multicast[rule].idx.mask_y = '{offset: '0, len: '0, grp_base_id: 0};
+        sam_multicast[rule].idx.mask_x = '{offset: '0, len: '0, base_id: 0};
+        sam_multicast[rule].idx.mask_y = '{offset: '0, len: '0, base_id: 0};
       end
 
     end
@@ -346,7 +346,7 @@ package picobello_pkg;
   function automatic floo_pkg::route_cfg_t gen_nomcast_route_cfg();
     floo_pkg::route_cfg_t ret = floo_picobello_noc_pkg::RouteCfg;
     // Disable multicast for non-cluster tiles
-    ret.EnMultiCast = 1'b0;
+    ret.CollectiveCfg = CollectiveDefaultCfg;
     return ret;
   endfunction
 
@@ -362,9 +362,9 @@ package picobello_pkg;
       $write("  { idx: { id: {x: %0d, y: %0d, port: %0d},", SamMcast[i].idx.id.x,
              SamMcast[i].idx.id.y, SamMcast[i].idx.id.port_id);
       $write("  mask_x: {offset: %0d, len: %0d, base_id: %0d},", SamMcast[i].idx.mask_x.offset,
-             SamMcast[i].idx.mask_x.len, SamMcast[i].idx.mask_x.grp_base_id);
+             SamMcast[i].idx.mask_x.len, SamMcast[i].idx.mask_x.base_id);
       $write("  mask_y: {offset: %0d, len: %0d, base_id: %0d} },", SamMcast[i].idx.mask_y.offset,
-             SamMcast[i].idx.mask_y.len, SamMcast[i].idx.mask_y.grp_base_id);
+             SamMcast[i].idx.mask_y.len, SamMcast[i].idx.mask_y.base_id);
       $write("start: 0x%0h, end: 0x%0h }\n", SamMcast[i].start_addr, SamMcast[i].end_addr);
     end
     $display("]");
@@ -391,6 +391,45 @@ package picobello_pkg;
     $display("----------------------------------------------------------");
   endfunction
 
+  /////////////////////
+  //   REDUCTION     //
+  /////////////////////
+
+  // TODO (lleone): Add generation of the following types and structs into Floogen
+  typedef struct packed {
+    user_mask_t                                   collective_mask;
+    floo_pkg::collect_op_e                        collective_op;
+    logic [snitch_cluster_pkg::AtomicIdWidth-1:0] atomic;
+  } collective_narrow_user_t;
+
+  typedef struct packed {
+    user_mask_t             collective_mask;
+    floo_pkg::collect_op_e  collective_op;
+  } collective_wide_user_t;
+
+  //  // TODO (colluca): Merge with floo_wide
+  // typedef struct packed {
+  //   logic [1:0] valid;
+  //   logic [1:0] ready;
+  //   floo_wide_chan_t [1:0] wide;
+  // } floo_wide_double_t;
+
+  localparam int unsigned NumWideVirtChannels = 2;
+  localparam int unsigned NumWidePhysChannels = 1;
+  localparam floo_pkg::vc_impl_e VcImplementation = floo_pkg::VcPreemptValid;
+  localparam floo_pkg::wide_rw_decouple_e WideRwDecouple = floo_pkg::Phys;
+
+  // Configurations for the Reductions
+  localparam reduction_cfg_t WideReductionCfg = '{
+    RdPipelineDepth: 5,
+    CutOffloadIntf: 1'b1
+  };
+
+  localparam reduction_cfg_t NarrowReductionCfg = '{
+    RdPipelineDepth: 1,
+    CutOffloadIntf: 1'b0
+  };
+
   ////////////////
   //  Cheshire  //
   ////////////////
diff --git a/hw/picobello_top.sv b/hw/picobello_top.sv
index 201020c7..170372c8 100644
--- a/hw/picobello_top.sv
+++ b/hw/picobello_top.sv
@@ -97,27 +97,29 @@ module picobello_top
     localparam int Y = int'(ClusterPhysicalId.y);
     localparam int unsigned HartBaseId = c * NrCores + 1;  // Cheshire is hart 0
     localparam axi_wide_in_addr_t ClusterBaseAddr = Sam[ClusterSamIdx].start_addr;
+    localparam axi_wide_in_addr_t ClusterBaseOffset = Sam[ClusterSamIdx].end_addr - ClusterBaseAddr;
 
     cluster_tile i_cluster_tile (
       .clk_i,
       .rst_ni,
-      .test_enable_i      (test_mode_i),
-      .tile_clk_en_i      (cluster_clk_en[c]),
-      .tile_rst_ni        (cluster_rst_n[c]),
-      .clk_rst_bypass_i   (clk_rst_bypass_i),
-      .debug_req_i        (debug_req[c]),
-      .meip_i             (meip[c]),
-      .mtip_i             (mtip[c]),
-      .msip_i             (msip[c]),
-      .hart_base_id_i     (HartBaseId[9:0]),
-      .cluster_base_addr_i(ClusterBaseAddr),
-      .id_i               (ClusterId),
-      .floo_req_o         (floo_req_out[X][Y]),
-      .floo_rsp_i         (floo_rsp_in[X][Y]),
-      .floo_wide_o        (floo_wide_out[X][Y]),
-      .floo_req_i         (floo_req_in[X][Y]),
-      .floo_rsp_o         (floo_rsp_out[X][Y]),
-      .floo_wide_i        (floo_wide_in[X][Y])
+      .test_enable_i        (test_mode_i),
+      .tile_clk_en_i        (cluster_clk_en[c]),
+      .tile_rst_ni          (cluster_rst_n[c]),
+      .clk_rst_bypass_i     (clk_rst_bypass_i),
+      .debug_req_i          (debug_req[c]),
+      .meip_i               (meip[c]),
+      .mtip_i               (mtip[c]),
+      .msip_i               (msip[c]),
+      .hart_base_id_i       (HartBaseId[9:0]),
+      .cluster_base_addr_i  (ClusterBaseAddr),
+      .cluster_base_offset_i(ClusterBaseOffset),
+      .id_i                 (ClusterId),
+      .floo_req_o           (floo_req_out[X][Y]),
+      .floo_rsp_i           (floo_rsp_in[X][Y]),
+      .floo_wide_o          (floo_wide_out[X][Y]),
+      .floo_req_i           (floo_req_in[X][Y]),
+      .floo_rsp_o           (floo_rsp_out[X][Y]),
+      .floo_wide_i          (floo_wide_in[X][Y])
     );
   end
 
diff --git a/hw/spm_tile.sv b/hw/spm_tile.sv
index e4b3b5ca..a0980e8a 100644
--- a/hw/spm_tile.sv
+++ b/hw/spm_tile.sv
@@ -81,7 +81,9 @@ module spm_tile
 
   floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in;
   floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in;
-  floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in;
+  floo_wide_t [Eject:North] router_floo_wide_in;
+  floo_wide_t [Eject:North] router_floo_wide_out;
+
 
   floo_nw_router #(
     .AxiCfgN     (AxiCfgN),
@@ -94,7 +96,10 @@ module spm_tile
     .hdr_t       (hdr_t),
     .floo_req_t  (floo_req_t),
     .floo_rsp_t  (floo_rsp_t),
-    .floo_wide_t (floo_wide_t)
+    .floo_wide_t (floo_wide_t),
+    // .floo_wide_out_t (floo_wide_double_t),
+    .WideRwDecouple (WideRwDecouple),
+    .VcImpl    (VcImplementation)
   ) i_router (
     .clk_i,
     .rst_ni,
@@ -106,15 +111,27 @@ module spm_tile
     .floo_req_o    (router_floo_req_out),
     .floo_rsp_i    (router_floo_rsp_in),
     .floo_wide_i   (router_floo_wide_in),
-    .floo_wide_o   (router_floo_wide_out)
+    .floo_wide_o   (router_floo_wide_out),
+     // Wide Reduction offload port
+    .offload_wide_req_o   (    ),
+    .offload_wide_rsp_i   ( '0 ),
+    // Narrow Reduction offload port
+    .offload_narrow_req_o (   ),
+    .offload_narrow_rsp_i ('0 )
   );
 
   assign floo_req_o                      = router_floo_req_out[West:North];
   assign router_floo_req_in[West:North]  = floo_req_i;
   assign floo_rsp_o                      = router_floo_rsp_out[West:North];
   assign router_floo_rsp_in[West:North]  = floo_rsp_i;
-  assign floo_wide_o                     = router_floo_wide_out[West:North];
+  // Only the local port uses both physical channels. Other outputs use only the lower.
+  // for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o
+  //   assign floo_wide_o[i].valid = router_floo_wide_out[i].valid;
+  //   assign floo_wide_o[i].ready = router_floo_wide_out[i].ready;
+  //   assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0];
+  // end
   assign router_floo_wide_in[West:North] = floo_wide_i;
+  assign floo_wide_o[West:North] =router_floo_wide_out[West:North];
 
   /////////////
   // Chimney //
@@ -135,6 +152,8 @@ module spm_tile
     .ChimneyCfgW         (set_ports(ChimneyDefaultCfg, bit'(!IsNarrow), 1'b0)),
     .RouteCfg            (RouteCfgNoMcast),
     .AtopSupport         (1'b1),
+    .WideRwDecouple      (WideRwDecouple),
+    .VcImpl              (VcImplementation),
     .MaxAtomicTxns       (1),
     .Sam                 (Sam),
     .id_t                (id_t),
@@ -151,7 +170,10 @@ module spm_tile
     .axi_wide_out_rsp_t  (axi_wide_out_rsp_t),
     .floo_req_t          (floo_req_t),
     .floo_rsp_t          (floo_rsp_t),
-    .floo_wide_t         (floo_wide_t)
+    .floo_wide_t         (floo_wide_t),
+    // .floo_wide_in_t (floo_wide_double_t),
+    .user_narrow_struct_t             (collective_narrow_user_t),
+    .user_wide_struct_t               (collective_wide_user_t)
   ) i_chimney (
     .clk_i,
     .rst_ni,
diff --git a/iis-env.sh b/iis-env.sh
index 87dffd81..2a65cc08 100644
--- a/iis-env.sh
+++ b/iis-env.sh
@@ -9,7 +9,7 @@ export VLIB="questa-2023.4 vlib"
 export BASE_PYTHON=/usr/local/anaconda3/bin/python3.11
 export CHS_SW_GCC_BINROOT=/usr/pack/riscv-1.0-kgf/riscv64-gcc-12.2.0/bin
 export VERIBLE_FMT="oseda -2025.03 verible-verilog-format"
-export SN_LLVM_BINROOT=/usr/scratch2/vulcano/colluca/tools/riscv32-snitch-llvm-almalinux8-15.0.0-snitch-0.2.0/bin
+export SN_LLVM_BINROOT=/usr/scratch2/vulcano/colluca/tools/riscv32-snitch-llvm-almalinux8-15.0.0-snitch-0.5.0/bin
 
 # Create the python venv
 if [ ! -d ".venv" ]; then
@@ -20,3 +20,5 @@ fi
 if [ -z "$VIRTUAL_ENV" ] || [ "$VIRTUAL_ENV" != "$(realpath .venv)" ]; then
   source .venv/bin/activate
 fi
+
+export PYTHONPATH="$PWD/experiments/${PYTHONPATH:+:$PYTHONPATH}"  # no trailing ':' when unset
diff --git a/requirements.txt b/requirements.txt
index 78565385..1b65da0d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -25,5 +25,5 @@ hjson  # for reggen
 
 # For peakrdl
 peakrdl
-peakrdl-rawheader @ git+https://github.com/colluca/PeakRDL-rawheader.git@7b8dbc9ad5854dc1cdaf36d4ea024c29ffb00a4c
+peakrdl-rawheader==0.1.3
 peakrdl-markdown
diff --git a/sw/snitch/apps/summa_gemm/app.mk b/sw/snitch/apps/summa_gemm/app.mk
new file mode 100644
index 00000000..163ae610
--- /dev/null
+++ b/sw/snitch/apps/summa_gemm/app.mk
@@ -0,0 +1,17 @@
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+APP              := summa_gemm
+$(APP)_BUILD_DIR ?= $(PB_SNITCH_SW_DIR)/apps/$(APP)/build
+SRC_DIR          := $(PB_SNITCH_SW_DIR)/apps/$(APP)/src
+SRCS             := $(SRC_DIR)/summa_gemm.c
+$(APP)_INCDIRS   := $(SN_ROOT)/sw/kernels/blas $(SN_ROOT)/sw/kernels/blas/gemm/src
+
+# Point to the scripts of the GEMM kernel in the Snitch tree (SN_ROOT)
+$(APP)_SCRIPT_DIR :=  $(SN_ROOT)/sw/kernels/blas/gemm/scripts
+# Shared kernel makefiles; presumably consume the variables above (confirm)
+include $(SN_ROOT)/sw/kernels/datagen.mk
+include $(SN_ROOT)/sw/kernels/common.mk
diff --git a/sw/snitch/apps/summa_gemm/data/params.json b/sw/snitch/apps/summa_gemm/data/params.json
new file mode 100644
index 00000000..2b332462
--- /dev/null
+++ b/sw/snitch/apps/summa_gemm/data/params.json
@@ -0,0 +1,25 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    setup_ssr: 1,
+    parallelize_m: 1,
+    parallelize_k: 0,
+    m_tiles: 4, // number of tiles in M dimension
+    n_tiles: 4, // number of tiles in N dimension
+    k_tiles: 1, // number of tiles in K dimension
+    load_a: 1,
+    load_b: 1,
+    load_c: 1,
+    double_buffer: 1,
+    partition_banks: 0,
+    transa: false,
+    transb: false, // must be true for SIMD
+    m: 32,
+    n: 32,
+    k: 8,
+    alpha: 1,
+    beta: 0,
+    gemm_fp: "gemm_fp64_opt"
+}
diff --git a/sw/snitch/apps/summa_gemm/src/summa_gemm.c b/sw/snitch/apps/summa_gemm/src/summa_gemm.c
new file mode 100644
index 00000000..bdecff61
--- /dev/null
+++ b/sw/snitch/apps/summa_gemm/src/summa_gemm.c
@@ -0,0 +1,462 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Tim Fischer <fischeti@iis.ee.ethz.ch>
+//         Luca Bertaccini <lbertaccini@iis.ee.ethz.ch>
+//         Luca Colagrande <colluca@iis.ee.ethz.ch>
+//         Viviane Potocnik <vivianep@iis.ee.ethz.ch>
+//         Lorenzo Leone <lleone@iis.ee.ethz.ch>
+
+#include "snrt.h"
+
+#include "blas.h"
+
+typedef enum {
+    SW_NAIVE,
+    SW_TREE,
+    HW
+} mode_t;
+
+#ifndef MODE
+#define MODE SW_NAIVE
+#endif
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreorder-init-list"
+#include "data.h"
+#pragma clang diagnostic pop
+
+static inline void snrt_dma_load_2d_tile_mcast_sw(
+    void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
+    size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
+    uint32_t prec, snrt_comm_t comm) {
+    // SW tree multicast: row 0 loads the tile; senders forward it up the column.
+    // Prepare inter-cluster barrier operands in advance. The empty volatile asm
+    // block keeps the compiler from reordering the computation of `user`.
+    snrt_collective_t op;
+    op.f.opcode = SNRT_REDUCTION_BARRIER;
+    op.f.mask = snrt_get_collective_mask(comm);
+    volatile uint32_t *barrier_ptr = comm->barrier_ptr;
+    uint32_t user = (uint32_t)op.w;
+    asm volatile ("" : "+r"(user) ::);
+
+    // Iteration 0: cluster in row 0 fetches data from memory tile
+    if (pb_cluster_row_idx() == 0) {
+        snrt_dma_load_2d_tile(
+            dst, src, tile_x1_idx, tile_x0_idx,
+            tile_x1_size, tile_x0_size, full_x0_size,
+            prec);
+        snrt_dma_wait_all();
+    }
+
+    // Iterations to cover all transfers in each column. The tile is forwarded
+    uint32_t n_levels_in_col = PB_LOG2_CLUSTER_PER_COL;
+    uint32_t size = tile_x1_size * tile_x0_size * prec;
+    for (uint32_t i = 0; i < n_levels_in_col; i++) {
+
+        // Determine which clusters are senders at every level of the tree
+        char row_idx_inv = pb_cluster_num_in_col() - pb_cluster_row_idx();
+        char num_active_rows = 1 << i;
+        char sender_stride = pb_cluster_num_in_col() / num_active_rows;
+        char is_sender = (row_idx_inv % sender_stride) == 0;
+
+        // Every active cluster sends the data to a cluster above it
+        if (is_sender) {
+
+            // Calculate destination cluster and the address of `dst` in its L1
+            char receiver_offset = sender_stride / 2;
+            uintptr_t dst_cluster = pb_calculate_cluster_idx(
+                pb_cluster_row_idx() + receiver_offset, pb_cluster_col_idx());
+            void *remote_dst = snrt_remote_l1_ptr((void *)dst,
+                snrt_cluster_idx(), dst_cluster);
+            // NOTE(review): mcycle results unused; presumably trace markers
+            snrt_mcycle();
+
+            // Start DMA
+            snrt_dma_start_1d(remote_dst, dst, size);    
+
+            // Wait for DMA to complete
+            snrt_dma_wait_all();
+            snrt_mcycle();
+        }
+        
+        // Perform inter-cluster barrier. Disable reduction before the fence
+        // so it overlaps with the latency of the ongoing reduction operation.
+        snrt_set_awuser_low(user);
+        *barrier_ptr = 1;
+        snrt_set_awuser_low(0);
+        snrt_fence();
+    }
+}
+
+/**
+ * @brief Performs a General Matrix Multiplication (GEMM) operation on a
+ *        Snitch-based multiple-cluster architecture with a 2D mesh topology
+ *        using the SUMMA dataflow.
+ *
+ * @param args Pointer to a `gemm_args_t` structure containing arguments
+ *             for the GEMM operation.
+ * @return 0 unconditionally.
+ * @details
+ * The function performs the following steps:
+ * 1. Copies the input arguments to local memory for faster access.
+ * 2. Calculates tile sizes based on the input dimensions and number of tiles.
+ * 3. Allocates space in TCDM for local copies of matrix tiles, unless
+ *    matrix tiles are already stored in TCDM (see `load_*` arguments).
+ * 4. Distributes tiles to clusters for parallel processing.
+ * 5. Iterates over the tiles, performing the following:
+ *    - Copies data for the current tile into local memory.
+ *    - Performs the tile computation using the `sc_st_gemm` function.
+ *    - Writes the result back to global memory.
+ */
+static inline int gemm_picobello(const gemm_args_t *args) {
+#ifndef JOB_ARGS_PRELOADED
+    // Copy the arguments to local memory
+    gemm_args_t *largs = (gemm_args_t *)snrt_l1_alloc_cluster_local(
+        sizeof(gemm_args_t), alignof(gemm_args_t));
+    if (snrt_is_dm_core()) {
+        snrt_dma_start_1d((void *)largs, (void *)args, sizeof(gemm_args_t));
+        snrt_dma_wait_all();
+    }
+    snrt_cluster_hw_barrier();
+#else
+    const gemm_args_t *largs = args;
+#endif
+
+    // Create a communicator for each column to share the B tiles
+    snrt_comm_t col_comm[pb_cluster_num_in_row()];
+	for (uint32_t c = 0; c < pb_cluster_num_in_row(); c++)
+		pb_create_mesh_comm(&col_comm[c], pb_cluster_num_in_col(), 1,
+			0, c);
+
+    // Calculate tile sizes
+    uint32_t tile_m = largs->m / largs->m_tiles;
+    uint32_t tile_n = largs->n / largs->n_tiles;
+    uint32_t tile_k = largs->k / largs->k_tiles;
+    uint32_t tile_a_size = tile_m * tile_k * largs->prec;
+    uint32_t tile_b_size = tile_k * tile_n * largs->prec;
+    uint32_t tile_c_size = tile_m * tile_n * largs->prec;
+
+    // Allocate space for local tile buffers in TCDM, unless preloaded
+    void *a0, *a1, *b0, *b1, *c0, *c1;
+    void *la[2], *lb[2], *lc[2], *lcr;
+    int banks_per_buffer = snrt_cluster_compute_core_num();
+    allocate_buffers(tile_a_size, tile_b_size, tile_c_size, largs,
+                     banks_per_buffer, la, lb, lc, &lcr);
+    if (snrt_cluster_core_idx() == 0) {
+        DUMP(la[0]);
+        DUMP(la[1]);
+        DUMP(lb[0]);
+        DUMP(lb[1]);
+        DUMP(lc[0]);
+        DUMP(lc[1]);
+    }
+    snrt_cluster_hw_barrier();
+
+    // NoC layout (6 columns x 4 rows)
+    /*
+    //
+     |------|   |------|   |------|   |------|   |------|   |------|
+     |  M3  |---|  C3  |---|  C7  |---| C11  |---| C15  |---|  M7  |
+     |------|   |------|   |------|   |------|   |------|   |------|
+        |           |          |          |          |          |
+        |           |          |          |          |          |
+     |------|   |------|   |------|   |------|   |------|   |------|
+     |  M2  |---|  C2  |---|  C6  |---| C10  |---| C14  |---|  M6  |
+     |------|   |------|   |------|   |------|   |------|   |------|
+        |           |          |          |          |          |
+        |           |          |          |          |          |
+     |------|   |------|   |------|   |------|   |------|   |------|
+     |  M1  |---|  C1  |---|  C5  |---|  C9  |---| C13  |---|  M5  |
+     |------|   |------|   |------|   |------|   |------|   |------|
+        |           |          |          |          |          |
+        |           |          |          |          |          |
+     |------|   |------|   |------|   |------|   |------|   |------|
+     |  M0  |---|  C0  |---|  C4  |---|  C8  |---| C12  |---|  M4  |
+     |------|   |------|   |------|   |------|   |------|   |------|
+    //
+    */
+
+    // SUMMA dataflow:
+    // - Clusters in the same row calculate different tiles of the same
+    //   row block of C, reusing the same row block of A but different column
+    //   blocks of B.
+    // - Clusters in the same column calculate different tiles of the same
+    //   column block of C, reusing the same column block of B but different
+    //   row blocks of A.
+    //
+    // Notes:
+    // - K tiling not supported (cluster_k_tiles is fixed to 1 below).
+
+    // Distribute m tiles to cluster rows and n tiles to cluster columns
+    uint32_t cluster_m_tiles = largs->m_tiles / pb_cluster_num_in_col();
+    uint32_t cluster_n_tiles = largs->n_tiles / pb_cluster_num_in_row();
+    uint32_t cluster_k_tiles = 1;
+
+    // Calculate number of iterations. Extra iterations drain the software
+    uint32_t num_tiles = cluster_m_tiles * cluster_n_tiles * cluster_k_tiles;
+    uint32_t num_iters = num_tiles;
+    if (largs->double_buffer)
+        num_iters += 2;
+    else
+        num_iters += 1;
+
+    // Iterate over all tiles
+    for (uint32_t i = 0; i < num_iters; i++) {
+        // Calculate tile indices (we iterate in n->m order)
+        int dma_in_i = i;
+        int comp_i = largs->double_buffer ? i - 1 : i;
+        int dma_out_i = largs->double_buffer ? i - 2 : i - 1;
+        int dma_in_k = dma_in_i % cluster_k_tiles;
+        int dma_in_mn = dma_in_i / cluster_k_tiles;
+        int dma_in_n = dma_in_mn % cluster_n_tiles;
+        int dma_in_m = dma_in_mn / cluster_n_tiles;
+        int comp_k = comp_i % cluster_k_tiles;
+        int comp_mn = comp_i / cluster_k_tiles;
+        int comp_n = comp_mn % cluster_n_tiles;
+        int comp_m = comp_mn / cluster_n_tiles;
+        int dma_out_k = dma_out_i % cluster_k_tiles;
+        int dma_out_mn = dma_out_i / cluster_k_tiles;
+        int dma_out_n = dma_out_mn % cluster_n_tiles;
+        int dma_out_m = dma_out_mn / cluster_n_tiles;
+
+        // Calculate the absolute m, n and k indices for each cluster
+        int dma_in_m_abs = dma_in_m + pb_cluster_row_idx() * cluster_m_tiles;
+        int comp_m_abs = comp_m + pb_cluster_row_idx() * cluster_m_tiles;
+        int dma_out_m_abs = dma_out_m + pb_cluster_row_idx() * cluster_m_tiles;
+        int dma_in_n_abs = dma_in_n + pb_cluster_col_idx() * cluster_n_tiles;
+        int comp_n_abs = comp_n + pb_cluster_col_idx() * cluster_n_tiles;
+        int dma_out_n_abs = dma_out_n + pb_cluster_col_idx() * cluster_n_tiles;
+        int dma_in_k_abs = dma_in_k;
+        int comp_k_abs = comp_k;
+        int dma_out_k_abs = dma_out_k;
+
+        // DMA out phase (skipped until a computed tile is available)
+        if (snrt_is_dm_core()) {
+            if (dma_out_i >= 0) {
+                snrt_mcycle();
+                // Switch buffers
+                int buff_idx = largs->double_buffer ? dma_out_mn % 2 : 0;
+
+                // Store C
+                // If parallelize_k, then only cluster 0 must writeback
+                if ((snrt_cluster_idx() == 0) || !(largs->parallelize_k)) {
+                    if (largs->partition_banks) {
+                        snrt_dma_2d_to_1d(
+                            (void *)((uintptr_t)largs->c +
+                                     dma_out_m_abs * tile_c_size),
+                            lc[buff_idx], tile_c_size,
+                            banks_per_buffer * SNRT_TCDM_BANK_WIDTH,
+                            SNRT_TCDM_HYPERBANK_WIDTH);
+                    } else {
+                        snrt_dma_store_2d_tile(largs->c, lc[buff_idx],
+                                               dma_out_m_abs, dma_out_n_abs, tile_m,
+                                               tile_n, largs->ldc, largs->prec);
+                    }
+                    snrt_dma_wait_all();
+                }
+                snrt_mcycle();
+            }
+        }
+
+        // DMA in phase (skipped once all tiles have been fetched)
+        if (snrt_is_dm_core()) {
+            if (dma_in_i < num_tiles) {
+                snrt_mcycle();
+                // Switch buffers
+                // When tiling on N, a row block of A is reused with multiple
+                // column blocks of B. There is no need to reload A on every
+                // iteration. The A buffer must be switched only when reloading
+                // A. On the other hand, the B buffer is switched every
+                // iteration, and the C buffer only needs to be switched after
+                // fully accumulating the result, i.e. after finishing the K loop.
+                int a_buff_idx = largs->double_buffer ? dma_in_m % 2 : 0;
+                int b_buff_idx = largs->double_buffer ? dma_in_i % 2 : 0;
+                int c_buff_idx = largs->double_buffer ? dma_in_mn % 2 : 0;
+
+                // Load A (only on the first N tile of each row block)
+                if (largs->load_a && (dma_in_n == 0)) {
+                    if (largs->partition_banks) {
+                        snrt_dma_1d_to_2d(
+                            la[a_buff_idx],
+                            (void *)((uintptr_t)largs->a +
+                                     dma_in_m_abs * tile_a_size),
+                            tile_a_size,
+                            banks_per_buffer * SNRT_TCDM_BANK_WIDTH,
+                            SNRT_TCDM_HYPERBANK_WIDTH);
+                    } else {
+                        snrt_dma_load_2d_tile(
+                            la[a_buff_idx], largs->a, dma_in_m_abs, dma_in_k_abs,
+                            tile_m, tile_k, largs->lda, largs->prec);
+                    }
+                }
+
+                // Load B
+                if (largs->load_b) {
+                    if (largs->transb) {
+                        snrt_dma_load_2d_tile(lb[b_buff_idx], largs->b,
+                                              dma_in_n_abs, dma_in_k_abs,
+                                              tile_n, tile_k,
+                                              largs->ldb, largs->prec);
+                    } else {
+                        if (largs->partition_banks) {
+                            snrt_dma_1d_to_2d(
+                                lb[b_buff_idx],
+                                (void *)((uintptr_t)largs->b +
+                                         dma_in_k_abs * tile_b_size),
+                                tile_b_size,
+                                banks_per_buffer * SNRT_TCDM_BANK_WIDTH,
+                                SNRT_TCDM_HYPERBANK_WIDTH);
+                        } else {
+                            // In SW_NAIVE mode, every cluster fetches the B
+                            // tile it requires, independently. In all other
+                            // modes, the clusters in row 0 multicast the B
+                            // tile to all clusters in the same column.
+                            if (MODE == HW) {
+                                if (pb_cluster_row_idx() == 0) {
+                                    snrt_dma_load_2d_tile_mcast(
+                                        lb[b_buff_idx], largs->b, dma_in_k_abs,
+                                        dma_in_n_abs, tile_k, tile_n,
+                                        largs->ldb, largs->prec,
+                                        col_comm[pb_cluster_col_idx()]);
+                                }
+                            } else if (MODE == SW_TREE) {
+                                snrt_dma_load_2d_tile_mcast_sw(
+                                    lb[b_buff_idx], largs->b, dma_in_k_abs,
+                                    dma_in_n_abs, tile_k, tile_n,
+                                    largs->ldb, largs->prec,
+                                    col_comm[pb_cluster_col_idx()]);                            
+                            } else {
+                                snrt_dma_load_2d_tile(
+                                    lb[b_buff_idx], largs->b, dma_in_k_abs,
+                                    dma_in_n_abs, tile_k, tile_n, largs->ldb,
+                                    largs->prec);
+                            }
+                        }
+                    }
+                }
+
+                // Load C
+                // C tile is loaded only upon the first k iteration, then
+                // the C array will contain the partial results from the
+                // previous iteration
+                if (largs->load_c) {
+                    if (dma_in_k_abs == 0) {
+                        if (largs->partition_banks) {
+                            snrt_dma_1d_to_2d(
+                                lc[c_buff_idx],
+                                (void *)((uintptr_t)largs->c +
+                                         dma_in_m_abs * tile_c_size),
+                                tile_c_size,
+                                banks_per_buffer * SNRT_TCDM_BANK_WIDTH,
+                                SNRT_TCDM_HYPERBANK_WIDTH);
+                        } else {
+                            snrt_dma_load_2d_tile(lc[c_buff_idx], largs->c,
+                                                  dma_in_m_abs, dma_in_n_abs,
+                                                  tile_m, tile_n, largs->ldc,
+                                                  largs->prec);
+                        }
+                    } else if (dma_in_k == 0) {
+                        // Clusters other than the first zero-initialize C.
+                        // NOTE(review): dead while dma_in_k_abs == dma_in_k.
+                        if (largs->partition_banks) {
+                            snrt_dma_1d_to_2d(
+                                lc[c_buff_idx], snrt_cluster()->zeromem.mem,
+                                tile_c_size,
+                                banks_per_buffer * SNRT_TCDM_BANK_WIDTH,
+                                SNRT_TCDM_HYPERBANK_WIDTH);
+                        } else {
+                            snrt_dma_start_1d(lc[c_buff_idx],
+                                              snrt_cluster()->zeromem.mem,
+                                              tile_c_size);
+                        }
+                    }
+                }
+                snrt_dma_wait_all();
+                snrt_mcycle();
+            }
+        }
+
+        // Additional barrier required when not double buffering
+        if (!largs->double_buffer) snrt_global_barrier();
+
+        // Compute phase
+        if (comp_i >= 0 && comp_i < num_tiles) {
+            // Switch buffers
+            int a_buff_idx = largs->double_buffer ? comp_m % 2 : 0;
+            int b_buff_idx = largs->double_buffer ? comp_i % 2 : 0;
+            int c_buff_idx = largs->double_buffer ? comp_mn % 2 : 0;
+
+            // Only compute cores participate in the tile computation
+            if (!snrt_is_dm_core()) {
+                // uint32_t start_cycle = snrt_mcycle();
+
+                // In the first k iteration we accumulate with the C matrix
+                // scaled by beta, in successive iterations we accumulate
+                // the previous partial result. The tile-level beta is thus
+                // a function of k: beta(k).
+                uint32_t beta_k = comp_k_abs == 0 ? largs->beta : 1;
+
+                // Tile computation
+                sc_st_gemm_args_t sc_st_args;
+                sc_st_args.prec = largs->prec;
+                sc_st_args.setup_ssr = largs->setup_ssr;
+                sc_st_args.partition_banks = largs->partition_banks;
+                sc_st_args.transa = largs->transa;
+                sc_st_args.transb = largs->transb;
+                sc_st_args.a = la[a_buff_idx];
+                if (largs->transa) {
+                    sc_st_args.lda = tile_m;
+                } else if (largs->partition_banks) {
+                    sc_st_args.lda = calculate_partitioned_banks_stride(
+                        banks_per_buffer, tile_k, largs->prec);
+                } else {
+                    sc_st_args.lda = tile_k;
+                }
+                sc_st_args.b = lb[b_buff_idx];
+                if (largs->transb) {
+                    sc_st_args.ldb = tile_k;
+                } else if (largs->partition_banks) {
+                    sc_st_args.ldb = calculate_partitioned_banks_stride(
+                        banks_per_buffer, tile_n, largs->prec);
+                } else {
+                    sc_st_args.ldb = tile_n;
+                }
+                sc_st_args.beta = beta_k;
+                sc_st_args.c = lc[c_buff_idx];
+                if (largs->partition_banks) {
+                    sc_st_args.ldc = calculate_partitioned_banks_stride(
+                        banks_per_buffer, tile_n, largs->prec);
+                } else {
+                    sc_st_args.ldc = tile_n;
+                }
+                sc_st_args.m = tile_m;
+                sc_st_args.n = tile_n;
+                sc_st_args.k = tile_k;
+                sc_st_gemm(largs->gemm_fp, &sc_st_args);
+
+                // uint32_t end_cycle = snrt_mcycle();
+            }
+
+            // Add the partial result tiles from the various clusters together
+            // in a logarithmic reduction fashion.
+            // Note: both compute and DMA cores participate in this step.
+            if (largs->parallelize_k && (comp_k == (cluster_k_tiles - 1))) {
+                snrt_global_reduction_dma(
+                    (double *)lcr, (double *)lc[c_buff_idx], tile_m * tile_n);
+            }
+        }
+
+        // Synchronize cores after every iteration
+        snrt_global_barrier();
+    }
+
+    return 0;
+}
+
+
+int main() {
+    // gemm_picobello() always returns 0, so this exits with status 0.
+    return gemm_picobello(&args);
+}
diff --git a/sw/snitch/runtime/src/.gitignore b/sw/snitch/runtime/impl/.gitignore
similarity index 100%
rename from sw/snitch/runtime/src/.gitignore
rename to sw/snitch/runtime/impl/.gitignore
diff --git a/sw/snitch/runtime/src/memory.ld b/sw/snitch/runtime/impl/memory.ld
similarity index 100%
rename from sw/snitch/runtime/src/memory.ld
rename to sw/snitch/runtime/impl/memory.ld
diff --git a/sw/snitch/runtime/src/pb_memory.h b/sw/snitch/runtime/impl/pb_memory.h
similarity index 100%
rename from sw/snitch/runtime/src/pb_memory.h
rename to sw/snitch/runtime/impl/pb_memory.h
diff --git a/sw/snitch/runtime/src/pb_noc_cfg.h b/sw/snitch/runtime/impl/pb_noc_cfg.h
similarity index 69%
rename from sw/snitch/runtime/src/pb_noc_cfg.h
rename to sw/snitch/runtime/impl/pb_noc_cfg.h
index bc285ba6..266b2191 100644
--- a/sw/snitch/runtime/src/pb_noc_cfg.h
+++ b/sw/snitch/runtime/impl/pb_noc_cfg.h
@@ -6,3 +6,6 @@
 
 #define PB_CLUSTER_PER_ROW 4
 #define PB_CLUSTER_PER_COL 4
+// TODO(colluca): derive from previous or vice versa
+#define PB_LOG2_CLUSTER_PER_ROW 2
+#define PB_LOG2_CLUSTER_PER_COL 2
diff --git a/sw/snitch/runtime/src/putchar.c b/sw/snitch/runtime/impl/putchar.c
similarity index 100%
rename from sw/snitch/runtime/src/putchar.c
rename to sw/snitch/runtime/impl/putchar.c
diff --git a/sw/snitch/runtime/src/snitch_cluster_addrmap.rdl.tpl b/sw/snitch/runtime/impl/snitch_cluster_addrmap.rdl.tpl
similarity index 100%
rename from sw/snitch/runtime/src/snitch_cluster_addrmap.rdl.tpl
rename to sw/snitch/runtime/impl/snitch_cluster_addrmap.rdl.tpl
diff --git a/sw/snitch/runtime/src/snitch_cluster_cfg.h.tpl b/sw/snitch/runtime/impl/snitch_cluster_cfg.h.tpl
similarity index 100%
rename from sw/snitch/runtime/src/snitch_cluster_cfg.h.tpl
rename to sw/snitch/runtime/impl/snitch_cluster_cfg.h.tpl
diff --git a/sw/snitch/runtime/src/snitch_cluster_memory.c b/sw/snitch/runtime/impl/snitch_cluster_memory.c
similarity index 100%
rename from sw/snitch/runtime/src/snitch_cluster_memory.c
rename to sw/snitch/runtime/impl/snitch_cluster_memory.c
diff --git a/sw/snitch/runtime/src/snitch_cluster_start.c b/sw/snitch/runtime/impl/snitch_cluster_start.c
similarity index 100%
rename from sw/snitch/runtime/src/snitch_cluster_start.c
rename to sw/snitch/runtime/impl/snitch_cluster_start.c
diff --git a/sw/snitch/runtime/src/snitch_cluster_start.h b/sw/snitch/runtime/impl/snitch_cluster_start.h
similarity index 100%
rename from sw/snitch/runtime/src/snitch_cluster_start.h
rename to sw/snitch/runtime/impl/snitch_cluster_start.h
diff --git a/sw/snitch/runtime/src/snrt.S b/sw/snitch/runtime/impl/snrt.S
similarity index 100%
rename from sw/snitch/runtime/src/snrt.S
rename to sw/snitch/runtime/impl/snrt.S
diff --git a/sw/snitch/runtime/impl/snrt.cc b/sw/snitch/runtime/impl/snrt.cc
new file mode 100644
index 00000000..74f12312
--- /dev/null
+++ b/sw/snitch/runtime/impl/snrt.cc
@@ -0,0 +1,26 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "snrt.h"
+
+#include "alloc.c"
+#include "alloc_v2.c"
+#include "cls.c"
+#include "cluster_interrupts.c"
+#include "dm.c"
+#include "dma.c"
+#include "eu.c"
+#include "kmp.c"
+#include "omp.c"
+#include "printf.c"
+#include "putchar.c"
+#include "riscv.c"
+#include "snitch_cluster_memory.c"
+#include "snitch_cluster_start.c"
+#include "ssr.c"
+#include "sync.c"
+#include "team.c"
+
+#include "pb_team.c"
+#include "pb_sync.c"
diff --git a/sw/snitch/runtime/src/snrt.h b/sw/snitch/runtime/impl/snrt.h
similarity index 91%
rename from sw/snitch/runtime/src/snrt.h
rename to sw/snitch/runtime/impl/snrt.h
index 8959c80a..fd8be02e 100644
--- a/sw/snitch/runtime/src/snrt.h
+++ b/sw/snitch/runtime/impl/snrt.h
@@ -7,6 +7,9 @@
 #include <stddef.h>
 #include <stdint.h>
 
+// Additional optional configurations to change the runtime behaviour
+// #define SNRT_ENABLE_NARROW_REDUCTION
+
 // Configuration- and system-specific definitions (HAL)
 #include "pb_addrmap.h"
 #include "snitch_cluster_cfg.h"
@@ -52,7 +55,6 @@ typedef snitch_cluster__stride40000_t snitch_cluster_t;
 #include "sync.h"
 #include "team.h"
 #include "types.h"
-#include "pb_team.h"
 
 // Accelerators
 #include "datamover/archi_datamover.h"
@@ -61,3 +63,7 @@ typedef snitch_cluster__stride40000_t snitch_cluster_t;
 #include "redmule/archi_redmule.h"
 #include "redmule/hal_redmule.h"
 #include "redmule/redmule_utils.h"
+
+// Picobello specific
+#include "pb_team.h"
+#include "pb_sync.h"
diff --git a/sw/snitch/runtime/src/pb_sync.c b/sw/snitch/runtime/src/pb_sync.c
new file mode 100644
index 00000000..bba8376d
--- /dev/null
+++ b/sw/snitch/runtime/src/pb_sync.c
@@ -0,0 +1,8 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Must mirror the parameter list of the inline definition in pb_sync.h.
+extern inline void pb_create_mesh_comm(snrt_comm_t *comm, uint32_t n_rows,
+    uint32_t n_cols, uint32_t start_row, uint32_t start_col,
+    snrt_comm_t parent_comm);
diff --git a/sw/snitch/runtime/src/pb_sync.h b/sw/snitch/runtime/src/pb_sync.h
new file mode 100644
index 00000000..782a9271
--- /dev/null
+++ b/sw/snitch/runtime/src/pb_sync.h
@@ -0,0 +1,44 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+inline void pb_create_mesh_comm(snrt_comm_t *comm, uint32_t n_rows,
+    uint32_t n_cols, uint32_t start_row = 0, uint32_t start_col = 0,
+    snrt_comm_t parent_comm = NULL) {
+    // Build a communicator for an n_rows x n_cols block of the cluster mesh.
+    // If no communicator is given, world communicator is used as default.
+    if (parent_comm == NULL) parent_comm = snrt_comm_world;
+
+    // Allocate communicator struct in L1 and point to it.
+    *comm =
+        (snrt_comm_t)snrt_l1_alloc_cluster_local(sizeof(snrt_comm_info_t));
+
+    // Allocate barrier counter in L1. Only the first cluster's is actually
+    // used, but we want to keep all clusters' L1 allocators aligned. Thus,
+    // only the first cluster initializes its barrier counter. A global barrier
+    // is then used to ensure all cores "see" the initialized value.
+    uint32_t first_cluster = start_col * pb_cluster_num_in_col() + start_row;
+    void *barrier_ptr = snrt_l1_alloc_cluster_local(sizeof(uint32_t));
+    barrier_ptr = snrt_remote_l1_ptr(barrier_ptr, snrt_cluster_idx(),
+        first_cluster);
+    if ((snrt_cluster_idx() == first_cluster) && snrt_is_dm_core()) {
+        *(uint32_t *)barrier_ptr = 0;
+        snrt_fence();
+    }
+    snrt_global_barrier(parent_comm);
+
+    // Initialize communicator, pointing to the newly-allocated barrier
+    // counter in the first cluster's L1.
+    (*comm)->size = n_rows * n_cols;
+    (*comm)->mask = ((n_cols - 1) << PB_LOG2_CLUSTER_PER_COL) |
+        (n_rows - 1);
+    (*comm)->base = (start_col << PB_LOG2_CLUSTER_PER_COL) | start_row;
+    (*comm)->barrier_ptr = (uint32_t *)barrier_ptr;
+    uint32_t in_row_range = (pb_cluster_row_idx() >= start_row) &&
+        (pb_cluster_row_idx() < (start_row + n_rows));
+    uint32_t in_col_range = (pb_cluster_col_idx() >= start_col) &&
+        (pb_cluster_col_idx() < (start_col + n_cols));
+    (*comm)->is_participant = in_row_range && in_col_range;
+}
diff --git a/sw/snitch/runtime/src/pb_team.c b/sw/snitch/runtime/src/pb_team.c
index cd280f61..0a67b710 100644
--- a/sw/snitch/runtime/src/pb_team.c
+++ b/sw/snitch/runtime/src/pb_team.c
@@ -1,4 +1,4 @@
-// Copyright 2023 ETH Zurich and University of Bologna.
+// Copyright 2025 ETH Zurich and University of Bologna.
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
@@ -6,14 +6,40 @@ extern inline uintptr_t pb_l2_tile_address(uint32_t tile_idx);
 
 extern inline uintptr_t pb_l2_tile_offset(uintptr_t src_addr);
 
-extern inline uint32_t pb_cluster_row(uint32_t cidx);
+extern inline constexpr uint32_t pb_log2_cluster_num_in_col();
 
-extern inline uint32_t pb_cluster_row();
+extern inline constexpr uint32_t pb_log2_cluster_num_in_row();
 
-extern inline uint32_t pb_cluster_col(uint32_t cidx);
+extern inline constexpr uint32_t pb_cluster_num_in_row();
 
-extern inline uint32_t pb_cluster_col();
+extern inline constexpr uint32_t pb_cluster_num_in_col();
 
-extern inline uint32_t pb_closest_mem_tile(uint32_t cidx);
+extern inline uint32_t pb_cluster_row_idx(uint32_t cluster_idx);
+
+extern inline uint32_t pb_cluster_row_idx();
+
+extern inline uint32_t pb_cluster_col_idx(uint32_t cluster_idx);
+
+extern inline uint32_t pb_cluster_col_idx();
+
+extern inline uint32_t pb_calculate_cluster_idx(uint32_t row, uint32_t col);
+
+extern inline uint32_t pb_cluster_in_row(uint32_t row);
+
+extern inline uint32_t pb_cluster_in_col(uint32_t col);
+
+extern inline uint32_t pb_cluster_is_easternmost();
+
+extern inline uint32_t pb_cluster_is_northernmost();
+
+extern inline uint32_t pb_cluster_north_neighbour();
+
+extern inline uint32_t pb_cluster_east_neighbour();
+
+extern inline uint32_t pb_cluster_south_neighbour();
+
+extern inline uint32_t pb_cluster_west_neighbour();
+    
+extern inline uint32_t pb_closest_mem_tile(uint32_t cluster_idx);
 
 extern inline uint32_t pb_closest_mem_tile();
diff --git a/sw/snitch/runtime/src/pb_team.h b/sw/snitch/runtime/src/pb_team.h
index 1fa8d8d0..fa41866c 100644
--- a/sw/snitch/runtime/src/pb_team.h
+++ b/sw/snitch/runtime/src/pb_team.h
@@ -3,7 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 // Lorenzo Leone <lleone@iis.ee.ethz.ch>
-
+// Luca Colagrande <colluca@iis.ee.ethz.ch>
 
 /**
  * @file
@@ -30,60 +30,153 @@ inline uintptr_t pb_l2_tile_offset(uintptr_t src_addr) {
         PICOBELLO_ADDRMAP_L2_SPM_0_SIZE;
 }
 
+/**
+ * @brief Get the log2 of the number of clusters in a row
+ */
+ inline constexpr uint32_t pb_log2_cluster_num_in_row() {
+    // TODO(colluca): derive this from pb_cluster_num_in_row() or viceversa
+    return 2;  // hard-coded, i.e. assumes 4 clusters per row
+}
 
 /**
- * @brief Get the NoC row index
- * @param cidx The cluster index
- * @return The Row index
+ * @brief Get the log2 of the number of clusters in a column
  */
-inline uint32_t pb_cluster_row(uint32_t cidx)
-{
-    return cidx % PB_CLUSTER_PER_ROW;
+inline constexpr uint32_t pb_log2_cluster_num_in_col() {
+    // TODO(colluca): derive this from pb_cluster_num_in_col() or viceversa
+    return 2;
+}
+
+/**
+ * @brief Get the number of clusters in a row (i.e. the number of columns)
+ */
+inline constexpr uint32_t pb_cluster_num_in_row() {
+    return PB_CLUSTER_PER_ROW;
+}
+
+/**
+ * @brief Get the number of clusters in a column
+ */
+inline constexpr uint32_t pb_cluster_num_in_col() {
+    return PB_CLUSTER_PER_COL;
 }
 
 /**
- * @brief Get the NoC row index
- * This is a convenience orload of pb_cluster_row()
- * @return The Row index
+ * @brief Get the row index of a cluster
+ * @param cluster_idx The cluster index
+ * @return The row index relative to the first row of cluster tiles
  */
-inline uint32_t pb_cluster_row()
+inline uint32_t pb_cluster_row_idx(uint32_t cluster_idx)
 {
-    return pb_cluster_row(snrt_cluster_idx());
+    return cluster_idx % pb_cluster_num_in_col();
 }
 
+/**
+ * @brief Get the row index of the invoking cluster
+ * @return The row index relative to the first row of cluster tiles
+ */
+inline uint32_t pb_cluster_row_idx()
+{
+    return pb_cluster_row_idx(snrt_cluster_idx());
+}
 
 /**
- * @brief Get the NoC column index
- * @param cidx The cluster index
- * @return The Column index
+ * @brief Get the column index of a cluster
+ * @param cluster_idx The cluster index
+ * @return The column index relative to the first column of cluster tiles
  */
-inline uint32_t pb_cluster_col(uint32_t cidx)
+inline uint32_t pb_cluster_col_idx(uint32_t cluster_idx)
 {
-    return cidx / PB_CLUSTER_PER_COL;
+    return cluster_idx / pb_cluster_num_in_col();
 }
 
 /**
- * @brief Get the NoC column index
- * This is a convenience orload of pb_cluster_row()
- * @return The Column index
+ * @brief Get the column index of the invoking cluster
+ * @return The column index relative to the first column of cluster tiles
  */
-inline uint32_t pb_cluster_col()
+inline uint32_t pb_cluster_col_idx()
 {
-    return pb_cluster_col(snrt_cluster_idx());
+    return pb_cluster_col_idx(snrt_cluster_idx());
+}
+
+/**
+ * @brief Calculate the cluster index from its (row, col) coordinates
+ * @param row Row index relative to the first row of cluster tiles
+ * @param col Column index relative to the first column of cluster tiles
+ */
+inline uint32_t pb_calculate_cluster_idx(uint32_t row, uint32_t col) {
+    return col * pb_cluster_num_in_col() + row;  // column-major numbering
+}
+
+/**
+ * @brief Test if the invoking cluster is in a given row
+ * @param row Row index relative to the first row of cluster tiles
+ */
+inline uint32_t pb_cluster_in_row(uint32_t row) {
+    return pb_cluster_row_idx() == row;  // 1 if in the row, 0 otherwise
+}
+
+/**
+ * @brief Test if the invoking cluster is in a given column
+ * @param col Column index relative to the first column of cluster tiles
+ */
+inline uint32_t pb_cluster_in_col(uint32_t col) {
+    return pb_cluster_col_idx() == col;  // 1 if in the column, 0 otherwise
+}
+
+/**
+ * @brief Test if the invoking cluster is in the easternmost column
+ */
+inline uint32_t pb_cluster_is_easternmost() {
+    return pb_cluster_in_col(pb_cluster_num_in_row() - 1);  // last column
+}
+
+/**
+ * @brief Test if the invoking cluster is in the northernmost row
+ */
+inline uint32_t pb_cluster_is_northernmost() {
+    return pb_cluster_in_row(pb_cluster_num_in_col() - 1);  // last row
+}
+
+/**
+ * @brief Get cluster index of north neighbour (same column, next row)
+ */
+inline uint32_t pb_cluster_north_neighbour() {
+    return snrt_cluster_idx() + 1;  // not range-checked at the northern edge
 }
 
+/**
+ * @brief Get cluster index of east neighbour (same row, next column)
+ */
+inline uint32_t pb_cluster_east_neighbour() {
+    return snrt_cluster_idx() + pb_cluster_num_in_col();  // column stride
+}
+
+/**
+ * @brief Get cluster index of south neighbour (same column, previous row)
+ */
+inline uint32_t pb_cluster_south_neighbour() {
+    return snrt_cluster_idx() - 1;  // not range-checked at the southern edge
+}
+
+/**
+ * @brief Get cluster index of west neighbour (same row, previous column)
+ */
+inline uint32_t pb_cluster_west_neighbour() {
+    return snrt_cluster_idx() - pb_cluster_num_in_col();  // column stride
+}
 
 /**
  * @brief Get the index of the closest memory tile
- * @param cidx The cluster index
- * @return Index of the closest memory tile to cidx
+ * @param cluster_idx The cluster index
+ * @return Index of the closest memory tile to cluster_idx
  */
-inline uint32_t pb_closest_mem_tile(uint32_t cidx) {
-    uint32_t row = pb_cluster_row(cidx);
+inline uint32_t pb_closest_mem_tile(uint32_t cluster_idx) {
+    uint32_t row = pb_cluster_row_idx(cluster_idx);
     // e.g. with 4x4 matrix
     // first 8 clusters -> left column tiles 0..3
     // clusters >= 8 -> right column tiles 4..7
-    return (cidx < (snrt_cluster_num() / 2)) ? row : (row + PB_CLUSTER_PER_COL);
+    return (cluster_idx < (snrt_cluster_num() / 2)) ?
+        row : (row + PB_CLUSTER_PER_COL);
 }
 
 /**
diff --git a/sw/snitch/runtime/src/snrt.cc b/sw/snitch/runtime/src/snrt.cc
deleted file mode 120000
index 686f3cf5..00000000
--- a/sw/snitch/runtime/src/snrt.cc
+++ /dev/null
@@ -1 +0,0 @@
-../../../../.deps/snitch_cluster/sw/runtime/impl/snrt.cc
\ No newline at end of file
diff --git a/sw/snitch/tests/barrier_benchmark.c b/sw/snitch/tests/barrier_benchmark.c
new file mode 100644
index 00000000..c14e57f3
--- /dev/null
+++ b/sw/snitch/tests/barrier_benchmark.c
@@ -0,0 +1,112 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande <colluca@iis.ee.ethz.ch>
+
+#include "snrt.h"
+
+#ifndef N_ROWS
+#define N_ROWS (pb_cluster_num_in_col())
+#endif
+
+#ifndef N_COLS
+#define N_COLS (pb_cluster_num_in_row())
+#endif
+
+typedef enum {
+    SW,
+    HW
+} impl_t;
+
+#ifndef IMPL
+#define IMPL SW
+#endif
+
+static inline void sw_barrier(snrt_comm_t comm) {
+    // Benchmark a software barrier across the clusters in `comm`. Operands are
+    // computed up front; the empty asm forces `user` to be materialized here.
+    snrt_collective_t op;
+    op.f.opcode = SNRT_COLLECTIVE_MULTICAST;
+    op.f.mask = snrt_get_collective_mask(comm);
+    volatile uint32_t *barrier_ptr = comm->barrier_ptr;
+    volatile uint32_t *mcip_set = (uint32_t *)&(snrt_cluster()->peripheral_reg.cl_clint_set.w);
+    volatile uint32_t *mcip_clr = (uint32_t *)&(snrt_cluster()->peripheral_reg.cl_clint_clear.w);
+    uint32_t size = comm->size;
+    uint32_t user = (uint32_t)op.w;
+    asm volatile ("" : "+r"(user) ::);
+
+    // Execute barrier in submesh (first iteration to preheat I$)
+    for (volatile uint32_t i = 0; i < 2; i++) {
+        // Align start times using hardware barrier
+        snrt_inter_cluster_barrier(comm);
+
+        snrt_mcycle();
+
+        // Perform software inter-cluster barrier (arrival count not needed).
+        __atomic_add_fetch(barrier_ptr, 1, __ATOMIC_RELAXED);
+
+        // Cluster 0 polls the barrier counter, resets it for the next barrier
+        // and multicasts an interrupt to wake up the other clusters. Compared
+        // to the implementation in sync.h, this version is cache-friendly,
+        // i.e. successive invocations of the barrier will find a hot cache.
+        if (snrt_cluster_idx() == 0) {
+            while (*barrier_ptr != size);
+            *barrier_ptr = 0;
+            snrt_fence();
+            snrt_set_awuser_low(user);
+            *mcip_set = 1 << snrt_cluster_compute_core_num();
+            snrt_set_awuser_low(0);
+        } else {
+            snrt_wfi();
+        }
+        // Clear interrupt for next barrier (also the sending cluster)
+        *mcip_clr = 1 << snrt_cluster_compute_core_num();
+        snrt_mcycle();
+    }
+}
+
+static inline void hw_barrier(snrt_comm_t comm) {
+    // Prepare for inter-cluster barrier in advance, preventing instruction
+    // reordering using the volatile block.
+    snrt_collective_t op;
+    op.f.opcode = SNRT_REDUCTION_BARRIER;
+    op.f.mask = snrt_get_collective_mask(comm);
+    volatile uint32_t *barrier_ptr = comm->barrier_ptr;
+    uint32_t user = (uint32_t)op.w;  // encoded collective, passed to AWUSER
+    asm volatile ("" : "+r"(user) ::);  // force `user` to be computed here
+
+    // Execute barrier in submesh (first iteration to preheat I$, second to
+    // align start times for third iteration)
+    for (volatile uint32_t i = 0; i < 3; i++) {
+        snrt_mcycle();  // mcycle marker before the barrier
+        // Perform inter-cluster barrier. Disable reduction before the fence
+        // so it overlaps with the latency of the ongoing reduction operation.
+        snrt_set_awuser_low(user);
+        *barrier_ptr = 1;
+        snrt_set_awuser_low(0);
+        snrt_fence();
+        snrt_mcycle();  // mcycle marker after the barrier
+    }
+}
+
+int main (void) {
+
+    // Build the communicator describing the N_ROWS x N_COLS submesh
+    snrt_comm_t comm;
+    pb_create_mesh_comm(&comm, N_ROWS, N_COLS);
+
+    if (snrt_is_dm_core() && comm->is_participant) {
+        // Benchmark the implementation selected at compile time via IMPL
+        if (IMPL != SW) {
+            hw_barrier(comm);
+        } else {
+            sw_barrier(comm);
+        }
+    }
+
+    // Everybody, including non-participating clusters, joins a global barrier
+    snrt_global_barrier();
+
+    return 0;
+}
diff --git a/sw/snitch/tests/dma_multicast.c b/sw/snitch/tests/dma_multicast.c
deleted file mode 100644
index 5583de48..00000000
--- a/sw/snitch/tests/dma_multicast.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright 2023 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-//
-// Luca Colagrande <colluca@iis.ee.ethz.ch>
-//
-// This code tests the multicast feature of the wide interconnect.
-// It uses the DMA to broadcast chunks of data to different clusters
-// in the Picobello system.
-
-#include <stdint.h>
-#include "pb_addrmap.h"
-#include "snrt.h"
-
-#define INITIALIZER 0xAAAAAAAA
-
-#ifndef LENGTH
-#define LENGTH 32
-#endif
-
-#define LENGTH_TO_CHECK 32
-
-#ifndef N_CLUSTERS_TO_USE
-#define N_CLUSTERS_TO_USE snrt_cluster_num()
-#endif
-
-// TODO(lleone): Check if is okay to shift by 18. Probably you need to adapt it to picobello
-#define BCAST_MASK_ACTIVE ((N_CLUSTERS_TO_USE - 1) << 18)
-// #define BCAST_MASK_ALL ((snrt_cluster_num() - 1) << 18)
-
-
-static inline void dma_broadcast_to_clusters(void* dst, void* src, size_t size) {
-    // snrt_enable_multicast(BCAST_MASK_ACTIVE);
-    if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) {
-        snrt_dma_start_1d_mcast(snrt_remote_l1_ptr(dst, 0, 1), src, size, BCAST_MASK_ACTIVE);
-        snrt_dma_wait_all();
-    }
-    // snrt_disable_multicast();
-}
-
-static inline int cluster_participates_in_bcast(int i) {
-    return (i < N_CLUSTERS_TO_USE);
-}
-
-static inline void broadcast_wrapper(void* dst, void* src, size_t size) {
-    snrt_global_barrier();
-    dma_broadcast_to_clusters(dst, src, size);
-    // Put clusters who don't participate in the broadcast to sleep, as if
-    // they proceed directly to the global barrier, they will interfere with
-    // the other clusters, by sending their atomics on the narrow interconnect.
-    if (!cluster_participates_in_bcast(snrt_cluster_idx())) {
-        snrt_wfi();
-        snrt_int_clr_mcip();
-    }
-    // Wake these up when cluster 0 is done
-    else if ((snrt_cluster_idx() == 0) && snrt_is_dm_core()) {
-        for (int i = 0; i < snrt_cluster_num(); i++) {
-            if (!cluster_participates_in_bcast(i))
-                snrt_cluster()->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff;
-        }
-    }
-}
-
-int main() {
-    snrt_interrupt_enable(IRQ_M_CLUSTER);
-    // snrt_int_clr_mcip();
-
-    // Allocate destination buffer
-    uint32_t *buffer_dst = (uint32_t *)snrt_l1_next_v2();
-    uint32_t *buffer_src = buffer_dst + LENGTH;
-
-    // First cluster initializes the source buffer and multicast-
-    // copies it to the destination buffer in every cluster's TCDM.
-    if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) {
-        for (uint32_t i = 0; i < LENGTH; i++) {
-            buffer_src[i] = INITIALIZER;
-        }
-    }
-
-    // Initiate DMA transfer (twice to preheat the cache)
-    for (volatile int i = 0; i < 2; i++) {
-        broadcast_wrapper(buffer_dst, buffer_src, LENGTH * sizeof(uint32_t));
-    }
-
-    // All other clusters wait on a global barrier to signal the transfer
-    // completion.
-    snrt_global_barrier();
-
-    // Every cluster except cluster 0 checks that the data in the destination
-    // buffer is correct. To speed this up we only check the first 32 elements.
-    if (snrt_is_dm_core() && (snrt_cluster_idx() < N_CLUSTERS_TO_USE) && (snrt_cluster_idx() != 0)) {
-        uint32_t n_errs = LENGTH_TO_CHECK;
-        for (uint32_t i = 0; i < LENGTH_TO_CHECK; i++) {
-            if (buffer_dst[i] == INITIALIZER) n_errs--;
-        }
-        return n_errs;
-    } else
-        return 0;
-  }
diff --git a/sw/snitch/tests/multicast_benchmark.c b/sw/snitch/tests/multicast_benchmark.c
new file mode 100644
index 00000000..cc09c50f
--- /dev/null
+++ b/sw/snitch/tests/multicast_benchmark.c
@@ -0,0 +1,329 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande <colluca@iis.ee.ethz.ch>
+// Lorenzo Leone   <lleone@iis.ee.ethz.ch>
+//
+// This code implements a function to multicast data from one memory tile
+// to all clusters in the same row, or multiple rows.
+
+#include <stdint.h>
+#include "snrt.h"
+
+// Transfer size in bytes
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+// Batch size in bytes
+#ifndef BATCH
+#define BATCH 64
+#endif
+
+typedef enum {
+    SEQ,
+    TREE,
+    HW
+} impl_t;
+
+#ifndef IMPL
+#define IMPL SEQ
+#endif
+
+#define N_BATCHES (SIZE / BATCH)
+#define N_ELEMS (SIZE / sizeof(uint32_t))
+
+#ifndef LOG2_N_ROWS
+#define LOG2_N_ROWS 2
+#endif
+
+#define N_ROWS (1 << LOG2_N_ROWS)
+
+static inline void dma_multicast_sequential(uintptr_t l1_buffer,
+    uintptr_t l3_buffer, snrt_comm_t comm) {
+    // Only DMA cores of clusters in the communicator continue from here
+    if (!snrt_is_dm_core() || !comm->is_participant) return;
+
+    // Compute address of source buffer:
+    // - in memory tile for cluster 0
+    // - in left (west) neighbour for clusters in row 0
+    // - in bottom (south) neighbour for clusters in other rows
+    uintptr_t src;
+    if (snrt_cluster_idx() == 0) {
+        src = l3_buffer;
+    } else if (pb_cluster_in_row(0)) {
+        src = (uintptr_t)snrt_remote_l1_ptr((void *)l1_buffer,
+            snrt_cluster_idx(), pb_cluster_west_neighbour());
+    } else {
+        src = (uintptr_t)snrt_remote_l1_ptr((void *)l1_buffer,
+            snrt_cluster_idx(), pb_cluster_south_neighbour());
+    }
+
+    // Prepare for inter-cluster barrier in advance, preventing instruction
+    // reordering using the volatile block.
+    snrt_collective_t op;
+    op.f.opcode = SNRT_REDUCTION_BARRIER;
+    op.f.mask = snrt_get_collective_mask(comm);
+    volatile uint32_t *barrier_ptr = comm->barrier_ptr;
+    uint32_t user = (uint32_t)op.w;
+    asm volatile ("" : "+r"(user) ::);
+
+    // Iterations to cover all transfers in a row
+    uint32_t n_iters = N_BATCHES - 1 + pb_cluster_num_in_row();
+    for (uint32_t i = 0; i < n_iters; i++) {
+
+        // Every row-0 cluster is active for N_BATCHES iterations
+        // starting from the iteration with i == pb_cluster_col_idx()
+        char cluster_active = i >= pb_cluster_col_idx();
+        cluster_active &= i < (pb_cluster_col_idx() + N_BATCHES);
+        cluster_active &= pb_cluster_in_row(0);
+
+        // Every active cluster copies one batch from its source (see above)
+        if (cluster_active) {
+            snrt_mcycle();
+
+            // Start DMA
+            snrt_dma_start_1d(l1_buffer, src, BATCH);
+
+            // Update pointers for next iteration while transfer completes,
+            // preventing instructions from being reordered after the DMA wait
+            l1_buffer += BATCH;
+            src += BATCH;
+            asm volatile ("" : "+r"(l1_buffer), "+r"(src) ::);
+
+            // Wait for DMA to complete
+            snrt_dma_wait_all();
+            snrt_mcycle();
+        }
+
+        // Perform inter-cluster barrier. Disable reduction before the fence
+        // so it overlaps with the latency of the ongoing reduction operation.
+        snrt_set_awuser_low(user);
+        *barrier_ptr = 1;
+        snrt_set_awuser_low(0);
+        snrt_fence();
+    }
+
+    // Iterations to cover all transfers in each column
+    n_iters = N_BATCHES - 1 + N_ROWS - 1;
+    for (uint32_t i = 0; i < n_iters; i++) {
+
+        // Every cluster is active for N_BATCHES iterations
+        // starting from the iteration with (i + 1) == pb_cluster_row_idx()
+        char cluster_active = (i + 1) >= pb_cluster_row_idx();
+        cluster_active &= (i + 1) < (pb_cluster_row_idx() + N_BATCHES);
+        cluster_active &= !pb_cluster_in_row(0);
+
+        // Every active cluster copies data from the bottom tile
+        if (cluster_active) {
+            snrt_mcycle();
+
+            // Start DMA
+            snrt_dma_start_1d(l1_buffer, src, BATCH);
+
+            // Update pointers for next iteration while transfer completes,
+            // preventing instructions from being reordered after the DMA wait
+            l1_buffer += BATCH;
+            src += BATCH;
+            asm volatile ("" : "+r"(l1_buffer), "+r"(src) ::);
+
+            // Wait for DMA to complete
+            snrt_dma_wait_all();
+            snrt_mcycle();
+        }
+
+        // Perform inter-cluster barrier. Disable reduction before the fence
+        // so it overlaps with the latency of the ongoing reduction operation.
+        snrt_set_awuser_low(user);
+        *barrier_ptr = 1;
+        snrt_set_awuser_low(0);
+        snrt_fence();
+    }
+}
+
+static inline void dma_multicast_tree(uintptr_t l1_buffer,
+    uintptr_t l3_buffer, snrt_comm_t comm) {
+
+    // Only DMA cores of clusters in the communicator continue from here
+    if (!snrt_is_dm_core() || !comm->is_participant) return;
+
+    // Prepare for inter-cluster barrier in advance, preventing instruction
+    // reordering using the volatile block.
+    snrt_collective_t op;
+    op.f.opcode = SNRT_REDUCTION_BARRIER;
+    op.f.mask = snrt_get_collective_mask(comm);
+    volatile uint32_t *barrier_ptr = comm->barrier_ptr;
+    uint32_t user = (uint32_t)op.w;
+    asm volatile ("" : "+r"(user) ::);
+
+    // Iteration 0: cluster 0 fetches data from memory tile
+    if (snrt_cluster_idx() == 0) {
+        snrt_mcycle();
+        snrt_dma_start_1d(l1_buffer, l3_buffer, SIZE);
+        snrt_dma_wait_all();
+        snrt_mcycle();
+    }
+
+    // Iterations to cover all transfers in a row
+    uint32_t n_levels_in_row = pb_log2_cluster_num_in_row();
+    for (uint32_t i = 0; i < n_levels_in_row; i++) {
+
+        // Determine which clusters are senders at every level of the tree
+        char cluster_idx_inv = pb_cluster_num_in_row() - pb_cluster_col_idx();
+        char num_active_clusters = 1 << i;
+        char sender_stride = pb_cluster_num_in_row() / num_active_clusters;
+        char is_sender = ((cluster_idx_inv % sender_stride) == 0)
+            && pb_cluster_in_row(0);
+
+        // Every active cluster sends the data to a cluster on the right
+        if (is_sender) {
+
+            // Calculate destination (half a stride to the right, in row 0)
+            char receiver_offset = sender_stride / 2;
+            uintptr_t dst_cluster = pb_calculate_cluster_idx(0,
+                pb_cluster_col_idx() + receiver_offset);
+            uintptr_t dst = (uintptr_t)snrt_remote_l1_ptr((void *)l1_buffer,
+                snrt_cluster_idx(), dst_cluster);
+
+            snrt_mcycle();
+
+            // Start DMA
+            snrt_dma_start_1d(dst, l1_buffer, SIZE);
+
+            // Wait for DMA to complete
+            snrt_dma_wait_all();
+            snrt_mcycle();
+        }
+
+        // Perform inter-cluster barrier. Disable reduction before the fence
+        // so it overlaps with the latency of the ongoing reduction operation.
+        snrt_set_awuser_low(user);
+        *barrier_ptr = 1;
+        snrt_set_awuser_low(0);
+        snrt_fence();
+    }
+
+    // Iterations to cover all transfers in each column
+    uint32_t n_levels_in_col = LOG2_N_ROWS;
+    for (uint32_t i = 0; i < n_levels_in_col; i++) {
+
+        // Determine which clusters are senders at every level of the tree
+        char row_idx_inv = N_ROWS - pb_cluster_row_idx();
+        char num_active_rows = 1 << i;
+        char sender_stride = N_ROWS / num_active_rows;
+        char is_sender = (row_idx_inv % sender_stride) == 0;
+
+        // Every active cluster sends the data to a cluster above it
+        if (is_sender) {
+
+            // Calculate destination (half a stride up, in the same column)
+            char receiver_offset = sender_stride / 2;
+            uintptr_t dst_cluster = pb_calculate_cluster_idx(
+                pb_cluster_row_idx() + receiver_offset, pb_cluster_col_idx());
+            uintptr_t dst = (uintptr_t)snrt_remote_l1_ptr((void *)l1_buffer,
+                snrt_cluster_idx(), dst_cluster);
+
+            snrt_mcycle();
+
+            // Start DMA
+            snrt_dma_start_1d(dst, l1_buffer, SIZE);
+
+            // Wait for DMA to complete
+            snrt_dma_wait_all();
+            snrt_mcycle();
+        }
+
+        // Perform inter-cluster barrier. Disable reduction before the fence
+        // so it overlaps with the latency of the ongoing reduction operation.
+        snrt_set_awuser_low(user);
+        *barrier_ptr = 1;
+        snrt_set_awuser_low(0);
+        snrt_fence();
+    }
+}
+
+static inline void dma_multicast_hw(uintptr_t l1_buffer,
+    uintptr_t l3_buffer, snrt_comm_t comm) {
+    // Only DMA core of cluster 0 continues past this point
+    if (!snrt_is_dm_core() || !(snrt_cluster_idx() == 0)) return;
+
+    // Hardware multicast transfer
+    uint64_t mask = snrt_get_collective_mask(comm);
+    snrt_dma_enable_multicast(mask);
+    snrt_mcycle();  // only a start marker is taken inside this function
+    snrt_dma_start_1d(l1_buffer, l3_buffer, SIZE);
+    snrt_dma_disable_multicast();  // issued before waiting on the transfer
+    snrt_dma_wait_all();
+}
+
+// Dispatch to the multicast implementation selected at compile time (IMPL).
+// The L1 buffer must sit at the same TCDM offset in every invoking cluster.
+static inline void dma_multicast(uintptr_t l1_buffer, uintptr_t l3_buffer,
+    snrt_comm_t comm) {
+    switch (IMPL) {
+        case SEQ: dma_multicast_sequential(l1_buffer, l3_buffer, comm); break;
+        case TREE: dma_multicast_tree(l1_buffer, l3_buffer, comm); break;
+        case HW: dma_multicast_hw(l1_buffer, l3_buffer, comm); break;
+        default: break;
+    }
+}
+
+// Global variables for verification script
+uint32_t output[N_ELEMS * N_ROWS * pb_cluster_num_in_row()];
+extern const uint32_t n_clusters = N_ROWS * pb_cluster_num_in_row();
+extern const uint32_t length = N_ELEMS * N_ROWS * pb_cluster_num_in_row();
+
+int main() {
+
+    // Allocate 4KiB-aligned buffer in every cluster
+    uint32_t *buffer = (uint32_t *)snrt_l1_alloc_cluster_local(SIZE, 4096);
+
+    // Allocate 4KiB-aligned buffer in memory tile
+    uint32_t *l3_buffer = (uint32_t *)snrt_l3_alloc_v2(SIZE, 4096);
+
+    // First cluster initializes the L3 buffer with the pattern 1..N_ELEMS.
+    if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) {
+        for (uint32_t i = 0; i < N_ELEMS; i++) {
+            l3_buffer[i] = i + 1;
+        }
+    }
+
+    // TODO(colluca): is this needed?
+    // Every cluster in row 0 initializes its destination buffer
+    if (snrt_is_dm_core() && pb_cluster_in_row(0)) {
+        snrt_dma_start_1d(
+            (uintptr_t)buffer, (uintptr_t)(snrt_cluster()->zeromem.mem), SIZE);
+        snrt_dma_wait_all();
+    }
+
+    // Create communicator for first N rows (all other clusters are inactive)
+    snrt_comm_t comm;
+    pb_create_mesh_comm(&comm, N_ROWS, pb_cluster_num_in_row());
+
+    // Only DMA cores of clusters in the first N rows continue from here
+    if (!snrt_is_dm_core() || !comm->is_participant) return 0;
+
+    // Synchronize all clusters.
+    snrt_inter_cluster_barrier(comm);
+
+    // Initiate multicast transfer (twice to preheat the cache)
+    for (volatile int i = 0; i < 2; i++) {
+        snrt_mcycle();
+        dma_multicast((uintptr_t)buffer, (uintptr_t)l3_buffer, comm);
+        snrt_mcycle();
+        snrt_inter_cluster_barrier(comm);
+    }
+
+    // Writeback to L3 (non-DM cores already returned above)
+    if (snrt_is_dm_core() && (pb_cluster_row_idx() < N_ROWS)) {
+        uint32_t cluster_idx = pb_cluster_col_idx() +
+            pb_cluster_row_idx() * pb_cluster_num_in_row();  // row-major slot
+        uint32_t offset = cluster_idx * SIZE;  // byte offset into `output`
+        snrt_dma_start_1d(
+            (uintptr_t)output + offset, (uintptr_t)buffer, SIZE);
+        snrt_dma_wait_all();
+    }
+
+    return 0;
+}
diff --git a/sw/snitch/tests/overlapping_barriers.c b/sw/snitch/tests/overlapping_barriers.c
new file mode 100644
index 00000000..d74c9bce
--- /dev/null
+++ b/sw/snitch/tests/overlapping_barriers.c
@@ -0,0 +1,34 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande <colluca@iis.ee.ethz.ch>
+//
+// Test that multiple outstanding barriers with intersecting (or overlapping)
+// participant sets do not deadlock the system. The "participant set" is
+// defined as the set of clusters that participate in the barrier.
+
+#include "snrt.h"
+
+int main (void) {
+
+    // Create communicator for row 0
+    snrt_comm_t comm;
+    pb_create_mesh_comm(&comm, 1, pb_cluster_num_in_row());
+
+    // Make sure row-0 clusters arrive on the row-0 barrier only after the
+    // other clusters have arrived on the global barrier. This ensures that
+    // the global barrier, arriving first, takes ownership of the router,
+    // preventing the row-0 clusters from ever reaching the global barrier
+    // and consequently deadlocking the system, if multiple outstanding
+    // reductions are not properly supported.
+    if (pb_cluster_row_idx() == 0) {
+        for (int j = 0; j < 100; j++) snrt_nop();
+        snrt_global_barrier(comm);
+    }
+
+    // All clusters, row 0 included, finally meet on the global barrier
+    snrt_global_barrier();
+
+    return 0;
+}
diff --git a/sw/snitch/tests/parallel_row_col_barriers.c b/sw/snitch/tests/parallel_row_col_barriers.c
new file mode 100644
index 00000000..acb1bd41
--- /dev/null
+++ b/sw/snitch/tests/parallel_row_col_barriers.c
@@ -0,0 +1,33 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande <colluca@iis.ee.ethz.ch>
+//
+// Stress test row, column and global barriers happening in rapid succession.
+
+#include "snrt.h"
+
+int main (void) {
+
+    // One communicator per mesh row and one per mesh column
+    snrt_comm_t row_comm[pb_cluster_num_in_col()];
+    snrt_comm_t col_comm[pb_cluster_num_in_row()];
+    for (uint32_t row = 0; row < pb_cluster_num_in_col(); row++) {
+        pb_create_mesh_comm(&row_comm[row], 1, pb_cluster_num_in_row(),
+            row, 0);
+    }
+    for (uint32_t col = 0; col < pb_cluster_num_in_row(); col++) {
+        pb_create_mesh_comm(&col_comm[col], pb_cluster_num_in_col(), 1,
+            0, col);
+    }
+
+    // Back-to-back barriers on the own row, the own column and all clusters
+    for (int rep = 0; rep < 10; rep++) {
+        snrt_global_barrier(row_comm[pb_cluster_row_idx()]);
+        snrt_global_barrier(col_comm[pb_cluster_col_idx()]);
+        snrt_global_barrier();
+    }
+
+    return 0;
+}
diff --git a/sw/snitch/tests/reduction_benchmark.c b/sw/snitch/tests/reduction_benchmark.c
new file mode 100644
index 00000000..45d75a1c
--- /dev/null
+++ b/sw/snitch/tests/reduction_benchmark.c
@@ -0,0 +1,502 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande <colluca@iis.ee.ethz.ch>
+// Chen Wu <chenwu@iis.ee.ethz.ch>
+
+#include "snrt.h"
+#include "blas.h"
+
+// Transfer size in bytes (overridable at compile time)
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+// Batch size in bytes (overridable at compile time)
+#ifndef BATCH
+#define BATCH 64
+#endif
+
+typedef enum {
+    SEQ,   // selects dma_reduction_seq()
+    TREE,  // selects dma_reduction_tree()
+    HW     // selects dma_reduction_hw()
+} impl_t;
+
+#ifndef IMPL
+#define IMPL SEQ
+#endif
+
+#define N_BATCHES (SIZE / BATCH)  // batches per transfer
+#define N_ELEMS (SIZE / sizeof(double))  // doubles per transfer
+
+#ifndef LOG2_N_ROWS
+#define LOG2_N_ROWS 0
+#endif
+
+#define N_ROWS (1 << LOG2_N_ROWS)  // rows taking part in the reduction
+
+// Replace `1` literals with this thread-local variable to prevent it from
+// being allocated by the compiler in L2.
+__thread double one = 1;
+
+static inline void global_hw_barrier(volatile uint32_t *barrier_ptr,
+    uint32_t user) {
+    // Inter-cluster barrier, issued by the DM core as a store to `barrier_ptr`
+    // with `user` set through snrt_set_awuser_low(). Disable reduction before
+    // the fence so it overlaps with the latency of the ongoing reduction.
+    if (snrt_is_dm_core()) {
+        snrt_set_awuser_low(user);
+        *barrier_ptr = 1;
+        snrt_set_awuser_low(0);
+        snrt_fence();
+    }
+    snrt_cluster_hw_barrier();  // reached by every core, DM or not
+}
+
+static inline void swap_buffers(uintptr_t *a, uintptr_t *b) {
+    const uintptr_t old_b = *b;  // keep b's value while it is overwritten
+    *b = *a;
+    *a = old_b;
+}
+
+static inline void dma_reduction_hw(uintptr_t src, uintptr_t dst,
+    snrt_comm_t comm) {
+    // Two-step FADD reduction: along each row, then along the first column.
+    // Create a communicator per row
+    snrt_comm_t row_comm[N_ROWS];
+    for (int i = 0; i < N_ROWS; i++)
+        pb_create_mesh_comm(&row_comm[i], 1, pb_cluster_num_in_row(), i, 0,
+            comm);
+
+    // Create a communicator for the first column
+    snrt_comm_t col_comm;
+    pb_create_mesh_comm(&col_comm, N_ROWS, 1, 0, 0, comm);
+
+    if (snrt_is_dm_core() && comm->is_participant) {
+        uint32_t remote_cluster;
+        uintptr_t remote_dst;
+        snrt_collective_opcode_t op = SNRT_REDUCTION_FADD;
+
+        // Reduction across rows (destination: first cluster in row)
+        snrt_mcycle();
+        remote_cluster = pb_calculate_cluster_idx(pb_cluster_row_idx(), 0);
+        remote_dst = (uintptr_t)snrt_remote_l1_ptr(
+            (void *)dst, snrt_cluster_idx(), remote_cluster);
+        snrt_dma_start_1d_reduction(
+            remote_dst, src, SIZE, row_comm[pb_cluster_row_idx()], op);
+        snrt_dma_wait_all();
+        snrt_mcycle();
+
+        // Barrier to ensure there is only one outstanding reduction in every
+        // router
+        snrt_inter_cluster_barrier(comm);
+
+        // Reduction across first column (destination: cluster 0)
+        snrt_mcycle();
+        if (N_ROWS > 1 && col_comm->is_participant) {
+            // Buffers swap roles: row results in dst are reduced into src
+            remote_dst = (uintptr_t)snrt_remote_l1_ptr(
+                (void *)src, snrt_cluster_idx(), 0);
+            snrt_dma_start_1d_reduction(remote_dst, dst, SIZE, col_comm, op);
+            snrt_dma_wait_all();
+        }
+        snrt_mcycle();
+    }
+}
+
+// Software reduction hopping between neighbouring clusters (west along rows,
+// then south along column 0). Needs four buffers; input data must be in a.
+static inline void dma_reduction_seq(uintptr_t a, uintptr_t b, uintptr_t c,
+    uintptr_t d, snrt_comm_t comm) {
+    if (!comm->is_participant) return;
+
+    uint32_t col_idx = pb_cluster_col_idx();
+    uint32_t row_idx = pb_cluster_row_idx();
+    uintptr_t is_northernmost = pb_cluster_in_row(N_ROWS - 1);
+
+    // Receive buffers b and d, grouped to simplify alternating between them
+    uintptr_t dst[2] = {b, d};
+
+    // Compute addresses of source and destination data for DMA cores.
+    // All clusters in col > 0 participate in the row reduction. For this
+    // reduction the source is in `a` and the result in `c`.
+    // Only the easternmost clusters send their input data (in `a`) to their
+    // west neighbours, all other clusters send the data they reduced in the
+    // previous step (in `c`).
+    // Clusters in col == 0 participate in the column reduction. For this
+    // reduction the source is in `c` and the result in `a`.
+    // Only the northernmost cluster sends the source data (in `c`) to its
+    // south neighbour, all other clusters send the data they reduced in the
+    // previous step (in `a`).
+    uintptr_t dma_src, dma_dst;
+    if (pb_cluster_is_easternmost() || (col_idx == 0 && !is_northernmost)) {
+        dma_src = a;
+    } else {
+        dma_src = c;
+    }
+    // Clusters in col > 0, participating in row reduction, send to west
+    // neighbours. Clusters in col == 0, participating in column reduction,
+    // send to south neighbours.
+    uint32_t dst_cluster = col_idx > 0 ? pb_cluster_west_neighbour() :
+        pb_cluster_south_neighbour();
+    dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0],
+        snrt_cluster_idx(), dst_cluster);
+
+    // Compute addresses of source and result data for compute cores
+    uintptr_t comp_src1, comp_src2, comp_result;
+    comp_src1 = a;
+    comp_src2 = dst[0];
+    comp_result = c;
+
+    // Prepare for inter-cluster barrier in advance, preventing instruction
+    // reordering using the volatile block.
+    snrt_collective_t op;
+    op.f.opcode = SNRT_REDUCTION_BARRIER;
+    op.f.mask = snrt_get_collective_mask(comm);
+    volatile uint32_t *barrier_ptr = comm->barrier_ptr;
+    uint32_t user = (uint32_t)op.w;
+    asm volatile ("" : "+r"(user) ::);
+
+    // Iterations to cover all reductions in a row
+    uint32_t n_iters = 1 + 2 * (pb_cluster_num_in_row() - 2) + N_BATCHES;
+    for (uint32_t i = 0; i < n_iters; i++) {
+
+        uint32_t col_idx_inv, base_iter;
+        char dma_active, compute_active;
+
+        // Determine which clusters need to send data at every iteration
+        col_idx_inv = pb_cluster_num_in_row() - col_idx - 1;
+        base_iter = 2 * col_idx_inv;
+        dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES));
+        dma_active &= !pb_cluster_in_col(0);  // column 0 never sends here
+
+        // Determine which clusters need to reduce data at every iteration
+        base_iter = 1 + 2 * (col_idx_inv - 1);
+        compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES));
+        compute_active &= !pb_cluster_is_easternmost();  // never receives
+
+        if (compute_active && snrt_is_compute_core()) {
+            snrt_mcycle();
+            axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1,
+                (double *)comp_src2, (double *)comp_result);
+            comp_src1 += BATCH;
+            swap_buffers(&dst[0], &dst[1]);  // alternate between b and d
+            comp_src2 = dst[0];
+            comp_result += BATCH;
+            snrt_mcycle();
+        }
+
+        if (dma_active && snrt_is_dm_core()) {
+
+            // Start DMA
+            snrt_mcycle();
+            snrt_dma_start_1d(dma_dst, dma_src, BATCH);
+
+            // Update pointers for next iteration while transfer completes,
+            // preventing instructions from being reordered after the DMA wait
+            dma_src += BATCH;
+            swap_buffers(&dst[0], &dst[1]);
+            dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0],
+                snrt_cluster_idx(), pb_cluster_west_neighbour());
+            asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::);
+
+            // Wait for DMA to complete
+            snrt_dma_wait_all();
+            snrt_mcycle();
+        }
+
+        global_hw_barrier(barrier_ptr, user);
+    }
+
+    // Final column reduction is performed only if there is more than one row
+    if (N_ROWS > 1) {
+        dst[0] = b;
+        dst[1] = d;
+        comp_src1 = c;
+        comp_src2 = dst[0];
+        comp_result = a;
+        
+        // Iterations to cover all reductions in the first column
+        n_iters = 1 + 2 * (N_ROWS - 2) + N_BATCHES;
+        for (uint32_t i = 0; i < n_iters; i++) {
+
+            uint32_t row_idx_inv, base_iter;
+            char dma_active, compute_active;
+    
+            // Determine which clusters need to send data at every iteration
+            row_idx_inv = N_ROWS - row_idx - 1;
+            base_iter = 2 * row_idx_inv;
+            dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES));
+            dma_active &= pb_cluster_in_col(0);
+            dma_active &= snrt_cluster_idx() != 0;  // cluster 0 only receives
+
+            // Determine which clusters need to reduce data at every iteration
+            base_iter = 1 + 2 * (row_idx_inv - 1);
+            compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES));
+            compute_active &= pb_cluster_in_col(0);
+            compute_active &= !is_northernmost;  // northernmost never receives
+
+            if (compute_active && snrt_is_compute_core()) {
+                snrt_mcycle();
+                axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1,
+                    (double *)comp_src2, (double *)comp_result);
+                comp_src1 += BATCH;
+                swap_buffers(&dst[0], &dst[1]);
+                comp_src2 = dst[0];
+                comp_result += BATCH;
+                snrt_mcycle();
+            }
+
+            if (dma_active && snrt_is_dm_core()) {
+                // Start DMA
+                snrt_mcycle();
+                snrt_dma_start_1d(dma_dst, dma_src, BATCH);
+
+                // Update pointers for next iteration while transfer completes,
+                // preventing instructions from being reordered after the DMA wait
+                dma_src += BATCH;
+                swap_buffers(&dst[0], &dst[1]);
+                dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0],
+                    snrt_cluster_idx(), pb_cluster_south_neighbour());
+                asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::);
+
+                // Wait for DMA to complete
+                snrt_dma_wait_all();
+                snrt_mcycle();
+            }
+
+            global_hw_barrier(barrier_ptr, user);
+        }
+    }
+}
+
+static inline void dma_reduction_tree_transfer_left_phase(char is_sender, char dist,
+    uintptr_t src, uintptr_t dst) {
+    // Tree reduction transfer step within a row: every sending cluster copies
+    // BATCH bytes from `src` to `dst` of the cluster `dist` columns to its left
+    if (is_sender && snrt_is_dm_core()) {
+
+        // Destination: same row, column index lowered by `dist`
+        uintptr_t dst_cluster = pb_calculate_cluster_idx(
+            pb_cluster_row_idx(), pb_cluster_col_idx() - dist);
+        uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst,
+            snrt_cluster_idx(), dst_cluster);
+
+        snrt_mcycle();
+
+        // Start DMA
+        snrt_dma_start_1d(remote_dst, src, BATCH);
+
+        // Wait for DMA to complete
+        snrt_dma_wait_all();
+        snrt_mcycle();
+    }
+}
+
+static inline void dma_reduction_tree_transfer_south_phase(char is_sender, char dist,
+    uintptr_t src, uintptr_t dst) {
+    // Tree reduction transfer step within a column: every sending cluster copies
+    // BATCH bytes from `src` to `dst` of the cluster `dist` rows to its south
+    if (is_sender && snrt_is_dm_core()) {
+
+        // Destination: same column, row index lowered by `dist`
+        uintptr_t dst_cluster = pb_calculate_cluster_idx(
+            pb_cluster_row_idx() - dist, pb_cluster_col_idx());
+        uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst,
+            snrt_cluster_idx(), dst_cluster);
+
+        snrt_mcycle();
+
+        // Start DMA
+        snrt_dma_start_1d(remote_dst, src, BATCH);
+
+        // Wait for DMA to complete
+        snrt_dma_wait_all();
+        snrt_mcycle();
+    }
+}
+
+static inline void dma_reduction_tree_compute_phase(char is_receiver,
+    uintptr_t src, uintptr_t dst, uintptr_t result) {
+    // Compute step of the tree reduction for one batch.
+    // Every receiving cluster reduces the data from the sender
+    if (is_receiver && snrt_is_compute_core()) {
+        snrt_mcycle();
+        axpy_opt(N_ELEMS / N_BATCHES, one, (double *)src, (double *)dst,
+            (double *)result);
+        snrt_mcycle();
+    }
+    snrt_cluster_hw_barrier();  // reached by every core, receiver or not
+}
+
+// We need four buffers. Upon function invocation, the data must be in a.
+static inline void dma_reduction_tree(uintptr_t a, uintptr_t b, uintptr_t c,
+    uintptr_t d, snrt_comm_t comm) {
+    // Pairwise tree reduction: along every row first, then along column 0.
+    // Only clusters in the communicator continue from here
+    if (!comm->is_participant) return;
+
+    // Prepare for inter-cluster barrier in advance, preventing instruction
+    // reordering using the volatile block.
+    snrt_collective_t op;
+    op.f.opcode = SNRT_REDUCTION_BARRIER;
+    op.f.mask = snrt_get_collective_mask(comm);
+    volatile uint32_t *barrier_ptr = comm->barrier_ptr;
+    uint32_t user = (uint32_t)op.w;
+    asm volatile ("" : "+r"(user) ::);
+
+    // Initially a is the source buffer and c is the result buffer
+    uintptr_t src = a;
+    uintptr_t dst[2] = {b, d};
+    uintptr_t result = c;
+
+    // Iterations to cover all reductions in a row
+    uint32_t n_levels_in_row = pb_log2_cluster_num_in_row();
+    for (uint32_t i = 0; i < n_levels_in_row; i++) {
+
+        // Determine which clusters are senders and receivers at every level
+        // of the tree
+        char dist = 1 << i;  // Distance between clusters in a pair
+        char is_sender = (pb_cluster_col_idx() % (2*dist)) == dist;
+        char is_receiver = (pb_cluster_col_idx() % (2*dist)) == 0;
+
+        // Transfer phase for first batch
+        dma_reduction_tree_transfer_left_phase(is_sender, dist, src, dst[0]);
+        global_hw_barrier(barrier_ptr, user);
+
+        // If more than one batch is present, there will be N_BATCHES - 1
+        // iterations in which we perform both transfer and compute phases
+        uint32_t j = 0;
+        for (; j < N_BATCHES - 1; j++) {
+            // Alternate destination buffers for every batch
+            uintptr_t dma_dst = dst[(j+1)%2];
+            uintptr_t comp_dst = dst[j%2];
+
+            // Calculate pointers for current batch
+            uintptr_t batch_dma_src = src + (j+1) * BATCH;
+            uintptr_t batch_comp_src = src + j * BATCH;
+            uintptr_t batch_result = result + j * BATCH;
+
+            dma_reduction_tree_transfer_left_phase(is_sender, dist, batch_dma_src,
+                dma_dst);
+            dma_reduction_tree_compute_phase(is_receiver, batch_comp_src,
+                comp_dst, batch_result);
+            global_hw_barrier(barrier_ptr, user);
+        }
+
+        // Compute phase for last batch
+        dma_reduction_tree_compute_phase(is_receiver, src + j * BATCH,
+            dst[j%2], result + j * BATCH);
+
+        // Swap source and result buffers for next iteration
+        swap_buffers(&src, &result);
+    }
+
+    // Iterations to cover all reductions in the first column
+    uint32_t n_levels_in_col = LOG2_N_ROWS;
+    for (uint32_t i = 0; i < n_levels_in_col; i++) {
+
+        // Determine which clusters in column 0 are senders and receivers at
+        // every level of the tree
+        char dist = 1 << i;  // Distance between clusters in a pair
+        char is_sender = (pb_cluster_row_idx() % (2*dist)) == dist && pb_cluster_in_col(0);
+        char is_receiver = (pb_cluster_row_idx() % (2*dist)) == 0 && pb_cluster_in_col(0);
+        // Transfer phase for first batch
+        dma_reduction_tree_transfer_south_phase(is_sender, dist, src, dst[0]);
+        global_hw_barrier(barrier_ptr, user);
+
+        // If more than one batch is present, there will be N_BATCHES - 1
+        // iterations in which we perform both transfer and compute phases
+        uint32_t j = 0;
+        for (; j < N_BATCHES - 1; j++) {
+            // Alternate destination buffers for every batch
+            uintptr_t dma_dst = dst[(j+1)%2];
+            uintptr_t comp_dst = dst[j%2];
+
+            // Calculate pointers for current batch
+            uintptr_t batch_dma_src = src + (j+1) * BATCH;
+            uintptr_t batch_comp_src = src + j * BATCH;
+            uintptr_t batch_result = result + j * BATCH;
+
+            dma_reduction_tree_transfer_south_phase(is_sender, dist, batch_dma_src,
+                dma_dst);
+            dma_reduction_tree_compute_phase(is_receiver, batch_comp_src,
+                comp_dst, batch_result);
+            global_hw_barrier(barrier_ptr, user);
+        }
+
+        // Compute phase for last batch
+        dma_reduction_tree_compute_phase(is_receiver, src + j * BATCH,
+            dst[j%2], result + j * BATCH);
+
+        // Swap source and result buffers for next iteration
+        swap_buffers(&src, &result);
+    }
+}
+
+// Dispatch to the reduction selected by IMPL. The L1 buffers of every cluster
+// invoking this function should be at the same offset in the TCDM.
+static inline void dma_reduction(uintptr_t a, uintptr_t b, uintptr_t c,
+    uintptr_t d, snrt_comm_t comm) {
+    switch (IMPL) {
+        case SEQ:  dma_reduction_seq(a, b, c, d, comm); break;
+        case TREE: dma_reduction_tree(a, b, c, d, comm); break;
+        case HW:   dma_reduction_hw(a, c, comm); break;
+        default:   break;
+    }
+}
+
+// Global variables for verification script
+double output[N_ELEMS];  // filled by cluster 0 at the end of main()
+extern const uint32_t n_clusters = N_ROWS * pb_cluster_num_in_row();
+extern const uint32_t length = N_ELEMS;  // number of doubles in output
+
+int main (void){
+    // Allocate 4KiB-aligned buffers in every cluster
+    uintptr_t a_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE, 4096);
+    uintptr_t c_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE, 4096);
+    uintptr_t b_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096);
+    uintptr_t d_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096);
+
+    // Create communicator for first N rows (all other clusters are inactive)
+    snrt_comm_t comm;
+    pb_create_mesh_comm(&comm, N_ROWS, pb_cluster_num_in_row());
+
+    // Only clusters in the first N rows continue from here
+    if (!comm->is_participant) return 0;
+
+    // Two iterations to preheat the cache
+    for (volatile int iter = 0; iter < 2; iter++){
+        // Initialize source buffer: element e is set to cluster index + e
+        if (snrt_is_dm_core()) {
+            uint32_t row_major_cluster_idx = pb_cluster_col_idx() +
+                pb_cluster_row_idx() * pb_cluster_num_in_row();
+            for (uint32_t e = 0; e < N_ELEMS; e++) {
+                ((double *)a_buffer)[e] = row_major_cluster_idx + e;
+            }
+        }
+        snrt_global_barrier(comm);
+
+        // Perform reduction
+        snrt_mcycle();
+        dma_reduction(a_buffer, b_buffer, c_buffer, d_buffer, comm);
+        snrt_mcycle();
+        snrt_global_barrier(comm);
+    }
+
+    // Writeback to L3 from the buffer in which the result was left
+    uintptr_t result_buffer = c_buffer;
+    uint32_t total_tree_levels = pb_log2_cluster_num_in_row() + LOG2_N_ROWS;
+    if ((IMPL == HW && N_ROWS > 1) ||
+        (IMPL == SEQ && N_ROWS > 1) ||
+        (IMPL == TREE && (total_tree_levels % 2) == 0))
+        result_buffer = a_buffer;
+    if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) {
+        snrt_dma_start_1d((uintptr_t)output, result_buffer, SIZE);
+        snrt_dma_wait_all();
+    }
+
+    return 0;
+}
diff --git a/sw/snitch/tests/reduction_benchmark_hyperbank.c b/sw/snitch/tests/reduction_benchmark_hyperbank.c
new file mode 100644
index 00000000..747532b2
--- /dev/null
+++ b/sw/snitch/tests/reduction_benchmark_hyperbank.c
@@ -0,0 +1,776 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Luca Colagrande <colluca@iis.ee.ethz.ch>
+// Chen Wu <chenwu@iis.ee.ethz.ch>
+
+#include "snrt.h"
+#include "blas.h"
+
+// Transfer size in bytes (overridable at compile time)
+#ifndef SIZE
+#define SIZE 8192
+#endif
+
+// Batch size in bytes (overridable at compile time)
+#ifndef BATCH
+#define BATCH 1024
+#endif
+
+typedef enum {
+    SEQ,
+    TREE,
+    HW_GENERIC,  // presumably dma_reduction_hw_generic() - TODO confirm
+    HW_SIMPLE,   // presumably dma_reduction_hw_simple() - TODO confirm
+} impl_t;
+
+#ifndef IMPL
+#define IMPL SEQ
+#endif
+
+#define N_BATCHES (SIZE / BATCH)  // batches per transfer
+#define N_ELEMS (SIZE / sizeof(double))  // doubles per transfer
+
+#ifndef LOG2_N_ROWS
+#define LOG2_N_ROWS 2
+#endif
+
+#define N_ROWS (1 << LOG2_N_ROWS)  // rows taking part in the reduction
+
+// Replace `1` literals with this thread-local variable to prevent it from
+// being allocated by the compiler in L2.
+__thread double one = 1;
+
+static inline void global_hw_barrier(volatile uint32_t *barrier_ptr,
+    uint32_t user) {
+    // Inter-cluster barrier, issued by the DM core as a store to `barrier_ptr`
+    // with `user` set through snrt_set_awuser_low(). Disable reduction before
+    // the fence so it overlaps with the latency of the ongoing reduction.
+    if (snrt_is_dm_core()) {
+        snrt_set_awuser_low(user);
+        *barrier_ptr = 1;
+        snrt_set_awuser_low(0);
+        snrt_fence();
+    }
+    snrt_cluster_hw_barrier();  // reached by every core, DM or not
+}
+
+static inline void swap_buffers(uintptr_t *a, uintptr_t *b) {
+    const uintptr_t old_b = *b;  // keep b's value while it is overwritten
+    *b = *a;
+    *a = old_b;
+}
+
+static inline void dma_reduction_hw_generic(uintptr_t src, uintptr_t dst,
+    snrt_comm_t comm) {
+    // Only the DM core of a participating cluster issues the transfer:
+    // one FADD reduction over the whole communicator, landing in cluster 0
+    if (!snrt_is_dm_core() || !comm->is_participant) return;
+    uintptr_t cluster0_dst = (uintptr_t)snrt_remote_l1_ptr(
+        (void *)dst, snrt_cluster_idx(), 0);
+    snrt_collective_opcode_t fadd = SNRT_REDUCTION_FADD;
+    snrt_dma_start_1d_reduction(cluster0_dst, src, SIZE, comm, fadd);
+    snrt_dma_wait_all();
+}
+
+static inline void dma_reduction_hw_simple(uintptr_t src, uintptr_t dst,
+    snrt_comm_t comm) {
+    // Two-step FADD reduction: along each row, then along the first column.
+    // Create a communicator per row
+    snrt_comm_t row_comm[N_ROWS];
+    for (int i = 0; i < N_ROWS; i++)
+        pb_create_mesh_comm(&row_comm[i], 1, pb_cluster_num_in_row(), i, 0,
+            comm);
+
+    // Create a communicator for the first column
+    snrt_comm_t col_comm;
+    pb_create_mesh_comm(&col_comm, N_ROWS, 1, 0, 0, comm);
+
+    if (snrt_is_dm_core() && comm->is_participant) {
+        uint32_t remote_cluster;
+        uintptr_t remote_dst;
+        snrt_collective_opcode_t op = SNRT_REDUCTION_FADD;
+
+        // Reduction across rows (destination: first cluster in row)
+        snrt_mcycle();
+        remote_cluster = pb_calculate_cluster_idx(pb_cluster_row_idx(), 0);
+        remote_dst = (uintptr_t)snrt_remote_l1_ptr(
+            (void *)dst, snrt_cluster_idx(), remote_cluster);
+        snrt_dma_start_1d_reduction(
+            remote_dst, src, SIZE, row_comm[pb_cluster_row_idx()], op);
+        snrt_dma_wait_all();
+        snrt_mcycle();
+
+        // Barrier to ensure there is only one outstanding reduction in every
+        // router
+        snrt_inter_cluster_barrier(comm);
+
+        // Reduction across first column (destination: cluster 0)
+        snrt_mcycle();
+        if (N_ROWS > 1 && col_comm->is_participant) {
+            // Buffers swap roles: row results in dst are reduced into src
+            remote_dst = (uintptr_t)snrt_remote_l1_ptr(
+                (void *)src, snrt_cluster_idx(), 0);
+            snrt_dma_start_1d_reduction(remote_dst, dst, SIZE, col_comm, op);
+            snrt_dma_wait_all();
+        }
+        snrt_mcycle();
+    }
+}
+
+static inline void dma_reduction_seq_normal(uintptr_t a, uintptr_t b, uintptr_t c,
+    uintptr_t d, snrt_comm_t comm) {
+    // Reduce between neighbours: west along rows, then south along column 0
+    if (!comm->is_participant) return;
+
+    uint32_t col_idx = pb_cluster_col_idx();
+    uint32_t row_idx = pb_cluster_row_idx();
+    uintptr_t is_northernmost = pb_cluster_in_row(N_ROWS - 1);
+
+    // Receive buffers b and d, grouped to simplify alternating between them
+    uintptr_t dst[2] = {b, d};
+
+    // Compute addresses of source and destination data for DMA cores.
+    // All clusters in col > 0 participate in the row reduction. For this
+    // reduction the source is in `a` and the result in `c`.
+    // Only the easternmost clusters send their input data (in `a`) to their
+    // west neighbours, all other clusters send the data they reduced in the
+    // previous step (in `c`).
+    // Clusters in col == 0 participate in the column reduction. For this
+    // reduction the source is in `c` and the result in `a`.
+    // Only the northernmost cluster sends the source data (in `c`) to its
+    // south neighbour, all other clusters send the data they reduced in the
+    // previous step (in `a`).
+    uintptr_t dma_src, dma_dst;
+    if (pb_cluster_is_easternmost() || (col_idx == 0 && !is_northernmost)) {
+        dma_src = a;
+    } else {
+        dma_src = c;
+    }
+    // Clusters in col > 0, participating in row reduction, send to west
+    // neighbours. Clusters in col == 0, participating in column reduction,
+    // send to south neighbours.
+    uint32_t dst_cluster = col_idx > 0 ? pb_cluster_west_neighbour() :
+        pb_cluster_south_neighbour();
+    dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0],
+        snrt_cluster_idx(), dst_cluster);
+
+    // Compute addresses of source and result data for compute cores
+    uintptr_t comp_src1, comp_src2, comp_result;
+    comp_src1 = a;
+    comp_src2 = dst[0];
+    comp_result = c;
+
+    // Prepare for inter-cluster barrier in advance, preventing instruction
+    // reordering using the volatile block.
+    snrt_collective_t op;
+    op.f.opcode = SNRT_REDUCTION_BARRIER;
+    op.f.mask = snrt_get_collective_mask(comm);
+    volatile uint32_t *barrier_ptr = comm->barrier_ptr;
+    uint32_t user = (uint32_t)op.w;
+    asm volatile ("" : "+r"(user) ::);
+
+    // Iterations to cover all reductions in a row
+    uint32_t n_iters = 1 + 2 * (pb_cluster_num_in_row() - 2) + N_BATCHES;
+    for (uint32_t i = 0; i < n_iters; i++) {
+
+        uint32_t col_idx_inv, base_iter;
+        char dma_active, compute_active;
+
+        // Determine which clusters need to send data at every iteration
+        col_idx_inv = pb_cluster_num_in_row() - col_idx - 1;
+        base_iter = 2 * col_idx_inv;
+        dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES));
+        dma_active &= !pb_cluster_in_col(0);  // column 0 never sends here
+
+        // Determine which clusters need to reduce data at every iteration
+        base_iter = 1 + 2 * (col_idx_inv - 1);
+        compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES));
+        compute_active &= !pb_cluster_is_easternmost();  // never receives
+
+        if (compute_active && snrt_is_compute_core()) {
+            snrt_mcycle();
+            axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1,
+                (double *)comp_src2, (double *)comp_result);
+            comp_src1 += BATCH;
+            swap_buffers(&dst[0], &dst[1]);  // alternate between b and d
+            comp_src2 = dst[0];
+            comp_result += BATCH;
+            snrt_mcycle();
+        }
+
+        if (dma_active && snrt_is_dm_core()) {
+
+            // Start DMA
+            snrt_mcycle();
+            snrt_dma_start_1d(dma_dst, dma_src, BATCH);
+
+            // Update pointers for next iteration while transfer completes,
+            // preventing instructions from being reordered after the DMA wait
+            dma_src += BATCH;
+            swap_buffers(&dst[0], &dst[1]);
+            dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0],
+                snrt_cluster_idx(), pb_cluster_west_neighbour());
+            asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::);
+
+            // Wait for DMA to complete
+            snrt_dma_wait_all();
+            snrt_mcycle();
+        }
+
+        global_hw_barrier(barrier_ptr, user);
+    }
+
+    // Final column reduction is performed only if there is more than one row
+    if (N_ROWS > 1) {
+        dst[0] = b;
+        dst[1] = d;
+        comp_src1 = c;
+        comp_src2 = dst[0];
+        comp_result = a;
+        
+        // Iterations to cover all reductions in the first column
+        n_iters = 1 + 2 * (N_ROWS - 2) + N_BATCHES;
+        for (uint32_t i = 0; i < n_iters; i++) {
+
+            uint32_t row_idx_inv, base_iter;
+            char dma_active, compute_active;
+    
+            // Determine which clusters need to send data at every iteration
+            row_idx_inv = N_ROWS - row_idx - 1;
+            base_iter = 2 * row_idx_inv;
+            dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES));
+            dma_active &= pb_cluster_in_col(0);
+            dma_active &= snrt_cluster_idx() != 0;  // cluster 0 only receives
+
+            // Determine which clusters need to reduce data at every iteration
+            base_iter = 1 + 2 * (row_idx_inv - 1);
+            compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES));
+            compute_active &= pb_cluster_in_col(0);
+            compute_active &= !is_northernmost;  // northernmost never receives
+
+            if (compute_active && snrt_is_compute_core()) {
+                snrt_mcycle();
+                axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1,
+                    (double *)comp_src2, (double *)comp_result);
+                comp_src1 += BATCH;
+                swap_buffers(&dst[0], &dst[1]);
+                comp_src2 = dst[0];
+                comp_result += BATCH;
+                snrt_mcycle();
+            }
+
+            if (dma_active && snrt_is_dm_core()) {
+                // Start DMA
+                snrt_mcycle();
+                snrt_dma_start_1d(dma_dst, dma_src, BATCH);
+
+                // Update pointers for next iteration while transfer completes,
+                // preventing instructions from being reordered after the DMA wait
+                dma_src += BATCH;
+                swap_buffers(&dst[0], &dst[1]);
+                dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0],
+                    snrt_cluster_idx(), pb_cluster_south_neighbour());
+                asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::);
+
+                // Wait for DMA to complete
+                snrt_dma_wait_all();
+                snrt_mcycle();
+            }
+
+            global_hw_barrier(barrier_ptr, user);
+        }
+    }
+}
+
+
+// We need four buffers. Upon function invocation, the data must be in a.
+static inline void dma_reduction_seq_hyperbank(uintptr_t a0, uintptr_t a1,
+                                 uintptr_t b,
+                                 uintptr_t c0, uintptr_t c1,
+                                 uintptr_t d,
+                                 snrt_comm_t comm) {
+    if (!comm->is_participant) return;
+
+    uint32_t col_idx = pb_cluster_col_idx();
+    uint32_t row_idx = pb_cluster_row_idx();
+    uintptr_t is_northernmost = pb_cluster_in_row(N_ROWS - 1);
+
+    // Group to simplify rotating buffers
+    uintptr_t dst[2] = {b, d};
+    uintptr_t a_ptr[2] = {a0, a1};
+    uintptr_t c_ptr[2] = {c0, c1};
+
+    // Compute addresses of source and destination data for DMA cores.
+    // All clusters in col > 0 participate in the row reduction. For this
+    // reduction the source is in `a` and the result in `c`.
+    // Only the easternmost clusters send their input data (in `a`) to their
+    // west neighbours, all other clusters send the data they reduced in the
+    // previous step (in `c`).
+    // Clusters in col == 0 participate in the column reduction. For this
+    // reduction the source is in `c` and the result in `a`.
+    // Only the northernmost cluster sends the source data (in `c`) to its
+    // south neighbour, all other clusters send the data they reduced in the
+    // previous step (in `a`).
+    uintptr_t dma_src, dma_dst;
+    if (pb_cluster_is_easternmost() || (col_idx == 0 && !is_northernmost)) {
+        dma_src = a_ptr[0];
+    } else {
+        dma_src = c_ptr[0];
+    }
+    // Clusters in col > 0, participating in row reduction, send to west
+    // neighbours. Clusters in col == 0, participating in column reduction,
+    // send to south neighbours.
+    uint32_t dst_cluster = col_idx > 0 ? pb_cluster_west_neighbour() :
+        pb_cluster_south_neighbour();
+    dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0],
+        snrt_cluster_idx(), dst_cluster);
+
+    // Compute addresses of source and result data for compute cores
+    uintptr_t comp_src1, comp_src2, comp_result;
+    comp_src1 = a_ptr[0];
+    comp_src2 = dst[0];
+    comp_result = c_ptr[0];
+
+    // Prepare for inter-cluster barrier in advance, preventing instruction
+    // reordering using the volatile block.
+    snrt_collective_t op;
+    op.f.opcode = SNRT_REDUCTION_BARRIER;
+    op.f.mask = snrt_get_collective_mask(comm);
+    volatile uint32_t *barrier_ptr = comm->barrier_ptr;
+    uint32_t user = (uint32_t)op.w;
+    asm volatile ("" : "+r"(user) ::);
+
+    // Iterations to cover all reductions in a row
+    uint32_t n_iters = 1 + 2 * (pb_cluster_num_in_row() - 2) + N_BATCHES;
+    for (uint32_t i = 0; i < n_iters; i++) {
+
+        uint32_t col_idx_inv, base_iter;
+        char dma_active, compute_active;
+
+        // Determine which clusters need to send data at every iteration
+        col_idx_inv = pb_cluster_num_in_row() - col_idx - 1;
+        base_iter = 2 * col_idx_inv;
+        dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES));
+        dma_active &= !pb_cluster_in_col(0);
+
+        // Determine which clusters need to reduce data at every iteration
+        base_iter = 1 + 2 * (col_idx_inv - 1);
+        compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES));
+        compute_active &= !pb_cluster_is_easternmost();
+
+        if (compute_active && snrt_is_compute_core()) {
+            snrt_mcycle();
+            axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1,
+                (double *)comp_src2, (double *)comp_result);
+            
+            // comp_src1 += BATCH;
+            a_ptr[0] += BATCH;
+            swap_buffers(&a_ptr[0], &a_ptr[1]);
+            comp_src1 = a_ptr[0];
+
+            swap_buffers(&dst[0], &dst[1]);
+            comp_src2 = dst[0];
+
+            // comp_result += BATCH;
+            c_ptr[0] += BATCH;
+            swap_buffers(&c_ptr[0], &c_ptr[1]);
+            comp_result = c_ptr[0];
+
+            snrt_mcycle();
+        }
+
+        if (dma_active && snrt_is_dm_core()) {
+
+            // Start DMA
+            snrt_mcycle();
+            snrt_dma_start_1d(dma_dst, dma_src, BATCH);
+
+            // Update pointers for next iteration while transfer completes,
+            // preventing instructions from being reordered after the DMA wait
+            // dma_src += BATCH;
+            if (pb_cluster_is_easternmost() || (col_idx == 0 && !is_northernmost)) {
+                a_ptr[0] += BATCH;
+                swap_buffers(&a_ptr[0], &a_ptr[1]);
+                dma_src = a_ptr[0];
+            } else {
+                c_ptr[0] += BATCH;
+                swap_buffers(&c_ptr[0], &c_ptr[1]);
+                dma_src = c_ptr[0];
+            }
+            
+            swap_buffers(&dst[0], &dst[1]);
+            dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0],
+                snrt_cluster_idx(), pb_cluster_west_neighbour());
+            asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::);
+
+            // Wait for DMA to complete
+            snrt_dma_wait_all();
+            snrt_mcycle();
+        }
+
+        global_hw_barrier(barrier_ptr, user);
+    }
+
+    // Final column reduction is performed only if there is more than one row
+    if (N_ROWS > 1) {
+
+        dst[0] = b;
+        dst[1] = d;
+        a_ptr[0] = a0;
+        a_ptr[1] = a1;
+        c_ptr[0] = c0;
+        c_ptr[1] = c1;
+
+        comp_src1 = c_ptr[0];
+        comp_src2 = dst[0];
+        comp_result = a_ptr[0];
+        
+        // Iterations to cover all reductions in the first column
+        n_iters = 1 + 2 * (N_ROWS - 2) + N_BATCHES;
+        for (uint32_t i = 0; i < n_iters; i++) {
+
+            uint32_t row_idx_inv, base_iter;
+            char dma_active, compute_active;
+    
+            // Determine which clusters need to send data at every iteration
+            row_idx_inv = N_ROWS - row_idx - 1;
+            base_iter = 2 * row_idx_inv;
+            dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES));
+            dma_active &= pb_cluster_in_col(0);
+            dma_active &= snrt_cluster_idx() != 0;
+
+            // Determine which clusters need to reduce data at every iteration
+            base_iter = 1 + 2 * (row_idx_inv - 1);
+            compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES));
+            compute_active &= pb_cluster_in_col(0);
+            compute_active &= !is_northernmost;
+
+            if (compute_active && snrt_is_compute_core()) {
+                snrt_mcycle();
+                axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1,
+                    (double *)comp_src2, (double *)comp_result);
+                // comp_src1 += BATCH;
+                c_ptr[0] += BATCH;
+                swap_buffers(&c_ptr[0], &c_ptr[1]);
+                comp_src1 = c_ptr[0];
+
+                swap_buffers(&dst[0], &dst[1]);
+                comp_src2 = dst[0];
+
+                // comp_result += BATCH;
+                a_ptr[0] += BATCH;
+                swap_buffers(&a_ptr[0], &a_ptr[1]);
+                comp_result = a_ptr[0];
+                snrt_mcycle();
+            }
+
+            if (dma_active && snrt_is_dm_core()) {
+                // Start DMA
+                snrt_mcycle();
+                snrt_dma_start_1d(dma_dst, dma_src, BATCH);
+
+                // Update pointers for next iteration while transfer completes,
+                // preventing instructions from being reordered after the DMA wait
+                // dma_src += BATCH;
+                if (!is_northernmost) {
+                    a_ptr[0] += BATCH;
+                    swap_buffers(&a_ptr[0], &a_ptr[1]);
+                    dma_src = a_ptr[0];
+                } else {
+                    c_ptr[0] += BATCH;
+                    swap_buffers(&c_ptr[0], &c_ptr[1]);
+                    dma_src = c_ptr[0];
+                }
+
+                swap_buffers(&dst[0], &dst[1]);
+                dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0],
+                    snrt_cluster_idx(), pb_cluster_south_neighbour());
+                asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::);
+
+                // Wait for DMA to complete
+                snrt_dma_wait_all();
+                snrt_mcycle();
+            }
+
+            global_hw_barrier(barrier_ptr, user);
+        }
+    }
+}
+
+static inline void dma_reduction_tree_transfer_left_phase(char is_sender, char dist,
+    uintptr_t src, uintptr_t dst) {
+
+    // Only the DM core of a sending cluster has work to do in this phase
+    if (!is_sender || !snrt_is_dm_core()) return;
+
+    // Receiver sits in the same row, `dist` columns lower
+    uintptr_t receiver = pb_calculate_cluster_idx(
+        pb_cluster_row_idx(), pb_cluster_col_idx() - dist);
+
+    // Remote pointer to `dst` in the receiver's L1
+    uintptr_t remote_addr = (uintptr_t)snrt_remote_l1_ptr((void *)dst,
+        snrt_cluster_idx(), receiver);
+
+    // Transfer one batch, issued between two snrt_mcycle() calls
+    snrt_mcycle();
+    snrt_dma_start_1d(remote_addr, src, BATCH);
+
+    // Block until the transfer has completed
+    snrt_dma_wait_all();
+    snrt_mcycle();
+}
+
+static inline void dma_reduction_tree_transfer_south_phase(char is_sender, char dist,
+    uintptr_t src, uintptr_t dst) {
+
+    // Only the DM core of a sending cluster has work to do in this phase
+    if (!is_sender || !snrt_is_dm_core()) return;
+
+    // Receiver sits in the same column, `dist` rows lower
+    uintptr_t receiver = pb_calculate_cluster_idx(
+        pb_cluster_row_idx() - dist, pb_cluster_col_idx());
+
+    // Remote pointer to `dst` in the receiver's L1
+    uintptr_t remote_addr = (uintptr_t)snrt_remote_l1_ptr((void *)dst,
+        snrt_cluster_idx(), receiver);
+
+    // Transfer one batch, issued between two snrt_mcycle() calls
+    snrt_mcycle();
+    snrt_dma_start_1d(remote_addr, src, BATCH);
+
+    // Block until the transfer has completed
+    snrt_dma_wait_all();
+    snrt_mcycle();
+}
+
+static inline void dma_reduction_tree_compute_phase(char is_receiver,
+    uintptr_t src, uintptr_t dst, uintptr_t result) {
+    // Compute cores of a receiving cluster run axpy_opt on one batch
+    char reduces = is_receiver && snrt_is_compute_core();
+    if (reduces) {
+        snrt_mcycle();
+        axpy_opt(N_ELEMS / N_BATCHES, one, (double *)src, (double *)dst,
+            (double *)result);
+        snrt_mcycle();
+    }
+    snrt_cluster_hw_barrier();  // reached by every calling core
+}
+
+// Tree reduction (rows first, then column 0). Needs four buffers; on entry the data must be in a.
+static inline void dma_reduction_tree(uintptr_t a, uintptr_t b, uintptr_t c,
+    uintptr_t d, snrt_comm_t comm) {
+
+    // Only clusters in the communicator continue from here
+    if (!comm->is_participant) return;
+
+    // Precompute the argument of the inter-cluster barrier in advance; the empty
+    // volatile asm block prevents the compiler from reordering this computation.
+    snrt_collective_t op;
+    op.f.opcode = SNRT_REDUCTION_BARRIER;
+    op.f.mask = snrt_get_collective_mask(comm);
+    volatile uint32_t *barrier_ptr = comm->barrier_ptr;
+    uint32_t user = (uint32_t)op.w;
+    asm volatile ("" : "+r"(user) ::);
+
+    // Initially a is the source and c the result; b and d alternate as receive buffers
+    uintptr_t src = a;
+    uintptr_t dst[2] = {b, d};
+    uintptr_t result = c;
+
+    // One iteration per level of the reduction tree along a row
+    uint32_t n_levels_in_row = pb_log2_cluster_num_in_row();
+    for (uint32_t i = 0; i < n_levels_in_row; i++) {
+
+        // At level i paired clusters are `dist` columns apart: the cluster at an odd
+        // multiple of dist sends, the one at a multiple of 2*dist receives
+        char dist = 1 << i;  // Distance between clusters in a pair
+        char is_sender = (pb_cluster_col_idx() % (2*dist)) == dist;
+        char is_receiver = (pb_cluster_col_idx() % (2*dist)) == 0;
+
+        // Transfer phase for first batch
+        dma_reduction_tree_transfer_left_phase(is_sender, dist, src, dst[0]);
+        global_hw_barrier(barrier_ptr, user);
+
+        // If more than one batch is present, there will be N_BATCHES - 1
+        // iterations in which we perform both transfer and compute phases
+        uint32_t j = 0;
+        for (; j < N_BATCHES - 1; j++) {
+            // Batch j+1 is received in one buffer while batch j is reduced from the other
+            uintptr_t dma_dst = dst[(j+1)%2];
+            uintptr_t comp_dst = dst[j%2];
+
+            // Calculate pointers for current batch
+            uintptr_t batch_dma_src = src + (j+1) * BATCH;
+            uintptr_t batch_comp_src = src + j * BATCH;
+            uintptr_t batch_result = result + j * BATCH;
+
+            dma_reduction_tree_transfer_left_phase(is_sender, dist, batch_dma_src,
+                dma_dst);
+            dma_reduction_tree_compute_phase(is_receiver, batch_comp_src,
+                comp_dst, batch_result);
+            global_hw_barrier(barrier_ptr, user);
+        }
+
+        // Compute phase for last batch
+        dma_reduction_tree_compute_phase(is_receiver, src + j * BATCH,
+            dst[j%2], result + j * BATCH);
+
+        // The result of this level becomes the source of the next level
+        swap_buffers(&src, &result);
+    }
+
+    // One iteration per level of the reduction tree along the first column
+    uint32_t n_levels_in_col = LOG2_N_ROWS;
+    for (uint32_t i = 0; i < n_levels_in_col; i++) {
+
+        // Determine which clusters are senders and receivers at every level of the tree
+        char dist = 1 << i;  // Distance between clusters in a pair
+        char is_sender = (pb_cluster_row_idx() % (2*dist)) == dist && pb_cluster_in_col(0);
+        char is_receiver = (pb_cluster_row_idx() % (2*dist)) == 0 && pb_cluster_in_col(0);
+
+        // Transfer phase for first batch
+        dma_reduction_tree_transfer_south_phase(is_sender, dist, src, dst[0]);
+        global_hw_barrier(barrier_ptr, user);
+
+        // If more than one batch is present, there will be N_BATCHES - 1
+        // iterations in which we perform both transfer and compute phases
+        uint32_t j = 0;
+        for (; j < N_BATCHES - 1; j++) {
+            // Batch j+1 is received in one buffer while batch j is reduced from the other
+            uintptr_t dma_dst = dst[(j+1)%2];
+            uintptr_t comp_dst = dst[j%2];
+
+            // Calculate pointers for current batch
+            uintptr_t batch_dma_src = src + (j+1) * BATCH;
+            uintptr_t batch_comp_src = src + j * BATCH;
+            uintptr_t batch_result = result + j * BATCH;
+
+            dma_reduction_tree_transfer_south_phase(is_sender, dist, batch_dma_src,
+                dma_dst);
+            dma_reduction_tree_compute_phase(is_receiver, batch_comp_src,
+                comp_dst, batch_result);
+            global_hw_barrier(barrier_ptr, user);
+        }
+
+        // Compute phase for last batch
+        dma_reduction_tree_compute_phase(is_receiver, src + j * BATCH,
+            dst[j%2], result + j * BATCH);
+
+        // The result of this level becomes the source of the next level
+        swap_buffers(&src, &result);
+    }
+}
+
+// Dispatches to the reduction implementation selected by IMPL. The L1 buffer
+// of every calling cluster should be at the same offset in the TCDM.
+static inline void dma_reduction(uintptr_t a0, uintptr_t a1,
+                                 uintptr_t b,
+                                 uintptr_t c0, uintptr_t c1,
+                                 uintptr_t d,
+                                 snrt_comm_t comm) {
+    if (IMPL == SEQ) {
+        // Hyperbank variant needs more than one hyperbank and SIZE != BATCH
+        char use_hyperbanks = (SNRT_TCDM_HYPERBANK_NUM != 1) && (SIZE != BATCH);
+        if (!use_hyperbanks) {
+            dma_reduction_seq_normal(a0, b, c0, d, comm);
+            return;
+        }
+        dma_reduction_seq_hyperbank(a0, a1, b, c0, c1, d, comm);
+        return;
+    }
+    // TREE is not dispatched here: nothing is called when IMPL == TREE
+    if (IMPL == HW_GENERIC) dma_reduction_hw_generic(a0, c0, comm);
+    else if (IMPL == HW_SIMPLE) dma_reduction_hw_simple(a0, c0, comm);
+}
+
+// Globals for the verification script: result data, cluster count and vector length
+double output[N_ELEMS];
+extern const uint32_t n_clusters = N_ROWS * pb_cluster_num_in_row();
+extern const uint32_t length = N_ELEMS;
+
+int main (void){
+    // Allocate 4KiB-aligned buffers in every cluster. With several hyperbanks and
+    // SIZE != BATCH (SEQ or TREE), a and c are split: even batches in hyperbank 0,
+    // odd batches in hyperbank 1 (b in 0, d in 1). Otherwise index 1 stays zero.
+    uintptr_t a_buffer[2] = {0, 0}, c_buffer[2] = {0, 0}, b_buffer, d_buffer;
+    if (SNRT_TCDM_HYPERBANK_NUM!=1 && SIZE != BATCH && (IMPL == SEQ || IMPL == TREE)) {
+        // Each half of a and c takes SIZE / 2; the second half (and d) is not
+        // allocated separately but addressed one hyperbank above the first
+        a_buffer[0] = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE / 2, 4096);
+        a_buffer[1] = a_buffer[0] + SNRT_TCDM_HYPERBANK_SIZE;
+        c_buffer[0] = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE / 2, 4096);
+        c_buffer[1] = c_buffer[0] + SNRT_TCDM_HYPERBANK_SIZE;
+        b_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096);
+        d_buffer = b_buffer + SNRT_TCDM_HYPERBANK_SIZE;
+    }
+    else {
+        a_buffer[0] = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE, 4096);
+        c_buffer[0] = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE, 4096);
+        b_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096);
+        d_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096);
+    }
+
+    // Create communicator for first N rows (all other clusters are inactive)
+    snrt_comm_t comm;
+    pb_create_mesh_comm(&comm, N_ROWS, pb_cluster_num_in_row());
+
+    // Only clusters in the first N rows continue from here
+    if (!comm->is_participant) return 0;
+
+    // Row-major index of this cluster, used as offset of its source values
+    uint32_t row_major_cluster_idx = pb_cluster_col_idx() +
+        pb_cluster_row_idx() * pb_cluster_num_in_row();
+
+    // Two iterations to preheat the cache
+    for (volatile int iter = 0; iter < 2; iter++){
+
+        // Initialize source buffer
+        if (snrt_is_dm_core()) {
+            for (uint32_t i = 0; i < N_ELEMS; i++) {
+                if (SNRT_TCDM_HYPERBANK_NUM!=1 && SIZE != BATCH && (IMPL == SEQ || IMPL == TREE)){
+                    uint32_t batch_id = i / (BATCH / sizeof(double));
+                    uint32_t buffer_idx = batch_id % 2;
+                    uint32_t buffer_offset = (batch_id / 2) * (BATCH / sizeof(double)) + (i % (BATCH / sizeof(double)));
+                    *((double *)(a_buffer[buffer_idx]) + buffer_offset) = row_major_cluster_idx + i;
+                }
+                else {
+                    ((double *)a_buffer[0])[i] = row_major_cluster_idx + i;
+                }
+            }
+        }
+        snrt_global_barrier(comm);
+
+        // Perform reduction
+        snrt_mcycle();
+        dma_reduction(a_buffer[0], a_buffer[1], b_buffer, c_buffer[0], c_buffer[1], d_buffer, comm);
+        snrt_mcycle();
+        snrt_global_barrier(comm);
+    }
+
+    // Writeback to L3; the buffer holding the final result depends on IMPL
+    uintptr_t result_buffer = c_buffer[0];
+    uint32_t total_tree_levels = pb_log2_cluster_num_in_row() + LOG2_N_ROWS;
+    if ((IMPL == HW_SIMPLE && N_ROWS > 1) ||
+        (IMPL == SEQ && N_ROWS > 1) ||
+        (IMPL == TREE && (total_tree_levels % 2) == 0))
+        result_buffer = a_buffer[0];
+    if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) {
+        if (SNRT_TCDM_HYPERBANK_NUM!=1 && SIZE != BATCH && (IMPL == SEQ || IMPL == TREE)){
+            snrt_dma_1d_to_2d((void *)output,
+                            (void *)result_buffer,
+                            SIZE / 2,
+                            BATCH,
+                            BATCH * SNRT_TCDM_HYPERBANK_NUM);
+            snrt_dma_1d_to_2d((void *)((uintptr_t)output + BATCH),
+                            (void *)(result_buffer + SNRT_TCDM_HYPERBANK_SIZE),
+                            SIZE / 2,
+                            BATCH,
+                            BATCH * SNRT_TCDM_HYPERBANK_NUM);
+        }
+        else {
+            snrt_dma_start_1d((uintptr_t)output, result_buffer, SIZE);
+        }
+        snrt_dma_wait_all();
+    }
+}
diff --git a/sw/sw.mk b/sw/sw.mk
index 012435e1..a743aebe 100644
--- a/sw/sw.mk
+++ b/sw/sw.mk
@@ -22,15 +22,20 @@ PB_GEN_DIR = $(PB_ROOT)/.generated
 ## Snitch Cluster ##
 ####################
 
-SN_RUNTIME_SRCDIR    = $(PB_SNITCH_SW_DIR)/runtime/src
+SN_RUNTIME_SRCDIR    = $(PB_SNITCH_SW_DIR)/runtime/impl
 SN_RUNTIME_BUILDDIR  = $(PB_SNITCH_SW_DIR)/runtime/build
-SN_TESTS_BUILDDIR    = $(PB_SNITCH_SW_DIR)/tests/build
-SN_RVTESTS_BUILDDIR  = $(PB_SNITCH_SW_DIR)/riscv-tests/build
 SN_RUNTIME_INCDIRS   = $(PB_INCDIR)
 SN_RUNTIME_INCDIRS  += $(PB_GEN_DIR)
+SN_RUNTIME_INCDIRS  += $(PB_SNITCH_SW_DIR)/runtime/src
 SN_RUNTIME_HAL_HDRS  = $(PB_GEN_DIR)/pb_addrmap.h
 SN_RUNTIME_HAL_HDRS += $(PB_GEN_DIR)/pb_raw_addrmap.h
-SN_BUILD_APPS        = OFF
+
+SN_RVTESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/riscv-tests/build
+
+SN_TESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/tests/build
+SN_TESTS_INCDIRS  = $(SN_ROOT)/sw/kernels/blas
+
+SN_BUILD_APPS = OFF
 
 SN_APPS  = $(PB_SNITCH_SW_DIR)/apps/gemm_2d
 SN_APPS += $(PB_SNITCH_SW_DIR)/apps/gemm
@@ -38,37 +43,15 @@ SN_APPS += $(PB_SNITCH_SW_DIR)/apps/axpy
 SN_APPS += $(SN_ROOT)/sw/kernels/dnn/flashattention_2
 SN_APPS += $(PB_SNITCH_SW_DIR)/apps/fused_concat_linear
 SN_APPS += $(PB_SNITCH_SW_DIR)/apps/mha
+SN_APPS += $(PB_SNITCH_SW_DIR)/apps/summa_gemm
+
+SN_TESTS = $(wildcard $(PB_SNITCH_SW_DIR)/tests/*.c)
 
 include $(SN_ROOT)/make/sw.mk
 
 $(PB_GEN_DIR)/pb_raw_addrmap.h: $(PB_RDL_ALL)
 	$(PEAKRDL) raw-header $< -o $@ $(PEAKRDL_INCLUDES) $(PEAKRDL_DEFINES) --base_name $(notdir $(basename $@)) --format c
 
-# Collect Snitch tests which should be built
-PB_SN_TESTS_DIR      = $(PB_SNITCH_SW_DIR)/tests
-PB_SN_TESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/tests/build
-PB_SN_TEST_NAMES = $(basename $(notdir $(wildcard $(PB_SN_TESTS_DIR)/*.c)))
-PB_SN_TEST_ELFS = $(abspath $(addprefix $(PB_SN_TESTS_BUILDDIR)/,$(addsuffix .elf,$(PB_SN_TEST_NAMES))))
-PB_SN_TEST_DUMP = $(abspath $(addprefix $(PB_SN_TESTS_BUILDDIR)/,$(addsuffix .dump,$(PB_SN_TEST_NAMES))))
-
-.PHONY: pb-sn-tests clean-pb-sn-tests
-
-pb-sn-tests: $(PB_SN_TEST_ELFS) $(PB_SN_TEST_DUMP)
-
-clean-pb-sn-tests:
-	rm -rf $(PB_SN_TEST_ELFS)
-
-$(PB_SN_TEST_ELFS): $(PB_GEN_DIR)/pb_addrmap.h
-
-$(PB_SN_TESTS_BUILDDIR)/%.d: $(PB_SN_TESTS_DIR)/%.c | $(PB_SN_TESTS_BUILDDIR)
-	$(SN_RISCV_CXX) $(SN_TESTS_RISCV_CFLAGS) -MM -MT '$(@:.d=.elf)' -x c++ $< > $@
-
-$(PB_SN_TESTS_BUILDDIR)/%.elf: $(PB_SN_TESTS_DIR)/%.c $(SN_RUNTIME_LIB) | $(PB_SN_TESTS_BUILDDIR)
-	$(SN_RISCV_CXX) $(SN_TESTS_RISCV_CFLAGS) $(SN_TESTS_RISCV_LDFLAGS) -x c++ $< -o $@
-
-$(PB_SN_TESTS_BUILDDIR)/%.dump: $(PB_SN_TESTS_BUILDDIR)/%.elf | $(PB_SN_TESTS_BUILDDIR)
-	$(SN_RISCV_OBJDUMP) $(SN_RISCV_OBJDUMP_FLAGS) $< > $@
-
 ##############
 ## Cheshire ##
 ##############
@@ -108,6 +91,6 @@ sn-runtime-clean: sn-clean-runtime
 sn-apps-clean: sn-clean-apps
 
 .PHONY: sw sw-tests sw-clean sw-tests-clean
-sw sw-tests: chs-sw-tests sn-tests pb-sn-tests sn-apps
+sw sw-tests: chs-sw-tests sn-tests sn-apps
 
-sw-clean sw-tests-clean: chs-sw-tests-clean sn-tests-clean sn-runtime-clean clean-pb-sn-tests sn-apps-clean
+sw-clean sw-tests-clean: chs-sw-tests-clean sn-tests-clean sn-runtime-clean sn-apps-clean
diff --git a/target/sim/traces.mk b/target/sim/traces.mk
index f4773b33..4cc627c1 100644
--- a/target/sim/traces.mk
+++ b/target/sim/traces.mk
@@ -9,6 +9,10 @@ LOGS_DIR = $(SIM_DIR)/logs
 SN_SIM_DIR = $(SIM_DIR)
 include $(SN_ROOT)/make/traces.mk
 
+ifdef ROI_SPEC
+  SN_ROI_SPEC = $(ROI_SPEC)
+endif
+
 CHS_ADDR2LINE      ?= $(CHS_SW_GCC_BINROOT)/riscv64-unknown-elf-addr2line
 CHS_TXT_TRACE       = $(LOGS_DIR)/trace_hart_00000.txt
 CHS_ANNOTATED_TRACE = $(LOGS_DIR)/trace_hart_00000.s
@@ -24,6 +28,8 @@ traces: sn-traces chs-trace
 annotate: sn-annotate chs-annotate
 traces-clean: sn-clean-traces chs-trace-clean
 annotate-clean: sn-clean-annotate chs-annotate-clean
+visual-trace: sn-visual-trace
+clean-visual-trace: sn-clean-visual-trace
 
 chs-trace: $(CHS_TXT_TRACE)
 chs-annotate: $(CHS_ANNOTATED_TRACE)
diff --git a/target/sim/vsim/vsim.mk b/target/sim/vsim/vsim.mk
index 40e0ad30..b57e28a5 100644
--- a/target/sim/vsim/vsim.mk
+++ b/target/sim/vsim/vsim.mk
@@ -45,16 +45,16 @@ vsim-compile: $(VSIM_DIR)/compile.tcl $(PB_HW_ALL)
 	$(VSIM) -c $(VSIM_FLAGS) -do "source $<; quit"
 
 $(VSIM_DIR)/compile.tcl: $(BENDER_YML) $(BENDER_LOCK)
-	bender script vsim --compilation-mode common $(COMMON_TARGS) $(SIM_TARGS) --vlog-arg="$(VLOG_ARGS)"> $@
+	$(BENDER) script vsim --compilation-mode common $(COMMON_TARGS) $(SIM_TARGS) --vlog-arg="$(VLOG_ARGS)"> $@
 	echo 'vlog -work $(VSIM_WORK) "$(realpath $(CHS_ROOT))/target/sim/src/elfloader.cpp" -ccflags "-std=c++11"' >> $@
 
 vsim-run:
-	$(VSIM) $(VSIM_FLAGS) $(VSIM_FLAGS_GUI) $(TB_DUT) -do "log -r /*"
+	cd $(SIM_DIR) && $(VSIM) $(VSIM_FLAGS) $(VSIM_FLAGS_GUI) $(TB_DUT) -do "log -r /*"
 
 vsim-run-batch:
-	$(VSIM) -c $(VSIM_FLAGS) $(TB_DUT) -do "run -all; quit"
+	cd $(SIM_DIR) && $(VSIM) -c $(VSIM_FLAGS) $(TB_DUT) -do "run -all; quit"
 
 vsim-run-batch-verify: vsim-run-batch
 ifdef VERIFY_PY
-	$(VERIFY_PY) placeholder $(SN_BINARY) --no-ipc --memdump l2mem.bin --memaddr 0x70000000
+	cd $(SIM_DIR) && $(VERIFY_PY) placeholder $(SN_BINARY) --no-ipc --memdump l2mem.bin --memaddr 0x70000000
 endif
\ No newline at end of file