From 5e5019a8c850d1ba4a7c00580bd5fba46e107fb0 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 26 Sep 2025 18:57:49 +0200 Subject: [PATCH 01/77] cfg: Rename to `enable_narrow_collectives` in `snitch_cluster.json` --- cfg/snitch_cluster.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cfg/snitch_cluster.json b/cfg/snitch_cluster.json index 948bd2a8..3a60ec80 100644 --- a/cfg/snitch_cluster.json +++ b/cfg/snitch_cluster.json @@ -27,7 +27,7 @@ narrow_trans: 4, wide_trans: 32, dma_user_width: 48, - enable_multicast: true, + enable_narrow_collectives: true, cluster_base_expose: true, alias_region_enable: true, alias_region_base: 0x30000000, From 5830d2d74a1785da8e4f475a8f788888a4b6b5e1 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 26 Sep 2025 18:58:42 +0200 Subject: [PATCH 02/77] cluster_tile.sv: Update name of `user_t` to `user_narrow_t` --- hw/cluster_tile.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 1f8cf413..420297cb 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -70,7 +70,7 @@ module cluster_tile `AXI_TYPEDEF_ALL(cluster_narrow_out_dw_conv, snitch_cluster_pkg::addr_t, snitch_cluster_pkg::narrow_out_id_t, data_hwpe_ctrl_t, strb_hwpe_ctrl_t, - snitch_cluster_pkg::user_t) + snitch_cluster_pkg::user_narrow_t) cluster_narrow_out_dw_conv_req_t cluster_narrow_out_dw_conv_req, cluster_narrow_out_cut_req; cluster_narrow_out_dw_conv_resp_t cluster_narrow_out_dw_conv_rsp, cluster_narrow_out_cut_rsp; From a4bfb062e82c5960d9b8d1124b1230cda9698a56 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 26 Sep 2025 18:59:05 +0200 Subject: [PATCH 03/77] cluster_tile.sv: Add and tie off DCA interface --- hw/cluster_tile.sv | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 420297cb..564b59b4 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -105,7 +105,9 @@ module cluster_tile 
.narrow_ext_req_o (cluster_narrow_ext_req), .narrow_ext_resp_i(cluster_narrow_ext_rsp), .tcdm_ext_req_i (cluster_tcdm_ext_req_aligned), - .tcdm_ext_resp_o (cluster_tcdm_ext_rsp_aligned) + .tcdm_ext_resp_o (cluster_tcdm_ext_rsp_aligned), + .dca_req_i ('0), + .dca_rsp_o () ); if (UseHWPE) begin : gen_hwpe From 917dfba3331c7d5b11c7a1c8b6272434b1fed834 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 26 Sep 2025 18:59:21 +0200 Subject: [PATCH 04/77] iis-env.sh: Bump LLVM toolchain --- iis-env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iis-env.sh b/iis-env.sh index 87dffd81..2e9f41b3 100644 --- a/iis-env.sh +++ b/iis-env.sh @@ -9,7 +9,7 @@ export VLIB="questa-2023.4 vlib" export BASE_PYTHON=/usr/local/anaconda3/bin/python3.11 export CHS_SW_GCC_BINROOT=/usr/pack/riscv-1.0-kgf/riscv64-gcc-12.2.0/bin export VERIBLE_FMT="oseda -2025.03 verible-verilog-format" -export SN_LLVM_BINROOT=/usr/scratch2/vulcano/colluca/tools/riscv32-snitch-llvm-almalinux8-15.0.0-snitch-0.2.0/bin +export SN_LLVM_BINROOT=/usr/scratch2/vulcano/colluca/tools/riscv32-snitch-llvm-almalinux8-15.0.0-snitch-0.3.0/bin # Create the python venv if [ ! 
-d ".venv" ]; then From e5decf52ff1318a0bd33acf228d078ad16834ea4 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 26 Sep 2025 18:59:41 +0200 Subject: [PATCH 05/77] requirements.txt: Bump peakrdl-rawheader --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 78565385..4effd42e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,5 +25,5 @@ hjson # for reggen # For peakrdl peakrdl -peakrdl-rawheader @ git+https://github.com/colluca/PeakRDL-rawheader.git@7b8dbc9ad5854dc1cdaf36d4ea024c29ffb00a4c +peakrdl-rawheader @ git+https://github.com/colluca/PeakRDL-rawheader.git@9a7bd727986ee2ffa5105c61aaeb5c991309d29b peakrdl-markdown From 297ee5885bc861ee7ec9496a72439dd4e819885f Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 26 Sep 2025 19:00:07 +0200 Subject: [PATCH 06/77] UNDO: Add separate SNRT_ENABLE_NARROW_REDUCTION --- sw/snitch/runtime/src/snrt.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sw/snitch/runtime/src/snrt.h b/sw/snitch/runtime/src/snrt.h index 8959c80a..a7bdb4aa 100644 --- a/sw/snitch/runtime/src/snrt.h +++ b/sw/snitch/runtime/src/snrt.h @@ -7,6 +7,9 @@ #include #include +// Additional optional configurations to change the runtime behaviour +// #define SNRT_ENABLE_NARROW_REDUCTION + // Configuration- and system-specific definitions (HAL) #include "pb_addrmap.h" #include "snitch_cluster_cfg.h" From 4955f98300f9ce59bd1822626e01918dd2ae069e Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 26 Sep 2025 19:10:24 +0200 Subject: [PATCH 07/77] bender: Bump `snitch_cluster` and `axi` --- Bender.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Bender.lock b/Bender.lock index 55c39087..b76d7eed 100644 --- a/Bender.lock +++ b/Bender.lock @@ -25,7 +25,7 @@ packages: - obi_peripherals - register_interface axi: - revision: bec548fa2a9b18cbd7531105bb1fdf481ea8ad49 + revision: 8e04779f341eb2c89412aae92223a292beef487e version: null 
source: Git: https://github.com/colluca/axi.git @@ -383,7 +383,7 @@ packages: - common_cells - register_interface snitch_cluster: - revision: 5d6c957c70e3824e2330d8308eb904724cd41cbe + revision: 4432d89076346e9bd25980e35973a9d521e94d41 version: null source: Git: https://github.com/pulp-platform/snitch_cluster.git From cc4231e2d06a0dc9b5ff7f59cbd40ead6acff16c Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 26 Sep 2025 19:49:29 +0200 Subject: [PATCH 08/77] sw: Initialize destination arrays in `dma_multicast.c` --- sw/snitch/tests/dma_multicast.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sw/snitch/tests/dma_multicast.c b/sw/snitch/tests/dma_multicast.c index 5583de48..e844cb5f 100644 --- a/sw/snitch/tests/dma_multicast.c +++ b/sw/snitch/tests/dma_multicast.c @@ -69,6 +69,15 @@ int main() { uint32_t *buffer_dst = (uint32_t *)snrt_l1_next_v2(); uint32_t *buffer_src = buffer_dst + LENGTH; + // Every cluster initializes its destination buffer. + if (snrt_is_dm_core()) { + snrt_dma_start_1d(buffer_dst, snrt_cluster()->zeromem.mem, LENGTH * sizeof(uint32_t)); + snrt_dma_wait_all(); + } + + // Synchronize all clusters. + snrt_global_barrier(); + // First cluster initializes the source buffer and multicast- // copies it to the destination buffer in every cluster's TCDM. 
if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { From 3a9d835596f6413ef3ab621f269f3c17a20d2937 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Sat, 27 Sep 2025 14:36:57 +0200 Subject: [PATCH 09/77] bender: Bump iDMA to include DMUSER instruction --- Bender.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Bender.lock b/Bender.lock index b76d7eed..257a78fa 100644 --- a/Bender.lock +++ b/Bender.lock @@ -266,7 +266,7 @@ packages: dependencies: - tech_cells_generic idma: - revision: 7829f71691a62c1e2e5e3230f370f222c7a83087 + revision: 28a36e5e07705549e59fc33db96ab681bc1ca88e version: null source: Git: https://github.com/pulp-platform/iDMA.git From 479cf6977df68b0a50327796d513c9359a487148 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Sat, 27 Sep 2025 14:01:31 +0200 Subject: [PATCH 10/77] cfg: Set `enable_wide_collectives` in snitch_cluster.json --- cfg/snitch_cluster.json | 1 + 1 file changed, 1 insertion(+) diff --git a/cfg/snitch_cluster.json b/cfg/snitch_cluster.json index 3a60ec80..b3b4d020 100644 --- a/cfg/snitch_cluster.json +++ b/cfg/snitch_cluster.json @@ -28,6 +28,7 @@ wide_trans: 32, dma_user_width: 48, enable_narrow_collectives: true, + enable_wide_collectives: true, cluster_base_expose: true, alias_region_enable: true, alias_region_base: 0x30000000, From 15b9b28a43239bdd750975598dd300d285fc99ce Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Sat, 27 Sep 2025 14:24:36 +0200 Subject: [PATCH 11/77] hw: Pass cluster base offset to cluster tile --- Bender.lock | 2 +- hw/cluster_tile.sv | 2 ++ hw/picobello_top.sv | 36 +++++++++++++++++++----------------- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/Bender.lock b/Bender.lock index 257a78fa..47c06106 100644 --- a/Bender.lock +++ b/Bender.lock @@ -383,7 +383,7 @@ packages: - common_cells - register_interface snitch_cluster: - revision: 4432d89076346e9bd25980e35973a9d521e94d41 + revision: 1e7aadb7f8b31afcb1f7a9d7369b6ccee7e20aa8 version: null source: Git: 
https://github.com/pulp-platform/snitch_cluster.git diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 564b59b4..5ed7b7a1 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -27,6 +27,7 @@ module cluster_tile input logic [NrCores-1:0] msip_i, input logic [ 9:0] hart_base_id_i, input snitch_cluster_pkg::addr_t cluster_base_addr_i, + input snitch_cluster_pkg::addr_t cluster_base_offset_i, // Chimney ports input id_t id_i, // Router ports @@ -91,6 +92,7 @@ module cluster_tile .msip_i, .hart_base_id_i, .cluster_base_addr_i, + .cluster_base_offset_i, .mxip_i (mxip), .clk_d2_bypass_i ('0), .sram_cfgs_i ('0), diff --git a/hw/picobello_top.sv b/hw/picobello_top.sv index 201020c7..170372c8 100644 --- a/hw/picobello_top.sv +++ b/hw/picobello_top.sv @@ -97,27 +97,29 @@ module picobello_top localparam int Y = int'(ClusterPhysicalId.y); localparam int unsigned HartBaseId = c * NrCores + 1; // Cheshire is hart 0 localparam axi_wide_in_addr_t ClusterBaseAddr = Sam[ClusterSamIdx].start_addr; + localparam axi_wide_in_addr_t ClusterBaseOffset = Sam[ClusterSamIdx].end_addr - ClusterBaseAddr; cluster_tile i_cluster_tile ( .clk_i, .rst_ni, - .test_enable_i (test_mode_i), - .tile_clk_en_i (cluster_clk_en[c]), - .tile_rst_ni (cluster_rst_n[c]), - .clk_rst_bypass_i (clk_rst_bypass_i), - .debug_req_i (debug_req[c]), - .meip_i (meip[c]), - .mtip_i (mtip[c]), - .msip_i (msip[c]), - .hart_base_id_i (HartBaseId[9:0]), - .cluster_base_addr_i(ClusterBaseAddr), - .id_i (ClusterId), - .floo_req_o (floo_req_out[X][Y]), - .floo_rsp_i (floo_rsp_in[X][Y]), - .floo_wide_o (floo_wide_out[X][Y]), - .floo_req_i (floo_req_in[X][Y]), - .floo_rsp_o (floo_rsp_out[X][Y]), - .floo_wide_i (floo_wide_in[X][Y]) + .test_enable_i (test_mode_i), + .tile_clk_en_i (cluster_clk_en[c]), + .tile_rst_ni (cluster_rst_n[c]), + .clk_rst_bypass_i (clk_rst_bypass_i), + .debug_req_i (debug_req[c]), + .meip_i (meip[c]), + .mtip_i (mtip[c]), + .msip_i (msip[c]), + .hart_base_id_i (HartBaseId[9:0]), + 
.cluster_base_addr_i (ClusterBaseAddr), + .cluster_base_offset_i(ClusterBaseOffset), + .id_i (ClusterId), + .floo_req_o (floo_req_out[X][Y]), + .floo_rsp_i (floo_rsp_in[X][Y]), + .floo_wide_o (floo_wide_out[X][Y]), + .floo_req_i (floo_req_in[X][Y]), + .floo_rsp_o (floo_rsp_out[X][Y]), + .floo_wide_i (floo_wide_in[X][Y]) ); end From 38344f4b29691e6cddba178e48ca43e56611b28a Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Thu, 7 Aug 2025 15:13:07 +0200 Subject: [PATCH 12/77] sw: Integrate Lorenzo's microbenchmark --- experiments.sh | 37 +++ sw/snitch/experiments/util/quick_plot.py | 293 +++++++++++++++++++++++ sw/snitch/tests/dma_multicast.c | 250 ++++++++++++++++--- 3 files changed, 553 insertions(+), 27 deletions(-) create mode 100755 experiments.sh create mode 100755 sw/snitch/experiments/util/quick_plot.py diff --git a/experiments.sh b/experiments.sh new file mode 100755 index 00000000..b227358a --- /dev/null +++ b/experiments.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +TRAN_LEN_VALUES=(256 512 1024 2048 4096 8192) +MODE_VALUES=(SW_UNOPT SW_OPT SW_OPT2) +N_CLUSTERS_VALUES=(4 8 16) + +for N_CLUSTERS in "${N_CLUSTERS_VALUES[@]}"; do + for MODE in "${MODE_VALUES[@]}"; do + for TRAN_LEN in "${TRAN_LEN_VALUES[@]}"; do + + echo "=== Running with TRAN_LEN=${TRAN_LEN}, MODE=${MODE}, N_CLUSTERS=${N_CLUSTERS} ===" + + # Remove old ELF + rm -f sw/snitch/tests/build/dma_multicast.elf + + # Build + make sw DEBUG=ON TRAN_LEN=${TRAN_LEN} MODE=${MODE} N_CLUSTERS=${N_CLUSTERS} -j + + # Run simulation + make vsim-run-batch CHS_BINARY=sw/cheshire/tests/simple_offload.spm.elf \ + SN_BINARY=sw/snitch/tests/build/dma_multicast.elf \ + PRELMODE=3 + + # Annotate + make annotate DEBUG=ON -j + + # Move logs + DEST_DIR="logs_${TRAN_LEN}_${MODE}_${N_CLUSTERS}" + mkdir -p results/"${DEST_DIR}" + mv logs/*.s results/"${DEST_DIR}" + + echo "Logs saved to results/${DEST_DIR}" + echo "------------------------------------------------------------" + + done + done +done diff --git 
a/sw/snitch/experiments/util/quick_plot.py b/sw/snitch/experiments/util/quick_plot.py new file mode 100755 index 00000000..547e3045 --- /dev/null +++ b/sw/snitch/experiments/util/quick_plot.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Lorenzo Leone + +import os +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import cm +from matplotlib.patches import Patch +from matplotlib.colors import ListedColormap + +PULP_COLORS = { + "PULP Blue": "#3067AB", + "PULP Petrol": "#84B5E9", + "PULP Green": "#168638", + "PULP Bronze": "#E59955", + "PULP Red": "#9B3B32", + "PULP Purple": "#851B66", + "PULP Grey": "#6F6F6F" +} + +def plot_multicast_speedup(clusters, transfer_sizes, speedups, + colors=None, + title="Multicast Speedup", + xlabel="Number of Clusters", + ylabel="Speedup", + legend_loc="upper left", + legend_ncol=1, + legend_frame=True, + save_name=None): + """ + Plots a bar chart of multicast speedup. 
+ + Parameters: + - clusters: list of cluster counts (x-axis categories) + - transfer_sizes: list of transfer sizes (legend categories) + - speedups: 2D list of speedups where each row corresponds to clusters[i] + and each column corresponds to transfer_sizes[j] + - colors: list of colors for each transfer size + - title, xlabel, ylabel: strings for plot customization + - legend_loc: legend location (e.g., 'upper left', 'upper right') + - legend_ncol: number of columns in legend + - legend_frame: whether to show frame around legend + """ + + # Number of clusters and transfer sizes + n_clusters = len(clusters) + n_sizes = len(transfer_sizes) + + # Default colors if none are provided + if colors is None: + colors = plt.cm.viridis(np.linspace(0, 1, n_sizes)) + + # X positions for cluster groups + x = np.arange(n_clusters) + + # Width of each bar + bar_width = 0.8 / n_sizes + + fig, ax = plt.subplots(figsize=(10, 6)) + + # Plot each transfer size group + for i, size in enumerate(transfer_sizes): + offsets = x + i * bar_width - (bar_width * (n_sizes - 1) / 2) + ax.bar(offsets, + [speedups[j][i] for j in range(n_clusters)], + width=bar_width, + color=colors[i], + label=f"{size}") + + # Labels and title + ax.set_xlabel(xlabel, fontsize=14) + ax.set_ylabel(ylabel, fontsize=14) + ax.set_title(title, fontsize=16) + ax.set_xticks(x) + ax.set_xticklabels(clusters) + ax.tick_params(axis='both', which='major', labelsize=14) + + + # Legend customization + ax.legend(title="Transfer Size", loc=legend_loc, ncol=legend_ncol, frameon=legend_frame, fontsize=14, title_fontsize=16) + + # Grid for readability + ax.grid(axis="y", linestyle="--", alpha=0.7) + + fig.tight_layout() + + # Save plot if a name is provided + if save_name: + plots_dir = os.path.join(os.getcwd(), "plots") + os.makedirs(plots_dir, exist_ok=True) + file_path = os.path.join(plots_dir, f"{save_name}.svg") + plt.savefig(file_path, format="svg", bbox_inches="tight") + print(f"Plot saved to: {file_path}") + + return fig, 
ax + + +def _add_gradient_legend_inset( + ax_main, colors1, colors2, transfer_sizes, sw1_label, sw2_label, + inset_width=0.3, bar_height=0.04, gap=0.01, x0=0.02, y_top=0.92 +): + """Place two gradient bars in the top-left; bottom=SW1 (with labels), top=SW2 (no labels).""" + fig = ax_main.figure + + # Top bar = SW2 (no labels) + ax_sw2 = ax_main.inset_axes([x0, y_top - bar_height, inset_width, bar_height], + transform=ax_main.transAxes) + # Bottom bar = SW1 (with labels) + ax_sw1 = ax_main.inset_axes([x0, y_top - 2*bar_height - gap, inset_width, bar_height], + transform=ax_main.transAxes) + + def draw_bar(ax_bar, colors, title, show_labels=False, lgn_title=False): + data = np.arange(len(colors)).reshape(1, len(colors)) + cmap = ListedColormap(colors) + ax_bar.imshow(data, aspect='auto', cmap=cmap, extent=[0, len(colors), 0, 1]) + + if show_labels: + centers = np.arange(len(colors)) + 0.5 + ax_bar.set_xticks(centers) + ax_bar.set_xticklabels(transfer_sizes, fontsize=8, rotation=45, ha="right") + else: + ax_bar.set_xticks([]) + + ax_bar.set_yticks([]) + ax_bar.set_xlim(0, len(colors)) + + for spine in ax_bar.spines.values(): + spine.set_visible(False) + + # Title to the right + ax_bar.text(len(colors) + 0.2, 0.5, title, + fontsize=9, va='center', ha='left', transform=ax_bar.transData) + + # DMA Transfer Size label centered over the 6 colors + if lgn_title: + ax_bar.text(3, 1.4, "DMA Transfer Size", # x=3 is center of 0..6 range + fontsize=9, ha='center', va='bottom', fontweight="bold", + transform=ax_bar.transData) + + # Draw bars + draw_bar(ax_sw2, colors2, sw2_label, show_labels=False, lgn_title=True) + draw_bar(ax_sw1, colors1, sw1_label, show_labels=True, lgn_title=False) + +def plot_two_software_speedup( + clusters, + transfer_sizes, + sw1_speedups, + sw2_speedups, + sw1_label="Software 1", + sw2_label="Software 2", + cmap1="viridis", + cmap2="magma", + title="Speedup by Transfer Size and Software", + xlabel="Number of Clusters", + ylabel="Speedup", + 
save_name=None +): + n_clusters = len(clusters) + x = np.arange(n_clusters) + bars_per_group = 12 + bar_width = 0.8 / bars_per_group + + colors1 = cm.get_cmap(cmap1)(np.linspace(0.15, 0.85, 6)) + colors2 = cm.get_cmap(cmap2)(np.linspace(0.15, 0.85, 6)) + + fig, ax = plt.subplots(figsize=(12, 6)) + + for i in range(6): + offsets_sw1 = x + i * bar_width - (bar_width * (bars_per_group - 1) / 2) + offsets_sw2 = x + (6 + i) * bar_width - (bar_width * (bars_per_group - 1) / 2) + + ax.bar(offsets_sw1, [sw1_speedups[j][i] for j in range(n_clusters)], + width=bar_width, color=colors1[i]) + ax.bar(offsets_sw2, [sw2_speedups[j][i] for j in range(n_clusters)], + width=bar_width, color=colors2[i]) + + ax.set_title(title, fontsize=16) + ax.set_xlabel(xlabel, fontsize=14) + ax.set_ylabel(ylabel, fontsize=14) + ax.set_xticks(x) + ax.set_xticklabels(clusters) + ax.grid(axis="y", linestyle="--", alpha=0.7) + + # Add gradient legend in top-left corner + _add_gradient_legend_inset(ax, colors1, colors2, transfer_sizes, sw1_label, sw2_label) + + fig.tight_layout() + + # Save plot if a name is provided + if save_name: + plots_dir = os.path.join(os.getcwd(), "plots") + os.makedirs(plots_dir, exist_ok=True) + file_path = os.path.join(plots_dir, f"{save_name}.svg") + plt.savefig(file_path, format="svg", bbox_inches="tight") + print(f"Plot saved to: {file_path}") + + return fig, ax + +# ===== Example Usage ===== +def main(): + clusters = [4, 8, 16] + transfer_sizes = ["1 KiB", "2 KiB", "4 KiB", "8 KiB", "16 KiB", "32 KiB"] + + # Example speedups (rows = clusters, cols = transfer sizes) + speedups_mcast_unopt = [ + [3.92, 3.953846154, 3.76, 3.915151515, 3.969491525, 3.992857143], # For 4 clusters + [7.7, 7.738461538, 7.45, 7.848484848, 7.983050847, 7.859649123], # For 8 clusters + [15.24, 15.66153846, 15.02, 15.73939394, 15.88813559, 15.67894737] # For 16 clusters +] + + speedups_sw1_unopt = [ + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], # For 4 clusters + [0.891203704, 0.994071146, 1.125377644, 
1.29241517, # For 8 clusters + 1.424682396, 1.509942703], + [1.448669202, 1.583203733, 1.796650718, 1.965934898, # For 16 clusters + 2.099910394, 2.192590775] + ] + + speedups_sw2_unopt = [ + [1.02617801, 1.088983051, 1.153374233, 1.216572505, 1.271444083, 1.303030303], # For 4 clusters + [1.370106762, 1.462209302, 1.616052061, 1.75951087, 1.875, 1.942758023], # For 8 clusters + [2.01055409, 2.237362637, 2.486754967, 2.736564805, 2.931207004, 3.066918325] # For 16 clusters +] + + speedups_sw2_sw1 = [ + [1.02617801, 1.088983051, 1.153374233, 1.216572505, 1.271444083, 1.303030303], # For 4 clusters + [1.537366548, 1.470930233, 1.436008677, 1.361413043, 1.316082803, 1.286643539], # For 8 clusters + [1.387862797, 1.413186813, 1.38410596, 1.39199157, 1.39587242, 1.398764585] # For 16 clusters + ] + + speedups_optimized = [ + [3.82, 3.630769231, 3.26, 3.218181818, 3.122033898, 3.064285714], # For 4 clusters + [5.62, 5.292307692, 4.61, 4.460606061, 4.257627119, 4.045614035], # For 8 clusters + [7.58, 7.0, 6.04, 5.751515152, 5.420338983, 5.112280702] # For 16 clusters + ] + + # Select PULP colors for the 6 transfer sizes + custom_colors = [ + PULP_COLORS["PULP Blue"], + PULP_COLORS["PULP Petrol"], + PULP_COLORS["PULP Green"], + PULP_COLORS["PULP Bronze"], + PULP_COLORS["PULP Red"], + PULP_COLORS["PULP Purple"] + ] + + plot_multicast_speedup(clusters, transfer_sizes, speedups_mcast_unopt, + title="Multicast Speedup against unoptimized software version", + xlabel="Number of Clusters", + ylabel="Speedup", + legend_loc="upper left", + legend_ncol=2, + save_name="mcast_vs_unopt") + + plot_multicast_speedup(clusters, transfer_sizes, speedups_sw2_sw1, + title="Speedup of SW OPT2 implementation against SW OPT1", + xlabel="Number of Clusters", + ylabel="Speedup", + legend_loc="upper left", + legend_ncol=2, + save_name="sw2_vs_sw1") + + plot_multicast_speedup(clusters, transfer_sizes, speedups_optimized, + title="Multicast Speedup against best software version", + xlabel="Number of 
Clusters", + ylabel="Speedup", + legend_loc="upper left", + legend_ncol=2, + save_name="mcast_vs_optimized") + + plot_two_software_speedup( + clusters, + transfer_sizes, + speedups_sw1_unopt, speedups_sw2_unopt, + sw1_label="Column first", + sw2_label="Binary tree", + cmap1="viridis", + cmap2="magma", + title="Speedup of different Software schemes", + xlabel="Number of Clusters", + ylabel="Speedup", + save_name="sw_comparison" + ) + + + plt.show() + +if __name__ == '__main__': + main() diff --git a/sw/snitch/tests/dma_multicast.c b/sw/snitch/tests/dma_multicast.c index e844cb5f..98249231 100644 --- a/sw/snitch/tests/dma_multicast.c +++ b/sw/snitch/tests/dma_multicast.c @@ -3,10 +3,28 @@ // SPDX-License-Identifier: Apache-2.0 // // Luca Colagrande +// Lorenzo Leone // // This code tests the multicast feature of the wide interconnect. // It uses the DMA to broadcast chunks of data to different clusters // in the Picobello system. +// +// Extension: 30-07-25 +// The code can now be used to run microbenchamrk and check the multicast +// feature performance. The test support different software implemenation +// to multicast some DMA trasnefers over multiple clusters in the Picobello +// system. By driving some parameters externally, it's possible to get +// the performance results. +// +// Comment: +// The use of WFI and global_barrier together can create some issues due to +// non unique CLINT signal mapping. A possibility would be not to send the +// clusters into WFI, however this will create complicances in teh wake up routine. +// For instance, each cluster would nee to poll the DMA transfer detsination +// until teh expected value is written, and only then it can move to the next +// algorithm step. However, since the code is run twice to pre-heat the cache, +// in the second iteration each cluster will already have the expected results +// and will start the transfer immediately. 
#include #include "pb_addrmap.h" @@ -14,60 +32,238 @@ #define INITIALIZER 0xAAAAAAAA -#ifndef LENGTH -#define LENGTH 32 +typedef enum { + SW_UNOPT, + SW_OPT, + SW_OPT2, + HW_MCAST +} mode_t; + +// Run experiments with and without multicast support +#ifndef MODE +#define MODE SW_UNOPT +#endif + +// Transfer LENGTH in uint32_t elements +#ifndef TRAN_LEN +#define TRAN_LEN 1024 #endif #define LENGTH_TO_CHECK 32 #ifndef N_CLUSTERS_TO_USE -#define N_CLUSTERS_TO_USE snrt_cluster_num() +#define N_CLUSTERS_TO_USE 8 #endif -// TODO(lleone): Check if is okay to shift by 18. Probably you need to adapt it to picobello +#define LOG2_COL 2 +#define LOG2_ROW 2 + #define BCAST_MASK_ACTIVE ((N_CLUSTERS_TO_USE - 1) << 18) -// #define BCAST_MASK_ALL ((snrt_cluster_num() - 1) << 18) +#define cluster_is_in_column(col) \ + __cluster_is_in_column_api((col) - 1) -static inline void dma_broadcast_to_clusters(void* dst, void* src, size_t size) { - // snrt_enable_multicast(BCAST_MASK_ACTIVE); - if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { +static inline int __cluster_is_in_column_api(int col){ + return ((snrt_cluster_idx() >> LOG2_ROW) == col); +} + +//------------------- +//| Mcast functions | +//------------------- +// +// HW_MCAST: Broadcast from cluster_idx to all mcast destinations (only power of 2 dst supported) +static inline void cluster_broadcast_hw(uint32_t cluster_idx, void* dst, void* src, size_t size) { + if (snrt_is_dm_core() && (snrt_cluster_idx() == cluster_idx)) { snrt_dma_start_1d_mcast(snrt_remote_l1_ptr(dst, 0, 1), src, size, BCAST_MASK_ACTIVE); snrt_dma_wait_all(); } - // snrt_disable_multicast(); } -static inline int cluster_participates_in_bcast(int i) { - return (i < N_CLUSTERS_TO_USE); + +// SW_UNOPT: Unoptimized software version. 
Cluster_idx initiates multiple DMA transfers to each destination +static inline void cluster_broadcast_sw_unopt(uint32_t cluster_idx, void* dst, void* src, size_t size) { + if (snrt_is_dm_core() && (snrt_cluster_idx() == cluster_idx)) { + for (int cl = 0; cl < N_CLUSTERS_TO_USE; cl++) { + snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, cl), src, size); + snrt_dma_wait_all(); + } + } } -static inline void broadcast_wrapper(void* dst, void* src, size_t size) { - snrt_global_barrier(); - dma_broadcast_to_clusters(dst, src, size); - // Put clusters who don't participate in the broadcast to sleep, as if - // they proceed directly to the global barrier, they will interfere with - // the other clusters, by sending their atomics on the narrow interconnect. - if (!cluster_participates_in_bcast(snrt_cluster_idx())) { + +// SW_OPT: Cluster 0 broadcast over the first column, then each cluster in the first column +// broadcast over its own row +static inline void cluster_broadcast_sw_opt(uint32_t cluster_idx, void* dst, void* src, size_t size) { + if (snrt_is_dm_core() && (snrt_cluster_idx() == cluster_idx)) { + for (int row_cl = 0; row_cl < 4; row_cl ++){ + snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, row_cl), src, size); + snrt_dma_wait_all(); + } + // Wait completion of all the transfer to then wakeup the clusters in col 0 + // In this benchmark setup, it's okay to wake up all clusters in a col together, + // because they'll issue multicast requests on their row without creating + // crossing paths between multicast transactions and therefore without creating + // any network congestion that might affect the final results. 
+ + // Wake up clusters in col 0 + if (N_CLUSTERS_TO_USE > 4) { + for (int row_cl = 1; row_cl < 4; row_cl ++){ + snrt_cluster(row_cl)->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; + } + } + // Send data from cluster 0 to row 0 + for (int col_cl = 1; col_cl < N_CLUSTERS_TO_USE/4; col_cl ++){ + snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, col_cl * 4), + src, size); + snrt_dma_wait_all(); + } + } else if (snrt_is_dm_core() && cluster_is_in_column(1) && N_CLUSTERS_TO_USE > 4) { + // The clusters in the first column will wait to be waked up by cluster 0 + // and only then will isue the multicast transaction to the row snrt_wfi(); snrt_int_clr_mcip(); + // Send data to each row + for (int col_cl = 1; col_cl < N_CLUSTERS_TO_USE/4; col_cl ++){ + snrt_dma_start_1d(snrt_remote_l1_ptr(dst, snrt_cluster_idx(), snrt_cluster_idx() + col_cl * 4), + dst, size); + snrt_dma_wait_all(); + } } - // Wake these up when cluster 0 is done - else if ((snrt_cluster_idx() == 0) && snrt_is_dm_core()) { - for (int i = 0; i < snrt_cluster_num(); i++) { - if (!cluster_participates_in_bcast(i)) - snrt_cluster()->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; +} + +// SW_OPT2: to maximize the paralle work, cluster 0 multicast to cluster 2, +// then simultaneously cluster 0 and 2 multicast to 1 and 3 respectively. +// The same strided approach is used column-wise. +// This approach is afster for NUM_CLUSTERS = 16 only. +// or this reason the following code is not generic enough to handle NUM_CLUSTERS < 16. 
+static inline void cluster_broadcast_sw_opt2(uint32_t cluster_idx, void* dst, void* src, size_t size) { + + if (snrt_is_dm_core() && (snrt_cluster_idx() == cluster_idx)) { + // DMA transfer to cluster 0 + snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, 0), src, size); + snrt_dma_wait_all(); + + // DMA transfer to cluster 2 anf then wake up and move one + snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, 2), src, size); + snrt_dma_wait_all(); + snrt_cluster(2)->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; + + // DMA transfer to cluster 1 and then wake up and move to the row + snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, 1), src, size); + snrt_dma_wait_all(); + // WWake up cluster 1 only if you need a second iteration for the next column + if (N_CLUSTERS_TO_USE > 4){ + snrt_cluster(1)->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; + } + + if (N_CLUSTERS_TO_USE > 8){ + // DMA transfer to cluster 8 and then wake up and move to 4 + snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, 8), src, size); + snrt_dma_wait_all(); + snrt_cluster(8)->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; } + + if (N_CLUSTERS_TO_USE > 4) { + // DMA transfer to cluster 4. 
No need to wake it up now since + // it is not involved in any transfer + snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, 4), src, size); + snrt_dma_wait_all(); + } + + } else if (snrt_is_dm_core() && cluster_is_in_column(1)) { + // The clusters in the first column will wait to be waked up by cluster 0 + // and only then will isue the multicast transaction to the row + if (N_CLUSTERS_TO_USE > 4 || snrt_cluster_idx() == 2) { + snrt_wfi(); + snrt_int_clr_mcip(); + } + + // Only cluster 2 is repsonsible to transfer also to cluster 3 + if (snrt_cluster_idx() == 2) { + // DMA transfer to cluster 8 and then wake up and move to 4 + snrt_dma_start_1d(snrt_remote_l1_ptr(dst, snrt_cluster_idx(), snrt_cluster_idx() + 1), dst, size); + snrt_dma_wait_all(); + snrt_cluster(snrt_cluster_idx() + 1)->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; + } + + if (N_CLUSTERS_TO_USE > 8){ + // All clusters in the first column need to send the data with + // a stride of 8 first and 4 after. + // DMA transfer to cluster 8 and then wake up and move to 4 + snrt_dma_start_1d(snrt_remote_l1_ptr(dst, snrt_cluster_idx(), snrt_cluster_idx() + 8), dst, size); + snrt_dma_wait_all(); + snrt_cluster(snrt_cluster_idx() + 8)->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; + } + + if (N_CLUSTERS_TO_USE > 4) { + // Then the clusters in the first column can send the transfer with stride 4, + // i.e. to the cluster in he same row but next column. 
+ snrt_dma_start_1d(snrt_remote_l1_ptr(dst, snrt_cluster_idx(), snrt_cluster_idx() + 4), dst, size); + snrt_dma_wait_all(); + } + + + } else if (snrt_is_dm_core() && cluster_is_in_column(3) && N_CLUSTERS_TO_USE > 8) { + // The clusters in the column 3 will wait to be waked up by cluster + // and only then will isue the multicast transaction to the row + snrt_wfi(); + snrt_int_clr_mcip(); + + snrt_dma_start_1d(snrt_remote_l1_ptr(dst, snrt_cluster_idx(), snrt_cluster_idx() + 4), dst, size); + snrt_dma_wait_all(); } } + +static inline void dma_broadcast_to_clusters(void* dst, void* src, size_t size) { + switch (MODE) { + case HW_MCAST: + default: + cluster_broadcast_hw(0, dst, src, size); + break; + case SW_UNOPT: + cluster_broadcast_sw_unopt(0, dst, src, size); + break; + case SW_OPT: + cluster_broadcast_sw_opt(0, dst, src, size); + break; + case SW_OPT2: + cluster_broadcast_sw_opt2(0, dst, src, size); + break; + } +} + + +static inline int cluster_participates_in_bcast(int i) { + return (i < N_CLUSTERS_TO_USE); +} + +static inline void broadcast_wrapper(void* dst, void* src, size_t size) { + snrt_global_barrier(); + snrt_mcycle(); + dma_broadcast_to_clusters(dst, src, size); + snrt_mcycle(); +} + +// +// int main() { snrt_interrupt_enable(IRQ_M_CLUSTER); - // snrt_int_clr_mcip(); + snrt_int_clr_mcip(); // Allocate destination buffer uint32_t *buffer_dst = (uint32_t *)snrt_l1_next_v2(); - uint32_t *buffer_src = buffer_dst + LENGTH; + + // ALlocate source buffer. To avoid multicast conflictr in the Multicast + // capable AXI Xbar inside each cluster, the source will be a location in + // the memory tile (L3). + // To have better contorl of the DMA transfers triggerred for a given LENGTH, + // align the src pointer to 4kiB, since whenever the src address cross the + // 4kiB section, the DMA will issue multiple transfers. 
+ uintptr_t raw_buffer_src = (uintptr_t) snrt_l3_next_v2(); + uintptr_t aligned_addr = (raw_buffer_src + 4095) & ~(uintptr_t)(4095); // Align to 4 KiB + uint32_t *buffer_src = (uint32_t *)aligned_addr; + // Every cluster initializes its destination buffer. if (snrt_is_dm_core()) { @@ -81,14 +277,14 @@ int main() { // First cluster initializes the source buffer and multicast- // copies it to the destination buffer in every cluster's TCDM. if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { - for (uint32_t i = 0; i < LENGTH; i++) { + for (uint32_t i = 0; i < TRAN_LEN; i++) { buffer_src[i] = INITIALIZER; } } // Initiate DMA transfer (twice to preheat the cache) for (volatile int i = 0; i < 2; i++) { - broadcast_wrapper(buffer_dst, buffer_src, LENGTH * sizeof(uint32_t)); + broadcast_wrapper(buffer_dst, buffer_src, TRAN_LEN * sizeof(uint32_t) ); } // All other clusters wait on a global barrier to signal the transfer From cc6f2d0a4ae2f6a6c6ff24d33954643fbe8112c1 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 29 Sep 2025 14:22:39 +0200 Subject: [PATCH 13/77] python-venv: Install Snitch in editable mode --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0419dce0..75dc641c 100644 --- a/Makefile +++ b/Makefile @@ -224,7 +224,7 @@ python-venv: .venv python -m pip install --upgrade pip setuptools && \ python -m pip install --cache-dir $(PIP_CACHE_DIR) -r requirements.txt && \ python -m pip install --cache-dir $(PIP_CACHE_DIR) $(shell $(BENDER) path floo_noc) --no-deps && \ - python -m pip install --cache-dir $(PIP_CACHE_DIR) "$(shell $(BENDER) path snitch_cluster)[kernels]" + python -m pip install --cache-dir $(PIP_CACHE_DIR) -e "$(shell $(BENDER) path snitch_cluster)[kernels]" python-venv-clean: rm -rf .venv From 0c9017b106d6dc28ebcd8db1641065d98f1082bd Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 29 Sep 2025 14:23:25 +0200 Subject: [PATCH 14/77] dma_multicast.c: Fix missing `LENGTH` variable --- 
sw/snitch/tests/dma_multicast.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sw/snitch/tests/dma_multicast.c b/sw/snitch/tests/dma_multicast.c index 98249231..4b1d948c 100644 --- a/sw/snitch/tests/dma_multicast.c +++ b/sw/snitch/tests/dma_multicast.c @@ -45,8 +45,8 @@ typedef enum { #endif // Transfer LENGTH in uint32_t elements -#ifndef TRAN_LEN -#define TRAN_LEN 1024 +#ifndef LENGTH +#define LENGTH 1024 #endif #define LENGTH_TO_CHECK 32 @@ -277,14 +277,14 @@ int main() { // First cluster initializes the source buffer and multicast- // copies it to the destination buffer in every cluster's TCDM. if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { - for (uint32_t i = 0; i < TRAN_LEN; i++) { + for (uint32_t i = 0; i < LENGTH; i++) { buffer_src[i] = INITIALIZER; } } // Initiate DMA transfer (twice to preheat the cache) for (volatile int i = 0; i < 2; i++) { - broadcast_wrapper(buffer_dst, buffer_src, TRAN_LEN * sizeof(uint32_t) ); + broadcast_wrapper(buffer_dst, buffer_src, LENGTH * sizeof(uint32_t) ); } // All other clusters wait on a global barrier to signal the transfer From 4df0236ccb4a73bb6f9796848c72c317b419876e Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 29 Sep 2025 14:25:07 +0200 Subject: [PATCH 15/77] sw: Build Picobello's Snitch tests using Snitch's rules --- sw/sw.mk | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/sw/sw.mk b/sw/sw.mk index 012435e1..83489072 100644 --- a/sw/sw.mk +++ b/sw/sw.mk @@ -39,36 +39,13 @@ SN_APPS += $(SN_ROOT)/sw/kernels/dnn/flashattention_2 SN_APPS += $(PB_SNITCH_SW_DIR)/apps/fused_concat_linear SN_APPS += $(PB_SNITCH_SW_DIR)/apps/mha +SN_TESTS = $(wildcard $(PB_SNITCH_SW_DIR)/tests/*.c) + include $(SN_ROOT)/make/sw.mk $(PB_GEN_DIR)/pb_raw_addrmap.h: $(PB_RDL_ALL) $(PEAKRDL) raw-header $< -o $@ $(PEAKRDL_INCLUDES) $(PEAKRDL_DEFINES) --base_name $(notdir $(basename $@)) --format c -# Collect Snitch tests which should be built 
-PB_SN_TESTS_DIR = $(PB_SNITCH_SW_DIR)/tests -PB_SN_TESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/tests/build -PB_SN_TEST_NAMES = $(basename $(notdir $(wildcard $(PB_SN_TESTS_DIR)/*.c))) -PB_SN_TEST_ELFS = $(abspath $(addprefix $(PB_SN_TESTS_BUILDDIR)/,$(addsuffix .elf,$(PB_SN_TEST_NAMES)))) -PB_SN_TEST_DUMP = $(abspath $(addprefix $(PB_SN_TESTS_BUILDDIR)/,$(addsuffix .dump,$(PB_SN_TEST_NAMES)))) - -.PHONY: pb-sn-tests clean-pb-sn-tests - -pb-sn-tests: $(PB_SN_TEST_ELFS) $(PB_SN_TEST_DUMP) - -clean-pb-sn-tests: - rm -rf $(PB_SN_TEST_ELFS) - -$(PB_SN_TEST_ELFS): $(PB_GEN_DIR)/pb_addrmap.h - -$(PB_SN_TESTS_BUILDDIR)/%.d: $(PB_SN_TESTS_DIR)/%.c | $(PB_SN_TESTS_BUILDDIR) - $(SN_RISCV_CXX) $(SN_TESTS_RISCV_CFLAGS) -MM -MT '$(@:.d=.elf)' -x c++ $< > $@ - -$(PB_SN_TESTS_BUILDDIR)/%.elf: $(PB_SN_TESTS_DIR)/%.c $(SN_RUNTIME_LIB) | $(PB_SN_TESTS_BUILDDIR) - $(SN_RISCV_CXX) $(SN_TESTS_RISCV_CFLAGS) $(SN_TESTS_RISCV_LDFLAGS) -x c++ $< -o $@ - -$(PB_SN_TESTS_BUILDDIR)/%.dump: $(PB_SN_TESTS_BUILDDIR)/%.elf | $(PB_SN_TESTS_BUILDDIR) - $(SN_RISCV_OBJDUMP) $(SN_RISCV_OBJDUMP_FLAGS) $< > $@ - ############## ## Cheshire ## ############## @@ -108,6 +85,6 @@ sn-runtime-clean: sn-clean-runtime sn-apps-clean: sn-clean-apps .PHONY: sw sw-tests sw-clean sw-tests-clean -sw sw-tests: chs-sw-tests sn-tests pb-sn-tests sn-apps +sw sw-tests: chs-sw-tests sn-tests sn-apps -sw-clean sw-tests-clean: chs-sw-tests-clean sn-tests-clean sn-runtime-clean clean-pb-sn-tests sn-apps-clean +sw-clean sw-tests-clean: chs-sw-tests-clean sn-tests-clean sn-runtime-clean sn-apps-clean From 62090d726ed91cc033e6c41b4917aec4f7b2bbc9 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 29 Sep 2025 20:20:28 +0200 Subject: [PATCH 16/77] dma_multicast.c: Use `snrt_align_up` function --- sw/snitch/tests/dma_multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sw/snitch/tests/dma_multicast.c b/sw/snitch/tests/dma_multicast.c index 4b1d948c..6261d876 100644 --- a/sw/snitch/tests/dma_multicast.c +++ 
b/sw/snitch/tests/dma_multicast.c @@ -261,7 +261,7 @@ int main() { // align the src pointer to 4kiB, since whenever the src address cross the // 4kiB section, the DMA will issue multiple transfers. uintptr_t raw_buffer_src = (uintptr_t) snrt_l3_next_v2(); - uintptr_t aligned_addr = (raw_buffer_src + 4095) & ~(uintptr_t)(4095); // Align to 4 KiB + uintptr_t aligned_addr = snrt_align_up(raw_buffer_src, 4096); // Align to 4 KiB uint32_t *buffer_src = (uint32_t *)aligned_addr; From c3b9c5119b0d822c9ec935432a45876089da8072 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 29 Sep 2025 20:21:22 +0200 Subject: [PATCH 17/77] make: Enable running simulations in different directories --- Makefile | 1 + target/sim/vsim/vsim.mk | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 75dc641c..02d15f28 100644 --- a/Makefile +++ b/Makefile @@ -195,6 +195,7 @@ include $(PB_ROOT)/sw/sw.mk ############## TB_DUT = tb_picobello_top +SIM_DIR = $(PB_ROOT) include $(PB_ROOT)/target/sim/vsim/vsim.mk include $(PB_ROOT)/target/sim/traces.mk diff --git a/target/sim/vsim/vsim.mk b/target/sim/vsim/vsim.mk index 40e0ad30..ec5f216f 100644 --- a/target/sim/vsim/vsim.mk +++ b/target/sim/vsim/vsim.mk @@ -49,12 +49,12 @@ $(VSIM_DIR)/compile.tcl: $(BENDER_YML) $(BENDER_LOCK) echo 'vlog -work $(VSIM_WORK) "$(realpath $(CHS_ROOT))/target/sim/src/elfloader.cpp" -ccflags "-std=c++11"' >> $@ vsim-run: - $(VSIM) $(VSIM_FLAGS) $(VSIM_FLAGS_GUI) $(TB_DUT) -do "log -r /*" + cd $(SIM_DIR) && $(VSIM) $(VSIM_FLAGS) $(VSIM_FLAGS_GUI) $(TB_DUT) -do "log -r /*" vsim-run-batch: - $(VSIM) -c $(VSIM_FLAGS) $(TB_DUT) -do "run -all; quit" + cd $(SIM_DIR) && $(VSIM) -c $(VSIM_FLAGS) $(TB_DUT) -do "run -all; quit" vsim-run-batch-verify: vsim-run-batch ifdef VERIFY_PY - $(VERIFY_PY) placeholder $(SN_BINARY) --no-ipc --memdump l2mem.bin --memaddr 0x70000000 + cd $(SIM_DIR) && $(VERIFY_PY) placeholder $(SN_BINARY) --no-ipc --memdump l2mem.bin --memaddr 0x70000000 endif \ No 
newline at end of file From 651b3037c284950428ada16fc6d1c2afd4fe785f Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Tue, 30 Sep 2025 12:07:32 +0200 Subject: [PATCH 18/77] hw: Integrate updated parallel reduction support --- Bender.lock | 8 ++-- Bender.yml | 2 +- cfg/snitch_cluster.json | 3 +- hw/cheshire_tile.sv | 42 +++++++++++++++++-- hw/cluster_tile.sv | 44 ++++++++++++++++++-- hw/dummy_tile.sv | 26 +++++++++++- hw/fhg_spu_tile.sv | 26 +++++++++++- hw/mem_tile.sv | 41 +++++++++++++++++-- hw/picobello_pkg.sv | 90 +++++++++++++++++++++++++++++++++++++++++ hw/spm_tile.sv | 30 ++++++++++++-- 10 files changed, 287 insertions(+), 25 deletions(-) diff --git a/Bender.lock b/Bender.lock index 47c06106..02700a76 100644 --- a/Bender.lock +++ b/Bender.lock @@ -207,10 +207,10 @@ packages: Path: .deps/fhg_spu_cluster dependencies: [] floo_noc: - revision: ea35a9909d60d552bbdafc7e898590f326b1898a + revision: null version: null source: - Git: https://github.com/pulp-platform/FlooNoC.git + Path: working_dir/floo_noc dependencies: - axi - axi_riscv_atomics @@ -383,10 +383,10 @@ packages: - common_cells - register_interface snitch_cluster: - revision: 1e7aadb7f8b31afcb1f7a9d7369b6ccee7e20aa8 + revision: null version: null source: - Git: https://github.com/pulp-platform/snitch_cluster.git + Path: working_dir/snitch_cluster dependencies: - apb - axi diff --git a/Bender.yml b/Bender.yml index a1628a36..311dc9c8 100644 --- a/Bender.yml +++ b/Bender.yml @@ -13,7 +13,7 @@ dependencies: common_cells: { git: "https://github.com/pulp-platform/common_cells.git", rev: "snitch" } cheshire: { git: "https://github.com/pulp-platform/cheshire.git", rev: "picobello" } snitch_cluster: { git: "https://github.com/pulp-platform/snitch_cluster.git", rev: "5d6c957c70e3824e2330d8308eb904724cd41cbe" } - floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: "develop" } + floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: "feature/reduction" } obi: { git: 
"https://github.com/pulp-platform/obi.git", rev: "acfcd0f80c7539aa8da7821a66d9acf2074a5b4e" } redmule: { git: "https://github.com/pulp-platform/redmule.git", rev: "picobello" } hci: { git: "https://github.com/pulp-platform/hci.git", rev: "06fcba671e060f2e1b03b7ebe2d3e719f1557099" } diff --git a/cfg/snitch_cluster.json b/cfg/snitch_cluster.json index b3b4d020..b6fcf713 100644 --- a/cfg/snitch_cluster.json +++ b/cfg/snitch_cluster.json @@ -12,7 +12,6 @@ addr_width: 48, data_width: 64, atomic_id_width: 5, // clog2(#clusters + 2) - user_width: 53, // addr_width + atomic_id_width tcdm: { size: 128, banks: 32, @@ -26,7 +25,6 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - dma_user_width: 48, enable_narrow_collectives: true, enable_wide_collectives: true, cluster_base_expose: true, @@ -34,6 +32,7 @@ alias_region_base: 0x30000000, num_exposed_wide_tcdm_ports: 1, narrow_axi_port_expose: true, + collective_width: 4, enable_external_interrupts: true, vm_support: false, // Timing parameters diff --git a/hw/cheshire_tile.sv b/hw/cheshire_tile.sv index ed406db1..ac484d21 100644 --- a/hw/cheshire_tile.sv +++ b/hw/cheshire_tile.sv @@ -8,6 +8,8 @@ `include "cheshire/typedef.svh" `include "floo_noc/typedef.svh" `include "axi/assign.svh" +`include "common_cells/assertions.svh" + module cheshire_tile import cheshire_pkg::*; @@ -117,7 +119,11 @@ module cheshire_tile .hdr_t (hdr_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + .EnMultiCast (1'b0), + .EnParallelReduction (1'b0), + .EnOffloadWideReduction (1'b0), + .EnOffloadNarrowReduction (1'b0) ) i_router ( .clk_i, .rst_ni, @@ -129,7 +135,25 @@ module cheshire_tile .floo_req_o (router_floo_req_out), .floo_rsp_i (router_floo_rsp_in), .floo_wide_i (router_floo_wide_in), - .floo_wide_o (router_floo_wide_out) + .floo_wide_o (router_floo_wide_out), + // Wide Reduction offload port + .offload_wide_req_op_o (), + .offload_wide_req_operand1_o (), + 
.offload_wide_req_operand2_o (), + .offload_wide_req_valid_o (), + .offload_wide_req_ready_i ('0), + .offload_wide_resp_result_i ('0), + .offload_wide_resp_valid_i ('0), + .offload_wide_resp_ready_o (), + // Narrow Reduction offload port + .offload_narrow_req_op_o (), + .offload_narrow_req_operand1_o (), + .offload_narrow_req_operand2_o (), + .offload_narrow_req_valid_o (), + .offload_narrow_req_ready_i ('0), + .offload_narrow_resp_result_i ('0), + .offload_narrow_resp_valid_i ('0), + .offload_narrow_resp_ready_o () ); assign floo_req_west_o = router_floo_req_out[West]; @@ -188,7 +212,9 @@ module cheshire_tile .axi_wide_out_rsp_t (axi_wide_out_rsp_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + .user_narrow_struct_t (collective_narrow_user_t), + .user_wide_struct_t (collective_wide_user_t) ) i_chimney ( .clk_i, .rst_ni, @@ -472,4 +498,14 @@ module cheshire_tile assign fhg_spu_rst_no = control_reg.fhg_spu_rsts.rst.value; assign fhg_spu_clk_en_o = control_reg.fhg_spu_clk_enables.clk_en.value; + // Add Assertion that no multicast / reduction can enter this tile! 
+ for (genvar r = 0; r < 4; r++) begin : gen_virt + `ASSERT(NoCollectivOperation_NReq_In, (!router_floo_req_in[r].valid | (router_floo_req_in[r].req.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NRsp_In, (!router_floo_rsp_in[r].valid | (router_floo_rsp_in[r].rsp.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_In, (!router_floo_wide_in[r].valid | (router_floo_wide_in[r].wide.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NReq_Out, (!router_floo_req_out[r].valid | (router_floo_req_out[r].req.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NRsp_Out, (!router_floo_rsp_out[r].valid | (router_floo_rsp_out[r].rsp.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_Out, (!router_floo_wide_out[r].valid | (router_floo_wide_out[r].wide.generic.hdr.collective_op == Unicast))) + end + endmodule diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 5ed7b7a1..1d83ea6d 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -39,6 +39,12 @@ module cluster_tile input floo_wide_t [ West:North] floo_wide_i ); + /// TODO(lleone): Offload typedef to be generated by Floogen or removed form + /// the hardware after more optimization + typedef logic[AxiCfgW.DataWidth-1:0] RdDataWide_t; + typedef logic[AxiCfgN.DataWidth-1:0] RdDataNarrow_t; + + // Tile-specific reset and clock signals logic tile_clk; logic tile_rst_n; @@ -227,8 +233,12 @@ module cluster_tile floo_nw_router #( .AxiCfgN (AxiCfgN), .AxiCfgW (AxiCfgW), - .EnMultiCast (RouteCfg.EnMultiCast), .RouteAlgo (RouteCfg.RouteAlgo), + .EnMultiCast (RouteCfg.EnMultiCast), + .EnParallelReduction (EnParallelReduction), + .EnOffloadWideReduction (EnWideOffloadReduction), + .EnOffloadNarrowReduction (EnNarrowOffloadReduction), + .NoLoopback (1'b0), .NumRoutes (5), .InFifoDepth (2), .OutFifoDepth(2), @@ -236,7 +246,14 @@ module cluster_tile .hdr_t (hdr_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - 
.floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + .RdWideOperation_t (floo_pkg::collect_op_e), + .RdNarrowOperation_t (floo_pkg::collect_op_e), + .RdWideData_t (RdDataWide_t), + .RdNarrowData_t (RdDataNarrow_t), + .RdWideCfg (WideReductionCfg), + .RdNarrowCfg (NarrowReductionCfg), + .RdRespCfg (ResponseReductionCfg) ) i_router ( .clk_i, .rst_ni, @@ -248,7 +265,25 @@ module cluster_tile .floo_req_o (router_floo_req_out), .floo_rsp_i (router_floo_rsp_in), .floo_wide_i (router_floo_wide_in), - .floo_wide_o (router_floo_wide_out) + .floo_wide_o (router_floo_wide_out), + // Wide Reduction offload port + .offload_wide_req_op_o (), + .offload_wide_req_operand1_o (), + .offload_wide_req_operand2_o (), + .offload_wide_req_valid_o (), + .offload_wide_req_ready_i ('0), + .offload_wide_resp_result_i ('0), + .offload_wide_resp_valid_i ('0), + .offload_wide_resp_ready_o (), + // Narrow Reduction offload port + .offload_narrow_req_op_o (), + .offload_narrow_req_operand1_o (), + .offload_narrow_req_operand2_o (), + .offload_narrow_req_valid_o (), + .offload_narrow_req_ready_i ('0), + .offload_narrow_resp_result_i ('0), + .offload_narrow_resp_valid_i ('0), + .offload_narrow_resp_ready_o () ); assign floo_req_o = router_floo_req_out[West:North]; @@ -289,7 +324,8 @@ module cluster_tile .floo_rsp_t (floo_picobello_noc_pkg::floo_rsp_t), .floo_wide_t (floo_picobello_noc_pkg::floo_wide_t), .sram_cfg_t (snitch_cluster_pkg::sram_cfg_t), - .user_struct_t (picobello_pkg::mcast_user_t) + .user_narrow_struct_t (picobello_pkg::collective_narrow_user_t), + .user_wide_struct_t (picobello_pkg::collective_wide_user_t) ) i_chimney ( .clk_i (tile_clk), .rst_ni (tile_rst_n), diff --git a/hw/dummy_tile.sv b/hw/dummy_tile.sv index c42f1d38..f45ae2f1 100644 --- a/hw/dummy_tile.sv +++ b/hw/dummy_tile.sv @@ -42,7 +42,11 @@ module dummy_tile .hdr_t (hdr_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + .EnMultiCast (1'b0), + 
.EnParallelReduction (1'b0), + .EnOffloadWideReduction (1'b0), + .EnOffloadNarrowReduction (1'b0) ) i_router ( .clk_i, .rst_ni, @@ -54,7 +58,25 @@ module dummy_tile .floo_req_o (router_floo_req_out), .floo_rsp_i (router_floo_rsp_in), .floo_wide_i (router_floo_wide_in), - .floo_wide_o (router_floo_wide_out) + .floo_wide_o (router_floo_wide_out), + // Wide Reduction offload port + .offload_wide_req_op_o (), + .offload_wide_req_operand1_o (), + .offload_wide_req_operand2_o (), + .offload_wide_req_valid_o (), + .offload_wide_req_ready_i ('0), + .offload_wide_resp_result_i ('0), + .offload_wide_resp_valid_i ('0), + .offload_wide_resp_ready_o (), + // Narrow Reduction offload port + .offload_narrow_req_op_o (), + .offload_narrow_req_operand1_o (), + .offload_narrow_req_operand2_o (), + .offload_narrow_req_valid_o (), + .offload_narrow_req_ready_i ('0), + .offload_narrow_resp_result_i ('0), + .offload_narrow_resp_valid_i ('0), + .offload_narrow_resp_ready_o () ); assign floo_req_o = router_floo_req_out[West:North]; diff --git a/hw/fhg_spu_tile.sv b/hw/fhg_spu_tile.sv index ad0636e7..35825df7 100644 --- a/hw/fhg_spu_tile.sv +++ b/hw/fhg_spu_tile.sv @@ -60,7 +60,11 @@ module fhg_spu_tile .hdr_t (hdr_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + .EnMultiCast (1'b0), + .EnParallelReduction (1'b0), + .EnOffloadWideReduction (1'b0), + .EnOffloadNarrowReduction (1'b0) ) i_router ( .clk_i, .rst_ni, @@ -72,7 +76,25 @@ module fhg_spu_tile .floo_req_o (router_floo_req_out), .floo_rsp_i (router_floo_rsp_in), .floo_wide_i (router_floo_wide_in), - .floo_wide_o (router_floo_wide_out) + .floo_wide_o (router_floo_wide_out), + // Wide Reduction offload port + .offload_wide_req_op_o (), + .offload_wide_req_operand1_o (), + .offload_wide_req_operand2_o (), + .offload_wide_req_valid_o (), + .offload_wide_req_ready_i ('0), + .offload_wide_resp_result_i ('0), + .offload_wide_resp_valid_i ('0), + .offload_wide_resp_ready_o 
(), + // Narrow Reduction offload port + .offload_narrow_req_op_o (), + .offload_narrow_req_operand1_o (), + .offload_narrow_req_operand2_o (), + .offload_narrow_req_valid_o (), + .offload_narrow_req_ready_i ('0), + .offload_narrow_resp_result_i ('0), + .offload_narrow_resp_valid_i ('0), + .offload_narrow_resp_ready_o () ); assign floo_req_west_o = router_floo_req_out[West]; diff --git a/hw/mem_tile.sv b/hw/mem_tile.sv index faf4249a..7fe09417 100644 --- a/hw/mem_tile.sv +++ b/hw/mem_tile.sv @@ -7,6 +7,7 @@ `include "common_cells/registers.svh" `include "axi/typedef.svh" `include "obi/typedef.svh" +`include "common_cells/assertions.svh" module mem_tile import floo_pkg::*; @@ -50,7 +51,6 @@ module mem_tile floo_nw_router #( .AxiCfgN (AxiCfgN), .AxiCfgW (AxiCfgW), - .EnMultiCast (RouteCfgNoMcast.EnMultiCast), .RouteAlgo (RouteCfgNoMcast.RouteAlgo), .NumRoutes (5), .InFifoDepth (2), @@ -59,7 +59,11 @@ module mem_tile .hdr_t (hdr_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + .EnMultiCast (1'b0), + .EnParallelReduction (1'b0), + .EnOffloadWideReduction (1'b0), + .EnOffloadNarrowReduction (1'b0) ) i_router ( .clk_i, .rst_ni, @@ -71,7 +75,25 @@ module mem_tile .floo_req_o (router_floo_req_out), .floo_rsp_i (router_floo_rsp_in), .floo_wide_i (router_floo_wide_in), - .floo_wide_o (router_floo_wide_out) + .floo_wide_o (router_floo_wide_out), + // Wide Reduction offload port + .offload_wide_req_op_o (), + .offload_wide_req_operand1_o (), + .offload_wide_req_operand2_o (), + .offload_wide_req_valid_o (), + .offload_wide_req_ready_i ('0), + .offload_wide_resp_result_i ('0), + .offload_wide_resp_valid_i ('0), + .offload_wide_resp_ready_o (), + // Narrow Reduction offload port + .offload_narrow_req_op_o (), + .offload_narrow_req_operand1_o (), + .offload_narrow_req_operand2_o (), + .offload_narrow_req_valid_o (), + .offload_narrow_req_ready_i ('0), + .offload_narrow_resp_result_i ('0), + 
.offload_narrow_resp_valid_i ('0), + .offload_narrow_resp_ready_o () ); assign floo_req_o = router_floo_req_out[West:North]; @@ -113,7 +135,9 @@ module mem_tile .axi_wide_out_rsp_t (axi_wide_out_rsp_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + .user_narrow_struct_t (collective_narrow_user_t), + .user_wide_struct_t (collective_wide_user_t) ) i_chimney ( .clk_i (tile_clk), .rst_ni (tile_rst_n), @@ -473,5 +497,14 @@ module mem_tile .clk_o (tile_rst_n) ); `endif + // Add Assertion that no multicast / reduction can enter this tile! + for (genvar r = 0; r < 4; r++) begin : gen_virt + `ASSERT(NoCollectivOperation_NReq_In, (!floo_req_i[r].valid | (floo_req_i[r].req.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NRsp_In, (!floo_rsp_i[r].valid | (floo_rsp_i[r].rsp.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_In, (!floo_wide_i[r].valid | (floo_wide_i[r].wide.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NReq_Out, (!floo_req_o[r].valid | (floo_req_o[r].req.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NRsp_Out, (!floo_rsp_o[r].valid | (floo_rsp_o[r].rsp.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_Out, (!floo_wide_o[r].valid | (floo_wide_o[r].wide.generic.hdr.collective_op == Unicast))) + end endmodule diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index dcbfa060..ca659a37 100644 --- a/hw/picobello_pkg.sv +++ b/hw/picobello_pkg.sv @@ -347,6 +347,10 @@ package picobello_pkg; floo_pkg::route_cfg_t ret = floo_picobello_noc_pkg::RouteCfg; // Disable multicast for non-cluster tiles ret.EnMultiCast = 1'b0; + ret.EnParallelReduction = 1'b0; + ret.EnNarrowOffloadReduction = 1'b0; + ret.EnWideOffloadReduction = 1'b0; + ret.CollectiveCfg = CollectiveDefaultCfg; return ret; endfunction @@ -391,6 +395,92 @@ package picobello_pkg; 
$display("----------------------------------------------------------"); endfunction + ///////////////////// + // REDUCTION // + ///////////////////// + + // Support Reduction on the Wide port + localparam bit EnWideOffloadReduction = 1; + localparam bit EnNarrowOffloadReduction = 1; + localparam bit EnParallelReduction = 1; + + // TODO (lleone): Add generation of the follwing types ans structs into Floogen + typedef struct packed { + user_mask_t collective_mask; + floo_pkg::collect_op_e collective_op; + logic [snitch_cluster_pkg::AtomicIdWidth-1:0] atomic; + } collective_narrow_user_t; + + typedef struct packed { + user_mask_t collective_mask; + floo_pkg::collect_op_e collective_op; + } collective_wide_user_t; + + // Configurations for the Reductions + // Stupid asolution which allows me to overwrite the Reduction confiuration without endagering everything + // ATTENTION: + // @GENERIC Implementation: RdPartialBufferSize Needs to be bigger than the "RdPipelineDepth" otherwise we can build a deadlock + // @STALLING Implementation: min(ceil(log2(NumRoutes)), RdPipelineDepth+1) + // TODO: Add Assertion to check this + // raroth - overwrite benchmark autotest - start +localparam reduction_cfg_t WideReductionCfg = '{ + RdControllConf: ControllerGeneric, + RdFifoFallThrough: 1'b1, + RdFifoDepth: 0, + RdPipelineDepth: 5, + RdPartialBufferSize: 6, + RdTagBits: 5, + RdSupportAxi: 1'b1, + RdEnableBypass: 1'b1, + RdSupportLoopback: 1'b1 + }; + // raroth - overwrite benchmark autotest - end + +localparam reduction_cfg_t NarrowReductionCfg = '{ + RdControllConf: ControllerGeneric, + RdFifoFallThrough: 1'b1, + RdFifoDepth: 0, + RdPipelineDepth: 1, + RdPartialBufferSize: 3, + RdTagBits: 5, + RdSupportAxi: 1'b1, + RdEnableBypass: 1'b1, + RdSupportLoopback: 1'b1 + }; + + localparam reduction_cfg_t ResponseReductionCfg = '{ + RdEnableBypass: 1'b1, + RdSupportLoopback: 1'b1, + default: '0 + }; + + // TODO (lleone): Get rid of all this configuration struct, make only one to pass to 
the system + // with enough info to decide what to do with the different routers +function automatic floo_pkg::collect_op_cfg_t gen_collect_op_cfg(); + floo_pkg:: collect_op_cfg_t ret = floo_pkg::CollectiveOpDefaultCfg; + ret.EnNarrowMulticast = 1'b1; + ret.EnWideMulticast = 1'b1; + ret.EnLSBAnd = 1'b1; + return ret; +endfunction + +localparam collect_op_cfg_t CollectOpCfg = gen_collect_op_cfg(); + +localparam collective_cfg_t CollectCfgNarrow = '{ + OpCfg: CollectOpCfg, + SequentialRedCfg: NarrowReductionCfg +}; + +localparam collective_cfg_t CollectCfgWide = '{ + OpCfg: CollectOpCfg, + SequentialRedCfg: WideReductionCfg +}; + +localparam collective_cfg_t CollectCfgRsp = '{ + OpCfg: CollectOpCfg, + SequentialRedCfg: ResponseReductionCfg +}; + //////////////// // Cheshire // //////////////// diff --git a/hw/spm_tile.sv b/hw/spm_tile.sv index e4b3b5ca..b9bff9a9 100644 --- a/hw/spm_tile.sv +++ b/hw/spm_tile.sv @@ -94,7 +94,11 @@ module spm_tile .hdr_t (hdr_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + .EnMultiCast (1'b0), + .EnParallelReduction (1'b0), + .EnOffloadWideReduction (1'b0), + .EnOffloadNarrowReduction (1'b0) ) i_router ( .clk_i, .rst_ni, @@ -106,7 +110,25 @@ module spm_tile .floo_req_o (router_floo_req_out), .floo_rsp_i (router_floo_rsp_in), .floo_wide_i (router_floo_wide_in), - .floo_wide_o (router_floo_wide_out) + .floo_wide_o (router_floo_wide_out), + // Wide Reduction offload port + .offload_wide_req_op_o (), + .offload_wide_req_operand1_o (), + .offload_wide_req_operand2_o (), + .offload_wide_req_valid_o (), + .offload_wide_req_ready_i ('0), + .offload_wide_resp_result_i ('0), + .offload_wide_resp_valid_i ('0), + .offload_wide_resp_ready_o (), + // Narrow Reduction offload port + .offload_narrow_req_op_o (), + .offload_narrow_req_operand1_o (), + .offload_narrow_req_operand2_o (), + .offload_narrow_req_valid_o (), + .offload_narrow_req_ready_i ('0), + .offload_narrow_resp_result_i 
('0), + .offload_narrow_resp_valid_i ('0), + .offload_narrow_resp_ready_o () ); assign floo_req_o = router_floo_req_out[West:North]; @@ -151,7 +173,9 @@ module spm_tile .axi_wide_out_rsp_t (axi_wide_out_rsp_t), .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), - .floo_wide_t (floo_wide_t) + .floo_wide_t (floo_wide_t), + .user_narrow_struct_t (collective_narrow_user_t), + .user_wide_struct_t (collective_wide_user_t) ) i_chimney ( .clk_i, .rst_ni, From 0ee875e528e2323b06b301fd49bd72660b8078db Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Thu, 2 Oct 2025 15:55:29 +0200 Subject: [PATCH 19/77] hw: Integrate virtual channel + offload reduction --- cfg/snitch_cluster.json | 2 + floo_picobello_noc_pkg_REDUCTION.sv | 343 ++++++++++++++++++++++++++++ hw/cheshire_tile.sv | 4 +- hw/cluster_tile.sv | 134 ++++++++++- hw/dummy_tile.sv | 3 +- hw/fhg_spu_tile.sv | 3 +- hw/mem_tile.sv | 4 +- hw/spm_tile.sv | 4 +- 8 files changed, 481 insertions(+), 16 deletions(-) create mode 100644 floo_picobello_noc_pkg_REDUCTION.sv diff --git a/cfg/snitch_cluster.json b/cfg/snitch_cluster.json index b6fcf713..7930bc7f 100644 --- a/cfg/snitch_cluster.json +++ b/cfg/snitch_cluster.json @@ -27,6 +27,8 @@ wide_trans: 32, enable_narrow_collectives: true, enable_wide_collectives: true, + enable_dca: true, + dca_data_width: 512, cluster_base_expose: true, alias_region_enable: true, alias_region_base: 0x30000000, diff --git a/floo_picobello_noc_pkg_REDUCTION.sv b/floo_picobello_noc_pkg_REDUCTION.sv new file mode 100644 index 00000000..90e36a05 --- /dev/null +++ b/floo_picobello_noc_pkg_REDUCTION.sv @@ -0,0 +1,343 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// AUTOMATICALLY GENERATED! DO NOT EDIT! 
+ +`include "axi/typedef.svh" +`include "floo_noc/typedef.svh" + +package floo_picobello_noc_pkg; + + import floo_pkg::*; + + ///////////////////// + // Address Map // + ///////////////////// + + typedef enum logic [4:0] { + ClusterX0Y0 = 0, + ClusterX0Y1 = 1, + ClusterX0Y2 = 2, + ClusterX0Y3 = 3, + ClusterX1Y0 = 4, + ClusterX1Y1 = 5, + ClusterX1Y2 = 6, + ClusterX1Y3 = 7, + ClusterX2Y0 = 8, + ClusterX2Y1 = 9, + ClusterX2Y2 = 10, + ClusterX2Y3 = 11, + ClusterX3Y0 = 12, + ClusterX3Y1 = 13, + ClusterX3Y2 = 14, + ClusterX3Y3 = 15, + Cheshire = 16, + FhgSpu = 17, + TopSpmNarrow = 18, + TopSpmWide = 19, + L2Spm0 = 20, + L2Spm1 = 21, + L2Spm2 = 22, + L2Spm3 = 23, + L2Spm4 = 24, + L2Spm5 = 25, + L2Spm6 = 26, + L2Spm7 = 27, + NumEndpoints = 28 + } ep_id_e; + + + + typedef enum logic [4:0] { + ClusterX0Y0SamIdx = 0, + ClusterX0Y1SamIdx = 1, + ClusterX0Y2SamIdx = 2, + ClusterX0Y3SamIdx = 3, + ClusterX1Y0SamIdx = 4, + ClusterX1Y1SamIdx = 5, + ClusterX1Y2SamIdx = 6, + ClusterX1Y3SamIdx = 7, + ClusterX2Y0SamIdx = 8, + ClusterX2Y1SamIdx = 9, + ClusterX2Y2SamIdx = 10, + ClusterX2Y3SamIdx = 11, + ClusterX3Y0SamIdx = 12, + ClusterX3Y1SamIdx = 13, + ClusterX3Y2SamIdx = 14, + ClusterX3Y3SamIdx = 15, + CheshireExternalSamIdx = 16, + CheshireInternalSamIdx = 17, + FhgSpuSamIdx = 18, + TopSpmNarrowSamIdx = 19, + TopSpmWideSamIdx = 20, + L2Spm0SamIdx = 21, + L2Spm1SamIdx = 22, + L2Spm2SamIdx = 23, + L2Spm3SamIdx = 24, + L2Spm4SamIdx = 25, + L2Spm5SamIdx = 26, + L2Spm6SamIdx = 27, + L2Spm7SamIdx = 28 + } sam_idx_e; + + + + typedef logic [0:0] rob_idx_t; + typedef logic [0:0] port_id_t; + typedef logic [3:0] x_bits_t; + typedef logic [1:0] y_bits_t; + typedef struct packed { + x_bits_t x; + y_bits_t y; + port_id_t port_id; + } id_t; + + typedef logic route_t; + + + localparam int unsigned SamNumRules = 29; + + typedef struct packed { + id_t idx; + logic [47:0] start_addr; + logic [47:0] end_addr; + } sam_rule_t; + + localparam sam_rule_t [SamNumRules-1:0] Sam = '{ + '{ + idx: '{x: 8, y: 3, 
port_id: 0}, + start_addr: 48'h000070700000, + end_addr: 48'h000070800000 + }, // L2Spm7 + '{ + idx: '{x: 8, y: 2, port_id: 0}, + start_addr: 48'h000070600000, + end_addr: 48'h000070700000 + }, // L2Spm6 + '{ + idx: '{x: 8, y: 1, port_id: 0}, + start_addr: 48'h000070500000, + end_addr: 48'h000070600000 + }, // L2Spm5 + '{ + idx: '{x: 8, y: 0, port_id: 0}, + start_addr: 48'h000070400000, + end_addr: 48'h000070500000 + }, // L2Spm4 + '{ + idx: '{x: 0, y: 3, port_id: 0}, + start_addr: 48'h000070300000, + end_addr: 48'h000070400000 + }, // L2Spm3 + '{ + idx: '{x: 0, y: 2, port_id: 0}, + start_addr: 48'h000070200000, + end_addr: 48'h000070300000 + }, // L2Spm2 + '{ + idx: '{x: 0, y: 1, port_id: 0}, + start_addr: 48'h000070100000, + end_addr: 48'h000070200000 + }, // L2Spm1 + '{ + idx: '{x: 0, y: 0, port_id: 0}, + start_addr: 48'h000070000000, + end_addr: 48'h000070100000 + }, // L2Spm0 + '{ + idx: '{x: 9, y: 1, port_id: 0}, + start_addr: 48'h000060040000, + end_addr: 48'h000060080000 + }, // TopSpmWide + '{ + idx: '{x: 9, y: 2, port_id: 0}, + start_addr: 48'h000060000000, + end_addr: 48'h000060040000 + }, // TopSpmNarrow + '{ + idx: '{x: 9, y: 0, port_id: 0}, + start_addr: 48'h000040000000, + end_addr: 48'h000040040000 + }, // FhgSpu + '{ + idx: '{x: 9, y: 3, port_id: 0}, + start_addr: 48'h000000000000, + end_addr: 48'h000020000000 + }, // CheshireInternal + '{ + idx: '{x: 9, y: 3, port_id: 0}, + start_addr: 48'h000080000000, + end_addr: 48'h020000000000 + }, // CheshireExternal + '{ + idx: '{x: 7, y: 3, port_id: 0}, + start_addr: 48'h0000203c0000, + end_addr: 48'h000020400000 + }, // ClusterX3Y3 + '{ + idx: '{x: 7, y: 2, port_id: 0}, + start_addr: 48'h000020380000, + end_addr: 48'h0000203c0000 + }, // ClusterX3Y2 + '{ + idx: '{x: 7, y: 1, port_id: 0}, + start_addr: 48'h000020340000, + end_addr: 48'h000020380000 + }, // ClusterX3Y1 + '{ + idx: '{x: 7, y: 0, port_id: 0}, + start_addr: 48'h000020300000, + end_addr: 48'h000020340000 + }, // ClusterX3Y0 + '{ + idx: '{x: 6, 
y: 3, port_id: 0}, + start_addr: 48'h0000202c0000, + end_addr: 48'h000020300000 + }, // ClusterX2Y3 + '{ + idx: '{x: 6, y: 2, port_id: 0}, + start_addr: 48'h000020280000, + end_addr: 48'h0000202c0000 + }, // ClusterX2Y2 + '{ + idx: '{x: 6, y: 1, port_id: 0}, + start_addr: 48'h000020240000, + end_addr: 48'h000020280000 + }, // ClusterX2Y1 + '{ + idx: '{x: 6, y: 0, port_id: 0}, + start_addr: 48'h000020200000, + end_addr: 48'h000020240000 + }, // ClusterX2Y0 + '{ + idx: '{x: 5, y: 3, port_id: 0}, + start_addr: 48'h0000201c0000, + end_addr: 48'h000020200000 + }, // ClusterX1Y3 + '{ + idx: '{x: 5, y: 2, port_id: 0}, + start_addr: 48'h000020180000, + end_addr: 48'h0000201c0000 + }, // ClusterX1Y2 + '{ + idx: '{x: 5, y: 1, port_id: 0}, + start_addr: 48'h000020140000, + end_addr: 48'h000020180000 + }, // ClusterX1Y1 + '{ + idx: '{x: 5, y: 0, port_id: 0}, + start_addr: 48'h000020100000, + end_addr: 48'h000020140000 + }, // ClusterX1Y0 + '{ + idx: '{x: 4, y: 3, port_id: 0}, + start_addr: 48'h0000200c0000, + end_addr: 48'h000020100000 + }, // ClusterX0Y3 + '{ + idx: '{x: 4, y: 2, port_id: 0}, + start_addr: 48'h000020080000, + end_addr: 48'h0000200c0000 + }, // ClusterX0Y2 + '{ + idx: '{x: 4, y: 1, port_id: 0}, + start_addr: 48'h000020040000, + end_addr: 48'h000020080000 + }, // ClusterX0Y1 + '{ + idx: '{x: 4, y: 0, port_id: 0}, + start_addr: 48'h000020000000, + end_addr: 48'h000020040000 + } // ClusterX0Y0 + + }; + + + + localparam route_cfg_t RouteCfg = '{ + RouteAlgo: XYRouting, + UseIdTable: 1'b1, + XYAddrOffsetX: 41, + XYAddrOffsetY: 45, + IdAddrOffset: 0, + NumSamRules: 29, + NumRoutes: 0, + EnMultiCast: 1'b1, + EnParallelReduction: 1'b1, + EnNarrowOffloadReduction: 1'b1, + EnWideOffloadReduction: 1'b1, + CollectiveCfg: '{ + OpCfg: '{ + EnNarrowMulticast: 1'b1, + EnWideMulticast: 1'b1, + EnLSBAnd: 1'b1, + default: '0 + }, + SequentialRedCfg: ReductionDefaultCfg + } + }; + + + typedef logic [47:0] axi_narrow_in_addr_t; + typedef logic [63:0] axi_narrow_in_data_t; + 
typedef logic [7:0] axi_narrow_in_strb_t; + typedef logic [4:0] axi_narrow_in_id_t; + typedef logic [2:0] axi_narrow_in_user_t; + `AXI_TYPEDEF_ALL_CT(axi_narrow_in, axi_narrow_in_req_t, axi_narrow_in_rsp_t, axi_narrow_in_addr_t, + axi_narrow_in_id_t, axi_narrow_in_data_t, axi_narrow_in_strb_t, + axi_narrow_in_user_t) + + + typedef logic [47:0] axi_narrow_out_addr_t; + typedef logic [63:0] axi_narrow_out_data_t; + typedef logic [7:0] axi_narrow_out_strb_t; + typedef logic [1:0] axi_narrow_out_id_t; + typedef logic [2:0] axi_narrow_out_user_t; + `AXI_TYPEDEF_ALL_CT(axi_narrow_out, axi_narrow_out_req_t, axi_narrow_out_rsp_t, + axi_narrow_out_addr_t, axi_narrow_out_id_t, axi_narrow_out_data_t, + axi_narrow_out_strb_t, axi_narrow_out_user_t) + + + typedef logic [47:0] axi_wide_in_addr_t; + typedef logic [511:0] axi_wide_in_data_t; + typedef logic [63:0] axi_wide_in_strb_t; + typedef logic [2:0] axi_wide_in_id_t; + typedef logic [0:0] axi_wide_in_user_t; + `AXI_TYPEDEF_ALL_CT(axi_wide_in, axi_wide_in_req_t, axi_wide_in_rsp_t, axi_wide_in_addr_t, + axi_wide_in_id_t, axi_wide_in_data_t, axi_wide_in_strb_t, axi_wide_in_user_t) + + + typedef logic [47:0] axi_wide_out_addr_t; + typedef logic [511:0] axi_wide_out_data_t; + typedef logic [63:0] axi_wide_out_strb_t; + typedef logic [0:0] axi_wide_out_id_t; + typedef logic [0:0] axi_wide_out_user_t; + `AXI_TYPEDEF_ALL_CT(axi_wide_out, axi_wide_out_req_t, axi_wide_out_rsp_t, axi_wide_out_addr_t, + axi_wide_out_id_t, axi_wide_out_data_t, axi_wide_out_strb_t, + axi_wide_out_user_t) + + + + `FLOO_TYPEDEF_HDR_T(hdr_t, id_t, id_t, nw_ch_e, rob_idx_t, id_t, collect_op_e) + localparam axi_cfg_t AxiCfgN = '{ + AddrWidth: 48, + DataWidth: 64, + UserWidth: 3, + InIdWidth: 5, + OutIdWidth: 2 + }; + localparam axi_cfg_t AxiCfgW = '{ + AddrWidth: 48, + DataWidth: 512, + UserWidth: 1, + InIdWidth: 3, + OutIdWidth: 1 + }; + `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_in, axi_wide_in, AxiCfgN, AxiCfgW, + hdr_t) + + 
`FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 1, 2) + + +endpackage diff --git a/hw/cheshire_tile.sv b/hw/cheshire_tile.sv index ac484d21..42f0b56d 100644 --- a/hw/cheshire_tile.sv +++ b/hw/cheshire_tile.sv @@ -123,7 +123,8 @@ module cheshire_tile .EnMultiCast (1'b0), .EnParallelReduction (1'b0), .EnOffloadWideReduction (1'b0), - .EnOffloadNarrowReduction (1'b0) + .EnOffloadNarrowReduction (1'b0), + .EnDecoupledRW (1'b1) ) i_router ( .clk_i, .rst_ni, @@ -196,6 +197,7 @@ module cheshire_tile .ChimneyCfgW (ChimneyCfgW), .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), + .EnDecoupledRW (1'b1), .MaxAtomicTxns (AxiCfgN.OutIdWidth - 1), .Sam (Sam), .id_t (id_t), diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 1d83ea6d..6d59b582 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -41,7 +41,7 @@ module cluster_tile /// TODO(lleone): Offload typedef to be generated by Floogen or removed form /// the hardware after more optimization - typedef logic[AxiCfgW.DataWidth-1:0] RdDataWide_t; + typedef logic[AxiCfgW.DataWidth-1:0] RdDataWide_t; typedef logic[AxiCfgN.DataWidth-1:0] RdDataNarrow_t; @@ -89,6 +89,116 @@ module cluster_tile logic [NrCores-1:0] mxip; + + //////////////////////// + // Wide FPU Reduction // + //////////////////////// + + // Snitch cluster DCA interface + snitch_cluster_pkg::dca_req_t offload_dca_req, offload_dca_req_cut; + snitch_cluster_pkg::dca_rsp_t offload_dca_rsp, offload_dca_rsp_cut; + + // Signals to connect the NW router to the wide parser + RdDataWide_t [1:0] offload_wide_req_operand; + collect_op_e offload_wide_req_operation; + logic offload_wide_req_valid; + logic offload_wide_req_ready; + RdDataWide_t offload_wide_resp_data; + logic offload_wide_resp_valid; + logic offload_wide_resp_ready; + + // Parse the Wide request from the reouter to the one from the snitch cluster! + // TODO(raroth): possible to remove this decode from the picobello repo and move it inside the + // FlooNoC repo. 
Currently the Decode used for the ALU is directly inside the floo_alu.sv + // file. Maybe do the same for the FPU + if(EnWideOffloadReduction) begin : gen_wide_offload_reduction + // Connect the DCA Request + assign offload_dca_req.q_valid = offload_wide_req_valid; + assign offload_wide_req_ready = offload_dca_rsp.q_ready; + + // Parse the FPU Request + always_comb begin + // Init default values + offload_dca_req.q.operands = '0; + + // Set default Values + offload_dca_req.q.src_fmt = fpnew_pkg::FP64; + offload_dca_req.q.dst_fmt = fpnew_pkg::FP64; + offload_dca_req.q.int_fmt = fpnew_pkg::INT64; + offload_dca_req.q.vectorial_op = 1'b0; + offload_dca_req.q.op_mod = 1'b0; + offload_dca_req.q.rnd_mode = fpnew_pkg::RNE; + offload_dca_req.q.op = fpnew_pkg::ADD; + + // Define the operation we want to execute on the FPU + unique casez (offload_wide_req_operation) + (floo_pkg::F_Add) : begin + offload_dca_req.q.op = fpnew_pkg::ADD; + offload_dca_req.q.operands[0] = '0; + offload_dca_req.q.operands[1] = offload_wide_req_operand[0]; + offload_dca_req.q.operands[2] = offload_wide_req_operand[1]; + end + (floo_pkg::F_Mul) : begin + offload_dca_req.q.op = fpnew_pkg::MUL; + offload_dca_req.q.operands[0] = offload_wide_req_operand[0]; + offload_dca_req.q.operands[1] = offload_wide_req_operand[1]; + offload_dca_req.q.operands[2] = '0; + end + (floo_pkg::F_Max) : begin + offload_dca_req.q.op = fpnew_pkg::MINMAX; + offload_dca_req.q.rnd_mode = fpnew_pkg::RNE; + offload_dca_req.q.operands[0] = offload_wide_req_operand[0]; + offload_dca_req.q.operands[1] = offload_wide_req_operand[1]; + offload_dca_req.q.operands[2] = '0; + end + (floo_pkg::F_Min) : begin + offload_dca_req.q.op = fpnew_pkg::MINMAX; + offload_dca_req.q.rnd_mode = fpnew_pkg::RTZ; + offload_dca_req.q.operands[0] = offload_wide_req_operand[0]; + offload_dca_req.q.operands[1] = offload_wide_req_operand[1]; + offload_dca_req.q.operands[2] = '0; + end + default : begin + offload_dca_req.q.op = fpnew_pkg::ADD; + 
offload_dca_req.q.operands[0] = '0; + offload_dca_req.q.operands[1] = '0; + offload_dca_req.q.operands[2] = '0; + end + endcase + end + + // TODO(raroth): move these spill register inside FlooNoC and make them configurable. + // Insert a reqrsp-cut to avoid timing violations + generic_reqrsp_cut #( + .req_chan_t (snitch_cluster_pkg::dca_req_chan_t), + .rsp_chan_t (snitch_cluster_pkg::dca_rsp_chan_t), + .BypassReq (1'b0), + .BypassRsp (1'b0) + ) i_dca_router_cut ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .slv_req_i (offload_dca_req), + .slv_rsp_o (offload_dca_rsp), + .mst_req_o (offload_dca_req_cut), + .mst_rsp_i (offload_dca_rsp_cut) + ); + // Connect the Response + assign offload_wide_resp_valid = offload_dca_rsp.p_valid; + assign offload_dca_req.p_ready = offload_wide_resp_ready; + assign offload_wide_resp_data = offload_dca_rsp.p.result; + + // No Wide Reduction supported + end else begin : gen_no_wide_reduction + assign offload_dca_req_cut = '0; + assign offload_dca_rsp = '0; + assign offload_wide_req_ready = '0; + assign offload_wide_resp_data = '0; + assign offload_wide_resp_valid = '0; + end + + // TODO(lleone): Add teh narrow ALU reduction unit and connections here + + snitch_cluster_wrapper i_cluster ( .clk_i (tile_clk), .rst_ni (tile_rst_n), @@ -114,8 +224,8 @@ module cluster_tile .narrow_ext_resp_i(cluster_narrow_ext_rsp), .tcdm_ext_req_i (cluster_tcdm_ext_req_aligned), .tcdm_ext_resp_o (cluster_tcdm_ext_rsp_aligned), - .dca_req_i ('0), - .dca_rsp_o () + .dca_req_i (offload_dca_req_cut), + .dca_rsp_o (offload_dca_rsp_cut) ); if (UseHWPE) begin : gen_hwpe @@ -238,6 +348,7 @@ module cluster_tile .EnParallelReduction (EnParallelReduction), .EnOffloadWideReduction (EnWideOffloadReduction), .EnOffloadNarrowReduction (EnNarrowOffloadReduction), + .EnDecoupledRW (1'b1), .NoLoopback (1'b0), .NumRoutes (5), .InFifoDepth (2), @@ -267,14 +378,14 @@ module cluster_tile .floo_wide_i (router_floo_wide_in), .floo_wide_o (router_floo_wide_out), // Wide Reduction offload 
port - .offload_wide_req_op_o (), - .offload_wide_req_operand1_o (), - .offload_wide_req_operand2_o (), - .offload_wide_req_valid_o (), - .offload_wide_req_ready_i ('0), - .offload_wide_resp_result_i ('0), - .offload_wide_resp_valid_i ('0), - .offload_wide_resp_ready_o (), + .offload_wide_req_op_o (offload_wide_req_operation), + .offload_wide_req_operand1_o (offload_wide_req_operand[0]), + .offload_wide_req_operand2_o (offload_wide_req_operand[1]), + .offload_wide_req_valid_o (offload_wide_req_valid), + .offload_wide_req_ready_i (offload_wide_req_ready), + .offload_wide_resp_result_i (offload_wide_resp_data), + .offload_wide_resp_valid_i (offload_wide_resp_valid), + .offload_wide_resp_ready_o (offload_wide_resp_ready), // Narrow Reduction offload port .offload_narrow_req_op_o (), .offload_narrow_req_operand1_o (), @@ -304,6 +415,7 @@ module cluster_tile .ChimneyCfgW (floo_pkg::ChimneyDefaultCfg), .RouteCfg (floo_picobello_noc_pkg::RouteCfg), .AtopSupport (1'b1), + .EnDecoupledRW (1'b1), .MaxAtomicTxns (1), .Sam (picobello_pkg::SamMcast), .id_t (floo_picobello_noc_pkg::id_t), diff --git a/hw/dummy_tile.sv b/hw/dummy_tile.sv index f45ae2f1..0908e66f 100644 --- a/hw/dummy_tile.sv +++ b/hw/dummy_tile.sv @@ -46,7 +46,8 @@ module dummy_tile .EnMultiCast (1'b0), .EnParallelReduction (1'b0), .EnOffloadWideReduction (1'b0), - .EnOffloadNarrowReduction (1'b0) + .EnOffloadNarrowReduction (1'b0), + .EnDecoupledRW (1'b1) ) i_router ( .clk_i, .rst_ni, diff --git a/hw/fhg_spu_tile.sv b/hw/fhg_spu_tile.sv index 35825df7..a9da71b8 100644 --- a/hw/fhg_spu_tile.sv +++ b/hw/fhg_spu_tile.sv @@ -64,7 +64,8 @@ module fhg_spu_tile .EnMultiCast (1'b0), .EnParallelReduction (1'b0), .EnOffloadWideReduction (1'b0), - .EnOffloadNarrowReduction (1'b0) + .EnOffloadNarrowReduction (1'b0), + .EnDecoupledRW (1'b1) ) i_router ( .clk_i, .rst_ni, diff --git a/hw/mem_tile.sv b/hw/mem_tile.sv index 7fe09417..1a1a39d4 100644 --- a/hw/mem_tile.sv +++ b/hw/mem_tile.sv @@ -63,7 +63,8 @@ module mem_tile 
.EnMultiCast (1'b0), .EnParallelReduction (1'b0), .EnOffloadWideReduction (1'b0), - .EnOffloadNarrowReduction (1'b0) + .EnOffloadNarrowReduction (1'b0), + .EnDecoupledRW (1'b1) ) i_router ( .clk_i, .rst_ni, @@ -119,6 +120,7 @@ module mem_tile .ChimneyCfgW (set_ports(ChimneyDefaultCfg, 1'b1, 1'b0)), .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), + .EnDecoupledRW (1'b1), .MaxAtomicTxns (1), .Sam (Sam), .id_t (id_t), diff --git a/hw/spm_tile.sv b/hw/spm_tile.sv index b9bff9a9..1ce3d56d 100644 --- a/hw/spm_tile.sv +++ b/hw/spm_tile.sv @@ -98,7 +98,8 @@ module spm_tile .EnMultiCast (1'b0), .EnParallelReduction (1'b0), .EnOffloadWideReduction (1'b0), - .EnOffloadNarrowReduction (1'b0) + .EnOffloadNarrowReduction (1'b0), + .EnDecoupledRW (1'b1) ) i_router ( .clk_i, .rst_ni, @@ -157,6 +158,7 @@ module spm_tile .ChimneyCfgW (set_ports(ChimneyDefaultCfg, bit'(!IsNarrow), 1'b0)), .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), + .EnDecoupledRW (1'b1), .MaxAtomicTxns (1), .Sam (Sam), .id_t (id_t), From 28fd2faf5092a71445500d8532bb5f78b22530f6 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Sat, 4 Oct 2025 14:48:29 +0200 Subject: [PATCH 20/77] hw: Report destination address of collectives attempted in memtile --- hw/mem_tile.sv | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/hw/mem_tile.sv b/hw/mem_tile.sv index 1a1a39d4..d2399252 100644 --- a/hw/mem_tile.sv +++ b/hw/mem_tile.sv @@ -500,13 +500,21 @@ module mem_tile ); `endif // Add Assertion that no multicast / reduction can enter this tile! 
- for (genvar r = 0; r < 4; r++) begin : gen_virt - `ASSERT(NoCollectivOperation_NReq_In, (!floo_req_i[r].valid | (floo_req_i[r].req.generic.hdr.collective_op == Unicast))) + for (genvar r = 0; r < 4; r++) begin : gen_route_assertions + `ASSERT(NoCollectivOperation_NReq_In, (!floo_req_i[r].valid | (floo_req_i[r].req.generic.hdr.collective_op == Unicast)), + clk_i, !rst_ni, + $sformatf("Unsupported collective attempted with destination: %h", floo_req_i[r].req.narrow_aw.payload.addr)) `ASSERT(NoCollectivOperation_NRsp_In, (!floo_rsp_i[r].valid | (floo_rsp_i[r].rsp.generic.hdr.collective_op == Unicast))) - `ASSERT(NoCollectivOperation_NWide_In, (!floo_wide_i[r].valid | (floo_wide_i[r].wide.generic.hdr.collective_op == Unicast))) - `ASSERT(NoCollectivOperation_NReq_Out, (!floo_req_o[r].valid | (floo_req_o[r].req.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_In, (!floo_wide_i[r].valid | (floo_wide_i[r].wide.generic.hdr.collective_op == Unicast)), + clk_i, !rst_ni, + $sformatf("Unsupported collective attempted with destination: %h", floo_wide_i[r].wide.wide_aw.payload.addr)) + `ASSERT(NoCollectivOperation_NReq_Out, (!floo_req_o[r].valid | (floo_req_o[r].req.generic.hdr.collective_op == Unicast)), + clk_i, !rst_ni, + $sformatf("Unsupported collective attempted with destination: %h", floo_req_o[r].req.narrow_aw.payload.addr)) `ASSERT(NoCollectivOperation_NRsp_Out, (!floo_rsp_o[r].valid | (floo_rsp_o[r].rsp.generic.hdr.collective_op == Unicast))) - `ASSERT(NoCollectivOperation_NWide_Out, (!floo_wide_o[r].valid | (floo_wide_o[r].wide.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_Out, (!floo_wide_o[r].valid | (floo_wide_o[r].wide.generic.hdr.collective_op == Unicast)), + clk_i, !rst_ni, + $sformatf("Unsupported collective attempted with destination: %h", floo_wide_i[r].wide.wide_aw.payload.addr)) end endmodule From 59d62dcdce380ce358530d6078bb176e970d0f5d Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 29 
Sep 2025 14:30:23 +0200 Subject: [PATCH 21/77] experiments/mcast: Set up Python-based experiment infrastructure --- experiments.sh | 37 --- iis-env.sh | 2 + sw/snitch/experiments/util/quick_plot.py | 293 ----------------------- sw/snitch/tests/dma_multicast_v2.c | 240 +++++++++++++++++++ 4 files changed, 242 insertions(+), 330 deletions(-) delete mode 100755 experiments.sh delete mode 100755 sw/snitch/experiments/util/quick_plot.py create mode 100644 sw/snitch/tests/dma_multicast_v2.c diff --git a/experiments.sh b/experiments.sh deleted file mode 100755 index b227358a..00000000 --- a/experiments.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -TRAN_LEN_VALUES=(256 512 1024 2048 4096 8192) -MODE_VALUES=(SW_UNOPT SW_OPT SW_OPT2) -N_CLUSTERS_VALUES=(4 8 16) - -for N_CLUSTERS in "${N_CLUSTERS_VALUES[@]}"; do - for MODE in "${MODE_VALUES[@]}"; do - for TRAN_LEN in "${TRAN_LEN_VALUES[@]}"; do - - echo "=== Running with TRAN_LEN=${TRAN_LEN}, MODE=${MODE}, N_CLUSTERS=${N_CLUSTERS} ===" - - # Remove old ELF - rm -f sw/snitch/tests/build/dma_multicast.elf - - # Build - make sw DEBUG=ON TRAN_LEN=${TRAN_LEN} MODE=${MODE} N_CLUSTERS=${N_CLUSTERS} -j - - # Run simulation - make vsim-run-batch CHS_BINARY=sw/cheshire/tests/simple_offload.spm.elf \ - SN_BINARY=sw/snitch/tests/build/dma_multicast.elf \ - PRELMODE=3 - - # Annotate - make annotate DEBUG=ON -j - - # Move logs - DEST_DIR="logs_${TRAN_LEN}_${MODE}_${N_CLUSTERS}" - mkdir -p results/"${DEST_DIR}" - mv logs/*.s results/"${DEST_DIR}" - - echo "Logs saved to results/${DEST_DIR}" - echo "------------------------------------------------------------" - - done - done -done diff --git a/iis-env.sh b/iis-env.sh index 2e9f41b3..6a47bc37 100644 --- a/iis-env.sh +++ b/iis-env.sh @@ -20,3 +20,5 @@ fi if [ -z "$VIRTUAL_ENV" ] || [ "$VIRTUAL_ENV" != "$(realpath .venv)" ]; then source .venv/bin/activate fi + +export PYTHONPATH=$PWD/experiments/:$PYTHONPATH \ No newline at end of file diff --git a/sw/snitch/experiments/util/quick_plot.py 
b/sw/snitch/experiments/util/quick_plot.py deleted file mode 100755 index 547e3045..00000000 --- a/sw/snitch/experiments/util/quick_plot.py +++ /dev/null @@ -1,293 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2024 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Lorenzo Leone - -import os -import numpy as np -import matplotlib.pyplot as plt -from matplotlib import cm -from matplotlib.patches import Patch -from matplotlib.colors import ListedColormap - -PULP_COLORS = { - "PULP Blue": "#3067AB", - "PULP Petrol": "#84B5E9", - "PULP Green": "#168638", - "PULP Bronze": "#E59955", - "PULP Red": "#9B3B32", - "PULP Purple": "#851B66", - "PULP Grey": "#6F6F6F" -} - -def plot_multicast_speedup(clusters, transfer_sizes, speedups, - colors=None, - title="Multicast Speedup", - xlabel="Number of Clusters", - ylabel="Speedup", - legend_loc="upper left", - legend_ncol=1, - legend_frame=True, - save_name=None): - """ - Plots a bar chart of multicast speedup. 
- - Parameters: - - clusters: list of cluster counts (x-axis categories) - - transfer_sizes: list of transfer sizes (legend categories) - - speedups: 2D list of speedups where each row corresponds to clusters[i] - and each column corresponds to transfer_sizes[j] - - colors: list of colors for each transfer size - - title, xlabel, ylabel: strings for plot customization - - legend_loc: legend location (e.g., 'upper left', 'upper right') - - legend_ncol: number of columns in legend - - legend_frame: whether to show frame around legend - """ - - # Number of clusters and transfer sizes - n_clusters = len(clusters) - n_sizes = len(transfer_sizes) - - # Default colors if none are provided - if colors is None: - colors = plt.cm.viridis(np.linspace(0, 1, n_sizes)) - - # X positions for cluster groups - x = np.arange(n_clusters) - - # Width of each bar - bar_width = 0.8 / n_sizes - - fig, ax = plt.subplots(figsize=(10, 6)) - - # Plot each transfer size group - for i, size in enumerate(transfer_sizes): - offsets = x + i * bar_width - (bar_width * (n_sizes - 1) / 2) - ax.bar(offsets, - [speedups[j][i] for j in range(n_clusters)], - width=bar_width, - color=colors[i], - label=f"{size}") - - # Labels and title - ax.set_xlabel(xlabel, fontsize=14) - ax.set_ylabel(ylabel, fontsize=14) - ax.set_title(title, fontsize=16) - ax.set_xticks(x) - ax.set_xticklabels(clusters) - ax.tick_params(axis='both', which='major', labelsize=14) - - - # Legend customization - ax.legend(title="Transfer Size", loc=legend_loc, ncol=legend_ncol, frameon=legend_frame, fontsize=14, title_fontsize=16) - - # Grid for readability - ax.grid(axis="y", linestyle="--", alpha=0.7) - - fig.tight_layout() - - # Save plot if a name is provided - if save_name: - plots_dir = os.path.join(os.getcwd(), "plots") - os.makedirs(plots_dir, exist_ok=True) - file_path = os.path.join(plots_dir, f"{save_name}.svg") - plt.savefig(file_path, format="svg", bbox_inches="tight") - print(f"Plot saved to: {file_path}") - - return fig, 
ax - - -def _add_gradient_legend_inset( - ax_main, colors1, colors2, transfer_sizes, sw1_label, sw2_label, - inset_width=0.3, bar_height=0.04, gap=0.01, x0=0.02, y_top=0.92 -): - """Place two gradient bars in the top-left; bottom=SW1 (with labels), top=SW2 (no labels).""" - fig = ax_main.figure - - # Top bar = SW2 (no labels) - ax_sw2 = ax_main.inset_axes([x0, y_top - bar_height, inset_width, bar_height], - transform=ax_main.transAxes) - # Bottom bar = SW1 (with labels) - ax_sw1 = ax_main.inset_axes([x0, y_top - 2*bar_height - gap, inset_width, bar_height], - transform=ax_main.transAxes) - - def draw_bar(ax_bar, colors, title, show_labels=False, lgn_title=False): - data = np.arange(len(colors)).reshape(1, len(colors)) - cmap = ListedColormap(colors) - ax_bar.imshow(data, aspect='auto', cmap=cmap, extent=[0, len(colors), 0, 1]) - - if show_labels: - centers = np.arange(len(colors)) + 0.5 - ax_bar.set_xticks(centers) - ax_bar.set_xticklabels(transfer_sizes, fontsize=8, rotation=45, ha="right") - else: - ax_bar.set_xticks([]) - - ax_bar.set_yticks([]) - ax_bar.set_xlim(0, len(colors)) - - for spine in ax_bar.spines.values(): - spine.set_visible(False) - - # Title to the right - ax_bar.text(len(colors) + 0.2, 0.5, title, - fontsize=9, va='center', ha='left', transform=ax_bar.transData) - - # DMA Transfer Size label centered over the 6 colors - if lgn_title: - ax_bar.text(3, 1.4, "DMA Transfer Size", # x=3 is center of 0..6 range - fontsize=9, ha='center', va='bottom', fontweight="bold", - transform=ax_bar.transData) - - # Draw bars - draw_bar(ax_sw2, colors2, sw2_label, show_labels=False, lgn_title=True) - draw_bar(ax_sw1, colors1, sw1_label, show_labels=True, lgn_title=False) - -def plot_two_software_speedup( - clusters, - transfer_sizes, - sw1_speedups, - sw2_speedups, - sw1_label="Software 1", - sw2_label="Software 2", - cmap1="viridis", - cmap2="magma", - title="Speedup by Transfer Size and Software", - xlabel="Number of Clusters", - ylabel="Speedup", - 
save_name=None -): - n_clusters = len(clusters) - x = np.arange(n_clusters) - bars_per_group = 12 - bar_width = 0.8 / bars_per_group - - colors1 = cm.get_cmap(cmap1)(np.linspace(0.15, 0.85, 6)) - colors2 = cm.get_cmap(cmap2)(np.linspace(0.15, 0.85, 6)) - - fig, ax = plt.subplots(figsize=(12, 6)) - - for i in range(6): - offsets_sw1 = x + i * bar_width - (bar_width * (bars_per_group - 1) / 2) - offsets_sw2 = x + (6 + i) * bar_width - (bar_width * (bars_per_group - 1) / 2) - - ax.bar(offsets_sw1, [sw1_speedups[j][i] for j in range(n_clusters)], - width=bar_width, color=colors1[i]) - ax.bar(offsets_sw2, [sw2_speedups[j][i] for j in range(n_clusters)], - width=bar_width, color=colors2[i]) - - ax.set_title(title, fontsize=16) - ax.set_xlabel(xlabel, fontsize=14) - ax.set_ylabel(ylabel, fontsize=14) - ax.set_xticks(x) - ax.set_xticklabels(clusters) - ax.grid(axis="y", linestyle="--", alpha=0.7) - - # Add gradient legend in top-left corner - _add_gradient_legend_inset(ax, colors1, colors2, transfer_sizes, sw1_label, sw2_label) - - fig.tight_layout() - - # Save plot if a name is provided - if save_name: - plots_dir = os.path.join(os.getcwd(), "plots") - os.makedirs(plots_dir, exist_ok=True) - file_path = os.path.join(plots_dir, f"{save_name}.svg") - plt.savefig(file_path, format="svg", bbox_inches="tight") - print(f"Plot saved to: {file_path}") - - return fig, ax - -# ===== Example Usage ===== -def main(): - clusters = [4, 8, 16] - transfer_sizes = ["1 KiB", "2 KiB", "4 KiB", "8 KiB", "16 KiB", "32 KiB"] - - # Example speedups (rows = clusters, cols = transfer sizes) - speedups_mcast_unopt = [ - [3.92, 3.953846154, 3.76, 3.915151515, 3.969491525, 3.992857143], # For 4 clusters - [7.7, 7.738461538, 7.45, 7.848484848, 7.983050847, 7.859649123], # For 8 clusters - [15.24, 15.66153846, 15.02, 15.73939394, 15.88813559, 15.67894737] # For 16 clusters -] - - speedups_sw1_unopt = [ - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], # For 4 clusters - [0.891203704, 0.994071146, 1.125377644, 
1.29241517, # For 8 clusters - 1.424682396, 1.509942703], - [1.448669202, 1.583203733, 1.796650718, 1.965934898, # For 16 clusters - 2.099910394, 2.192590775] - ] - - speedups_sw2_unopt = [ - [1.02617801, 1.088983051, 1.153374233, 1.216572505, 1.271444083, 1.303030303], # For 4 clusters - [1.370106762, 1.462209302, 1.616052061, 1.75951087, 1.875, 1.942758023], # For 8 clusters - [2.01055409, 2.237362637, 2.486754967, 2.736564805, 2.931207004, 3.066918325] # For 16 clusters -] - - speedups_sw2_sw1 = [ - [1.02617801, 1.088983051, 1.153374233, 1.216572505, 1.271444083, 1.303030303], # For 4 clusters - [1.537366548, 1.470930233, 1.436008677, 1.361413043, 1.316082803, 1.286643539], # For 8 clusters - [1.387862797, 1.413186813, 1.38410596, 1.39199157, 1.39587242, 1.398764585] # For 16 clusters - ] - - speedups_optimized = [ - [3.82, 3.630769231, 3.26, 3.218181818, 3.122033898, 3.064285714], # For 4 clusters - [5.62, 5.292307692, 4.61, 4.460606061, 4.257627119, 4.045614035], # For 8 clusters - [7.58, 7.0, 6.04, 5.751515152, 5.420338983, 5.112280702] # For 16 clusters - ] - - # Select PULP colors for the 6 transfer sizes - custom_colors = [ - PULP_COLORS["PULP Blue"], - PULP_COLORS["PULP Petrol"], - PULP_COLORS["PULP Green"], - PULP_COLORS["PULP Bronze"], - PULP_COLORS["PULP Red"], - PULP_COLORS["PULP Purple"] - ] - - plot_multicast_speedup(clusters, transfer_sizes, speedups_mcast_unopt, - title="Multicast Speedup against unoptimized software version", - xlabel="Number of Clusters", - ylabel="Speedup", - legend_loc="upper left", - legend_ncol=2, - save_name="mcast_vs_unopt") - - plot_multicast_speedup(clusters, transfer_sizes, speedups_sw2_sw1, - title="Speedup of SW OPT2 implementation against SW OPT1", - xlabel="Number of Clusters", - ylabel="Speedup", - legend_loc="upper left", - legend_ncol=2, - save_name="sw2_vs_sw1") - - plot_multicast_speedup(clusters, transfer_sizes, speedups_optimized, - title="Multicast Speedup against best software version", - xlabel="Number of 
Clusters", - ylabel="Speedup", - legend_loc="upper left", - legend_ncol=2, - save_name="mcast_vs_optimized") - - plot_two_software_speedup( - clusters, - transfer_sizes, - speedups_sw1_unopt, speedups_sw2_unopt, - sw1_label="Column first", - sw2_label="Binary tree", - cmap1="viridis", - cmap2="magma", - title="Speedup of different Software schemes", - xlabel="Number of Clusters", - ylabel="Speedup", - save_name="sw_comparison" - ) - - - plt.show() - -if __name__ == '__main__': - main() diff --git a/sw/snitch/tests/dma_multicast_v2.c b/sw/snitch/tests/dma_multicast_v2.c new file mode 100644 index 00000000..c231f0b9 --- /dev/null +++ b/sw/snitch/tests/dma_multicast_v2.c @@ -0,0 +1,240 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande +// Lorenzo Leone +// +// This code implements a function to multicast data from one memory tile +// to all clusters in the same row. 
+ +#define SNRT_ENABLE_NARROW_REDUCTION +#define SNRT_ENABLE_NARROW_MULTICAST + +#include +#include "snrt.h" + +// Transfer size in bytes +#ifndef SIZE +#define SIZE 1024 +#endif + +// Batch size in bytes +#ifndef BATCH +#define BATCH 64 +#endif + +#define N_BATCHES (SIZE / BATCH) +#define N_ELEMS (SIZE / sizeof(uint32_t)) + +#define ELEMS_TO_CHECK 32 + +#ifndef N_CLUSTERS_TO_USE +#define N_CLUSTERS_TO_USE 4 +#endif + +// TODO(colluca): calculate from non-log2 function +static inline uint32_t pb_log2_cluster_num_in_col() { + return 2; +} + +static inline uint32_t pb_cluster_num_in_row() { + return 4; +} + +static inline uint32_t pb_cluster_row_idx() { + return snrt_cluster_idx() % pb_cluster_num_in_row(); +} + +static inline uint32_t pb_cluster_in_row(uint32_t row_idx) { + return pb_cluster_row_idx() == row_idx; +} + +static inline uint32_t pb_cluster_idx_in_row() { + return snrt_cluster_idx() / pb_cluster_num_in_row(); +} + +static inline uint32_t pb_cluster_left_neighbour() { + return snrt_cluster_idx() - pb_cluster_num_in_row(); +} + +static inline void pb_create_row_comm(uint32_t row, snrt_comm_t *comm) { + // Allocate communicator struct in L1 and point to it. + *comm = + (snrt_comm_t)snrt_l1_alloc_cluster_local(sizeof(snrt_comm_info_t)); + + // Allocate barrier counter in L1. Only the zero-th cluster's is actually + // used, but we want to keep all clusters' L1 allocators aligned. Thus, + // only cluster 0 initializes its barrier counter. A global barrier is + // then used to ensure all cores "see" the initialized value. + void *barrier_ptr = snrt_l1_alloc_cluster_local(sizeof(uint32_t)); + barrier_ptr = snrt_remote_l1_ptr(barrier_ptr, snrt_cluster_idx(), 0); + if (snrt_global_core_idx() == 0) *(uint32_t *)barrier_ptr = 0; + snrt_global_barrier(); + + // Initialize communicator, pointing to the newly-allocated barrier + // counter in L3. 
+ (*comm)->size = pb_cluster_num_in_row(); + (*comm)->mask = row | + ((pb_cluster_num_in_row() - 1) << pb_log2_cluster_num_in_col()); + (*comm)->base = row; + (*comm)->barrier_ptr = (uint32_t *)barrier_ptr; + (*comm)->is_participant = pb_cluster_in_row(row); +} + +// L1 buffer of every cluster invoking this function should be +// at the same offset in the TCDM +static inline void dma_multicast(uintptr_t l1_buffer, uintptr_t l3_buffer, + snrt_comm_t comm) { + +#ifdef ENABLE_WIDE_MULTICAST + // Only DMA core of cluster 0 continues past this point + if (!snrt_is_dm_core() || !(snrt_cluster_idx() == 0)) return; + + // Hardware multicast transfer + uint64_t mask = snrt_get_collective_mask(comm); + snrt_dma_enable_multicast(mask); + snrt_mcycle(); + snrt_dma_start_1d(l1_buffer, l3_buffer, SIZE); + snrt_dma_disable_multicast(); + snrt_dma_wait_all(); +#else + // Only DMA cores of clusters in the first row participate + if (!snrt_is_dm_core() || !comm->is_participant) return; + + // Compute address of source buffer (in left tile) + uintptr_t src; + if (snrt_cluster_idx() == 0) { + src = l3_buffer; + } else { + src = (uintptr_t)snrt_remote_l1_ptr((void *)l1_buffer, + snrt_cluster_idx(), pb_cluster_left_neighbour()); + } + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. 
+ snrt_collective_op_t op; + op.f.collective_op = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Iterations to cover all transfers + uint32_t n_iters = N_BATCHES - 1 + N_CLUSTERS_TO_USE; + for (uint32_t i = 0; i < n_iters; i++) { + + // Every cluster is active for N_BATCHES iterations + // starting from the iteration with i == snrt_cluster_idx() + // TODO(colluca): use communicator + char cluster_active = i >= pb_cluster_idx_in_row(); + cluster_active &= i < (pb_cluster_idx_in_row() + N_BATCHES); + + // Every active cluster copies data from the left tile + if (cluster_active) { + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(l1_buffer, src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + l1_buffer += BATCH; + src += BATCH; + asm volatile ("" : "+r"(l1_buffer), "+r"(src) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. + snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } +#endif +} + +// Global variables for verification script +uint32_t output[N_ELEMS * N_CLUSTERS_TO_USE]; +uint32_t n_clusters = N_CLUSTERS_TO_USE; +uint32_t length = N_ELEMS * N_CLUSTERS_TO_USE; + +int main() { + // Enable interrupts and clear pending interrupt + snrt_interrupt_enable(IRQ_M_CLUSTER); + snrt_int_clr_mcip(); + + // Allocate 4KiB-aligned buffer in every cluster + uint32_t *buffer = (uint32_t *)snrt_l1_alloc_cluster_local(SIZE, 4096); + + // Allocate 4KiB-aligned buffer in memory tile + uint32_t *l3_buffer = (uint32_t *)snrt_l3_alloc_v2(SIZE, 4096); + + // First cluster initializes the L3 buffer. 
+ if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { + DUMP(l3_buffer); + for (uint32_t i = 0; i < N_ELEMS; i++) { + l3_buffer[i] = i + 1; + } + } + + // Every cluster in row 0 initializes its destination buffer + if (snrt_is_dm_core() && pb_cluster_in_row(0)) { + snrt_dma_start_1d( + (uintptr_t)buffer, (uintptr_t)(snrt_cluster()->zeromem.mem), SIZE); + snrt_dma_wait_all(); + } + + // Create communicator for row 0 (all other clusters are inactive) + // TODO(colluca): extend to use N_CLUSTERS_TO_USE + snrt_comm_t comm; + pb_create_row_comm(0, &comm); + + // Only DMA cores of clusters in the first row continue from here + if (!snrt_is_dm_core() || !comm->is_participant) return 0; + + // Print participating clusters + if (snrt_is_dm_core() && snrt_cluster_idx() == 0) { + uint32_t mask = comm->mask; + uint32_t fixed = comm->base & ~mask; + uint32_t submask = 0; + do { + uint32_t i = fixed | submask; + DUMP(i); + submask = (submask - 1) & mask; + } while (submask != 0); + } + + // Synchronize all clusters. 
+ snrt_inter_cluster_barrier(comm); + + // Initiate multicast transfer (twice to preheat the cache) + for (volatile int i = 0; i < 2; i++) { + snrt_mcycle(); + dma_multicast((uintptr_t)buffer, (uintptr_t)l3_buffer, comm); + snrt_mcycle(); + snrt_inter_cluster_barrier(comm); + } + + // Writeback to L3 + if (snrt_is_dm_core() && pb_cluster_in_row(0)) { + uint32_t offset = pb_cluster_idx_in_row() * SIZE; + snrt_dma_start_1d( + (uintptr_t)output + offset, (uintptr_t)buffer, SIZE); + snrt_dma_wait_all(); + } + + // Every cluster checks that the data in its buffer is correct + if (snrt_is_dm_core() && pb_cluster_in_row(0)) { + uint32_t n_errs = ELEMS_TO_CHECK; + uint32_t stride = N_ELEMS / ELEMS_TO_CHECK; + for (uint32_t i = 0; i < N_ELEMS; i += stride) + if (buffer[i] == (i + 1)) n_errs--; + return n_errs; + } else + return 0; + } From 00786492e1296b2c5479d77e076f854e2bb0b317 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Sun, 5 Oct 2025 19:17:43 +0200 Subject: [PATCH 22/77] hw: Add two physical channels on wide_out interface to local port --- hw/cluster_tile.sv | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 6d59b582..fbd48060 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -336,11 +336,18 @@ module cluster_tile // Router // //////////// + typedef struct packed { + logic [1:0] valid; + logic [1:0] ready; + floo_wide_chan_t [1:0] wide; + } floo_wide_double_t; + floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; - floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in; + floo_wide_double_t [Eject:North] router_floo_wide_out; + floo_wide_t [Eject:North] router_floo_wide_in; - floo_nw_router #( + floo_nw_router_v2 #( .AxiCfgN (AxiCfgN), .AxiCfgW (AxiCfgW), .RouteAlgo (RouteCfg.RouteAlgo), @@ -358,6 +365,7 @@ module cluster_tile .floo_req_t (floo_req_t), .floo_rsp_t 
(floo_rsp_t), .floo_wide_t (floo_wide_t), + .floo_wide_out_t (floo_wide_double_t), .RdWideOperation_t (floo_pkg::collect_op_e), .RdNarrowOperation_t (floo_pkg::collect_op_e), .RdWideData_t (RdDataWide_t), @@ -401,14 +409,19 @@ module cluster_tile assign router_floo_req_in[West:North] = floo_req_i; assign floo_rsp_o = router_floo_rsp_out[West:North]; assign router_floo_rsp_in[West:North] = floo_rsp_i; - assign floo_wide_o = router_floo_wide_out[West:North]; + // Only the local port uses both physical channels. Other outputs use only the lower. + for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o + assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; + assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; + assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; + end assign router_floo_wide_in[West:North] = floo_wide_i; ///////////// // Chimney // ///////////// - floo_nw_chimney #( + floo_nw_chimney_v2 #( .AxiCfgN (floo_picobello_noc_pkg::AxiCfgN), .AxiCfgW (floo_picobello_noc_pkg::AxiCfgW), .ChimneyCfgN (floo_pkg::ChimneyDefaultCfg), @@ -435,6 +448,7 @@ module cluster_tile .floo_req_t (floo_picobello_noc_pkg::floo_req_t), .floo_rsp_t (floo_picobello_noc_pkg::floo_rsp_t), .floo_wide_t (floo_picobello_noc_pkg::floo_wide_t), + .floo_wide_in_t (floo_wide_double_t), .sram_cfg_t (snitch_cluster_pkg::sram_cfg_t), .user_narrow_struct_t (picobello_pkg::collective_narrow_user_t), .user_wide_struct_t (picobello_pkg::collective_wide_user_t) From 0d77e66726d188738fb53ddc99082fa9572ae6d9 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 8 Oct 2025 12:08:59 +0200 Subject: [PATCH 23/77] gitignore: Ignore `__pycache__` folders --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 43cd241c..8128d3ca 100644 --- a/.gitignore +++ b/.gitignore @@ -8,9 +8,10 @@ .bender working_dir/ -# Python virtual environment +# Python virtual environment and caches .cache/pip .venv +__pycache__ # 
Generated source files .generated From b68733ddc62b8843ba9efc2dd930fe4ba3d1b17a Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 8 Oct 2025 13:47:26 +0200 Subject: [PATCH 24/77] sw: Split runtime into impl and src folder --- sw/snitch/runtime/{src => impl}/.gitignore | 0 sw/snitch/runtime/{src => impl}/memory.ld | 0 sw/snitch/runtime/{src => impl}/pb_memory.h | 0 sw/snitch/runtime/{src => impl}/pb_noc_cfg.h | 3 +++ sw/snitch/runtime/{src => impl}/putchar.c | 0 .../snitch_cluster_addrmap.rdl.tpl | 0 .../{src => impl}/snitch_cluster_cfg.h.tpl | 0 .../{src => impl}/snitch_cluster_memory.c | 0 .../{src => impl}/snitch_cluster_start.c | 0 .../{src => impl}/snitch_cluster_start.h | 0 sw/snitch/runtime/{src => impl}/snrt.S | 0 sw/snitch/runtime/impl/snrt.cc | 26 +++++++++++++++++++ sw/snitch/runtime/{src => impl}/snrt.h | 5 +++- sw/snitch/runtime/src/snrt.cc | 1 - sw/sw.mk | 3 ++- 15 files changed, 35 insertions(+), 3 deletions(-) rename sw/snitch/runtime/{src => impl}/.gitignore (100%) rename sw/snitch/runtime/{src => impl}/memory.ld (100%) rename sw/snitch/runtime/{src => impl}/pb_memory.h (100%) rename sw/snitch/runtime/{src => impl}/pb_noc_cfg.h (69%) rename sw/snitch/runtime/{src => impl}/putchar.c (100%) rename sw/snitch/runtime/{src => impl}/snitch_cluster_addrmap.rdl.tpl (100%) rename sw/snitch/runtime/{src => impl}/snitch_cluster_cfg.h.tpl (100%) rename sw/snitch/runtime/{src => impl}/snitch_cluster_memory.c (100%) rename sw/snitch/runtime/{src => impl}/snitch_cluster_start.c (100%) rename sw/snitch/runtime/{src => impl}/snitch_cluster_start.h (100%) rename sw/snitch/runtime/{src => impl}/snrt.S (100%) create mode 100644 sw/snitch/runtime/impl/snrt.cc rename sw/snitch/runtime/{src => impl}/snrt.h (97%) delete mode 120000 sw/snitch/runtime/src/snrt.cc diff --git a/sw/snitch/runtime/src/.gitignore b/sw/snitch/runtime/impl/.gitignore similarity index 100% rename from sw/snitch/runtime/src/.gitignore rename to sw/snitch/runtime/impl/.gitignore diff --git 
a/sw/snitch/runtime/src/memory.ld b/sw/snitch/runtime/impl/memory.ld similarity index 100% rename from sw/snitch/runtime/src/memory.ld rename to sw/snitch/runtime/impl/memory.ld diff --git a/sw/snitch/runtime/src/pb_memory.h b/sw/snitch/runtime/impl/pb_memory.h similarity index 100% rename from sw/snitch/runtime/src/pb_memory.h rename to sw/snitch/runtime/impl/pb_memory.h diff --git a/sw/snitch/runtime/src/pb_noc_cfg.h b/sw/snitch/runtime/impl/pb_noc_cfg.h similarity index 69% rename from sw/snitch/runtime/src/pb_noc_cfg.h rename to sw/snitch/runtime/impl/pb_noc_cfg.h index bc285ba6..266b2191 100644 --- a/sw/snitch/runtime/src/pb_noc_cfg.h +++ b/sw/snitch/runtime/impl/pb_noc_cfg.h @@ -6,3 +6,6 @@ #define PB_CLUSTER_PER_ROW 4 #define PB_CLUSTER_PER_COL 4 +// TODO(colluca): derive from previous or viceversa +#define PB_LOG2_CLUSTER_PER_ROW 2 +#define PB_LOG2_CLUSTER_PER_COL 2 diff --git a/sw/snitch/runtime/src/putchar.c b/sw/snitch/runtime/impl/putchar.c similarity index 100% rename from sw/snitch/runtime/src/putchar.c rename to sw/snitch/runtime/impl/putchar.c diff --git a/sw/snitch/runtime/src/snitch_cluster_addrmap.rdl.tpl b/sw/snitch/runtime/impl/snitch_cluster_addrmap.rdl.tpl similarity index 100% rename from sw/snitch/runtime/src/snitch_cluster_addrmap.rdl.tpl rename to sw/snitch/runtime/impl/snitch_cluster_addrmap.rdl.tpl diff --git a/sw/snitch/runtime/src/snitch_cluster_cfg.h.tpl b/sw/snitch/runtime/impl/snitch_cluster_cfg.h.tpl similarity index 100% rename from sw/snitch/runtime/src/snitch_cluster_cfg.h.tpl rename to sw/snitch/runtime/impl/snitch_cluster_cfg.h.tpl diff --git a/sw/snitch/runtime/src/snitch_cluster_memory.c b/sw/snitch/runtime/impl/snitch_cluster_memory.c similarity index 100% rename from sw/snitch/runtime/src/snitch_cluster_memory.c rename to sw/snitch/runtime/impl/snitch_cluster_memory.c diff --git a/sw/snitch/runtime/src/snitch_cluster_start.c b/sw/snitch/runtime/impl/snitch_cluster_start.c similarity index 100% rename from 
sw/snitch/runtime/src/snitch_cluster_start.c rename to sw/snitch/runtime/impl/snitch_cluster_start.c diff --git a/sw/snitch/runtime/src/snitch_cluster_start.h b/sw/snitch/runtime/impl/snitch_cluster_start.h similarity index 100% rename from sw/snitch/runtime/src/snitch_cluster_start.h rename to sw/snitch/runtime/impl/snitch_cluster_start.h diff --git a/sw/snitch/runtime/src/snrt.S b/sw/snitch/runtime/impl/snrt.S similarity index 100% rename from sw/snitch/runtime/src/snrt.S rename to sw/snitch/runtime/impl/snrt.S diff --git a/sw/snitch/runtime/impl/snrt.cc b/sw/snitch/runtime/impl/snrt.cc new file mode 100644 index 00000000..74f12312 --- /dev/null +++ b/sw/snitch/runtime/impl/snrt.cc @@ -0,0 +1,26 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "snrt.h" + +#include "alloc.c" +#include "alloc_v2.c" +#include "cls.c" +#include "cluster_interrupts.c" +#include "dm.c" +#include "dma.c" +#include "eu.c" +#include "kmp.c" +#include "omp.c" +#include "printf.c" +#include "putchar.c" +#include "riscv.c" +#include "snitch_cluster_memory.c" +#include "snitch_cluster_start.c" +#include "ssr.c" +#include "sync.c" +#include "team.c" + +#include "pb_team.c" +#include "pb_sync.c" diff --git a/sw/snitch/runtime/src/snrt.h b/sw/snitch/runtime/impl/snrt.h similarity index 97% rename from sw/snitch/runtime/src/snrt.h rename to sw/snitch/runtime/impl/snrt.h index a7bdb4aa..fd8be02e 100644 --- a/sw/snitch/runtime/src/snrt.h +++ b/sw/snitch/runtime/impl/snrt.h @@ -55,7 +55,6 @@ typedef snitch_cluster__stride40000_t snitch_cluster_t; #include "sync.h" #include "team.h" #include "types.h" -#include "pb_team.h" // Accelerators #include "datamover/archi_datamover.h" @@ -64,3 +63,7 @@ typedef snitch_cluster__stride40000_t snitch_cluster_t; #include "redmule/archi_redmule.h" #include "redmule/hal_redmule.h" #include "redmule/redmule_utils.h" + +// 
Picobello specific +#include "pb_team.h" +#include "pb_sync.h" diff --git a/sw/snitch/runtime/src/snrt.cc b/sw/snitch/runtime/src/snrt.cc deleted file mode 120000 index 686f3cf5..00000000 --- a/sw/snitch/runtime/src/snrt.cc +++ /dev/null @@ -1 +0,0 @@ -../../../../.deps/snitch_cluster/sw/runtime/impl/snrt.cc \ No newline at end of file diff --git a/sw/sw.mk b/sw/sw.mk index 83489072..e5d03c21 100644 --- a/sw/sw.mk +++ b/sw/sw.mk @@ -22,12 +22,13 @@ PB_GEN_DIR = $(PB_ROOT)/.generated ## Snitch Cluster ## #################### -SN_RUNTIME_SRCDIR = $(PB_SNITCH_SW_DIR)/runtime/src +SN_RUNTIME_SRCDIR = $(PB_SNITCH_SW_DIR)/runtime/impl SN_RUNTIME_BUILDDIR = $(PB_SNITCH_SW_DIR)/runtime/build SN_TESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/tests/build SN_RVTESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/riscv-tests/build SN_RUNTIME_INCDIRS = $(PB_INCDIR) SN_RUNTIME_INCDIRS += $(PB_GEN_DIR) +SN_RUNTIME_INCDIRS += $(PB_SNITCH_SW_DIR)/runtime/src SN_RUNTIME_HAL_HDRS = $(PB_GEN_DIR)/pb_addrmap.h SN_RUNTIME_HAL_HDRS += $(PB_GEN_DIR)/pb_raw_addrmap.h SN_BUILD_APPS = OFF From bd0eabc95fb8f3b8a983c74e3424fce856f773b2 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 8 Oct 2025 13:47:52 +0200 Subject: [PATCH 25/77] sw: Add `pb_team` and `pb_sync` functions --- sw/snitch/runtime/src/pb_sync.c | 6 ++ sw/snitch/runtime/src/pb_sync.h | 31 +++++++ sw/snitch/runtime/src/pb_team.c | 38 +++++++-- sw/snitch/runtime/src/pb_team.h | 147 ++++++++++++++++++++++++++------ 4 files changed, 189 insertions(+), 33 deletions(-) create mode 100644 sw/snitch/runtime/src/pb_sync.c create mode 100644 sw/snitch/runtime/src/pb_sync.h diff --git a/sw/snitch/runtime/src/pb_sync.c b/sw/snitch/runtime/src/pb_sync.c new file mode 100644 index 00000000..46a5cb0b --- /dev/null +++ b/sw/snitch/runtime/src/pb_sync.c @@ -0,0 +1,6 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +extern inline void pb_create_mesh_comm(snrt_comm_t *comm, uint32_t n_rows, + uint32_t n_cols); diff --git a/sw/snitch/runtime/src/pb_sync.h b/sw/snitch/runtime/src/pb_sync.h new file mode 100644 index 00000000..33955499 --- /dev/null +++ b/sw/snitch/runtime/src/pb_sync.h @@ -0,0 +1,31 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +inline void pb_create_mesh_comm(snrt_comm_t *comm, uint32_t n_rows, + uint32_t n_cols) { + // Allocate communicator struct in L1 and point to it. + *comm = + (snrt_comm_t)snrt_l1_alloc_cluster_local(sizeof(snrt_comm_info_t)); + + // Allocate barrier counter in L1. Only the zero-th cluster's is actually + // used, but we want to keep all clusters' L1 allocators aligned. Thus, + // only cluster 0 initializes its barrier counter. A global barrier is + // then used to ensure all cores "see" the initialized value. + void *barrier_ptr = snrt_l1_alloc_cluster_local(sizeof(uint32_t)); + barrier_ptr = snrt_remote_l1_ptr(barrier_ptr, snrt_cluster_idx(), 0); + if (snrt_global_core_idx() == 0) *(uint32_t *)barrier_ptr = 0; + snrt_global_barrier(); + + // Initialize communicator, pointing to the newly-allocated barrier + // counter in L3. + (*comm)->size = n_rows * n_cols; + (*comm)->mask = ((n_cols - 1) << PB_LOG2_CLUSTER_PER_COL) | + (n_rows - 1); + (*comm)->base = 0; + (*comm)->barrier_ptr = (uint32_t *)barrier_ptr; + (*comm)->is_participant = (pb_cluster_row_idx() < n_rows) && + (pb_cluster_col_idx() < n_cols); +} diff --git a/sw/snitch/runtime/src/pb_team.c b/sw/snitch/runtime/src/pb_team.c index cd280f61..0a67b710 100644 --- a/sw/snitch/runtime/src/pb_team.c +++ b/sw/snitch/runtime/src/pb_team.c @@ -1,4 +1,4 @@ -// Copyright 2023 ETH Zurich and University of Bologna. +// Copyright 2025 ETH Zurich and University of Bologna. 
// Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 @@ -6,14 +6,40 @@ extern inline uintptr_t pb_l2_tile_address(uint32_t tile_idx); extern inline uintptr_t pb_l2_tile_offset(uintptr_t src_addr); -extern inline uint32_t pb_cluster_row(uint32_t cidx); +extern inline constexpr uint32_t pb_log2_cluster_num_in_col(); -extern inline uint32_t pb_cluster_row(); +extern inline constexpr uint32_t pb_log2_cluster_num_in_row(); -extern inline uint32_t pb_cluster_col(uint32_t cidx); +extern inline constexpr uint32_t pb_cluster_num_in_row(); -extern inline uint32_t pb_cluster_col(); +extern inline constexpr uint32_t pb_cluster_num_in_col(); -extern inline uint32_t pb_closest_mem_tile(uint32_t cidx); +extern inline uint32_t pb_cluster_row_idx(uint32_t cluster_idx); + +extern inline uint32_t pb_cluster_row_idx(); + +extern inline uint32_t pb_cluster_col_idx(uint32_t cluster_idx); + +extern inline uint32_t pb_cluster_col_idx(); + +extern inline uint32_t pb_calculate_cluster_idx(uint32_t row, uint32_t col); + +extern inline uint32_t pb_cluster_in_row(uint32_t row); + +extern inline uint32_t pb_cluster_in_col(uint32_t col); + +extern inline uint32_t pb_cluster_is_easternmost(); + +extern inline uint32_t pb_cluster_is_northernmost(); + +extern inline uint32_t pb_cluster_north_neighbour(); + +extern inline uint32_t pb_cluster_east_neighbour(); + +extern inline uint32_t pb_cluster_south_neighbour(); + +extern inline uint32_t pb_cluster_west_neighbour(); + +extern inline uint32_t pb_closest_mem_tile(uint32_t cluster_idx); extern inline uint32_t pb_closest_mem_tile(); diff --git a/sw/snitch/runtime/src/pb_team.h b/sw/snitch/runtime/src/pb_team.h index 1fa8d8d0..fa41866c 100644 --- a/sw/snitch/runtime/src/pb_team.h +++ b/sw/snitch/runtime/src/pb_team.h @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // // Lorenzo Leone - +// Luca Colagrande /** * @file @@ -30,60 +30,153 @@ inline uintptr_t pb_l2_tile_offset(uintptr_t 
src_addr) { PICOBELLO_ADDRMAP_L2_SPM_0_SIZE; } +/** + * @brief Get the log2 of the number of clusters in a row + */ + inline constexpr uint32_t pb_log2_cluster_num_in_row() { + // TODO(colluca): derive this from pb_cluster_num_in_row() or viceversa + return 2; +} /** - * @brief Get the NoC row index - * @param cidx The cluster index - * @return The Row index + * @brief Get the log2 of the number of clusters in a column */ -inline uint32_t pb_cluster_row(uint32_t cidx) -{ - return cidx % PB_CLUSTER_PER_ROW; +inline constexpr uint32_t pb_log2_cluster_num_in_col() { + // TODO(colluca): derive this from pb_cluster_num_in_col() or viceversa + return 2; +} + +/** + * @brief Get the number of clusters in a row + */ +inline constexpr uint32_t pb_cluster_num_in_row() { + return PB_CLUSTER_PER_ROW; +} + +/** + * @brief Get the number of clusters in a column + */ +inline constexpr uint32_t pb_cluster_num_in_col() { + return PB_CLUSTER_PER_COL; } /** - * @brief Get the NoC row index - * This is a convenience orload of pb_cluster_row() - * @return The Row index + * @brief Get the row index of a cluster + * @param cluster_idx The cluster index + * @return The row index relative to the first row of cluster tiles */ -inline uint32_t pb_cluster_row() +inline uint32_t pb_cluster_row_idx(uint32_t cluster_idx) { - return pb_cluster_row(snrt_cluster_idx()); + return cluster_idx % pb_cluster_num_in_col(); } +/** + * @brief Get the row index of the invoking cluster + * @return The row index relative to the first row of cluster tiles + */ +inline uint32_t pb_cluster_row_idx() +{ + return pb_cluster_row_idx(snrt_cluster_idx()); +} /** - * @brief Get the NoC column index - * @param cidx The cluster index - * @return The Column index + * @brief Get the column index of a cluster + * @param cluster_idx The cluster index + * @return The column index relative to the first column of cluster tiles */ -inline uint32_t pb_cluster_col(uint32_t cidx) +inline uint32_t pb_cluster_col_idx(uint32_t 
cluster_idx) { - return cidx / PB_CLUSTER_PER_COL; + return cluster_idx / pb_cluster_num_in_col(); } /** - * @brief Get the NoC column index - * This is a convenience orload of pb_cluster_row() - * @return The Column index + * @brief Get the column index of the invoking cluster + * @return The column index relative to the first column of cluster tiles */ -inline uint32_t pb_cluster_col() +inline uint32_t pb_cluster_col_idx() { - return pb_cluster_col(snrt_cluster_idx()); + return pb_cluster_col_idx(snrt_cluster_idx()); +} + +/** + * @brief Calculate the cluster index from its (row, col) coordinates + * @param row Row index relative to the first row of cluster tiles + * @param col Column index relative to the first column of cluster tiles + */ +inline uint32_t pb_calculate_cluster_idx(uint32_t row, uint32_t col) { + return col * pb_cluster_num_in_col() + row; +} + +/** + * @brief Test if cluster is in a given row + * @param row Row index relative to the first row of cluster tiles + */ +inline uint32_t pb_cluster_in_row(uint32_t row) { + return pb_cluster_row_idx() == row; +} + +/** + * @brief Test if cluster is in a given column + * @param col Column index relative to the first column of cluster tiles + */ +inline uint32_t pb_cluster_in_col(uint32_t col) { + return pb_cluster_col_idx() == col; +} + +/** + * @brief Test if cluster is in the easternmost column + */ +inline uint32_t pb_cluster_is_easternmost() { + return pb_cluster_in_col(pb_cluster_num_in_row() - 1); +} + +/** + * @brief Test if cluster is in the northernmost row + */ +inline uint32_t pb_cluster_is_northernmost() { + return pb_cluster_in_row(pb_cluster_num_in_col() - 1); +} + +/** + * @brief Get cluster index of north neighbour + */ +inline uint32_t pb_cluster_north_neighbour() { + return snrt_cluster_idx() + 1; } +/** + * @brief Get cluster index of east neighbour + */ +inline uint32_t pb_cluster_east_neighbour() { + return snrt_cluster_idx() + pb_cluster_num_in_row(); +} + +/** + * @brief Get 
cluster index of south neighbour + */ +inline uint32_t pb_cluster_south_neighbour() { + return snrt_cluster_idx() - 1; +} + +/** + * @brief Get cluster index of west neighbour + */ +inline uint32_t pb_cluster_west_neighbour() { + return snrt_cluster_idx() - pb_cluster_num_in_row(); +} /** * @brief Get the index of the closest memory tile - * @param cidx The cluster index - * @return Index of the closest memory tile to cidx + * @param cluster_idx The cluster index + * @return Index of the closest memory tile to cluster_idx */ -inline uint32_t pb_closest_mem_tile(uint32_t cidx) { - uint32_t row = pb_cluster_row(cidx); +inline uint32_t pb_closest_mem_tile(uint32_t cluster_idx) { + uint32_t row = pb_cluster_row_idx(cluster_idx); // e.g. with 4x4 matrix // first 8 clusters -> left column tiles 0..3 // clusters >= 8 -> right column tiles 4..7 - return (cidx < (snrt_cluster_num() / 2)) ? row : (row + PB_CLUSTER_PER_COL); + return (cluster_idx < (snrt_cluster_num() / 2)) ? + row : (row + PB_CLUSTER_PER_COL); } /** From c7cf4c23e503be2b179e02109487c3df10dfaff8 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 8 Oct 2025 12:09:56 +0200 Subject: [PATCH 26/77] experiments: Add wide multicast experiments --- experiments/.gitignore | 4 + experiments/dma_multicast/.gitignore | 4 + experiments/dma_multicast/README.md | 14 + experiments/dma_multicast/experiments.py | 67 +++ experiments/dma_multicast/quick_plot.py | 288 +++++++++++++ experiments/dma_multicast_v2/README.md | 32 ++ experiments/dma_multicast_v2/__init__.py | 9 + experiments/dma_multicast_v2/experiments.py | 138 ++++++ experiments/dma_multicast_v2/fit.py | 129 ++++++ experiments/dma_multicast_v2/model.py | 97 +++++ experiments/dma_multicast_v2/plot.py | 399 ++++++++++++++++++ experiments/dma_multicast_v2/roi/hw.json.tpl | 14 + experiments/dma_multicast_v2/roi/seq.json.tpl | 29 ++ .../dma_multicast_v2/roi/tree.json.tpl | 162 +++++++ experiments/dma_multicast_v2/verify.py | 34 ++ experiments/picobello.py | 53 +++ 
sw/snitch/tests/dma_multicast_v2.c | 311 +++++++++----- 17 files changed, 1679 insertions(+), 105 deletions(-) create mode 100644 experiments/.gitignore create mode 100644 experiments/dma_multicast/.gitignore create mode 100644 experiments/dma_multicast/README.md create mode 100755 experiments/dma_multicast/experiments.py create mode 100755 experiments/dma_multicast/quick_plot.py create mode 100644 experiments/dma_multicast_v2/README.md create mode 100644 experiments/dma_multicast_v2/__init__.py create mode 100755 experiments/dma_multicast_v2/experiments.py create mode 100755 experiments/dma_multicast_v2/fit.py create mode 100755 experiments/dma_multicast_v2/model.py create mode 100755 experiments/dma_multicast_v2/plot.py create mode 100644 experiments/dma_multicast_v2/roi/hw.json.tpl create mode 100644 experiments/dma_multicast_v2/roi/seq.json.tpl create mode 100644 experiments/dma_multicast_v2/roi/tree.json.tpl create mode 100755 experiments/dma_multicast_v2/verify.py create mode 100644 experiments/picobello.py diff --git a/experiments/.gitignore b/experiments/.gitignore new file mode 100644 index 00000000..7a3f208a --- /dev/null +++ b/experiments/.gitignore @@ -0,0 +1,4 @@ +build/ +runs/ +plots/ +results/ \ No newline at end of file diff --git a/experiments/dma_multicast/.gitignore b/experiments/dma_multicast/.gitignore new file mode 100644 index 00000000..45b44053 --- /dev/null +++ b/experiments/dma_multicast/.gitignore @@ -0,0 +1,4 @@ +build/ +runs/ +plots/ +results.csv \ No newline at end of file diff --git a/experiments/dma_multicast/README.md b/experiments/dma_multicast/README.md new file mode 100644 index 00000000..edcb834d --- /dev/null +++ b/experiments/dma_multicast/README.md @@ -0,0 +1,14 @@ +From the present directory: +```shell +./experiments.py --actions sw run -j +``` + +Run `dma_multicast_v2.elf`: +```shell +questa-2023.4 vsim -work /usr/scratch2/vulcano/colluca/workspace/REPOS/PICOBELLO/dca-snitch/target/sim/vsim/work -suppress 3009 -suppress 
8386 -suppress 13314 -quiet -64 +CHS_BINARY=sw/cheshire/tests/simple_offload.spm.elf +SN_BINARY=sw/snitch/tests/build/dma_multicast_v2.elf +PRELMODE=3 -voptargs=+acc tb_picobello_top -do "log -r /*; do wave.do; run -a;" +``` + +Verify `dma_multicast_v2.elf`: +```shell +./experiments/mcast/verify.py placeholder $PWD/sw/snitch/tests/build/dma_multicast_v2.elf --no-ipc --memdump $PWD/l2mem.bin --memaddr 0x70000000 +``` \ No newline at end of file diff --git a/experiments/dma_multicast/experiments.py b/experiments/dma_multicast/experiments.py new file mode 100755 index 00000000..02135c39 --- /dev/null +++ b/experiments/dma_multicast/experiments.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import picobello as pb +import snitch.util.experiments.experiment_utils as eu +from snitch.util.experiments.SimResults import SimRegion + + +class ExperimentManager(pb.ExperimentManager): + + def derive_axes(self, experiment): + return eu.derive_axes_from_keys(experiment, ['length', 'mode', 'n_clusters']) + + def derive_cdefines(self, experiment): + cdefs = { + 'N_CLUSTERS_TO_USE': experiment['n_clusters'], + 'MODE': experiment['mode'], + 'LENGTH': experiment['length'], + } + return cdefs + + +def gen_experiments(): + experiments = [] + # for length in [256]:#, 512, 1024, 2048, 4096, 8192]: + # for mode in ['SW_UNOPT']:#, 'SW_OPT', 'SW_OPT2', 'HW_MCAST']: + # for n_clusters in [4]:#, 8, 16]: + for length in [256, 512, 1024, 2048, 4096, 8192]: + for mode in ['SW_UNOPT', 'SW_OPT', 'SW_OPT2', 'HW_MCAST']: + for n_clusters in [4, 8, 16]: + experiments.append({ + 'app': 'dma_multicast', + 'cmd': pb.sim_cmd, + 'length': length, + 'mode': mode, + 'n_clusters': n_clusters, + }) + return experiments + + +def main(): + + manager = ExperimentManager(gen_experiments()) + manager.run() + + df = 
manager.get_results() + print(df) + + if not manager.perf_results_available: + return + + # Get runtime of region 3 (second invocation of `dma_broadcast_to_clusters`) + def get_cycles(row): + return row['results'].get_metric(SimRegion('hart_9', 3), 'cycles') + + df['cycles'] = df.apply(get_cycles, axis=1) + df.drop(labels=['results'], inplace=True, axis=1) + df.to_csv('results.csv', index=False) + print(df) + + +if __name__ == '__main__': + main() diff --git a/experiments/dma_multicast/quick_plot.py b/experiments/dma_multicast/quick_plot.py new file mode 100755 index 00000000..073df29f --- /dev/null +++ b/experiments/dma_multicast/quick_plot.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Lorenzo Leone + +import os +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import cm +from matplotlib.colors import ListedColormap + +PULP_COLORS = { + "PULP Blue": "#3067AB", + "PULP Petrol": "#84B5E9", + "PULP Green": "#168638", + "PULP Bronze": "#E59955", + "PULP Red": "#9B3B32", + "PULP Purple": "#851B66", + "PULP Grey": "#6F6F6F" +} + + +def plot_multicast_speedup(clusters, transfer_sizes, speedups, + colors=None, + title="Multicast Speedup", + xlabel="Number of Clusters", + ylabel="Speedup", + legend_loc="upper left", + legend_ncol=1, + legend_frame=True, + save_name=None): + """ + Plots a bar chart of multicast speedup. 
+ + Parameters: + - clusters: list of cluster counts (x-axis categories) + - transfer_sizes: list of transfer sizes (legend categories) + - speedups: 2D list of speedups where each row corresponds to clusters[i] + and each column corresponds to transfer_sizes[j] + - colors: list of colors for each transfer size + - title, xlabel, ylabel: strings for plot customization + - legend_loc: legend location (e.g., 'upper left', 'upper right') + - legend_ncol: number of columns in legend + - legend_frame: whether to show frame around legend + """ + + # Number of clusters and transfer sizes + n_clusters = len(clusters) + n_sizes = len(transfer_sizes) + + # Default colors if none are provided + if colors is None: + colors = plt.cm.viridis(np.linspace(0, 1, n_sizes)) + + # X positions for cluster groups + x = np.arange(n_clusters) + + # Width of each bar + bar_width = 0.8 / n_sizes + + fig, ax = plt.subplots(figsize=(10, 6)) + + # Plot each transfer size group + for i, size in enumerate(transfer_sizes): + offsets = x + i * bar_width - (bar_width * (n_sizes - 1) / 2) + ax.bar(offsets, + [speedups[j][i] for j in range(n_clusters)], + width=bar_width, + color=colors[i], + label=f"{size}") + + # Labels and title + ax.set_xlabel(xlabel, fontsize=14) + ax.set_ylabel(ylabel, fontsize=14) + ax.set_title(title, fontsize=16) + ax.set_xticks(x) + ax.set_xticklabels(clusters) + ax.tick_params(axis='both', which='major', labelsize=14) + + # Legend customization + ax.legend(title="Transfer Size", loc=legend_loc, ncol=legend_ncol, frameon=legend_frame, fontsize=14, title_fontsize=16) + + # Grid for readability + ax.grid(axis="y", linestyle="--", alpha=0.7) + + fig.tight_layout() + + # Save plot if a name is provided + if save_name: + plots_dir = os.path.join(os.getcwd(), "plots") + os.makedirs(plots_dir, exist_ok=True) + file_path = os.path.join(plots_dir, f"{save_name}.svg") + plt.savefig(file_path, format="svg", bbox_inches="tight") + print(f"Plot saved to: {file_path}") + + return fig, ax 
+ + +def _add_gradient_legend_inset( + ax_main, colors1, colors2, transfer_sizes, sw1_label, sw2_label, + inset_width=0.3, bar_height=0.04, gap=0.01, x0=0.02, y_top=0.92 +): + """Place two gradient bars in the top-left; bottom=SW1 (with labels), top=SW2 (no labels).""" + + # Top bar = SW2 (no labels) + ax_sw2 = ax_main.inset_axes([x0, y_top - bar_height, inset_width, bar_height], + transform=ax_main.transAxes) + # Bottom bar = SW1 (with labels) + ax_sw1 = ax_main.inset_axes([x0, y_top - 2*bar_height - gap, inset_width, bar_height], + transform=ax_main.transAxes) + + def draw_bar(ax_bar, colors, title, show_labels=False, lgn_title=False): + data = np.arange(len(colors)).reshape(1, len(colors)) + cmap = ListedColormap(colors) + ax_bar.imshow(data, aspect='auto', cmap=cmap, extent=[0, len(colors), 0, 1]) + + if show_labels: + centers = np.arange(len(colors)) + 0.5 + ax_bar.set_xticks(centers) + ax_bar.set_xticklabels(transfer_sizes, fontsize=8, rotation=45, ha="right") + else: + ax_bar.set_xticks([]) + + ax_bar.set_yticks([]) + ax_bar.set_xlim(0, len(colors)) + + for spine in ax_bar.spines.values(): + spine.set_visible(False) + + # Title to the right + ax_bar.text(len(colors) + 0.2, 0.5, title, + fontsize=9, va='center', ha='left', transform=ax_bar.transData) + + # DMA Transfer Size label centered over the 6 colors + if lgn_title: + ax_bar.text(3, 1.4, "DMA Transfer Size", # x=3 is center of 0..6 range + fontsize=9, ha='center', va='bottom', fontweight="bold", + transform=ax_bar.transData) + + # Draw bars + draw_bar(ax_sw2, colors2, sw2_label, show_labels=False, lgn_title=True) + draw_bar(ax_sw1, colors1, sw1_label, show_labels=True, lgn_title=False) + + +def plot_two_software_speedup( + clusters, + transfer_sizes, + sw1_speedups, + sw2_speedups, + sw1_label="Software 1", + sw2_label="Software 2", + cmap1="viridis", + cmap2="magma", + title="Speedup by Transfer Size and Software", + xlabel="Number of Clusters", + ylabel="Speedup", + save_name=None +): + n_clusters = 
len(clusters) + x = np.arange(n_clusters) + bars_per_group = 12 + bar_width = 0.8 / bars_per_group + + colors1 = cm.get_cmap(cmap1)(np.linspace(0.15, 0.85, 6)) + colors2 = cm.get_cmap(cmap2)(np.linspace(0.15, 0.85, 6)) + + fig, ax = plt.subplots(figsize=(12, 6)) + + for i in range(6): + offsets_sw1 = x + i * bar_width - (bar_width * (bars_per_group - 1) / 2) + offsets_sw2 = x + (6 + i) * bar_width - (bar_width * (bars_per_group - 1) / 2) + + ax.bar(offsets_sw1, [sw1_speedups[j][i] for j in range(n_clusters)], + width=bar_width, color=colors1[i]) + ax.bar(offsets_sw2, [sw2_speedups[j][i] for j in range(n_clusters)], + width=bar_width, color=colors2[i]) + + ax.set_title(title, fontsize=16) + ax.set_xlabel(xlabel, fontsize=14) + ax.set_ylabel(ylabel, fontsize=14) + ax.set_xticks(x) + ax.set_xticklabels(clusters) + ax.grid(axis="y", linestyle="--", alpha=0.7) + + # Add gradient legend in top-left corner + _add_gradient_legend_inset(ax, colors1, colors2, transfer_sizes, sw1_label, sw2_label) + + fig.tight_layout() + + # Save plot if a name is provided + if save_name: + plots_dir = os.path.join(os.getcwd(), "plots") + os.makedirs(plots_dir, exist_ok=True) + file_path = os.path.join(plots_dir, f"{save_name}.svg") + plt.savefig(file_path, format="svg", bbox_inches="tight") + print(f"Plot saved to: {file_path}") + + return fig, ax + + +# ===== Example Usage ===== +def main(): + df = pd.read_csv('results.csv') + + clusters = df['n_clusters'].unique().tolist() + transfer_sizes = [f'{size} B' for size in df['length'].unique().tolist()] + + cycles = { + mode: grp.drop(columns='mode') + .set_index(['n_clusters', 'length']) + .sort_index(level=['n_clusters', 'length']) + for mode, grp in df.groupby("mode", sort=False) + } + + speedups_mcast_unopt = cycles['SW_UNOPT'] / cycles['HW_MCAST'] + speedups_sw1_unopt = cycles['SW_UNOPT'] / cycles['SW_OPT'] + speedups_sw2_unopt = cycles['SW_UNOPT'] / cycles['SW_OPT2'] + speedups_sw2_sw1 = cycles['SW_OPT'] / cycles['SW_OPT2'] + 
speedups_mcast_sw2 = cycles['SW_OPT2'] / cycles['HW_MCAST'] + + speedups_mcast_unopt = speedups_mcast_unopt.values.reshape((len(clusters), len(transfer_sizes))).tolist() + speedups_sw1_unopt = speedups_sw1_unopt.values.reshape((len(clusters), len(transfer_sizes))).tolist() + speedups_sw2_unopt = speedups_sw2_unopt.values.reshape((len(clusters), len(transfer_sizes))).tolist() + speedups_sw2_sw1 = speedups_sw2_sw1.values.reshape((len(clusters), len(transfer_sizes))).tolist() + speedups_mcast_sw2 = speedups_mcast_sw2.values.reshape((len(clusters), len(transfer_sizes))).tolist() + + # Select PULP colors for the 6 transfer sizes + custom_colors = [ + PULP_COLORS["PULP Blue"], + PULP_COLORS["PULP Petrol"], + PULP_COLORS["PULP Green"], + PULP_COLORS["PULP Bronze"], + PULP_COLORS["PULP Red"], + PULP_COLORS["PULP Purple"] + ] + + plot_multicast_speedup( + clusters, transfer_sizes, speedups_mcast_unopt, + title="Multicast Speedup against unoptimized software version", + xlabel="Number of Clusters", + ylabel="Speedup", + legend_loc="upper left", + legend_ncol=2, + save_name="mcast_vs_unopt" + ) + + plot_multicast_speedup( + clusters, transfer_sizes, speedups_sw2_sw1, + title="Speedup of SW OPT2 implementation against SW OPT1", + xlabel="Number of Clusters", + ylabel="Speedup", + legend_loc="upper left", + legend_ncol=2, + save_name="sw2_vs_sw1" + ) + + plot_multicast_speedup( + clusters, transfer_sizes, speedups_mcast_sw2, + title="Multicast Speedup against best software version", + xlabel="Number of Clusters", + ylabel="Speedup", + legend_loc="upper left", + legend_ncol=2, + save_name="mcast_vs_optimized" + ) + + plot_two_software_speedup( + clusters, + transfer_sizes, + speedups_sw1_unopt, speedups_sw2_unopt, + sw1_label="Column first", + sw2_label="Binary tree", + cmap1="viridis", + cmap2="magma", + title="Speedup of different Software schemes", + xlabel="Number of Clusters", + ylabel="Speedup", + save_name="sw_comparison" + ) + + plt.show() + + +if __name__ == '__main__': 
+ main() diff --git a/experiments/dma_multicast_v2/README.md b/experiments/dma_multicast_v2/README.md new file mode 100644 index 00000000..d5a32fda --- /dev/null +++ b/experiments/dma_multicast_v2/README.md @@ -0,0 +1,32 @@ +All commands are assumed to be run from this folder. + +To run the experiments: +```shell +./sw_experiments.py --actions sw run visual-trace -j +``` + +To manually verify a simulation: +```shell +./verify.py placeholder <sim_dir>/dma_multicast_v2.elf --no-ipc --memdump <sim_dir>/l2mem.bin --memaddr 0x70000000 +``` + +To manually generate the traces for a simulation: +```shell +make -C ../../ annotate -j DEBUG=ON SIM_DIR=<sim_dir> +``` + +To manually build the visual trace for a simulation: +```shell +make -C ../../ sn-visual-trace -j DEBUG=ON SIM_DIR=<sim_dir> SN_ROI_SPEC=<sim_dir>/roi_spec.json +``` + +To fit the HW multicast model to the data: +```shell +./fit.py +``` +Verify that `beta=1`, and plug the `alpha` values into `model.py`. This file contains runtime models for the hardware and software multicast. + +To plot the results (uses `model.py` behind the scenes): +```shell +./plot.py +``` diff --git a/experiments/dma_multicast_v2/__init__.py b/experiments/dma_multicast_v2/__init__.py new file mode 100644 index 00000000..73945a38 --- /dev/null +++ b/experiments/dma_multicast_v2/__init__.py @@ -0,0 +1,9 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from . import plot, experiments, fit, model + +__all__ = ["plot", "experiments", "fit", "model"] diff --git a/experiments/dma_multicast_v2/experiments.py b/experiments/dma_multicast_v2/experiments.py new file mode 100755 index 00000000..b3968554 --- --- /dev/null +++ b/experiments/dma_multicast_v2/experiments.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import math +from pathlib import Path +import picobello as pb +import snitch.util.experiments.experiment_utils as eu +from snitch.util.experiments.SimResults import SimRegion + +N_CLUSTERS = 4 +N_CLUSTERS_PER_ROW = 4 +WIDE_DATA_WIDTH = 64 # bytes +TCK = 5 +DIR = Path(__file__).parent + + +############### +# Experiments # +############### + +class ExperimentManager(pb.ExperimentManager): + + def derive_axes(self, experiment): + keys = ['impl', 'n_rows', 'size'] + if experiment['impl'] == 'seq': + keys.append('batch') + return eu.derive_axes_from_keys(experiment, keys) + + def derive_cdefines(self, experiment): + cdefs = { + 'IMPL': experiment['impl'].upper(), + 'LOG2_N_ROWS': int(math.log2(experiment['n_rows'])), + 'SIZE': experiment['size'], + 'BATCH': experiment['batch'], + } + return cdefs + + +def gen_experiments(): + experiments = [] + for impl in ['seq', 'tree', 'hw']: + # for n_rows in [1]: + for n_rows in [1, 2, 4]: + # for size in [1024]: + for size in [1024, 2048, 4096, 8192, 16384, 32768]: + # for n_batches in [1]: + for n_batches in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]: + + # Only sequential implementation supports batching, for all other + # implementations we only accept n_batches = 1 + batch_size = int(size // n_batches) + batch_beats = int(batch_size // WIDE_DATA_WIDTH) + if impl == 'seq': + valid_n_batches = batch_beats > 8 and batch_beats < 1024 + else: + valid_n_batches = n_batches == 1 + + # If the current setting for n_batches is valid, add the experiment + if valid_n_batches: + experiments.append({ + 'impl': impl, + 'n_rows': n_rows, + 'size': size, + 'batch': batch_size, + 'app': 'dma_multicast_v2', + 'cmd': pb.sim_and_verify_cmd(Path.cwd() / 'verify.py'), + 'roi': Path.cwd() / f'roi/{impl}.json.tpl', + }) + return experiments + + +########### +# Results # +########### + +def dma_core(cluster_idx): + return f'hart_{1 + cluster_idx * 9 + 8}' + + +def 
tree_cycles(sim_results, n_rows): + n_levels = int(math.log2(N_CLUSTERS_PER_ROW * n_rows) + 1) + start = SimRegion(dma_core(0 * N_CLUSTERS_PER_ROW), 'level 0', 0) + end = SimRegion(dma_core(2 * N_CLUSTERS_PER_ROW), f'level {n_levels-1}', 0) + return sim_results.get_timespan(start, end) // TCK + + +# Cluster to cluster (C2C) transfer cycles +def tree_c2c_cycles(sim_results): + roi = SimRegion(dma_core(0), 'level 1', 0) + return sim_results.get_timespan(roi) // TCK + + +# Memory to cluster (M2C) transfer cycles +def tree_m2c_cycles(sim_results): + roi = SimRegion(dma_core(0), 'level 0', 0) + return sim_results.get_timespan(roi) // TCK + + +def seq_cycles(sim_results, n_batches, n_rows): + start = SimRegion(dma_core(0), 'batch 0', 1) + end = SimRegion(dma_core(12 + n_rows - 1), f'batch {n_batches-1}', 1) + return sim_results.get_timespan(start, end) // TCK + + +def seq_batch_cycles(sim_results, batch_idx, cluster_idx=0): + start = SimRegion(dma_core(cluster_idx), f'batch {batch_idx}', 1) + return sim_results.get_timespan(start) // TCK + + +def hw_cycles(sim_results): + roi = SimRegion(dma_core(0), 'transfer', 1) + return sim_results.get_timespan(roi) // TCK + + +def results(manager=None): + if manager is None: + manager = ExperimentManager(gen_experiments(), dir=DIR, parse_args=False) + return manager.get_results() + + +######## +# Main # +######## + +def main(): + + manager = ExperimentManager(gen_experiments(), dir=DIR) + manager.run() + df = results(manager) + print(df) + + +if __name__ == '__main__': + main() diff --git a/experiments/dma_multicast_v2/fit.py b/experiments/dma_multicast_v2/fit.py new file mode 100755 index 00000000..88687659 --- /dev/null +++ b/experiments/dma_multicast_v2/fit.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from scipy import stats +from dma_multicast_v2 import experiments + +pd.options.mode.copy_on_write = True +BEAT_BYTES = 64 + + +def fit(x, y, quiet=True): + # Perform linear regression + res = stats.linregress(x, y) + alpha = res.intercept # α + beta = res.slope # β + r_value = res.rvalue + stderr_beta = res.stderr + stderr_alpha = res.intercept_stderr + + if not quiet: + # Print results + print(f"\talpha (intercept): {alpha:.6g}") + print(f"\tbeta (slope) : {beta:.6g}") + print(f"\tR^2 : {r_value**2:.6g}") + print(f"\tSE(beta) : {stderr_beta:.6g}") + print(f"\tSE(alpha) : {stderr_alpha:.6g}") + + # Plot results + plt.figure() + plt.scatter(x, y, s=12) + xline = np.linspace(x.min(), x.max(), 200) + yline = alpha + beta * xline + plt.plot(xline, yline, linewidth=2) + plt.xlabel('size [B]') + plt.ylabel('cycles') + plt.title("Linear fit: cycles = alpha + beta * X") + plt.show() + + # Return results + return alpha, beta + + +def fit_hw(df, quiet=True): + # Retrieve only single-row experiments (fitted model should generalize to multi-row) + df = df[df['n_rows'] == 1] + df['cycles'] = df['results'].apply(experiments.hw_cycles) + + # Fit data + x = df['size'].to_numpy() / BEAT_BYTES + y = df['cycles'].to_numpy() + print("Fit transfer time for HW multicast:") + fit(x, y, quiet=quiet) + + +def fit_seq(df, quiet=True): + # Retrieve 1-batch experiments + df['n_batches'] = df['size'] // df['batch'] + df_no = df[df['n_batches'] == 1] + + # Get cycles for non-overlapped batch (batch 0, cluster 0) + df_no['no_batch_cycles'] = df_no.apply( + lambda row: experiments.seq_batch_cycles( + row['results'], batch_idx=0, cluster_idx=0), + axis=1 + ) + + # Fit non-overlapped batch data + x = df_no['batch'].to_numpy() / BEAT_BYTES + y = df_no['no_batch_cycles'].to_numpy() + print("Fit non-overlapped batch time for sequential multicast:") + fit(x, y, 
quiet=quiet) + + # Retrieve 2-batch experiments + df_o = df[df['n_batches'] == 2] + + # Get cycles for overlapped batch (batch 1, cluster 0) + df_o['o_batch_cycles'] = df_o.apply( + lambda row: experiments.seq_batch_cycles( + row['results'], batch_idx=1, cluster_idx=0), + axis=1 + ) + + # Fit overlapped batch data + x = df_o['batch'].to_numpy() / BEAT_BYTES + y = df_o['o_batch_cycles'].to_numpy() + print("Fit overlapped batch time for sequential multicast:") + fit(x, y, quiet=quiet) + + +def fit_tree(df, quiet=True): + # Get cycles of cluster to cluster and memory to cluster transfers + df['c2c_cycles'] = df['results'].apply(experiments.tree_c2c_cycles) + df['m2c_cycles'] = df['results'].apply(experiments.tree_m2c_cycles) + + # Fit cluster to cluster data + x = df['size'].to_numpy() / BEAT_BYTES + y = df['c2c_cycles'].to_numpy() + print("Fit cluster-to-cluster transfer time for tree multicast:") + fit(x, y, quiet=quiet) + + # Fit memory to cluster data + x = df['size'].to_numpy() / BEAT_BYTES + y = df['m2c_cycles'].to_numpy() + print("Fit memory-to-cluster transfer time for tree multicast:") + fit(x, y, quiet=quiet) + + +def main(): + df = experiments.results() + + seq = df[df['impl'] == 'seq'] + fit_seq(seq, quiet=False) + + tree = df[df['impl'] == 'tree'] + fit_tree(tree, quiet=False) + + hw = df[df['impl'] == 'hw'] + fit_hw(hw, quiet=False) + + +if __name__ == '__main__': + main() diff --git a/experiments/dma_multicast_v2/model.py b/experiments/dma_multicast_v2/model.py new file mode 100755 index 00000000..852b526d --- /dev/null +++ b/experiments/dma_multicast_v2/model.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from math import isqrt, sqrt, log2 + +# N: num clusters +# L: num beats in transfer +# B: num beats in batch +# delta: num synchronization overhead cycles + +BEAT_BYTES = 64 +SEQ_ALPHA = 30 +# TODO(colluca): in reality delta depends on n_rows, and the arrival times of all clusters +DELTA = 36 +HW_ALPHA = 30 +TREE_M2C_ALPHA = 30 +TREE_C2C_ALPHA = 17 + + +def nearest_divisors(divisor, dividend): + from bisect import bisect_left + + # 1) Enumerate all positive divisors of dividend + divs = set() + r = int(isqrt(int(dividend))) + for d in range(1, r + 1): + if dividend % d == 0: # d divides dividend + divs.add(d) # add the small factor + divs.add(dividend // d) # add the paired large factor + divs = sorted(divs) # sort them ascending + + # 2) Binary search to locate where x would be inserted + i = bisect_left(divs, divisor) # first index with divs[i] >= divisor + + # 3) Pick neighbors around divisor + lower = divs[i - 1] if i > 0 and divs[i - 1] <= divisor else None + upper = divs[i] if i < len(divs) else None + return lower, upper + + +# N1: num cols, N2: num rows +def seq_runtime(N1, N2, L, B, delta=DELTA, alpha=SEQ_ALPHA, alpha1=SEQ_ALPHA): + n_batches = L // B + n_iters_row = n_batches - 1 + (N1 - 1) + n_iters_col = n_batches - 1 + (N2 - 1) if N2 > 1 else 0 + n_iters = n_iters_row + n_iters_col + return (alpha1 + B) + n_iters * (alpha + B + delta) + + +def optimal_batch_size(N1, N2, L, delta=DELTA, alpha=SEQ_ALPHA): + if N2 > 1: + real_B = sqrt(L * (alpha + delta) / (N1 + N2 - 1)) + else: + real_B = sqrt(L * (alpha + delta) / (N1 - 1)) + lower_B, upper_B = nearest_divisors(real_B, L) + assert (lower_B is None) or (lower_B > 0) + assert (upper_B is None) or (upper_B <= L) + if lower_B is None: + return upper_B + elif upper_B is None: + return lower_B + else: + T_lower_B = seq_runtime(N1, N2, L, lower_B, delta, alpha) + T_upper_B = seq_runtime(N1, N2, L, upper_B, delta, alpha) + if T_lower_B < T_upper_B: 
+ return lower_B + else: + return upper_B + + +def optimal_seq_runtime(N1, N2, L, delta=DELTA, alpha=SEQ_ALPHA, alpha1=SEQ_ALPHA): + B_opt = optimal_batch_size(N1, N2, L, delta, alpha) + return seq_runtime(N1, N2, L, B_opt, delta, alpha, alpha1) + + +def hw_runtime(N1, N2, L): + if N2 > 1: + return HW_ALPHA + L + N1 - 1 + N2 - 1 + else: + return HW_ALPHA + L + N1 - 1 + + +def tree_runtime(N1, N2, L, delta=DELTA): + m2c_transfer_cycles = TREE_M2C_ALPHA + L + c2c_transfer_cycles = (TREE_C2C_ALPHA + L) * log2(N1 * N2) + delta_cycles = delta * (log2(N1 * N2) - 1) + return m2c_transfer_cycles + c2c_transfer_cycles + delta_cycles + + +def optimal_sw_runtime(N1, N2, L, delta=DELTA, alpha=SEQ_ALPHA, alpha1=SEQ_ALPHA): + T_seq = optimal_seq_runtime(N1, N2, L, delta, alpha, alpha1) + T_tree = tree_runtime(N1, N2, L, delta) + return min(T_seq, T_tree) diff --git a/experiments/dma_multicast_v2/plot.py b/experiments/dma_multicast_v2/plot.py new file mode 100755 index 00000000..0a380bd9 --- /dev/null +++ b/experiments/dma_multicast_v2/plot.py @@ -0,0 +1,399 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import argparse +import matplotlib as mpl +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from dma_multicast_v2 import model +from dma_multicast_v2 import experiments + +pd.options.mode.copy_on_write = True + + +# Get actual runtimes +def get_actual_cycles(row): + if row['impl'] == 'seq': + return experiments.seq_cycles( + row['results'], int(row['size'] // row['batch']), row['n_rows']) + elif row['impl'] == 'tree': + return experiments.tree_cycles(row['results'], row['n_rows']) + elif row['impl'] == 'hw': + return experiments.hw_cycles(row['results']) + + +# Get expected runtimes +def get_expected_cycles(row): + if row['impl'] == 'seq': + cycles = model.seq_runtime(4, row['n_rows'], row['size'] // 64, row['batch'] // 64) + elif row['impl'] == 'tree': + cycles = model.tree_runtime(4, row['n_rows'], row['size'] // 64) + elif row['impl'] == 'hw': + cycles = model.hw_runtime(4, row['n_rows'], row['size'] // 64) + return int(cycles) + + +# Select optimal configuration (optimal n_batches) for sequential pipelined runs +def select_best_sequential_configuration(df): + return df.loc[df.groupby(['impl', 'size', 'n_rows'])['cycles'].idxmin()] \ + .sort_values(['impl', 'n_rows', 'size']) \ + .reset_index(drop=True) + + +def seq_runtime_curve(xmin, xmax, n_rows): + x = np.arange(xmin, xmax, 64) + y = [model.optimal_seq_runtime(4, n_rows, e // 64) for e in x] + return x, y + + +# Filter optimal sequential runtime, by finding the monotone non-decreasing lower fit. 
+def monotone_seq_runtime_curve(xmin, xmax, n_rows): + x, y = seq_runtime_curve(xmin, xmax, n_rows) + return find_monotone_lower_fit(x, y) + + +# Find the monotone non-decreasing lower fit of an oscillating curve +def find_monotone_lower_fit(x, y): + y_monotone = [] + x_monotone = [] + for i, e in enumerate(y): + accept = True + for j in range(i, len(y)): + if y[j] < e: + accept = False + break + if accept: + y_monotone.append(y[i]) + x_monotone.append(x[i]) + return x_monotone, y_monotone + + +def tree_runtime_curve(xmin, xmax, n_rows): + x = np.arange(xmin, xmax, 64) + y = [model.tree_runtime(4, n_rows, e // 64) for e in x] + return x, y + + +def hw_runtime_curve(xmin, xmax, n_rows): + x = np.arange(xmin, xmax, 64) + y = [model.hw_runtime(4, n_rows, e // 64) for e in x] + return x, y + + +def sw_runtime_curve(xmin, xmax, n_rows): + x = np.arange(xmin, xmax, 64) + x, y_seq = seq_runtime_curve(xmin, xmax, n_rows) + _, y_tree = tree_runtime_curve(xmin, xmax, n_rows) + x, y_min = find_monotone_lower_fit(x, [min(a, b) for a, b in zip(y_seq, y_tree)]) + return x, y_min + + +def plot1(): + """Plot actual vs. 
expected runtime heatmaps.""" + + # Load results + df = experiments.results() + + # Get only sequential implementation results + df = df[(df['impl'] == 'seq') & (df['n_rows'] == 1)] + + # Add actual and expected runtimes to the dataframe + df['act'] = df.apply( + lambda row: experiments.seq_cycles( + row['results'], int(row['size'] // row['batch']), row['n_rows']), + axis=1 + ) + df['exp'] = df.apply( + lambda row: model.seq_runtime(4, 1, row['size'] // 64, row['batch'] // 64), + axis=1 + ) + + # Create dataframe for actual runtime heatmap + df_act = df.drop(columns=['exp']) + df_act = df_act.pivot(index='size', columns='batch', values='act') + sizes = df_act.index + batches = df_act.columns + + # Create dataframe for expected runtime heatmap + df_exp = df.drop(columns=['act']) + df_exp = df_exp.pivot(index='size', columns='batch', values='exp') + sizes = df_exp.index + batches = df_exp.columns + + # Plot actual and expected heatmaps + fig, ax = plt.subplots(1, 2) + cmap = mpl.colormaps['plasma'] + cmap = mpl.colors.ListedColormap(cmap(np.linspace(0.15, 0.85, 128))) + sns.heatmap( + df_act, + cmap=cmap, + annot=True, + fmt=".0f", + cbar=True, + xticklabels=batches, + yticklabels=sizes, + ax=ax[0] + ) + sns.heatmap( + df_exp, + cmap=cmap, + annot=True, + fmt=".0f", + cbar=True, + xticklabels=batches, + yticklabels=sizes, + ax=ax[1] + ) + plt.tight_layout() + plt.show() + + +def plot2(show=True): + """Plot actual and expected runtimes.""" + + # Load results + df = experiments.results() + + # Get expected and (best) actual runtimes + df['cycles'] = df.apply(get_actual_cycles, axis=1) + df = select_best_sequential_configuration(df) + df['exp_cycles'] = df.apply(get_expected_cycles, axis=1) + + # Drop results column for cleaner output + df = df.drop(columns=['results']) + + # Choose consistent colors and markers for the plots + colors = {"seq": "C0", "tree": "C1", "hw": "C2"} + markers = {"expected": "o", "actual": "x"} + + # Plot measured runtimes + # _, ax = 
plt.subplots(2, 2) + _, ax = plt.subplots() + + def plot_runtime_vs_speedup(ax, n_rows): + res = df[df['n_rows'] == n_rows] + sizes = res['size'].unique() + ax.scatter( + sizes, res[res['impl'] == 'seq']['cycles'], label='Actual (seq)', + marker=markers['actual'], color=colors['seq'] + ) + ax.scatter( + sizes, res[res['impl'] == 'tree']['cycles'], label='Actual (tree)', + marker=markers['actual'], color=colors['tree'] + ) + ax.scatter( + sizes, res[res['impl'] == 'hw']['cycles'], label='Actual (hw)', + marker=markers['actual'], color=colors['hw'] + ) + + # # Plot expected runtimes + # ax.scatter( + # sizes, res[res['impl'] == 'seq']['exp_cycles'], label='Expected (seq)', + # marker=markers['expected'], color=colors['seq'] + # ) + # ax.scatter( + # sizes, res[res['impl'] == 'tree']['exp_cycles'], label='Expected (tree)', + # marker=markers['expected'], color=colors['tree'] + # ) + # ax.scatter( + # sizes, res[res['impl'] == 'hw']['exp_cycles'], label='Expected (hw)', + # marker=markers['expected'], color=colors['hw'] + # ) + + # Plot model line for sequential runtime. Find monotone non-decreasing lower fit. 
+ x, y = monotone_seq_runtime_curve(sizes.min(), sizes.max(), n_rows) + ax.plot(x, y, label='Model (seq)', linestyle='--', color=colors['seq']) + + # Plot model line for tree runtime + x, y = tree_runtime_curve(sizes.min(), sizes.max(), n_rows) + ax.plot(x, y, label='Model (tree)', linestyle='--', color=colors['tree']) + + # Plot model line for hardware runtime + x, y = hw_runtime_curve(sizes.min(), sizes.max(), n_rows) + ax.plot(x, y, label='Model (hw)', linestyle='--', color=colors['hw']) + + ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes]) + ax.set_xlabel('Size [B]') + ax.set_ylabel('Runtime [cycles]') + ax.grid(True, alpha=0.4) + ax.legend() + + plot_runtime_vs_speedup(ax, n_rows=1) + # plot_runtime_vs_speedup(ax[0][0], n_rows=1) + # plot_runtime_vs_speedup(ax[0][1], n_rows=2) + # plot_runtime_vs_speedup(ax[1][0], n_rows=4) + plt.tight_layout() + if show: + plt.show() + + return df + + +def plot3(show=True): + """Plot runtime dependency with delta.""" + + # Load results + df = experiments.results() + + # Get only hardware implementation results + df = df[df['impl'] == 'hw'] + + # Retrieve single-row experiments + n_rows = 1 + df = df[df['n_rows'] == n_rows] + + # Get actual runtimes + df['cycles'] = df['results'].apply(experiments.hw_cycles) + + # Choose consistent colors and markers for the plots + colors = {"seq": "C1", "hw": "C2"} + + # Plot measured runtimes + _, ax = plt.subplots() + + # Plot model lines for sequential runtime (varying delta). 
+ x = np.arange(df['size'].min(), df['size'].max(), 64) + for k, alpha_delta in enumerate([*range(model.DELTA + model.SEQ_ALPHA, -1, -12), 0]): + alpha = alpha_delta // 2 + delta = alpha_delta - alpha + x_ext = np.arange(df['size'].min(), df['size'].max() + 1024, 64) + y = [model.optimal_seq_runtime(4, n_rows, e // 64, delta, alpha, model.SEQ_ALPHA) + for e in x_ext] + y_monotone = [] + x_monotone = [] + for i, e in enumerate(y): + accept = True + for j in range(i, len(y)): + if y[j] < e: + accept = False + break + if accept and i < len(x): + y_monotone.append(y[i]) + x_monotone.append(x[i]) + ax.plot(x_monotone, y_monotone, label='seq$(\\alpha_i+\\delta)$' if k == 0 else None, + color=colors['seq']) + ax.annotate( + f'${alpha_delta}$', + xy=(x[-1], y_monotone[-1]), + xytext=(1, 0), textcoords='offset points', # nudge right + ha='left', va='center', clip_on=False) + + # Plot model line for hardware runtime + y = [model.hw_runtime(4, n_rows, e // 64) for e in x] + ax.plot(x, y, label='hw', linestyle='--', color=colors['hw']) + + ax.set_xticks(df['size'], [str(size) if size != 2048 else "" for size in df['size']]) + ax.set_xlabel('Size [B]') + ax.set_ylabel('Runtime [cycles]') + ax.grid(True, linestyle='-', alpha=0.4) + ax.legend() + + plt.tight_layout() + if show: + plt.show() + + +def plot4(show=True): + """Plot actual and expected runtimes, for multiple number of rows.""" + + # Load results + df = experiments.results() + + # Get expected and (best) actual runtimes + df['cycles'] = df.apply(get_actual_cycles, axis=1) + df = select_best_sequential_configuration(df) + df['exp_cycles'] = df.apply(get_expected_cycles, axis=1) + + # Drop results column for cleaner output + df = df.drop(columns=['results']) + + # Choose consistent colors for the plots + colors = {"sw": "C0", "hw": "C2"} + + # Plot measured runtimes + _, ax = plt.subplots() + sizes = df['size'].unique() + + def plot_runtime_vs_speedup(ax, n_rows, show_label=False): + res = df[df['n_rows'] == n_rows] + 
best_sw_cycles = res[res['impl'].isin(['seq', 'tree'])].groupby('size')['cycles'].min() + ax.scatter( + sizes, best_sw_cycles, label='Actual (min(seq, tree))' if show_label else None, + marker='x', color=colors['sw'] + ) + ax.scatter( + sizes, res[res['impl'] == 'hw']['cycles'], label='Actual (hw)' if show_label else None, + marker='x', color=colors['hw'] + ) + + # Plot model line for best software implementation + x, y = sw_runtime_curve(sizes.min(), sizes.max(), n_rows) + ax.plot( + x, y, label='Model (min(seq, tree))' if show_label else None, + linestyle='--', color=colors['sw']) + + # Annotate sw model lines with number of rows, in correspondence with actual runtime + ax.annotate( + f'r={n_rows}', + xy=(x[-1], best_sw_cycles[sizes.max()]), + xytext=(sizes.max() * 0.0001, 0), textcoords='offset points', # nudge right + ha='left', va='center', clip_on=False) + + # Plot model line for hardware runtime + x, y = hw_runtime_curve(sizes.min(), sizes.max(), n_rows) + ax.plot( + x, y, label='Model (hw)' if show_label else None, linestyle='--', color=colors['hw']) + + for i, n_rows in enumerate(df['n_rows'].unique()): + plot_runtime_vs_speedup(ax, n_rows=n_rows, show_label=(i == 0)) + + ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes]) + ax.set_xlim(0, sizes.max() * 1.1) + ax.set_xlabel('Size [B]') + ax.set_ylabel('Runtime [cycles]') + ax.grid(True, alpha=0.4) + ax.legend() + + plt.tight_layout() + if show: + plt.show() + + return df + + +def main(): + # Parse arguments + functions = [plot1, plot2, plot3, plot4] + parser = argparse.ArgumentParser(description='Plot wide multicast results.') + parser.add_argument( + 'plots', + nargs='*', + default='all', + choices=[f.__name__ for f in functions] + ['all'], + help='Plots to generate.' 
+ ) + args = parser.parse_args() + + # Helper function to check if a plot was requested on the command line + def requested(plot): + return plot in args.plots or 'all' in args.plots + + # Generate plots + if requested('plot1'): + plot1() + if requested('plot2'): + plot2() + if requested('plot3'): + plot3() + if requested('plot4'): + plot4() + + +if __name__ == "__main__": + main() diff --git a/experiments/dma_multicast_v2/roi/hw.json.tpl b/experiments/dma_multicast_v2/roi/hw.json.tpl new file mode 100644 index 00000000..b1cd5567 --- /dev/null +++ b/experiments/dma_multicast_v2/roi/hw.json.tpl @@ -0,0 +1,14 @@ +[ + { + // DMA core of cluster 0 + "thread": "${f'hart_9'}", + "roi": [ + // First iteration + {"idx": 1, "label": "init"}, + {"idx": 2, "label": "transfer"}, + // Second iteration + {"idx": 4, "label": "init"}, + {"idx": 5, "label": "transfer"}, + ] + }, +] diff --git a/experiments/dma_multicast_v2/roi/seq.json.tpl b/experiments/dma_multicast_v2/roi/seq.json.tpl new file mode 100644 index 00000000..876752fd --- /dev/null +++ b/experiments/dma_multicast_v2/roi/seq.json.tpl @@ -0,0 +1,29 @@ +<% +num_batches = experiment['size'] // experiment['batch'] +%> + +[ +% for row in range(0, experiment['n_rows']): + % for col in range(0, 4): + { + // DMA cores + "thread": "${f'hart_{(col * 4 + row) * 9 + 8 + 1}'}", + "roi": [ + // First iteration + {"idx": 1, "label": "init"}, + % for batch in range(num_batches): + {"idx": ${1 + batch * 2 + 1}, "label": "${f'batch {batch}'}"}, + // {"idx": ${1 + batch * 2 + 2}, "label": "${f'sync {batch}'}"}, + % endfor + {"idx": ${1 + num_batches * 2 + 1}, "label": "barrier"}, + // Second iteration + {"idx": ${3 + num_batches * 2}, "label": "init"}, + % for batch in range(num_batches): + {"idx": ${3 + num_batches * 2 + batch * 2 + 1}, "label": "${f'batch {batch}'}"}, + // {"idx": ${3 + num_batches * 2 + batch * 2 + 2}, "label": "${f'sync {batch}'}"}, + % endfor + ] + }, + % endfor +% endfor +] diff --git 
a/experiments/dma_multicast_v2/roi/tree.json.tpl b/experiments/dma_multicast_v2/roi/tree.json.tpl new file mode 100644 index 00000000..7095181c --- /dev/null +++ b/experiments/dma_multicast_v2/roi/tree.json.tpl @@ -0,0 +1,162 @@ +% if experiment['n_rows'] == 1: +[ + { + "thread": "${f'hart_{0 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from memory tile to cluster 0 + {"idx": 10, "label": "level 0"}, + + // Transfer from cluster 0 to cluster 8 + {"idx": 12, "label": "level 1"}, + + // Transfer from cluster 0 to cluster 4 + {"idx": 14, "label": "level 2"}, + + ] + }, + + { + "thread": "${f'hart_{8 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 8 to cluster 12 + {"idx": 6, "label": "level 2"} + ] + }, +] +% elif experiment['n_rows'] == 2: +[ + { + "thread": "${f'hart_{0 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from memory tile to cluster 0 + {"idx": 12, "label": "level 0"}, + + // Transfer from cluster 0 to cluster 8 + {"idx": 14, "label": "level 1"}, + + // Transfer from cluster 0 to cluster 4 + {"idx": 16, "label": "level 2"}, + + // Transfer from cluster 0 to cluster 1 + {"idx": 18, "label": "level 3"}, + ] + }, + + { + "thread": "${f'hart_{8 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 8 to cluster 12 + {"idx": 8, "label": "level 2"}, + + // Transfer from cluster 8 to cluster 9 + {"idx": 10, "label": "level 3"} + ] + }, + + { + "thread": "${f'hart_{4 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 4 to cluster 5 + {"idx": 6, "label": "level 3"} + ] + }, + + { + "thread": "${f'hart_{12 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 12 to cluster 13 + {"idx": 6, "label": "level 3"} + ] + }, +] +% elif experiment['n_rows'] == 4: +[ + { + "thread": "${f'hart_{0 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from memory tile to cluster 0 + {"idx": 14, "label": "level 0"}, + + // Transfer from cluster 0 to cluster 8 + {"idx": 16, "label": "level 1"}, + + // Transfer from cluster 0 to cluster 4 + {"idx": 18, "label": "level 2"}, + + // Transfer 
from cluster 0 to cluster 2 + {"idx": 20, "label": "level 3"}, + + // Transfer from cluster 0 to cluster 1 + {"idx": 22, "label": "level 4"}, + ] + }, + + { + "thread": "${f'hart_{8 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 8 to cluster 12 + {"idx": 10, "label": "level 2"}, + + // Transfer from cluster 8 to cluster 10 + {"idx": 12, "label": "level 3"}, + + // Transfer from cluster 8 to cluster 9 + {"idx": 14, "label": "level 4"} + ] + }, + + { + "thread": "${f'hart_{4 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 4 to cluster 6 + {"idx": 8, "label": "level 3"}, + + // Transfer from cluster 4 to cluster 5 + {"idx": 10, "label": "level 4"}, + ] + }, + + { + "thread": "${f'hart_{12 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 12 to cluster 14 + {"idx": 8, "label": "level 3"}, + + // Transfer from cluster 12 to cluster 13 + {"idx": 10, "label": "level 4"}, + ] + }, + + { + "thread": "${f'hart_{2 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 2 to cluster 3 + {"idx": 6, "label": "level 4"}, + ] + }, + + { + "thread": "${f'hart_{6 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 6 to cluster 7 + {"idx": 6, "label": "level 4"} + ] + }, + + { + "thread": "${f'hart_{10 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 10 to cluster 11 + {"idx": 6, "label": "level 4"}, + ] + }, + + { + "thread": "${f'hart_{14 * 9 + 8 + 1}'}", + "roi": [ + // Transfer from cluster 14 to cluster 15 + {"idx": 6, "label": "level 4"}, + ] + }, +] +% endif diff --git a/experiments/dma_multicast_v2/verify.py b/experiments/dma_multicast_v2/verify.py new file mode 100755 index 00000000..1aa58510 --- /dev/null +++ b/experiments/dma_multicast_v2/verify.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande +# +# Verification script for `dma_multicast_v2.c` + +import sys +import snitch.util.sim.verif_utils as vu + + +class Verifier(vu.Verifier): + + OUTPUT_UIDS = ['output'] + + def __init__(self): + super().__init__() + + def get_actual_results(self): + return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'uint32_t') + + def get_expected_results(self): + length = int(self.get_input_from_symbol('length', 'uint32_t')[0]) + n_clusters = int(self.get_input_from_symbol('n_clusters', 'uint32_t')[0]) + return n_clusters * [i + 1 for i in range(length // n_clusters)] + + def check_results(self, *args): + return super().check_results(*args, atol=0) + + +if __name__ == "__main__": + sys.exit(Verifier().main()) diff --git a/experiments/picobello.py b/experiments/picobello.py new file mode 100644 index 00000000..32ad102a --- /dev/null +++ b/experiments/picobello.py @@ -0,0 +1,53 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from pathlib import Path +import snitch.util.experiments.experiment_utils as eu +from snitch.util.experiments import common + +root = Path(__file__).resolve().parents[1] +sim_cmd = [ + 'bash', '-c', + f'make vsim-run-batch -C {str(root)} SIM_DIR=${{run_dir}} ' + f'CHS_BINARY={str(root)}/sw/cheshire/tests/simple_offload.spm.elf ' + f'SN_BINARY=${{elf}} PRELMODE=3; ' + f'grep -q "] SUCCESS" ${{run_dir}}/transcript;' +] + + +def sim_and_verify_cmd(verify_script): + return [ + 'bash', '-c', + f'make vsim-run-batch-verify -C {str(root)} SIM_DIR=${{run_dir}} ' + f'CHS_BINARY={str(root)}/sw/cheshire/tests/simple_offload.spm.elf ' + f'SN_BINARY=${{elf}} VERIFY_PY={verify_script} PRELMODE=3;' + ] + + +def sw_callback(target=None, build_dir=None, defines=None, dry_run=False, sync=False, **kwargs): + env = { + 'SN_TESTS_RISCV_CFLAGS': common.join_cdefines(defines), + } + vars = { + 'SN_TESTS_BUILDDIR': build_dir, + 'DEBUG': 'ON', + } + env = common.extend_environment(env) + return common.make( + target, flags=['-j'], dir=root, vars=vars, env=env, sync=sync, + dry_run=dry_run + ) + + +callbacks = { + 'sw': sw_callback +} + + +class ExperimentManager(eu.ExperimentManager): + + def __init__(self, *args, **kwargs): + super().__init__(*args, callbacks=callbacks, **kwargs) diff --git a/sw/snitch/tests/dma_multicast_v2.c b/sw/snitch/tests/dma_multicast_v2.c index c231f0b9..4d11275b 100644 --- a/sw/snitch/tests/dma_multicast_v2.c +++ b/sw/snitch/tests/dma_multicast_v2.c @@ -6,10 +6,7 @@ // Lorenzo Leone // // This code implements a function to multicast data from one memory tile -// to all clusters in the same row. - -#define SNRT_ENABLE_NARROW_REDUCTION -#define SNRT_ENABLE_NARROW_MULTICAST +// to all clusters in the same row, or multiple rows. 
#include #include "snrt.h" @@ -24,91 +21,43 @@ #define BATCH 64 #endif -#define N_BATCHES (SIZE / BATCH) -#define N_ELEMS (SIZE / sizeof(uint32_t)) +typedef enum { + SEQ, + TREE, + HW +} impl_t; -#define ELEMS_TO_CHECK 32 - -#ifndef N_CLUSTERS_TO_USE -#define N_CLUSTERS_TO_USE 4 +#ifndef IMPL +#define IMPL SEQ #endif -// TODO(colluca): calculate from non-log2 function -static inline uint32_t pb_log2_cluster_num_in_col() { - return 2; -} - -static inline uint32_t pb_cluster_num_in_row() { - return 4; -} - -static inline uint32_t pb_cluster_row_idx() { - return snrt_cluster_idx() % pb_cluster_num_in_row(); -} - -static inline uint32_t pb_cluster_in_row(uint32_t row_idx) { - return pb_cluster_row_idx() == row_idx; -} - -static inline uint32_t pb_cluster_idx_in_row() { - return snrt_cluster_idx() / pb_cluster_num_in_row(); -} +#define N_BATCHES (SIZE / BATCH) +#define N_ELEMS (SIZE / sizeof(uint32_t)) -static inline uint32_t pb_cluster_left_neighbour() { - return snrt_cluster_idx() - pb_cluster_num_in_row(); -} +#ifndef LOG2_N_ROWS +#define LOG2_N_ROWS 2 +#endif -static inline void pb_create_row_comm(uint32_t row, snrt_comm_t *comm) { - // Allocate communicator struct in L1 and point to it. - *comm = - (snrt_comm_t)snrt_l1_alloc_cluster_local(sizeof(snrt_comm_info_t)); - - // Allocate barrier counter in L1. Only the zero-th cluster's is actually - // used, but we want to keep all clusters' L1 allocators aligned. Thus, - // only cluster 0 initializes its barrier counter. A global barrier is - // then used to ensure all cores "see" the initialized value. - void *barrier_ptr = snrt_l1_alloc_cluster_local(sizeof(uint32_t)); - barrier_ptr = snrt_remote_l1_ptr(barrier_ptr, snrt_cluster_idx(), 0); - if (snrt_global_core_idx() == 0) *(uint32_t *)barrier_ptr = 0; - snrt_global_barrier(); - - // Initialize communicator, pointing to the newly-allocated barrier - // counter in L3. 
- (*comm)->size = pb_cluster_num_in_row(); - (*comm)->mask = row | - ((pb_cluster_num_in_row() - 1) << pb_log2_cluster_num_in_col()); - (*comm)->base = row; - (*comm)->barrier_ptr = (uint32_t *)barrier_ptr; - (*comm)->is_participant = pb_cluster_in_row(row); -} +#define N_ROWS (1 << LOG2_N_ROWS) -// L1 buffer of every cluster invoking this function should be -// at the same offset in the TCDM -static inline void dma_multicast(uintptr_t l1_buffer, uintptr_t l3_buffer, - snrt_comm_t comm) { - -#ifdef ENABLE_WIDE_MULTICAST - // Only DMA core of cluster 0 continues past this point - if (!snrt_is_dm_core() || !(snrt_cluster_idx() == 0)) return; - - // Hardware multicast transfer - uint64_t mask = snrt_get_collective_mask(comm); - snrt_dma_enable_multicast(mask); - snrt_mcycle(); - snrt_dma_start_1d(l1_buffer, l3_buffer, SIZE); - snrt_dma_disable_multicast(); - snrt_dma_wait_all(); -#else +static inline void dma_multicast_sequential(uintptr_t l1_buffer, + uintptr_t l3_buffer, snrt_comm_t comm) { // Only DMA cores of clusters in the first row participate if (!snrt_is_dm_core() || !comm->is_participant) return; - // Compute address of source buffer (in left tile) + // Compute address of source buffer: + // - in memory tile for cluster 0 + // - in left neighbour for clusters in row 0 + // - in bottom neighbour for clusters in other rows uintptr_t src; if (snrt_cluster_idx() == 0) { src = l3_buffer; + } else if (pb_cluster_in_row(0)) { + src = (uintptr_t)snrt_remote_l1_ptr((void *)l1_buffer, + snrt_cluster_idx(), pb_cluster_west_neighbour()); } else { src = (uintptr_t)snrt_remote_l1_ptr((void *)l1_buffer, - snrt_cluster_idx(), pb_cluster_left_neighbour()); + snrt_cluster_idx(), pb_cluster_south_neighbour()); } // Prepare for inter-cluster barrier in advance, preventing instruction @@ -120,15 +69,15 @@ static inline void dma_multicast(uintptr_t l1_buffer, uintptr_t l3_buffer, uint32_t user = (uint32_t)op.w; asm volatile ("" : "+r"(user) ::); - // Iterations to cover all 
transfers - uint32_t n_iters = N_BATCHES - 1 + N_CLUSTERS_TO_USE; + // Iterations to cover all transfers in a row + uint32_t n_iters = N_BATCHES - 1 + pb_cluster_num_in_row(); for (uint32_t i = 0; i < n_iters; i++) { // Every cluster is active for N_BATCHES iterations // starting from the iteration with i == snrt_cluster_idx() - // TODO(colluca): use communicator - char cluster_active = i >= pb_cluster_idx_in_row(); - cluster_active &= i < (pb_cluster_idx_in_row() + N_BATCHES); + char cluster_active = i >= pb_cluster_col_idx(); + cluster_active &= i < (pb_cluster_col_idx() + N_BATCHES); + cluster_active &= pb_cluster_in_row(0); // Every active cluster copies data from the left tile if (cluster_active) { @@ -155,18 +104,177 @@ static inline void dma_multicast(uintptr_t l1_buffer, uintptr_t l3_buffer, snrt_set_awuser_low(0); snrt_fence(); } -#endif -} + + // Iterations to cover all transfers in each column + n_iters = N_BATCHES - 1 + N_ROWS - 1; + for (uint32_t i = 0; i < n_iters; i++) { + + // Every cluster is active for N_BATCHES iterations + // starting from the iteration with (i + 1) == pb_cluster_row_idx() + char cluster_active = (i + 1) >= pb_cluster_row_idx(); + cluster_active &= (i + 1) < (pb_cluster_row_idx() + N_BATCHES); + cluster_active &= !pb_cluster_in_row(0); + + // Every active cluster copies data from the bottom tile + if (cluster_active) { + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(l1_buffer, src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + l1_buffer += BATCH; + src += BATCH; + asm volatile ("" : "+r"(l1_buffer), "+r"(src) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. 
+ snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } +} + +static inline void dma_multicast_tree(uintptr_t l1_buffer, + uintptr_t l3_buffer, snrt_comm_t comm) { + + // Only DMA cores of clusters in the communicator continue from here + if (!snrt_is_dm_core() || !comm->is_participant) return; + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. + snrt_collective_op_t op; + op.f.collective_op = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Iteration 0: cluster 0 fetches data from memory tile + if (snrt_cluster_idx() == 0) { + snrt_mcycle(); + snrt_dma_start_1d(l1_buffer, l3_buffer, SIZE); + snrt_dma_wait_all(); + snrt_mcycle(); + } + + // Iterations to cover all transfers in a row + uint32_t n_levels_in_row = pb_log2_cluster_num_in_row(); + for (uint32_t i = 0; i < n_levels_in_row; i++) { + + // Determine which clusters are senders at every level of the tree + char cluster_idx_inv = pb_cluster_num_in_row() - pb_cluster_col_idx(); + char num_active_clusters = 1 << i; + char sender_stride = pb_cluster_num_in_row() / num_active_clusters; + char is_sender = ((cluster_idx_inv % sender_stride) == 0) + && pb_cluster_in_row(0); + + // Every active cluster sends the data to a cluster on the right + if (is_sender) { + + // Calculate destination + char receiver_offset = sender_stride / 2; + uintptr_t dst_cluster = pb_calculate_cluster_idx(0, + pb_cluster_col_idx() + receiver_offset); + uintptr_t dst = (uintptr_t)snrt_remote_l1_ptr((void *)l1_buffer, + snrt_cluster_idx(), dst_cluster); + + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(dst, l1_buffer, SIZE); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + // Perform inter-cluster barrier. 
Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. + snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } + + // Iterations to cover all transfers in each column + uint32_t n_levels_in_col = LOG2_N_ROWS; + for (uint32_t i = 0; i < n_levels_in_col; i++) { + + // Determine which clusters are senders at every level of the tree + char row_idx_inv = N_ROWS - pb_cluster_row_idx(); + char num_active_rows = 1 << i; + char sender_stride = N_ROWS / num_active_rows; + char is_sender = (row_idx_inv % sender_stride) == 0; + + // Every active cluster sends the data to a cluster above it + if (is_sender) { + + // Calculate destination + char receiver_offset = sender_stride / 2; + uintptr_t dst_cluster = pb_calculate_cluster_idx( + pb_cluster_row_idx() + receiver_offset, pb_cluster_col_idx()); + uintptr_t dst = (uintptr_t)snrt_remote_l1_ptr((void *)l1_buffer, + snrt_cluster_idx(), dst_cluster); + + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(dst, l1_buffer, SIZE); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. 
+ snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } +} + +static inline void dma_multicast_hw(uintptr_t l1_buffer, + uintptr_t l3_buffer, snrt_comm_t comm) { + // Only DMA core of cluster 0 continues past this point + if (!snrt_is_dm_core() || !(snrt_cluster_idx() == 0)) return; + + // Hardware multicast transfer + uint64_t mask = snrt_get_collective_mask(comm); + snrt_dma_enable_multicast(mask); + snrt_mcycle(); + snrt_dma_start_1d(l1_buffer, l3_buffer, SIZE); + snrt_dma_disable_multicast(); + snrt_dma_wait_all(); +} + +// L1 buffer of every cluster invoking this function should be +// at the same offset in the TCDM +static inline void dma_multicast(uintptr_t l1_buffer, uintptr_t l3_buffer, + snrt_comm_t comm) { + if (IMPL == SEQ) + dma_multicast_sequential(l1_buffer, l3_buffer, comm); + else if (IMPL == TREE) + dma_multicast_tree(l1_buffer, l3_buffer, comm); + else if (IMPL == HW) + dma_multicast_hw(l1_buffer, l3_buffer, comm); +} // Global variables for verification script -uint32_t output[N_ELEMS * N_CLUSTERS_TO_USE]; -uint32_t n_clusters = N_CLUSTERS_TO_USE; -uint32_t length = N_ELEMS * N_CLUSTERS_TO_USE; +uint32_t output[N_ELEMS * N_ROWS * pb_cluster_num_in_row()]; +extern const uint32_t n_clusters = N_ROWS * pb_cluster_num_in_row(); +extern const uint32_t length = N_ELEMS * N_ROWS * pb_cluster_num_in_row(); int main() { - // Enable interrupts and clear pending interrupt - snrt_interrupt_enable(IRQ_M_CLUSTER); - snrt_int_clr_mcip(); // Allocate 4KiB-aligned buffer in every cluster uint32_t *buffer = (uint32_t *)snrt_l1_alloc_cluster_local(SIZE, 4096); @@ -176,12 +284,12 @@ int main() { // First cluster initializes the L3 buffer. if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { - DUMP(l3_buffer); for (uint32_t i = 0; i < N_ELEMS; i++) { l3_buffer[i] = i + 1; } } + // TODO(colluca): is this needed? 
// Every cluster in row 0 initializes its destination buffer if (snrt_is_dm_core() && pb_cluster_in_row(0)) { snrt_dma_start_1d( @@ -189,12 +297,11 @@ int main() { snrt_dma_wait_all(); } - // Create communicator for row 0 (all other clusters are inactive) - // TODO(colluca): extend to use N_CLUSTERS_TO_USE + // Create communicator for first N rows (all other clusters are inactive) snrt_comm_t comm; - pb_create_row_comm(0, &comm); + pb_create_mesh_comm(&comm, N_ROWS, pb_cluster_num_in_row()); - // Only DMA cores of clusters in the first row continue from here + // Only DMA cores of clusters in the first N rows continue from here if (!snrt_is_dm_core() || !comm->is_participant) return 0; // Print participating clusters @@ -221,20 +328,14 @@ int main() { } // Writeback to L3 - if (snrt_is_dm_core() && pb_cluster_in_row(0)) { - uint32_t offset = pb_cluster_idx_in_row() * SIZE; + if (snrt_is_dm_core() && (pb_cluster_row_idx() < N_ROWS)) { + uint32_t cluster_idx = pb_cluster_col_idx() + + pb_cluster_row_idx() * pb_cluster_num_in_row(); + uint32_t offset = cluster_idx * SIZE; snrt_dma_start_1d( (uintptr_t)output + offset, (uintptr_t)buffer, SIZE); snrt_dma_wait_all(); } - // Every cluster checks that the data in its buffer is correct - if (snrt_is_dm_core() && pb_cluster_in_row(0)) { - uint32_t n_errs = ELEMS_TO_CHECK; - uint32_t stride = N_ELEMS / ELEMS_TO_CHECK; - for (uint32_t i = 0; i < N_ELEMS; i += stride) - if (buffer[i] == (i + 1)) n_errs--; - return n_errs; - } else - return 0; - } + return 0; +} From fb21f918352cd68097fe0e7fa2a585a7682a54f5 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 9 Oct 2025 17:24:12 +0200 Subject: [PATCH 27/77] hw: Fix user width to carry full atomic ID --- floo_picobello_noc_pkg_REDUCTION.sv | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/floo_picobello_noc_pkg_REDUCTION.sv b/floo_picobello_noc_pkg_REDUCTION.sv index 90e36a05..98096c98 100644 --- a/floo_picobello_noc_pkg_REDUCTION.sv +++ 
b/floo_picobello_noc_pkg_REDUCTION.sv @@ -283,7 +283,7 @@ package floo_picobello_noc_pkg; typedef logic [63:0] axi_narrow_in_data_t; typedef logic [7:0] axi_narrow_in_strb_t; typedef logic [4:0] axi_narrow_in_id_t; - typedef logic [2:0] axi_narrow_in_user_t; + typedef logic [4:0] axi_narrow_in_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_in, axi_narrow_in_req_t, axi_narrow_in_rsp_t, axi_narrow_in_addr_t, axi_narrow_in_id_t, axi_narrow_in_data_t, axi_narrow_in_strb_t, axi_narrow_in_user_t) @@ -293,7 +293,7 @@ package floo_picobello_noc_pkg; typedef logic [63:0] axi_narrow_out_data_t; typedef logic [7:0] axi_narrow_out_strb_t; typedef logic [1:0] axi_narrow_out_id_t; - typedef logic [2:0] axi_narrow_out_user_t; + typedef logic [4:0] axi_narrow_out_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_out, axi_narrow_out_req_t, axi_narrow_out_rsp_t, axi_narrow_out_addr_t, axi_narrow_out_id_t, axi_narrow_out_data_t, axi_narrow_out_strb_t, axi_narrow_out_user_t) @@ -323,7 +323,7 @@ package floo_picobello_noc_pkg; localparam axi_cfg_t AxiCfgN = '{ AddrWidth: 48, DataWidth: 64, - UserWidth: 3, + UserWidth: 5, InIdWidth: 5, OutIdWidth: 2 }; From d254e3ffbdeeab4bf3af51830f8374bb2bae51c1 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 9 Oct 2025 18:29:53 +0200 Subject: [PATCH 28/77] experiments: Add barrier experiments --- experiments/barrier/__init__.py | 9 ++ experiments/barrier/experiments.py | 73 ++++++++++++++++ experiments/barrier/fit.py | 80 +++++++++++++++++ experiments/barrier/plot.py | 58 ++++++++++++ experiments/barrier/roi.json.tpl | 27 ++++++ sw/snitch/tests/barrier_benchmark.c | 131 ++++++++++++++++++++++++++++ 6 files changed, 378 insertions(+) create mode 100644 experiments/barrier/__init__.py create mode 100755 experiments/barrier/experiments.py create mode 100755 experiments/barrier/fit.py create mode 100755 experiments/barrier/plot.py create mode 100644 experiments/barrier/roi.json.tpl create mode 100644 sw/snitch/tests/barrier_benchmark.c diff --git 
a/experiments/barrier/__init__.py b/experiments/barrier/__init__.py new file mode 100644 index 00000000..d01b5a8f --- /dev/null +++ b/experiments/barrier/__init__.py @@ -0,0 +1,9 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from . import plot, experiments, fit + +__all__ = ["plot", "experiments", "fit"] diff --git a/experiments/barrier/experiments.py b/experiments/barrier/experiments.py new file mode 100755 index 00000000..6eea589f --- /dev/null +++ b/experiments/barrier/experiments.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import math +from pathlib import Path +import picobello as pb +import snitch.util.experiments.experiment_utils as eu +from snitch.util.experiments.SimResults import SimRegion + +TCK = 5 +DIR = Path(__file__).parent + + +class ExperimentManager(pb.ExperimentManager): + + def derive_axes(self, experiment): + return eu.derive_axes_from_keys(experiment, ['impl', 'n_clusters']) + + def derive_cdefines(self, experiment): + cdefs = { + 'IMPL': experiment['impl'].upper(), + 'N_ROWS': experiment['n_rows'], + 'N_COLS': experiment['n_cols'], + } + return cdefs + + +def gen_experiments(): + experiments = [] + # for impl in ['sw']: + # for n_clusters in [8]: + for impl in ['sw', 'hw']: + for n_clusters in [2, 4, 8, 16]: + experiments.append({ + 'app': 'barrier_benchmark', + 'cmd': pb.sim_cmd, + 'impl': impl, + 'n_clusters': n_clusters, + 'n_rows': int(math.ceil(n_clusters / 4)), + 'n_cols': n_clusters % 4 if n_clusters % 4 != 0 else 4, + }) + return experiments + + +def dma_core(row_idx, col_idx): + cluster_idx = row_idx + col_idx * 4 + return 1 + cluster_idx * 9 + 8 + + +def get_total_cycles(sim_results): + roi 
= SimRegion(f'hart_{dma_core(0, 1)}', 'barrier', 1) + return sim_results.get_timespan(roi) // TCK + + +def results(manager=None): + if manager is None: + manager = ExperimentManager(gen_experiments(), dir=DIR, parse_args=False) + return manager.get_results() + + +def main(): + manager = ExperimentManager(gen_experiments(), dir=DIR) + manager.run() + df = results(manager) + print(df) + + +if __name__ == '__main__': + main() diff --git a/experiments/barrier/fit.py b/experiments/barrier/fit.py new file mode 100755 index 00000000..9a2f66c9 --- /dev/null +++ b/experiments/barrier/fit.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import matplotlib.pyplot as plt +import numpy as np +from scipy import stats +from barrier import experiments + +BEAT_BYTES = 64 + + +def fit(x, y, quiet=True): + # Perform linear regression + res = stats.linregress(x, y) + alpha = res.intercept # α + beta = res.slope # β + r_value = res.rvalue + stderr_beta = res.stderr + stderr_alpha = res.intercept_stderr + + if not quiet: + # Print results + print(f"\talpha (intercept): {alpha:.6g}") + print(f"\tbeta (slope) : {beta:.6g}") + print(f"\tR^2 : {r_value**2:.6g}") + print(f"\tSE(beta) : {stderr_beta:.6g}") + print(f"\tSE(alpha) : {stderr_alpha:.6g}") + + # Plot results + plt.figure() + plt.scatter(x, y, s=12) + xline = np.linspace(x.min(), x.max(), 200) + yline = alpha + beta * xline + plt.plot(xline, yline, linewidth=2) + plt.xlabel('size [B]') + plt.ylabel('cycles') + plt.title("Linear fit: cycles = alpha + beta * X") + plt.show() + + # Return results + return alpha, beta + + +def get_results(): + df = experiments.results() + df['cycles'] = df['results'].apply(experiments.get_total_cycles) + return df + + +def fit_hw(quiet=True): + if not quiet: + print("Hardware barrier:") + df = get_results() + df = 
df[df['impl'] == 'hw'] + x = df['n_clusters'].to_numpy() + y = df['cycles'].to_numpy() + return fit(x, y, quiet) + + +def fit_sw(quiet=True): + if not quiet: + print("Software barrier:") + df = get_results() + df = df[df['impl'] == 'sw'] + x = df['n_clusters'].to_numpy() + y = df['cycles'].to_numpy() + return fit(x, y, quiet) + + +def main(): + fit_hw(quiet=False) + fit_sw(quiet=False) + + +if __name__ == '__main__': + main() diff --git a/experiments/barrier/plot.py b/experiments/barrier/plot.py new file mode 100755 index 00000000..c793ccda --- /dev/null +++ b/experiments/barrier/plot.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import argparse +import matplotlib.pyplot as plt +from barrier import experiments + + +def plot1(show=True): + """Plot runtime vs number of clusters.""" + + # Load results + df = experiments.results() + + # Get actual runtimes + df['cycles'] = df['results'].apply(experiments.get_total_cycles) + + # Plot + ax = df.pivot(index="n_clusters", columns="impl", values="cycles").plot(kind="bar") + ax.set_xlabel("Nr. clusters") + ax.set_ylabel("Runtime [cycles]") + ax.tick_params(axis='x', labelrotation=0) + ax.grid(axis="y", alpha=0.4) + ax.legend() + plt.tight_layout() + if show: + plt.show() + + +def main(): + + # Parse arguments + functions = [plot1] + parser = argparse.ArgumentParser(description='Plot wide multicast results.') + parser.add_argument( + 'plots', + nargs='*', + default='all', + choices=[f.__name__ for f in functions] + ['all'], + help='Plots to generate.' 
+ ) + args = parser.parse_args() + + # Helper function to check if a plot was requested on the command line + def requested(plot): + return plot in args.plots or 'all' in args.plots + + # Generate plots + if requested('plot1'): + plot1() + + +if __name__ == "__main__": + main() diff --git a/experiments/barrier/roi.json.tpl b/experiments/barrier/roi.json.tpl new file mode 100644 index 00000000..3b32be9a --- /dev/null +++ b/experiments/barrier/roi.json.tpl @@ -0,0 +1,27 @@ +<% +if experiment['n_clusters'] == 2: + clusters = [0, 4] +elif experiment['n_clusters'] == 4: + clusters = [0, 4, 8, 12] +elif experiment['n_clusters'] == 8: + clusters = [0, 4, 8, 12, 1, 5, 9, 13] +elif experiment['n_clusters'] == 16: + clusters = list(range(16)) +%> + +[ +% for cluster in clusters: + { + "thread": "${f'hart_{cluster * 9 + 8 + 1}'}", + "roi": [ + % if experiment['mode'] == 'sw': + {"idx": 1, "label": "barrier"}, + {"idx": 3, "label": "barrier"}, + % elif experiment['mode'] == 'hw': + {"idx": 3, "label": "barrier"}, + {"idx": 5, "label": "barrier"}, + % endif + ] + }, +% endfor +] diff --git a/sw/snitch/tests/barrier_benchmark.c b/sw/snitch/tests/barrier_benchmark.c new file mode 100644 index 00000000..f72f68a7 --- /dev/null +++ b/sw/snitch/tests/barrier_benchmark.c @@ -0,0 +1,131 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#include "snrt.h" + +#ifndef N_ROWS +#define N_ROWS (pb_cluster_num_in_col()) +#endif + +#ifndef N_COLS +#define N_COLS (pb_cluster_num_in_row()) +#endif + +typedef enum { + SW, + HW +} impl_t; + +#ifndef IMPL +#define IMPL SW +#endif + +static inline void sw_barrier(snrt_comm_t comm) { + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. 
+ snrt_collective_op_t op; + op.f.collective_op = SNRT_COLLECTIVE_MULTICAST; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + volatile uint32_t *mcip_set = (uint32_t *)&(snrt_cluster()->peripheral_reg.cl_clint_set.w); + volatile uint32_t *mcip_clr = (uint32_t *)&(snrt_cluster()->peripheral_reg.cl_clint_clear.w); + uint32_t size = comm->size; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Execute barrier in submesh (first iteration to preheat I$) + for (volatile uint32_t i = 0; i < 2; i++) { + // Align start times using hardware barrier + snrt_inter_cluster_barrier(comm); + + snrt_mcycle(); + + // Perform software inter-cluster barrier. + uint32_t cnt = __atomic_add_fetch(barrier_ptr, 1, __ATOMIC_RELAXED); + + // Cluster 0 polls the barrier counter, resets it for the next barrier + // and multicasts an interrupt to wake up the other clusters. Compared + // to the implementation in sync.h, this version is cache-friendly, + // i.e. successive invocations of the barrier will find a hot cache. + if (snrt_cluster_idx() == 0) { + while (*barrier_ptr != size); + *barrier_ptr = 0; + snrt_fence(); + snrt_set_awuser_low(user); + *mcip_set = 1 << snrt_cluster_compute_core_num(); + snrt_set_awuser_low(0); + } else { + snrt_wfi(); + } + // Clear interrupt for next barrier (also the sending cluster) + *mcip_clr = 1 << snrt_cluster_compute_core_num(); + snrt_mcycle(); + } +} + +static inline void hw_barrier(snrt_comm_t comm) { + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. 
+ snrt_collective_op_t op; + op.f.collective_op = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Execute barrier in submesh (first iteration to preheat I$, second to + // align start times for third iteration) + for (volatile uint32_t i = 0; i < 3; i++) { + snrt_mcycle(); + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. + snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + snrt_mcycle(); + } +} + +int main (void) { + + // Create communicator for submesh + snrt_comm_t comm; + pb_create_mesh_comm(&comm, N_ROWS, N_COLS); + + if (snrt_is_dm_core() && comm->is_participant) { + // Execute barrier in submesh + if (IMPL == SW) { + sw_barrier(comm); + } else { + hw_barrier(comm); + } + + // Wake up all cores put to sleep. + // Note: this is a workaround since multiple intersecting barriers are + // not supported yet. 
+ if (snrt_cluster_idx() == 0) { + for (uint32_t i = 0; i < snrt_cluster_num(); i++) { + uint32_t is_participant = pb_cluster_col_idx(i) < N_COLS && + pb_cluster_row_idx(i) < N_ROWS; + if (!is_participant) snrt_int_cluster_set( + 1 << snrt_cluster_compute_core_num(), i); + } + } + } + // Other clusters go to sleep + else { + if (snrt_is_dm_core()) { + snrt_wfi(); + snrt_int_clr_mcip(); + } + } + + // Make compute cores wait on cluster hardware barrier + snrt_cluster_hw_barrier(); + + return 0; +} From 49cc653ecd89ed22c2e621ac771ab1f0ed94079c Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 15 Oct 2025 14:37:00 +0200 Subject: [PATCH 29/77] sw: Extend mesh communicator to have an arbitrary base cluster --- sw/snitch/runtime/src/pb_sync.h | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/sw/snitch/runtime/src/pb_sync.h b/sw/snitch/runtime/src/pb_sync.h index 33955499..9840b4af 100644 --- a/sw/snitch/runtime/src/pb_sync.h +++ b/sw/snitch/runtime/src/pb_sync.h @@ -5,18 +5,23 @@ // Luca Colagrande inline void pb_create_mesh_comm(snrt_comm_t *comm, uint32_t n_rows, - uint32_t n_cols) { + uint32_t n_cols, uint32_t start_row = 0, uint32_t start_col = 0) { // Allocate communicator struct in L1 and point to it. *comm = (snrt_comm_t)snrt_l1_alloc_cluster_local(sizeof(snrt_comm_info_t)); - // Allocate barrier counter in L1. Only the zero-th cluster's is actually + // Allocate barrier counter in L1. Only the first cluster's is actually // used, but we want to keep all clusters' L1 allocators aligned. Thus, - // only cluster 0 initializes its barrier counter. A global barrier is - // then used to ensure all cores "see" the initialized value. + // only the first cluster initializes its barrier counter. A global barrier + // is then used to ensure all cores "see" the initialized value. 
+ uint32_t first_cluster = start_col * pb_cluster_num_in_col() + start_row; void *barrier_ptr = snrt_l1_alloc_cluster_local(sizeof(uint32_t)); - barrier_ptr = snrt_remote_l1_ptr(barrier_ptr, snrt_cluster_idx(), 0); - if (snrt_global_core_idx() == 0) *(uint32_t *)barrier_ptr = 0; + barrier_ptr = snrt_remote_l1_ptr(barrier_ptr, snrt_cluster_idx(), + first_cluster); + if ((snrt_cluster_idx() == first_cluster) && snrt_is_dm_core()) { + *(uint32_t *)barrier_ptr = 0; + snrt_fence(); + } snrt_global_barrier(); // Initialize communicator, pointing to the newly-allocated barrier @@ -24,8 +29,11 @@ inline void pb_create_mesh_comm(snrt_comm_t *comm, uint32_t n_rows, (*comm)->size = n_rows * n_cols; (*comm)->mask = ((n_cols - 1) << PB_LOG2_CLUSTER_PER_COL) | (n_rows - 1); - (*comm)->base = 0; + (*comm)->base = (start_col << PB_LOG2_CLUSTER_PER_COL) | start_row; (*comm)->barrier_ptr = (uint32_t *)barrier_ptr; - (*comm)->is_participant = (pb_cluster_row_idx() < n_rows) && - (pb_cluster_col_idx() < n_cols); + uint32_t in_row_range = (pb_cluster_row_idx() >= start_row) && + (pb_cluster_row_idx() < (start_row + n_rows)); + uint32_t in_col_range = (pb_cluster_col_idx() >= start_col) && + (pb_cluster_col_idx() < (start_col + n_cols)); + (*comm)->is_participant = in_row_range && in_col_range; } From 4091ecf01ae91334f420d5b0825bb580887166d3 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 15 Oct 2025 14:37:35 +0200 Subject: [PATCH 30/77] sw: Add more complex barrier tests --- sw/snitch/tests/overlapping_barriers.c | 29 +++++++++++++++++++ sw/snitch/tests/parallel_row_col_barriers.c | 31 +++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 sw/snitch/tests/overlapping_barriers.c create mode 100644 sw/snitch/tests/parallel_row_col_barriers.c diff --git a/sw/snitch/tests/overlapping_barriers.c b/sw/snitch/tests/overlapping_barriers.c new file mode 100644 index 00000000..f78b3153 --- /dev/null +++ b/sw/snitch/tests/overlapping_barriers.c @@ -0,0 +1,29 @@ +// 
Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#include "snrt.h" + +int main (void) { + + // Create communicator for row 0 + snrt_comm_t comm; + pb_create_mesh_comm(&comm, 1, pb_cluster_num_in_row()); + + // Make sure row-0 clusters arrive on the row-0 barrier only after the + // other clusters have arrived on the global barrier. This ensures that + // the global barrier, arriving first, takes ownership of the router, + // preventing the row-0 clusters from ever reaching the global barrier + // and consequently deadlocking the system. + if (pb_cluster_row_idx() == 0) { + for (int j = 0; j < 100; j++) snrt_nop(); + snrt_global_barrier(comm); + snrt_global_barrier(); + } else { + snrt_global_barrier(); + } + + return 0; +} diff --git a/sw/snitch/tests/parallel_row_col_barriers.c b/sw/snitch/tests/parallel_row_col_barriers.c new file mode 100644 index 00000000..136f2c5c --- /dev/null +++ b/sw/snitch/tests/parallel_row_col_barriers.c @@ -0,0 +1,31 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#include "snrt.h" + +int main (void) { + + // Create communicators for each row and column. 
+ snrt_comm_t row_comm[pb_cluster_num_in_col()]; + snrt_comm_t col_comm[pb_cluster_num_in_row()]; + for (uint32_t r = 0; r < pb_cluster_num_in_col(); r++) { + pb_create_mesh_comm(&row_comm[r], 1, pb_cluster_num_in_row(), + r, 0); + } + for (uint32_t c = 0; c < pb_cluster_num_in_row(); c++) { + pb_create_mesh_comm(&col_comm[c], pb_cluster_num_in_col(), 1, + 0, c); + } + + // Test multiple barriers in succession (row, col and global) + for (int i = 0; i < 10; i++) { + snrt_global_barrier(row_comm[pb_cluster_row_idx()]); + snrt_global_barrier(col_comm[pb_cluster_col_idx()]); + snrt_global_barrier(); + } + + return 0; +} From ec955168b8abb68933dd3c5c9f60e6071ae01b2f Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 15 Oct 2025 14:38:45 +0200 Subject: [PATCH 31/77] experiments: Extend `picobello.py` to support kernel experiments --- experiments/picobello.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/experiments/picobello.py b/experiments/picobello.py index 32ad102a..31e06069 100644 --- a/experiments/picobello.py +++ b/experiments/picobello.py @@ -27,14 +27,19 @@ def sim_and_verify_cmd(verify_script): ] -def sw_callback(target=None, build_dir=None, defines=None, dry_run=False, sync=False, **kwargs): +def sw_callback(target=None, build_dir=None, defines=None, data_cfg=None, dry_run=False, + sync=False, **kwargs): env = { 'SN_TESTS_RISCV_CFLAGS': common.join_cdefines(defines), + f'{target}_RISCV_CFLAGS': common.join_cdefines(defines), } vars = { + f'{target}_BUILD_DIR': build_dir, 'SN_TESTS_BUILDDIR': build_dir, 'DEBUG': 'ON', } + if data_cfg is not None: + vars[f'{target}_DATA_CFG'] = data_cfg env = common.extend_environment(env) return common.make( target, flags=['-j'], dir=root, vars=vars, env=env, sync=sync, From dddc5f5e065de9880a03c3f6cb033efa9545f237 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 15 Oct 2025 14:41:58 +0200 Subject: [PATCH 32/77] sw: Add `summa_gemm` kernel --- sw/snitch/apps/summa_gemm/app.mk | 17 + 
sw/snitch/apps/summa_gemm/data/params.json | 25 ++ sw/snitch/apps/summa_gemm/src/summa_gemm.c | 462 +++++++++++++++++++++ sw/sw.mk | 1 + 4 files changed, 505 insertions(+) create mode 100644 sw/snitch/apps/summa_gemm/app.mk create mode 100644 sw/snitch/apps/summa_gemm/data/params.json create mode 100644 sw/snitch/apps/summa_gemm/src/summa_gemm.c diff --git a/sw/snitch/apps/summa_gemm/app.mk b/sw/snitch/apps/summa_gemm/app.mk new file mode 100644 index 00000000..163ae610 --- /dev/null +++ b/sw/snitch/apps/summa_gemm/app.mk @@ -0,0 +1,17 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := summa_gemm +$(APP)_BUILD_DIR ?= $(PB_SNITCH_SW_DIR)/apps/$(APP)/build +SRC_DIR := $(PB_SNITCH_SW_DIR)/apps/$(APP)/src +SRCS := $(SRC_DIR)/summa_gemm.c +$(APP)_INCDIRS := $(SN_ROOT)/sw/kernels/blas $(SN_ROOT)/sw/kernels/blas/gemm/src + +# Refer to Snitch scripts +$(APP)_SCRIPT_DIR := $(SN_ROOT)/sw/kernels/blas/gemm/scripts + +include $(SN_ROOT)/sw/kernels/datagen.mk +include $(SN_ROOT)/sw/kernels/common.mk diff --git a/sw/snitch/apps/summa_gemm/data/params.json b/sw/snitch/apps/summa_gemm/data/params.json new file mode 100644 index 00000000..2b332462 --- /dev/null +++ b/sw/snitch/apps/summa_gemm/data/params.json @@ -0,0 +1,25 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +{ + setup_ssr: 1, + parallelize_m: 1, + parallelize_k: 0, + m_tiles: 4, // number of tiles in M dimension + n_tiles: 4, // number of tiles in N dimension + k_tiles: 1, // number of tiles in K dimension + load_a: 1, + load_b: 1, + load_c: 1, + double_buffer: 1, + partition_banks: 0, + transa: false, + transb: false, // must be true for SIMD + m: 32, + n: 32, + k: 8, + alpha: 1, + beta: 0, + gemm_fp: "gemm_fp64_opt" +} diff --git a/sw/snitch/apps/summa_gemm/src/summa_gemm.c b/sw/snitch/apps/summa_gemm/src/summa_gemm.c new file mode 100644 index 00000000..13840cc1 --- /dev/null +++ b/sw/snitch/apps/summa_gemm/src/summa_gemm.c @@ -0,0 +1,462 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Tim Fischer +// Luca Bertaccini +// Luca Colagrande +// Viviane Potocnik +// Lorenzo Leone + +#include "snrt.h" + +#include "blas.h" + +typedef enum { + SW_NAIVE, + SW_TREE, + HW +} mode_t; + +#ifndef MODE +#define MODE SW_NAIVE +#endif + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreorder-init-list" +#include "data.h" +#pragma clang diagnostic pop + +static inline void snrt_dma_load_2d_tile_mcast_sw( + void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, + size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, + uint32_t prec, snrt_comm_t comm) { + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. 
+ snrt_collective_op_t op; + op.f.collective_op = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Iteration 0: cluster in row 0 fetches data from memory tile + if (pb_cluster_row_idx() == 0) { + snrt_dma_load_2d_tile( + dst, src, tile_x1_idx, tile_x0_idx, + tile_x1_size, tile_x0_size, full_x0_size, + prec); + snrt_dma_wait_all(); + } + + // Iterations to cover all transfers in each column + uint32_t n_levels_in_col = PB_LOG2_CLUSTER_PER_COL; + uint32_t size = tile_x1_size * tile_x0_size * prec; + for (uint32_t i = 0; i < n_levels_in_col; i++) { + + // Determine which clusters are senders at every level of the tree + char row_idx_inv = pb_cluster_num_in_col() - pb_cluster_row_idx(); + char num_active_rows = 1 << i; + char sender_stride = pb_cluster_num_in_col() / num_active_rows; + char is_sender = (row_idx_inv % sender_stride) == 0; + + // Every active cluster sends the data to a cluster above it + if (is_sender) { + + // Calculate destination + char receiver_offset = sender_stride / 2; + uintptr_t dst_cluster = pb_calculate_cluster_idx( + pb_cluster_row_idx() + receiver_offset, pb_cluster_col_idx()); + void *remote_dst = snrt_remote_l1_ptr((void *)dst, + snrt_cluster_idx(), dst_cluster); + + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(remote_dst, dst, size); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. + snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } +} + +/** + * @brief Performs a General Matrix Multiplication (GEMM) operation on a + * Snitch-based multiple-cluster architecture with a 2D mesh topology + * using the SUMMA dataflow. 
+ * + * @param args Pointer to a `gemm_args_t` structure containing arguments + * for the GEMM operation. + * + * @details + * The function performs the following steps: + * 1. Copies the input arguments to local memory for faster access. + * 2. Calculates tile sizes based on the input dimensions and number of tiles. + * 3. Allocates space in TCDM for local copies of matrix tiles, unless + * matrix tiles are already stored in TCDM (see `load_* arguments`). + * 4. Distributes tiles to clusters for parallel processing. + * 5. Iterates over the tiles, performing the following: + * - Copies data for the current tile into local memory. + * - Performs the tile computation using the `sc_st_gemm` function. + * - Writes the result back to global memory. + */ +static inline int gemm_picobello(const gemm_args_t *args) { +#ifndef JOB_ARGS_PRELOADED + // Copy the arguments to local memory + gemm_args_t *largs = (gemm_args_t *)snrt_l1_alloc_cluster_local( + sizeof(gemm_args_t), alignof(gemm_args_t)); + if (snrt_is_dm_core()) { + snrt_dma_start_1d((void *)largs, (void *)args, sizeof(gemm_args_t)); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); +#else + const gemm_args_t *largs = args; +#endif + + // Create a communicator for each column to share the B tiles + snrt_comm_t col_comm[pb_cluster_num_in_row()]; + for (uint32_t c = 0; c < pb_cluster_num_in_row(); c++) + pb_create_mesh_comm(&col_comm[c], pb_cluster_num_in_col(), 1, + 0, c); + + // Calculate tile sizes + uint32_t tile_m = largs->m / largs->m_tiles; + uint32_t tile_n = largs->n / largs->n_tiles; + uint32_t tile_k = largs->k / largs->k_tiles; + uint32_t tile_a_size = tile_m * tile_k * largs->prec; + uint32_t tile_b_size = tile_k * tile_n * largs->prec; + uint32_t tile_c_size = tile_m * tile_n * largs->prec; + + // Allocate space for local tile buffers in TCDM, unless preloaded + void *a0, *a1, *b0, *b1, *c0, *c1; + void *la[2], *lb[2], *lc[2], *lcr; + int banks_per_buffer = snrt_cluster_compute_core_num(); + 
allocate_buffers(tile_a_size, tile_b_size, tile_c_size, largs, + banks_per_buffer, la, lb, lc, &lcr); + if (snrt_cluster_core_idx() == 0) { + DUMP(la[0]); + DUMP(la[1]); + DUMP(lb[0]); + DUMP(lb[1]); + DUMP(lc[0]); + DUMP(lc[1]); + } + snrt_cluster_hw_barrier(); + + // NoC layout (6 columns x 4 rows) + /* + // + |------| |------| |------| |------| |------| |------| + | M3 |---| C3 |---| C7 |---| C11 |---| C15 |---| M7 | + |------| |------| |------| |------| |------| |------| + | | | | | | + | | | | | | + |------| |------| |------| |------| |------| |------| + | M2 |---| C2 |---| C6 |---| C10 |---| C14 |---| M6 | + |------| |------| |------| |------| |------| |------| + | | | | | | + | | | | | | + |------| |------| |------| |------| |------| |------| + | M1 |---| C1 |---| C5 |---| C9 |---| C13 |---| M5 | + |------| |------| |------| |------| |------| |------| + | | | | | | + | | | | | | + |------| |------| |------| |------| |------| |------| + | M0 |---| C0 |---| C4 |---| C8 |---| C12 |---| M4 | + |------| |------| |------| |------| |------| |------| + // + */ + + // SUMMA dataflow: + // - Clusters in the same row calculate different tiles of the same + // row block of C, reusing the same row block of A but different column + // blocks of B. + // - Clusters in the same column calculate different tiles of the same + // column block of C, reusing the same column block of B but different + // row blocks of A. + // + // Notes: + // - K tiling not supported. 
+ + // Distribute m tiles to cluster rows and n tiles to cluster columns + uint32_t cluster_m_tiles = largs->m_tiles / pb_cluster_num_in_col(); + uint32_t cluster_n_tiles = largs->n_tiles / pb_cluster_num_in_row(); + uint32_t cluster_k_tiles = 1; + + // Calculate number of iterations + uint32_t num_tiles = cluster_m_tiles * cluster_n_tiles * cluster_k_tiles; + uint32_t num_iters = num_tiles; + if (largs->double_buffer) + num_iters += 2; + else + num_iters += 1; + + // Iterate over all tiles + for (uint32_t i = 0; i < num_iters; i++) { + // Calculate tile indices (we iterate in n->m order) + int dma_in_i = i; + int comp_i = largs->double_buffer ? i - 1 : i; + int dma_out_i = largs->double_buffer ? i - 2 : i - 1; + int dma_in_k = dma_in_i % cluster_k_tiles; + int dma_in_mn = dma_in_i / cluster_k_tiles; + int dma_in_n = dma_in_mn % cluster_n_tiles; + int dma_in_m = dma_in_mn / cluster_n_tiles; + int comp_k = comp_i % cluster_k_tiles; + int comp_mn = comp_i / cluster_k_tiles; + int comp_n = comp_mn % cluster_n_tiles; + int comp_m = comp_mn / cluster_n_tiles; + int dma_out_k = dma_out_i % cluster_k_tiles; + int dma_out_mn = dma_out_i / cluster_k_tiles; + int dma_out_n = dma_out_mn % cluster_n_tiles; + int dma_out_m = dma_out_mn / cluster_n_tiles; + + // Calculate the absolute m, n and k indices for each cluster + int dma_in_m_abs = dma_in_m + pb_cluster_row_idx() * cluster_m_tiles; + int comp_m_abs = comp_m + pb_cluster_row_idx() * cluster_m_tiles; + int dma_out_m_abs = dma_out_m + pb_cluster_row_idx() * cluster_m_tiles; + int dma_in_n_abs = dma_in_n + pb_cluster_col_idx() * cluster_n_tiles; + int comp_n_abs = comp_n + pb_cluster_col_idx() * cluster_n_tiles; + int dma_out_n_abs = dma_out_n + pb_cluster_col_idx() * cluster_n_tiles; + int dma_in_k_abs = dma_in_k; + int comp_k_abs = comp_k; + int dma_out_k_abs = dma_out_k; + + // DMA out phase + if (snrt_is_dm_core()) { + if (dma_out_i >= 0) { + snrt_mcycle(); + // Switch buffers + int buff_idx = largs->double_buffer ? 
dma_out_mn % 2 : 0; + + // Store C + // If parallelize_k, then only cluster 0 must writeback + if ((snrt_cluster_idx() == 0) || !(largs->parallelize_k)) { + if (largs->partition_banks) { + snrt_dma_2d_to_1d( + (void *)((uintptr_t)largs->c + + dma_out_m_abs * tile_c_size), + lc[buff_idx], tile_c_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_store_2d_tile(largs->c, lc[buff_idx], + dma_out_m_abs, dma_out_n_abs, tile_m, + tile_n, largs->ldc, largs->prec); + } + snrt_dma_wait_all(); + } + snrt_mcycle(); + } + } + + // DMA in phase + if (snrt_is_dm_core()) { + if (dma_in_i < num_tiles) { + snrt_mcycle(); + // Switch buffers + // When tiling on N, a row block of A is reused with multiple + // column blocks of B. There is no need to reload A on every + // iteration. The A buffer must be switched only when reloading + // A. On the other hand, the B buffer is switched every + // iteration, and the C buffer only needs to be switched after + // fully accumulating the result, i.e. after finishing the K loop. + int a_buff_idx = largs->double_buffer ? dma_in_m % 2 : 0; + int b_buff_idx = largs->double_buffer ? dma_in_i % 2 : 0; + int c_buff_idx = largs->double_buffer ? 
dma_in_mn % 2 : 0; + + // Load A + if (largs->load_a && (dma_in_n == 0)) { + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + la[a_buff_idx], + (void *)((uintptr_t)largs->a + + dma_in_m_abs * tile_a_size), + tile_a_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_load_2d_tile( + la[a_buff_idx], largs->a, dma_in_m_abs, dma_in_k_abs, + tile_m, tile_k, largs->lda, largs->prec); + } + } + + // Load B + if (largs->load_b) { + if (largs->transb) { + snrt_dma_load_2d_tile(lb[b_buff_idx], largs->b, + dma_in_n_abs, dma_in_k_abs, + tile_n, tile_k, + largs->ldb, largs->prec); + } else { + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + lb[b_buff_idx], + (void *)((uintptr_t)largs->b + + dma_in_k_abs * tile_b_size), + tile_b_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + // In SW_NAIVE mode, every cluster fetches the B + // tile it requires, independently. In all other + // modes, the clusters in row 0 multicast the B + // tile to all clusters in the same column. 
+ if (MODE == HW) { + if (pb_cluster_row_idx() == 0) { + snrt_dma_load_2d_tile_mcast( + lb[b_buff_idx], largs->b, dma_in_k_abs, + dma_in_n_abs, tile_k, tile_n, + largs->ldb, largs->prec, + col_comm[pb_cluster_col_idx()]); + } + } else if (MODE == SW_TREE) { + snrt_dma_load_2d_tile_mcast_sw( + lb[b_buff_idx], largs->b, dma_in_k_abs, + dma_in_n_abs, tile_k, tile_n, + largs->ldb, largs->prec, + col_comm[pb_cluster_col_idx()]); + } else { + snrt_dma_load_2d_tile( + lb[b_buff_idx], largs->b, dma_in_k_abs, + dma_in_n_abs, tile_k, tile_n, largs->ldb, + largs->prec); + } + } + } + } + + // Load C + // C tile is loaded only upon the first k iteration, then + // the C array will contain the partial results from the + // previous iteration + if (largs->load_c) { + if (dma_in_k_abs == 0) { + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + lc[c_buff_idx], + (void *)((uintptr_t)largs->c + + dma_in_m_abs * tile_c_size), + tile_c_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_load_2d_tile(lc[c_buff_idx], largs->c, + dma_in_m_abs, dma_in_n_abs, + tile_m, tile_n, largs->ldc, + largs->prec); + } + } else if (dma_in_k == 0) { + // Clusters other than the first need to initialize + // the C array to zero in their first iteration + if (largs->partition_banks) { + snrt_dma_1d_to_2d( + lc[c_buff_idx], snrt_cluster()->zeromem.mem, + tile_c_size, + banks_per_buffer * SNRT_TCDM_BANK_WIDTH, + SNRT_TCDM_HYPERBANK_WIDTH); + } else { + snrt_dma_start_1d(lc[c_buff_idx], + snrt_cluster()->zeromem.mem, + tile_c_size); + } + } + } + snrt_dma_wait_all(); + snrt_mcycle(); + } + } + + // Additional barrier required when not double buffering + if (!largs->double_buffer) snrt_global_barrier(); + + // Compute phase + if (comp_i >= 0 && comp_i < num_tiles) { + // Switch buffers + int a_buff_idx = largs->double_buffer ? comp_m % 2 : 0; + int b_buff_idx = largs->double_buffer ? comp_i % 2 : 0; + int c_buff_idx = largs->double_buffer ? 
comp_mn % 2 : 0; + + // Only compute cores participate in the tile computation + if (!snrt_is_dm_core()) { + // uint32_t start_cycle = snrt_mcycle(); + + // In the first k iteration we accumulate with the C matrix + // scaled by beta, in successive iterations we accumulate + // the previous partial result. The tile-level beta is thus + // a function of k: beta(k). + uint32_t beta_k = comp_k_abs == 0 ? largs->beta : 1; + + // Tile computation + sc_st_gemm_args_t sc_st_args; + sc_st_args.prec = largs->prec; + sc_st_args.setup_ssr = largs->setup_ssr; + sc_st_args.partition_banks = largs->partition_banks; + sc_st_args.transa = largs->transa; + sc_st_args.transb = largs->transb; + sc_st_args.a = la[a_buff_idx]; + if (largs->transa) { + sc_st_args.lda = tile_m; + } else if (largs->partition_banks) { + sc_st_args.lda = calculate_partitioned_banks_stride( + banks_per_buffer, tile_k, largs->prec); + } else { + sc_st_args.lda = tile_k; + } + sc_st_args.b = lb[b_buff_idx]; + if (largs->transb) { + sc_st_args.ldb = tile_k; + } else if (largs->partition_banks) { + sc_st_args.ldb = calculate_partitioned_banks_stride( + banks_per_buffer, tile_n, largs->prec); + } else { + sc_st_args.ldb = tile_n; + } + sc_st_args.beta = beta_k; + sc_st_args.c = lc[c_buff_idx]; + if (largs->partition_banks) { + sc_st_args.ldc = calculate_partitioned_banks_stride( + banks_per_buffer, tile_n, largs->prec); + } else { + sc_st_args.ldc = tile_n; + } + sc_st_args.m = tile_m; + sc_st_args.n = tile_n; + sc_st_args.k = tile_k; + sc_st_gemm(largs->gemm_fp, &sc_st_args); + + // uint32_t end_cycle = snrt_mcycle(); + } + + // Add the partial result tiles from the various clusters together + // in a logarithmic reduction fashion. + // Note: both compute and DMA cores participate in this step. 
+ if (largs->parallelize_k && (comp_k == (cluster_k_tiles - 1))) { + snrt_global_reduction_dma( + (double *)lcr, (double *)lc[c_buff_idx], tile_m * tile_n); + } + } + + // Synchronize cores after every iteration + snrt_global_barrier(); + } + + return 0; +} + + +int main () { + gemm_picobello(&args); + return 0; +} diff --git a/sw/sw.mk b/sw/sw.mk index e5d03c21..376db883 100644 --- a/sw/sw.mk +++ b/sw/sw.mk @@ -39,6 +39,7 @@ SN_APPS += $(PB_SNITCH_SW_DIR)/apps/axpy SN_APPS += $(SN_ROOT)/sw/kernels/dnn/flashattention_2 SN_APPS += $(PB_SNITCH_SW_DIR)/apps/fused_concat_linear SN_APPS += $(PB_SNITCH_SW_DIR)/apps/mha +SN_APPS += $(PB_SNITCH_SW_DIR)/apps/summa_gemm SN_TESTS = $(wildcard $(PB_SNITCH_SW_DIR)/tests/*.c) From 4c5b67a10bf6046b6743e3c18a63d17f628d0f47 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 15 Oct 2025 14:42:28 +0200 Subject: [PATCH 33/77] experiments: Add SUMMA GEMM experiments --- experiments/.gitignore | 3 +- experiments/summa_gemm/__init__.py | 9 ++ experiments/summa_gemm/cfg.json.tpl | 46 ++++++++ experiments/summa_gemm/experiments.py | 52 +++++++++ experiments/summa_gemm/model.py | 76 +++++++++++++ experiments/summa_gemm/plot.py | 154 ++++++++++++++++++++++++++ experiments/summa_gemm/roi.json.tpl | 29 +++++ 7 files changed, 368 insertions(+), 1 deletion(-) create mode 100644 experiments/summa_gemm/__init__.py create mode 100644 experiments/summa_gemm/cfg.json.tpl create mode 100755 experiments/summa_gemm/experiments.py create mode 100755 experiments/summa_gemm/model.py create mode 100755 experiments/summa_gemm/plot.py create mode 100644 experiments/summa_gemm/roi.json.tpl diff --git a/experiments/.gitignore b/experiments/.gitignore index 7a3f208a..8037a6ca 100644 --- a/experiments/.gitignore +++ b/experiments/.gitignore @@ -1,4 +1,5 @@ build/ runs/ plots/ -results/ \ No newline at end of file +results/ +data/ \ No newline at end of file diff --git a/experiments/summa_gemm/__init__.py b/experiments/summa_gemm/__init__.py new file mode 
100644 index 00000000..7858b657 --- /dev/null +++ b/experiments/summa_gemm/__init__.py @@ -0,0 +1,9 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from . import plot, experiments, model + +__all__ = ["plot", "experiments", "model"] diff --git a/experiments/summa_gemm/cfg.json.tpl b/experiments/summa_gemm/cfg.json.tpl new file mode 100644 index 00000000..4086a31a --- /dev/null +++ b/experiments/summa_gemm/cfg.json.tpl @@ -0,0 +1,46 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +//{ +// setup_ssr: 1, +// parallelize_m: 1, +// parallelize_k: 0, +// m_tiles: 4, // number of tiles in M dimension +// n_tiles: ${4 * experiment['n_tiles']}, // number of tiles in N dimension +// k_tiles: 1, // number of tiles in K dimension +// load_a: 1, +// load_b: 1, +// load_c: 1, +// double_buffer: 1, +// partition_banks: 0, +// transa: false, +// transb: false, // must be true for SIMD +// m: 32, +// n: ${32 * experiment['n_tiles']}, +// k: 8, +// alpha: 1, +// beta: 0, +// gemm_fp: "gemm_fp64_opt" +//} +{ + setup_ssr: 1, + parallelize_m: 1, + parallelize_k: 0, + m_tiles: 4, // number of tiles in M dimension + n_tiles: ${4 * experiment['n_tiles']}, // number of tiles in N dimension + k_tiles: 1, // number of tiles in K dimension + load_a: 1, + load_b: 1, + load_c: 1, + double_buffer: 1, + partition_banks: 0, + transa: false, + transb: false, // must be true for SIMD + m: 64, + n: ${64 * experiment['n_tiles']}, + k: 128, + alpha: 1, + beta: 0, + gemm_fp: "gemm_fp64_opt" +} diff --git a/experiments/summa_gemm/experiments.py b/experiments/summa_gemm/experiments.py new file mode 100755 index 00000000..ca333764 --- /dev/null +++ b/experiments/summa_gemm/experiments.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 
+# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from pathlib import Path +import picobello as pb +import snitch.util.experiments.experiment_utils as eu + +TCK = 5 +VERIFY_PY = Path(__file__).parent / '../../.deps/snitch_cluster/sw/kernels/blas/gemm/scripts/verify.py' + + +class ExperimentManager(pb.ExperimentManager): + + def derive_axes(self, experiment): + return eu.derive_axes_from_keys(experiment, ['mode', 'n_tiles']) + + def derive_cdefines(self, experiment): + cdefs = { + 'MODE': experiment['mode'].upper(), + } + return cdefs + + def derive_data_cfg(self, experiment): + return eu.derive_data_cfg_from_template(experiment) + + +def gen_experiments(): + experiments = [] + # for mode in ['hw']: + for mode in ['sw_tree']: + for n_tiles in [4]: + # for mode in ['sw_naive', 'sw_tree', 'hw']: + experiments.append({ + 'app': 'summa_gemm', + 'cmd': pb.sim_and_verify_cmd(VERIFY_PY), + 'mode': mode, + 'n_tiles': n_tiles, + }) + return experiments + + +def main(): + manager = ExperimentManager(gen_experiments()) + manager.run() + + +if __name__ == '__main__': + main() diff --git a/experiments/summa_gemm/model.py b/experiments/summa_gemm/model.py new file mode 100755 index 00000000..2326a265 --- /dev/null +++ b/experiments/summa_gemm/model.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import math +import dma_multicast_v2 as multicast +import reduction + +PREC = 8 # in bytes +BEAT_BYTES = 64 # in bytes +UTIL = 0.981 # median utilization from https://arxiv.org/pdf/2506.10921 +PEAKPERF = 16 # DPflop/cycle on a single cluster +L1SIZE = 16 * 1024 # in bytes + + +def beats(bytes): + return math.ceil(bytes / BEAT_BYTES) + + +def max_square_problem_size(): + return math.floor(math.sqrt(L1SIZE / (6 * PREC))) + + +def t_mcast(dim, bytes, impl='sw'): + n = beats(bytes) + if impl == 'sw': + return multicast.model.optimal_sw_runtime(dim, 1, n) + elif impl == 'hw': + return multicast.model.hw_runtime(dim, 1, n) + else: + raise ValueError(f"Unknown multicast implementation: {impl}") + + +def t_mcast_a(c, Mt, Kt, impl='sw'): + return t_mcast(c, Mt * Kt * PREC, impl) + + +def t_mcast_b(r, Nt, Kt, impl='sw'): + return t_mcast(r, Nt * Kt * PREC, impl) + + +def t_summa_comm(r, c, Mt, Nt, Kt, impl='sw'): + return t_mcast_a(c, Mt, Kt, impl=impl) + t_mcast_b(r, Nt, Kt, impl=impl) + + +def t_comp(Mt, Nt, Kt): + return (2 * Mt * Nt * Kt) / (UTIL * PEAKPERF) + + +def t_summa_gemm(r, c, Mt, Nt, Kt, impl='sw'): + return max(t_summa_comm(r, c, Mt, Nt, Kt, impl=impl), t_comp(Mt, Nt, Kt)) + + +def t_fcl_comm(r, c, Mt, Nt, Kt, impl='sw'): + # Time to load c*r submatrices of B (each of size Nt x Kt) + # from r memory tiles + return c * r * beats(Nt * Kt * PREC) / r + + +def t_reduction(r, c, bytes, impl='sw'): + n = beats(bytes) + if impl == 'sw': + return reduction.model.optimal_sw_runtime(c, r, n) + elif impl == 'hw': + return reduction.model.hw_simple_runtime(c, r, n) + else: + raise ValueError(f"Unknown reduction implementation: {impl}") + + +def t_fcl_gemm(r, c, Mt, Nt, Kt, impl='sw'): + t_partial_result = max(t_fcl_comm(r, c, Mt, Nt, Kt, impl=impl), t_comp(Mt, Nt, Kt)) + t_redu = t_reduction(r, c, Mt * Nt * PREC, impl=impl) + return t_partial_result + t_redu diff --git a/experiments/summa_gemm/plot.py 
b/experiments/summa_gemm/plot.py new file mode 100755 index 00000000..e19efc5b --- /dev/null +++ b/experiments/summa_gemm/plot.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import argparse +import matplotlib.pyplot as plt +from matplotlib.ticker import FuncFormatter +import pandas as pd +from summa_gemm import model + + +def plot1(show=True): + # Get data + mesh_sizes = [4, 8, 16, 32, 64, 128, 256] + t_comm_sw = [] + t_comm_hw = [] + t_compute = [] + Mt = model.max_square_problem_size() + for c in mesh_sizes: + r = c + t_comm_sw.append(model.t_summa_comm(r, c, Mt, Mt, Mt, impl='sw')) + t_comm_hw.append(model.t_summa_comm(r, c, Mt, Mt, Mt, impl='hw')) + t_compute.append(model.t_comp(Mt, Mt, Mt)) + + df = pd.DataFrame({ + 'size': mesh_sizes, + 't_comm_sw': t_comm_sw, + 't_comm_hw': t_comm_hw, + 't_compute': t_compute, + }) + + # Prepare critical paths and speedups + df['t_sw'] = df[['t_comm_sw', 't_compute']].max(axis=1) + df['t_hw'] = df[['t_comm_hw', 't_compute']].max(axis=1) + df['speedup'] = df['t_sw'] / df['t_hw'] + + # Plot curves + fig, ax = plt.subplots() + ax.plot(mesh_sizes, t_comm_sw, marker='o', label='$T_{comm}$ (sw)') + ax.plot(mesh_sizes, t_comm_hw, marker='s', label='$T_{comm}$ (hw)') + ax.plot(mesh_sizes, t_compute, marker='^', label='$T_{comp}$') + ax.set_xticks(mesh_sizes) + ax.set_xscale('log', base=2) + ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: f"{int(x)}x{int(x)}")) + ax.set_xlabel('Mesh size') + ax.set_ylabel('Runtime [cycles]') + ax.grid(True, which="both", alpha=0.5) + ax.legend() + fig.tight_layout() + + # Annotate plot with speedup arrows (when speedup > 1) + keys = ['size', 't_comm_sw', 't_comm_hw', 't_compute', 'speedup'] + for x, sw, hw, t, sp in df[keys].itertuples(index=False, name=None): + if sp > 1.0: + y0 = sw + y1 = max(hw, 
t) + + # Double-headed arrow from t_comm_sw to max(t_comm_hw, t_compute) + ax.annotate( + "", + xy=(x, y1), + xytext=(x, y0), + arrowprops=dict(arrowstyle="<->", lw=1.2), + annotation_clip=False, + ) + + # Arithmetic midpoint for linear scale + y_mid = 0.5 * (y0 + y1) + + offset = 3 + if sp > 1.2: + ha = "right" + xytext = (-offset, 0) + else: + ha = "left" + xytext = (offset, 0) + ax.annotate( + f"{sp:.2f}$\\times$", + xy=(x, y_mid), + xytext=xytext, + textcoords="offset points", + ha=ha, + va="center", + rotation=90 if ax.get_yscale() == "log" else 0, # optional + annotation_clip=True, + ) + + if show: + plt.show() + + return df + + +def plot2(show=True): + # Get data + mesh_sizes = [4, 8, 16, 32, 64, 128, 256] + Mt = model.max_square_problem_size() + t_sw = [] + t_hw = [] + for size in mesh_sizes: + t_sw.append(model.t_fcl_gemm(size, size, Mt, Mt, Mt, impl='sw')) + t_hw.append(model.t_fcl_gemm(size, size, Mt, Mt, Mt, impl='hw')) + df = pd.DataFrame({ + 'size': mesh_sizes, + 't_sw': t_sw, + 't_hw': t_hw, + }) + + # Calculate speedup + df['speedup'] = df['t_sw'] / df['t_hw'] + + # Plot + fig, ax = plt.subplots() + ax.bar(df['size'].astype(str), df['speedup']) + ax.set_xlabel('Mesh size') + ax.set_ylabel('Speedup') + ax.grid(axis='y', alpha=0.5) + fig.tight_layout() + + if show: + plt.show() + + return df + + +def main(): + # Parse arguments + functions = [plot1, plot2] + parser = argparse.ArgumentParser(description='Plot wide multicast results.') + parser.add_argument( + 'plots', + nargs='*', + default='all', + choices=[f.__name__ for f in functions] + ['all'], + help='Plots to generate.' 
+ ) + args = parser.parse_args() + + # Helper function to check if a plot was requested on the command line + def requested(plot): + return plot in args.plots or 'all' in args.plots + + # Generate plots + if requested('plot1'): + plot1() + if requested('plot2'): + plot2() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/experiments/summa_gemm/roi.json.tpl b/experiments/summa_gemm/roi.json.tpl new file mode 100644 index 00000000..4ed347bd --- /dev/null +++ b/experiments/summa_gemm/roi.json.tpl @@ -0,0 +1,29 @@ +<% n_tiles = experiment['n_tiles'] %> +[ + % for cluster in range(0,16): + // Compute cores + % for j in range(0, 8): + { + "thread": "${f'hart_{cluster * 9 + j + 1}'}", + "roi": [ + % for i in range(0, n_tiles): + {"idx": ${2 * i + 1}, "label": "${f'tile_{i}'}"}, + % endfor + ] + }, + % endfor + + // DMA core + { + "thread": "${f'hart_{cluster * 9 + 8 + 1}'}", + "roi": [ + {"idx": 1, "label": "${f'tile_in_0'}"}, + % for i in range(0, n_tiles - 1): + {"idx": ${4*i + 3}, "label": "${f'tile_in_{i+1}'}"}, + {"idx": ${4*i + 5}, "label": "${f'tile_out_{i}'}"}, + % endfor + {"idx": ${n_tiles * 4 - 1}, "label": "${f'tile_out_{n_tiles-1}'}"}, + ] + }, + % endfor +] From 5a1c6fd4ae321a50366d70c5281e06832a042374 Mon Sep 17 00:00:00 2001 From: Raphael Date: Sun, 5 Oct 2025 18:04:50 +0200 Subject: [PATCH 34/77] hw: Fix reduction + VC Read/Write --- hw/picobello_pkg.sv | 5 + sw/snitch/tests/bench_hw_reduction_1stage.c | 174 ++++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 sw/snitch/tests/bench_hw_reduction_1stage.c diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index ca659a37..f7f10487 100644 --- a/hw/picobello_pkg.sv +++ b/hw/picobello_pkg.sv @@ -400,6 +400,7 @@ package picobello_pkg; ///////////////////// // Support Reduction on the Wide port + // TODO(lleone): To be removed and merged into the routecfg localparam bit EnWideOffloadReduction = 1; localparam bit EnNarrowOffloadReduction = 1; localparam bit 
EnParallelReduction = 1; @@ -461,6 +462,10 @@ function automatic floo_pkg::collect_op_cfg_t gen_collect_op_cfg(); ret.EnNarrowMulticast = 1'b1; ret.EnWideMulticast = 1'b1; ret.EnLSBAnd = 1'b1; + ret.EnF_Add = 1'b1; + ret.EnF_Mul = 1'b1; + ret.EnF_Min = 1'b1; + ret.EnF_Max = 1'b1; return ret; endfunction diff --git a/sw/snitch/tests/bench_hw_reduction_1stage.c b/sw/snitch/tests/bench_hw_reduction_1stage.c new file mode 100644 index 00000000..1cba5ad9 --- /dev/null +++ b/sw/snitch/tests/bench_hw_reduction_1stage.c @@ -0,0 +1,174 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Raphael Roth +// +// This code can be used to benchmark the hardware reduction feature of the picobello +// system. It can reduce double data directly in flight with the internal DMA. +// It is possible to set the target cluster freely. +// The reduction is done in one stage for all clusters. + +#include +#include "pb_addrmap.h" +#include "snrt.h" + +// use this define to enable "row" reduction instead of columne reduction +// has only a effect if Number of cluster is 8! +#ifndef REDUCE_IN_ROW +#define REDUCE_IN_ROW 0 +#endif + +// Benchmark Parameter: +#ifndef NUMBER_OF_CLUSTERS +#define NUMBER_OF_CLUSTERS 16 // Needs to be either 4 / 8 / 16 +#endif + +#ifndef TARGET_CLUSTER +#define TARGET_CLUSTER 0 +#endif + +#ifndef DATA_BYTE +#define DATA_BYTE 8192 +#endif + +// Translate from byte into doubles +#ifndef DATA_LENGTH +#define DATA_LENGTH (DATA_BYTE/8) +#endif + +#define DATA_EVAL_LENGTH (DATA_LENGTH) + +// Define the reduction mask depending on the number of involved clusters +// When we reduce in the row then the mask is hardcoded! 
+#if REDUCE_IN_ROW == 0 + #define REDUCTION_MASK ((NUMBER_OF_CLUSTERS - 1) << 18) +#else + #define GROUND_MASK (((NUMBER_OF_CLUSTERS == 4) * 12) + ((NUMBER_OF_CLUSTERS == 8) * 13) + ((NUMBER_OF_CLUSTERS == 16) * 15)) + #define REDUCTION_MASK (GROUND_MASK << 18) +#endif + +/** + * @brief Return if the cluster is involved in the reduction or not. + * @param cluster_nr cluster id + */ +static inline int cluster_participates_in_reduction(int cluster_nr) { +#if REDUCE_IN_ROW == 0 + return (cluster_nr < NUMBER_OF_CLUSTERS); +#else + if(NUMBER_OF_CLUSTERS == 4){ + return ((cluster_nr % 4) == 0); + } else if(NUMBER_OF_CLUSTERS == 8){ + return (((cluster_nr % 4) == 0) || ((cluster_nr % 4) == 1)); + } else { + return (cluster_nr < NUMBER_OF_CLUSTERS); + } +#endif +} + +/** + * @brief Verify if the data are properly copied. Please adapt the algorythm if you change the data generation + * @param cluster_nr cluster id + * @param ptrData pointer to fetch the data from + */ +static inline uint32_t cluster_verify_reduction(int cluster_nr, double * ptrData) { + // Evaluate the reduction result + if (snrt_is_dm_core() && (cluster_nr == TARGET_CLUSTER)) { + uint32_t n_errs = DATA_EVAL_LENGTH; +#if REDUCE_IN_ROW == 0 + double base_value = (NUMBER_OF_CLUSTERS * 15.0) + (double) (((NUMBER_OF_CLUSTERS-1) * ((NUMBER_OF_CLUSTERS-1) + 1)) >> 1); +#else + double base_value = 0.0; + if(NUMBER_OF_CLUSTERS == 4){ + base_value = (NUMBER_OF_CLUSTERS * 15.0) + 24.0; + } else if(NUMBER_OF_CLUSTERS == 8){ + base_value = (NUMBER_OF_CLUSTERS * 15.0) + 52.0; + } else { + base_value = (NUMBER_OF_CLUSTERS * 15.0) + 120.0; + } +#endif + for (uint32_t i = 0; i < DATA_EVAL_LENGTH; i++) { + if (*ptrData == base_value){ + n_errs--; + } + base_value = base_value + (double) NUMBER_OF_CLUSTERS; + ptrData = ptrData + 1; + } + return n_errs; + } else { + return 0; + } +} + +// Main code +int main() { + snrt_interrupt_enable(IRQ_M_CLUSTER); + + // Sanity check: + if(((NUMBER_OF_CLUSTERS % 4) != 0) && 
(NUMBER_OF_CLUSTERS != 12)){ + // Number of cluster should be a power of 2 (for mask generation) + return 1; + } + + // Cluster ID + uint32_t cluster_id = snrt_cluster_idx(); + + // Set the mask for the multicast + uint64_t mask = REDUCTION_MASK; + + // Generate unique data for the reduction + double fill_value = 42.0; + double init_data = 15.0 + (double) cluster_id; + + // Allocate destination buffer + double *buffer_dst = (double*) snrt_l1_next_v2(); + double *buffer_src = buffer_dst + DATA_LENGTH; + + // Determint the target address + double *buffer_target = (double*) snrt_remote_l1_ptr(buffer_dst, cluster_id, TARGET_CLUSTER); + volatile double *buffer_last_entry = (volatile double *) (buffer_dst + DATA_LENGTH - 1); + + // Fill the source buffer with the init data + if (snrt_is_dm_core()) { + for (uint32_t i = 0; i < DATA_LENGTH; i++) { + buffer_src[i] = init_data + (double) i; + buffer_dst[i] = fill_value; + } + } + + // Do the transmission 3 times to preheat the cache + for(volatile int i = 0; i < 3; i++){ + // Reset the last entry from prior loop (if @ end of loop > check at end fails) + if((cluster_id == TARGET_CLUSTER) && snrt_is_dm_core()){ + *buffer_last_entry = fill_value; + } + + // Wait until the cluster are finished + snrt_global_barrier(); + + // Perf. analysis + snrt_mcycle(); + + // Init the DMA multicast + if (snrt_is_dm_core() && cluster_participates_in_reduction(cluster_id)) { + snrt_dma_start_1d_reduction(buffer_target, buffer_src, DATA_LENGTH * sizeof(double), mask, SNRT_REDUCTION_FADD); + // Only waits until last w is transmitted but not until b response arrives! + snrt_dma_wait_all(); + } + + // Target cluster polls the last array entry to know when we wrote the data + if((cluster_id == TARGET_CLUSTER) && snrt_is_dm_core()){ + while(*buffer_last_entry == fill_value); + } + + // Perf. 
analysis + snrt_mcycle(); + + } + + // Sync all cores + snrt_global_barrier(); + + // Evaluate the reduction result + return cluster_verify_reduction(cluster_id, buffer_dst); +} From 397cb0c7738af5adf884c8aa6e197cf75406a7f4 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Thu, 9 Oct 2025 10:12:35 +0200 Subject: [PATCH 35/77] hw: Align with latest FlooNoC collective configuration --- floo_picobello_noc_pkg_REDUCTION.sv | 10 +++---- hw/cheshire_tile.sv | 4 --- hw/cluster_tile.sv | 7 ++--- hw/dummy_tile.sv | 4 --- hw/fhg_spu_tile.sv | 4 --- hw/mem_tile.sv | 4 --- hw/picobello_pkg.sv | 42 +---------------------------- hw/spm_tile.sv | 4 --- 8 files changed, 8 insertions(+), 71 deletions(-) diff --git a/floo_picobello_noc_pkg_REDUCTION.sv b/floo_picobello_noc_pkg_REDUCTION.sv index 98096c98..191d1f55 100644 --- a/floo_picobello_noc_pkg_REDUCTION.sv +++ b/floo_picobello_noc_pkg_REDUCTION.sv @@ -263,18 +263,18 @@ package floo_picobello_noc_pkg; IdAddrOffset: 0, NumSamRules: 29, NumRoutes: 0, - EnMultiCast: 1'b1, - EnParallelReduction: 1'b1, - EnNarrowOffloadReduction: 1'b1, - EnWideOffloadReduction: 1'b1, CollectiveCfg: '{ OpCfg: '{ EnNarrowMulticast: 1'b1, EnWideMulticast: 1'b1, EnLSBAnd: 1'b1, + EnF_Add: 1'b1, + EnF_Mul: 1'b1, + EnF_Min: 1'b1, + EnF_Max: 1'b1, default: '0 }, - SequentialRedCfg: ReductionDefaultCfg + RedCfg: ReductionDefaultCfg } }; diff --git a/hw/cheshire_tile.sv b/hw/cheshire_tile.sv index 42f0b56d..438705d3 100644 --- a/hw/cheshire_tile.sv +++ b/hw/cheshire_tile.sv @@ -120,10 +120,6 @@ module cheshire_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .EnMultiCast (1'b0), - .EnParallelReduction (1'b0), - .EnOffloadWideReduction (1'b0), - .EnOffloadNarrowReduction (1'b0), .EnDecoupledRW (1'b1) ) i_router ( .clk_i, diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index fbd48060..f1aef055 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -111,7 +111,7 @@ module cluster_tile // TODO(raroth): possible to 
remove this decode from the picobello repo and move it inside the // FlooNoC repo. Currently the Decode used for the ALU is directly inside the floo_alu.sv // file. Maybe do the same for the FPU - if(EnWideOffloadReduction) begin : gen_wide_offload_reduction + if(is_en_wide_reduction(RouteCfg.CollectiveCfg.OpCfg)) begin : gen_wide_offload_reduction // Connect the DCA Request assign offload_dca_req.q_valid = offload_wide_req_valid; assign offload_wide_req_ready = offload_dca_rsp.q_ready; @@ -351,10 +351,6 @@ module cluster_tile .AxiCfgN (AxiCfgN), .AxiCfgW (AxiCfgW), .RouteAlgo (RouteCfg.RouteAlgo), - .EnMultiCast (RouteCfg.EnMultiCast), - .EnParallelReduction (EnParallelReduction), - .EnOffloadWideReduction (EnWideOffloadReduction), - .EnOffloadNarrowReduction (EnNarrowOffloadReduction), .EnDecoupledRW (1'b1), .NoLoopback (1'b0), .NumRoutes (5), @@ -370,6 +366,7 @@ module cluster_tile .RdNarrowOperation_t (floo_pkg::collect_op_e), .RdWideData_t (RdDataWide_t), .RdNarrowData_t (RdDataNarrow_t), + .CollectiveOpCfg (RouteCfg.CollectiveCfg.OpCfg), .RdWideCfg (WideReductionCfg), .RdNarrowCfg (NarrowReductionCfg), .RdRespCfg (ResponseReductionCfg) diff --git a/hw/dummy_tile.sv b/hw/dummy_tile.sv index 0908e66f..9f80a5cc 100644 --- a/hw/dummy_tile.sv +++ b/hw/dummy_tile.sv @@ -43,10 +43,6 @@ module dummy_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .EnMultiCast (1'b0), - .EnParallelReduction (1'b0), - .EnOffloadWideReduction (1'b0), - .EnOffloadNarrowReduction (1'b0), .EnDecoupledRW (1'b1) ) i_router ( .clk_i, diff --git a/hw/fhg_spu_tile.sv b/hw/fhg_spu_tile.sv index a9da71b8..385d21c0 100644 --- a/hw/fhg_spu_tile.sv +++ b/hw/fhg_spu_tile.sv @@ -61,10 +61,6 @@ module fhg_spu_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .EnMultiCast (1'b0), - .EnParallelReduction (1'b0), - .EnOffloadWideReduction (1'b0), - .EnOffloadNarrowReduction (1'b0), .EnDecoupledRW (1'b1) ) i_router ( .clk_i, diff 
--git a/hw/mem_tile.sv b/hw/mem_tile.sv index d2399252..9ddbfa83 100644 --- a/hw/mem_tile.sv +++ b/hw/mem_tile.sv @@ -60,10 +60,6 @@ module mem_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .EnMultiCast (1'b0), - .EnParallelReduction (1'b0), - .EnOffloadWideReduction (1'b0), - .EnOffloadNarrowReduction (1'b0), .EnDecoupledRW (1'b1) ) i_router ( .clk_i, diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index f7f10487..fc5a162d 100644 --- a/hw/picobello_pkg.sv +++ b/hw/picobello_pkg.sv @@ -346,11 +346,8 @@ package picobello_pkg; function automatic floo_pkg::route_cfg_t gen_nomcast_route_cfg(); floo_pkg::route_cfg_t ret = floo_picobello_noc_pkg::RouteCfg; // Disable multicast for non-cluster tiles - ret.EnMultiCast = 1'b0; - ret.EnParallelReduction = 1'b0; - ret.EnNarrowOffloadReduction = 1'b0; - ret.EnWideOffloadReduction = 1'b0; ret.CollectiveCfg = CollectiveDefaultCfg; + ret.CollectiveCfg.RedCfg.RdSupportLoopback = 1'b0; return ret; endfunction @@ -399,12 +396,6 @@ package picobello_pkg; // REDUCTION // ///////////////////// - // Support Reduction on the Wide port - // TODO(lleone): To be removed and merged into the routecfg - localparam bit EnWideOffloadReduction = 1; - localparam bit EnNarrowOffloadReduction = 1; - localparam bit EnParallelReduction = 1; - // TODO (lleone): Add generation of the follwing types ans structs into Floogen typedef struct packed { user_mask_t collective_mask; @@ -455,37 +446,6 @@ localparam reduction_cfg_t NarrowReductionCfg = '{ default: '0 }; - // TODO (lleone): Get rid of all this configuration struct, make only one to pass to the system - // with enough info to decide what to do with the different routers -function automatic floo_pkg::collect_op_cfg_t gen_collect_op_cfg(); - floo_pkg:: collect_op_cfg_t ret = floo_pkg::CollectiveOpDefaultCfg; - ret.EnNarrowMulticast = 1'b1; - ret.EnWideMulticast = 1'b1; - ret.EnLSBAnd = 1'b1; - ret.EnF_Add = 1'b1; - ret.EnF_Mul = 1'b1; - ret.EnF_Min = 
1'b1; - ret.EnF_Max = 1'b1; - return ret; -endfunction - -localparam collect_op_cfg_t CollectOpCfg = gen_collect_op_cfg(); - -localparam collective_cfg_t CollectCfgNarrow = '{ - OpCfg: CollectOpCfg, - SequentialRedCfg: NarrowReductionCfg -}; - -localparam collective_cfg_t CollectCfgWide = '{ - OpCfg: CollectOpCfg, - SequentialRedCfg: WideReductionCfg -}; - -localparam collective_cfg_t CollectCfgRsp = '{ - OpCfg: CollectOpCfg, - SequentialRedCfg: ResponseReductionCfg -}; - //////////////// // Cheshire // //////////////// diff --git a/hw/spm_tile.sv b/hw/spm_tile.sv index 1ce3d56d..7aa85b05 100644 --- a/hw/spm_tile.sv +++ b/hw/spm_tile.sv @@ -95,10 +95,6 @@ module spm_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .EnMultiCast (1'b0), - .EnParallelReduction (1'b0), - .EnOffloadWideReduction (1'b0), - .EnOffloadNarrowReduction (1'b0), .EnDecoupledRW (1'b1) ) i_router ( .clk_i, From 89787a881c7c365afe61cf9f5e076776fc21de6a Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Fri, 17 Oct 2025 14:01:24 +0200 Subject: [PATCH 36/77] hw: Add paramterization for internal offload cut --- hw/cluster_tile.sv | 5 +++-- hw/picobello_pkg.sv | 7 +++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index f1aef055..2ffe1578 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -169,11 +169,12 @@ module cluster_tile // TODO(raroth): move these spill register inside FlooNoC and make them configurable. 
// Insert a reqrsp-cut to avoid timing violations + //If teh CutOffloadIntf is enabled, the cut is already in the offload controller, you can bypass teh one below generic_reqrsp_cut #( .req_chan_t (snitch_cluster_pkg::dca_req_chan_t), .rsp_chan_t (snitch_cluster_pkg::dca_rsp_chan_t), - .BypassReq (1'b0), - .BypassRsp (1'b0) + .BypassReq (WideReductionCfg.CutOffloadIntf), + .BypassRsp (WideReductionCfg.CutOffloadIntf) ) i_dca_router_cut ( .clk_i (clk_i), .rst_ni (rst_ni), diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index fc5a162d..3c1568e0 100644 --- a/hw/picobello_pkg.sv +++ b/hw/picobello_pkg.sv @@ -347,6 +347,7 @@ package picobello_pkg; floo_pkg::route_cfg_t ret = floo_picobello_noc_pkg::RouteCfg; // Disable multicast for non-cluster tiles ret.CollectiveCfg = CollectiveDefaultCfg; + // DIsbale loop back ret.CollectiveCfg.RedCfg.RdSupportLoopback = 1'b0; return ret; endfunction @@ -424,7 +425,8 @@ localparam reduction_cfg_t WideReductionCfg = '{ RdTagBits: 5, RdSupportAxi: 1'b1, RdEnableBypass: 1'b1, - RdSupportLoopback: 1'b1 + RdSupportLoopback: 1'b1, + CutOffloadIntf: 1'b1 }; // raroth - overwrite benchmark autotest - end @@ -437,7 +439,8 @@ localparam reduction_cfg_t NarrowReductionCfg = '{ RdTagBits: 5, RdSupportAxi: 1'b1, RdEnableBypass: 1'b1, - RdSupportLoopback: 1'b1 + RdSupportLoopback: 1'b1, + CutOffloadIntf: 1'b0 }; localparam reduction_cfg_t ResponseReductionCfg = '{ From 7a792138b4a04161175de638df1fc379c10aa23d Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Fri, 17 Oct 2025 15:29:41 +0200 Subject: [PATCH 37/77] hw: Adapt Picobello to multi ejection channels --- hw/cheshire_tile.sv | 16 ++++++++++++---- hw/cluster_tile.sv | 13 ++++--------- hw/dummy_tile.sv | 11 +++++++++-- hw/fhg_spu_tile.sv | 12 +++++++++--- hw/mem_tile.sv | 12 ++++++++++-- hw/picobello_pkg.sv | 7 +++++++ hw/spm_tile.sv | 13 +++++++++++-- 7 files changed, 62 insertions(+), 22 deletions(-) diff --git a/hw/cheshire_tile.sv b/hw/cheshire_tile.sv index 438705d3..66430503 
100644 --- a/hw/cheshire_tile.sv +++ b/hw/cheshire_tile.sv @@ -106,7 +106,8 @@ module cheshire_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; - floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_in; + floo_wide_double_t [Eject:North] router_floo_wide_out; floo_nw_router #( .AxiCfgN (AxiCfgN), @@ -120,6 +121,7 @@ module cheshire_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), + .floo_wide_out_t (floo_wide_double_t), .EnDecoupledRW (1'b1) ) i_router ( .clk_i, @@ -165,8 +167,13 @@ module cheshire_tile assign router_floo_rsp_in[North] = '0; // No North port in this tile assign router_floo_rsp_in[East] = '0; // No East port in this tile assign router_floo_rsp_in[South] = floo_rsp_south_i; - assign floo_wide_west_o = router_floo_wide_out[West]; - assign floo_wide_south_o = router_floo_wide_out[South]; + //TODO(colluca): Merge with floo_wide + assign floo_wide_west_o.valid = router_floo_wide_out[West].valid; + assign floo_wide_west_o.ready = router_floo_wide_out[West].ready; + assign floo_wide_west_o.wide = router_floo_wide_out[West].wide[0]; + assign floo_wide_south_o.valid = router_floo_wide_out[South].valid; + assign floo_wide_south_o.ready = router_floo_wide_out[South].ready; + assign floo_wide_south_o.wide = router_floo_wide_out[South].wide[0]; assign router_floo_wide_in[West] = floo_wide_west_i; assign router_floo_wide_in[North] = '0; // No North port in this tile assign router_floo_wide_in[East] = '0; // No East port in this tile @@ -211,6 +218,7 @@ module cheshire_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), + .floo_wide_in_t (floo_wide_double_t), .user_narrow_struct_t (collective_narrow_user_t), .user_wide_struct_t (collective_wide_user_t) ) i_chimney ( @@ -503,7 +511,7 @@ module cheshire_tile `ASSERT(NoCollectivOperation_NWide_In, 
(!router_floo_wide_in[r].valid | (router_floo_wide_in[r].wide.generic.hdr.collective_op == Unicast))) `ASSERT(NoCollectivOperation_NReq_Out, (!router_floo_req_out[r].valid | (router_floo_req_out[r].req.generic.hdr.collective_op == Unicast))) `ASSERT(NoCollectivOperation_NRsp_Out, (!router_floo_rsp_out[r].valid | (router_floo_rsp_out[r].rsp.generic.hdr.collective_op == Unicast))) - `ASSERT(NoCollectivOperation_NWide_Out, (!router_floo_wide_out[r].valid | (router_floo_wide_out[r].wide.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_Out, (!router_floo_wide_out[r].valid | (router_floo_wide_out[r].wide[0].generic.hdr.collective_op == Unicast))) end endmodule diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 2ffe1578..80c709bb 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -337,18 +337,13 @@ module cluster_tile // Router // //////////// - typedef struct packed { - logic [1:0] valid; - logic [1:0] ready; - floo_wide_chan_t [1:0] wide; - } floo_wide_double_t; - floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; - floo_wide_double_t [Eject:North] router_floo_wide_out; floo_wide_t [Eject:North] router_floo_wide_in; + floo_wide_double_t [Eject:North] router_floo_wide_out; + - floo_nw_router_v2 #( + floo_nw_router #( .AxiCfgN (AxiCfgN), .AxiCfgW (AxiCfgW), .RouteAlgo (RouteCfg.RouteAlgo), @@ -419,7 +414,7 @@ module cluster_tile // Chimney // ///////////// - floo_nw_chimney_v2 #( + floo_nw_chimney #( .AxiCfgN (floo_picobello_noc_pkg::AxiCfgN), .AxiCfgW (floo_picobello_noc_pkg::AxiCfgW), .ChimneyCfgN (floo_pkg::ChimneyDefaultCfg), diff --git a/hw/dummy_tile.sv b/hw/dummy_tile.sv index 9f80a5cc..3d3c8a20 100644 --- a/hw/dummy_tile.sv +++ b/hw/dummy_tile.sv @@ -29,7 +29,8 @@ module dummy_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; - floo_wide_t [Eject:North] 
router_floo_wide_out, router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_in; + floo_wide_double_t [Eject:North] router_floo_wide_out; floo_nw_router #( .AxiCfgN (AxiCfgN), @@ -43,6 +44,7 @@ module dummy_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), + .floo_wide_out_t (floo_wide_double_t), .EnDecoupledRW (1'b1) ) i_router ( .clk_i, @@ -80,7 +82,12 @@ module dummy_tile assign router_floo_req_in[West:North] = floo_req_i; assign floo_rsp_o = router_floo_rsp_out[West:North]; assign router_floo_rsp_in[West:North] = floo_rsp_i; - assign floo_wide_o = router_floo_wide_out[West:North]; + // Only the local port uses both physical channels. Other outputs use only the lower. + for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o + assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; + assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; + assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; + end assign router_floo_wide_in[West:North] = floo_wide_i; // Tie the router’s Eject input ports to 0 diff --git a/hw/fhg_spu_tile.sv b/hw/fhg_spu_tile.sv index 385d21c0..03c8db90 100644 --- a/hw/fhg_spu_tile.sv +++ b/hw/fhg_spu_tile.sv @@ -47,7 +47,8 @@ module fhg_spu_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; - floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_in; + floo_wide_double_t [Eject:North] router_floo_wide_out; floo_nw_router #( .AxiCfgN (AxiCfgN), @@ -61,6 +62,7 @@ module fhg_spu_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), + .floo_wide_out_t (floo_wide_double_t), .EnDecoupledRW (1'b1) ) i_router ( .clk_i, @@ -106,8 +108,12 @@ module fhg_spu_tile assign router_floo_rsp_in[South] = '0; // No South port in this tile assign router_floo_rsp_in[East] = '0; // No East port in this tile assign 
router_floo_rsp_in[North] = floo_rsp_north_i; - assign floo_wide_west_o = router_floo_wide_out[West]; - assign floo_wide_north_o = router_floo_wide_out[North]; + assign floo_wide_west_o.valid = router_floo_wide_out[West].valid; + assign floo_wide_west_o.ready = router_floo_wide_out[West].ready; + assign floo_wide_west_o.wide = router_floo_wide_out[West].wide[0]; + assign floo_wide_north_o.valid = router_floo_wide_out[North].valid; + assign floo_wide_north_o.ready = router_floo_wide_out[North].ready; + assign floo_wide_north_o.wide = router_floo_wide_out[North].wide[0]; assign router_floo_wide_in[West] = floo_wide_west_i; assign router_floo_wide_in[South] = '0; // No South port in this tile assign router_floo_wide_in[East] = '0; // No East port in this tile diff --git a/hw/mem_tile.sv b/hw/mem_tile.sv index 9ddbfa83..c6080daf 100644 --- a/hw/mem_tile.sv +++ b/hw/mem_tile.sv @@ -46,7 +46,8 @@ module mem_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; - floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_in; + floo_wide_double_t [Eject:North] router_floo_wide_out; floo_nw_router #( .AxiCfgN (AxiCfgN), @@ -60,6 +61,7 @@ module mem_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), + .floo_wide_out_t (floo_wide_double_t), .EnDecoupledRW (1'b1) ) i_router ( .clk_i, @@ -97,7 +99,12 @@ module mem_tile assign router_floo_req_in[West:North] = floo_req_i; assign floo_rsp_o = router_floo_rsp_out[West:North]; assign router_floo_rsp_in[West:North] = floo_rsp_i; - assign floo_wide_o = router_floo_wide_out[West:North]; + // Only the local port uses both physical channels. Other outputs use only the lower. 
+ for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o + assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; + assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; + assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; + end assign router_floo_wide_in[West:North] = floo_wide_i; ///////////// @@ -134,6 +141,7 @@ module mem_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), + .floo_wide_in_t (floo_wide_double_t), .user_narrow_struct_t (collective_narrow_user_t), .user_wide_struct_t (collective_wide_user_t) ) i_chimney ( diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index 3c1568e0..cbb208cc 100644 --- a/hw/picobello_pkg.sv +++ b/hw/picobello_pkg.sv @@ -409,6 +409,13 @@ package picobello_pkg; floo_pkg::collect_op_e collective_op; } collective_wide_user_t; + // TODO (colluca): Merge with floo_wide + typedef struct packed { + logic [1:0] valid; + logic [1:0] ready; + floo_wide_chan_t [1:0] wide; + } floo_wide_double_t; + // Configurations for the Reductions // Stupid asolution which allows me to overwrite the Reduction confiuration without endagering everything // ATTENTION: diff --git a/hw/spm_tile.sv b/hw/spm_tile.sv index 7aa85b05..99c9c8f5 100644 --- a/hw/spm_tile.sv +++ b/hw/spm_tile.sv @@ -81,7 +81,9 @@ module spm_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; - floo_wide_t [Eject:North] router_floo_wide_out, router_floo_wide_in; + floo_wide_t [Eject:North] router_floo_wide_in; + floo_wide_double_t [Eject:North] router_floo_wide_out; + floo_nw_router #( .AxiCfgN (AxiCfgN), @@ -95,6 +97,7 @@ module spm_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), + .floo_wide_out_t (floo_wide_double_t), .EnDecoupledRW (1'b1) ) i_router ( .clk_i, @@ -132,7 +135,12 @@ module spm_tile assign router_floo_req_in[West:North] = floo_req_i; assign floo_rsp_o = 
router_floo_rsp_out[West:North]; assign router_floo_rsp_in[West:North] = floo_rsp_i; - assign floo_wide_o = router_floo_wide_out[West:North]; + // Only the local port uses both physical channels. Other outputs use only the lower. + for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o + assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; + assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; + assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; + end assign router_floo_wide_in[West:North] = floo_wide_i; ///////////// @@ -172,6 +180,7 @@ module spm_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), + .floo_wide_in_t (floo_wide_double_t), .user_narrow_struct_t (collective_narrow_user_t), .user_wide_struct_t (collective_wide_user_t) ) i_chimney ( From 20db417717983d5cf41fde84f0248413eea7088b Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Tue, 21 Oct 2025 11:52:13 +0200 Subject: [PATCH 38/77] sw: Add Snitch BLAS headers to tests' include directory --- sw/sw.mk | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sw/sw.mk b/sw/sw.mk index 376db883..a743aebe 100644 --- a/sw/sw.mk +++ b/sw/sw.mk @@ -24,14 +24,18 @@ PB_GEN_DIR = $(PB_ROOT)/.generated SN_RUNTIME_SRCDIR = $(PB_SNITCH_SW_DIR)/runtime/impl SN_RUNTIME_BUILDDIR = $(PB_SNITCH_SW_DIR)/runtime/build -SN_TESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/tests/build -SN_RVTESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/riscv-tests/build SN_RUNTIME_INCDIRS = $(PB_INCDIR) SN_RUNTIME_INCDIRS += $(PB_GEN_DIR) SN_RUNTIME_INCDIRS += $(PB_SNITCH_SW_DIR)/runtime/src SN_RUNTIME_HAL_HDRS = $(PB_GEN_DIR)/pb_addrmap.h SN_RUNTIME_HAL_HDRS += $(PB_GEN_DIR)/pb_raw_addrmap.h -SN_BUILD_APPS = OFF + +SN_RVTESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/riscv-tests/build + +SN_TESTS_BUILDDIR = $(PB_SNITCH_SW_DIR)/tests/build +SN_TESTS_INCDIRS = $(SN_ROOT)/sw/kernels/blas + +SN_BUILD_APPS = OFF SN_APPS = $(PB_SNITCH_SW_DIR)/apps/gemm_2d SN_APPS += 
$(PB_SNITCH_SW_DIR)/apps/gemm From 5eb6959c54e239bdb0f6c80567a18688ce3bdc75 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 22 Oct 2025 14:10:33 +0200 Subject: [PATCH 39/77] experiments: Add callback for hardware build --- experiments/picobello.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/experiments/picobello.py b/experiments/picobello.py index 31e06069..27d2f55e 100644 --- a/experiments/picobello.py +++ b/experiments/picobello.py @@ -5,25 +5,29 @@ # Luca Colagrande from pathlib import Path +import shutil import snitch.util.experiments.experiment_utils as eu from snitch.util.experiments import common root = Path(__file__).resolve().parents[1] -sim_cmd = [ - 'bash', '-c', - f'make vsim-run-batch -C {str(root)} SIM_DIR=${{run_dir}} ' - f'CHS_BINARY={str(root)}/sw/cheshire/tests/simple_offload.spm.elf ' - f'SN_BINARY=${{elf}} PRELMODE=3; ' - f'grep -q "] SUCCESS" ${{run_dir}}/transcript;' -] +default_work_dir = root / 'target/sim/vsim' + +def sim_cmd(work_dir=default_work_dir): + return [ + 'bash', '-c', + f'make vsim-run-batch -C {str(root)} SIM_DIR=${{run_dir}} ' + f'CHS_BINARY={str(root)}/sw/cheshire/tests/simple_offload.spm.elf ' + f'SN_BINARY=${{elf}} PRELMODE=3 VSIM_DIR={str(work_dir)}; ' + f'grep -q "] SUCCESS" ${{run_dir}}/transcript;' + ] -def sim_and_verify_cmd(verify_script): +def sim_and_verify_cmd(verify_script, work_dir=default_work_dir): return [ 'bash', '-c', f'make vsim-run-batch-verify -C {str(root)} SIM_DIR=${{run_dir}} ' f'CHS_BINARY={str(root)}/sw/cheshire/tests/simple_offload.spm.elf ' - f'SN_BINARY=${{elf}} VERIFY_PY={verify_script} PRELMODE=3;' + f'SN_BINARY=${{elf}} VERIFY_PY={verify_script} PRELMODE=3 VSIM_DIR={str(work_dir)};' ] @@ -47,8 +51,19 @@ def sw_callback(target=None, build_dir=None, defines=None, data_cfg=None, dry_ru ) +def hw_callback(work_dir=None, hw_cfg=None, dry_run=False, + sync=False, **kwargs): + shutil.copy2(hw_cfg, root / 
'.generated/floo_picobello_noc_pkg.sv') + work_dir.mkdir(exist_ok=True, parents=True) + vars = { + 'VSIM_DIR': work_dir, + } + return common.make('vsim-compile', dir=root, vars=vars, flags=['-j'], dry_run=dry_run, sync=sync) + + callbacks = { - 'sw': sw_callback + 'sw': sw_callback, + 'hw': hw_callback, } From 3d04ce93c310a693c4e8e40a0f8d2ae79656cdde Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 22 Oct 2025 14:12:23 +0200 Subject: [PATCH 40/77] sw: Enable communicator creation from arbitrary communicators --- sw/snitch/runtime/src/pb_sync.c | 2 +- sw/snitch/runtime/src/pb_sync.h | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/sw/snitch/runtime/src/pb_sync.c b/sw/snitch/runtime/src/pb_sync.c index 46a5cb0b..bba8376d 100644 --- a/sw/snitch/runtime/src/pb_sync.c +++ b/sw/snitch/runtime/src/pb_sync.c @@ -3,4 +3,4 @@ // SPDX-License-Identifier: Apache-2.0 extern inline void pb_create_mesh_comm(snrt_comm_t *comm, uint32_t n_rows, - uint32_t n_cols); + uint32_t n_cols, snrt_comm_t parent_comm); diff --git a/sw/snitch/runtime/src/pb_sync.h b/sw/snitch/runtime/src/pb_sync.h index 9840b4af..782a9271 100644 --- a/sw/snitch/runtime/src/pb_sync.h +++ b/sw/snitch/runtime/src/pb_sync.h @@ -5,7 +5,12 @@ // Luca Colagrande inline void pb_create_mesh_comm(snrt_comm_t *comm, uint32_t n_rows, - uint32_t n_cols, uint32_t start_row = 0, uint32_t start_col = 0) { + uint32_t n_cols, uint32_t start_row = 0, uint32_t start_col = 0, + snrt_comm_t parent_comm = NULL) { + + // If no communicator is given, world communicator is used as default. + if (parent_comm == NULL) parent_comm = snrt_comm_world; + // Allocate communicator struct in L1 and point to it. 
*comm = (snrt_comm_t)snrt_l1_alloc_cluster_local(sizeof(snrt_comm_info_t)); @@ -22,7 +27,7 @@ inline void pb_create_mesh_comm(snrt_comm_t *comm, uint32_t n_rows, *(uint32_t *)barrier_ptr = 0; snrt_fence(); } - snrt_global_barrier(); + snrt_global_barrier(parent_comm); // Initialize communicator, pointing to the newly-allocated barrier // counter in L3. From a4d446d5f8ccfb7af061a0bf731c68c70d9d262f Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 23 Oct 2025 13:55:58 +0200 Subject: [PATCH 41/77] experiments: Update barrier experiments --- experiments/barrier/experiments.py | 4 ++-- experiments/barrier/roi.json.tpl | 4 ++-- sw/snitch/tests/barrier_benchmark.c | 23 ++--------------------- 3 files changed, 6 insertions(+), 25 deletions(-) diff --git a/experiments/barrier/experiments.py b/experiments/barrier/experiments.py index 6eea589f..fb551175 100755 --- a/experiments/barrier/experiments.py +++ b/experiments/barrier/experiments.py @@ -31,13 +31,13 @@ def derive_cdefines(self, experiment): def gen_experiments(): experiments = [] - # for impl in ['sw']: + # for impl in ['hw']: # for n_clusters in [8]: for impl in ['sw', 'hw']: for n_clusters in [2, 4, 8, 16]: experiments.append({ 'app': 'barrier_benchmark', - 'cmd': pb.sim_cmd, + 'cmd': pb.sim_cmd(), 'impl': impl, 'n_clusters': n_clusters, 'n_rows': int(math.ceil(n_clusters / 4)), diff --git a/experiments/barrier/roi.json.tpl b/experiments/barrier/roi.json.tpl index 3b32be9a..ea0dfcf1 100644 --- a/experiments/barrier/roi.json.tpl +++ b/experiments/barrier/roi.json.tpl @@ -14,10 +14,10 @@ elif experiment['n_clusters'] == 16: { "thread": "${f'hart_{cluster * 9 + 8 + 1}'}", "roi": [ - % if experiment['mode'] == 'sw': + % if experiment['impl'] == 'sw': {"idx": 1, "label": "barrier"}, {"idx": 3, "label": "barrier"}, - % elif experiment['mode'] == 'hw': + % elif experiment['impl'] == 'hw': {"idx": 3, "label": "barrier"}, {"idx": 5, "label": "barrier"}, % endif diff --git a/sw/snitch/tests/barrier_benchmark.c 
b/sw/snitch/tests/barrier_benchmark.c index f72f68a7..e5c6e579 100644 --- a/sw/snitch/tests/barrier_benchmark.c +++ b/sw/snitch/tests/barrier_benchmark.c @@ -103,29 +103,10 @@ int main (void) { } else { hw_barrier(comm); } - - // Wake up all cores put to sleep. - // Note: this is a workaround since multiple intersecting barriers are - // not supported yet. - if (snrt_cluster_idx() == 0) { - for (uint32_t i = 0; i < snrt_cluster_num(); i++) { - uint32_t is_participant = pb_cluster_col_idx(i) < N_COLS && - pb_cluster_row_idx(i) < N_ROWS; - if (!is_participant) snrt_int_cluster_set( - 1 << snrt_cluster_compute_core_num(), i); - } - } - } - // Other clusters go to sleep - else { - if (snrt_is_dm_core()) { - snrt_wfi(); - snrt_int_clr_mcip(); - } } - // Make compute cores wait on cluster hardware barrier - snrt_cluster_hw_barrier(); + // Non-participating clusters wait on a global barrier + snrt_global_barrier(); return 0; } From c8a21a4f350c802e8705db7eafeb5d52c4512926 Mon Sep 17 00:00:00 2001 From: Tim Fischer Date: Thu, 23 Oct 2025 16:43:46 +0200 Subject: [PATCH 42/77] hw(cluster): Increase atomic txns to 3 --- hw/cluster_tile.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 80c709bb..54cb9c0b 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -422,7 +422,7 @@ module cluster_tile .RouteCfg (floo_picobello_noc_pkg::RouteCfg), .AtopSupport (1'b1), .EnDecoupledRW (1'b1), - .MaxAtomicTxns (1), + .MaxAtomicTxns (3), .Sam (picobello_pkg::SamMcast), .id_t (floo_picobello_noc_pkg::id_t), .rob_idx_t (floo_picobello_noc_pkg::rob_idx_t), From 04d6566fc6ba65c6cd05adac0a1a28b0df164abe Mon Sep 17 00:00:00 2001 From: Tim Fischer Date: Thu, 23 Oct 2025 16:44:16 +0200 Subject: [PATCH 43/77] hw: Use manual floogen package --- Bender.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Bender.yml b/Bender.yml index 311dc9c8..44cc8583 100644 --- a/Bender.yml +++ b/Bender.yml @@ -36,7 +36,7 @@ 
sources: - target: pb_gen_rtl files: # Level 0.0 - - .generated/floo_picobello_noc_pkg.sv + - floo_picobello_noc_pkg_REDUCTION.sv - .generated/snitch_cluster_pkg.sv - .generated/pb_soc_regs_pkg.sv # Level 0.1 From adc19dd9a293d9b080e1addb883f25bf98e6a5b3 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Tue, 21 Oct 2025 11:57:01 +0200 Subject: [PATCH 44/77] experiments: Add wide reduction experiments --- experiments/.gitignore | 3 +- experiments/reduction/__init__.py | 9 + .../cfg/floo_picobello_noc_pkg_generic.sv | 343 ++++++++++++ .../cfg/floo_picobello_noc_pkg_simple.sv | 343 ++++++++++++ experiments/reduction/experiments.py | 179 ++++++ experiments/reduction/fit.py | 183 ++++++ experiments/reduction/model.py | 133 +++++ experiments/reduction/plot.py | 375 +++++++++++++ experiments/reduction/roi/hw_generic.json.tpl | 21 + experiments/reduction/roi/hw_simple.json.tpl | 23 + experiments/reduction/roi/seq.json.tpl | 87 +++ experiments/reduction/roi/tree.json.tpl | 123 ++++ experiments/reduction/verify.py | 34 ++ sw/snitch/tests/bench_hw_reduction_1stage.c | 174 ------ sw/snitch/tests/reduction_benchmark.c | 523 ++++++++++++++++++ target/sim/traces.mk | 6 + 16 files changed, 2384 insertions(+), 175 deletions(-) create mode 100644 experiments/reduction/__init__.py create mode 100644 experiments/reduction/cfg/floo_picobello_noc_pkg_generic.sv create mode 100644 experiments/reduction/cfg/floo_picobello_noc_pkg_simple.sv create mode 100755 experiments/reduction/experiments.py create mode 100755 experiments/reduction/fit.py create mode 100755 experiments/reduction/model.py create mode 100755 experiments/reduction/plot.py create mode 100644 experiments/reduction/roi/hw_generic.json.tpl create mode 100644 experiments/reduction/roi/hw_simple.json.tpl create mode 100644 experiments/reduction/roi/seq.json.tpl create mode 100644 experiments/reduction/roi/tree.json.tpl create mode 100755 experiments/reduction/verify.py delete mode 100644 
sw/snitch/tests/bench_hw_reduction_1stage.c create mode 100644 sw/snitch/tests/reduction_benchmark.c diff --git a/experiments/.gitignore b/experiments/.gitignore index 8037a6ca..15806eca 100644 --- a/experiments/.gitignore +++ b/experiments/.gitignore @@ -2,4 +2,5 @@ build/ runs/ plots/ results/ -data/ \ No newline at end of file +data/ +hw/ \ No newline at end of file diff --git a/experiments/reduction/__init__.py b/experiments/reduction/__init__.py new file mode 100644 index 00000000..73945a38 --- /dev/null +++ b/experiments/reduction/__init__.py @@ -0,0 +1,9 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from . import plot, experiments, fit, model + +__all__ = ["plot", "experiments", "fit", "model"] diff --git a/experiments/reduction/cfg/floo_picobello_noc_pkg_generic.sv b/experiments/reduction/cfg/floo_picobello_noc_pkg_generic.sv new file mode 100644 index 00000000..191d1f55 --- /dev/null +++ b/experiments/reduction/cfg/floo_picobello_noc_pkg_generic.sv @@ -0,0 +1,343 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// AUTOMATICALLY GENERATED! DO NOT EDIT! 
+ +`include "axi/typedef.svh" +`include "floo_noc/typedef.svh" + +package floo_picobello_noc_pkg; + + import floo_pkg::*; + + ///////////////////// + // Address Map // + ///////////////////// + + typedef enum logic [4:0] { + ClusterX0Y0 = 0, + ClusterX0Y1 = 1, + ClusterX0Y2 = 2, + ClusterX0Y3 = 3, + ClusterX1Y0 = 4, + ClusterX1Y1 = 5, + ClusterX1Y2 = 6, + ClusterX1Y3 = 7, + ClusterX2Y0 = 8, + ClusterX2Y1 = 9, + ClusterX2Y2 = 10, + ClusterX2Y3 = 11, + ClusterX3Y0 = 12, + ClusterX3Y1 = 13, + ClusterX3Y2 = 14, + ClusterX3Y3 = 15, + Cheshire = 16, + FhgSpu = 17, + TopSpmNarrow = 18, + TopSpmWide = 19, + L2Spm0 = 20, + L2Spm1 = 21, + L2Spm2 = 22, + L2Spm3 = 23, + L2Spm4 = 24, + L2Spm5 = 25, + L2Spm6 = 26, + L2Spm7 = 27, + NumEndpoints = 28 + } ep_id_e; + + + + typedef enum logic [4:0] { + ClusterX0Y0SamIdx = 0, + ClusterX0Y1SamIdx = 1, + ClusterX0Y2SamIdx = 2, + ClusterX0Y3SamIdx = 3, + ClusterX1Y0SamIdx = 4, + ClusterX1Y1SamIdx = 5, + ClusterX1Y2SamIdx = 6, + ClusterX1Y3SamIdx = 7, + ClusterX2Y0SamIdx = 8, + ClusterX2Y1SamIdx = 9, + ClusterX2Y2SamIdx = 10, + ClusterX2Y3SamIdx = 11, + ClusterX3Y0SamIdx = 12, + ClusterX3Y1SamIdx = 13, + ClusterX3Y2SamIdx = 14, + ClusterX3Y3SamIdx = 15, + CheshireExternalSamIdx = 16, + CheshireInternalSamIdx = 17, + FhgSpuSamIdx = 18, + TopSpmNarrowSamIdx = 19, + TopSpmWideSamIdx = 20, + L2Spm0SamIdx = 21, + L2Spm1SamIdx = 22, + L2Spm2SamIdx = 23, + L2Spm3SamIdx = 24, + L2Spm4SamIdx = 25, + L2Spm5SamIdx = 26, + L2Spm6SamIdx = 27, + L2Spm7SamIdx = 28 + } sam_idx_e; + + + + typedef logic [0:0] rob_idx_t; + typedef logic [0:0] port_id_t; + typedef logic [3:0] x_bits_t; + typedef logic [1:0] y_bits_t; + typedef struct packed { + x_bits_t x; + y_bits_t y; + port_id_t port_id; + } id_t; + + typedef logic route_t; + + + localparam int unsigned SamNumRules = 29; + + typedef struct packed { + id_t idx; + logic [47:0] start_addr; + logic [47:0] end_addr; + } sam_rule_t; + + localparam sam_rule_t [SamNumRules-1:0] Sam = '{ + '{ + idx: '{x: 8, y: 3, 
port_id: 0}, + start_addr: 48'h000070700000, + end_addr: 48'h000070800000 + }, // L2Spm7 + '{ + idx: '{x: 8, y: 2, port_id: 0}, + start_addr: 48'h000070600000, + end_addr: 48'h000070700000 + }, // L2Spm6 + '{ + idx: '{x: 8, y: 1, port_id: 0}, + start_addr: 48'h000070500000, + end_addr: 48'h000070600000 + }, // L2Spm5 + '{ + idx: '{x: 8, y: 0, port_id: 0}, + start_addr: 48'h000070400000, + end_addr: 48'h000070500000 + }, // L2Spm4 + '{ + idx: '{x: 0, y: 3, port_id: 0}, + start_addr: 48'h000070300000, + end_addr: 48'h000070400000 + }, // L2Spm3 + '{ + idx: '{x: 0, y: 2, port_id: 0}, + start_addr: 48'h000070200000, + end_addr: 48'h000070300000 + }, // L2Spm2 + '{ + idx: '{x: 0, y: 1, port_id: 0}, + start_addr: 48'h000070100000, + end_addr: 48'h000070200000 + }, // L2Spm1 + '{ + idx: '{x: 0, y: 0, port_id: 0}, + start_addr: 48'h000070000000, + end_addr: 48'h000070100000 + }, // L2Spm0 + '{ + idx: '{x: 9, y: 1, port_id: 0}, + start_addr: 48'h000060040000, + end_addr: 48'h000060080000 + }, // TopSpmWide + '{ + idx: '{x: 9, y: 2, port_id: 0}, + start_addr: 48'h000060000000, + end_addr: 48'h000060040000 + }, // TopSpmNarrow + '{ + idx: '{x: 9, y: 0, port_id: 0}, + start_addr: 48'h000040000000, + end_addr: 48'h000040040000 + }, // FhgSpu + '{ + idx: '{x: 9, y: 3, port_id: 0}, + start_addr: 48'h000000000000, + end_addr: 48'h000020000000 + }, // CheshireInternal + '{ + idx: '{x: 9, y: 3, port_id: 0}, + start_addr: 48'h000080000000, + end_addr: 48'h020000000000 + }, // CheshireExternal + '{ + idx: '{x: 7, y: 3, port_id: 0}, + start_addr: 48'h0000203c0000, + end_addr: 48'h000020400000 + }, // ClusterX3Y3 + '{ + idx: '{x: 7, y: 2, port_id: 0}, + start_addr: 48'h000020380000, + end_addr: 48'h0000203c0000 + }, // ClusterX3Y2 + '{ + idx: '{x: 7, y: 1, port_id: 0}, + start_addr: 48'h000020340000, + end_addr: 48'h000020380000 + }, // ClusterX3Y1 + '{ + idx: '{x: 7, y: 0, port_id: 0}, + start_addr: 48'h000020300000, + end_addr: 48'h000020340000 + }, // ClusterX3Y0 + '{ + idx: '{x: 6, 
y: 3, port_id: 0}, + start_addr: 48'h0000202c0000, + end_addr: 48'h000020300000 + }, // ClusterX2Y3 + '{ + idx: '{x: 6, y: 2, port_id: 0}, + start_addr: 48'h000020280000, + end_addr: 48'h0000202c0000 + }, // ClusterX2Y2 + '{ + idx: '{x: 6, y: 1, port_id: 0}, + start_addr: 48'h000020240000, + end_addr: 48'h000020280000 + }, // ClusterX2Y1 + '{ + idx: '{x: 6, y: 0, port_id: 0}, + start_addr: 48'h000020200000, + end_addr: 48'h000020240000 + }, // ClusterX2Y0 + '{ + idx: '{x: 5, y: 3, port_id: 0}, + start_addr: 48'h0000201c0000, + end_addr: 48'h000020200000 + }, // ClusterX1Y3 + '{ + idx: '{x: 5, y: 2, port_id: 0}, + start_addr: 48'h000020180000, + end_addr: 48'h0000201c0000 + }, // ClusterX1Y2 + '{ + idx: '{x: 5, y: 1, port_id: 0}, + start_addr: 48'h000020140000, + end_addr: 48'h000020180000 + }, // ClusterX1Y1 + '{ + idx: '{x: 5, y: 0, port_id: 0}, + start_addr: 48'h000020100000, + end_addr: 48'h000020140000 + }, // ClusterX1Y0 + '{ + idx: '{x: 4, y: 3, port_id: 0}, + start_addr: 48'h0000200c0000, + end_addr: 48'h000020100000 + }, // ClusterX0Y3 + '{ + idx: '{x: 4, y: 2, port_id: 0}, + start_addr: 48'h000020080000, + end_addr: 48'h0000200c0000 + }, // ClusterX0Y2 + '{ + idx: '{x: 4, y: 1, port_id: 0}, + start_addr: 48'h000020040000, + end_addr: 48'h000020080000 + }, // ClusterX0Y1 + '{ + idx: '{x: 4, y: 0, port_id: 0}, + start_addr: 48'h000020000000, + end_addr: 48'h000020040000 + } // ClusterX0Y0 + + }; + + + + localparam route_cfg_t RouteCfg = '{ + RouteAlgo: XYRouting, + UseIdTable: 1'b1, + XYAddrOffsetX: 41, + XYAddrOffsetY: 45, + IdAddrOffset: 0, + NumSamRules: 29, + NumRoutes: 0, + CollectiveCfg: '{ + OpCfg: '{ + EnNarrowMulticast: 1'b1, + EnWideMulticast: 1'b1, + EnLSBAnd: 1'b1, + EnF_Add: 1'b1, + EnF_Mul: 1'b1, + EnF_Min: 1'b1, + EnF_Max: 1'b1, + default: '0 + }, + RedCfg: ReductionDefaultCfg + } + }; + + + typedef logic [47:0] axi_narrow_in_addr_t; + typedef logic [63:0] axi_narrow_in_data_t; + typedef logic [7:0] axi_narrow_in_strb_t; + typedef logic [4:0] 
axi_narrow_in_id_t; + typedef logic [4:0] axi_narrow_in_user_t; + `AXI_TYPEDEF_ALL_CT(axi_narrow_in, axi_narrow_in_req_t, axi_narrow_in_rsp_t, axi_narrow_in_addr_t, + axi_narrow_in_id_t, axi_narrow_in_data_t, axi_narrow_in_strb_t, + axi_narrow_in_user_t) + + + typedef logic [47:0] axi_narrow_out_addr_t; + typedef logic [63:0] axi_narrow_out_data_t; + typedef logic [7:0] axi_narrow_out_strb_t; + typedef logic [1:0] axi_narrow_out_id_t; + typedef logic [4:0] axi_narrow_out_user_t; + `AXI_TYPEDEF_ALL_CT(axi_narrow_out, axi_narrow_out_req_t, axi_narrow_out_rsp_t, + axi_narrow_out_addr_t, axi_narrow_out_id_t, axi_narrow_out_data_t, + axi_narrow_out_strb_t, axi_narrow_out_user_t) + + + typedef logic [47:0] axi_wide_in_addr_t; + typedef logic [511:0] axi_wide_in_data_t; + typedef logic [63:0] axi_wide_in_strb_t; + typedef logic [2:0] axi_wide_in_id_t; + typedef logic [0:0] axi_wide_in_user_t; + `AXI_TYPEDEF_ALL_CT(axi_wide_in, axi_wide_in_req_t, axi_wide_in_rsp_t, axi_wide_in_addr_t, + axi_wide_in_id_t, axi_wide_in_data_t, axi_wide_in_strb_t, axi_wide_in_user_t) + + + typedef logic [47:0] axi_wide_out_addr_t; + typedef logic [511:0] axi_wide_out_data_t; + typedef logic [63:0] axi_wide_out_strb_t; + typedef logic [0:0] axi_wide_out_id_t; + typedef logic [0:0] axi_wide_out_user_t; + `AXI_TYPEDEF_ALL_CT(axi_wide_out, axi_wide_out_req_t, axi_wide_out_rsp_t, axi_wide_out_addr_t, + axi_wide_out_id_t, axi_wide_out_data_t, axi_wide_out_strb_t, + axi_wide_out_user_t) + + + + `FLOO_TYPEDEF_HDR_T(hdr_t, id_t, id_t, nw_ch_e, rob_idx_t, id_t, collect_op_e) + localparam axi_cfg_t AxiCfgN = '{ + AddrWidth: 48, + DataWidth: 64, + UserWidth: 5, + InIdWidth: 5, + OutIdWidth: 2 + }; + localparam axi_cfg_t AxiCfgW = '{ + AddrWidth: 48, + DataWidth: 512, + UserWidth: 1, + InIdWidth: 3, + OutIdWidth: 1 + }; + `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_in, axi_wide_in, AxiCfgN, AxiCfgW, + hdr_t) + + `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 1, 2) + + 
+endpackage diff --git a/experiments/reduction/cfg/floo_picobello_noc_pkg_simple.sv b/experiments/reduction/cfg/floo_picobello_noc_pkg_simple.sv new file mode 100644 index 00000000..191d1f55 --- /dev/null +++ b/experiments/reduction/cfg/floo_picobello_noc_pkg_simple.sv @@ -0,0 +1,343 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// AUTOMATICALLY GENERATED! DO NOT EDIT! + +`include "axi/typedef.svh" +`include "floo_noc/typedef.svh" + +package floo_picobello_noc_pkg; + + import floo_pkg::*; + + ///////////////////// + // Address Map // + ///////////////////// + + typedef enum logic [4:0] { + ClusterX0Y0 = 0, + ClusterX0Y1 = 1, + ClusterX0Y2 = 2, + ClusterX0Y3 = 3, + ClusterX1Y0 = 4, + ClusterX1Y1 = 5, + ClusterX1Y2 = 6, + ClusterX1Y3 = 7, + ClusterX2Y0 = 8, + ClusterX2Y1 = 9, + ClusterX2Y2 = 10, + ClusterX2Y3 = 11, + ClusterX3Y0 = 12, + ClusterX3Y1 = 13, + ClusterX3Y2 = 14, + ClusterX3Y3 = 15, + Cheshire = 16, + FhgSpu = 17, + TopSpmNarrow = 18, + TopSpmWide = 19, + L2Spm0 = 20, + L2Spm1 = 21, + L2Spm2 = 22, + L2Spm3 = 23, + L2Spm4 = 24, + L2Spm5 = 25, + L2Spm6 = 26, + L2Spm7 = 27, + NumEndpoints = 28 + } ep_id_e; + + + + typedef enum logic [4:0] { + ClusterX0Y0SamIdx = 0, + ClusterX0Y1SamIdx = 1, + ClusterX0Y2SamIdx = 2, + ClusterX0Y3SamIdx = 3, + ClusterX1Y0SamIdx = 4, + ClusterX1Y1SamIdx = 5, + ClusterX1Y2SamIdx = 6, + ClusterX1Y3SamIdx = 7, + ClusterX2Y0SamIdx = 8, + ClusterX2Y1SamIdx = 9, + ClusterX2Y2SamIdx = 10, + ClusterX2Y3SamIdx = 11, + ClusterX3Y0SamIdx = 12, + ClusterX3Y1SamIdx = 13, + ClusterX3Y2SamIdx = 14, + ClusterX3Y3SamIdx = 15, + CheshireExternalSamIdx = 16, + CheshireInternalSamIdx = 17, + FhgSpuSamIdx = 18, + TopSpmNarrowSamIdx = 19, + TopSpmWideSamIdx = 20, + L2Spm0SamIdx = 21, + L2Spm1SamIdx = 22, + L2Spm2SamIdx = 23, + L2Spm3SamIdx = 24, + L2Spm4SamIdx = 25, + L2Spm5SamIdx = 26, + L2Spm6SamIdx = 27, + L2Spm7SamIdx = 28 
+ } sam_idx_e; + + + + typedef logic [0:0] rob_idx_t; + typedef logic [0:0] port_id_t; + typedef logic [3:0] x_bits_t; + typedef logic [1:0] y_bits_t; + typedef struct packed { + x_bits_t x; + y_bits_t y; + port_id_t port_id; + } id_t; + + typedef logic route_t; + + + localparam int unsigned SamNumRules = 29; + + typedef struct packed { + id_t idx; + logic [47:0] start_addr; + logic [47:0] end_addr; + } sam_rule_t; + + localparam sam_rule_t [SamNumRules-1:0] Sam = '{ + '{ + idx: '{x: 8, y: 3, port_id: 0}, + start_addr: 48'h000070700000, + end_addr: 48'h000070800000 + }, // L2Spm7 + '{ + idx: '{x: 8, y: 2, port_id: 0}, + start_addr: 48'h000070600000, + end_addr: 48'h000070700000 + }, // L2Spm6 + '{ + idx: '{x: 8, y: 1, port_id: 0}, + start_addr: 48'h000070500000, + end_addr: 48'h000070600000 + }, // L2Spm5 + '{ + idx: '{x: 8, y: 0, port_id: 0}, + start_addr: 48'h000070400000, + end_addr: 48'h000070500000 + }, // L2Spm4 + '{ + idx: '{x: 0, y: 3, port_id: 0}, + start_addr: 48'h000070300000, + end_addr: 48'h000070400000 + }, // L2Spm3 + '{ + idx: '{x: 0, y: 2, port_id: 0}, + start_addr: 48'h000070200000, + end_addr: 48'h000070300000 + }, // L2Spm2 + '{ + idx: '{x: 0, y: 1, port_id: 0}, + start_addr: 48'h000070100000, + end_addr: 48'h000070200000 + }, // L2Spm1 + '{ + idx: '{x: 0, y: 0, port_id: 0}, + start_addr: 48'h000070000000, + end_addr: 48'h000070100000 + }, // L2Spm0 + '{ + idx: '{x: 9, y: 1, port_id: 0}, + start_addr: 48'h000060040000, + end_addr: 48'h000060080000 + }, // TopSpmWide + '{ + idx: '{x: 9, y: 2, port_id: 0}, + start_addr: 48'h000060000000, + end_addr: 48'h000060040000 + }, // TopSpmNarrow + '{ + idx: '{x: 9, y: 0, port_id: 0}, + start_addr: 48'h000040000000, + end_addr: 48'h000040040000 + }, // FhgSpu + '{ + idx: '{x: 9, y: 3, port_id: 0}, + start_addr: 48'h000000000000, + end_addr: 48'h000020000000 + }, // CheshireInternal + '{ + idx: '{x: 9, y: 3, port_id: 0}, + start_addr: 48'h000080000000, + end_addr: 48'h020000000000 + }, // CheshireExternal + 
'{ + idx: '{x: 7, y: 3, port_id: 0}, + start_addr: 48'h0000203c0000, + end_addr: 48'h000020400000 + }, // ClusterX3Y3 + '{ + idx: '{x: 7, y: 2, port_id: 0}, + start_addr: 48'h000020380000, + end_addr: 48'h0000203c0000 + }, // ClusterX3Y2 + '{ + idx: '{x: 7, y: 1, port_id: 0}, + start_addr: 48'h000020340000, + end_addr: 48'h000020380000 + }, // ClusterX3Y1 + '{ + idx: '{x: 7, y: 0, port_id: 0}, + start_addr: 48'h000020300000, + end_addr: 48'h000020340000 + }, // ClusterX3Y0 + '{ + idx: '{x: 6, y: 3, port_id: 0}, + start_addr: 48'h0000202c0000, + end_addr: 48'h000020300000 + }, // ClusterX2Y3 + '{ + idx: '{x: 6, y: 2, port_id: 0}, + start_addr: 48'h000020280000, + end_addr: 48'h0000202c0000 + }, // ClusterX2Y2 + '{ + idx: '{x: 6, y: 1, port_id: 0}, + start_addr: 48'h000020240000, + end_addr: 48'h000020280000 + }, // ClusterX2Y1 + '{ + idx: '{x: 6, y: 0, port_id: 0}, + start_addr: 48'h000020200000, + end_addr: 48'h000020240000 + }, // ClusterX2Y0 + '{ + idx: '{x: 5, y: 3, port_id: 0}, + start_addr: 48'h0000201c0000, + end_addr: 48'h000020200000 + }, // ClusterX1Y3 + '{ + idx: '{x: 5, y: 2, port_id: 0}, + start_addr: 48'h000020180000, + end_addr: 48'h0000201c0000 + }, // ClusterX1Y2 + '{ + idx: '{x: 5, y: 1, port_id: 0}, + start_addr: 48'h000020140000, + end_addr: 48'h000020180000 + }, // ClusterX1Y1 + '{ + idx: '{x: 5, y: 0, port_id: 0}, + start_addr: 48'h000020100000, + end_addr: 48'h000020140000 + }, // ClusterX1Y0 + '{ + idx: '{x: 4, y: 3, port_id: 0}, + start_addr: 48'h0000200c0000, + end_addr: 48'h000020100000 + }, // ClusterX0Y3 + '{ + idx: '{x: 4, y: 2, port_id: 0}, + start_addr: 48'h000020080000, + end_addr: 48'h0000200c0000 + }, // ClusterX0Y2 + '{ + idx: '{x: 4, y: 1, port_id: 0}, + start_addr: 48'h000020040000, + end_addr: 48'h000020080000 + }, // ClusterX0Y1 + '{ + idx: '{x: 4, y: 0, port_id: 0}, + start_addr: 48'h000020000000, + end_addr: 48'h000020040000 + } // ClusterX0Y0 + + }; + + + + localparam route_cfg_t RouteCfg = '{ + RouteAlgo: XYRouting, + 
UseIdTable: 1'b1, + XYAddrOffsetX: 41, + XYAddrOffsetY: 45, + IdAddrOffset: 0, + NumSamRules: 29, + NumRoutes: 0, + CollectiveCfg: '{ + OpCfg: '{ + EnNarrowMulticast: 1'b1, + EnWideMulticast: 1'b1, + EnLSBAnd: 1'b1, + EnF_Add: 1'b1, + EnF_Mul: 1'b1, + EnF_Min: 1'b1, + EnF_Max: 1'b1, + default: '0 + }, + RedCfg: ReductionDefaultCfg + } + }; + + + typedef logic [47:0] axi_narrow_in_addr_t; + typedef logic [63:0] axi_narrow_in_data_t; + typedef logic [7:0] axi_narrow_in_strb_t; + typedef logic [4:0] axi_narrow_in_id_t; + typedef logic [4:0] axi_narrow_in_user_t; + `AXI_TYPEDEF_ALL_CT(axi_narrow_in, axi_narrow_in_req_t, axi_narrow_in_rsp_t, axi_narrow_in_addr_t, + axi_narrow_in_id_t, axi_narrow_in_data_t, axi_narrow_in_strb_t, + axi_narrow_in_user_t) + + + typedef logic [47:0] axi_narrow_out_addr_t; + typedef logic [63:0] axi_narrow_out_data_t; + typedef logic [7:0] axi_narrow_out_strb_t; + typedef logic [1:0] axi_narrow_out_id_t; + typedef logic [4:0] axi_narrow_out_user_t; + `AXI_TYPEDEF_ALL_CT(axi_narrow_out, axi_narrow_out_req_t, axi_narrow_out_rsp_t, + axi_narrow_out_addr_t, axi_narrow_out_id_t, axi_narrow_out_data_t, + axi_narrow_out_strb_t, axi_narrow_out_user_t) + + + typedef logic [47:0] axi_wide_in_addr_t; + typedef logic [511:0] axi_wide_in_data_t; + typedef logic [63:0] axi_wide_in_strb_t; + typedef logic [2:0] axi_wide_in_id_t; + typedef logic [0:0] axi_wide_in_user_t; + `AXI_TYPEDEF_ALL_CT(axi_wide_in, axi_wide_in_req_t, axi_wide_in_rsp_t, axi_wide_in_addr_t, + axi_wide_in_id_t, axi_wide_in_data_t, axi_wide_in_strb_t, axi_wide_in_user_t) + + + typedef logic [47:0] axi_wide_out_addr_t; + typedef logic [511:0] axi_wide_out_data_t; + typedef logic [63:0] axi_wide_out_strb_t; + typedef logic [0:0] axi_wide_out_id_t; + typedef logic [0:0] axi_wide_out_user_t; + `AXI_TYPEDEF_ALL_CT(axi_wide_out, axi_wide_out_req_t, axi_wide_out_rsp_t, axi_wide_out_addr_t, + axi_wide_out_id_t, axi_wide_out_data_t, axi_wide_out_strb_t, + axi_wide_out_user_t) + + + + 
`FLOO_TYPEDEF_HDR_T(hdr_t, id_t, id_t, nw_ch_e, rob_idx_t, id_t, collect_op_e) + localparam axi_cfg_t AxiCfgN = '{ + AddrWidth: 48, + DataWidth: 64, + UserWidth: 5, + InIdWidth: 5, + OutIdWidth: 2 + }; + localparam axi_cfg_t AxiCfgW = '{ + AddrWidth: 48, + DataWidth: 512, + UserWidth: 1, + InIdWidth: 3, + OutIdWidth: 1 + }; + `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_in, axi_wide_in, AxiCfgN, AxiCfgW, + hdr_t) + + `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 1, 2) + + +endpackage diff --git a/experiments/reduction/experiments.py b/experiments/reduction/experiments.py new file mode 100755 index 00000000..48bb028a --- /dev/null +++ b/experiments/reduction/experiments.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from functools import cache +import math +from pathlib import Path +import picobello as pb +import snitch.util.experiments.experiment_utils as eu +from snitch.util.experiments.SimResults import SimRegion + +BEAT_BYTES = 64 +TCK = 5 +DIR = Path(__file__).parent + + +############### +# Experiments # +############### + + +class ExperimentManager(pb.ExperimentManager): + + def derive_axes(self, experiment): + keys = ['impl', 'n_rows', 'size'] + if experiment['impl'] in ['seq', 'tree']: + keys.append('batch') + return eu.derive_axes_from_keys(experiment, keys) + + def derive_cdefines(self, experiment): + cdefs = { + 'IMPL': experiment['impl'].upper(), + 'LOG2_N_ROWS': int(math.log2(experiment['n_rows'])), + 'SIZE': experiment['size'], + } + if experiment['impl'] in ['seq', 'tree']: + cdefs['BATCH'] = experiment['batch'] + return cdefs + + # def derive_hw_cfg(self, experiment): + # hw = experiment['hw'] + # return DIR / 'cfg' / f'floo_picobello_noc_pkg_{hw}.sv' + + def derive_vsim_builddir(self, experiment): + return self.dir / 'hw' / 
experiment['hw'] + + +def gen_experiments(): + experiments = [] + # for impl in ['seq']: + # for impl in ['tree', 'hw_simple', 'hw_generic']: + for impl in ['seq', 'tree', 'hw_simple', 'hw_generic']: + # for n_rows in [1]: + for n_rows in [1, 2, 4]: + # for size in [4096]: + for size in [1024, 2048, 4096, 8192, 16384, 32768]: + # for n_batches in [4]: + for n_batches in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]: + + # Only sequential and tree implementations supports batching, for all other + # implementations we only accept n_batches = 1 + batch_size = int(size // n_batches) + batch_beats = int(batch_size // BEAT_BYTES) + if impl in ['seq', 'tree']: + valid_n_batches = batch_beats > 8 and batch_beats < 256 + else: + valid_n_batches = n_batches == 1 + + # If the current setting for n_batches is valid, add the experiment + if valid_n_batches: + experiment = { + 'impl': impl, + 'n_rows': n_rows, + 'size': size, + 'batch': batch_size, + 'app': 'reduction_benchmark', + 'hw': 'simple' if impl == 'hw_simple' else 'generic', + 'roi': Path.cwd() / f'roi/{impl}.json.tpl', + } + work_dir = DIR / 'hw' / experiment['hw'] + experiment['cmd'] = pb.sim_and_verify_cmd(Path.cwd() / 'verify.py', + work_dir) + experiments.append(experiment) + return experiments + + +########### +# Results # +########### + +def dma_core(cluster_idx): + return f'hart_{1 + cluster_idx * 9 + 8}' + + +def compute_core(cluster_idx, core_idx): + return f'hart_{1 + cluster_idx * 9 + core_idx}' + + +# Assumes DMA transfers on all clusters, and for all batches to be equal. +# Therefore we only sample cluster 12's first transfer. +def seq_dma_cycles(sim_results): + roi = SimRegion(dma_core(12), 'd_0', 1) + return sim_results.get_timespan(roi) // TCK + + +# Assumes computation on all clusters, and for all batches to be equal. +# Therefore we only sample cluster 0's first computation. 
+def seq_compute_cycles(sim_results): + roi = SimRegion(compute_core(0, 0), 'c_0', 1) + return sim_results.get_timespan(roi) // TCK + + +def seq_cycles(sim_results, c, r, n_batches): + start = SimRegion(dma_core(12), 'd_0', 1) + end = SimRegion(compute_core(0, 0), f'c{"" if r == 1 else 2}_{n_batches-1}', 1) + return sim_results.get_timespan(start, end) // TCK + + +# Assumes DMA transfers on all clusters, and all levels of the tree to be equal. +# Therefore we only sample cluster 4's first transfer. +def tree_dma_cycles(sim_results): + roi = SimRegion(dma_core(4), '4 > 0', 0) + return sim_results.get_timespan(roi) // TCK + + +# Assumes computation on all clusters, and all levels of the tree to be equal. +# Therefore we only sample cluster 0's first computation. +def tree_compute_cycles(sim_results): + roi = SimRegion(compute_core(0, 0), 'comp l0', 0) + return sim_results.get_timespan(roi) // TCK + + +def tree_cycles(sim_results, c, r, n_batches): + n_levels = int(math.log2(c * r)) + start = SimRegion(dma_core(4), '4 > 0', 0) + end = SimRegion(compute_core(0, 0), f'comp l{n_levels-1}', n_batches - 1) + return sim_results.get_timespan(start, end) // TCK + + +def hw_generic_cycles(sim_results): + roi = SimRegion(dma_core(0), 'reduction', 1) + return sim_results.get_timespan(roi) // TCK + + +def hw_simple_dma_cycles(sim_results): + roi = SimRegion(dma_core(0), 'row reduction', 1) + return sim_results.get_timespan(roi) // TCK + + +def hw_simple_cycles(sim_results): + start = SimRegion(dma_core(0), 'row reduction', 1) + end = SimRegion(dma_core(0), 'column reduction', 1) + return sim_results.get_timespan(start, end) // TCK + + +@cache +def results(manager=None): + if manager is None: + manager = ExperimentManager(gen_experiments(), dir=DIR, parse_args=False) + return manager.get_results() + + +######## +# Main # +######## + +def main(): + + manager = ExperimentManager(gen_experiments(), dir=DIR) + manager.run() + df = results(manager) + print(df) + + +if __name__ == 
'__main__': + main() diff --git a/experiments/reduction/fit.py b/experiments/reduction/fit.py new file mode 100755 index 00000000..d35700c4 --- /dev/null +++ b/experiments/reduction/fit.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from functools import cache +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from scipy import stats +from reduction import experiments + +pd.options.mode.copy_on_write = True +BEAT_BYTES = 64 + + +def fit(x, y, quiet=True): + # Perform linear regression + res = stats.linregress(x, y) + alpha = res.intercept # α + beta = res.slope # β + r_value = res.rvalue + stderr_beta = res.stderr + stderr_alpha = res.intercept_stderr + + if not quiet: + # Print results + print(f"\talpha (intercept): {alpha:.6g}") + print(f"\tbeta (slope) : {beta:.6g}") + print(f"\tR^2 : {r_value**2:.6g}") + print(f"\tSE(beta) : {stderr_beta:.6g}") + print(f"\tSE(alpha) : {stderr_alpha:.6g}") + + # Plot results + plt.figure() + plt.scatter(x, y, s=12) + xline = np.linspace(x.min(), x.max(), 200) + yline = alpha + beta * xline + plt.plot(xline, yline, linewidth=2) + plt.xlabel('size [B]') + plt.ylabel('cycles') + plt.title("Linear fit: cycles = alpha + beta * X") + plt.show() + + # Return results + return alpha, beta + + +@cache +def fit_seq_dma(df=None, quiet=True): + # Get experiment data + if df is None: + df = experiments.results() + df = df[df['impl'] == 'seq'] + + # Retrieve only single-row and single-batch experiments (fitted model should generalize) + df = df[df['n_rows'] == 1] + df['n_batches'] = df['size'] // df['batch'] + df = df[df['n_batches'] == 1] + + # Fit data + df['dma_cycles'] = df['results'].apply(experiments.seq_dma_cycles) + x = df['size'].to_numpy() / BEAT_BYTES + y = df['dma_cycles'].to_numpy() + if not quiet: + print("Fit DMA 
transfer time for seq reduction:") + return fit(x, y, quiet=quiet) + + +@cache +def fit_seq_compute(df=None, quiet=True): + # Get experiment data + if df is None: + df = experiments.results() + df = df[df['impl'] == 'seq'] + + # Retrieve only single-row and single-batch experiments (fitted model should generalize) + df = df[df['n_rows'] == 1] + df['n_batches'] = df['size'] // df['batch'] + df = df[df['n_batches'] == 1] + + # Fit data + df['comp_cycles'] = df['results'].apply(experiments.seq_compute_cycles) + x = df['size'].to_numpy() / BEAT_BYTES + y = df['comp_cycles'].to_numpy() + if not quiet: + print("Fit compute time for seq reduction:") + return fit(x, y, quiet=quiet) + + +@cache +def fit_tree_dma(df=None, quiet=True): + # Get experiment data + if df is None: + df = experiments.results() + df = df[df['impl'] == 'tree'] + + # Retrieve only single-row and single-batch experiments (fitted model should generalize) + df = df[df['n_rows'] == 1] + df['n_batches'] = df['size'] // df['batch'] + df = df[df['n_batches'] == 1] + + # Fit data + df['dma_cycles'] = df['results'].apply(experiments.tree_dma_cycles) + x = df['size'].to_numpy() / BEAT_BYTES + y = df['dma_cycles'].to_numpy() + if not quiet: + print("Fit DMA transfer time for tree reduction:") + return fit(x, y, quiet=quiet) + + +@cache +def fit_tree_compute(df=None, quiet=True): + # Get experiment data + if df is None: + df = experiments.results() + df = df[df['impl'] == 'tree'] + + # Retrieve only single-row and single-batch experiments (fitted model should generalize) + df = df[df['n_rows'] == 1] + df['n_batches'] = df['size'] // df['batch'] + df = df[df['n_batches'] == 1] + + # Fit data + df['comp_cycles'] = df['results'].apply(experiments.tree_compute_cycles) + x = df['size'].to_numpy() / BEAT_BYTES + y = df['comp_cycles'].to_numpy() + if not quiet: + print("Fit compute time for tree reduction:") + return fit(x, y, quiet=quiet) + + +@cache +def fit_hw_generic(df=None, quiet=True): + # Get experiment data + 
if df is None: + df = experiments.results() + df = df[df['impl'] == 'hw_generic'] + + # Retrieve only single-row experiments (fitted model should generalize to multi-row) + df = df[df['n_rows'] == 1] + + # Fit data + df['cycles'] = df['results'].apply(experiments.hw_generic_cycles) + x = df['size'].to_numpy() / BEAT_BYTES + y = df['cycles'].to_numpy() + if not quiet: + print("Fit runtime for hw generic reduction:") + return fit(x, y, quiet=quiet) + + +@cache +def fit_hw_simple(df=None, quiet=True): + # Get experiment data + if df is None: + df = experiments.results() + df = df[df['impl'] == 'hw_simple'] + + # Retrieve only single-row experiments (fitted model should generalize to multi-row) + df = df[df['n_rows'] == 1] + + # Fit data + df['cycles'] = df['results'].apply(experiments.hw_simple_dma_cycles) + x = df['size'].to_numpy() / BEAT_BYTES + y = df['cycles'].to_numpy() + if not quiet: + print("Fit runtime for hw simple reduction:") + return fit(x, y, quiet=quiet) + + +def main(): + + fit_tree_dma(quiet=False) + fit_tree_compute(quiet=False) + + fit_seq_dma(quiet=False) + fit_seq_compute(quiet=False) + + +if __name__ == '__main__': + main() diff --git a/experiments/reduction/model.py b/experiments/reduction/model.py new file mode 100755 index 00000000..d503d17f --- /dev/null +++ b/experiments/reduction/model.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from math import log2, sqrt, isqrt +from reduction import fit + +BEAT_BYTES = 64 +DELTA = 30 + + +def nearest_divisors(divisor, dividend): + from bisect import bisect_left + + # 1) Enumerate all positive divisors of dividend + divs = set() + r = int(isqrt(int(dividend))) + for d in range(1, r + 1): + if dividend % d == 0: # d divides dividend + divs.add(d) # add the small factor + divs.add(dividend // d) # add the paired large factor + divs = sorted(divs) # sort them ascending + + # 2) Binary search to locate where x would be inserted + i = bisect_left(divs, divisor) # first index with divs[i] >= divisor + + # 3) Pick neighbors around divisor + lower = divs[i - 1] if i > 0 and divs[i - 1] <= divisor else None + upper = divs[i] if i < len(divs) else None + return lower, upper + + +def hw_generic_runtime(c, r, n): + hw_alpha, _ = fit.fit_hw_generic() + beta = 1 + if r > 1: + beta = 2 + return hw_alpha + n * beta + + +def hw_simple_runtime(c, r, n): + hw_alpha, _ = fit.fit_hw_simple() + beta = 1 + if r > 1: + beta = 2 + return hw_alpha + n * beta + + +def seq_runtime(c, r, n, k, delta=DELTA): + batch = int(n // k) + dma_alpha, _ = fit.fit_seq_dma() + comp_alpha, _ = fit.fit_seq_compute() + t_comp = comp_alpha + batch * 1 + t_dma = dma_alpha + batch * 1 + t_max = max(t_comp, t_dma) + # print(t_dma * 5, t_comp * 5, t_max * 5) + n_iters = 1 + 2 * (c - 2) + k + # First approximation model assumes compute and dma take roughly the same time + return n_iters * (t_max + delta) - delta + + +def optimal_seq_k(c, r, n, delta=DELTA): + comp_alpha, _ = fit.fit_seq_compute() + real_k = sqrt(n / (delta + comp_alpha)) + lower_k, upper_k = nearest_divisors(real_k, n) + assert (lower_k is None) or (lower_k > 0) + assert (upper_k is None) or (upper_k <= n) + if lower_k is None: + return upper_k + elif upper_k is None: + return lower_k + else: + T_lower_k = seq_runtime(c, r, n, lower_k, delta) + T_upper_k = seq_runtime(c, r, 
n, upper_k, delta) + if T_lower_k < T_upper_k: + return lower_k + else: + return upper_k + + +def optimal_seq_runtime(c, r, n, delta=DELTA): + k_opt = optimal_seq_k(c, r, n, delta) + return seq_runtime(c, r, n, k_opt, delta) + + +def tree_runtime(c, r, n, k, delta=DELTA): + batch = int(n // k) + dma_alpha, _ = fit.fit_tree_dma() + comp_alpha, _ = fit.fit_tree_compute() + t_comp = comp_alpha + batch * 1 + t_dma = dma_alpha + batch * 1 + t_max = max(t_comp, t_dma) + # print(t_dma * 5, t_comp * 5, t_max * 5) + n_levels = log2(c * r) + return n_levels * (t_dma + (k - 1) * (delta + t_max) + delta + t_comp) + + +def optimal_tree_k(c, r, n, delta=DELTA): + dma_alpha, _ = fit.fit_tree_dma() + comp_alpha, _ = fit.fit_tree_compute() + real_k = sqrt(n / (delta + max(dma_alpha, comp_alpha))) + lower_k, upper_k = nearest_divisors(real_k, n) + assert (lower_k is None) or (lower_k > 0) + assert (upper_k is None) or (upper_k <= n) + if lower_k is None: + return upper_k + elif upper_k is None: + return lower_k + else: + T_lower_k = tree_runtime(c, r, n, lower_k, delta) + T_upper_k = tree_runtime(c, r, n, upper_k, delta) + if T_lower_k < T_upper_k: + return lower_k + else: + return upper_k + + +def optimal_tree_runtime(c, r, n, delta=DELTA): + k_opt = optimal_tree_k(c, r, n, delta) + return tree_runtime(c, r, n, k_opt, delta) + + +def optimal_sw_runtime(c, r, n, delta=DELTA): + T_seq = optimal_seq_runtime(c, r, n, delta) + T_tree = optimal_tree_runtime(c, r, n, delta) + return min(T_seq, T_tree) + + +# print(5 * tree_runtime(4, 1, 8192 // 64, 1)) +# print(5 * tree_runtime(4, 1, 32768 // 64, 4)) diff --git a/experiments/reduction/plot.py b/experiments/reduction/plot.py new file mode 100755 index 00000000..33f14b6d --- /dev/null +++ b/experiments/reduction/plot.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python3 +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import argparse +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from reduction import model, experiments + +pd.options.mode.copy_on_write = True +BEAT_BYTES = 64 + + +# Find the monotone non-decreasing lower fit of an oscillating curve +def find_monotone_lower_fit(x, y): + y_monotone = [] + x_monotone = [] + for i, e in enumerate(y): + accept = True + for j in range(i, len(y)): + if y[j] < e: + accept = False + break + if accept: + y_monotone.append(y[i]) + x_monotone.append(x[i]) + return x_monotone, y_monotone + + +def get_actual_cycles(row): + if row['impl'] == 'seq': + n_batches = int(row['size'] // row['batch']) + return experiments.seq_cycles(row['results'], 4, row['n_rows'], n_batches) + elif row['impl'] == 'tree': + n_batches = int(row['size'] // row['batch']) + return experiments.tree_cycles(row['results'], 4, row['n_rows'], n_batches) + elif row['impl'] == 'hw_simple': + return experiments.hw_simple_cycles(row['results']) + elif row['impl'] == 'hw_generic': + return experiments.hw_generic_cycles(row['results']) + + +def get_expected_cycles(row): + if row['impl'] == 'seq': + cycles = model.optimal_seq_runtime(4, row['n_rows'], row['size'] // BEAT_BYTES) + elif row['impl'] == 'tree': + cycles = model.optimal_tree_runtime(4, row['n_rows'], row['size'] // BEAT_BYTES) + elif row['impl'] == 'hw_generic': + cycles = model.hw_generic_runtime(4, row['n_rows'], row['size'] // BEAT_BYTES) + elif row['impl'] == 'hw_simple': + cycles = model.hw_simple_runtime(4, row['n_rows'], row['size'] // BEAT_BYTES) + return int(cycles) + + +def get_expected_batch_size(row): + if row['impl'] == 'tree': + optimal_k = model.optimal_tree_k(4, row['n_rows'], row['size'] // BEAT_BYTES) + return int(row['size'] // optimal_k) + elif row['impl'] == 'hw': + return None + + +# Select optimal configuration (optimal k) for seq and tree runs +def select_best_configs(df): + return 
df.loc[df.groupby(['impl', 'size', 'n_rows'])['cycles'].idxmin()] \ + .sort_values(['impl', 'n_rows', 'size']) \ + .reset_index(drop=True) + + +def seq_runtime_curve(xmin, xmax, c, r): + x = np.arange(xmin, xmax, 64) + y = [model.optimal_seq_runtime(c, r, e // BEAT_BYTES) for e in x] + return x, y + + +# Filter optimal sequential runtime, by finding the monotone non-decreasing lower fit. +def monotone_seq_runtime_curve(xmin, xmax, c, r): + x, y = seq_runtime_curve(xmin, xmax, c, r) + return find_monotone_lower_fit(x, y) + + +def tree_runtime_curve(xmin, xmax, c, r): + x = np.arange(xmin, xmax, 64) + y = [model.optimal_tree_runtime(c, r, e // BEAT_BYTES) for e in x] + return x, y + + +# Filter optimal tree runtime, by finding the monotone non-decreasing lower fit. +def monotone_tree_runtime_curve(xmin, xmax, c, r): + x, y = tree_runtime_curve(xmin, xmax, c, r) + return find_monotone_lower_fit(x, y) + + +def hw_generic_runtime_curve(xmin, xmax, c, r): + x = np.arange(xmin, xmax, 64) + y = [model.hw_generic_runtime(c, r, e // BEAT_BYTES) for e in x] + return x, y + + +def hw_simple_runtime_curve(xmin, xmax, c, r): + x = np.arange(xmin, xmax, 64) + y = [model.hw_simple_runtime(c, r, e // BEAT_BYTES) for e in x] + return x, y + + +# def sw_runtime_curve(xmin, xmax, n_rows): +# x = np.arange(xmin, xmax, 64) +# x, y_seq = seq_runtime_curve(xmin, xmax, n_rows) +# _, y_tree = tree_runtime_curve(xmin, xmax, n_rows) +# x, y_min = find_monotone_lower_fit(x, [min(a, b) for a, b in zip(y_seq, y_tree)]) +# return x, y_min + + +def plot1(show=True): + """Plot actual and expected runtimes.""" + + # Load single-row results + c = 4 + r = 1 + df = experiments.results() + df = df[df['n_rows'] == r] + + # Get expected and actual runtimes + df['cycles'] = df.apply(get_actual_cycles, axis=1) + df = select_best_configs(df) + df['exp_cycles'] = df.apply(get_expected_cycles, axis=1) + # df['exp_batch'] = df.apply(get_expected_batch_size, axis=1) + + # Drop results column for cleaner output + 
df = df.drop(columns=['results']) + + # Choose consistent colors and markers for the plots + colors = {"seq": "C0", "tree": "C1", "hw_simple": "C2", "hw_generic": "C3"} + markers = {"expected": "o", "actual": "x"} + + # Create plot + _, ax = plt.subplots() + + # Plot actual runtimes + sizes = df['size'].unique() + ax.scatter( + sizes, df[df['impl'] == 'seq']['cycles'], label='Actual (seq)', + marker=markers['actual'], color=colors['seq'] + ) + ax.scatter( + sizes, df[df['impl'] == 'tree']['cycles'], label='Actual (tree)', + marker=markers['actual'], color=colors['tree'] + ) + ax.scatter( + sizes, df[df['impl'] == 'hw_simple']['cycles'], label='Actual (hw_simple)', + marker=markers['actual'], color=colors['hw_simple'] + ) + # ax.scatter( + # sizes, df[df['impl'] == 'hw_generic']['cycles'], label='Actual (hw_generic)', + # marker=markers['actual'], color=colors['hw_generic'] + # ) + + # # Plot expected runtimes + # ax.scatter( + # sizes, df[df['impl'] == 'tree']['exp_cycles'], label='Expected (tree)', + # marker=markers['expected'], color=colors['tree'] + # ) + # ax.scatter( + # sizes, df[df['impl'] == 'hw_simple']['exp_cycles'], label='Expected (hw_simple)', + # marker=markers['expected'], color=colors['hw_simple'] + # ) + # ax.scatter( + # sizes, df[df['impl'] == 'hw_generic']['exp_cycles'], label='Expected (hw_generic)', + # marker=markers['expected'], color=colors['hw_generic'] + # ) + + # Plot model line for seq runtime + x, y = monotone_seq_runtime_curve(sizes.min(), sizes.max(), c, r) + ax.plot(x, y, label='Model (seq)', linestyle='--', color=colors['seq']) + + # Plot model line for tree runtime + x, y = monotone_tree_runtime_curve(sizes.min(), sizes.max(), c, r) + ax.plot(x, y, label='Model (tree)', linestyle='--', color=colors['tree']) + + # # Plot model line for hw generic runtime + # x, y = hw_generic_runtime_curve(sizes.min(), sizes.max(), c, r) + # ax.plot(x, y, label='Model (hw_generic)', linestyle='--', color=colors['hw_generic']) + + # Plot model line 
for hw simple runtime + x, y = hw_simple_runtime_curve(sizes.min(), sizes.max(), c, r) + ax.plot(x, y, label='Model (hw_simple)', linestyle='--', color=colors['hw_simple']) + + # Plot formatting + ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes]) + ax.set_xlabel('Size [B]') + ax.set_ylabel('Runtime [cycles]') + ax.grid(True, alpha=0.4) + ax.legend() + plt.tight_layout() + if show: + plt.show() + + return df + + +def plot2(show=True): + """Plot actual and expected runtimes, for multiple number of rows.""" + + # Load results + c = 4 + df = experiments.results() + + # Get expected and actual runtimes + df['cycles'] = df.apply(get_actual_cycles, axis=1) + df = select_best_configs(df) + df['exp_cycles'] = df.apply(get_expected_cycles, axis=1) + df['exp_batch'] = df.apply(get_expected_batch_size, axis=1) + + # Drop results column for cleaner output + df = df.drop(columns=['results']) + + # Choose consistent colors for the plots + colors = {"sw": "C0", "hw_generic": "C1", "hw_simple": "C2"} + + # Create plot + _, ax = plt.subplots() + sizes = df['size'].unique() + + # Create function to plot runtimes for a given number of rows + def plot_runtime_vs_speedup(ax, n_rows, show_label=False): + res = df[df['n_rows'] == n_rows] + best_sw_cycles = res[res['impl'].isin(['seq', 'tree'])].groupby('size')['cycles'].min() + ax.scatter( + sizes, best_sw_cycles, label='Actual (min(seq, tree))' if show_label else None, + marker='x', color=colors['sw'] + ) + ax.scatter( + sizes, res[res['impl'] == 'hw_simple']['cycles'], label='Actual (hw_simple)' if show_label else None, + marker='x', color=colors['hw_simple'] + ) + # ax.scatter( + # sizes, res[res['impl'] == 'hw_generic']['cycles'], label='Actual (hw_generic)' if show_label else None, + # marker='x', color=colors['hw_generic'] + # ) + + # Plot model line for best software implementation + # TODO(colluca): use sw_runtime_curve once sequential implementation is integrated + x, y = 
monotone_tree_runtime_curve(sizes.min(), sizes.max(), c, n_rows) + ax.plot( + x, y, label='Model (min(seq, tree))' if show_label else None, + linestyle='--', color=colors['sw']) + + # Annotate sw model lines with number of rows, in correspondence with actual runtime + ax.annotate( + f'r={n_rows}', + xy=(x[-1], best_sw_cycles[sizes.max()]), + xytext=(sizes.max() * 0.0001, 0), textcoords='offset points', # nudge right + ha='left', va='center', clip_on=False) + + # Plot model line for hardware simple runtime + x, y = hw_simple_runtime_curve(sizes.min(), sizes.max(), c, n_rows) + ax.plot( + x, y, label='Model (hw_simple)' if show_label else None, linestyle='--', color=colors['hw_simple']) + + # # Plot model line for hardware generic runtime + # x, y = hw_generic_runtime_curve(sizes.min(), sizes.max(), c, n_rows) + # ax.plot( + # x, y, label='Model (hw_generic)' if show_label else None, linestyle='--', color=colors['hw_generic']) + + for i, n_rows in enumerate(df['n_rows'].unique()): + plot_runtime_vs_speedup(ax, n_rows=n_rows, show_label=(i == 0)) + + ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes]) + ax.set_xlim(0, sizes.max() * 1.1) + ax.set_xlabel('Size [B]') + ax.set_ylabel('Runtime [cycles]') + ax.grid(True, alpha=0.4) + ax.legend() + + plt.tight_layout() + if show: + plt.show() + + return df + + +def plot3(show=True): + """Plot runtime dependency with delta.""" + + # Load results + df = experiments.results() + + # Get only hardware implementation results + df = df[df['impl'] == 'hw'] + + # Retrieve single-row experiments + n_rows = 1 + df = df[df['n_rows'] == n_rows] + + # Get actual runtimes + df['cycles'] = df['results'].apply(experiments.hw_cycles) + + # Choose consistent colors and markers for the plots + colors = {"seq": "C1", "hw": "C2"} + + # Plot measured runtimes + _, ax = plt.subplots() + + # Plot model lines for sequential runtime (varying delta). 
+ x = np.arange(df['size'].min(), df['size'].max(), 64) + for k, alpha_delta in enumerate([*range(model.DELTA + model.SEQ_ALPHA, -1, -12), 0]): + alpha = alpha_delta // 2 + delta = alpha_delta - alpha + x_ext = np.arange(df['size'].min(), df['size'].max() + 1024, 64) + y = [model.optimal_seq_runtime(4, n_rows, e // 64, delta, alpha, model.SEQ_ALPHA) + for e in x_ext] + y_monotone = [] + x_monotone = [] + for i, e in enumerate(y): + accept = True + for j in range(i, len(y)): + if y[j] < e: + accept = False + break + if accept and i < len(x): + y_monotone.append(y[i]) + x_monotone.append(x[i]) + ax.plot(x_monotone, y_monotone, label='seq$(\\alpha_i+\\delta)$' if k == 0 else None, + color=colors['seq']) + ax.annotate( + f'${alpha_delta}$', + xy=(x[-1], y_monotone[-1]), + xytext=(1, 0), textcoords='offset points', # nudge right + ha='left', va='center', clip_on=False) + + # Plot model line for hardware runtime + y = [model.hw_runtime(4, n_rows, e // 64) for e in x] + ax.plot(x, y, label='hw', linestyle='--', color=colors['hw']) + + ax.set_xticks(df['size'], [str(size) if size != 2048 else "" for size in df['size']]) + ax.set_xlabel('Size [B]') + ax.set_ylabel('Runtime [cycles]') + ax.grid(True, linestyle='-', alpha=0.4) + ax.legend() + + plt.tight_layout() + if show: + plt.show() + + +def main(): + # Parse arguments + functions = [plot1, plot2] + parser = argparse.ArgumentParser(description='Plot wide multicast results.') + parser.add_argument( + 'plots', + nargs='*', + default='all', + choices=[f.__name__ for f in functions] + ['all'], + help='Plots to generate.' 
+ ) + args = parser.parse_args() + + # Helper function to check if a plot was requested on the command line + def requested(plot): + return plot in args.plots or 'all' in args.plots + + # Generate plots + if requested('plot1'): + plot1() + if requested('plot2'): + plot2() + + +if __name__ == "__main__": + main() diff --git a/experiments/reduction/roi/hw_generic.json.tpl b/experiments/reduction/roi/hw_generic.json.tpl new file mode 100644 index 00000000..00e1c78a --- /dev/null +++ b/experiments/reduction/roi/hw_generic.json.tpl @@ -0,0 +1,21 @@ +<% + def pb_cluster_idx(c, r): + return c * 4 + r + + n_rows = experiment['n_rows'] +%> +[ +% for r in range(n_rows): + % for c in range(4): + { + "thread": "${f'hart_{1 + pb_cluster_idx(c, r) * 9 + 8}'}", + "roi": [ + // First iteration + {"idx": 1, "label": "reduction"}, + // Second iteration + {"idx": 3, "label": "reduction"}, + ] + }, + % endfor +% endfor +] diff --git a/experiments/reduction/roi/hw_simple.json.tpl b/experiments/reduction/roi/hw_simple.json.tpl new file mode 100644 index 00000000..c9301cc5 --- /dev/null +++ b/experiments/reduction/roi/hw_simple.json.tpl @@ -0,0 +1,23 @@ +<% + def pb_cluster_idx(c, r): + return c * 4 + r + + n_rows = experiment['n_rows'] +%> +[ +% for r in range(n_rows): + % for c in range(4): + { + "thread": "${f'hart_{1 + pb_cluster_idx(c, r) * 9 + 8}'}", + "roi": [ + // First iteration + {"idx": 2, "label": "row reduction"}, + {"idx": 4, "label": "column reduction"}, + // Second iteration + {"idx": 8, "label": "row reduction"}, + {"idx": 10, "label": "column reduction"}, + ] + }, + % endfor +% endfor +] diff --git a/experiments/reduction/roi/seq.json.tpl b/experiments/reduction/roi/seq.json.tpl new file mode 100644 index 00000000..f14ede55 --- /dev/null +++ b/experiments/reduction/roi/seq.json.tpl @@ -0,0 +1,87 @@ +<% +n_rows = experiment['n_rows'] +n_cols = 4 +n_batches = int(experiment['size'] // experiment['batch']) +n_repetitions = 2 + +def pb_cluster_idx(r, c): + return c * n_cols 
+ r +%> +[ + <% n_repetitions = 2 %> + + % for row in range(0, n_rows): + ## for easternmost column, only DMA core + { + "thread": "${f'hart_{1 + pb_cluster_idx(row, n_cols-1)*9 + 8}'}", + "roi": [ + % for iter in range(0, n_repetitions): + % for i in range(0, n_batches): + {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'd_{i}'}"}, + % endfor + % endfor + ] + }, + + ## for middle columns + % for col in range(n_cols-2, 0, -1): + ## Compute cores + % for core in range(0, 8): + { + "thread": "${f'hart_{1 + pb_cluster_idx(row, col)*9 + core}'}", + "roi": [ + % for iter in range(0, n_repetitions): + % for i in range(0, n_batches): + {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'c_{i}'}"}, + % endfor + % endfor + ] + }, + % endfor + ## DMA core + { + "thread": "${f'hart_{1 + pb_cluster_idx(row, col)*9 + 8}'}", + "roi": [ + % for iter in range(0, n_repetitions): + % for i in range(0, n_batches): + {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'd_{i}'}"}, + % endfor + % endfor + ] + }, + % endfor + + ## for westernmost column + % if row != 0 and n_rows > 1: + ## DMA core + { + "thread": "${f'hart_{1 + pb_cluster_idx(row, 0)*9 + 8}'}", + "roi": [ + % for iter in range(0, n_repetitions): + % for i in range(0, n_batches): + {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'd2_{i}'}"}, + % endfor + % endfor + ] + }, + % endif + ## Compute cores + % for core in range(0, 8): + { + "thread": "${f'hart_{1 + pb_cluster_idx(row, 0)*9 + core}'}", + "roi": [ + % for iter in range(0, n_repetitions): + % for i in range(0, n_batches): + % if row == (n_rows - 1): + {"idx": ${(2 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'c_{i}'}"}, + % else: + {"idx": ${(4 * n_batches + 2) * iter + 2 * i + 2}, "label": "${f'c_{i}'}"}, + {"idx": ${(4 * n_batches + 2) * iter + 2 * n_batches + 2 * i + 2}, "label": "${f'c2_{i}'}"}, + % endif + % endfor + % endfor + ] + }, + % endfor + % endfor +] \ No newline at end of file diff 
--git a/experiments/reduction/roi/tree.json.tpl b/experiments/reduction/roi/tree.json.tpl new file mode 100644 index 00000000..a6e3e541 --- /dev/null +++ b/experiments/reduction/roi/tree.json.tpl @@ -0,0 +1,123 @@ +<% +import math +n_rows = experiment['n_rows'] +n_batches = int(experiment['size'] // experiment['batch']) +n_levels_in_row = int(math.log2(4)) +def n_levels_in_col(row_idx): + if row_idx == 0: + return int(math.log2(n_rows)) + elif row_idx == 2: + return 1 + else: + return 0 + +def pb_cluster_idx(row, col): + return row + col * 4 +%> + +[ +% for r in range(n_rows): + { + "thread": "${f'hart_{1 + pb_cluster_idx(r, 0) * 9 + 0}'}", + "roi": [ + ## Reductions in row + % for level_in_row in range(n_levels_in_row): + % for batch in range(n_batches): + { + "idx": ${4 + n_batches * (n_levels_in_col(r) + n_levels_in_row) * 2 + (level_in_row * n_batches + batch) * 2}, + "label": "${f'comp l{level_in_row}'}" + }, + % endfor + % endfor + ## Reductions in column 0 + % for level_in_col in range(n_levels_in_col(r)): + % for batch in range(n_batches): + { + "idx": ${4 + n_batches * (n_levels_in_col(r) + n_levels_in_row) * 2 + ((n_levels_in_row + level_in_col) * n_batches + batch) * 2}, + "label": "${f'comp l{n_levels_in_row + level_in_col}'}" + }, + % endfor + % endfor + ] + }, + + % if r > 0: + { + "thread": "${f'hart_{1 + pb_cluster_idx(r, 0) * 9 + 8}'}", + "roi": [ + % for batch in range(n_batches): + ## Transfer from cluster 2 to cluster 0 + { + "idx": ${4 + n_batches * 2 + batch * 2}, + "label": "${f'{r} > {0 if r in [1, 2] else 2}'}" + }, + % endfor + ] + }, + %endif + + { + "thread": "${f'hart_{1 + pb_cluster_idx(r, 1) * 9 + 8}'}", + "roi": [ + % for batch in range(n_batches): + ## Transfer from cluster 4 to cluster 0 + { + "idx": ${4 + n_batches * 2 + batch * 2}, + "label": "4 > 0" + }, + % endfor + ] + }, + + { + "thread": "${f'hart_{1 + pb_cluster_idx(r, 1) * 9 + 8}'}", + "roi": [ + % for batch in range(n_batches): + ## Transfer from cluster 4 to cluster 0 + 
{ + "idx": ${4 + n_batches * 2 + batch * 2}, + "label": "4 > 0" + }, + % endfor + ] + }, + + { + "thread": "${f'hart_{1 + pb_cluster_idx(r, 2) * 9 + 0}'}", + "roi": [ + % for batch in range(n_batches): + ## Stage 0 reduction + { + "idx": ${4 + n_batches * 2 + batch * 2}, + "label": "comp l0" + }, + % endfor + ] + }, + { + "thread": "${f'hart_{1 + pb_cluster_idx(r, 2) * 9 + 8}'}", + "roi": [ + % for batch in range(n_batches): + ## Transfer from cluster 8 to cluster 0 + { + "idx": ${4 + n_batches * 2 + batch * 2}, + "label": "8 > 0" + }, + % endfor + ] + }, + + { + "thread": "${f'hart_{1 + pb_cluster_idx(r, 3) * 9 + 8}'}", + "roi": [ + % for batch in range(n_batches): + ## Transfer from cluster 12 to cluster 8 + { + "idx": ${4 + n_batches * 2 + batch * 2}, + "label": "12 > 8" + }, + % endfor + ] + }, +% endfor +] diff --git a/experiments/reduction/verify.py b/experiments/reduction/verify.py new file mode 100755 index 00000000..e6a17e78 --- /dev/null +++ b/experiments/reduction/verify.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande +# +# Verification script for `reduction_benchmark.c` + +import sys +import snitch.util.sim.verif_utils as vu + + +class Verifier(vu.Verifier): + + OUTPUT_UIDS = ['output'] + + def __init__(self): + super().__init__() + + def get_actual_results(self): + return self.get_output_from_symbol(self.OUTPUT_UIDS[0], 'double') + + def get_expected_results(self): + length = int(self.get_input_from_symbol('length', 'uint32_t')[0]) + n_clusters = int(self.get_input_from_symbol('n_clusters', 'uint32_t')[0]) + return [i * n_clusters + sum(range(n_clusters)) for i in range(length)] + + def check_results(self, *args): + return super().check_results(*args, atol=0) + + +if __name__ == "__main__": + sys.exit(Verifier().main()) diff --git a/sw/snitch/tests/bench_hw_reduction_1stage.c b/sw/snitch/tests/bench_hw_reduction_1stage.c deleted file mode 100644 index 1cba5ad9..00000000 --- a/sw/snitch/tests/bench_hw_reduction_1stage.c +++ /dev/null @@ -1,174 +0,0 @@ -// Copyright 2025 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -// -// Raphael Roth -// -// This code can be used to benchmark the hardware reduction feature of the picobello -// system. It can reduce double data directly in flight with the internal DMA. -// It is possible to set the target cluster freely. -// The reduction is done in one stage for all clusters. - -#include -#include "pb_addrmap.h" -#include "snrt.h" - -// use this define to enable "row" reduction instead of columne reduction -// has only a effect if Number of cluster is 8! 
-#ifndef REDUCE_IN_ROW -#define REDUCE_IN_ROW 0 -#endif - -// Benchmark Parameter: -#ifndef NUMBER_OF_CLUSTERS -#define NUMBER_OF_CLUSTERS 16 // Needs to be either 4 / 8 / 16 -#endif - -#ifndef TARGET_CLUSTER -#define TARGET_CLUSTER 0 -#endif - -#ifndef DATA_BYTE -#define DATA_BYTE 8192 -#endif - -// Translate from byte into doubles -#ifndef DATA_LENGTH -#define DATA_LENGTH (DATA_BYTE/8) -#endif - -#define DATA_EVAL_LENGTH (DATA_LENGTH) - -// Define the reduction mask depending on the number of involved clusters -// When we reduce in the row then the mask is hardcoded! -#if REDUCE_IN_ROW == 0 - #define REDUCTION_MASK ((NUMBER_OF_CLUSTERS - 1) << 18) -#else - #define GROUND_MASK (((NUMBER_OF_CLUSTERS == 4) * 12) + ((NUMBER_OF_CLUSTERS == 8) * 13) + ((NUMBER_OF_CLUSTERS == 16) * 15)) - #define REDUCTION_MASK (GROUND_MASK << 18) -#endif - -/** - * @brief Return if the cluster is involved in the reduction or not. - * @param cluster_nr cluster id - */ -static inline int cluster_participates_in_reduction(int cluster_nr) { -#if REDUCE_IN_ROW == 0 - return (cluster_nr < NUMBER_OF_CLUSTERS); -#else - if(NUMBER_OF_CLUSTERS == 4){ - return ((cluster_nr % 4) == 0); - } else if(NUMBER_OF_CLUSTERS == 8){ - return (((cluster_nr % 4) == 0) || ((cluster_nr % 4) == 1)); - } else { - return (cluster_nr < NUMBER_OF_CLUSTERS); - } -#endif -} - -/** - * @brief Verify if the data are properly copied. 
Please adapt the algorythm if you change the data generation - * @param cluster_nr cluster id - * @param ptrData pointer to fetch the data from - */ -static inline uint32_t cluster_verify_reduction(int cluster_nr, double * ptrData) { - // Evaluate the reduction result - if (snrt_is_dm_core() && (cluster_nr == TARGET_CLUSTER)) { - uint32_t n_errs = DATA_EVAL_LENGTH; -#if REDUCE_IN_ROW == 0 - double base_value = (NUMBER_OF_CLUSTERS * 15.0) + (double) (((NUMBER_OF_CLUSTERS-1) * ((NUMBER_OF_CLUSTERS-1) + 1)) >> 1); -#else - double base_value = 0.0; - if(NUMBER_OF_CLUSTERS == 4){ - base_value = (NUMBER_OF_CLUSTERS * 15.0) + 24.0; - } else if(NUMBER_OF_CLUSTERS == 8){ - base_value = (NUMBER_OF_CLUSTERS * 15.0) + 52.0; - } else { - base_value = (NUMBER_OF_CLUSTERS * 15.0) + 120.0; - } -#endif - for (uint32_t i = 0; i < DATA_EVAL_LENGTH; i++) { - if (*ptrData == base_value){ - n_errs--; - } - base_value = base_value + (double) NUMBER_OF_CLUSTERS; - ptrData = ptrData + 1; - } - return n_errs; - } else { - return 0; - } -} - -// Main code -int main() { - snrt_interrupt_enable(IRQ_M_CLUSTER); - - // Sanity check: - if(((NUMBER_OF_CLUSTERS % 4) != 0) && (NUMBER_OF_CLUSTERS != 12)){ - // Number of cluster should be a power of 2 (for mask generation) - return 1; - } - - // Cluster ID - uint32_t cluster_id = snrt_cluster_idx(); - - // Set the mask for the multicast - uint64_t mask = REDUCTION_MASK; - - // Generate unique data for the reduction - double fill_value = 42.0; - double init_data = 15.0 + (double) cluster_id; - - // Allocate destination buffer - double *buffer_dst = (double*) snrt_l1_next_v2(); - double *buffer_src = buffer_dst + DATA_LENGTH; - - // Determint the target address - double *buffer_target = (double*) snrt_remote_l1_ptr(buffer_dst, cluster_id, TARGET_CLUSTER); - volatile double *buffer_last_entry = (volatile double *) (buffer_dst + DATA_LENGTH - 1); - - // Fill the source buffer with the init data - if (snrt_is_dm_core()) { - for (uint32_t i = 0; i < 
DATA_LENGTH; i++) { - buffer_src[i] = init_data + (double) i; - buffer_dst[i] = fill_value; - } - } - - // Do the transmission 3 times to preheat the cache - for(volatile int i = 0; i < 3; i++){ - // Reset the last entry from prior loop (if @ end of loop > check at end fails) - if((cluster_id == TARGET_CLUSTER) && snrt_is_dm_core()){ - *buffer_last_entry = fill_value; - } - - // Wait until the cluster are finished - snrt_global_barrier(); - - // Perf. analysis - snrt_mcycle(); - - // Init the DMA multicast - if (snrt_is_dm_core() && cluster_participates_in_reduction(cluster_id)) { - snrt_dma_start_1d_reduction(buffer_target, buffer_src, DATA_LENGTH * sizeof(double), mask, SNRT_REDUCTION_FADD); - // Only waits until last w is transmitted but not until b response arrives! - snrt_dma_wait_all(); - } - - // Target cluster polls the last array entry to know when we wrote the data - if((cluster_id == TARGET_CLUSTER) && snrt_is_dm_core()){ - while(*buffer_last_entry == fill_value); - } - - // Perf. analysis - snrt_mcycle(); - - } - - // Sync all cores - snrt_global_barrier(); - - // Evaluate the reduction result - return cluster_verify_reduction(cluster_id, buffer_dst); -} diff --git a/sw/snitch/tests/reduction_benchmark.c b/sw/snitch/tests/reduction_benchmark.c new file mode 100644 index 00000000..164876d3 --- /dev/null +++ b/sw/snitch/tests/reduction_benchmark.c @@ -0,0 +1,523 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande +// Chen Wu + +#include "snrt.h" +#include "blas.h" + +// Transfer size in bytes +#ifndef SIZE +#define SIZE 1024 +#endif + +// Batch size in bytes +#ifndef BATCH +#define BATCH 64 +#endif + +typedef enum { + SEQ, + TREE, + HW_GENERIC, + HW_SIMPLE, +} impl_t; + +#ifndef IMPL +#define IMPL SEQ +#endif + +#define N_BATCHES (SIZE / BATCH) +#define N_ELEMS (SIZE / sizeof(double)) + +#ifndef LOG2_N_ROWS +#define LOG2_N_ROWS 0 +#endif + +#define N_ROWS (1 << LOG2_N_ROWS) + +// Replace `1` literals with this thread-local variable to prevent it from +// being allocated by the compiler in L2. +__thread double one = 1; + +static inline void global_hw_barrier(volatile uint32_t *barrier_ptr, + uint32_t user) { + + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. + if (snrt_is_dm_core()) { + snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } + snrt_cluster_hw_barrier(); +} + +static inline void swap_buffers(uintptr_t *a, uintptr_t *b) { + uintptr_t temp = *a; + *a = *b; + *b = temp; +} + +static inline void dma_reduction_hw_generic(uintptr_t src, uintptr_t dst, + snrt_comm_t comm) { + if (snrt_is_dm_core() && comm->is_participant) { + uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr( + (void *)dst, snrt_cluster_idx(), 0); + snrt_collective_opcode_t op = SNRT_REDUCTION_FADD; + snrt_dma_start_1d_reduction(remote_dst, src, SIZE, comm, op); + snrt_dma_wait_all(); + } +} + +static inline void dma_reduction_hw_simple(uintptr_t src, uintptr_t dst, + snrt_comm_t comm) { + + // Create a communicator per row + snrt_comm_t row_comm[N_ROWS]; + for (int i = 0; i < N_ROWS; i++) + pb_create_mesh_comm(&row_comm[i], 1, pb_cluster_num_in_row(), i, 0, + comm); + + // Create a communicator for the first column + snrt_comm_t col_comm; + pb_create_mesh_comm(&col_comm, N_ROWS, 1, 0, 0, comm); + + if 
(snrt_is_dm_core() && comm->is_participant) { + uint32_t remote_cluster; + uintptr_t remote_dst; + snrt_collective_opcode_t op = SNRT_REDUCTION_FADD; + + // Reduction across rows (destination: first cluster in row) + snrt_mcycle(); + remote_cluster = pb_calculate_cluster_idx(pb_cluster_row_idx(), 0); + remote_dst = (uintptr_t)snrt_remote_l1_ptr( + (void *)dst, snrt_cluster_idx(), remote_cluster); + snrt_dma_start_1d_reduction( + remote_dst, src, SIZE, row_comm[pb_cluster_row_idx()], op); + snrt_dma_wait_all(); + snrt_mcycle(); + + // Barrier to ensure there is only one outstanding reduction in every + // router + snrt_inter_cluster_barrier(comm); + + // Reduction across first column (destination: cluster 0) + snrt_mcycle(); + if (N_ROWS > 1 && col_comm->is_participant) { + // Switch source and destination buffer + remote_dst = (uintptr_t)snrt_remote_l1_ptr( + (void *)src, snrt_cluster_idx(), 0); + snrt_dma_start_1d_reduction(remote_dst, dst, SIZE, col_comm, op); + snrt_dma_wait_all(); + } + snrt_mcycle(); + } +} + +// We need four buffers. Upon function invocation, the data must be in a. +static inline void dma_reduction_seq(uintptr_t a, uintptr_t b, uintptr_t c, + uintptr_t d, snrt_comm_t comm) { + if (!comm->is_participant) return; + + uint32_t col_idx = pb_cluster_col_idx(); + uint32_t row_idx = pb_cluster_row_idx(); + uintptr_t is_northernmost = pb_cluster_in_row(N_ROWS - 1); + + // Group to simplify rotating buffers + uintptr_t dst[2] = {b, d}; + + if (snrt_cluster_idx() == 0 && snrt_cluster_core_idx() == 0) { + DUMP(a); + DUMP(b); + DUMP(c); + DUMP(d); + } + + // Compute addresses of source and destination data for DMA cores. + // All clusters in col > 0 participate in the row reduction. For this + // reduction the source is in `a` and the result in `c`. + // Only the easternmost clusters send their input data (in `a`) to their + // west neighbours, all other clusters send the data they reduced in the + // previous step (in `c`). 
+ // Clusters in col == 0 participate in the column reduction. For this + // reduction the source is in `c` and the result in `a`. + // Only the northernmost cluster sends the source data (in `c`) to its + // south neighbour, all other clusters send the data they reduced in the + // previous step (in `a`). + uintptr_t dma_src, dma_dst; + if (pb_cluster_is_easternmost() || (col_idx == 0 && !is_northernmost)) { + dma_src = a; + } else { + dma_src = c; + } + // Clusters in col > 0, participating in row reduction, send to west + // neighbours. Clusters in col == 0, participating in column reduction, + // send to south neighbours. + uint32_t dst_cluster = col_idx > 0 ? pb_cluster_west_neighbour() : + pb_cluster_south_neighbour(); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), dst_cluster); + + // Compute addresses of source and result data for compute cores + uintptr_t comp_src1, comp_src2, comp_result; + comp_src1 = a; + comp_src2 = dst[0]; + comp_result = c; + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. 
+ snrt_collective_op_t op; + op.f.collective_op = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Iterations to cover all reductions in a row + uint32_t n_iters = 1 + 2 * (pb_cluster_num_in_row() - 2) + N_BATCHES; + for (uint32_t i = 0; i < n_iters; i++) { + + uint32_t col_idx_inv, base_iter; + char dma_active, compute_active; + + // Determine which clusters need to send data at every iteration + col_idx_inv = pb_cluster_num_in_row() - col_idx - 1; + base_iter = 2 * col_idx_inv; + dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + dma_active &= !pb_cluster_in_col(0); + + // Determine which clusters need to reduce data at every iteration + base_iter = 1 + 2 * (col_idx_inv - 1); + compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + compute_active &= !pb_cluster_is_easternmost(); + + if (compute_active && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1, + (double *)comp_src2, (double *)comp_result); + comp_src1 += BATCH; + swap_buffers(&dst[0], &dst[1]); + comp_src2 = dst[0]; + comp_result += BATCH; + snrt_mcycle(); + } + + if (dma_active && snrt_is_dm_core()) { + + // Start DMA + snrt_mcycle(); + snrt_dma_start_1d(dma_dst, dma_src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + dma_src += BATCH; + swap_buffers(&dst[0], &dst[1]); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), pb_cluster_west_neighbour()); + asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + global_hw_barrier(barrier_ptr, user); + } + + // Final column reduction is performed only if there is more than one row + if (N_ROWS > 1) { + + dst[0] = b; + dst[1] = 
d; + comp_src1 = c; + comp_src2 = dst[0]; + comp_result = a; + + // Iterations to cover all reductions in the first column + n_iters = 1 + 2 * (N_ROWS - 2) + N_BATCHES; + for (uint32_t i = 0; i < n_iters; i++) { + + uint32_t row_idx_inv, base_iter; + char dma_active, compute_active; + + // Determine which clusters need to send data at every iteration + row_idx_inv = N_ROWS - row_idx - 1; + base_iter = 2 * row_idx_inv; + dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + dma_active &= pb_cluster_in_col(0); + dma_active &= snrt_cluster_idx() != 0; + + // Determine which clusters need to reduce data at every iteration + base_iter = 1 + 2 * (row_idx_inv - 1); + compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + compute_active &= pb_cluster_in_col(0); + compute_active &= !is_northernmost; + + if (compute_active && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1, + (double *)comp_src2, (double *)comp_result); + comp_src1 += BATCH; + swap_buffers(&dst[0], &dst[1]); + comp_src2 = dst[0]; + comp_result += BATCH; + snrt_mcycle(); + } + + if (dma_active && snrt_is_dm_core()) { + // Start DMA + snrt_mcycle(); + snrt_dma_start_1d(dma_dst, dma_src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + dma_src += BATCH; + swap_buffers(&dst[0], &dst[1]); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), pb_cluster_south_neighbour()); + asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + global_hw_barrier(barrier_ptr, user); + } + } +} + +static inline void dma_reduction_tree_transfer_left_phase(char is_sender, char dist, + uintptr_t src, uintptr_t dst) { + + // Every sending cluster sends the data to a cluster on the left + if (is_sender && snrt_is_dm_core()) { + + // Calculate destination + 
uintptr_t dst_cluster = pb_calculate_cluster_idx(
+            pb_cluster_row_idx(), pb_cluster_col_idx() - dist);
+        uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst,
+            snrt_cluster_idx(), dst_cluster);
+
+        snrt_mcycle();
+
+        // Start DMA
+        snrt_dma_start_1d(remote_dst, src, BATCH);
+
+        // Wait for DMA to complete
+        snrt_dma_wait_all();
+        snrt_mcycle();
+    }
+}
+
+static inline void dma_reduction_tree_transfer_south_phase(char is_sender, char dist,
+        uintptr_t src, uintptr_t dst) {
+
+    // Every sending cluster sends the data to a cluster to the south
+    if (is_sender && snrt_is_dm_core()) {
+
+        // Calculate destination
+        uintptr_t dst_cluster = pb_calculate_cluster_idx(
+            pb_cluster_row_idx() - dist, pb_cluster_col_idx());
+        uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst,
+            snrt_cluster_idx(), dst_cluster);
+
+        snrt_mcycle();
+
+        // Start DMA
+        snrt_dma_start_1d(remote_dst, src, BATCH);
+
+        // Wait for DMA to complete
+        snrt_dma_wait_all();
+        snrt_mcycle();
+    }
+}
+
+static inline void dma_reduction_tree_compute_phase(char is_receiver,
+        uintptr_t src, uintptr_t dst, uintptr_t result) {
+
+    // Every receiving cluster reduces the data from the sender
+    if (is_receiver && snrt_is_compute_core()) {
+        snrt_mcycle();
+        axpy_opt(N_ELEMS / N_BATCHES, one, (double *)src, (double *)dst,
+                 (double *)result);
+        snrt_mcycle();
+    }
+    snrt_cluster_hw_barrier();
+}
+
+// We need four buffers. Upon function invocation, the data must be in a.
+static inline void dma_reduction_tree(uintptr_t a, uintptr_t b, uintptr_t c,
+                                      uintptr_t d, snrt_comm_t comm) {
+
+    // Only clusters in the communicator continue from here
+    if (!comm->is_participant) return;
+
+    // Prepare for inter-cluster barrier in advance, preventing instruction
+    // reordering using the volatile block.
+ snrt_collective_op_t op; + op.f.collective_op = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Initially a is the source buffer and c is the result buffer + uintptr_t src = a; + uintptr_t dst[2] = {b, d}; + uintptr_t result = c; + + // Iterations to cover all reductions in a row + uint32_t n_levels_in_row = pb_log2_cluster_num_in_row(); + for (uint32_t i = 0; i < n_levels_in_row; i++) { + + // Determine which clusters are senders and receivers at every level + // of the tree + char dist = 1 << i; // Distance between clusters in a pair + char is_sender = (pb_cluster_col_idx() % (2*dist)) == dist; + char is_receiver = (pb_cluster_col_idx() % (2*dist)) == 0; + + // Transfer phase for first batch + dma_reduction_tree_transfer_left_phase(is_sender, dist, src, dst[0]); + global_hw_barrier(barrier_ptr, user); + + // If more than one batch is present, there will be N_BATCHES - 1 + // iterations in which we perform both transfer and compute phases + uint32_t j = 0; + for (; j < N_BATCHES - 1; j++) { + // Alternate destination buffers for every batch + uintptr_t dma_dst = dst[(j+1)%2]; + uintptr_t comp_dst = dst[j%2]; + + // Calculate pointers for current batch + uintptr_t batch_dma_src = src + (j+1) * BATCH; + uintptr_t batch_comp_src = src + j * BATCH; + uintptr_t batch_result = result + j * BATCH; + + dma_reduction_tree_transfer_left_phase(is_sender, dist, batch_dma_src, + dma_dst); + dma_reduction_tree_compute_phase(is_receiver, batch_comp_src, + comp_dst, batch_result); + global_hw_barrier(barrier_ptr, user); + } + + // Compute phase for last batch + dma_reduction_tree_compute_phase(is_receiver, src + j * BATCH, + dst[j%2], result + j * BATCH); + + // Swap source and result buffers for next iteration + swap_buffers(&src, &result); + } + + // Iterations to cover all reductions in the first column + uint32_t 
n_levels_in_col = LOG2_N_ROWS; + for (uint32_t i = 0; i < n_levels_in_col; i++) { + + // Determine which clusters are senders at every level of the tree + char dist = 1 << i; // Distance between clusters in a pair + char is_sender = (pb_cluster_row_idx() % (2*dist)) == dist && pb_cluster_in_col(0); + char is_receiver = (pb_cluster_row_idx() % (2*dist)) == 0 && pb_cluster_in_col(0); + + // Transfer phase for first batch + dma_reduction_tree_transfer_south_phase(is_sender, dist, src, dst[0]); + global_hw_barrier(barrier_ptr, user); + + // If more than one batch is present, there will be N_BATCHES - 1 + // iterations in which we perform both transfer and compute phases + uint32_t j = 0; + for (; j < N_BATCHES - 1; j++) { + // Alternate destination buffers for every batch + uintptr_t dma_dst = dst[(j+1)%2]; + uintptr_t comp_dst = dst[j%2]; + + // Calculate pointers for current batch + uintptr_t batch_dma_src = src + (j+1) * BATCH; + uintptr_t batch_comp_src = src + j * BATCH; + uintptr_t batch_result = result + j * BATCH; + + dma_reduction_tree_transfer_south_phase(is_sender, dist, batch_dma_src, + dma_dst); + dma_reduction_tree_compute_phase(is_receiver, batch_comp_src, + comp_dst, batch_result); + global_hw_barrier(barrier_ptr, user); + } + + // Compute phase for last batch + dma_reduction_tree_compute_phase(is_receiver, src + j * BATCH, + dst[j%2], result + j * BATCH); + + // Swap source and result buffers for next iteration + swap_buffers(&src, &result); + } +} + +// L1 buffer of every cluster invoking this function should be +// at the same offset in the TCDM +static inline void dma_reduction(uintptr_t a, uintptr_t b, uintptr_t c, + uintptr_t d, snrt_comm_t comm) { + if (IMPL == SEQ) + dma_reduction_seq(a, b, c, d, comm); + else if (IMPL == TREE) + dma_reduction_tree(a, b, c, d, comm); + else if (IMPL == HW_GENERIC) + dma_reduction_hw_generic(a, c, comm); + else if (IMPL == HW_SIMPLE) + dma_reduction_hw_simple(a, c, comm); +} + +// Global variables for 
verification script +double output[N_ELEMS]; +extern const uint32_t n_clusters = N_ROWS * pb_cluster_num_in_row(); +extern const uint32_t length = N_ELEMS; + +int main (void){ + + // Allocate 4KiB-aligned buffers in every cluster + uintptr_t a_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE, 4096); + uintptr_t c_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE, 4096); + uintptr_t b_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096); + uintptr_t d_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096); + + // Create communicator for first N rows (all other clusters are inactive) + snrt_comm_t comm; + pb_create_mesh_comm(&comm, N_ROWS, pb_cluster_num_in_row()); + + // Only clusters in the first N rows continue from here + if (!comm->is_participant) return 0; + + // Two iterations to to preheat the cache + for (volatile int i = 0; i < 2; i++){ + + // Initialize source buffer + if (snrt_is_dm_core()) { + for (uint32_t i = 0; i < N_ELEMS; i++) { + uint32_t row_major_cluster_idx = pb_cluster_col_idx() + + pb_cluster_row_idx() * pb_cluster_num_in_row(); + ((double *)a_buffer)[i] = row_major_cluster_idx + i; + } + } + snrt_global_barrier(comm); + + // Perform reduction + snrt_mcycle(); + dma_reduction(a_buffer, b_buffer, c_buffer, d_buffer, comm); + snrt_mcycle(); + snrt_global_barrier(comm); + } + + // Writeback to L3 + uintptr_t result_buffer = c_buffer; + uint32_t total_tree_levels = pb_log2_cluster_num_in_row() + LOG2_N_ROWS; + if ((IMPL == HW_SIMPLE && N_ROWS > 1) || + (IMPL == SEQ && N_ROWS > 1) || + (IMPL == TREE && (total_tree_levels % 2) == 0)) + result_buffer = a_buffer; + if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { + snrt_dma_start_1d((uintptr_t)output, result_buffer, SIZE); + snrt_dma_wait_all(); + } +} diff --git a/target/sim/traces.mk b/target/sim/traces.mk index f4773b33..4cc627c1 100644 --- a/target/sim/traces.mk +++ b/target/sim/traces.mk @@ -9,6 +9,10 @@ LOGS_DIR = $(SIM_DIR)/logs SN_SIM_DIR = $(SIM_DIR) include 
$(SN_ROOT)/make/traces.mk +ifdef ROI_SPEC + SN_ROI_SPEC = $(ROI_SPEC) +endif + CHS_ADDR2LINE ?= $(CHS_SW_GCC_BINROOT)/riscv64-unknown-elf-addr2line CHS_TXT_TRACE = $(LOGS_DIR)/trace_hart_00000.txt CHS_ANNOTATED_TRACE = $(LOGS_DIR)/trace_hart_00000.s @@ -24,6 +28,8 @@ traces: sn-traces chs-trace annotate: sn-annotate chs-annotate traces-clean: sn-clean-traces chs-trace-clean annotate-clean: sn-clean-annotate chs-annotate-clean +visual-trace: sn-visual-trace +clean-visual-trace: sn-clean-visual-trace chs-trace: $(CHS_TXT_TRACE) chs-annotate: $(CHS_ANNOTATED_TRACE) From e3a727966a02b2ecef383ecbcfca52cdfa0d0acf Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Tue, 28 Oct 2025 16:58:24 +0100 Subject: [PATCH 45/77] bender: Bump common_cells --- Bender.lock | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Bender.lock b/Bender.lock index 02700a76..bc81674e 100644 --- a/Bender.lock +++ b/Bender.lock @@ -146,7 +146,7 @@ packages: dependencies: - common_cells common_cells: - revision: e1c09c75775c5f03eb45906d5145dbd2f5bcfb95 + revision: bab4cea30f1cdfe303a4585db04c1a1f6882b718 version: null source: Git: https://github.com/pulp-platform/common_cells.git @@ -216,7 +216,15 @@ packages: - axi_riscv_atomics - common_cells - common_verification + - floo_noc_pd + - fpnew - idma + floo_noc_pd: + revision: null + version: null + source: + Path: working_dir/floo_noc/./pd + dependencies: [] fpnew: revision: a8e0cba6dd50f357ece73c2c955d96efc3c6c315 version: null @@ -267,7 +275,7 @@ packages: - tech_cells_generic idma: revision: 28a36e5e07705549e59fc33db96ab681bc1ca88e - version: null + version: 0.6.5 source: Git: https://github.com/pulp-platform/iDMA.git dependencies: From 8f777aa3a85f05dc2d8c12aa6224963a3c572f0f Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 29 Oct 2025 10:46:02 +0100 Subject: [PATCH 46/77] Update plots for paper --- experiments/barrier/plot.py | 4 +- experiments/dma_multicast_v2/plot.py | 75 +++++++++++++++++++++------- 
experiments/reduction/plot.py | 36 +++++++++---- experiments/summa_gemm/plot.py | 54 ++++++++++++++++---- 4 files changed, 129 insertions(+), 40 deletions(-) diff --git a/experiments/barrier/plot.py b/experiments/barrier/plot.py index c793ccda..f1e82de4 100755 --- a/experiments/barrier/plot.py +++ b/experiments/barrier/plot.py @@ -20,12 +20,12 @@ def plot1(show=True): df['cycles'] = df['results'].apply(experiments.get_total_cycles) # Plot - ax = df.pivot(index="n_clusters", columns="impl", values="cycles").plot(kind="bar") + ax = df.pivot(index="n_clusters", columns="impl", values="cycles").plot(kind="bar", width=0.65) ax.set_xlabel("Nr. clusters") ax.set_ylabel("Runtime [cycles]") ax.tick_params(axis='x', labelrotation=0) ax.grid(axis="y", alpha=0.4) - ax.legend() + ax.legend(handlelength=1, ncol=2, columnspacing=0.5, handletextpad=0.3) plt.tight_layout() if show: plt.show() diff --git a/experiments/dma_multicast_v2/plot.py b/experiments/dma_multicast_v2/plot.py index 0a380bd9..cbd846f2 100755 --- a/experiments/dma_multicast_v2/plot.py +++ b/experiments/dma_multicast_v2/plot.py @@ -154,7 +154,7 @@ def plot1(): plt.show() -def plot2(show=True): +def plot2(ax=None, y_label=None, hide_x_axis=False, show=True): """Plot actual and expected runtimes.""" # Load results @@ -173,10 +173,10 @@ def plot2(show=True): markers = {"expected": "o", "actual": "x"} # Plot measured runtimes - # _, ax = plt.subplots(2, 2) - _, ax = plt.subplots() + if ax is None: + _, ax = plt.subplots() - def plot_runtime_vs_speedup(ax, n_rows): + def plot_runtime_vs_speedup(ax, n_rows, y_label=y_label): res = df[df['n_rows'] == n_rows] sizes = res['size'].unique() ax.scatter( @@ -218,10 +218,24 @@ def plot_runtime_vs_speedup(ax, n_rows): x, y = hw_runtime_curve(sizes.min(), sizes.max(), n_rows) ax.plot(x, y, label='Model (hw)', linestyle='--', color=colors['hw']) - ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes]) - ax.set_xlabel('Size [B]') - ax.set_ylabel('Runtime 
[cycles]') - ax.grid(True, alpha=0.4) + if hide_x_axis: + ax.set_xlabel('') + ax.set_xticks(sizes) + ax.tick_params( + axis='x', + which='both', + bottom=False, top=False, # hide tick marks + labelbottom=False # hide labels + ) + else: + ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes]) + ax.set_xlabel('Size [B]') + ax.set_xlim(0, sizes.max() * 1.08) + if y_label is None: + y_label = 'Runtime [cycles]' + ax.set_ylabel(y_label) + ax.set_axisbelow(True) + ax.grid(True, linestyle='-', color='gainsboro') ax.legend() plot_runtime_vs_speedup(ax, n_rows=1) @@ -235,7 +249,7 @@ def plot_runtime_vs_speedup(ax, n_rows): return df -def plot3(show=True): +def plot3(ax=None, y_label=None, hide_x_axis=False, show=True): """Plot runtime dependency with delta.""" # Load results @@ -255,7 +269,8 @@ def plot3(show=True): colors = {"seq": "C1", "hw": "C2"} # Plot measured runtimes - _, ax = plt.subplots() + if ax is None: + _, ax = plt.subplots() # Plot model lines for sequential runtime (varying delta). 
x = np.arange(df['size'].min(), df['size'].max(), 64) @@ -288,10 +303,26 @@ def plot3(show=True): y = [model.hw_runtime(4, n_rows, e // 64) for e in x] ax.plot(x, y, label='hw', linestyle='--', color=colors['hw']) - ax.set_xticks(df['size'], [str(size) if size != 2048 else "" for size in df['size']]) - ax.set_xlabel('Size [B]') - ax.set_ylabel('Runtime [cycles]') - ax.grid(True, linestyle='-', alpha=0.4) + # Configure plot + sizes = df['size'] + if hide_x_axis: + ax.set_xlabel('') + ax.set_xticks(sizes) + ax.tick_params( + axis='x', + which='both', + bottom=False, top=False, # hide tick marks + labelbottom=False # hide labels + ) + else: + ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes]) + ax.set_xlabel('Size [B]') + ax.set_xlim(0, sizes.max() * 1.08) + if y_label is None: + y_label = 'Runtime [cycles]' + ax.set_ylabel(y_label) + ax.set_axisbelow(True) + ax.grid(True, linestyle='-', color='gainsboro') ax.legend() plt.tight_layout() @@ -299,7 +330,7 @@ def plot3(show=True): plt.show() -def plot4(show=True): +def plot4(ax=None, y_label=None, show=True): """Plot actual and expected runtimes, for multiple number of rows.""" # Load results @@ -317,7 +348,8 @@ def plot4(show=True): colors = {"sw": "C0", "hw": "C2"} # Plot measured runtimes - _, ax = plt.subplots() + if ax is None: + _, ax = plt.subplots() sizes = df['size'].unique() def plot_runtime_vs_speedup(ax, n_rows, show_label=False): @@ -353,11 +385,16 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): for i, n_rows in enumerate(df['n_rows'].unique()): plot_runtime_vs_speedup(ax, n_rows=n_rows, show_label=(i == 0)) + # Configure plot + sizes = df['size'].unique() + ax.set_xlim(0, sizes.max() * 1.08) ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes]) - ax.set_xlim(0, sizes.max() * 1.1) ax.set_xlabel('Size [B]') - ax.set_ylabel('Runtime [cycles]') - ax.grid(True, alpha=0.4) + if y_label is None: + y_label = 'Runtime [cycles]' + ax.set_ylabel(y_label) + 
ax.set_axisbelow(True) + ax.grid(True, linestyle='-', color='gainsboro') ax.legend() plt.tight_layout() diff --git a/experiments/reduction/plot.py b/experiments/reduction/plot.py index 33f14b6d..9a9d08b1 100755 --- a/experiments/reduction/plot.py +++ b/experiments/reduction/plot.py @@ -115,7 +115,7 @@ def hw_simple_runtime_curve(xmin, xmax, c, r): # return x, y_min -def plot1(show=True): +def plot1(y_label=None, hide_x_axis=False, show=True): """Plot actual and expected runtimes.""" # Load single-row results @@ -190,10 +190,24 @@ def plot1(show=True): ax.plot(x, y, label='Model (hw_simple)', linestyle='--', color=colors['hw_simple']) # Plot formatting - ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes]) - ax.set_xlabel('Size [B]') - ax.set_ylabel('Runtime [cycles]') - ax.grid(True, alpha=0.4) + if hide_x_axis: + ax.set_xlabel('') + ax.set_xticks(sizes) + ax.tick_params( + axis='x', + which='both', + bottom=False, top=False, # hide tick marks + labelbottom=False # hide labels + ) + else: + ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes]) + ax.set_xlabel('Size [B]') + if y_label is None: + y_label = 'Runtime [cycles]' + ax.set_ylabel(y_label) + ax.set_xlim(0, sizes.max() * 1.1) + ax.set_axisbelow(True) + ax.grid(True, color='gainsboro') ax.legend() plt.tight_layout() if show: @@ -202,7 +216,7 @@ def plot1(show=True): return df -def plot2(show=True): +def plot2(y_label=None, show=True): """Plot actual and expected runtimes, for multiple number of rows.""" # Load results @@ -272,8 +286,11 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): ax.set_xticks(sizes, [str(size) if size != 2048 else "" for size in sizes]) ax.set_xlim(0, sizes.max() * 1.1) ax.set_xlabel('Size [B]') - ax.set_ylabel('Runtime [cycles]') - ax.grid(True, alpha=0.4) + if y_label is None: + y_label = 'Runtime [cycles]' + ax.set_ylabel(y_label) + ax.set_axisbelow(True) + ax.grid(True, color='gainsboro') ax.legend() plt.tight_layout() @@ 
-339,7 +356,8 @@ def plot3(show=True): ax.set_xticks(df['size'], [str(size) if size != 2048 else "" for size in df['size']]) ax.set_xlabel('Size [B]') ax.set_ylabel('Runtime [cycles]') - ax.grid(True, linestyle='-', alpha=0.4) + ax.set_axisbelow(True) + ax.grid(True, linestyle='-', color='gainsboro') ax.legend() plt.tight_layout() diff --git a/experiments/summa_gemm/plot.py b/experiments/summa_gemm/plot.py index e19efc5b..9ac2a83b 100755 --- a/experiments/summa_gemm/plot.py +++ b/experiments/summa_gemm/plot.py @@ -8,11 +8,12 @@ import argparse import matplotlib.pyplot as plt from matplotlib.ticker import FuncFormatter +import numpy as np import pandas as pd from summa_gemm import model -def plot1(show=True): +def plot1(y_label=None, hide_x_axis=False, show=True): # Get data mesh_sizes = [4, 8, 16, 32, 64, 128, 256] t_comm_sw = [] @@ -42,12 +43,25 @@ def plot1(show=True): ax.plot(mesh_sizes, t_comm_sw, marker='o', label='$T_{comm}$ (sw)') ax.plot(mesh_sizes, t_comm_hw, marker='s', label='$T_{comm}$ (hw)') ax.plot(mesh_sizes, t_compute, marker='^', label='$T_{comp}$') - ax.set_xticks(mesh_sizes) ax.set_xscale('log', base=2) - ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: f"{int(x)}x{int(x)}")) - ax.set_xlabel('Mesh size') - ax.set_ylabel('Runtime [cycles]') - ax.grid(True, which="both", alpha=0.5) + ax.set_xticks(mesh_sizes) + ax.set_xlim(2.8, 370) + if hide_x_axis: + ax.set_xlabel('') + ax.tick_params( + axis='x', + which='both', + bottom=False, top=False, # hide tick marks + labelbottom=False # hide labels + ) + else: + ax.set_xlabel('Mesh size') + ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: f"{int(x)}x{int(x)}")) + if y_label is None: + y_label = 'Runtime [cycles]' + ax.set_ylabel(y_label) + ax.set_axisbelow(True) + ax.grid(True, which="both", color='gainsboro') ax.legend() fig.tight_layout() @@ -94,7 +108,7 @@ def plot1(show=True): return df -def plot2(show=True): +def plot2(y_label=None, show=True): # Get data mesh_sizes = [4, 8, 16, 
32, 64, 128, 256] Mt = model.max_square_problem_size() @@ -112,12 +126,32 @@ def plot2(show=True): # Calculate speedup df['speedup'] = df['t_sw'] / df['t_hw'] + # Bar widths (to make them of equal width on a log scale) + logx = np.log2(df['size'].to_numpy(dtype=float)) + mid = 0.5 * (logx[:-1] + logx[1:]) # midpoints between neighbors + log_left_edges = np.r_[logx[0] - (mid[0] - logx[0]), mid] # extrapolate first + log_right_edges = np.r_[mid, logx[-1] + (logx[-1] - mid[-1])]# extrapolate last + fill = 0.6 + log_span = (log_right_edges - log_left_edges) * fill + logL = logx - 0.5 * log_span + logR = logx + 0.5 * log_span + left = 2.0**logL + right = 2.0**logR + width = right - left + # Plot fig, ax = plt.subplots() - ax.bar(df['size'].astype(str), df['speedup']) + ax.bar(left, df['speedup'], width=width, align='edge') + ax.set_xscale('log', base=2) + ax.set_xlim(2.8, 370) + ax.set_xticks(mesh_sizes) + ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: f"{int(x)}x{int(x)}")) ax.set_xlabel('Mesh size') - ax.set_ylabel('Speedup') - ax.grid(axis='y', alpha=0.5) + if y_label is None: + y_label = 'Runtime [cycles]' + ax.set_ylabel(y_label) + ax.set_axisbelow(True) + ax.grid(axis='both', color='gainsboro') fig.tight_layout() if show: From f40cebe156f1898fe634db8298baf671c07b3317 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 30 Oct 2025 04:29:23 +0100 Subject: [PATCH 47/77] Update plots for paper again --- experiments/barrier/plot.py | 3 ++- experiments/dma_multicast_v2/plot.py | 6 +++--- experiments/reduction/plot.py | 5 ++--- experiments/summa_gemm/plot.py | 4 +++- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/experiments/barrier/plot.py b/experiments/barrier/plot.py index f1e82de4..dfd8341c 100755 --- a/experiments/barrier/plot.py +++ b/experiments/barrier/plot.py @@ -24,7 +24,8 @@ def plot1(show=True): ax.set_xlabel("Nr. 
clusters") ax.set_ylabel("Runtime [cycles]") ax.tick_params(axis='x', labelrotation=0) - ax.grid(axis="y", alpha=0.4) + ax.set_axisbelow(True) + ax.grid(axis="y", color="gainsboro") ax.legend(handlelength=1, ncol=2, columnspacing=0.5, handletextpad=0.3) plt.tight_layout() if show: diff --git a/experiments/dma_multicast_v2/plot.py b/experiments/dma_multicast_v2/plot.py index cbd846f2..bab6ec02 100755 --- a/experiments/dma_multicast_v2/plot.py +++ b/experiments/dma_multicast_v2/plot.py @@ -274,7 +274,7 @@ def plot3(ax=None, y_label=None, hide_x_axis=False, show=True): # Plot model lines for sequential runtime (varying delta). x = np.arange(df['size'].min(), df['size'].max(), 64) - for k, alpha_delta in enumerate([*range(model.DELTA + model.SEQ_ALPHA, -1, -12), 0]): + for k, alpha_delta in enumerate([*range(model.DELTA + model.SEQ_ALPHA, -1, -14), 0]): alpha = alpha_delta // 2 delta = alpha_delta - alpha x_ext = np.arange(df['size'].min(), df['size'].max() + 1024, 64) @@ -356,7 +356,7 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): res = df[df['n_rows'] == n_rows] best_sw_cycles = res[res['impl'].isin(['seq', 'tree'])].groupby('size')['cycles'].min() ax.scatter( - sizes, best_sw_cycles, label='Actual (min(seq, tree))' if show_label else None, + sizes, best_sw_cycles, label='Actual (sw)' if show_label else None, marker='x', color=colors['sw'] ) ax.scatter( @@ -367,7 +367,7 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): # Plot model line for best software implementation x, y = sw_runtime_curve(sizes.min(), sizes.max(), n_rows) ax.plot( - x, y, label='Model (min(seq, tree))' if show_label else None, + x, y, label='Model (sw)' if show_label else None, linestyle='--', color=colors['sw']) # Annotate sw model lines with number of rows, in correspondence with actual runtime diff --git a/experiments/reduction/plot.py b/experiments/reduction/plot.py index 9a9d08b1..405b5e0a 100755 --- a/experiments/reduction/plot.py +++ 
b/experiments/reduction/plot.py @@ -244,7 +244,7 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): res = df[df['n_rows'] == n_rows] best_sw_cycles = res[res['impl'].isin(['seq', 'tree'])].groupby('size')['cycles'].min() ax.scatter( - sizes, best_sw_cycles, label='Actual (min(seq, tree))' if show_label else None, + sizes, best_sw_cycles, label='Actual (sw)' if show_label else None, marker='x', color=colors['sw'] ) ax.scatter( @@ -257,10 +257,9 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): # ) # Plot model line for best software implementation - # TODO(colluca): use sw_runtime_curve once sequential implementation is integrated x, y = monotone_tree_runtime_curve(sizes.min(), sizes.max(), c, n_rows) ax.plot( - x, y, label='Model (min(seq, tree))' if show_label else None, + x, y, label='Model (sw)' if show_label else None, linestyle='--', color=colors['sw']) # Annotate sw model lines with number of rows, in correspondence with actual runtime diff --git a/experiments/summa_gemm/plot.py b/experiments/summa_gemm/plot.py index 9ac2a83b..6d77bc6c 100755 --- a/experiments/summa_gemm/plot.py +++ b/experiments/summa_gemm/plot.py @@ -141,9 +141,11 @@ def plot2(y_label=None, show=True): # Plot fig, ax = plt.subplots() - ax.bar(left, df['speedup'], width=width, align='edge') + ax.axhline(y=1, color='black', linewidth=1, zorder=5) + ax.bar(left, df['speedup'], width=width, align='edge', zorder=10) ax.set_xscale('log', base=2) ax.set_xlim(2.8, 370) + ax.set_ylim(0.5, None) ax.set_xticks(mesh_sizes) ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: f"{int(x)}x{int(x)}")) ax.set_xlabel('Mesh size') From 6213bc2e6dbccac913fbac3739d4ee785c811adc Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 30 Oct 2025 14:23:45 +0100 Subject: [PATCH 48/77] More fixes for the paper --- experiments/dma_multicast_v2/plot.py | 8 +-- experiments/reduction/experiments.py | 15 ++---- experiments/reduction/fit.py | 27 ++-------- experiments/reduction/model.py | 23 
+++------ experiments/reduction/plot.py | 73 ++++++++-------------------- experiments/summa_gemm/model.py | 2 +- 6 files changed, 43 insertions(+), 105 deletions(-) diff --git a/experiments/dma_multicast_v2/plot.py b/experiments/dma_multicast_v2/plot.py index bab6ec02..aba317fa 100755 --- a/experiments/dma_multicast_v2/plot.py +++ b/experiments/dma_multicast_v2/plot.py @@ -47,7 +47,7 @@ def select_best_sequential_configuration(df): def seq_runtime_curve(xmin, xmax, n_rows): - x = np.arange(xmin, xmax, 64) + x = np.arange(xmin, xmax + 64, 64) y = [model.optimal_seq_runtime(4, n_rows, e // 64) for e in x] return x, y @@ -75,7 +75,7 @@ def find_monotone_lower_fit(x, y): def tree_runtime_curve(xmin, xmax, n_rows): - x = np.arange(xmin, xmax, 64) + x = np.arange(xmin, xmax + 64, 64) y = [model.tree_runtime(4, n_rows, e // 64) for e in x] return x, y @@ -87,7 +87,7 @@ def hw_runtime_curve(xmin, xmax, n_rows): def sw_runtime_curve(xmin, xmax, n_rows): - x = np.arange(xmin, xmax, 64) + x = np.arange(xmin, xmax + 64, 64) x, y_seq = seq_runtime_curve(xmin, xmax, n_rows) _, y_tree = tree_runtime_curve(xmin, xmax, n_rows) x, y_min = find_monotone_lower_fit(x, [min(a, b) for a, b in zip(y_seq, y_tree)]) @@ -273,7 +273,7 @@ def plot3(ax=None, y_label=None, hide_x_axis=False, show=True): _, ax = plt.subplots() # Plot model lines for sequential runtime (varying delta). 
- x = np.arange(df['size'].min(), df['size'].max(), 64) + x = np.arange(df['size'].min(), df['size'].max() + 64, 64) for k, alpha_delta in enumerate([*range(model.DELTA + model.SEQ_ALPHA, -1, -14), 0]): alpha = alpha_delta // 2 delta = alpha_delta - alpha diff --git a/experiments/reduction/experiments.py b/experiments/reduction/experiments.py index 48bb028a..1461d0f2 100755 --- a/experiments/reduction/experiments.py +++ b/experiments/reduction/experiments.py @@ -51,8 +51,8 @@ def derive_vsim_builddir(self, experiment): def gen_experiments(): experiments = [] # for impl in ['seq']: - # for impl in ['tree', 'hw_simple', 'hw_generic']: - for impl in ['seq', 'tree', 'hw_simple', 'hw_generic']: + # for impl in ['tree', 'hw']: + for impl in ['seq', 'tree', 'hw']: # for n_rows in [1]: for n_rows in [1, 2, 4]: # for size in [4096]: @@ -77,7 +77,7 @@ def gen_experiments(): 'size': size, 'batch': batch_size, 'app': 'reduction_benchmark', - 'hw': 'simple' if impl == 'hw_simple' else 'generic', + 'hw': 'simple', 'roi': Path.cwd() / f'roi/{impl}.json.tpl', } work_dir = DIR / 'hw' / experiment['hw'] @@ -140,17 +140,12 @@ def tree_cycles(sim_results, c, r, n_batches): return sim_results.get_timespan(start, end) // TCK -def hw_generic_cycles(sim_results): - roi = SimRegion(dma_core(0), 'reduction', 1) - return sim_results.get_timespan(roi) // TCK - - -def hw_simple_dma_cycles(sim_results): +def hw_dma_cycles(sim_results): roi = SimRegion(dma_core(0), 'row reduction', 1) return sim_results.get_timespan(roi) // TCK -def hw_simple_cycles(sim_results): +def hw_cycles(sim_results): start = SimRegion(dma_core(0), 'row reduction', 1) end = SimRegion(dma_core(0), 'column reduction', 1) return sim_results.get_timespan(start, end) // TCK diff --git a/experiments/reduction/fit.py b/experiments/reduction/fit.py index d35700c4..55c72691 100755 --- a/experiments/reduction/fit.py +++ b/experiments/reduction/fit.py @@ -133,36 +133,17 @@ def fit_tree_compute(df=None, quiet=True): @cache -def 
fit_hw_generic(df=None, quiet=True): +def fit_hw(df=None, quiet=True): # Get experiment data if df is None: df = experiments.results() - df = df[df['impl'] == 'hw_generic'] + df = df[df['impl'] == 'hw'] # Retrieve only single-row experiments (fitted model should generalize to multi-row) df = df[df['n_rows'] == 1] # Fit data - df['cycles'] = df['results'].apply(experiments.hw_generic_cycles) - x = df['size'].to_numpy() / BEAT_BYTES - y = df['cycles'].to_numpy() - if not quiet: - print("Fit runtime for hw generic reduction:") - return fit(x, y, quiet=quiet) - - -@cache -def fit_hw_simple(df=None, quiet=True): - # Get experiment data - if df is None: - df = experiments.results() - df = df[df['impl'] == 'hw_simple'] - - # Retrieve only single-row experiments (fitted model should generalize to multi-row) - df = df[df['n_rows'] == 1] - - # Fit data - df['cycles'] = df['results'].apply(experiments.hw_simple_dma_cycles) + df['cycles'] = df['results'].apply(experiments.hw_dma_cycles) x = df['size'].to_numpy() / BEAT_BYTES y = df['cycles'].to_numpy() if not quiet: @@ -178,6 +159,8 @@ def main(): fit_seq_dma(quiet=False) fit_seq_compute(quiet=False) + fit_hw(quiet=False) + if __name__ == '__main__': main() diff --git a/experiments/reduction/model.py b/experiments/reduction/model.py index d503d17f..29cc016c 100755 --- a/experiments/reduction/model.py +++ b/experiments/reduction/model.py @@ -33,27 +33,18 @@ def nearest_divisors(divisor, dividend): return lower, upper -def hw_generic_runtime(c, r, n): - hw_alpha, _ = fit.fit_hw_generic() - beta = 1 +def hw_runtime(c, r, n): + hw_alpha, hw_beta = fit.fit_hw() if r > 1: - beta = 2 - return hw_alpha + n * beta - - -def hw_simple_runtime(c, r, n): - hw_alpha, _ = fit.fit_hw_simple() - beta = 1 - if r > 1: - beta = 2 - return hw_alpha + n * beta + hw_beta = 2 + return hw_alpha + n * hw_beta def seq_runtime(c, r, n, k, delta=DELTA): batch = int(n // k) dma_alpha, _ = fit.fit_seq_dma() comp_alpha, _ = fit.fit_seq_compute() - t_comp = 
comp_alpha + batch * 1 + t_comp = comp_alpha + batch * 1.16 t_dma = dma_alpha + batch * 1 t_max = max(t_comp, t_dma) # print(t_dma * 5, t_comp * 5, t_max * 5) @@ -64,7 +55,7 @@ def seq_runtime(c, r, n, k, delta=DELTA): def optimal_seq_k(c, r, n, delta=DELTA): comp_alpha, _ = fit.fit_seq_compute() - real_k = sqrt(n / (delta + comp_alpha)) + real_k = sqrt(n * (2 * c - 3) / (delta + comp_alpha)) lower_k, upper_k = nearest_divisors(real_k, n) assert (lower_k is None) or (lower_k > 0) assert (upper_k is None) or (upper_k <= n) @@ -90,7 +81,7 @@ def tree_runtime(c, r, n, k, delta=DELTA): batch = int(n // k) dma_alpha, _ = fit.fit_tree_dma() comp_alpha, _ = fit.fit_tree_compute() - t_comp = comp_alpha + batch * 1 + t_comp = comp_alpha + batch * 1.16 t_dma = dma_alpha + batch * 1 t_max = max(t_comp, t_dma) # print(t_dma * 5, t_comp * 5, t_max * 5) diff --git a/experiments/reduction/plot.py b/experiments/reduction/plot.py index 405b5e0a..d7bde3b0 100755 --- a/experiments/reduction/plot.py +++ b/experiments/reduction/plot.py @@ -38,10 +38,8 @@ def get_actual_cycles(row): elif row['impl'] == 'tree': n_batches = int(row['size'] // row['batch']) return experiments.tree_cycles(row['results'], 4, row['n_rows'], n_batches) - elif row['impl'] == 'hw_simple': - return experiments.hw_simple_cycles(row['results']) - elif row['impl'] == 'hw_generic': - return experiments.hw_generic_cycles(row['results']) + elif row['impl'] == 'hw': + return experiments.hw_cycles(row['results']) def get_expected_cycles(row): @@ -49,10 +47,8 @@ def get_expected_cycles(row): cycles = model.optimal_seq_runtime(4, row['n_rows'], row['size'] // BEAT_BYTES) elif row['impl'] == 'tree': cycles = model.optimal_tree_runtime(4, row['n_rows'], row['size'] // BEAT_BYTES) - elif row['impl'] == 'hw_generic': - cycles = model.hw_generic_runtime(4, row['n_rows'], row['size'] // BEAT_BYTES) - elif row['impl'] == 'hw_simple': - cycles = model.hw_simple_runtime(4, row['n_rows'], row['size'] // BEAT_BYTES) + elif 
row['impl'] == 'hw': + cycles = model.hw_runtime(4, row['n_rows'], row['size'] // BEAT_BYTES) return int(cycles) @@ -72,7 +68,7 @@ def select_best_configs(df): def seq_runtime_curve(xmin, xmax, c, r): - x = np.arange(xmin, xmax, 64) + x = np.arange(xmin, xmax + 64, 64) y = [model.optimal_seq_runtime(c, r, e // BEAT_BYTES) for e in x] return x, y @@ -84,7 +80,7 @@ def monotone_seq_runtime_curve(xmin, xmax, c, r): def tree_runtime_curve(xmin, xmax, c, r): - x = np.arange(xmin, xmax, 64) + x = np.arange(xmin, xmax + 64, 64) y = [model.optimal_tree_runtime(c, r, e // BEAT_BYTES) for e in x] return x, y @@ -95,15 +91,9 @@ def monotone_tree_runtime_curve(xmin, xmax, c, r): return find_monotone_lower_fit(x, y) -def hw_generic_runtime_curve(xmin, xmax, c, r): - x = np.arange(xmin, xmax, 64) - y = [model.hw_generic_runtime(c, r, e // BEAT_BYTES) for e in x] - return x, y - - -def hw_simple_runtime_curve(xmin, xmax, c, r): - x = np.arange(xmin, xmax, 64) - y = [model.hw_simple_runtime(c, r, e // BEAT_BYTES) for e in x] +def hw_runtime_curve(xmin, xmax, c, r): + x = np.arange(xmin, xmax + 64, 64) + y = [model.hw_runtime(c, r, e // BEAT_BYTES) for e in x] return x, y @@ -134,7 +124,7 @@ def plot1(y_label=None, hide_x_axis=False, show=True): df = df.drop(columns=['results']) # Choose consistent colors and markers for the plots - colors = {"seq": "C0", "tree": "C1", "hw_simple": "C2", "hw_generic": "C3"} + colors = {"seq": "C0", "tree": "C1", "hw": "C2"} markers = {"expected": "o", "actual": "x"} # Create plot @@ -151,13 +141,9 @@ def plot1(y_label=None, hide_x_axis=False, show=True): marker=markers['actual'], color=colors['tree'] ) ax.scatter( - sizes, df[df['impl'] == 'hw_simple']['cycles'], label='Actual (hw_simple)', - marker=markers['actual'], color=colors['hw_simple'] + sizes, df[df['impl'] == 'hw']['cycles'], label='Actual (hw)', + marker=markers['actual'], color=colors['hw'] ) - # ax.scatter( - # sizes, df[df['impl'] == 'hw_generic']['cycles'], label='Actual 
(hw_generic)', - # marker=markers['actual'], color=colors['hw_generic'] - # ) # # Plot expected runtimes # ax.scatter( @@ -165,12 +151,8 @@ def plot1(y_label=None, hide_x_axis=False, show=True): # marker=markers['expected'], color=colors['tree'] # ) # ax.scatter( - # sizes, df[df['impl'] == 'hw_simple']['exp_cycles'], label='Expected (hw_simple)', - # marker=markers['expected'], color=colors['hw_simple'] - # ) - # ax.scatter( - # sizes, df[df['impl'] == 'hw_generic']['exp_cycles'], label='Expected (hw_generic)', - # marker=markers['expected'], color=colors['hw_generic'] + # sizes, df[df['impl'] == 'hw']['exp_cycles'], label='Expected (hw)', + # marker=markers['expected'], color=colors['hw'] # ) # Plot model line for seq runtime @@ -181,13 +163,9 @@ def plot1(y_label=None, hide_x_axis=False, show=True): x, y = monotone_tree_runtime_curve(sizes.min(), sizes.max(), c, r) ax.plot(x, y, label='Model (tree)', linestyle='--', color=colors['tree']) - # # Plot model line for hw generic runtime - # x, y = hw_generic_runtime_curve(sizes.min(), sizes.max(), c, r) - # ax.plot(x, y, label='Model (hw_generic)', linestyle='--', color=colors['hw_generic']) - # Plot model line for hw simple runtime - x, y = hw_simple_runtime_curve(sizes.min(), sizes.max(), c, r) - ax.plot(x, y, label='Model (hw_simple)', linestyle='--', color=colors['hw_simple']) + x, y = hw_runtime_curve(sizes.min(), sizes.max(), c, r) + ax.plot(x, y, label='Model (hw)', linestyle='--', color=colors['hw']) # Plot formatting if hide_x_axis: @@ -233,7 +211,7 @@ def plot2(y_label=None, show=True): df = df.drop(columns=['results']) # Choose consistent colors for the plots - colors = {"sw": "C0", "hw_generic": "C1", "hw_simple": "C2"} + colors = {"sw": "C0", "hw": "C1"} # Create plot _, ax = plt.subplots() @@ -248,13 +226,9 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): marker='x', color=colors['sw'] ) ax.scatter( - sizes, res[res['impl'] == 'hw_simple']['cycles'], label='Actual (hw_simple)' if show_label 
else None, - marker='x', color=colors['hw_simple'] + sizes, res[res['impl'] == 'hw']['cycles'], label='Actual (hw)' if show_label else None, + marker='x', color=colors['hw'] ) - # ax.scatter( - # sizes, res[res['impl'] == 'hw_generic']['cycles'], label='Actual (hw_generic)' if show_label else None, - # marker='x', color=colors['hw_generic'] - # ) # Plot model line for best software implementation x, y = monotone_tree_runtime_curve(sizes.min(), sizes.max(), c, n_rows) @@ -270,14 +244,9 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): ha='left', va='center', clip_on=False) # Plot model line for hardware simple runtime - x, y = hw_simple_runtime_curve(sizes.min(), sizes.max(), c, n_rows) + x, y = hw_runtime_curve(sizes.min(), sizes.max(), c, n_rows) ax.plot( - x, y, label='Model (hw_simple)' if show_label else None, linestyle='--', color=colors['hw_simple']) - - # # Plot model line for hardware generic runtime - # x, y = hw_generic_runtime_curve(sizes.min(), sizes.max(), c, n_rows) - # ax.plot( - # x, y, label='Model (hw_generic)' if show_label else None, linestyle='--', color=colors['hw_generic']) + x, y, label='Model (hw)' if show_label else None, linestyle='--', color=colors['hw']) for i, n_rows in enumerate(df['n_rows'].unique()): plot_runtime_vs_speedup(ax, n_rows=n_rows, show_label=(i == 0)) diff --git a/experiments/summa_gemm/model.py b/experiments/summa_gemm/model.py index 2326a265..1c039853 100755 --- a/experiments/summa_gemm/model.py +++ b/experiments/summa_gemm/model.py @@ -65,7 +65,7 @@ def t_reduction(r, c, bytes, impl='sw'): if impl == 'sw': return reduction.model.optimal_sw_runtime(c, r, n) elif impl == 'hw': - return reduction.model.hw_simple_runtime(c, r, n) + return reduction.model.hw_runtime(c, r, n) else: raise ValueError(f"Unknown reduction implementation: {impl}") From a976c11f88c9d322d8d147cfa90138267cb66296 Mon Sep 17 00:00:00 2001 From: Chen Wu Date: Thu, 30 Oct 2025 16:36:20 +0100 Subject: [PATCH 49/77] modify 2d reduction plot, 
also add the hyperbank reduction(tree not supported yet) --- experiments/reduction/model.py | 2 + experiments/reduction/plot.py | 14 +- .../tests/reduction_benchmark_hyperbank.c | 791 ++++++++++++++++++ 3 files changed, 800 insertions(+), 7 deletions(-) create mode 100644 sw/snitch/tests/reduction_benchmark_hyperbank.c diff --git a/experiments/reduction/model.py b/experiments/reduction/model.py index 29cc016c..a9dec3a6 100755 --- a/experiments/reduction/model.py +++ b/experiments/reduction/model.py @@ -49,6 +49,8 @@ def seq_runtime(c, r, n, k, delta=DELTA): t_max = max(t_comp, t_dma) # print(t_dma * 5, t_comp * 5, t_max * 5) n_iters = 1 + 2 * (c - 2) + k + if r > 1: + n_iters += 2 * (r - 2) + k # First approximation model assumes compute and dma take roughly the same time return n_iters * (t_max + delta) - delta diff --git a/experiments/reduction/plot.py b/experiments/reduction/plot.py index d7bde3b0..43861dac 100755 --- a/experiments/reduction/plot.py +++ b/experiments/reduction/plot.py @@ -97,12 +97,12 @@ def hw_runtime_curve(xmin, xmax, c, r): return x, y -# def sw_runtime_curve(xmin, xmax, n_rows): -# x = np.arange(xmin, xmax, 64) -# x, y_seq = seq_runtime_curve(xmin, xmax, n_rows) -# _, y_tree = tree_runtime_curve(xmin, xmax, n_rows) -# x, y_min = find_monotone_lower_fit(x, [min(a, b) for a, b in zip(y_seq, y_tree)]) -# return x, y_min +def sw_runtime_curve(xmin, xmax, c, n_rows): + x = np.arange(xmin, xmax, 64) + x, y_seq = seq_runtime_curve(xmin, xmax, c, n_rows) + _, y_tree = tree_runtime_curve(xmin, xmax, c, n_rows) + x, y_min = find_monotone_lower_fit(x, [min(a, b) for a, b in zip(y_seq, y_tree)]) + return x, y_min def plot1(y_label=None, hide_x_axis=False, show=True): @@ -231,7 +231,7 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): ) # Plot model line for best software implementation - x, y = monotone_tree_runtime_curve(sizes.min(), sizes.max(), c, n_rows) + x, y = sw_runtime_curve(sizes.min(), sizes.max(), c, n_rows) ax.plot( x, y, 
label='Model (sw)' if show_label else None, linestyle='--', color=colors['sw']) diff --git a/sw/snitch/tests/reduction_benchmark_hyperbank.c b/sw/snitch/tests/reduction_benchmark_hyperbank.c new file mode 100644 index 00000000..f36cb334 --- /dev/null +++ b/sw/snitch/tests/reduction_benchmark_hyperbank.c @@ -0,0 +1,791 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande +// Chen Wu + +#include "snrt.h" +#include "blas.h" + +// Transfer size in bytes +#ifndef SIZE +#define SIZE 8192 +#endif + +// Batch size in bytes +#ifndef BATCH +#define BATCH 1024 +#endif + +typedef enum { + SEQ, + TREE, + HW_GENERIC, + HW_SIMPLE, +} impl_t; + +#ifndef IMPL +#define IMPL SEQ +#endif + +#define N_BATCHES (SIZE / BATCH) +#define N_ELEMS (SIZE / sizeof(double)) + +#ifndef LOG2_N_ROWS +#define LOG2_N_ROWS 2 +#endif + +#define N_ROWS (1 << LOG2_N_ROWS) + +// Replace `1` literals with this thread-local variable to prevent it from +// being allocated by the compiler in L2. +__thread double one = 1; + +static inline void global_hw_barrier(volatile uint32_t *barrier_ptr, + uint32_t user) { + + // Perform inter-cluster barrier. Disable reduction before the fence + // so it overlaps with the latency of the ongoing reduction operation. 
+ if (snrt_is_dm_core()) { + snrt_set_awuser_low(user); + *barrier_ptr = 1; + snrt_set_awuser_low(0); + snrt_fence(); + } + snrt_cluster_hw_barrier(); +} + +static inline void swap_buffers(uintptr_t *a, uintptr_t *b) { + uintptr_t temp = *a; + *a = *b; + *b = temp; +} + +static inline void dma_reduction_hw_generic(uintptr_t src, uintptr_t dst, + snrt_comm_t comm) { + if (snrt_is_dm_core() && comm->is_participant) { + uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr( + (void *)dst, snrt_cluster_idx(), 0); + snrt_collective_opcode_t op = SNRT_REDUCTION_FADD; + snrt_dma_start_1d_reduction(remote_dst, src, SIZE, comm, op); + snrt_dma_wait_all(); + } +} + +static inline void dma_reduction_hw_simple(uintptr_t src, uintptr_t dst, + snrt_comm_t comm) { + + // Create a communicator per row + snrt_comm_t row_comm[N_ROWS]; + for (int i = 0; i < N_ROWS; i++) + pb_create_mesh_comm(&row_comm[i], 1, pb_cluster_num_in_row(), i, 0, + comm); + + // Create a communicator for the first column + snrt_comm_t col_comm; + pb_create_mesh_comm(&col_comm, N_ROWS, 1, 0, 0, comm); + + if (snrt_is_dm_core() && comm->is_participant) { + uint32_t remote_cluster; + uintptr_t remote_dst; + snrt_collective_opcode_t op = SNRT_REDUCTION_FADD; + + // Reduction across rows (destination: first cluster in row) + snrt_mcycle(); + remote_cluster = pb_calculate_cluster_idx(pb_cluster_row_idx(), 0); + remote_dst = (uintptr_t)snrt_remote_l1_ptr( + (void *)dst, snrt_cluster_idx(), remote_cluster); + snrt_dma_start_1d_reduction( + remote_dst, src, SIZE, row_comm[pb_cluster_row_idx()], op); + snrt_dma_wait_all(); + snrt_mcycle(); + + // Barrier to ensure there is only one outstanding reduction in every + // router + snrt_inter_cluster_barrier(comm); + + // Reduction across first column (destination: cluster 0) + snrt_mcycle(); + if (N_ROWS > 1 && col_comm->is_participant) { + // Switch source and destination buffer + remote_dst = (uintptr_t)snrt_remote_l1_ptr( + (void *)src, snrt_cluster_idx(), 0); + 
snrt_dma_start_1d_reduction(remote_dst, dst, SIZE, col_comm, op); + snrt_dma_wait_all(); + } + snrt_mcycle(); + } +} + +static inline void dma_reduction_seq_normal(uintptr_t a, uintptr_t b, uintptr_t c, + uintptr_t d, snrt_comm_t comm) { + if (!comm->is_participant) return; + + uint32_t col_idx = pb_cluster_col_idx(); + uint32_t row_idx = pb_cluster_row_idx(); + uintptr_t is_northernmost = pb_cluster_in_row(N_ROWS - 1); + + // Group to simplify rotating buffers + uintptr_t dst[2] = {b, d}; + + if (snrt_cluster_idx() == 0 && snrt_cluster_core_idx() == 0) { + DUMP(a); + DUMP(b); + DUMP(c); + DUMP(d); + } + + // Compute addresses of source and destination data for DMA cores. + // All clusters in col > 0 participate in the row reduction. For this + // reduction the source is in `a` and the result in `c`. + // Only the easternmost clusters send their input data (in `a`) to their + // west neighbours, all other clusters send the data they reduced in the + // previous step (in `c`). + // Clusters in col == 0 participate in the column reduction. For this + // reduction the source is in `c` and the result in `a`. + // Only the northernmost cluster sends the source data (in `c`) to its + // south neighbour, all other clusters send the data they reduced in the + // previous step (in `a`). + uintptr_t dma_src, dma_dst; + if (pb_cluster_is_easternmost() || (col_idx == 0 && !is_northernmost)) { + dma_src = a; + } else { + dma_src = c; + } + // Clusters in col > 0, participating in row reduction, send to west + // neighbours. Clusters in col == 0, participating in column reduction, + // send to south neighbours. + uint32_t dst_cluster = col_idx > 0 ? 
pb_cluster_west_neighbour() : + pb_cluster_south_neighbour(); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), dst_cluster); + + // Compute addresses of source and result data for compute cores + uintptr_t comp_src1, comp_src2, comp_result; + comp_src1 = a; + comp_src2 = dst[0]; + comp_result = c; + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. + snrt_collective_op_t op; + op.f.collective_op = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Iterations to cover all reductions in a row + uint32_t n_iters = 1 + 2 * (pb_cluster_num_in_row() - 2) + N_BATCHES; + for (uint32_t i = 0; i < n_iters; i++) { + + uint32_t col_idx_inv, base_iter; + char dma_active, compute_active; + + // Determine which clusters need to send data at every iteration + col_idx_inv = pb_cluster_num_in_row() - col_idx - 1; + base_iter = 2 * col_idx_inv; + dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + dma_active &= !pb_cluster_in_col(0); + + // Determine which clusters need to reduce data at every iteration + base_iter = 1 + 2 * (col_idx_inv - 1); + compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + compute_active &= !pb_cluster_is_easternmost(); + + if (compute_active && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1, + (double *)comp_src2, (double *)comp_result); + comp_src1 += BATCH; + swap_buffers(&dst[0], &dst[1]); + comp_src2 = dst[0]; + comp_result += BATCH; + snrt_mcycle(); + } + + if (dma_active && snrt_is_dm_core()) { + + // Start DMA + snrt_mcycle(); + snrt_dma_start_1d(dma_dst, dma_src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + dma_src += BATCH; 
+ swap_buffers(&dst[0], &dst[1]); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), pb_cluster_west_neighbour()); + asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + global_hw_barrier(barrier_ptr, user); + } + + // Final column reduction is performed only if there is more than one row + if (N_ROWS > 1) { + + dst[0] = b; + dst[1] = d; + comp_src1 = c; + comp_src2 = dst[0]; + comp_result = a; + + // Iterations to cover all reductions in the first column + n_iters = 1 + 2 * (N_ROWS - 2) + N_BATCHES; + for (uint32_t i = 0; i < n_iters; i++) { + + uint32_t row_idx_inv, base_iter; + char dma_active, compute_active; + + // Determine which clusters need to send data at every iteration + row_idx_inv = N_ROWS - row_idx - 1; + base_iter = 2 * row_idx_inv; + dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + dma_active &= pb_cluster_in_col(0); + dma_active &= snrt_cluster_idx() != 0; + + // Determine which clusters need to reduce data at every iteration + base_iter = 1 + 2 * (row_idx_inv - 1); + compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + compute_active &= pb_cluster_in_col(0); + compute_active &= !is_northernmost; + + if (compute_active && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1, + (double *)comp_src2, (double *)comp_result); + comp_src1 += BATCH; + swap_buffers(&dst[0], &dst[1]); + comp_src2 = dst[0]; + comp_result += BATCH; + snrt_mcycle(); + } + + if (dma_active && snrt_is_dm_core()) { + // Start DMA + snrt_mcycle(); + snrt_dma_start_1d(dma_dst, dma_src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + dma_src += BATCH; + swap_buffers(&dst[0], &dst[1]); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), pb_cluster_south_neighbour()); + 
asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + global_hw_barrier(barrier_ptr, user); + } + } +} + + +// We need four buffers. Upon function invocation, the data must be in a. +static inline void dma_reduction_seq_hyperbank(uintptr_t a0, uintptr_t a1, + uintptr_t b, + uintptr_t c0, uintptr_t c1, + uintptr_t d, + snrt_comm_t comm) { + if (!comm->is_participant) return; + + uint32_t col_idx = pb_cluster_col_idx(); + uint32_t row_idx = pb_cluster_row_idx(); + uintptr_t is_northernmost = pb_cluster_in_row(N_ROWS - 1); + + // Group to simplify rotating buffers + uintptr_t dst[2] = {b, d}; + uintptr_t a_ptr[2] = {a0, a1}; + uintptr_t c_ptr[2] = {c0, c1}; + + if (snrt_cluster_idx() == 0 && snrt_cluster_core_idx() == 0) { + DUMP(a0); DUMP(a1); + DUMP(b); + DUMP(c0); DUMP(c1); + DUMP(d); + } + + // Compute addresses of source and destination data for DMA cores. + // All clusters in col > 0 participate in the row reduction. For this + // reduction the source is in `a` and the result in `c`. + // Only the easternmost clusters send their input data (in `a`) to their + // west neighbours, all other clusters send the data they reduced in the + // previous step (in `c`). + // Clusters in col == 0 participate in the column reduction. For this + // reduction the source is in `c` and the result in `a`. + // Only the northernmost cluster sends the source data (in `c`) to its + // south neighbour, all other clusters send the data they reduced in the + // previous step (in `a`). + uintptr_t dma_src, dma_dst; + if (pb_cluster_is_easternmost() || (col_idx == 0 && !is_northernmost)) { + dma_src = a_ptr[0]; + } else { + dma_src = c_ptr[0]; + } + // Clusters in col > 0, participating in row reduction, send to west + // neighbours. Clusters in col == 0, participating in column reduction, + // send to south neighbours. + uint32_t dst_cluster = col_idx > 0 ? 
pb_cluster_west_neighbour() : + pb_cluster_south_neighbour(); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), dst_cluster); + + // Compute addresses of source and result data for compute cores + uintptr_t comp_src1, comp_src2, comp_result; + comp_src1 = a_ptr[0]; + comp_src2 = dst[0]; + comp_result = c_ptr[0]; + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. + snrt_collective_op_t op; + op.f.collective_op = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Iterations to cover all reductions in a row + uint32_t n_iters = 1 + 2 * (pb_cluster_num_in_row() - 2) + N_BATCHES; + for (uint32_t i = 0; i < n_iters; i++) { + + uint32_t col_idx_inv, base_iter; + char dma_active, compute_active; + + // Determine which clusters need to send data at every iteration + col_idx_inv = pb_cluster_num_in_row() - col_idx - 1; + base_iter = 2 * col_idx_inv; + dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + dma_active &= !pb_cluster_in_col(0); + + // Determine which clusters need to reduce data at every iteration + base_iter = 1 + 2 * (col_idx_inv - 1); + compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + compute_active &= !pb_cluster_is_easternmost(); + + if (compute_active && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1, + (double *)comp_src2, (double *)comp_result); + + // comp_src1 += BATCH; + a_ptr[0] += BATCH; + swap_buffers(&a_ptr[0], &a_ptr[1]); + comp_src1 = a_ptr[0]; + + swap_buffers(&dst[0], &dst[1]); + comp_src2 = dst[0]; + + // comp_result += BATCH; + c_ptr[0] += BATCH; + swap_buffers(&c_ptr[0], &c_ptr[1]); + comp_result = c_ptr[0]; + + snrt_mcycle(); + } + + if (dma_active && snrt_is_dm_core()) { + + // Start DMA + snrt_mcycle(); + 
snrt_dma_start_1d(dma_dst, dma_src, BATCH); + // DUMP(dma_src);DUMP(*((double *)dma_src)); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + // dma_src += BATCH; + if (pb_cluster_is_easternmost() || (col_idx == 0 && !is_northernmost)) { + a_ptr[0] += BATCH; + swap_buffers(&a_ptr[0], &a_ptr[1]); + dma_src = a_ptr[0]; + } else { + c_ptr[0] += BATCH; + swap_buffers(&c_ptr[0], &c_ptr[1]); + dma_src = c_ptr[0]; + } + + swap_buffers(&dst[0], &dst[1]); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), pb_cluster_west_neighbour()); + asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + global_hw_barrier(barrier_ptr, user); + } + + // Final column reduction is performed only if there is more than one row + if (N_ROWS > 1) { + + dst[0] = b; + dst[1] = d; + a_ptr[0] = a0; + a_ptr[1] = a1; + c_ptr[0] = c0; + c_ptr[1] = c1; + + comp_src1 = c_ptr[0]; + comp_src2 = dst[0]; + comp_result = a_ptr[0]; + + // Iterations to cover all reductions in the first column + n_iters = 1 + 2 * (N_ROWS - 2) + N_BATCHES; + for (uint32_t i = 0; i < n_iters; i++) { + + uint32_t row_idx_inv, base_iter; + char dma_active, compute_active; + + // Determine which clusters need to send data at every iteration + row_idx_inv = N_ROWS - row_idx - 1; + base_iter = 2 * row_idx_inv; + dma_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + dma_active &= pb_cluster_in_col(0); + dma_active &= snrt_cluster_idx() != 0; + + // Determine which clusters need to reduce data at every iteration + base_iter = 1 + 2 * (row_idx_inv - 1); + compute_active = (i >= base_iter) && (i < (base_iter + N_BATCHES)); + compute_active &= pb_cluster_in_col(0); + compute_active &= !is_northernmost; + + if (compute_active && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)comp_src1, + 
(double *)comp_src2, (double *)comp_result); + // comp_src1 += BATCH; + c_ptr[0] += BATCH; + swap_buffers(&c_ptr[0], &c_ptr[1]); + comp_src1 = c_ptr[0]; + + swap_buffers(&dst[0], &dst[1]); + comp_src2 = dst[0]; + + // comp_result += BATCH; + a_ptr[0] += BATCH; + swap_buffers(&a_ptr[0], &a_ptr[1]); + comp_result = a_ptr[0]; + snrt_mcycle(); + } + + if (dma_active && snrt_is_dm_core()) { + // Start DMA + snrt_mcycle(); + snrt_dma_start_1d(dma_dst, dma_src, BATCH); + + // Update pointers for next iteration while transfer completes, + // preventing instructions from being reordered after the DMA wait + // dma_src += BATCH; + if (!is_northernmost) { + a_ptr[0] += BATCH; + swap_buffers(&a_ptr[0], &a_ptr[1]); + dma_src = a_ptr[0]; + } else { + c_ptr[0] += BATCH; + swap_buffers(&c_ptr[0], &c_ptr[1]); + dma_src = c_ptr[0]; + } + + swap_buffers(&dst[0], &dst[1]); + dma_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst[0], + snrt_cluster_idx(), pb_cluster_south_neighbour()); + asm volatile ("" : "+r"(dma_src), "+r"(dma_dst) ::); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } + + global_hw_barrier(barrier_ptr, user); + } + } +} + +static inline void dma_reduction_tree_transfer_left_phase(char is_sender, char dist, + uintptr_t src, uintptr_t dst) { + + // Every sending cluster sends the data to a cluster on the left + if (is_sender && snrt_is_dm_core()) { + + // Calculate destination + uintptr_t dst_cluster = pb_calculate_cluster_idx( + pb_cluster_row_idx(), pb_cluster_col_idx() - dist); + uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst, + snrt_cluster_idx(), dst_cluster); + + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(remote_dst, src, BATCH); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } +} + +static inline void dma_reduction_tree_transfer_south_phase(char is_sender, char dist, + uintptr_t src, uintptr_t dst) { + + // Every sending cluster sends the data to a cluster on the left + if (is_sender 
&& snrt_is_dm_core()) { + + // Calculate destination + uintptr_t dst_cluster = pb_calculate_cluster_idx( + pb_cluster_row_idx() - dist, pb_cluster_col_idx()); + uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr((void *)dst, + snrt_cluster_idx(), dst_cluster); + + snrt_mcycle(); + + // Start DMA + snrt_dma_start_1d(remote_dst, src, BATCH); + + // Wait for DMA to complete + snrt_dma_wait_all(); + snrt_mcycle(); + } +} + +static inline void dma_reduction_tree_compute_phase(char is_receiver, + uintptr_t src, uintptr_t dst, uintptr_t result) { + + // Every receiving cluster reduces the data from the sender + if (is_receiver && snrt_is_compute_core()) { + snrt_mcycle(); + axpy_opt(N_ELEMS / N_BATCHES, one, (double *)src, (double *)dst, + (double *)result); + snrt_mcycle(); + } + snrt_cluster_hw_barrier(); +} + +// We need four buffers. Upon function invocation, the data must be in a. +static inline void dma_reduction_tree(uintptr_t a, uintptr_t b, uintptr_t c, + uintptr_t d, snrt_comm_t comm) { + + // Only clusters in the communicator continue from here + if (!comm->is_participant) return; + + // Prepare for inter-cluster barrier in advance, preventing instruction + // reordering using the volatile block. 
+ snrt_collective_op_t op; + op.f.collective_op = SNRT_REDUCTION_BARRIER; + op.f.mask = snrt_get_collective_mask(comm); + volatile uint32_t *barrier_ptr = comm->barrier_ptr; + uint32_t user = (uint32_t)op.w; + asm volatile ("" : "+r"(user) ::); + + // Initially a is the source buffer and c is the result buffer + uintptr_t src = a; + uintptr_t dst[2] = {b, d}; + uintptr_t result = c; + + // Iterations to cover all reductions in a row + uint32_t n_levels_in_row = pb_log2_cluster_num_in_row(); + for (uint32_t i = 0; i < n_levels_in_row; i++) { + + // Determine which clusters are senders and receivers at every level + // of the tree + char dist = 1 << i; // Distance between clusters in a pair + char is_sender = (pb_cluster_col_idx() % (2*dist)) == dist; + char is_receiver = (pb_cluster_col_idx() % (2*dist)) == 0; + + // Transfer phase for first batch + dma_reduction_tree_transfer_left_phase(is_sender, dist, src, dst[0]); + global_hw_barrier(barrier_ptr, user); + + // If more than one batch is present, there will be N_BATCHES - 1 + // iterations in which we perform both transfer and compute phases + uint32_t j = 0; + for (; j < N_BATCHES - 1; j++) { + // Alternate destination buffers for every batch + uintptr_t dma_dst = dst[(j+1)%2]; + uintptr_t comp_dst = dst[j%2]; + + // Calculate pointers for current batch + uintptr_t batch_dma_src = src + (j+1) * BATCH; + uintptr_t batch_comp_src = src + j * BATCH; + uintptr_t batch_result = result + j * BATCH; + + dma_reduction_tree_transfer_left_phase(is_sender, dist, batch_dma_src, + dma_dst); + dma_reduction_tree_compute_phase(is_receiver, batch_comp_src, + comp_dst, batch_result); + global_hw_barrier(barrier_ptr, user); + } + + // Compute phase for last batch + dma_reduction_tree_compute_phase(is_receiver, src + j * BATCH, + dst[j%2], result + j * BATCH); + + // Swap source and result buffers for next iteration + swap_buffers(&src, &result); + } + + // Iterations to cover all reductions in the first column + uint32_t 
n_levels_in_col = LOG2_N_ROWS; + for (uint32_t i = 0; i < n_levels_in_col; i++) { + + // Determine which clusters are senders at every level of the tree + char dist = 1 << i; // Distance between clusters in a pair + char is_sender = (pb_cluster_row_idx() % (2*dist)) == dist && pb_cluster_in_col(0); + char is_receiver = (pb_cluster_row_idx() % (2*dist)) == 0 && pb_cluster_in_col(0); + + // Transfer phase for first batch + dma_reduction_tree_transfer_south_phase(is_sender, dist, src, dst[0]); + global_hw_barrier(barrier_ptr, user); + + // If more than one batch is present, there will be N_BATCHES - 1 + // iterations in which we perform both transfer and compute phases + uint32_t j = 0; + for (; j < N_BATCHES - 1; j++) { + // Alternate destination buffers for every batch + uintptr_t dma_dst = dst[(j+1)%2]; + uintptr_t comp_dst = dst[j%2]; + + // Calculate pointers for current batch + uintptr_t batch_dma_src = src + (j+1) * BATCH; + uintptr_t batch_comp_src = src + j * BATCH; + uintptr_t batch_result = result + j * BATCH; + + dma_reduction_tree_transfer_south_phase(is_sender, dist, batch_dma_src, + dma_dst); + dma_reduction_tree_compute_phase(is_receiver, batch_comp_src, + comp_dst, batch_result); + global_hw_barrier(barrier_ptr, user); + } + + // Compute phase for last batch + dma_reduction_tree_compute_phase(is_receiver, src + j * BATCH, + dst[j%2], result + j * BATCH); + + // Swap source and result buffers for next iteration + swap_buffers(&src, &result); + } +} + +// L1 buffer of every cluster invoking this function should be +// at the same offset in the TCDM +static inline void dma_reduction(uintptr_t a0, uintptr_t a1, + uintptr_t b, + uintptr_t c0, uintptr_t c1, + uintptr_t d, + snrt_comm_t comm) { + if (IMPL == SEQ){ + if (SNRT_TCDM_HYPERBANK_NUM!=1 && SIZE != BATCH) { + dma_reduction_seq_hyperbank(a0, a1, b, c0, c1, d, comm); + } else { + dma_reduction_seq_normal(a0, b, c0, d, comm); + } + } + // else if (IMPL == TREE) + // dma_reduction_tree(a, b, c, d, 
comm); + else if (IMPL == HW_GENERIC) + dma_reduction_hw_generic(a0, c0, comm); + else if (IMPL == HW_SIMPLE) + dma_reduction_hw_simple(a0, c0, comm); +} + +// Global variables for verification script +double output[N_ELEMS]; +extern const uint32_t n_clusters = N_ROWS * pb_cluster_num_in_row(); +extern const uint32_t length = N_ELEMS; + +int main (void){ + // split a and c into two hyperbanks, even*BATCH in hyperbank 0 and odd*BATCH in hyperbank 1 + // b in hyperbank 0, d in hyperbank 1 + // Allocate 4KiB-aligned buffers in every cluster + uintptr_t a_buffer[2], c_buffer[2], b_buffer, d_buffer; + if (SNRT_TCDM_HYPERBANK_NUM!=1 && SIZE != BATCH && (IMPL == SEQ || IMPL == TREE)) { + // In SEQ implementation, when SIZE != BATCH we need to allocate + // twice the space for a and c to alternate between buffers + a_buffer[0] = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE / 2, 4096); + a_buffer[1] = a_buffer[0] + SNRT_TCDM_HYPERBANK_SIZE; + c_buffer[0] = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE / 2, 4096); + c_buffer[1] = c_buffer[0] + SNRT_TCDM_HYPERBANK_SIZE; + b_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096); + d_buffer = b_buffer + SNRT_TCDM_HYPERBANK_SIZE; + } + else { + a_buffer[0] = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE, 4096); + c_buffer[0] = (uintptr_t)snrt_l1_alloc_cluster_local(SIZE, 4096); + b_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096); + d_buffer = (uintptr_t)snrt_l1_alloc_cluster_local(BATCH, 4096); + } + + // Create communicator for first N rows (all other clusters are inactive) + snrt_comm_t comm; + pb_create_mesh_comm(&comm, N_ROWS, pb_cluster_num_in_row()); + + // Only clusters in the first N rows continue from here + if (!comm->is_participant) return 0; + + // Two iterations to to preheat the cache + for (volatile int i = 0; i < 2; i++){ + + // Initialize source buffer + if (snrt_is_dm_core()) { + for (uint32_t i = 0; i < N_ELEMS; i++) { + uint32_t row_major_cluster_idx = pb_cluster_col_idx() + + 
pb_cluster_row_idx() * pb_cluster_num_in_row(); + if (SNRT_TCDM_HYPERBANK_NUM!=1 && SIZE != BATCH && (IMPL == SEQ || IMPL == TREE)){ + uint32_t batch_id = i / (BATCH / sizeof(double)); + uint32_t buffer_idx = batch_id % 2; + uint32_t buffer_offset = (batch_id / 2) * (BATCH / sizeof(double)) + (i % (BATCH / sizeof(double))); + *((double *)(a_buffer[buffer_idx]) + buffer_offset) = row_major_cluster_idx + i; + + } + else { + ((double *)a_buffer[0])[i] = row_major_cluster_idx + i; + } + // ((double *)a_buffer)[i] = row_major_cluster_idx + i; + } + } + snrt_global_barrier(comm); + + // Perform reduction + snrt_mcycle(); + dma_reduction(a_buffer[0], a_buffer[1], b_buffer, c_buffer[0], c_buffer[1], d_buffer, comm); + snrt_mcycle(); + snrt_global_barrier(comm); + } + + // Writeback to L3 + uintptr_t result_buffer = c_buffer[0]; + uint32_t total_tree_levels = pb_log2_cluster_num_in_row() + LOG2_N_ROWS; + if ((IMPL == HW_SIMPLE && N_ROWS > 1) || + (IMPL == SEQ && N_ROWS > 1) || + (IMPL == TREE && (total_tree_levels % 2) == 0)) + result_buffer = a_buffer[0]; + if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { + if (SNRT_TCDM_HYPERBANK_NUM!=1 && SIZE != BATCH && (IMPL == SEQ || IMPL == TREE)){ + snrt_dma_1d_to_2d((void *)output, + (void *)result_buffer, + SIZE / 2, + BATCH, + BATCH * SNRT_TCDM_HYPERBANK_NUM); + snrt_dma_1d_to_2d((void *)((uintptr_t)output + BATCH), + (void *)(result_buffer + SNRT_TCDM_HYPERBANK_SIZE), + SIZE / 2, + BATCH, + BATCH * SNRT_TCDM_HYPERBANK_NUM); + } + else { + snrt_dma_start_1d((uintptr_t)output, result_buffer, SIZE); + } + snrt_dma_wait_all(); + } +} From 9d48f2bd17d88f4eae7d5bd115faa492449c51a6 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Tue, 11 Nov 2025 11:46:40 +0100 Subject: [PATCH 50/77] Final amendments for submitted paper --- experiments/dma_multicast_v2/plot.py | 30 ++++++++++++--------- experiments/reduction/model.py | 14 +++++++--- experiments/reduction/plot.py | 39 +++++++++++++++++++--------- 3 files changed, 55 
insertions(+), 28 deletions(-) diff --git a/experiments/dma_multicast_v2/plot.py b/experiments/dma_multicast_v2/plot.py index aba317fa..5687b385 100755 --- a/experiments/dma_multicast_v2/plot.py +++ b/experiments/dma_multicast_v2/plot.py @@ -180,15 +180,15 @@ def plot_runtime_vs_speedup(ax, n_rows, y_label=y_label): res = df[df['n_rows'] == n_rows] sizes = res['size'].unique() ax.scatter( - sizes, res[res['impl'] == 'seq']['cycles'], label='Actual (seq)', + sizes, res[res['impl'] == 'seq']['cycles'], label='Actual', marker=markers['actual'], color=colors['seq'] ) ax.scatter( - sizes, res[res['impl'] == 'tree']['cycles'], label='Actual (tree)', + sizes, res[res['impl'] == 'tree']['cycles'], label='Actual', marker=markers['actual'], color=colors['tree'] ) ax.scatter( - sizes, res[res['impl'] == 'hw']['cycles'], label='Actual (hw)', + sizes, res[res['impl'] == 'hw']['cycles'], label='Actual', marker=markers['actual'], color=colors['hw'] ) @@ -208,15 +208,15 @@ def plot_runtime_vs_speedup(ax, n_rows, y_label=y_label): # Plot model line for sequential runtime. Find monotone non-decreasing lower fit. 
x, y = monotone_seq_runtime_curve(sizes.min(), sizes.max(), n_rows) - ax.plot(x, y, label='Model (seq)', linestyle='--', color=colors['seq']) + ax.plot(x, y, label='Model (seq)', linestyle='--', color=colors['seq']) # Plot model line for tree runtime x, y = tree_runtime_curve(sizes.min(), sizes.max(), n_rows) - ax.plot(x, y, label='Model (tree)', linestyle='--', color=colors['tree']) + ax.plot(x, y, label='Model (tree)', linestyle='--', color=colors['tree']) # Plot model line for hardware runtime x, y = hw_runtime_curve(sizes.min(), sizes.max(), n_rows) - ax.plot(x, y, label='Model (hw)', linestyle='--', color=colors['hw']) + ax.plot(x, y, label='Model (hw)', linestyle='--', color=colors['hw']) if hide_x_axis: ax.set_xlabel('') @@ -236,7 +236,7 @@ def plot_runtime_vs_speedup(ax, n_rows, y_label=y_label): ax.set_ylabel(y_label) ax.set_axisbelow(True) ax.grid(True, linestyle='-', color='gainsboro') - ax.legend() + ax.legend(ncol=2, handlelength=1.2, columnspacing=0.5, handletextpad=0.3) plot_runtime_vs_speedup(ax, n_rows=1) # plot_runtime_vs_speedup(ax[0][0], n_rows=1) @@ -356,18 +356,18 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): res = df[df['n_rows'] == n_rows] best_sw_cycles = res[res['impl'].isin(['seq', 'tree'])].groupby('size')['cycles'].min() ax.scatter( - sizes, best_sw_cycles, label='Actual (sw)' if show_label else None, + sizes, best_sw_cycles, label='Actual' if show_label else None, marker='x', color=colors['sw'] ) ax.scatter( - sizes, res[res['impl'] == 'hw']['cycles'], label='Actual (hw)' if show_label else None, + sizes, res[res['impl'] == 'hw']['cycles'], label='Actual' if show_label else None, marker='x', color=colors['hw'] ) # Plot model line for best software implementation x, y = sw_runtime_curve(sizes.min(), sizes.max(), n_rows) ax.plot( - x, y, label='Model (sw)' if show_label else None, + x, y, label='Model (sw)' if show_label else None, linestyle='--', color=colors['sw']) # Annotate sw model lines with number of rows, in 
correspondence with actual runtime @@ -380,7 +380,13 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): # Plot model line for hardware runtime x, y = hw_runtime_curve(sizes.min(), sizes.max(), n_rows) ax.plot( - x, y, label='Model (hw)' if show_label else None, linestyle='--', color=colors['hw']) + x, y, label='Model (hw)' if show_label else None, linestyle='--', color=colors['hw']) + if n_rows == 1: + ax.annotate( + 'r=1,2,4', + xy=(x[-1], res[(res['impl'] == 'hw') & (res['size'] == sizes.max())]['cycles']), + xytext=(0, 5), textcoords='offset points', # nudge right + ha='center', va='center', clip_on=False) for i, n_rows in enumerate(df['n_rows'].unique()): plot_runtime_vs_speedup(ax, n_rows=n_rows, show_label=(i == 0)) @@ -395,7 +401,7 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): ax.set_ylabel(y_label) ax.set_axisbelow(True) ax.grid(True, linestyle='-', color='gainsboro') - ax.legend() + ax.legend(ncol=2, handlelength=1.2, columnspacing=0.5, handletextpad=0.3) plt.tight_layout() if show: diff --git a/experiments/reduction/model.py b/experiments/reduction/model.py index a9dec3a6..fda790d1 100755 --- a/experiments/reduction/model.py +++ b/experiments/reduction/model.py @@ -35,9 +35,11 @@ def nearest_divisors(divisor, dividend): def hw_runtime(c, r, n): hw_alpha, hw_beta = fit.fit_hw() + t_hw = hw_alpha + n * hw_beta if r > 1: - hw_beta = 2 - return hw_alpha + n * hw_beta + return 2 * t_hw + else: + return t_hw def seq_runtime(c, r, n, k, delta=DELTA): @@ -50,14 +52,18 @@ def seq_runtime(c, r, n, k, delta=DELTA): # print(t_dma * 5, t_comp * 5, t_max * 5) n_iters = 1 + 2 * (c - 2) + k if r > 1: - n_iters += 2 * (r - 2) + k + n_iters += 1 + 2 * (r - 2) + k # First approximation model assumes compute and dma take roughly the same time + delta = 4 * r + 28 return n_iters * (t_max + delta) - delta def optimal_seq_k(c, r, n, delta=DELTA): comp_alpha, _ = fit.fit_seq_compute() - real_k = sqrt(n * (2 * c - 3) / (delta + comp_alpha)) + if r > 1: 
+ real_k = sqrt(n * (2 * (c + r) - 7) / (2 * (delta + comp_alpha))) + else: + real_k = sqrt(n * (2 * c - 3) / (delta + comp_alpha)) lower_k, upper_k = nearest_divisors(real_k, n) assert (lower_k is None) or (lower_k > 0) assert (upper_k is None) or (upper_k <= n) diff --git a/experiments/reduction/plot.py b/experiments/reduction/plot.py index 43861dac..7ce125ce 100755 --- a/experiments/reduction/plot.py +++ b/experiments/reduction/plot.py @@ -56,6 +56,9 @@ def get_expected_batch_size(row): if row['impl'] == 'tree': optimal_k = model.optimal_tree_k(4, row['n_rows'], row['size'] // BEAT_BYTES) return int(row['size'] // optimal_k) + elif row['impl'] == 'seq': + optimal_k = model.optimal_seq_k(4, row['n_rows'], row['size'] // BEAT_BYTES) + return int(row['size'] // optimal_k) elif row['impl'] == 'hw': return None @@ -133,15 +136,15 @@ def plot1(y_label=None, hide_x_axis=False, show=True): # Plot actual runtimes sizes = df['size'].unique() ax.scatter( - sizes, df[df['impl'] == 'seq']['cycles'], label='Actual (seq)', + sizes, df[df['impl'] == 'seq']['cycles'], label='Actual', marker=markers['actual'], color=colors['seq'] ) ax.scatter( - sizes, df[df['impl'] == 'tree']['cycles'], label='Actual (tree)', + sizes, df[df['impl'] == 'tree']['cycles'], label='Actual', marker=markers['actual'], color=colors['tree'] ) ax.scatter( - sizes, df[df['impl'] == 'hw']['cycles'], label='Actual (hw)', + sizes, df[df['impl'] == 'hw']['cycles'], label='Actual', marker=markers['actual'], color=colors['hw'] ) @@ -157,15 +160,15 @@ def plot1(y_label=None, hide_x_axis=False, show=True): # Plot model line for seq runtime x, y = monotone_seq_runtime_curve(sizes.min(), sizes.max(), c, r) - ax.plot(x, y, label='Model (seq)', linestyle='--', color=colors['seq']) + ax.plot(x, y, label='Model (seq)', linestyle='--', color=colors['seq']) # Plot model line for tree runtime x, y = monotone_tree_runtime_curve(sizes.min(), sizes.max(), c, r) - ax.plot(x, y, label='Model (tree)', linestyle='--', 
color=colors['tree']) + ax.plot(x, y, label='Model (tree)', linestyle='--', color=colors['tree']) # Plot model line for hw simple runtime x, y = hw_runtime_curve(sizes.min(), sizes.max(), c, r) - ax.plot(x, y, label='Model (hw)', linestyle='--', color=colors['hw']) + ax.plot(x, y, label='Model (hw)', linestyle='--', color=colors['hw']) # Plot formatting if hide_x_axis: @@ -186,7 +189,7 @@ def plot1(y_label=None, hide_x_axis=False, show=True): ax.set_xlim(0, sizes.max() * 1.1) ax.set_axisbelow(True) ax.grid(True, color='gainsboro') - ax.legend() + ax.legend(ncol=2, handlelength=1.2, columnspacing=0.5, handletextpad=0.3) plt.tight_layout() if show: plt.show() @@ -220,20 +223,22 @@ def plot2(y_label=None, show=True): # Create function to plot runtimes for a given number of rows def plot_runtime_vs_speedup(ax, n_rows, show_label=False): res = df[df['n_rows'] == n_rows] + print(res) best_sw_cycles = res[res['impl'].isin(['seq', 'tree'])].groupby('size')['cycles'].min() + print(best_sw_cycles) ax.scatter( - sizes, best_sw_cycles, label='Actual (sw)' if show_label else None, + sizes, best_sw_cycles, label='Actual' if show_label else None, marker='x', color=colors['sw'] ) ax.scatter( - sizes, res[res['impl'] == 'hw']['cycles'], label='Actual (hw)' if show_label else None, + sizes, res[res['impl'] == 'hw']['cycles'], label='Actual' if show_label else None, marker='x', color=colors['hw'] ) # Plot model line for best software implementation x, y = sw_runtime_curve(sizes.min(), sizes.max(), c, n_rows) ax.plot( - x, y, label='Model (sw)' if show_label else None, + x, y, label='Model (sw)' if show_label else None, linestyle='--', color=colors['sw']) # Annotate sw model lines with number of rows, in correspondence with actual runtime @@ -246,7 +251,17 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): # Plot model line for hardware simple runtime x, y = hw_runtime_curve(sizes.min(), sizes.max(), c, n_rows) ax.plot( - x, y, label='Model (hw)' if show_label else None, 
linestyle='--', color=colors['hw']) + x, y, label='Model (hw)' if show_label else None, linestyle='--', color=colors['hw']) + if n_rows == 1: + label = f'r={n_rows}' + else: + label = 'r=2,4' + if n_rows != 4: + ax.annotate( + label, + xy=(x[-1], res[(res['impl'] == 'hw') & (res['size'] == sizes.max())]['cycles']), + xytext=(sizes.max() * 0.0001, 0), textcoords='offset points', # nudge right + ha='left', va='center', clip_on=False) for i, n_rows in enumerate(df['n_rows'].unique()): plot_runtime_vs_speedup(ax, n_rows=n_rows, show_label=(i == 0)) @@ -259,7 +274,7 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): ax.set_ylabel(y_label) ax.set_axisbelow(True) ax.grid(True, color='gainsboro') - ax.legend() + ax.legend(ncol=2, handlelength=1.2, columnspacing=0.5, handletextpad=0.3) plt.tight_layout() if show: From 550c9ae71399b9cf03ec921dad5de35e53ca705c Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Tue, 11 Nov 2025 11:50:47 +0100 Subject: [PATCH 51/77] Fix bug in tree multicast model --- experiments/dma_multicast_v2/model.py | 15 ++++++++++++--- experiments/summa_gemm/model.py | 4 ++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/experiments/dma_multicast_v2/model.py b/experiments/dma_multicast_v2/model.py index 852b526d..b67ddbaf 100755 --- a/experiments/dma_multicast_v2/model.py +++ b/experiments/dma_multicast_v2/model.py @@ -5,7 +5,7 @@ # # Luca Colagrande -from math import isqrt, sqrt, log2 +from math import isqrt, sqrt, log2, ceil # N: num clusters # L: num beats in transfer @@ -14,7 +14,6 @@ BEAT_BYTES = 64 SEQ_ALPHA = 30 -# TODO(colluca): in reality delta depends on n_rows, and the arrival times of all clusters DELTA = 36 HW_ALPHA = 30 TREE_M2C_ALPHA = 30 @@ -86,7 +85,17 @@ def hw_runtime(N1, N2, L): def tree_runtime(N1, N2, L, delta=DELTA): m2c_transfer_cycles = TREE_M2C_ALPHA + L - c2c_transfer_cycles = (TREE_C2C_ALPHA + L) * log2(N1 * N2) + + # C2C alpha depends on the distance between the clusters. 
+ # On our system we have two cycles latency per hop. + c2c_transfer_cycles = 0 + for i in range(ceil(log2(N1))): + dist = N1 / (2 ** (i + 1)) + c2c_transfer_cycles += TREE_C2C_ALPHA + 2 * dist + L + for i in range(ceil(log2(N2))): + dist = N2 / (2 ** (i + 1)) + c2c_transfer_cycles += TREE_C2C_ALPHA + 2 * dist + L + delta_cycles = delta * (log2(N1 * N2) - 1) return m2c_transfer_cycles + c2c_transfer_cycles + delta_cycles diff --git a/experiments/summa_gemm/model.py b/experiments/summa_gemm/model.py index 1c039853..8d08bfc4 100755 --- a/experiments/summa_gemm/model.py +++ b/experiments/summa_gemm/model.py @@ -28,6 +28,10 @@ def t_mcast(dim, bytes, impl='sw'): n = beats(bytes) if impl == 'sw': return multicast.model.optimal_sw_runtime(dim, 1, n) + elif impl == 'seq': + return multicast.model.optimal_seq_runtime(dim, 1, n) + elif impl == 'tree': + return multicast.model.tree_runtime(dim, 1, n) elif impl == 'hw': return multicast.model.hw_runtime(dim, 1, n) else: From 1c24239f9cd42c4c17f14ace347cf2e05b43cd68 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 12 Dec 2025 09:58:33 +0100 Subject: [PATCH 52/77] Clean reduction benchmark --- sw/snitch/tests/reduction_benchmark.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/sw/snitch/tests/reduction_benchmark.c b/sw/snitch/tests/reduction_benchmark.c index 164876d3..35f0d45c 100644 --- a/sw/snitch/tests/reduction_benchmark.c +++ b/sw/snitch/tests/reduction_benchmark.c @@ -21,8 +21,7 @@ typedef enum { SEQ, TREE, - HW_GENERIC, - HW_SIMPLE, + HW } impl_t; #ifndef IMPL @@ -62,18 +61,7 @@ static inline void swap_buffers(uintptr_t *a, uintptr_t *b) { *b = temp; } -static inline void dma_reduction_hw_generic(uintptr_t src, uintptr_t dst, - snrt_comm_t comm) { - if (snrt_is_dm_core() && comm->is_participant) { - uintptr_t remote_dst = (uintptr_t)snrt_remote_l1_ptr( - (void *)dst, snrt_cluster_idx(), 0); - snrt_collective_opcode_t op = SNRT_REDUCTION_FADD; - 
snrt_dma_start_1d_reduction(remote_dst, src, SIZE, comm, op); - snrt_dma_wait_all(); - } -} - -static inline void dma_reduction_hw_simple(uintptr_t src, uintptr_t dst, +static inline void dma_reduction_hw(uintptr_t src, uintptr_t dst, snrt_comm_t comm) { // Create a communicator per row @@ -463,10 +451,8 @@ static inline void dma_reduction(uintptr_t a, uintptr_t b, uintptr_t c, dma_reduction_seq(a, b, c, d, comm); else if (IMPL == TREE) dma_reduction_tree(a, b, c, d, comm); - else if (IMPL == HW_GENERIC) - dma_reduction_hw_generic(a, c, comm); - else if (IMPL == HW_SIMPLE) - dma_reduction_hw_simple(a, c, comm); + else if (IMPL == HW) + dma_reduction_hw(a, c, comm); } // Global variables for verification script @@ -512,7 +498,7 @@ int main (void){ // Writeback to L3 uintptr_t result_buffer = c_buffer; uint32_t total_tree_levels = pb_log2_cluster_num_in_row() + LOG2_N_ROWS; - if ((IMPL == HW_SIMPLE && N_ROWS > 1) || + if ((IMPL == HW && N_ROWS > 1) || (IMPL == SEQ && N_ROWS > 1) || (IMPL == TREE && (total_tree_levels % 2) == 0)) result_buffer = a_buffer; From 22156135bf49d024055cedbcc5f8760fbe072b46 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 17 Dec 2025 11:05:51 +0100 Subject: [PATCH 53/77] Bump dependencies --- Bender.lock | 8 ++++---- Bender.yml | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Bender.lock b/Bender.lock index bc81674e..710b2346 100644 --- a/Bender.lock +++ b/Bender.lock @@ -25,7 +25,7 @@ packages: - obi_peripherals - register_interface axi: - revision: 8e04779f341eb2c89412aae92223a292beef487e + revision: 06410c36819924e32db2afa428d244dbdbcd5d4e version: null source: Git: https://github.com/colluca/axi.git @@ -146,7 +146,7 @@ packages: dependencies: - common_cells common_cells: - revision: bab4cea30f1cdfe303a4585db04c1a1f6882b718 + revision: ca9d577f2fbc45ec557ab4e0905bbfc154441540 version: null source: Git: https://github.com/pulp-platform/common_cells.git @@ -391,10 +391,10 @@ packages: - common_cells - 
register_interface snitch_cluster: - revision: null + revision: dda2bcb707f0ed237f61a7f6d39608877abe7bdb version: null source: - Path: working_dir/snitch_cluster + Git: https://github.com/pulp-platform/snitch_cluster.git dependencies: - apb - axi diff --git a/Bender.yml b/Bender.yml index 44cc8583..63f00f47 100644 --- a/Bender.yml +++ b/Bender.yml @@ -9,11 +9,11 @@ package: dependencies: register_interface: { git: "https://github.com/pulp-platform/register_interface.git", version: "0.4.5" } - axi: { git: "https://github.com/pulp-platform/axi.git", version: "0.39.6" } + axi: { git: "https://github.com/colluca/axi.git", rev: "multicast" } common_cells: { git: "https://github.com/pulp-platform/common_cells.git", rev: "snitch" } cheshire: { git: "https://github.com/pulp-platform/cheshire.git", rev: "picobello" } - snitch_cluster: { git: "https://github.com/pulp-platform/snitch_cluster.git", rev: "5d6c957c70e3824e2330d8308eb904724cd41cbe" } - floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: "feature/reduction" } + snitch_cluster: { git: "https://github.com/pulp-platform/snitch_cluster.git", rev: "narrow_reduction" } + floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: "feature/reduction" } obi: { git: "https://github.com/pulp-platform/obi.git", rev: "acfcd0f80c7539aa8da7821a66d9acf2074a5b4e" } redmule: { git: "https://github.com/pulp-platform/redmule.git", rev: "picobello" } hci: { git: "https://github.com/pulp-platform/hci.git", rev: "06fcba671e060f2e1b03b7ebe2d3e719f1557099" } From 9b3718cb4d6b5e6a7cadd98dcfcad3fee3fbfeed Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 17 Dec 2025 11:36:56 +0100 Subject: [PATCH 54/77] Update software to new Snitch collective API --- sw/snitch/apps/summa_gemm/src/summa_gemm.c | 4 +-- sw/snitch/tests/barrier_benchmark.c | 8 +++--- sw/snitch/tests/dma_multicast_v2.c | 9 +++---- sw/snitch/tests/reduction_benchmark.c | 15 +++-------- .../tests/reduction_benchmark_hyperbank.c | 27 
+++++-------------- 5 files changed, 20 insertions(+), 43 deletions(-) diff --git a/sw/snitch/apps/summa_gemm/src/summa_gemm.c b/sw/snitch/apps/summa_gemm/src/summa_gemm.c index 13840cc1..bdecff61 100644 --- a/sw/snitch/apps/summa_gemm/src/summa_gemm.c +++ b/sw/snitch/apps/summa_gemm/src/summa_gemm.c @@ -34,8 +34,8 @@ static inline void snrt_dma_load_2d_tile_mcast_sw( // Prepare for inter-cluster barrier in advance, preventing instruction // reordering using the volatile block. - snrt_collective_op_t op; - op.f.collective_op = SNRT_REDUCTION_BARRIER; + snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; op.f.mask = snrt_get_collective_mask(comm); volatile uint32_t *barrier_ptr = comm->barrier_ptr; uint32_t user = (uint32_t)op.w; diff --git a/sw/snitch/tests/barrier_benchmark.c b/sw/snitch/tests/barrier_benchmark.c index e5c6e579..c14e57f3 100644 --- a/sw/snitch/tests/barrier_benchmark.c +++ b/sw/snitch/tests/barrier_benchmark.c @@ -26,8 +26,8 @@ typedef enum { static inline void sw_barrier(snrt_comm_t comm) { // Prepare for inter-cluster barrier in advance, preventing instruction // reordering using the volatile block. - snrt_collective_op_t op; - op.f.collective_op = SNRT_COLLECTIVE_MULTICAST; + snrt_collective_t op; + op.f.opcode = SNRT_COLLECTIVE_MULTICAST; op.f.mask = snrt_get_collective_mask(comm); volatile uint32_t *barrier_ptr = comm->barrier_ptr; volatile uint32_t *mcip_set = (uint32_t *)&(snrt_cluster()->peripheral_reg.cl_clint_set.w); @@ -69,8 +69,8 @@ static inline void sw_barrier(snrt_comm_t comm) { static inline void hw_barrier(snrt_comm_t comm) { // Prepare for inter-cluster barrier in advance, preventing instruction // reordering using the volatile block. 
- snrt_collective_op_t op; - op.f.collective_op = SNRT_REDUCTION_BARRIER; + snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; op.f.mask = snrt_get_collective_mask(comm); volatile uint32_t *barrier_ptr = comm->barrier_ptr; uint32_t user = (uint32_t)op.w; diff --git a/sw/snitch/tests/dma_multicast_v2.c b/sw/snitch/tests/dma_multicast_v2.c index 4d11275b..7084f63c 100644 --- a/sw/snitch/tests/dma_multicast_v2.c +++ b/sw/snitch/tests/dma_multicast_v2.c @@ -62,8 +62,8 @@ static inline void dma_multicast_sequential(uintptr_t l1_buffer, // Prepare for inter-cluster barrier in advance, preventing instruction // reordering using the volatile block. - snrt_collective_op_t op; - op.f.collective_op = SNRT_REDUCTION_BARRIER; + snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; op.f.mask = snrt_get_collective_mask(comm); volatile uint32_t *barrier_ptr = comm->barrier_ptr; uint32_t user = (uint32_t)op.w; @@ -150,8 +150,8 @@ static inline void dma_multicast_tree(uintptr_t l1_buffer, // Prepare for inter-cluster barrier in advance, preventing instruction // reordering using the volatile block. 
- snrt_collective_op_t op; - op.f.collective_op = SNRT_REDUCTION_BARRIER; + snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; op.f.mask = snrt_get_collective_mask(comm); volatile uint32_t *barrier_ptr = comm->barrier_ptr; uint32_t user = (uint32_t)op.w; @@ -311,7 +311,6 @@ int main() { uint32_t submask = 0; do { uint32_t i = fixed | submask; - DUMP(i); submask = (submask - 1) & mask; } while (submask != 0); } diff --git a/sw/snitch/tests/reduction_benchmark.c b/sw/snitch/tests/reduction_benchmark.c index 35f0d45c..45d75a1c 100644 --- a/sw/snitch/tests/reduction_benchmark.c +++ b/sw/snitch/tests/reduction_benchmark.c @@ -118,13 +118,6 @@ static inline void dma_reduction_seq(uintptr_t a, uintptr_t b, uintptr_t c, // Group to simplify rotating buffers uintptr_t dst[2] = {b, d}; - if (snrt_cluster_idx() == 0 && snrt_cluster_core_idx() == 0) { - DUMP(a); - DUMP(b); - DUMP(c); - DUMP(d); - } - // Compute addresses of source and destination data for DMA cores. // All clusters in col > 0 participate in the row reduction. For this // reduction the source is in `a` and the result in `c`. @@ -158,8 +151,8 @@ static inline void dma_reduction_seq(uintptr_t a, uintptr_t b, uintptr_t c, // Prepare for inter-cluster barrier in advance, preventing instruction // reordering using the volatile block. - snrt_collective_op_t op; - op.f.collective_op = SNRT_REDUCTION_BARRIER; + snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; op.f.mask = snrt_get_collective_mask(comm); volatile uint32_t *barrier_ptr = comm->barrier_ptr; uint32_t user = (uint32_t)op.w; @@ -347,8 +340,8 @@ static inline void dma_reduction_tree(uintptr_t a, uintptr_t b, uintptr_t c, // Prepare for inter-cluster barrier in advance, preventing instruction // reordering using the volatile block. 
- snrt_collective_op_t op; - op.f.collective_op = SNRT_REDUCTION_BARRIER; + snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; op.f.mask = snrt_get_collective_mask(comm); volatile uint32_t *barrier_ptr = comm->barrier_ptr; uint32_t user = (uint32_t)op.w; diff --git a/sw/snitch/tests/reduction_benchmark_hyperbank.c b/sw/snitch/tests/reduction_benchmark_hyperbank.c index f36cb334..747532b2 100644 --- a/sw/snitch/tests/reduction_benchmark_hyperbank.c +++ b/sw/snitch/tests/reduction_benchmark_hyperbank.c @@ -129,13 +129,6 @@ static inline void dma_reduction_seq_normal(uintptr_t a, uintptr_t b, uintptr_t // Group to simplify rotating buffers uintptr_t dst[2] = {b, d}; - if (snrt_cluster_idx() == 0 && snrt_cluster_core_idx() == 0) { - DUMP(a); - DUMP(b); - DUMP(c); - DUMP(d); - } - // Compute addresses of source and destination data for DMA cores. // All clusters in col > 0 participate in the row reduction. For this // reduction the source is in `a` and the result in `c`. @@ -169,8 +162,8 @@ static inline void dma_reduction_seq_normal(uintptr_t a, uintptr_t b, uintptr_t // Prepare for inter-cluster barrier in advance, preventing instruction // reordering using the volatile block. - snrt_collective_op_t op; - op.f.collective_op = SNRT_REDUCTION_BARRIER; + snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; op.f.mask = snrt_get_collective_mask(comm); volatile uint32_t *barrier_ptr = comm->barrier_ptr; uint32_t user = (uint32_t)op.w; @@ -308,13 +301,6 @@ static inline void dma_reduction_seq_hyperbank(uintptr_t a0, uintptr_t a1, uintptr_t a_ptr[2] = {a0, a1}; uintptr_t c_ptr[2] = {c0, c1}; - if (snrt_cluster_idx() == 0 && snrt_cluster_core_idx() == 0) { - DUMP(a0); DUMP(a1); - DUMP(b); - DUMP(c0); DUMP(c1); - DUMP(d); - } - // Compute addresses of source and destination data for DMA cores. // All clusters in col > 0 participate in the row reduction. For this // reduction the source is in `a` and the result in `c`. 
@@ -348,8 +334,8 @@ static inline void dma_reduction_seq_hyperbank(uintptr_t a0, uintptr_t a1, // Prepare for inter-cluster barrier in advance, preventing instruction // reordering using the volatile block. - snrt_collective_op_t op; - op.f.collective_op = SNRT_REDUCTION_BARRIER; + snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; op.f.mask = snrt_get_collective_mask(comm); volatile uint32_t *barrier_ptr = comm->barrier_ptr; uint32_t user = (uint32_t)op.w; @@ -399,7 +385,6 @@ static inline void dma_reduction_seq_hyperbank(uintptr_t a0, uintptr_t a1, // Start DMA snrt_mcycle(); snrt_dma_start_1d(dma_dst, dma_src, BATCH); - // DUMP(dma_src);DUMP(*((double *)dma_src)); // Update pointers for next iteration while transfer completes, // preventing instructions from being reordered after the DMA wait @@ -581,8 +566,8 @@ static inline void dma_reduction_tree(uintptr_t a, uintptr_t b, uintptr_t c, // Prepare for inter-cluster barrier in advance, preventing instruction // reordering using the volatile block. 
- snrt_collective_op_t op; - op.f.collective_op = SNRT_REDUCTION_BARRIER; + snrt_collective_t op; + op.f.opcode = SNRT_REDUCTION_BARRIER; op.f.mask = snrt_get_collective_mask(comm); volatile uint32_t *barrier_ptr = comm->barrier_ptr; uint32_t user = (uint32_t)op.w; From 2ed76dadf7d3e7a45091f085f66b5a318cd28279 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 17 Dec 2025 11:37:19 +0100 Subject: [PATCH 55/77] Bump Snitch LLVM toolchain --- iis-env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iis-env.sh b/iis-env.sh index 6a47bc37..f0aa91b5 100644 --- a/iis-env.sh +++ b/iis-env.sh @@ -9,7 +9,7 @@ export VLIB="questa-2023.4 vlib" export BASE_PYTHON=/usr/local/anaconda3/bin/python3.11 export CHS_SW_GCC_BINROOT=/usr/pack/riscv-1.0-kgf/riscv64-gcc-12.2.0/bin export VERIBLE_FMT="oseda -2025.03 verible-verilog-format" -export SN_LLVM_BINROOT=/usr/scratch2/vulcano/colluca/tools/riscv32-snitch-llvm-almalinux8-15.0.0-snitch-0.3.0/bin +export SN_LLVM_BINROOT=/usr/scratch2/vulcano/colluca/tools/riscv32-snitch-llvm-almalinux8-15.0.0-snitch-0.5.0/bin # Create the python venv if [ ! 
-d ".venv" ]; then From e5c73be7418e0567f26be71d5917b12248e0606e Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 17 Dec 2025 17:46:54 +0100 Subject: [PATCH 56/77] Remove legacy `dma_multicast` experiments --- experiments/dma_multicast/.gitignore | 4 - experiments/dma_multicast/README.md | 14 -- experiments/dma_multicast/experiments.py | 67 ----- experiments/dma_multicast/quick_plot.py | 288 --------------------- experiments/dma_multicast_v2/README.md | 2 +- sw/snitch/tests/dma_multicast.c | 304 ----------------------- 6 files changed, 1 insertion(+), 678 deletions(-) delete mode 100644 experiments/dma_multicast/.gitignore delete mode 100644 experiments/dma_multicast/README.md delete mode 100755 experiments/dma_multicast/experiments.py delete mode 100755 experiments/dma_multicast/quick_plot.py delete mode 100644 sw/snitch/tests/dma_multicast.c diff --git a/experiments/dma_multicast/.gitignore b/experiments/dma_multicast/.gitignore deleted file mode 100644 index 45b44053..00000000 --- a/experiments/dma_multicast/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -build/ -runs/ -plots/ -results.csv \ No newline at end of file diff --git a/experiments/dma_multicast/README.md b/experiments/dma_multicast/README.md deleted file mode 100644 index edcb834d..00000000 --- a/experiments/dma_multicast/README.md +++ /dev/null @@ -1,14 +0,0 @@ -From the present directory: -```shell -./experiments.py --actions sw run -j -``` - -Run `dma_multicast_v2.elf`: -```shell -questa-2023.4 vsim -work /usr/scratch2/vulcano/colluca/workspace/REPOS/PICOBELLO/dca-snitch/target/sim/vsim/work -suppress 3009 -suppress 8386 -suppress 13314 -quiet -64 +CHS_BINARY=sw/cheshire/tests/simple_offload.spm.elf +SN_BINARY=sw/snitch/tests/build/dma_multicast_v2.elf +PRELMODE=3 -voptargs=+acc tb_picobello_top -do "log -r /*; do wave.do; run -a;" -``` - -Verify `dma_multicast_v2.elf`: -```shell -./experiments/mcast/verify.py placeholder $PWD/sw/snitch/tests/build/dma_multicast_v2.elf --no-ipc --memdump 
$PWD/l2mem.bin --memaddr 0x70000000 -``` \ No newline at end of file diff --git a/experiments/dma_multicast/experiments.py b/experiments/dma_multicast/experiments.py deleted file mode 100755 index 02135c39..00000000 --- a/experiments/dma_multicast/experiments.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2025 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Luca Colagrande - -import picobello as pb -import snitch.util.experiments.experiment_utils as eu -from snitch.util.experiments.SimResults import SimRegion - - -class ExperimentManager(pb.ExperimentManager): - - def derive_axes(self, experiment): - return eu.derive_axes_from_keys(experiment, ['length', 'mode', 'n_clusters']) - - def derive_cdefines(self, experiment): - cdefs = { - 'N_CLUSTERS_TO_USE': experiment['n_clusters'], - 'MODE': experiment['mode'], - 'LENGTH': experiment['length'], - } - return cdefs - - -def gen_experiments(): - experiments = [] - # for length in [256]:#, 512, 1024, 2048, 4096, 8192]: - # for mode in ['SW_UNOPT']:#, 'SW_OPT', 'SW_OPT2', 'HW_MCAST']: - # for n_clusters in [4]:#, 8, 16]: - for length in [256, 512, 1024, 2048, 4096, 8192]: - for mode in ['SW_UNOPT', 'SW_OPT', 'SW_OPT2', 'HW_MCAST']: - for n_clusters in [4, 8, 16]: - experiments.append({ - 'app': 'dma_multicast', - 'cmd': pb.sim_cmd, - 'length': length, - 'mode': mode, - 'n_clusters': n_clusters, - }) - return experiments - - -def main(): - - manager = ExperimentManager(gen_experiments()) - manager.run() - - df = manager.get_results() - print(df) - - if not manager.perf_results_available: - return - - # Get runtime of region 3 (second invocation of `dma_broadcast_to_clusters`) - def get_cycles(row): - return row['results'].get_metric(SimRegion('hart_9', 3), 'cycles') - - df['cycles'] = df.apply(get_cycles, axis=1) - df.drop(labels=['results'], inplace=True, axis=1) - df.to_csv('results.csv', 
index=False) - print(df) - - -if __name__ == '__main__': - main() diff --git a/experiments/dma_multicast/quick_plot.py b/experiments/dma_multicast/quick_plot.py deleted file mode 100755 index 073df29f..00000000 --- a/experiments/dma_multicast/quick_plot.py +++ /dev/null @@ -1,288 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2024 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Lorenzo Leone - -import os -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -from matplotlib import cm -from matplotlib.colors import ListedColormap - -PULP_COLORS = { - "PULP Blue": "#3067AB", - "PULP Petrol": "#84B5E9", - "PULP Green": "#168638", - "PULP Bronze": "#E59955", - "PULP Red": "#9B3B32", - "PULP Purple": "#851B66", - "PULP Grey": "#6F6F6F" -} - - -def plot_multicast_speedup(clusters, transfer_sizes, speedups, - colors=None, - title="Multicast Speedup", - xlabel="Number of Clusters", - ylabel="Speedup", - legend_loc="upper left", - legend_ncol=1, - legend_frame=True, - save_name=None): - """ - Plots a bar chart of multicast speedup. 
- - Parameters: - - clusters: list of cluster counts (x-axis categories) - - transfer_sizes: list of transfer sizes (legend categories) - - speedups: 2D list of speedups where each row corresponds to clusters[i] - and each column corresponds to transfer_sizes[j] - - colors: list of colors for each transfer size - - title, xlabel, ylabel: strings for plot customization - - legend_loc: legend location (e.g., 'upper left', 'upper right') - - legend_ncol: number of columns in legend - - legend_frame: whether to show frame around legend - """ - - # Number of clusters and transfer sizes - n_clusters = len(clusters) - n_sizes = len(transfer_sizes) - - # Default colors if none are provided - if colors is None: - colors = plt.cm.viridis(np.linspace(0, 1, n_sizes)) - - # X positions for cluster groups - x = np.arange(n_clusters) - - # Width of each bar - bar_width = 0.8 / n_sizes - - fig, ax = plt.subplots(figsize=(10, 6)) - - # Plot each transfer size group - for i, size in enumerate(transfer_sizes): - offsets = x + i * bar_width - (bar_width * (n_sizes - 1) / 2) - ax.bar(offsets, - [speedups[j][i] for j in range(n_clusters)], - width=bar_width, - color=colors[i], - label=f"{size}") - - # Labels and title - ax.set_xlabel(xlabel, fontsize=14) - ax.set_ylabel(ylabel, fontsize=14) - ax.set_title(title, fontsize=16) - ax.set_xticks(x) - ax.set_xticklabels(clusters) - ax.tick_params(axis='both', which='major', labelsize=14) - - # Legend customization - ax.legend(title="Transfer Size", loc=legend_loc, ncol=legend_ncol, frameon=legend_frame, fontsize=14, title_fontsize=16) - - # Grid for readability - ax.grid(axis="y", linestyle="--", alpha=0.7) - - fig.tight_layout() - - # Save plot if a name is provided - if save_name: - plots_dir = os.path.join(os.getcwd(), "plots") - os.makedirs(plots_dir, exist_ok=True) - file_path = os.path.join(plots_dir, f"{save_name}.svg") - plt.savefig(file_path, format="svg", bbox_inches="tight") - print(f"Plot saved to: {file_path}") - - return fig, ax 
- - -def _add_gradient_legend_inset( - ax_main, colors1, colors2, transfer_sizes, sw1_label, sw2_label, - inset_width=0.3, bar_height=0.04, gap=0.01, x0=0.02, y_top=0.92 -): - """Place two gradient bars in the top-left; bottom=SW1 (with labels), top=SW2 (no labels).""" - - # Top bar = SW2 (no labels) - ax_sw2 = ax_main.inset_axes([x0, y_top - bar_height, inset_width, bar_height], - transform=ax_main.transAxes) - # Bottom bar = SW1 (with labels) - ax_sw1 = ax_main.inset_axes([x0, y_top - 2*bar_height - gap, inset_width, bar_height], - transform=ax_main.transAxes) - - def draw_bar(ax_bar, colors, title, show_labels=False, lgn_title=False): - data = np.arange(len(colors)).reshape(1, len(colors)) - cmap = ListedColormap(colors) - ax_bar.imshow(data, aspect='auto', cmap=cmap, extent=[0, len(colors), 0, 1]) - - if show_labels: - centers = np.arange(len(colors)) + 0.5 - ax_bar.set_xticks(centers) - ax_bar.set_xticklabels(transfer_sizes, fontsize=8, rotation=45, ha="right") - else: - ax_bar.set_xticks([]) - - ax_bar.set_yticks([]) - ax_bar.set_xlim(0, len(colors)) - - for spine in ax_bar.spines.values(): - spine.set_visible(False) - - # Title to the right - ax_bar.text(len(colors) + 0.2, 0.5, title, - fontsize=9, va='center', ha='left', transform=ax_bar.transData) - - # DMA Transfer Size label centered over the 6 colors - if lgn_title: - ax_bar.text(3, 1.4, "DMA Transfer Size", # x=3 is center of 0..6 range - fontsize=9, ha='center', va='bottom', fontweight="bold", - transform=ax_bar.transData) - - # Draw bars - draw_bar(ax_sw2, colors2, sw2_label, show_labels=False, lgn_title=True) - draw_bar(ax_sw1, colors1, sw1_label, show_labels=True, lgn_title=False) - - -def plot_two_software_speedup( - clusters, - transfer_sizes, - sw1_speedups, - sw2_speedups, - sw1_label="Software 1", - sw2_label="Software 2", - cmap1="viridis", - cmap2="magma", - title="Speedup by Transfer Size and Software", - xlabel="Number of Clusters", - ylabel="Speedup", - save_name=None -): - n_clusters = 
len(clusters) - x = np.arange(n_clusters) - bars_per_group = 12 - bar_width = 0.8 / bars_per_group - - colors1 = cm.get_cmap(cmap1)(np.linspace(0.15, 0.85, 6)) - colors2 = cm.get_cmap(cmap2)(np.linspace(0.15, 0.85, 6)) - - fig, ax = plt.subplots(figsize=(12, 6)) - - for i in range(6): - offsets_sw1 = x + i * bar_width - (bar_width * (bars_per_group - 1) / 2) - offsets_sw2 = x + (6 + i) * bar_width - (bar_width * (bars_per_group - 1) / 2) - - ax.bar(offsets_sw1, [sw1_speedups[j][i] for j in range(n_clusters)], - width=bar_width, color=colors1[i]) - ax.bar(offsets_sw2, [sw2_speedups[j][i] for j in range(n_clusters)], - width=bar_width, color=colors2[i]) - - ax.set_title(title, fontsize=16) - ax.set_xlabel(xlabel, fontsize=14) - ax.set_ylabel(ylabel, fontsize=14) - ax.set_xticks(x) - ax.set_xticklabels(clusters) - ax.grid(axis="y", linestyle="--", alpha=0.7) - - # Add gradient legend in top-left corner - _add_gradient_legend_inset(ax, colors1, colors2, transfer_sizes, sw1_label, sw2_label) - - fig.tight_layout() - - # Save plot if a name is provided - if save_name: - plots_dir = os.path.join(os.getcwd(), "plots") - os.makedirs(plots_dir, exist_ok=True) - file_path = os.path.join(plots_dir, f"{save_name}.svg") - plt.savefig(file_path, format="svg", bbox_inches="tight") - print(f"Plot saved to: {file_path}") - - return fig, ax - - -# ===== Example Usage ===== -def main(): - df = pd.read_csv('results.csv') - - clusters = df['n_clusters'].unique().tolist() - transfer_sizes = [f'{size} B' for size in df['length'].unique().tolist()] - - cycles = { - mode: grp.drop(columns='mode') - .set_index(['n_clusters', 'length']) - .sort_index(level=['n_clusters', 'length']) - for mode, grp in df.groupby("mode", sort=False) - } - - speedups_mcast_unopt = cycles['SW_UNOPT'] / cycles['HW_MCAST'] - speedups_sw1_unopt = cycles['SW_UNOPT'] / cycles['SW_OPT'] - speedups_sw2_unopt = cycles['SW_UNOPT'] / cycles['SW_OPT2'] - speedups_sw2_sw1 = cycles['SW_OPT'] / cycles['SW_OPT2'] - 
speedups_mcast_sw2 = cycles['SW_OPT2'] / cycles['HW_MCAST'] - - speedups_mcast_unopt = speedups_mcast_unopt.values.reshape((len(clusters), len(transfer_sizes))).tolist() - speedups_sw1_unopt = speedups_sw1_unopt.values.reshape((len(clusters), len(transfer_sizes))).tolist() - speedups_sw2_unopt = speedups_sw2_unopt.values.reshape((len(clusters), len(transfer_sizes))).tolist() - speedups_sw2_sw1 = speedups_sw2_sw1.values.reshape((len(clusters), len(transfer_sizes))).tolist() - speedups_mcast_sw2 = speedups_mcast_sw2.values.reshape((len(clusters), len(transfer_sizes))).tolist() - - # Select PULP colors for the 6 transfer sizes - custom_colors = [ - PULP_COLORS["PULP Blue"], - PULP_COLORS["PULP Petrol"], - PULP_COLORS["PULP Green"], - PULP_COLORS["PULP Bronze"], - PULP_COLORS["PULP Red"], - PULP_COLORS["PULP Purple"] - ] - - plot_multicast_speedup( - clusters, transfer_sizes, speedups_mcast_unopt, - title="Multicast Speedup against unoptimized software version", - xlabel="Number of Clusters", - ylabel="Speedup", - legend_loc="upper left", - legend_ncol=2, - save_name="mcast_vs_unopt" - ) - - plot_multicast_speedup( - clusters, transfer_sizes, speedups_sw2_sw1, - title="Speedup of SW OPT2 implementation against SW OPT1", - xlabel="Number of Clusters", - ylabel="Speedup", - legend_loc="upper left", - legend_ncol=2, - save_name="sw2_vs_sw1" - ) - - plot_multicast_speedup( - clusters, transfer_sizes, speedups_mcast_sw2, - title="Multicast Speedup against best software version", - xlabel="Number of Clusters", - ylabel="Speedup", - legend_loc="upper left", - legend_ncol=2, - save_name="mcast_vs_optimized" - ) - - plot_two_software_speedup( - clusters, - transfer_sizes, - speedups_sw1_unopt, speedups_sw2_unopt, - sw1_label="Column first", - sw2_label="Binary tree", - cmap1="viridis", - cmap2="magma", - title="Speedup of different Software schemes", - xlabel="Number of Clusters", - ylabel="Speedup", - save_name="sw_comparison" - ) - - plt.show() - - -if __name__ == '__main__': 
- main() diff --git a/experiments/dma_multicast_v2/README.md b/experiments/dma_multicast_v2/README.md index d5a32fda..7e6d26e5 100644 --- a/experiments/dma_multicast_v2/README.md +++ b/experiments/dma_multicast_v2/README.md @@ -2,7 +2,7 @@ All commands are assumed to be run from this folder. To run the experiments: ```shell -./sw_experiments.py --actions sw run visual-trace -j +./experiments.py --actions sw run visual-trace -j ``` To manually verify a simulation: diff --git a/sw/snitch/tests/dma_multicast.c b/sw/snitch/tests/dma_multicast.c deleted file mode 100644 index 6261d876..00000000 --- a/sw/snitch/tests/dma_multicast.c +++ /dev/null @@ -1,304 +0,0 @@ -// Copyright 2023 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -// -// Luca Colagrande -// Lorenzo Leone -// -// This code tests the multicast feature of the wide interconnect. -// It uses the DMA to broadcast chunks of data to different clusters -// in the Picobello system. -// -// Extension: 30-07-25 -// The code can now be used to run microbenchamrk and check the multicast -// feature performance. The test support different software implemenation -// to multicast some DMA trasnefers over multiple clusters in the Picobello -// system. By driving some parameters externally, it's possible to get -// the performance results. -// -// Comment: -// The use of WFI and global_barrier together can create some issues due to -// non unique CLINT signal mapping. A possibility would be not to send the -// clusters into WFI, however this will create complicances in teh wake up routine. -// For instance, each cluster would nee to poll the DMA transfer detsination -// until teh expected value is written, and only then it can move to the next -// algorithm step. 
However, since the code is run twice to pre-heat the cache, -// in the second iteration each cluster will already have the expected results -// and will start the transfer immediately. - -#include -#include "pb_addrmap.h" -#include "snrt.h" - -#define INITIALIZER 0xAAAAAAAA - -typedef enum { - SW_UNOPT, - SW_OPT, - SW_OPT2, - HW_MCAST -} mode_t; - -// Run experiments with and without multicast support -#ifndef MODE -#define MODE SW_UNOPT -#endif - -// Transfer LENGTH in uint32_t elements -#ifndef LENGTH -#define LENGTH 1024 -#endif - -#define LENGTH_TO_CHECK 32 - -#ifndef N_CLUSTERS_TO_USE -#define N_CLUSTERS_TO_USE 8 -#endif - -#define LOG2_COL 2 -#define LOG2_ROW 2 - -#define BCAST_MASK_ACTIVE ((N_CLUSTERS_TO_USE - 1) << 18) - -#define cluster_is_in_column(col) \ - __cluster_is_in_column_api((col) - 1) - -static inline int __cluster_is_in_column_api(int col){ - return ((snrt_cluster_idx() >> LOG2_ROW) == col); -} - -//------------------- -//| Mcast functions | -//------------------- -// -// HW_MCAST: Broadcast from cluster_idx to all mcast destinations (only power of 2 dst supported) -static inline void cluster_broadcast_hw(uint32_t cluster_idx, void* dst, void* src, size_t size) { - if (snrt_is_dm_core() && (snrt_cluster_idx() == cluster_idx)) { - snrt_dma_start_1d_mcast(snrt_remote_l1_ptr(dst, 0, 1), src, size, BCAST_MASK_ACTIVE); - snrt_dma_wait_all(); - } -} - - -// SW_UNOPT: Unoptimized software version. 
Cluster_idx initiates multiple DMA transfers to each destination -static inline void cluster_broadcast_sw_unopt(uint32_t cluster_idx, void* dst, void* src, size_t size) { - if (snrt_is_dm_core() && (snrt_cluster_idx() == cluster_idx)) { - for (int cl = 0; cl < N_CLUSTERS_TO_USE; cl++) { - snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, cl), src, size); - snrt_dma_wait_all(); - } - } -} - - -// SW_OPT: Cluster 0 broadcast over the first column, then each cluster in the first column -// broadcast over its own row -static inline void cluster_broadcast_sw_opt(uint32_t cluster_idx, void* dst, void* src, size_t size) { - if (snrt_is_dm_core() && (snrt_cluster_idx() == cluster_idx)) { - for (int row_cl = 0; row_cl < 4; row_cl ++){ - snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, row_cl), src, size); - snrt_dma_wait_all(); - } - // Wait completion of all the transfer to then wakeup the clusters in col 0 - // In this benchmark setup, it's okay to wake up all clusters in a col together, - // because they'll issue multicast requests on their row without creating - // crossing paths between multicast transactions and therefore without creating - // any network congestion that might affect the final results. 
- - // Wake up clusters in col 0 - if (N_CLUSTERS_TO_USE > 4) { - for (int row_cl = 1; row_cl < 4; row_cl ++){ - snrt_cluster(row_cl)->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; - } - } - // Send data from cluster 0 to row 0 - for (int col_cl = 1; col_cl < N_CLUSTERS_TO_USE/4; col_cl ++){ - snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, col_cl * 4), - src, size); - snrt_dma_wait_all(); - } - } else if (snrt_is_dm_core() && cluster_is_in_column(1) && N_CLUSTERS_TO_USE > 4) { - // The clusters in the first column will wait to be waked up by cluster 0 - // and only then will isue the multicast transaction to the row - snrt_wfi(); - snrt_int_clr_mcip(); - // Send data to each row - for (int col_cl = 1; col_cl < N_CLUSTERS_TO_USE/4; col_cl ++){ - snrt_dma_start_1d(snrt_remote_l1_ptr(dst, snrt_cluster_idx(), snrt_cluster_idx() + col_cl * 4), - dst, size); - snrt_dma_wait_all(); - } - } -} - -// SW_OPT2: to maximize the paralle work, cluster 0 multicast to cluster 2, -// then simultaneously cluster 0 and 2 multicast to 1 and 3 respectively. -// The same strided approach is used column-wise. -// This approach is afster for NUM_CLUSTERS = 16 only. -// or this reason the following code is not generic enough to handle NUM_CLUSTERS < 16. 
-static inline void cluster_broadcast_sw_opt2(uint32_t cluster_idx, void* dst, void* src, size_t size) { - - if (snrt_is_dm_core() && (snrt_cluster_idx() == cluster_idx)) { - // DMA transfer to cluster 0 - snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, 0), src, size); - snrt_dma_wait_all(); - - // DMA transfer to cluster 2 anf then wake up and move one - snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, 2), src, size); - snrt_dma_wait_all(); - snrt_cluster(2)->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; - - // DMA transfer to cluster 1 and then wake up and move to the row - snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, 1), src, size); - snrt_dma_wait_all(); - // WWake up cluster 1 only if you need a second iteration for the next column - if (N_CLUSTERS_TO_USE > 4){ - snrt_cluster(1)->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; - } - - if (N_CLUSTERS_TO_USE > 8){ - // DMA transfer to cluster 8 and then wake up and move to 4 - snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, 8), src, size); - snrt_dma_wait_all(); - snrt_cluster(8)->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; - } - - if (N_CLUSTERS_TO_USE > 4) { - // DMA transfer to cluster 4. 
No need to wake it up now since - // it is not involved in any transfer - snrt_dma_start_1d(snrt_remote_l1_ptr(dst, 0, 4), src, size); - snrt_dma_wait_all(); - } - - } else if (snrt_is_dm_core() && cluster_is_in_column(1)) { - // The clusters in the first column will wait to be waked up by cluster 0 - // and only then will isue the multicast transaction to the row - if (N_CLUSTERS_TO_USE > 4 || snrt_cluster_idx() == 2) { - snrt_wfi(); - snrt_int_clr_mcip(); - } - - // Only cluster 2 is repsonsible to transfer also to cluster 3 - if (snrt_cluster_idx() == 2) { - // DMA transfer to cluster 8 and then wake up and move to 4 - snrt_dma_start_1d(snrt_remote_l1_ptr(dst, snrt_cluster_idx(), snrt_cluster_idx() + 1), dst, size); - snrt_dma_wait_all(); - snrt_cluster(snrt_cluster_idx() + 1)->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; - } - - if (N_CLUSTERS_TO_USE > 8){ - // All clusters in the first column need to send the data with - // a stride of 8 first and 4 after. - // DMA transfer to cluster 8 and then wake up and move to 4 - snrt_dma_start_1d(snrt_remote_l1_ptr(dst, snrt_cluster_idx(), snrt_cluster_idx() + 8), dst, size); - snrt_dma_wait_all(); - snrt_cluster(snrt_cluster_idx() + 8)->peripheral_reg.cl_clint_set.f.cl_clint_set = 0x1ff; - } - - if (N_CLUSTERS_TO_USE > 4) { - // Then the clusters in the first column can send the transfer with stride 4, - // i.e. to the cluster in he same row but next column. 
- snrt_dma_start_1d(snrt_remote_l1_ptr(dst, snrt_cluster_idx(), snrt_cluster_idx() + 4), dst, size); - snrt_dma_wait_all(); - } - - - } else if (snrt_is_dm_core() && cluster_is_in_column(3) && N_CLUSTERS_TO_USE > 8) { - // The clusters in the column 3 will wait to be waked up by cluster - // and only then will isue the multicast transaction to the row - snrt_wfi(); - snrt_int_clr_mcip(); - - snrt_dma_start_1d(snrt_remote_l1_ptr(dst, snrt_cluster_idx(), snrt_cluster_idx() + 4), dst, size); - snrt_dma_wait_all(); - } -} - - -static inline void dma_broadcast_to_clusters(void* dst, void* src, size_t size) { - switch (MODE) { - case HW_MCAST: - default: - cluster_broadcast_hw(0, dst, src, size); - break; - case SW_UNOPT: - cluster_broadcast_sw_unopt(0, dst, src, size); - break; - case SW_OPT: - cluster_broadcast_sw_opt(0, dst, src, size); - break; - case SW_OPT2: - cluster_broadcast_sw_opt2(0, dst, src, size); - break; - } -} - - -static inline int cluster_participates_in_bcast(int i) { - return (i < N_CLUSTERS_TO_USE); -} - -static inline void broadcast_wrapper(void* dst, void* src, size_t size) { - snrt_global_barrier(); - snrt_mcycle(); - dma_broadcast_to_clusters(dst, src, size); - snrt_mcycle(); -} - -// -// -int main() { - snrt_interrupt_enable(IRQ_M_CLUSTER); - snrt_int_clr_mcip(); - - // Allocate destination buffer - uint32_t *buffer_dst = (uint32_t *)snrt_l1_next_v2(); - - // ALlocate source buffer. To avoid multicast conflictr in the Multicast - // capable AXI Xbar inside each cluster, the source will be a location in - // the memory tile (L3). - // To have better contorl of the DMA transfers triggerred for a given LENGTH, - // align the src pointer to 4kiB, since whenever the src address cross the - // 4kiB section, the DMA will issue multiple transfers. 
- uintptr_t raw_buffer_src = (uintptr_t) snrt_l3_next_v2(); - uintptr_t aligned_addr = snrt_align_up(raw_buffer_src, 4096); // Align to 4 KiB - uint32_t *buffer_src = (uint32_t *)aligned_addr; - - - // Every cluster initializes its destination buffer. - if (snrt_is_dm_core()) { - snrt_dma_start_1d(buffer_dst, snrt_cluster()->zeromem.mem, LENGTH * sizeof(uint32_t)); - snrt_dma_wait_all(); - } - - // Synchronize all clusters. - snrt_global_barrier(); - - // First cluster initializes the source buffer and multicast- - // copies it to the destination buffer in every cluster's TCDM. - if (snrt_is_dm_core() && (snrt_cluster_idx() == 0)) { - for (uint32_t i = 0; i < LENGTH; i++) { - buffer_src[i] = INITIALIZER; - } - } - - // Initiate DMA transfer (twice to preheat the cache) - for (volatile int i = 0; i < 2; i++) { - broadcast_wrapper(buffer_dst, buffer_src, LENGTH * sizeof(uint32_t) ); - } - - // All other clusters wait on a global barrier to signal the transfer - // completion. - snrt_global_barrier(); - - // Every cluster except cluster 0 checks that the data in the destination - // buffer is correct. To speed this up we only check the first 32 elements. 
- if (snrt_is_dm_core() && (snrt_cluster_idx() < N_CLUSTERS_TO_USE) && (snrt_cluster_idx() != 0)) { - uint32_t n_errs = LENGTH_TO_CHECK; - for (uint32_t i = 0; i < LENGTH_TO_CHECK; i++) { - if (buffer_dst[i] == INITIALIZER) n_errs--; - } - return n_errs; - } else - return 0; - } From 8b1c7879da5ff758fb13b906d8a4f1e1a84b08cf Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 17 Dec 2025 17:48:56 +0100 Subject: [PATCH 57/77] Cleanup reduction experiments removing hw generic --- .../cfg/floo_picobello_noc_pkg_generic.sv | 343 ------------------ .../cfg/floo_picobello_noc_pkg_simple.sv | 343 ------------------ experiments/reduction/experiments.py | 12 +- experiments/reduction/fit.py | 2 +- experiments/reduction/plot.py | 4 +- .../roi/{hw_simple.json.tpl => hw.json.tpl} | 0 experiments/reduction/roi/hw_generic.json.tpl | 21 -- 7 files changed, 4 insertions(+), 721 deletions(-) delete mode 100644 experiments/reduction/cfg/floo_picobello_noc_pkg_generic.sv delete mode 100644 experiments/reduction/cfg/floo_picobello_noc_pkg_simple.sv rename experiments/reduction/roi/{hw_simple.json.tpl => hw.json.tpl} (100%) delete mode 100644 experiments/reduction/roi/hw_generic.json.tpl diff --git a/experiments/reduction/cfg/floo_picobello_noc_pkg_generic.sv b/experiments/reduction/cfg/floo_picobello_noc_pkg_generic.sv deleted file mode 100644 index 191d1f55..00000000 --- a/experiments/reduction/cfg/floo_picobello_noc_pkg_generic.sv +++ /dev/null @@ -1,343 +0,0 @@ -// Copyright 2025 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -// AUTOMATICALLY GENERATED! DO NOT EDIT! 
- -`include "axi/typedef.svh" -`include "floo_noc/typedef.svh" - -package floo_picobello_noc_pkg; - - import floo_pkg::*; - - ///////////////////// - // Address Map // - ///////////////////// - - typedef enum logic [4:0] { - ClusterX0Y0 = 0, - ClusterX0Y1 = 1, - ClusterX0Y2 = 2, - ClusterX0Y3 = 3, - ClusterX1Y0 = 4, - ClusterX1Y1 = 5, - ClusterX1Y2 = 6, - ClusterX1Y3 = 7, - ClusterX2Y0 = 8, - ClusterX2Y1 = 9, - ClusterX2Y2 = 10, - ClusterX2Y3 = 11, - ClusterX3Y0 = 12, - ClusterX3Y1 = 13, - ClusterX3Y2 = 14, - ClusterX3Y3 = 15, - Cheshire = 16, - FhgSpu = 17, - TopSpmNarrow = 18, - TopSpmWide = 19, - L2Spm0 = 20, - L2Spm1 = 21, - L2Spm2 = 22, - L2Spm3 = 23, - L2Spm4 = 24, - L2Spm5 = 25, - L2Spm6 = 26, - L2Spm7 = 27, - NumEndpoints = 28 - } ep_id_e; - - - - typedef enum logic [4:0] { - ClusterX0Y0SamIdx = 0, - ClusterX0Y1SamIdx = 1, - ClusterX0Y2SamIdx = 2, - ClusterX0Y3SamIdx = 3, - ClusterX1Y0SamIdx = 4, - ClusterX1Y1SamIdx = 5, - ClusterX1Y2SamIdx = 6, - ClusterX1Y3SamIdx = 7, - ClusterX2Y0SamIdx = 8, - ClusterX2Y1SamIdx = 9, - ClusterX2Y2SamIdx = 10, - ClusterX2Y3SamIdx = 11, - ClusterX3Y0SamIdx = 12, - ClusterX3Y1SamIdx = 13, - ClusterX3Y2SamIdx = 14, - ClusterX3Y3SamIdx = 15, - CheshireExternalSamIdx = 16, - CheshireInternalSamIdx = 17, - FhgSpuSamIdx = 18, - TopSpmNarrowSamIdx = 19, - TopSpmWideSamIdx = 20, - L2Spm0SamIdx = 21, - L2Spm1SamIdx = 22, - L2Spm2SamIdx = 23, - L2Spm3SamIdx = 24, - L2Spm4SamIdx = 25, - L2Spm5SamIdx = 26, - L2Spm6SamIdx = 27, - L2Spm7SamIdx = 28 - } sam_idx_e; - - - - typedef logic [0:0] rob_idx_t; - typedef logic [0:0] port_id_t; - typedef logic [3:0] x_bits_t; - typedef logic [1:0] y_bits_t; - typedef struct packed { - x_bits_t x; - y_bits_t y; - port_id_t port_id; - } id_t; - - typedef logic route_t; - - - localparam int unsigned SamNumRules = 29; - - typedef struct packed { - id_t idx; - logic [47:0] start_addr; - logic [47:0] end_addr; - } sam_rule_t; - - localparam sam_rule_t [SamNumRules-1:0] Sam = '{ - '{ - idx: '{x: 8, y: 3, 
port_id: 0}, - start_addr: 48'h000070700000, - end_addr: 48'h000070800000 - }, // L2Spm7 - '{ - idx: '{x: 8, y: 2, port_id: 0}, - start_addr: 48'h000070600000, - end_addr: 48'h000070700000 - }, // L2Spm6 - '{ - idx: '{x: 8, y: 1, port_id: 0}, - start_addr: 48'h000070500000, - end_addr: 48'h000070600000 - }, // L2Spm5 - '{ - idx: '{x: 8, y: 0, port_id: 0}, - start_addr: 48'h000070400000, - end_addr: 48'h000070500000 - }, // L2Spm4 - '{ - idx: '{x: 0, y: 3, port_id: 0}, - start_addr: 48'h000070300000, - end_addr: 48'h000070400000 - }, // L2Spm3 - '{ - idx: '{x: 0, y: 2, port_id: 0}, - start_addr: 48'h000070200000, - end_addr: 48'h000070300000 - }, // L2Spm2 - '{ - idx: '{x: 0, y: 1, port_id: 0}, - start_addr: 48'h000070100000, - end_addr: 48'h000070200000 - }, // L2Spm1 - '{ - idx: '{x: 0, y: 0, port_id: 0}, - start_addr: 48'h000070000000, - end_addr: 48'h000070100000 - }, // L2Spm0 - '{ - idx: '{x: 9, y: 1, port_id: 0}, - start_addr: 48'h000060040000, - end_addr: 48'h000060080000 - }, // TopSpmWide - '{ - idx: '{x: 9, y: 2, port_id: 0}, - start_addr: 48'h000060000000, - end_addr: 48'h000060040000 - }, // TopSpmNarrow - '{ - idx: '{x: 9, y: 0, port_id: 0}, - start_addr: 48'h000040000000, - end_addr: 48'h000040040000 - }, // FhgSpu - '{ - idx: '{x: 9, y: 3, port_id: 0}, - start_addr: 48'h000000000000, - end_addr: 48'h000020000000 - }, // CheshireInternal - '{ - idx: '{x: 9, y: 3, port_id: 0}, - start_addr: 48'h000080000000, - end_addr: 48'h020000000000 - }, // CheshireExternal - '{ - idx: '{x: 7, y: 3, port_id: 0}, - start_addr: 48'h0000203c0000, - end_addr: 48'h000020400000 - }, // ClusterX3Y3 - '{ - idx: '{x: 7, y: 2, port_id: 0}, - start_addr: 48'h000020380000, - end_addr: 48'h0000203c0000 - }, // ClusterX3Y2 - '{ - idx: '{x: 7, y: 1, port_id: 0}, - start_addr: 48'h000020340000, - end_addr: 48'h000020380000 - }, // ClusterX3Y1 - '{ - idx: '{x: 7, y: 0, port_id: 0}, - start_addr: 48'h000020300000, - end_addr: 48'h000020340000 - }, // ClusterX3Y0 - '{ - idx: '{x: 6, 
y: 3, port_id: 0}, - start_addr: 48'h0000202c0000, - end_addr: 48'h000020300000 - }, // ClusterX2Y3 - '{ - idx: '{x: 6, y: 2, port_id: 0}, - start_addr: 48'h000020280000, - end_addr: 48'h0000202c0000 - }, // ClusterX2Y2 - '{ - idx: '{x: 6, y: 1, port_id: 0}, - start_addr: 48'h000020240000, - end_addr: 48'h000020280000 - }, // ClusterX2Y1 - '{ - idx: '{x: 6, y: 0, port_id: 0}, - start_addr: 48'h000020200000, - end_addr: 48'h000020240000 - }, // ClusterX2Y0 - '{ - idx: '{x: 5, y: 3, port_id: 0}, - start_addr: 48'h0000201c0000, - end_addr: 48'h000020200000 - }, // ClusterX1Y3 - '{ - idx: '{x: 5, y: 2, port_id: 0}, - start_addr: 48'h000020180000, - end_addr: 48'h0000201c0000 - }, // ClusterX1Y2 - '{ - idx: '{x: 5, y: 1, port_id: 0}, - start_addr: 48'h000020140000, - end_addr: 48'h000020180000 - }, // ClusterX1Y1 - '{ - idx: '{x: 5, y: 0, port_id: 0}, - start_addr: 48'h000020100000, - end_addr: 48'h000020140000 - }, // ClusterX1Y0 - '{ - idx: '{x: 4, y: 3, port_id: 0}, - start_addr: 48'h0000200c0000, - end_addr: 48'h000020100000 - }, // ClusterX0Y3 - '{ - idx: '{x: 4, y: 2, port_id: 0}, - start_addr: 48'h000020080000, - end_addr: 48'h0000200c0000 - }, // ClusterX0Y2 - '{ - idx: '{x: 4, y: 1, port_id: 0}, - start_addr: 48'h000020040000, - end_addr: 48'h000020080000 - }, // ClusterX0Y1 - '{ - idx: '{x: 4, y: 0, port_id: 0}, - start_addr: 48'h000020000000, - end_addr: 48'h000020040000 - } // ClusterX0Y0 - - }; - - - - localparam route_cfg_t RouteCfg = '{ - RouteAlgo: XYRouting, - UseIdTable: 1'b1, - XYAddrOffsetX: 41, - XYAddrOffsetY: 45, - IdAddrOffset: 0, - NumSamRules: 29, - NumRoutes: 0, - CollectiveCfg: '{ - OpCfg: '{ - EnNarrowMulticast: 1'b1, - EnWideMulticast: 1'b1, - EnLSBAnd: 1'b1, - EnF_Add: 1'b1, - EnF_Mul: 1'b1, - EnF_Min: 1'b1, - EnF_Max: 1'b1, - default: '0 - }, - RedCfg: ReductionDefaultCfg - } - }; - - - typedef logic [47:0] axi_narrow_in_addr_t; - typedef logic [63:0] axi_narrow_in_data_t; - typedef logic [7:0] axi_narrow_in_strb_t; - typedef logic [4:0] 
axi_narrow_in_id_t; - typedef logic [4:0] axi_narrow_in_user_t; - `AXI_TYPEDEF_ALL_CT(axi_narrow_in, axi_narrow_in_req_t, axi_narrow_in_rsp_t, axi_narrow_in_addr_t, - axi_narrow_in_id_t, axi_narrow_in_data_t, axi_narrow_in_strb_t, - axi_narrow_in_user_t) - - - typedef logic [47:0] axi_narrow_out_addr_t; - typedef logic [63:0] axi_narrow_out_data_t; - typedef logic [7:0] axi_narrow_out_strb_t; - typedef logic [1:0] axi_narrow_out_id_t; - typedef logic [4:0] axi_narrow_out_user_t; - `AXI_TYPEDEF_ALL_CT(axi_narrow_out, axi_narrow_out_req_t, axi_narrow_out_rsp_t, - axi_narrow_out_addr_t, axi_narrow_out_id_t, axi_narrow_out_data_t, - axi_narrow_out_strb_t, axi_narrow_out_user_t) - - - typedef logic [47:0] axi_wide_in_addr_t; - typedef logic [511:0] axi_wide_in_data_t; - typedef logic [63:0] axi_wide_in_strb_t; - typedef logic [2:0] axi_wide_in_id_t; - typedef logic [0:0] axi_wide_in_user_t; - `AXI_TYPEDEF_ALL_CT(axi_wide_in, axi_wide_in_req_t, axi_wide_in_rsp_t, axi_wide_in_addr_t, - axi_wide_in_id_t, axi_wide_in_data_t, axi_wide_in_strb_t, axi_wide_in_user_t) - - - typedef logic [47:0] axi_wide_out_addr_t; - typedef logic [511:0] axi_wide_out_data_t; - typedef logic [63:0] axi_wide_out_strb_t; - typedef logic [0:0] axi_wide_out_id_t; - typedef logic [0:0] axi_wide_out_user_t; - `AXI_TYPEDEF_ALL_CT(axi_wide_out, axi_wide_out_req_t, axi_wide_out_rsp_t, axi_wide_out_addr_t, - axi_wide_out_id_t, axi_wide_out_data_t, axi_wide_out_strb_t, - axi_wide_out_user_t) - - - - `FLOO_TYPEDEF_HDR_T(hdr_t, id_t, id_t, nw_ch_e, rob_idx_t, id_t, collect_op_e) - localparam axi_cfg_t AxiCfgN = '{ - AddrWidth: 48, - DataWidth: 64, - UserWidth: 5, - InIdWidth: 5, - OutIdWidth: 2 - }; - localparam axi_cfg_t AxiCfgW = '{ - AddrWidth: 48, - DataWidth: 512, - UserWidth: 1, - InIdWidth: 3, - OutIdWidth: 1 - }; - `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_in, axi_wide_in, AxiCfgN, AxiCfgW, - hdr_t) - - `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 1, 2) - - 
-endpackage diff --git a/experiments/reduction/cfg/floo_picobello_noc_pkg_simple.sv b/experiments/reduction/cfg/floo_picobello_noc_pkg_simple.sv deleted file mode 100644 index 191d1f55..00000000 --- a/experiments/reduction/cfg/floo_picobello_noc_pkg_simple.sv +++ /dev/null @@ -1,343 +0,0 @@ -// Copyright 2025 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -// AUTOMATICALLY GENERATED! DO NOT EDIT! - -`include "axi/typedef.svh" -`include "floo_noc/typedef.svh" - -package floo_picobello_noc_pkg; - - import floo_pkg::*; - - ///////////////////// - // Address Map // - ///////////////////// - - typedef enum logic [4:0] { - ClusterX0Y0 = 0, - ClusterX0Y1 = 1, - ClusterX0Y2 = 2, - ClusterX0Y3 = 3, - ClusterX1Y0 = 4, - ClusterX1Y1 = 5, - ClusterX1Y2 = 6, - ClusterX1Y3 = 7, - ClusterX2Y0 = 8, - ClusterX2Y1 = 9, - ClusterX2Y2 = 10, - ClusterX2Y3 = 11, - ClusterX3Y0 = 12, - ClusterX3Y1 = 13, - ClusterX3Y2 = 14, - ClusterX3Y3 = 15, - Cheshire = 16, - FhgSpu = 17, - TopSpmNarrow = 18, - TopSpmWide = 19, - L2Spm0 = 20, - L2Spm1 = 21, - L2Spm2 = 22, - L2Spm3 = 23, - L2Spm4 = 24, - L2Spm5 = 25, - L2Spm6 = 26, - L2Spm7 = 27, - NumEndpoints = 28 - } ep_id_e; - - - - typedef enum logic [4:0] { - ClusterX0Y0SamIdx = 0, - ClusterX0Y1SamIdx = 1, - ClusterX0Y2SamIdx = 2, - ClusterX0Y3SamIdx = 3, - ClusterX1Y0SamIdx = 4, - ClusterX1Y1SamIdx = 5, - ClusterX1Y2SamIdx = 6, - ClusterX1Y3SamIdx = 7, - ClusterX2Y0SamIdx = 8, - ClusterX2Y1SamIdx = 9, - ClusterX2Y2SamIdx = 10, - ClusterX2Y3SamIdx = 11, - ClusterX3Y0SamIdx = 12, - ClusterX3Y1SamIdx = 13, - ClusterX3Y2SamIdx = 14, - ClusterX3Y3SamIdx = 15, - CheshireExternalSamIdx = 16, - CheshireInternalSamIdx = 17, - FhgSpuSamIdx = 18, - TopSpmNarrowSamIdx = 19, - TopSpmWideSamIdx = 20, - L2Spm0SamIdx = 21, - L2Spm1SamIdx = 22, - L2Spm2SamIdx = 23, - L2Spm3SamIdx = 24, - L2Spm4SamIdx = 25, - L2Spm5SamIdx = 26, - L2Spm6SamIdx = 27, - L2Spm7SamIdx = 
28 - } sam_idx_e; - - - - typedef logic [0:0] rob_idx_t; - typedef logic [0:0] port_id_t; - typedef logic [3:0] x_bits_t; - typedef logic [1:0] y_bits_t; - typedef struct packed { - x_bits_t x; - y_bits_t y; - port_id_t port_id; - } id_t; - - typedef logic route_t; - - - localparam int unsigned SamNumRules = 29; - - typedef struct packed { - id_t idx; - logic [47:0] start_addr; - logic [47:0] end_addr; - } sam_rule_t; - - localparam sam_rule_t [SamNumRules-1:0] Sam = '{ - '{ - idx: '{x: 8, y: 3, port_id: 0}, - start_addr: 48'h000070700000, - end_addr: 48'h000070800000 - }, // L2Spm7 - '{ - idx: '{x: 8, y: 2, port_id: 0}, - start_addr: 48'h000070600000, - end_addr: 48'h000070700000 - }, // L2Spm6 - '{ - idx: '{x: 8, y: 1, port_id: 0}, - start_addr: 48'h000070500000, - end_addr: 48'h000070600000 - }, // L2Spm5 - '{ - idx: '{x: 8, y: 0, port_id: 0}, - start_addr: 48'h000070400000, - end_addr: 48'h000070500000 - }, // L2Spm4 - '{ - idx: '{x: 0, y: 3, port_id: 0}, - start_addr: 48'h000070300000, - end_addr: 48'h000070400000 - }, // L2Spm3 - '{ - idx: '{x: 0, y: 2, port_id: 0}, - start_addr: 48'h000070200000, - end_addr: 48'h000070300000 - }, // L2Spm2 - '{ - idx: '{x: 0, y: 1, port_id: 0}, - start_addr: 48'h000070100000, - end_addr: 48'h000070200000 - }, // L2Spm1 - '{ - idx: '{x: 0, y: 0, port_id: 0}, - start_addr: 48'h000070000000, - end_addr: 48'h000070100000 - }, // L2Spm0 - '{ - idx: '{x: 9, y: 1, port_id: 0}, - start_addr: 48'h000060040000, - end_addr: 48'h000060080000 - }, // TopSpmWide - '{ - idx: '{x: 9, y: 2, port_id: 0}, - start_addr: 48'h000060000000, - end_addr: 48'h000060040000 - }, // TopSpmNarrow - '{ - idx: '{x: 9, y: 0, port_id: 0}, - start_addr: 48'h000040000000, - end_addr: 48'h000040040000 - }, // FhgSpu - '{ - idx: '{x: 9, y: 3, port_id: 0}, - start_addr: 48'h000000000000, - end_addr: 48'h000020000000 - }, // CheshireInternal - '{ - idx: '{x: 9, y: 3, port_id: 0}, - start_addr: 48'h000080000000, - end_addr: 48'h020000000000 - }, // CheshireExternal 
- '{ - idx: '{x: 7, y: 3, port_id: 0}, - start_addr: 48'h0000203c0000, - end_addr: 48'h000020400000 - }, // ClusterX3Y3 - '{ - idx: '{x: 7, y: 2, port_id: 0}, - start_addr: 48'h000020380000, - end_addr: 48'h0000203c0000 - }, // ClusterX3Y2 - '{ - idx: '{x: 7, y: 1, port_id: 0}, - start_addr: 48'h000020340000, - end_addr: 48'h000020380000 - }, // ClusterX3Y1 - '{ - idx: '{x: 7, y: 0, port_id: 0}, - start_addr: 48'h000020300000, - end_addr: 48'h000020340000 - }, // ClusterX3Y0 - '{ - idx: '{x: 6, y: 3, port_id: 0}, - start_addr: 48'h0000202c0000, - end_addr: 48'h000020300000 - }, // ClusterX2Y3 - '{ - idx: '{x: 6, y: 2, port_id: 0}, - start_addr: 48'h000020280000, - end_addr: 48'h0000202c0000 - }, // ClusterX2Y2 - '{ - idx: '{x: 6, y: 1, port_id: 0}, - start_addr: 48'h000020240000, - end_addr: 48'h000020280000 - }, // ClusterX2Y1 - '{ - idx: '{x: 6, y: 0, port_id: 0}, - start_addr: 48'h000020200000, - end_addr: 48'h000020240000 - }, // ClusterX2Y0 - '{ - idx: '{x: 5, y: 3, port_id: 0}, - start_addr: 48'h0000201c0000, - end_addr: 48'h000020200000 - }, // ClusterX1Y3 - '{ - idx: '{x: 5, y: 2, port_id: 0}, - start_addr: 48'h000020180000, - end_addr: 48'h0000201c0000 - }, // ClusterX1Y2 - '{ - idx: '{x: 5, y: 1, port_id: 0}, - start_addr: 48'h000020140000, - end_addr: 48'h000020180000 - }, // ClusterX1Y1 - '{ - idx: '{x: 5, y: 0, port_id: 0}, - start_addr: 48'h000020100000, - end_addr: 48'h000020140000 - }, // ClusterX1Y0 - '{ - idx: '{x: 4, y: 3, port_id: 0}, - start_addr: 48'h0000200c0000, - end_addr: 48'h000020100000 - }, // ClusterX0Y3 - '{ - idx: '{x: 4, y: 2, port_id: 0}, - start_addr: 48'h000020080000, - end_addr: 48'h0000200c0000 - }, // ClusterX0Y2 - '{ - idx: '{x: 4, y: 1, port_id: 0}, - start_addr: 48'h000020040000, - end_addr: 48'h000020080000 - }, // ClusterX0Y1 - '{ - idx: '{x: 4, y: 0, port_id: 0}, - start_addr: 48'h000020000000, - end_addr: 48'h000020040000 - } // ClusterX0Y0 - - }; - - - - localparam route_cfg_t RouteCfg = '{ - RouteAlgo: XYRouting, - 
UseIdTable: 1'b1, - XYAddrOffsetX: 41, - XYAddrOffsetY: 45, - IdAddrOffset: 0, - NumSamRules: 29, - NumRoutes: 0, - CollectiveCfg: '{ - OpCfg: '{ - EnNarrowMulticast: 1'b1, - EnWideMulticast: 1'b1, - EnLSBAnd: 1'b1, - EnF_Add: 1'b1, - EnF_Mul: 1'b1, - EnF_Min: 1'b1, - EnF_Max: 1'b1, - default: '0 - }, - RedCfg: ReductionDefaultCfg - } - }; - - - typedef logic [47:0] axi_narrow_in_addr_t; - typedef logic [63:0] axi_narrow_in_data_t; - typedef logic [7:0] axi_narrow_in_strb_t; - typedef logic [4:0] axi_narrow_in_id_t; - typedef logic [4:0] axi_narrow_in_user_t; - `AXI_TYPEDEF_ALL_CT(axi_narrow_in, axi_narrow_in_req_t, axi_narrow_in_rsp_t, axi_narrow_in_addr_t, - axi_narrow_in_id_t, axi_narrow_in_data_t, axi_narrow_in_strb_t, - axi_narrow_in_user_t) - - - typedef logic [47:0] axi_narrow_out_addr_t; - typedef logic [63:0] axi_narrow_out_data_t; - typedef logic [7:0] axi_narrow_out_strb_t; - typedef logic [1:0] axi_narrow_out_id_t; - typedef logic [4:0] axi_narrow_out_user_t; - `AXI_TYPEDEF_ALL_CT(axi_narrow_out, axi_narrow_out_req_t, axi_narrow_out_rsp_t, - axi_narrow_out_addr_t, axi_narrow_out_id_t, axi_narrow_out_data_t, - axi_narrow_out_strb_t, axi_narrow_out_user_t) - - - typedef logic [47:0] axi_wide_in_addr_t; - typedef logic [511:0] axi_wide_in_data_t; - typedef logic [63:0] axi_wide_in_strb_t; - typedef logic [2:0] axi_wide_in_id_t; - typedef logic [0:0] axi_wide_in_user_t; - `AXI_TYPEDEF_ALL_CT(axi_wide_in, axi_wide_in_req_t, axi_wide_in_rsp_t, axi_wide_in_addr_t, - axi_wide_in_id_t, axi_wide_in_data_t, axi_wide_in_strb_t, axi_wide_in_user_t) - - - typedef logic [47:0] axi_wide_out_addr_t; - typedef logic [511:0] axi_wide_out_data_t; - typedef logic [63:0] axi_wide_out_strb_t; - typedef logic [0:0] axi_wide_out_id_t; - typedef logic [0:0] axi_wide_out_user_t; - `AXI_TYPEDEF_ALL_CT(axi_wide_out, axi_wide_out_req_t, axi_wide_out_rsp_t, axi_wide_out_addr_t, - axi_wide_out_id_t, axi_wide_out_data_t, axi_wide_out_strb_t, - axi_wide_out_user_t) - - - - 
`FLOO_TYPEDEF_HDR_T(hdr_t, id_t, id_t, nw_ch_e, rob_idx_t, id_t, collect_op_e) - localparam axi_cfg_t AxiCfgN = '{ - AddrWidth: 48, - DataWidth: 64, - UserWidth: 5, - InIdWidth: 5, - OutIdWidth: 2 - }; - localparam axi_cfg_t AxiCfgW = '{ - AddrWidth: 48, - DataWidth: 512, - UserWidth: 1, - InIdWidth: 3, - OutIdWidth: 1 - }; - `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_in, axi_wide_in, AxiCfgN, AxiCfgW, - hdr_t) - - `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 1, 2) - - -endpackage diff --git a/experiments/reduction/experiments.py b/experiments/reduction/experiments.py index 1461d0f2..e9a141a8 100755 --- a/experiments/reduction/experiments.py +++ b/experiments/reduction/experiments.py @@ -40,13 +40,6 @@ def derive_cdefines(self, experiment): cdefs['BATCH'] = experiment['batch'] return cdefs - # def derive_hw_cfg(self, experiment): - # hw = experiment['hw'] - # return DIR / 'cfg' / f'floo_picobello_noc_pkg_{hw}.sv' - - def derive_vsim_builddir(self, experiment): - return self.dir / 'hw' / experiment['hw'] - def gen_experiments(): experiments = [] @@ -77,12 +70,9 @@ def gen_experiments(): 'size': size, 'batch': batch_size, 'app': 'reduction_benchmark', - 'hw': 'simple', 'roi': Path.cwd() / f'roi/{impl}.json.tpl', } - work_dir = DIR / 'hw' / experiment['hw'] - experiment['cmd'] = pb.sim_and_verify_cmd(Path.cwd() / 'verify.py', - work_dir) + experiment['cmd'] = pb.sim_and_verify_cmd(Path.cwd() / 'verify.py') experiments.append(experiment) return experiments diff --git a/experiments/reduction/fit.py b/experiments/reduction/fit.py index 55c72691..cb896b5d 100755 --- a/experiments/reduction/fit.py +++ b/experiments/reduction/fit.py @@ -147,7 +147,7 @@ def fit_hw(df=None, quiet=True): x = df['size'].to_numpy() / BEAT_BYTES y = df['cycles'].to_numpy() if not quiet: - print("Fit runtime for hw simple reduction:") + print("Fit runtime for hw reduction:") return fit(x, y, quiet=quiet) diff --git a/experiments/reduction/plot.py 
b/experiments/reduction/plot.py index 7ce125ce..b473e2cb 100755 --- a/experiments/reduction/plot.py +++ b/experiments/reduction/plot.py @@ -166,7 +166,7 @@ def plot1(y_label=None, hide_x_axis=False, show=True): x, y = monotone_tree_runtime_curve(sizes.min(), sizes.max(), c, r) ax.plot(x, y, label='Model (tree)', linestyle='--', color=colors['tree']) - # Plot model line for hw simple runtime + # Plot model line for hw runtime x, y = hw_runtime_curve(sizes.min(), sizes.max(), c, r) ax.plot(x, y, label='Model (hw)', linestyle='--', color=colors['hw']) @@ -248,7 +248,7 @@ def plot_runtime_vs_speedup(ax, n_rows, show_label=False): xytext=(sizes.max() * 0.0001, 0), textcoords='offset points', # nudge right ha='left', va='center', clip_on=False) - # Plot model line for hardware simple runtime + # Plot model line for hardware runtime x, y = hw_runtime_curve(sizes.min(), sizes.max(), c, n_rows) ax.plot( x, y, label='Model (hw)' if show_label else None, linestyle='--', color=colors['hw']) diff --git a/experiments/reduction/roi/hw_simple.json.tpl b/experiments/reduction/roi/hw.json.tpl similarity index 100% rename from experiments/reduction/roi/hw_simple.json.tpl rename to experiments/reduction/roi/hw.json.tpl diff --git a/experiments/reduction/roi/hw_generic.json.tpl b/experiments/reduction/roi/hw_generic.json.tpl deleted file mode 100644 index 00e1c78a..00000000 --- a/experiments/reduction/roi/hw_generic.json.tpl +++ /dev/null @@ -1,21 +0,0 @@ -<% - def pb_cluster_idx(c, r): - return c * 4 + r - - n_rows = experiment['n_rows'] -%> -[ -% for r in range(n_rows): - % for c in range(4): - { - "thread": "${f'hart_{1 + pb_cluster_idx(c, r) * 9 + 8}'}", - "roi": [ - // First iteration - {"idx": 1, "label": "reduction"}, - // Second iteration - {"idx": 3, "label": "reduction"}, - ] - }, - % endfor -% endfor -] From e3a6f54b50397dad7b0c6b819c97a18304c4e5c5 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 17 Dec 2025 17:49:21 +0100 Subject: [PATCH 58/77] Fix lint error in 
picobello.py --- experiments/picobello.py | 1 + 1 file changed, 1 insertion(+) diff --git a/experiments/picobello.py b/experiments/picobello.py index 27d2f55e..7327b7f6 100644 --- a/experiments/picobello.py +++ b/experiments/picobello.py @@ -12,6 +12,7 @@ root = Path(__file__).resolve().parents[1] default_work_dir = root / 'target/sim/vsim' + def sim_cmd(work_dir=default_work_dir): return [ 'bash', '-c', From a30d3a90b9963e741f96f0511dc48827ad459c37 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 17 Dec 2025 17:49:46 +0100 Subject: [PATCH 59/77] Cleanup summa_gemm data configuration --- experiments/summa_gemm/cfg.json.tpl | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/experiments/summa_gemm/cfg.json.tpl b/experiments/summa_gemm/cfg.json.tpl index 4086a31a..e2ebd182 100644 --- a/experiments/summa_gemm/cfg.json.tpl +++ b/experiments/summa_gemm/cfg.json.tpl @@ -2,27 +2,6 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -//{ -// setup_ssr: 1, -// parallelize_m: 1, -// parallelize_k: 0, -// m_tiles: 4, // number of tiles in M dimension -// n_tiles: ${4 * experiment['n_tiles']}, // number of tiles in N dimension -// k_tiles: 1, // number of tiles in K dimension -// load_a: 1, -// load_b: 1, -// load_c: 1, -// double_buffer: 1, -// partition_banks: 0, -// transa: false, -// transb: false, // must be true for SIMD -// m: 32, -// n: ${32 * experiment['n_tiles']}, -// k: 8, -// alpha: 1, -// beta: 0, -// gemm_fp: "gemm_fp64_opt" -//} { setup_ssr: 1, parallelize_m: 1, From 8f0a264fdaa7eee3df991649d1adb96c184f7d12 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 17 Dec 2025 17:50:11 +0100 Subject: [PATCH 60/77] Remove prints from `dma_multicast_v2.c` --- sw/snitch/tests/dma_multicast_v2.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/sw/snitch/tests/dma_multicast_v2.c b/sw/snitch/tests/dma_multicast_v2.c index 7084f63c..cc09c50f 100644 --- 
a/sw/snitch/tests/dma_multicast_v2.c +++ b/sw/snitch/tests/dma_multicast_v2.c @@ -304,17 +304,6 @@ int main() { // Only DMA cores of clusters in the first N rows continue from here if (!snrt_is_dm_core() || !comm->is_participant) return 0; - // Print participating clusters - if (snrt_is_dm_core() && snrt_cluster_idx() == 0) { - uint32_t mask = comm->mask; - uint32_t fixed = comm->base & ~mask; - uint32_t submask = 0; - do { - uint32_t i = fixed | submask; - submask = (submask - 1) & mask; - } while (submask != 0); - } - // Synchronize all clusters. snrt_inter_cluster_barrier(comm); From 6936a3b6ad33b2e999a78e50ee2666af524ca074 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 17 Dec 2025 17:50:34 +0100 Subject: [PATCH 61/77] Add description in tests --- sw/snitch/tests/overlapping_barriers.c | 7 ++++++- sw/snitch/tests/parallel_row_col_barriers.c | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/sw/snitch/tests/overlapping_barriers.c b/sw/snitch/tests/overlapping_barriers.c index f78b3153..d74c9bce 100644 --- a/sw/snitch/tests/overlapping_barriers.c +++ b/sw/snitch/tests/overlapping_barriers.c @@ -3,6 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 // // Luca Colagrande +// +// Test that multiple outstanding barriers with intersecting (or overlapping) +// participant sets do not deadlock the system. The "participant set" is +// defined as the set of clusters that participate in the barrier. #include "snrt.h" @@ -16,7 +20,8 @@ int main (void) { // other clusters have arrived on the global barrier. This ensures that // the global barrier, arriving first, takes ownership of the router, // preventing the row-0 clusters from ever reaching the global barrier - // and consequently deadlocking the system. + // and consequently deadlocking the system, if multiple outstanding + // reductions are not properly supported. 
if (pb_cluster_row_idx() == 0) { for (int j = 0; j < 100; j++) snrt_nop(); snrt_global_barrier(comm); diff --git a/sw/snitch/tests/parallel_row_col_barriers.c b/sw/snitch/tests/parallel_row_col_barriers.c index 136f2c5c..acb1bd41 100644 --- a/sw/snitch/tests/parallel_row_col_barriers.c +++ b/sw/snitch/tests/parallel_row_col_barriers.c @@ -3,6 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 // // Luca Colagrande +// +// Stress test row, column and global barriers happening in rapid succession. #include "snrt.h" From 5d2e308b978f4f5d065f479ed7a4ab88f86438e5 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Wed, 17 Dec 2025 17:55:34 +0100 Subject: [PATCH 62/77] Rename dma_multicast_v2 experiments and benchmark --- experiments/{dma_multicast_v2 => multicast}/README.md | 2 +- experiments/{dma_multicast_v2 => multicast}/__init__.py | 0 experiments/{dma_multicast_v2 => multicast}/experiments.py | 2 +- experiments/{dma_multicast_v2 => multicast}/fit.py | 2 +- experiments/{dma_multicast_v2 => multicast}/model.py | 0 experiments/{dma_multicast_v2 => multicast}/plot.py | 4 ++-- experiments/{dma_multicast_v2 => multicast}/roi/hw.json.tpl | 0 experiments/{dma_multicast_v2 => multicast}/roi/seq.json.tpl | 0 experiments/{dma_multicast_v2 => multicast}/roi/tree.json.tpl | 0 experiments/{dma_multicast_v2 => multicast}/verify.py | 2 +- sw/snitch/tests/{dma_multicast_v2.c => multicast_benchmark.c} | 0 11 files changed, 6 insertions(+), 6 deletions(-) rename experiments/{dma_multicast_v2 => multicast}/README.md (86%) rename experiments/{dma_multicast_v2 => multicast}/__init__.py (100%) rename experiments/{dma_multicast_v2 => multicast}/experiments.py (98%) rename experiments/{dma_multicast_v2 => multicast}/fit.py (98%) rename experiments/{dma_multicast_v2 => multicast}/model.py (100%) rename experiments/{dma_multicast_v2 => multicast}/plot.py (99%) rename experiments/{dma_multicast_v2 => multicast}/roi/hw.json.tpl (100%) rename experiments/{dma_multicast_v2 => 
multicast}/roi/seq.json.tpl (100%) rename experiments/{dma_multicast_v2 => multicast}/roi/tree.json.tpl (100%) rename experiments/{dma_multicast_v2 => multicast}/verify.py (94%) rename sw/snitch/tests/{dma_multicast_v2.c => multicast_benchmark.c} (100%) diff --git a/experiments/dma_multicast_v2/README.md b/experiments/multicast/README.md similarity index 86% rename from experiments/dma_multicast_v2/README.md rename to experiments/multicast/README.md index 7e6d26e5..5e615395 100644 --- a/experiments/dma_multicast_v2/README.md +++ b/experiments/multicast/README.md @@ -7,7 +7,7 @@ To run the experiments: To manually verify a simulation: ```shell -./verify.py placeholder /dma_multicast_v2.elf --no-ipc --memdump /l2mem.bin --memaddr 0x70000000 +./verify.py placeholder /multicast_benchmark.elf --no-ipc --memdump /l2mem.bin --memaddr 0x70000000 ``` To manually generate the traces for a simulation: diff --git a/experiments/dma_multicast_v2/__init__.py b/experiments/multicast/__init__.py similarity index 100% rename from experiments/dma_multicast_v2/__init__.py rename to experiments/multicast/__init__.py diff --git a/experiments/dma_multicast_v2/experiments.py b/experiments/multicast/experiments.py similarity index 98% rename from experiments/dma_multicast_v2/experiments.py rename to experiments/multicast/experiments.py index b3968554..9fcbd9a4 100755 --- a/experiments/dma_multicast_v2/experiments.py +++ b/experiments/multicast/experiments.py @@ -66,7 +66,7 @@ def gen_experiments(): 'n_rows': n_rows, 'size': size, 'batch': batch_size, - 'app': 'dma_multicast_v2', + 'app': 'multicast_benchmark', 'cmd': pb.sim_and_verify_cmd(Path.cwd() / 'verify.py'), 'roi': Path.cwd() / f'roi/{impl}.json.tpl', }) diff --git a/experiments/dma_multicast_v2/fit.py b/experiments/multicast/fit.py similarity index 98% rename from experiments/dma_multicast_v2/fit.py rename to experiments/multicast/fit.py index 88687659..6a1a64dc 100755 --- a/experiments/dma_multicast_v2/fit.py +++ 
b/experiments/multicast/fit.py @@ -9,7 +9,7 @@ import numpy as np import pandas as pd from scipy import stats -from dma_multicast_v2 import experiments +from multicast import experiments pd.options.mode.copy_on_write = True BEAT_BYTES = 64 diff --git a/experiments/dma_multicast_v2/model.py b/experiments/multicast/model.py similarity index 100% rename from experiments/dma_multicast_v2/model.py rename to experiments/multicast/model.py diff --git a/experiments/dma_multicast_v2/plot.py b/experiments/multicast/plot.py similarity index 99% rename from experiments/dma_multicast_v2/plot.py rename to experiments/multicast/plot.py index 5687b385..d591a6b2 100755 --- a/experiments/dma_multicast_v2/plot.py +++ b/experiments/multicast/plot.py @@ -11,8 +11,8 @@ import numpy as np import pandas as pd import seaborn as sns -from dma_multicast_v2 import model -from dma_multicast_v2 import experiments +from multicast import model +from multicast import experiments pd.options.mode.copy_on_write = True diff --git a/experiments/dma_multicast_v2/roi/hw.json.tpl b/experiments/multicast/roi/hw.json.tpl similarity index 100% rename from experiments/dma_multicast_v2/roi/hw.json.tpl rename to experiments/multicast/roi/hw.json.tpl diff --git a/experiments/dma_multicast_v2/roi/seq.json.tpl b/experiments/multicast/roi/seq.json.tpl similarity index 100% rename from experiments/dma_multicast_v2/roi/seq.json.tpl rename to experiments/multicast/roi/seq.json.tpl diff --git a/experiments/dma_multicast_v2/roi/tree.json.tpl b/experiments/multicast/roi/tree.json.tpl similarity index 100% rename from experiments/dma_multicast_v2/roi/tree.json.tpl rename to experiments/multicast/roi/tree.json.tpl diff --git a/experiments/dma_multicast_v2/verify.py b/experiments/multicast/verify.py similarity index 94% rename from experiments/dma_multicast_v2/verify.py rename to experiments/multicast/verify.py index 1aa58510..bc5a7b0a 100755 --- a/experiments/dma_multicast_v2/verify.py +++ b/experiments/multicast/verify.py 
@@ -5,7 +5,7 @@ # # Luca Colagrande # -# Verification script for `dma_multicast_v2.c` +# Verification script for `multicast_benchmark.c` import sys import snitch.util.sim.verif_utils as vu diff --git a/sw/snitch/tests/dma_multicast_v2.c b/sw/snitch/tests/multicast_benchmark.c similarity index 100% rename from sw/snitch/tests/dma_multicast_v2.c rename to sw/snitch/tests/multicast_benchmark.c From cd2764c2dff8f08ff5643cd43cf3b062c729c50c Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Sat, 22 Nov 2025 10:37:52 +0100 Subject: [PATCH 63/77] hw: Adapt SoC to multiple physical channels --- Bender.lock | 2 +- Bender.yml | 2 +- Makefile | 2 +- floo_picobello_noc_pkg_REDUCTION.sv | 8 +++--- hw/cheshire_tile.sv | 34 +++++++++++++----------- hw/cluster_tile.sv | 15 ++++------- hw/dummy_tile.sv | 18 +++++++------ hw/fhg_spu_tile.sv | 21 ++++++++------- hw/mem_tile.sv | 41 ++++++++++++++++------------- hw/picobello_pkg.sv | 12 ++++----- hw/spm_tile.sv | 21 ++++++++------- 11 files changed, 93 insertions(+), 83 deletions(-) diff --git a/Bender.lock b/Bender.lock index 710b2346..d9fc9ea1 100644 --- a/Bender.lock +++ b/Bender.lock @@ -391,7 +391,7 @@ packages: - common_cells - register_interface snitch_cluster: - revision: dda2bcb707f0ed237f61a7f6d39608877abe7bdb + revision: ecf876f5d8726c6424970c99c5e91f83aaedeacd version: null source: Git: https://github.com/pulp-platform/snitch_cluster.git diff --git a/Bender.yml b/Bender.yml index 63f00f47..33298873 100644 --- a/Bender.yml +++ b/Bender.yml @@ -12,7 +12,7 @@ dependencies: axi: { git: "https://github.com/colluca/axi.git", rev: "multicast" } common_cells: { git: "https://github.com/pulp-platform/common_cells.git", rev: "snitch" } cheshire: { git: "https://github.com/pulp-platform/cheshire.git", rev: "picobello" } - snitch_cluster: { git: "https://github.com/pulp-platform/snitch_cluster.git", rev: "narrow_reduction" } + snitch_cluster: { git: "https://github.com/pulp-platform/snitch_cluster.git", rev: "main" } floo_noc: { git: 
"https://github.com/pulp-platform/FlooNoC.git", rev: "feature/reduction" } obi: { git: "https://github.com/pulp-platform/obi.git", rev: "acfcd0f80c7539aa8da7821a66d9acf2074a5b4e" } redmule: { git: "https://github.com/pulp-platform/redmule.git", rev: "picobello" } diff --git a/Makefile b/Makefile index 02d15f28..f2ea2606 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ SN_ROOT = $(shell $(BENDER) path snitch_cluster) FLOO_ROOT = $(shell $(BENDER) path floo_noc) # Executables -BENDER ?= bender -d $(PB_ROOT) +BENDER ?= bender --suppress W22 -d $(PB_ROOT) FLOO_GEN ?= floogen VERIBLE_FMT ?= verible-verilog-format VERIBLE_FMT_ARGS ?= --flagfile .verilog_format --inplace --verbose diff --git a/floo_picobello_noc_pkg_REDUCTION.sv b/floo_picobello_noc_pkg_REDUCTION.sv index 191d1f55..21badec9 100644 --- a/floo_picobello_noc_pkg_REDUCTION.sv +++ b/floo_picobello_noc_pkg_REDUCTION.sv @@ -283,7 +283,7 @@ package floo_picobello_noc_pkg; typedef logic [63:0] axi_narrow_in_data_t; typedef logic [7:0] axi_narrow_in_strb_t; typedef logic [4:0] axi_narrow_in_id_t; - typedef logic [4:0] axi_narrow_in_user_t; + typedef logic [2:0] axi_narrow_in_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_in, axi_narrow_in_req_t, axi_narrow_in_rsp_t, axi_narrow_in_addr_t, axi_narrow_in_id_t, axi_narrow_in_data_t, axi_narrow_in_strb_t, axi_narrow_in_user_t) @@ -293,7 +293,7 @@ package floo_picobello_noc_pkg; typedef logic [63:0] axi_narrow_out_data_t; typedef logic [7:0] axi_narrow_out_strb_t; typedef logic [1:0] axi_narrow_out_id_t; - typedef logic [4:0] axi_narrow_out_user_t; + typedef logic [2:0] axi_narrow_out_user_t; `AXI_TYPEDEF_ALL_CT(axi_narrow_out, axi_narrow_out_req_t, axi_narrow_out_rsp_t, axi_narrow_out_addr_t, axi_narrow_out_id_t, axi_narrow_out_data_t, axi_narrow_out_strb_t, axi_narrow_out_user_t) @@ -323,7 +323,7 @@ package floo_picobello_noc_pkg; localparam axi_cfg_t AxiCfgN = '{ AddrWidth: 48, DataWidth: 64, - UserWidth: 5, + UserWidth: 3, InIdWidth: 5, OutIdWidth: 2 }; @@ -337,7 
+337,7 @@ package floo_picobello_noc_pkg; `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_in, axi_wide_in, AxiCfgN, AxiCfgW, hdr_t) - `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 1, 2) + `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 1, 2, 2) endpackage diff --git a/hw/cheshire_tile.sv b/hw/cheshire_tile.sv index 66430503..89970006 100644 --- a/hw/cheshire_tile.sv +++ b/hw/cheshire_tile.sv @@ -107,7 +107,7 @@ module cheshire_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; floo_wide_t [Eject:North] router_floo_wide_in; - floo_wide_double_t [Eject:North] router_floo_wide_out; + floo_wide_t [Eject:North] router_floo_wide_out; floo_nw_router #( .AxiCfgN (AxiCfgN), @@ -121,8 +121,9 @@ module cheshire_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .floo_wide_out_t (floo_wide_double_t), - .EnDecoupledRW (1'b1) + // .floo_wide_out_t (floo_wide_double_t), + .NumWideVirtChannels (2), + .NumWidePhysChannels (2) ) i_router ( .clk_i, .rst_ni, @@ -168,12 +169,14 @@ module cheshire_tile assign router_floo_rsp_in[East] = '0; // No East port in this tile assign router_floo_rsp_in[South] = floo_rsp_south_i; //TODO(colluca): Merge with floo_wide - assign floo_wide_west_o.valid = router_floo_wide_out[West].valid; - assign floo_wide_west_o.ready = router_floo_wide_out[West].ready; - assign floo_wide_west_o.wide = router_floo_wide_out[West].wide[0]; - assign floo_wide_south_o.valid = router_floo_wide_out[South].valid; - assign floo_wide_south_o.ready = router_floo_wide_out[South].ready; - assign floo_wide_south_o.wide = router_floo_wide_out[South].wide[0]; + // assign floo_wide_west_o.valid = router_floo_wide_out[West].valid; + // assign floo_wide_west_o.ready = router_floo_wide_out[West].ready; + // assign floo_wide_west_o.wide = router_floo_wide_out[West].wide[0]; + // assign floo_wide_south_o.valid = 
router_floo_wide_out[South].valid; + // assign floo_wide_south_o.ready = router_floo_wide_out[South].ready; + // assign floo_wide_south_o.wide = router_floo_wide_out[South].wide[0]; + assign floo_wide_west_o = router_floo_wide_out[West]; + assign floo_wide_south_o = router_floo_wide_out[South]; assign router_floo_wide_in[West] = floo_wide_west_i; assign router_floo_wide_in[North] = '0; // No North port in this tile assign router_floo_wide_in[East] = '0; // No East port in this tile @@ -201,6 +204,7 @@ module cheshire_tile .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), .EnDecoupledRW (1'b1), + .NumWidePhysChannels (2), .MaxAtomicTxns (AxiCfgN.OutIdWidth - 1), .Sam (Sam), .id_t (id_t), @@ -218,7 +222,7 @@ module cheshire_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .floo_wide_in_t (floo_wide_double_t), + // .floo_wide_in_t (floo_wide_double_t), .user_narrow_struct_t (collective_narrow_user_t), .user_wide_struct_t (collective_wide_user_t) ) i_chimney ( @@ -506,11 +510,11 @@ module cheshire_tile // Add Assertion that no multicast / reduction can enter this tile! 
for (genvar r = 0; r < 4; r++) begin : gen_virt - `ASSERT(NoCollectivOperation_NReq_In, (!router_floo_req_in[r].valid | (router_floo_req_in[r].req.generic.hdr.collective_op == Unicast))) - `ASSERT(NoCollectivOperation_NRsp_In, (!router_floo_rsp_in[r].valid | (router_floo_rsp_in[r].rsp.generic.hdr.collective_op == Unicast))) - `ASSERT(NoCollectivOperation_NWide_In, (!router_floo_wide_in[r].valid | (router_floo_wide_in[r].wide.generic.hdr.collective_op == Unicast))) - `ASSERT(NoCollectivOperation_NReq_Out, (!router_floo_req_out[r].valid | (router_floo_req_out[r].req.generic.hdr.collective_op == Unicast))) - `ASSERT(NoCollectivOperation_NRsp_Out, (!router_floo_rsp_out[r].valid | (router_floo_rsp_out[r].rsp.generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NReq_In, (!router_floo_req_in[r].valid | (router_floo_req_in[r].req[0].generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NRsp_In, (!router_floo_rsp_in[r].valid | (router_floo_rsp_in[r].rsp[0].generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_In, (!router_floo_wide_in[r].valid | (router_floo_wide_in[r].wide[0].generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NReq_Out, (!router_floo_req_out[r].valid | (router_floo_req_out[r].req[0].generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NRsp_Out, (!router_floo_rsp_out[r].valid | (router_floo_rsp_out[r].rsp[0].generic.hdr.collective_op == Unicast))) `ASSERT(NoCollectivOperation_NWide_Out, (!router_floo_wide_out[r].valid | (router_floo_wide_out[r].wide[0].generic.hdr.collective_op == Unicast))) end diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 54cb9c0b..0d12a1be 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -340,14 +340,15 @@ module cluster_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; floo_wide_t [Eject:North] router_floo_wide_in; - 
floo_wide_double_t [Eject:North] router_floo_wide_out; + floo_wide_t [Eject:North] router_floo_wide_out; floo_nw_router #( .AxiCfgN (AxiCfgN), .AxiCfgW (AxiCfgW), .RouteAlgo (RouteCfg.RouteAlgo), - .EnDecoupledRW (1'b1), + .NumWideVirtChannels (2), + .NumWidePhysChannels (2), .NoLoopback (1'b0), .NumRoutes (5), .InFifoDepth (2), @@ -357,7 +358,6 @@ module cluster_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .floo_wide_out_t (floo_wide_double_t), .RdWideOperation_t (floo_pkg::collect_op_e), .RdNarrowOperation_t (floo_pkg::collect_op_e), .RdWideData_t (RdDataWide_t), @@ -402,13 +402,8 @@ module cluster_tile assign router_floo_req_in[West:North] = floo_req_i; assign floo_rsp_o = router_floo_rsp_out[West:North]; assign router_floo_rsp_in[West:North] = floo_rsp_i; - // Only the local port uses both physical channels. Other outputs use only the lower. - for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o - assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; - assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; - assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; - end assign router_floo_wide_in[West:North] = floo_wide_i; + assign floo_wide_o[West:North] =router_floo_wide_out[West:North]; ///////////// // Chimney // @@ -422,6 +417,7 @@ module cluster_tile .RouteCfg (floo_picobello_noc_pkg::RouteCfg), .AtopSupport (1'b1), .EnDecoupledRW (1'b1), + .NumWidePhysChannels (2), .MaxAtomicTxns (3), .Sam (picobello_pkg::SamMcast), .id_t (floo_picobello_noc_pkg::id_t), @@ -441,7 +437,6 @@ module cluster_tile .floo_req_t (floo_picobello_noc_pkg::floo_req_t), .floo_rsp_t (floo_picobello_noc_pkg::floo_rsp_t), .floo_wide_t (floo_picobello_noc_pkg::floo_wide_t), - .floo_wide_in_t (floo_wide_double_t), .sram_cfg_t (snitch_cluster_pkg::sram_cfg_t), .user_narrow_struct_t (picobello_pkg::collective_narrow_user_t), .user_wide_struct_t (picobello_pkg::collective_wide_user_t) diff --git a/hw/dummy_tile.sv 
b/hw/dummy_tile.sv index 3d3c8a20..2e4eaee6 100644 --- a/hw/dummy_tile.sv +++ b/hw/dummy_tile.sv @@ -30,7 +30,7 @@ module dummy_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; floo_wide_t [Eject:North] router_floo_wide_in; - floo_wide_double_t [Eject:North] router_floo_wide_out; + floo_wide_t [Eject:North] router_floo_wide_out; floo_nw_router #( .AxiCfgN (AxiCfgN), @@ -44,8 +44,9 @@ module dummy_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .floo_wide_out_t (floo_wide_double_t), - .EnDecoupledRW (1'b1) + // .floo_wide_out_t (floo_wide_double_t), + .NumWideVirtChannels (2), + .NumWidePhysChannels (2) ) i_router ( .clk_i, .rst_ni, @@ -83,12 +84,13 @@ module dummy_tile assign floo_rsp_o = router_floo_rsp_out[West:North]; assign router_floo_rsp_in[West:North] = floo_rsp_i; // Only the local port uses both physical channels. Other outputs use only the lower. - for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o - assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; - assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; - assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; - end + // for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o + // assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; + // assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; + // assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; + // end assign router_floo_wide_in[West:North] = floo_wide_i; + assign floo_wide_o[West:North] =router_floo_wide_out[West:North]; // Tie the router’s Eject input ports to 0 assign router_floo_req_in[Eject] = '0; diff --git a/hw/fhg_spu_tile.sv b/hw/fhg_spu_tile.sv index 03c8db90..60eb4221 100644 --- a/hw/fhg_spu_tile.sv +++ b/hw/fhg_spu_tile.sv @@ -48,7 +48,7 @@ module fhg_spu_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t 
[Eject:North] router_floo_rsp_out, router_floo_rsp_in; floo_wide_t [Eject:North] router_floo_wide_in; - floo_wide_double_t [Eject:North] router_floo_wide_out; + floo_wide_t [Eject:North] router_floo_wide_out; floo_nw_router #( .AxiCfgN (AxiCfgN), @@ -62,8 +62,9 @@ module fhg_spu_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .floo_wide_out_t (floo_wide_double_t), - .EnDecoupledRW (1'b1) + // .floo_wide_out_t (floo_wide_double_t), + .NumWideVirtChannels (2), + .NumWidePhysChannels (2) ) i_router ( .clk_i, .rst_ni, @@ -108,12 +109,14 @@ module fhg_spu_tile assign router_floo_rsp_in[South] = '0; // No South port in this tile assign router_floo_rsp_in[East] = '0; // No East port in this tile assign router_floo_rsp_in[North] = floo_rsp_north_i; - assign floo_wide_west_o.valid = router_floo_wide_out[West].valid; - assign floo_wide_west_o.ready = router_floo_wide_out[West].ready; - assign floo_wide_west_o.wide = router_floo_wide_out[West].wide[0]; - assign floo_wide_north_o.valid = router_floo_wide_out[North].valid; - assign floo_wide_north_o.ready = router_floo_wide_out[North].ready; - assign floo_wide_north_o.wide = router_floo_wide_out[North].wide[0]; + // assign floo_wide_west_o.valid = router_floo_wide_out[West].valid; + // assign floo_wide_west_o.ready = router_floo_wide_out[West].ready; + // assign floo_wide_west_o.wide = router_floo_wide_out[West].wide[0]; + // assign floo_wide_north_o.valid = router_floo_wide_out[North].valid; + // assign floo_wide_north_o.ready = router_floo_wide_out[North].ready; + // assign floo_wide_north_o.wide = router_floo_wide_out[North].wide[0]; + assign floo_wide_west_o = router_floo_wide_out[West]; + assign floo_wide_north_o = router_floo_wide_out[North]; assign router_floo_wide_in[West] = floo_wide_west_i; assign router_floo_wide_in[South] = '0; // No South port in this tile assign router_floo_wide_in[East] = '0; // No East port in this tile diff --git a/hw/mem_tile.sv b/hw/mem_tile.sv index 
c6080daf..b70ae58d 100644 --- a/hw/mem_tile.sv +++ b/hw/mem_tile.sv @@ -47,7 +47,7 @@ module mem_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; floo_wide_t [Eject:North] router_floo_wide_in; - floo_wide_double_t [Eject:North] router_floo_wide_out; + floo_wide_t [Eject:North] router_floo_wide_out; floo_nw_router #( .AxiCfgN (AxiCfgN), @@ -61,8 +61,9 @@ module mem_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .floo_wide_out_t (floo_wide_double_t), - .EnDecoupledRW (1'b1) + // .floo_wide_out_t (floo_wide_double_t), + .NumWideVirtChannels (2), + .NumWidePhysChannels (2) ) i_router ( .clk_i, .rst_ni, @@ -100,12 +101,13 @@ module mem_tile assign floo_rsp_o = router_floo_rsp_out[West:North]; assign router_floo_rsp_in[West:North] = floo_rsp_i; // Only the local port uses both physical channels. Other outputs use only the lower. - for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o - assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; - assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; - assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; - end + // for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o + // assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; + // assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; + // assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; + // end assign router_floo_wide_in[West:North] = floo_wide_i; + assign floo_wide_o[West:North] =router_floo_wide_out[West:North]; ///////////// // Chimney // @@ -124,6 +126,7 @@ module mem_tile .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), .EnDecoupledRW (1'b1), + .NumWidePhysChannels (2), .MaxAtomicTxns (1), .Sam (Sam), .id_t (id_t), @@ -141,7 +144,7 @@ module mem_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .floo_wide_in_t (floo_wide_double_t), + // 
.floo_wide_in_t (floo_wide_double_t), .user_narrow_struct_t (collective_narrow_user_t), .user_wide_struct_t (collective_wide_user_t) ) i_chimney ( @@ -505,20 +508,20 @@ module mem_tile `endif // Add Assertion that no multicast / reduction can enter this tile! for (genvar r = 0; r < 4; r++) begin : gen_route_assertions - `ASSERT(NoCollectivOperation_NReq_In, (!floo_req_i[r].valid | (floo_req_i[r].req.generic.hdr.collective_op == Unicast)), + `ASSERT(NoCollectivOperation_NReq_In, (!floo_req_i[r].valid | (floo_req_i[r].req[0].generic.hdr.collective_op == Unicast)), clk_i, !rst_ni, - $sformatf("Unsupported collective attempted with destination: %h", floo_req_i[r].req.narrow_aw.payload.addr)) - `ASSERT(NoCollectivOperation_NRsp_In, (!floo_rsp_i[r].valid | (floo_rsp_i[r].rsp.generic.hdr.collective_op == Unicast))) - `ASSERT(NoCollectivOperation_NWide_In, (!floo_wide_i[r].valid | (floo_wide_i[r].wide.generic.hdr.collective_op == Unicast)), + $sformatf("Unsupported collective attempted with destination: %h", floo_req_i[r].req[0].narrow_aw.payload.addr)) + `ASSERT(NoCollectivOperation_NRsp_In, (!floo_rsp_i[r].valid | (floo_rsp_i[r].rsp[0].generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_In, (!floo_wide_i[r].valid | (floo_wide_i[r].wide[0].generic.hdr.collective_op == Unicast)), clk_i, !rst_ni, - $sformatf("Unsupported collective attempted with destination: %h", floo_wide_i[r].wide.wide_aw.payload.addr)) - `ASSERT(NoCollectivOperation_NReq_Out, (!floo_req_o[r].valid | (floo_req_o[r].req.generic.hdr.collective_op == Unicast)), + $sformatf("Unsupported collective attempted with destination: %h", floo_wide_i[r].wide[0].wide_aw.payload.addr)) + `ASSERT(NoCollectivOperation_NReq_Out, (!floo_req_o[r].valid | (floo_req_o[r].req[0].generic.hdr.collective_op == Unicast)), clk_i, !rst_ni, - $sformatf("Unsupported collective attempted with destination: %h", floo_req_o[r].req.narrow_aw.payload.addr)) - `ASSERT(NoCollectivOperation_NRsp_Out, 
(!floo_rsp_o[r].valid | (floo_rsp_o[r].rsp.generic.hdr.collective_op == Unicast))) - `ASSERT(NoCollectivOperation_NWide_Out, (!floo_wide_o[r].valid | (floo_wide_o[r].wide.generic.hdr.collective_op == Unicast)), + $sformatf("Unsupported collective attempted with destination: %h", floo_req_o[r].req[0].narrow_aw.payload.addr)) + `ASSERT(NoCollectivOperation_NRsp_Out, (!floo_rsp_o[r].valid | (floo_rsp_o[r].rsp[0].generic.hdr.collective_op == Unicast))) + `ASSERT(NoCollectivOperation_NWide_Out, (!floo_wide_o[r].valid | (floo_wide_o[r].wide[0].generic.hdr.collective_op == Unicast)), clk_i, !rst_ni, - $sformatf("Unsupported collective attempted with destination: %h", floo_wide_i[r].wide.wide_aw.payload.addr)) + $sformatf("Unsupported collective attempted with destination: %h", floo_wide_i[r].wide[0].wide_aw.payload.addr)) end endmodule diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index cbb208cc..7d7f6cf0 100644 --- a/hw/picobello_pkg.sv +++ b/hw/picobello_pkg.sv @@ -409,12 +409,12 @@ package picobello_pkg; floo_pkg::collect_op_e collective_op; } collective_wide_user_t; - // TODO (colluca): Merge with floo_wide - typedef struct packed { - logic [1:0] valid; - logic [1:0] ready; - floo_wide_chan_t [1:0] wide; - } floo_wide_double_t; + // // TODO (colluca): Merge with floo_wide + // typedef struct packed { + // logic [1:0] valid; + // logic [1:0] ready; + // floo_wide_chan_t [1:0] wide; + // } floo_wide_double_t; // Configurations for the Reductions // Stupid asolution which allows me to overwrite the Reduction confiuration without endagering everything diff --git a/hw/spm_tile.sv b/hw/spm_tile.sv index 99c9c8f5..14e9fcce 100644 --- a/hw/spm_tile.sv +++ b/hw/spm_tile.sv @@ -82,7 +82,7 @@ module spm_tile floo_req_t [Eject:North] router_floo_req_out, router_floo_req_in; floo_rsp_t [Eject:North] router_floo_rsp_out, router_floo_rsp_in; floo_wide_t [Eject:North] router_floo_wide_in; - floo_wide_double_t [Eject:North] router_floo_wide_out; + floo_wide_t [Eject:North] 
router_floo_wide_out; floo_nw_router #( @@ -97,8 +97,9 @@ module spm_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .floo_wide_out_t (floo_wide_double_t), - .EnDecoupledRW (1'b1) + // .floo_wide_out_t (floo_wide_double_t), + .NumWideVirtChannels (2), + .NumWidePhysChannels (2) ) i_router ( .clk_i, .rst_ni, @@ -136,12 +137,13 @@ module spm_tile assign floo_rsp_o = router_floo_rsp_out[West:North]; assign router_floo_rsp_in[West:North] = floo_rsp_i; // Only the local port uses both physical channels. Other outputs use only the lower. - for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o - assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; - assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; - assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; - end + // for (genvar i = North; i <= West; i++) begin : gen_floo_wide_o + // assign floo_wide_o[i].valid = router_floo_wide_out[i].valid; + // assign floo_wide_o[i].ready = router_floo_wide_out[i].ready; + // assign floo_wide_o[i].wide = router_floo_wide_out[i].wide[0]; + // end assign router_floo_wide_in[West:North] = floo_wide_i; + assign floo_wide_o[West:North] =router_floo_wide_out[West:North]; ///////////// // Chimney // @@ -163,6 +165,7 @@ module spm_tile .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), .EnDecoupledRW (1'b1), + .NumWidePhysChannels (2), .MaxAtomicTxns (1), .Sam (Sam), .id_t (id_t), @@ -180,7 +183,7 @@ module spm_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .floo_wide_in_t (floo_wide_double_t), + // .floo_wide_in_t (floo_wide_double_t), .user_narrow_struct_t (collective_narrow_user_t), .user_wide_struct_t (collective_wide_user_t) ) i_chimney ( From b5bc432e7ff3828606e5f858bc6a71482bc019c9 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Mon, 24 Nov 2025 20:54:11 +0100 Subject: [PATCH 64/77] hw: Align with credit based VC --- hw/cheshire_tile.sv | 8 +++++--- hw/cluster_tile.sv | 8 
+++++--- hw/dummy_tile.sv | 5 +++-- hw/fhg_spu_tile.sv | 5 +++-- hw/mem_tile.sv | 8 +++++--- hw/picobello_pkg.sv | 4 ++++ hw/spm_tile.sv | 8 +++++--- 7 files changed, 30 insertions(+), 16 deletions(-) diff --git a/hw/cheshire_tile.sv b/hw/cheshire_tile.sv index 89970006..c0cd85f1 100644 --- a/hw/cheshire_tile.sv +++ b/hw/cheshire_tile.sv @@ -122,8 +122,9 @@ module cheshire_tile .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), // .floo_wide_out_t (floo_wide_double_t), - .NumWideVirtChannels (2), - .NumWidePhysChannels (2) + .NumWideVirtChannels (NumWideVirtChannels), + .NumWidePhysChannels (NumWidePhysChannels), + .EnWideCreditSupport (EnWideCreditSupport) ) i_router ( .clk_i, .rst_ni, @@ -204,7 +205,8 @@ module cheshire_tile .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), .EnDecoupledRW (1'b1), - .NumWidePhysChannels (2), + .NumWidePhysChannels (NumWidePhysChannels), + .EnWideCreditSupport (EnWideCreditSupport), .MaxAtomicTxns (AxiCfgN.OutIdWidth - 1), .Sam (Sam), .id_t (id_t), diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 0d12a1be..f2a90ca9 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -347,8 +347,9 @@ module cluster_tile .AxiCfgN (AxiCfgN), .AxiCfgW (AxiCfgW), .RouteAlgo (RouteCfg.RouteAlgo), - .NumWideVirtChannels (2), - .NumWidePhysChannels (2), + .NumWideVirtChannels (NumWideVirtChannels), + .NumWidePhysChannels (NumWidePhysChannels), + .EnWideCreditSupport (EnWideCreditSupport), .NoLoopback (1'b0), .NumRoutes (5), .InFifoDepth (2), @@ -417,7 +418,8 @@ module cluster_tile .RouteCfg (floo_picobello_noc_pkg::RouteCfg), .AtopSupport (1'b1), .EnDecoupledRW (1'b1), - .NumWidePhysChannels (2), + .EnWideCreditSupport (EnWideCreditSupport), + .NumWidePhysChannels (NumWidePhysChannels), .MaxAtomicTxns (3), .Sam (picobello_pkg::SamMcast), .id_t (floo_picobello_noc_pkg::id_t), diff --git a/hw/dummy_tile.sv b/hw/dummy_tile.sv index 2e4eaee6..fce44812 100644 --- a/hw/dummy_tile.sv +++ b/hw/dummy_tile.sv @@ -45,8 +45,9 @@ module dummy_tile 
.floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), // .floo_wide_out_t (floo_wide_double_t), - .NumWideVirtChannels (2), - .NumWidePhysChannels (2) + .NumWideVirtChannels (NumWideVirtChannels), + .NumWidePhysChannels (NumWidePhysChannels), + .EnWideCreditSupport (EnWideCreditSupport) ) i_router ( .clk_i, .rst_ni, diff --git a/hw/fhg_spu_tile.sv b/hw/fhg_spu_tile.sv index 60eb4221..67f779f6 100644 --- a/hw/fhg_spu_tile.sv +++ b/hw/fhg_spu_tile.sv @@ -63,8 +63,9 @@ module fhg_spu_tile .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), // .floo_wide_out_t (floo_wide_double_t), - .NumWideVirtChannels (2), - .NumWidePhysChannels (2) + .NumWideVirtChannels (NumWideVirtChannels), + .NumWidePhysChannels (NumWidePhysChannels), + .EnWideCreditSupport (EnWideCreditSupport) ) i_router ( .clk_i, .rst_ni, diff --git a/hw/mem_tile.sv b/hw/mem_tile.sv index b70ae58d..732573e1 100644 --- a/hw/mem_tile.sv +++ b/hw/mem_tile.sv @@ -62,8 +62,9 @@ module mem_tile .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), // .floo_wide_out_t (floo_wide_double_t), - .NumWideVirtChannels (2), - .NumWidePhysChannels (2) + .NumWideVirtChannels (NumWideVirtChannels), + .NumWidePhysChannels (NumWidePhysChannels), + .EnWideCreditSupport (EnWideCreditSupport) ) i_router ( .clk_i, .rst_ni, @@ -126,7 +127,8 @@ module mem_tile .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), .EnDecoupledRW (1'b1), - .NumWidePhysChannels (2), + .EnWideCreditSupport (EnWideCreditSupport), + .NumWidePhysChannels (NumWidePhysChannels), .MaxAtomicTxns (1), .Sam (Sam), .id_t (id_t), diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index 7d7f6cf0..87442f52 100644 --- a/hw/picobello_pkg.sv +++ b/hw/picobello_pkg.sv @@ -416,6 +416,10 @@ package picobello_pkg; // floo_wide_chan_t [1:0] wide; // } floo_wide_double_t; + localparam int unsigned NumWideVirtChannels = 2; + localparam int unsigned NumWidePhysChannels = 1; + localparam bit EnWideCreditSupport = 0; + // Configurations for the Reductions // Stupid asolution 
which allows me to overwrite the Reduction confiuration without endagering everything // ATTENTION: diff --git a/hw/spm_tile.sv b/hw/spm_tile.sv index 14e9fcce..dbcbfaf4 100644 --- a/hw/spm_tile.sv +++ b/hw/spm_tile.sv @@ -98,8 +98,9 @@ module spm_tile .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), // .floo_wide_out_t (floo_wide_double_t), - .NumWideVirtChannels (2), - .NumWidePhysChannels (2) + .NumWideVirtChannels (NumWideVirtChannels), + .NumWidePhysChannels (NumWidePhysChannels), + .EnWideCreditSupport (EnWideCreditSupport) ) i_router ( .clk_i, .rst_ni, @@ -165,7 +166,8 @@ module spm_tile .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), .EnDecoupledRW (1'b1), - .NumWidePhysChannels (2), + .NumWidePhysChannels (NumWidePhysChannels), + .EnWideCreditSupport (EnWideCreditSupport), .MaxAtomicTxns (1), .Sam (Sam), .id_t (id_t), From f38c1901132991d61512be030bfae5599cd74cae Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Tue, 25 Nov 2025 14:00:28 +0100 Subject: [PATCH 65/77] hw: Add UnstableValid VC implementation --- hw/cheshire_tile.sv | 4 ++-- hw/cluster_tile.sv | 4 ++-- hw/dummy_tile.sv | 2 +- hw/fhg_spu_tile.sv | 2 +- hw/mem_tile.sv | 4 ++-- hw/picobello_pkg.sv | 2 +- hw/spm_tile.sv | 4 ++-- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/hw/cheshire_tile.sv b/hw/cheshire_tile.sv index c0cd85f1..bbfb9297 100644 --- a/hw/cheshire_tile.sv +++ b/hw/cheshire_tile.sv @@ -124,7 +124,7 @@ module cheshire_tile // .floo_wide_out_t (floo_wide_double_t), .NumWideVirtChannels (NumWideVirtChannels), .NumWidePhysChannels (NumWidePhysChannels), - .EnWideCreditSupport (EnWideCreditSupport) + .VcImplementation (VcImplementation) ) i_router ( .clk_i, .rst_ni, @@ -206,7 +206,7 @@ module cheshire_tile .AtopSupport (1'b1), .EnDecoupledRW (1'b1), .NumWidePhysChannels (NumWidePhysChannels), - .EnWideCreditSupport (EnWideCreditSupport), + .VcImplementation (VcImplementation), .MaxAtomicTxns (AxiCfgN.OutIdWidth - 1), .Sam (Sam), .id_t (id_t), diff --git 
a/hw/cluster_tile.sv b/hw/cluster_tile.sv index f2a90ca9..6e0cfae5 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -349,7 +349,7 @@ module cluster_tile .RouteAlgo (RouteCfg.RouteAlgo), .NumWideVirtChannels (NumWideVirtChannels), .NumWidePhysChannels (NumWidePhysChannels), - .EnWideCreditSupport (EnWideCreditSupport), + .VcImplementation (VcImplementation), .NoLoopback (1'b0), .NumRoutes (5), .InFifoDepth (2), @@ -418,7 +418,7 @@ module cluster_tile .RouteCfg (floo_picobello_noc_pkg::RouteCfg), .AtopSupport (1'b1), .EnDecoupledRW (1'b1), - .EnWideCreditSupport (EnWideCreditSupport), + .VcImplementation (VcImplementation), .NumWidePhysChannels (NumWidePhysChannels), .MaxAtomicTxns (3), .Sam (picobello_pkg::SamMcast), diff --git a/hw/dummy_tile.sv b/hw/dummy_tile.sv index fce44812..e4641218 100644 --- a/hw/dummy_tile.sv +++ b/hw/dummy_tile.sv @@ -47,7 +47,7 @@ module dummy_tile // .floo_wide_out_t (floo_wide_double_t), .NumWideVirtChannels (NumWideVirtChannels), .NumWidePhysChannels (NumWidePhysChannels), - .EnWideCreditSupport (EnWideCreditSupport) + .VcImplementation (VcImplementation) ) i_router ( .clk_i, .rst_ni, diff --git a/hw/fhg_spu_tile.sv b/hw/fhg_spu_tile.sv index 67f779f6..45d1a88b 100644 --- a/hw/fhg_spu_tile.sv +++ b/hw/fhg_spu_tile.sv @@ -65,7 +65,7 @@ module fhg_spu_tile // .floo_wide_out_t (floo_wide_double_t), .NumWideVirtChannels (NumWideVirtChannels), .NumWidePhysChannels (NumWidePhysChannels), - .EnWideCreditSupport (EnWideCreditSupport) + .VcImplementation (VcImplementation) ) i_router ( .clk_i, .rst_ni, diff --git a/hw/mem_tile.sv b/hw/mem_tile.sv index 732573e1..f1946ada 100644 --- a/hw/mem_tile.sv +++ b/hw/mem_tile.sv @@ -64,7 +64,7 @@ module mem_tile // .floo_wide_out_t (floo_wide_double_t), .NumWideVirtChannels (NumWideVirtChannels), .NumWidePhysChannels (NumWidePhysChannels), - .EnWideCreditSupport (EnWideCreditSupport) + .VcImplementation (VcImplementation) ) i_router ( .clk_i, .rst_ni, @@ -127,7 +127,7 @@ module mem_tile 
.RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), .EnDecoupledRW (1'b1), - .EnWideCreditSupport (EnWideCreditSupport), + .VcImplementation (VcImplementation), .NumWidePhysChannels (NumWidePhysChannels), .MaxAtomicTxns (1), .Sam (Sam), diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index 87442f52..9a226900 100644 --- a/hw/picobello_pkg.sv +++ b/hw/picobello_pkg.sv @@ -418,7 +418,7 @@ package picobello_pkg; localparam int unsigned NumWideVirtChannels = 2; localparam int unsigned NumWidePhysChannels = 1; - localparam bit EnWideCreditSupport = 0; + localparam floo_pkg::vc_impl_e VcImplementation = floo_pkg::VcUnstableValid; // Configurations for the Reductions // Stupid asolution which allows me to overwrite the Reduction confiuration without endagering everything diff --git a/hw/spm_tile.sv b/hw/spm_tile.sv index dbcbfaf4..1833d558 100644 --- a/hw/spm_tile.sv +++ b/hw/spm_tile.sv @@ -100,7 +100,7 @@ module spm_tile // .floo_wide_out_t (floo_wide_double_t), .NumWideVirtChannels (NumWideVirtChannels), .NumWidePhysChannels (NumWidePhysChannels), - .EnWideCreditSupport (EnWideCreditSupport) + .VcImplementation (VcImplementation) ) i_router ( .clk_i, .rst_ni, @@ -167,7 +167,7 @@ module spm_tile .AtopSupport (1'b1), .EnDecoupledRW (1'b1), .NumWidePhysChannels (NumWidePhysChannels), - .EnWideCreditSupport (EnWideCreditSupport), + .VcImplementation (VcImplementation), .MaxAtomicTxns (1), .Sam (Sam), .id_t (id_t), From dcc68f8c39743d6b53399087a9b4f076ad85607f Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Mon, 24 Nov 2025 20:35:19 +0100 Subject: [PATCH 66/77] vsim.mk: Use `BENDER` variable --- target/sim/vsim/vsim.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/sim/vsim/vsim.mk b/target/sim/vsim/vsim.mk index ec5f216f..b57e28a5 100644 --- a/target/sim/vsim/vsim.mk +++ b/target/sim/vsim/vsim.mk @@ -45,7 +45,7 @@ vsim-compile: $(VSIM_DIR)/compile.tcl $(PB_HW_ALL) $(VSIM) -c $(VSIM_FLAGS) -do "source $<; quit" $(VSIM_DIR)/compile.tcl: 
$(BENDER_YML) $(BENDER_LOCK) - bender script vsim --compilation-mode common $(COMMON_TARGS) $(SIM_TARGS) --vlog-arg="$(VLOG_ARGS)"> $@ + $(BENDER) script vsim --compilation-mode common $(COMMON_TARGS) $(SIM_TARGS) --vlog-arg="$(VLOG_ARGS)"> $@ echo 'vlog -work $(VSIM_WORK) "$(realpath $(CHS_ROOT))/target/sim/src/elfloader.cpp" -ccflags "-std=c++11"' >> $@ vsim-run: From 48605dfa3734ae7b123abb621bbbc5fd3c8d4f76 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Wed, 17 Dec 2025 13:34:46 +0100 Subject: [PATCH 67/77] hw: Align picobello with FlooNoC VC --- Bender.yml | 2 +- floo_picobello_noc_pkg_REDUCTION.sv | 2 +- hw/cheshire_tile.sv | 10 ++++------ hw/cluster_tile.sv | 10 ++++------ hw/dummy_tile.sv | 3 +-- hw/fhg_spu_tile.sv | 5 ++--- hw/mem_tile.sv | 10 ++++------ hw/picobello_pkg.sv | 17 +++++++++-------- hw/spm_tile.sv | 10 ++++------ 9 files changed, 30 insertions(+), 39 deletions(-) diff --git a/Bender.yml b/Bender.yml index 33298873..88dc00a7 100644 --- a/Bender.yml +++ b/Bender.yml @@ -13,7 +13,7 @@ dependencies: common_cells: { git: "https://github.com/pulp-platform/common_cells.git", rev: "snitch" } cheshire: { git: "https://github.com/pulp-platform/cheshire.git", rev: "picobello" } snitch_cluster: { git: "https://github.com/pulp-platform/snitch_cluster.git", rev: "main" } - floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: "feature/reduction" } + floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: "reduction-vc-rebase" } obi: { git: "https://github.com/pulp-platform/obi.git", rev: "acfcd0f80c7539aa8da7821a66d9acf2074a5b4e" } redmule: { git: "https://github.com/pulp-platform/redmule.git", rev: "picobello" } hci: { git: "https://github.com/pulp-platform/hci.git", rev: "06fcba671e060f2e1b03b7ebe2d3e719f1557099" } diff --git a/floo_picobello_noc_pkg_REDUCTION.sv b/floo_picobello_noc_pkg_REDUCTION.sv index 21badec9..baa39271 100644 --- a/floo_picobello_noc_pkg_REDUCTION.sv +++ b/floo_picobello_noc_pkg_REDUCTION.sv @@ 
-337,7 +337,7 @@ package floo_picobello_noc_pkg; `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_in, axi_wide_in, AxiCfgN, AxiCfgW, hdr_t) - `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 1, 2, 2) + `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 2, 1) endpackage diff --git a/hw/cheshire_tile.sv b/hw/cheshire_tile.sv index bbfb9297..10935d95 100644 --- a/hw/cheshire_tile.sv +++ b/hw/cheshire_tile.sv @@ -122,9 +122,8 @@ module cheshire_tile .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), // .floo_wide_out_t (floo_wide_double_t), - .NumWideVirtChannels (NumWideVirtChannels), - .NumWidePhysChannels (NumWidePhysChannels), - .VcImplementation (VcImplementation) + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation) ) i_router ( .clk_i, .rst_ni, @@ -204,9 +203,8 @@ module cheshire_tile .ChimneyCfgW (ChimneyCfgW), .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), - .EnDecoupledRW (1'b1), - .NumWidePhysChannels (NumWidePhysChannels), - .VcImplementation (VcImplementation), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation), .MaxAtomicTxns (AxiCfgN.OutIdWidth - 1), .Sam (Sam), .id_t (id_t), diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 6e0cfae5..00b313d4 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -347,9 +347,8 @@ module cluster_tile .AxiCfgN (AxiCfgN), .AxiCfgW (AxiCfgW), .RouteAlgo (RouteCfg.RouteAlgo), - .NumWideVirtChannels (NumWideVirtChannels), - .NumWidePhysChannels (NumWidePhysChannels), - .VcImplementation (VcImplementation), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation), .NoLoopback (1'b0), .NumRoutes (5), .InFifoDepth (2), @@ -417,9 +416,8 @@ module cluster_tile .ChimneyCfgW (floo_pkg::ChimneyDefaultCfg), .RouteCfg (floo_picobello_noc_pkg::RouteCfg), .AtopSupport (1'b1), - .EnDecoupledRW (1'b1), - .VcImplementation (VcImplementation), - .NumWidePhysChannels (NumWidePhysChannels), + .WideRwDecouple (WideRwDecouple), + .VcImpl 
(VcImplementation), .MaxAtomicTxns (3), .Sam (picobello_pkg::SamMcast), .id_t (floo_picobello_noc_pkg::id_t), diff --git a/hw/dummy_tile.sv b/hw/dummy_tile.sv index e4641218..cfe19817 100644 --- a/hw/dummy_tile.sv +++ b/hw/dummy_tile.sv @@ -45,8 +45,7 @@ module dummy_tile .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), // .floo_wide_out_t (floo_wide_double_t), - .NumWideVirtChannels (NumWideVirtChannels), - .NumWidePhysChannels (NumWidePhysChannels), + .WideRwDecouple (WideRwDecouple), .VcImplementation (VcImplementation) ) i_router ( .clk_i, diff --git a/hw/fhg_spu_tile.sv b/hw/fhg_spu_tile.sv index 45d1a88b..5bd200e6 100644 --- a/hw/fhg_spu_tile.sv +++ b/hw/fhg_spu_tile.sv @@ -63,9 +63,8 @@ module fhg_spu_tile .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), // .floo_wide_out_t (floo_wide_double_t), - .NumWideVirtChannels (NumWideVirtChannels), - .NumWidePhysChannels (NumWidePhysChannels), - .VcImplementation (VcImplementation) + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation) ) i_router ( .clk_i, .rst_ni, diff --git a/hw/mem_tile.sv b/hw/mem_tile.sv index f1946ada..09852e93 100644 --- a/hw/mem_tile.sv +++ b/hw/mem_tile.sv @@ -62,9 +62,8 @@ module mem_tile .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), // .floo_wide_out_t (floo_wide_double_t), - .NumWideVirtChannels (NumWideVirtChannels), - .NumWidePhysChannels (NumWidePhysChannels), - .VcImplementation (VcImplementation) + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation) ) i_router ( .clk_i, .rst_ni, @@ -126,9 +125,8 @@ module mem_tile .ChimneyCfgW (set_ports(ChimneyDefaultCfg, 1'b1, 1'b0)), .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), - .EnDecoupledRW (1'b1), - .VcImplementation (VcImplementation), - .NumWidePhysChannels (NumWidePhysChannels), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation), .MaxAtomicTxns (1), .Sam (Sam), .id_t (id_t), diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index 9a226900..bacbd54a 100644 --- 
a/hw/picobello_pkg.sv +++ b/hw/picobello_pkg.sv @@ -270,7 +270,7 @@ package picobello_pkg; typedef struct packed { logic [5:0] offset; logic [2:0] len; - logic [2:0] grp_base_id; + logic [2:0] base_id; } mask_sel_t; typedef struct packed { @@ -325,16 +325,16 @@ package picobello_pkg; sam_multicast[rule].idx.mask_x = '{ offset: offset_id_x, len: len_id_x, - grp_base_id: empty_cols + base_id: empty_cols }; sam_multicast[rule].idx.mask_y = '{ offset: offset_id_y, len: len_id_y, - grp_base_id: empty_rows + base_id: empty_rows }; end else begin - sam_multicast[rule].idx.mask_x = '{offset: '0, len: '0, grp_base_id: 0}; - sam_multicast[rule].idx.mask_y = '{offset: '0, len: '0, grp_base_id: 0}; + sam_multicast[rule].idx.mask_x = '{offset: '0, len: '0, base_id: 0}; + sam_multicast[rule].idx.mask_y = '{offset: '0, len: '0, base_id: 0}; end end @@ -364,9 +364,9 @@ package picobello_pkg; $write(" { idx: { id: {x: %0d, y: %0d, port: %0d},", SamMcast[i].idx.id.x, SamMcast[i].idx.id.y, SamMcast[i].idx.id.port_id); $write(" mask_x: {offset: %0d, len: %0d, base_id: %0d},", SamMcast[i].idx.mask_x.offset, - SamMcast[i].idx.mask_x.len, SamMcast[i].idx.mask_x.grp_base_id); + SamMcast[i].idx.mask_x.len, SamMcast[i].idx.mask_x.base_id); $write(" mask_y: {offset: %0d, len: %0d, base_id: %0d} },", SamMcast[i].idx.mask_y.offset, - SamMcast[i].idx.mask_y.len, SamMcast[i].idx.mask_y.grp_base_id); + SamMcast[i].idx.mask_y.len, SamMcast[i].idx.mask_y.base_id); $write("start: 0x%0h, end: 0x%0h }\n", SamMcast[i].start_addr, SamMcast[i].end_addr); end $display("]"); @@ -418,7 +418,8 @@ package picobello_pkg; localparam int unsigned NumWideVirtChannels = 2; localparam int unsigned NumWidePhysChannels = 1; - localparam floo_pkg::vc_impl_e VcImplementation = floo_pkg::VcUnstableValid; + localparam floo_pkg::vc_impl_e VcImplementation = floo_pkg::VcPreemptValid; + localparam floo_pkg::wide_rw_decouple_e WideRwDecouple = floo_pkg::Vc; // Configurations for the Reductions // Stupid asolution which 
allows me to overwrite the Reduction confiuration without endagering everything diff --git a/hw/spm_tile.sv b/hw/spm_tile.sv index 1833d558..f4cf337d 100644 --- a/hw/spm_tile.sv +++ b/hw/spm_tile.sv @@ -98,9 +98,8 @@ module spm_tile .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), // .floo_wide_out_t (floo_wide_double_t), - .NumWideVirtChannels (NumWideVirtChannels), - .NumWidePhysChannels (NumWidePhysChannels), - .VcImplementation (VcImplementation) + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation) ) i_router ( .clk_i, .rst_ni, @@ -165,9 +164,8 @@ module spm_tile .ChimneyCfgW (set_ports(ChimneyDefaultCfg, bit'(!IsNarrow), 1'b0)), .RouteCfg (RouteCfgNoMcast), .AtopSupport (1'b1), - .EnDecoupledRW (1'b1), - .NumWidePhysChannels (NumWidePhysChannels), - .VcImplementation (VcImplementation), + .WideRwDecouple (WideRwDecouple), + .VcImpl (VcImplementation), .MaxAtomicTxns (1), .Sam (Sam), .id_t (id_t), From 625d71464c13e79793a37e94f8fe134fa38cd7d5 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Thu, 26 Feb 2026 15:33:53 +0100 Subject: [PATCH 68/77] Fix setuptools --- Bender.lock | 4 ++-- Makefile | 9 ++++++--- hw/cluster_tile.sv | 46 +++++++++++++++++++++++++++++----------------- iis-env.sh | 2 +- requirements.txt | 2 +- 5 files changed, 39 insertions(+), 24 deletions(-) diff --git a/Bender.lock b/Bender.lock index d9fc9ea1..c8bad46e 100644 --- a/Bender.lock +++ b/Bender.lock @@ -129,8 +129,8 @@ packages: - common_cells - register_interface cluster_icache: - revision: 64e21ae455bbdde850c4df13bef86ea55ac42537 - version: 0.2.0 + revision: 1642961b9561513f283674efc2c90f24855e8d39 + version: 0.3.1 source: Git: https://github.com/pulp-platform/cluster_icache.git dependencies: diff --git a/Makefile b/Makefile index f2ea2606..6278f56e 100644 --- a/Makefile +++ b/Makefile @@ -181,7 +181,9 @@ PB_HW_ALL += update-sn-cfg picobello-hw-all all: $(PB_HW_ALL) sn-hw-all $(MAKE) $(PB_HW_ALL) -picobello-hw-clean clean: sn-hw-clean floo-clean +picobello-hw-clean: 
sn-hw-clean floo-clean + +clean: picobello-hw-clean rm -rf $(BENDER_ROOT) ############ @@ -222,13 +224,14 @@ python-venv: .venv .venv: $(BASE_PYTHON) -m venv $@ . $@/bin/activate && \ - python -m pip install --upgrade pip setuptools && \ - python -m pip install --cache-dir $(PIP_CACHE_DIR) -r requirements.txt && \ + python -m pip install --upgrade pip "setuptools<81" && \ + python -m pip install --no-build-isolation --cache-dir $(PIP_CACHE_DIR) -r requirements.txt && \ python -m pip install --cache-dir $(PIP_CACHE_DIR) $(shell $(BENDER) path floo_noc) --no-deps && \ python -m pip install --cache-dir $(PIP_CACHE_DIR) -e "$(shell $(BENDER) path snitch_cluster)[kernels]" python-venv-clean: rm -rf .venv + rm -rf $(PIP_CACHE_DIR) verible-fmt: $(VERIBLE_FMT) $(VERIBLE_FMT_ARGS) $(shell $(BENDER) script flist $(SIM_TARGS) --no-deps) diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 00b313d4..96ab3127 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -210,23 +210,35 @@ module cluster_tile .hart_base_id_i, .cluster_base_addr_i, .cluster_base_offset_i, - .mxip_i (mxip), - .clk_d2_bypass_i ('0), - .sram_cfgs_i ('0), - .narrow_in_req_i (cluster_narrow_in_req), - .narrow_in_resp_o (cluster_narrow_in_rsp), - .narrow_out_req_o (cluster_narrow_out_req), - .narrow_out_resp_i(cluster_narrow_out_rsp), - .wide_out_req_o (cluster_wide_out_req), - .wide_out_resp_i (cluster_wide_out_rsp), - .wide_in_req_i (cluster_wide_in_req), - .wide_in_resp_o (cluster_wide_in_rsp), - .narrow_ext_req_o (cluster_narrow_ext_req), - .narrow_ext_resp_i(cluster_narrow_ext_rsp), - .tcdm_ext_req_i (cluster_tcdm_ext_req_aligned), - .tcdm_ext_resp_o (cluster_tcdm_ext_rsp_aligned), - .dca_req_i (offload_dca_req_cut), - .dca_rsp_o (offload_dca_rsp_cut) + .mxip_i (mxip), + .clk_d2_bypass_i ('0), + .sram_cfgs_i ('0), + .narrow_in_req_i (cluster_narrow_in_req), + .narrow_in_resp_o (cluster_narrow_in_rsp), + .narrow_out_req_o (cluster_narrow_out_req), + .narrow_out_resp_i (cluster_narrow_out_rsp), 
+ .wide_out_req_o (cluster_wide_out_req), + .wide_out_resp_i (cluster_wide_out_rsp), + .wide_in_req_i (cluster_wide_in_req), + .wide_in_resp_o (cluster_wide_in_rsp), + .narrow_ext_req_o (cluster_narrow_ext_req), + .narrow_ext_resp_i (cluster_narrow_ext_rsp), + .tcdm_ext_req_i (cluster_tcdm_ext_req_aligned), + .tcdm_ext_resp_o (cluster_tcdm_ext_rsp_aligned), + .dca_req_i (offload_dca_req_cut), + .dca_rsp_o (offload_dca_rsp_cut), + .x_issue_req_o (), + .x_issue_resp_i ('0), + .x_issue_valid_o (), + .x_issue_ready_i ('0), + .x_register_o (), + .x_register_valid_o (), + .x_register_ready_i ('0), + .x_commit_o (), + .x_commit_valid_o (), + .x_result_i ('0), + .x_result_valid_i ('0), + .x_result_ready_o () ); if (UseHWPE) begin : gen_hwpe diff --git a/iis-env.sh b/iis-env.sh index f0aa91b5..2a65cc08 100644 --- a/iis-env.sh +++ b/iis-env.sh @@ -21,4 +21,4 @@ if [ -z "$VIRTUAL_ENV" ] || [ "$VIRTUAL_ENV" != "$(realpath .venv)" ]; then source .venv/bin/activate fi -export PYTHONPATH=$PWD/experiments/:$PYTHONPATH \ No newline at end of file +export PYTHONPATH=$PWD/experiments/:$PYTHONPATH diff --git a/requirements.txt b/requirements.txt index 4effd42e..1b65da0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,5 +25,5 @@ hjson # for reggen # For peakrdl peakrdl -peakrdl-rawheader @ git+https://github.com/colluca/PeakRDL-rawheader.git@9a7bd727986ee2ffa5105c61aaeb5c991309d29b +peakrdl-rawheader==0.1.3 peakrdl-markdown From 064d976af6da32ecadc7be8a6c67be04f09a7177 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Thu, 26 Feb 2026 15:59:01 +0100 Subject: [PATCH 69/77] Add experiments in CI --- .gitlab-ci.yml | 26 ++++++++++++++++++++++++++ .gitlab/common.yml | 1 - Bender.lock | 4 ++-- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9a98fdb8..d588015f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -57,6 +57,9 @@ chs-sw: vsim-compile: stage: build + extends: + - .cache-deps + - .init-env needs: - generate-rtl 
variables: @@ -101,6 +104,29 @@ sim-vsim: paths: - transcript +##################### +# Run Experiments # +##################### + +sim-exps: + stage: run + extends: + - .cache-deps + - .init-env + needs: + - chs-sw + - sn-sw + - vsim-compile + parallel: + matrix: + - EXPERIMENT: multicast + - EXPERIMENT: reduction + - EXPERIMENT: barrier + script: + - cd experiments/$EXPERIMENT && ./experiments.py --actions sw run -j + artifacts: + expire_in: 7 days + ################### # Physical Design # ################### diff --git a/.gitlab/common.yml b/.gitlab/common.yml index ac4634c7..708c5cc5 100644 --- a/.gitlab/common.yml +++ b/.gitlab/common.yml @@ -68,7 +68,6 @@ variables: script: - make sn-tests DEBUG=ON - make sn-apps DEBUG=ON - - make pb-sn-tests artifacts: paths: - sw/snitch/tests/build/*.elf diff --git a/Bender.lock b/Bender.lock index c8bad46e..81acd2f7 100644 --- a/Bender.lock +++ b/Bender.lock @@ -207,10 +207,10 @@ packages: Path: .deps/fhg_spu_cluster dependencies: [] floo_noc: - revision: null + revision: 1c7f10686271c1d30b421b8f73a095304e8b61ec version: null source: - Path: working_dir/floo_noc + Git: https://github.com/pulp-platform/FlooNoC.git dependencies: - axi - axi_riscv_atomics From 40dcf3c9712faec283980693f2653bf2abd9e28d Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Fri, 27 Feb 2026 11:41:04 +0100 Subject: [PATCH 70/77] ci: Reduce experiments size for CI usage --- .gitlab-ci.yml | 2 +- experiments/barrier/experiments.py | 17 +++++++++++++---- experiments/multicast/experiments.py | 27 ++++++++++++++++++++------- experiments/reduction/experiments.py | 27 ++++++++++++++++++++------- 4 files changed, 54 insertions(+), 19 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d588015f..fe12d745 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -123,7 +123,7 @@ sim-exps: - EXPERIMENT: reduction - EXPERIMENT: barrier script: - - cd experiments/$EXPERIMENT && ./experiments.py --actions sw run -j + - cd experiments/$EXPERIMENT && 
./experiments.py --ci --actions sw run -j artifacts: expire_in: 7 days diff --git a/experiments/barrier/experiments.py b/experiments/barrier/experiments.py index fb551175..8775c6bf 100755 --- a/experiments/barrier/experiments.py +++ b/experiments/barrier/experiments.py @@ -29,12 +29,17 @@ def derive_cdefines(self, experiment): return cdefs -def gen_experiments(): +def gen_experiments(ci=False): experiments = [] # for impl in ['hw']: # for n_clusters in [8]: - for impl in ['sw', 'hw']: - for n_clusters in [2, 4, 8, 16]: + impls = ['sw', 'hw'] + n_clusters_list = [2, 4, 8, 16] + if ci: + impls = ['hw'] + n_clusters_list = [16] + for impl in impls: + for n_clusters in n_clusters_list: experiments.append({ 'app': 'barrier_benchmark', 'cmd': pb.sim_cmd(), @@ -63,7 +68,11 @@ def results(manager=None): def main(): - manager = ExperimentManager(gen_experiments(), dir=DIR) + parser = ExperimentManager.parser() + parser.add_argument('--ci', action='store_true', + help='Reduce experiment space for CI runs') + args = parser.parse_args() + manager = ExperimentManager(gen_experiments(ci=args.ci), dir=DIR, args=args, parse_args=False) manager.run() df = results(manager) print(df) diff --git a/experiments/multicast/experiments.py b/experiments/multicast/experiments.py index 9fcbd9a4..9611cfd5 100755 --- a/experiments/multicast/experiments.py +++ b/experiments/multicast/experiments.py @@ -40,15 +40,24 @@ def derive_cdefines(self, experiment): return cdefs -def gen_experiments(): +def gen_experiments(ci=False): experiments = [] - for impl in ['seq', 'tree', 'hw']: + impls = ['seq', 'tree', 'hw'] + n_rows_list = [1, 2, 4] + sizes = [1024, 2048, 4096, 8192, 16384, 32768] + n_batches_list = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] + if ci: + impls = ['hw'] + n_rows_list = [4] + sizes = [32768] + n_batches_list = [1024, 2048] + for impl in impls: # for n_rows in [1]: - for n_rows in [1, 2, 4]: + for n_rows in n_rows_list: # for size in [1024]: - for size in [1024, 2048, 4096, 
8192, 16384, 32768]: + for size in sizes: # for n_batches in [1]: - for n_batches in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]: + for n_batches in n_batches_list: # Only sequential implementation supports batching, for all other # implementations we only accept n_batches = 1 @@ -57,7 +66,7 @@ def gen_experiments(): if impl == 'seq': valid_n_batches = batch_beats > 8 and batch_beats < 1024 else: - valid_n_batches = n_batches == 1 + valid_n_batches = n_batches == 1 if not ci else n_batches in n_batches_list # If the current setting for n_batches is valid, add the experiment if valid_n_batches: @@ -128,7 +137,11 @@ def results(manager=None): def main(): - manager = ExperimentManager(gen_experiments(), dir=DIR) + parser = ExperimentManager.parser() + parser.add_argument('--ci', action='store_true', + help='Reduce experiment space for CI runs') + args = parser.parse_args() + manager = ExperimentManager(gen_experiments(ci=args.ci), dir=DIR, args=args, parse_args=False) manager.run() df = results(manager) print(df) diff --git a/experiments/reduction/experiments.py b/experiments/reduction/experiments.py index e9a141a8..4e3662b5 100755 --- a/experiments/reduction/experiments.py +++ b/experiments/reduction/experiments.py @@ -41,17 +41,26 @@ def derive_cdefines(self, experiment): return cdefs -def gen_experiments(): +def gen_experiments(ci=False): experiments = [] + impls = ['seq', 'tree', 'hw'] + n_rows_list = [1, 2, 4] + sizes = [1024, 2048, 4096, 8192, 16384, 32768] + n_batches_list = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] + if ci: + impls = ['hw'] + n_rows_list = [4] + sizes = [32768] + n_batches_list = [1024, 2048] # for impl in ['seq']: # for impl in ['tree', 'hw']: - for impl in ['seq', 'tree', 'hw']: + for impl in impls: # for n_rows in [1]: - for n_rows in [1, 2, 4]: + for n_rows in n_rows_list: # for size in [4096]: - for size in [1024, 2048, 4096, 8192, 16384, 32768]: + for size in sizes: # for n_batches in [4]: - for n_batches in [1, 2, 4, 
8, 16, 32, 64, 128, 256, 512, 1024, 2048]: + for n_batches in n_batches_list: # Only sequential and tree implementations supports batching, for all other # implementations we only accept n_batches = 1 @@ -60,7 +69,7 @@ def gen_experiments(): if impl in ['seq', 'tree']: valid_n_batches = batch_beats > 8 and batch_beats < 256 else: - valid_n_batches = n_batches == 1 + valid_n_batches = n_batches == 1 if not ci else n_batches in n_batches_list # If the current setting for n_batches is valid, add the experiment if valid_n_batches: @@ -154,7 +163,11 @@ def results(manager=None): def main(): - manager = ExperimentManager(gen_experiments(), dir=DIR) + parser = ExperimentManager.parser() + parser.add_argument('--ci', action='store_true', + help='Reduce experiment space for CI runs') + args = parser.parse_args() + manager = ExperimentManager(gen_experiments(ci=args.ci), dir=DIR, args=args, parse_args=False) manager.run() df = results(manager) print(df) From 508db96ce156bf47fb6eee86d16b02079297c3fb Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Fri, 27 Feb 2026 14:27:46 +0100 Subject: [PATCH 71/77] Improve make flow --- Makefile | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 6278f56e..2712124f 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,9 @@ L2_TILES = $(shell $(FLOO_GEN) -c $(FLOO_CFG) --query endpoints.l2_spm.num 2>/de BENDER_YML = $(PB_ROOT)/Bender.yml BENDER_LOCK = $(PB_ROOT)/Bender.lock +$(PB_GEN_DIR): + mkdir -p $@ + ################ # Bender flags # ################ @@ -62,7 +65,7 @@ $(PB_GEN_DIR)/picobello.rdl: $(FLOO_CFG) $(FLOO_GEN) -c $(FLOO_CFG) -o $(PB_GEN_DIR) --rdl --rdl-as-mem --rdl-memwidth=32 # Those are dummy RDL files, for generation without access to the PD repository. 
-$(PB_GEN_DIR)/fll.rdl $(PB_GEN_DIR)/pb_chip_regs.rdl: +$(PB_GEN_DIR)/fll.rdl $(PB_GEN_DIR)/pb_chip_regs.rdl: | $(PB_GEN_DIR) @touch $@ $(PB_GEN_DIR)/pb_addrmap.h: $(PB_GEN_DIR)/picobello.rdl $(PB_RDL_ALL) @@ -172,16 +175,14 @@ clean-pd: PB_HW_ALL += $(CHS_HW_ALL) PB_HW_ALL += $(CHS_SIM_ALL) -PB_HW_ALL += $(PB_GEN_DIR)/floo_picobello_noc_pkg.sv PB_HW_ALL += $(PB_RDL_HW_ALL) -PB_HW_ALL += update-sn-cfg -.PHONY: picobello-hw-all picobello-clean clean +.PHONY: picobello-hw-all picobello-hw-clean clean -picobello-hw-all all: $(PB_HW_ALL) sn-hw-all - $(MAKE) $(PB_HW_ALL) +picobello-hw-all all: $(PB_HW_ALL) update-sn-cfg sn-hw-all floo-hw-all picobello-hw-clean: sn-hw-clean floo-clean + rm -rf $(PB_HW_ALL) clean: picobello-hw-clean rm -rf $(BENDER_ROOT) From 73cb50048b13e3f1edc8417902fd90648d5c487d Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Mon, 2 Mar 2026 14:23:47 +0100 Subject: [PATCH 72/77] Bump FlooNoC --- .gitlab/sw-tests.yml | 2 +- Bender.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/sw-tests.yml b/.gitlab/sw-tests.yml index 42cc5c1d..d1cc08eb 100644 --- a/.gitlab/sw-tests.yml +++ b/.gitlab/sw-tests.yml @@ -23,7 +23,7 @@ - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/non_null_exitcode.elf, NZ_EXIT_CODE: 896 } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/multicluster_atomics.elf } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/mcast_barrier.elf } - - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/dma_multicast.elf } + - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/multicast_benchmark.elf } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/multi_mcast.elf } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: $SN_BUILD_DIR/row_col_mcast.elf } - { CHS_BINARY: $CHS_BUILD_DIR/simple_offload.spm.elf, SN_BINARY: 
$SN_BUILD_DIR/access_spm.elf } diff --git a/Bender.lock b/Bender.lock index 81acd2f7..f7967a9b 100644 --- a/Bender.lock +++ b/Bender.lock @@ -207,7 +207,7 @@ packages: Path: .deps/fhg_spu_cluster dependencies: [] floo_noc: - revision: 1c7f10686271c1d30b421b8f73a095304e8b61ec + revision: 8c73a97386dadc6b8186b5162db440752fd5c715 version: null source: Git: https://github.com/pulp-platform/FlooNoC.git From 5403ba484cfb8cccdf31058aaae16cd5008362e7 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Mon, 2 Mar 2026 17:02:42 +0100 Subject: [PATCH 73/77] Align new floogen CLI --- Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 2712124f..cd4839c8 100644 --- a/Makefile +++ b/Makefile @@ -27,8 +27,8 @@ VERIBLE_FMT_ARGS ?= --flagfile .verilog_format --inplace --verbose PEAKRDL ?= peakrdl # Tiles configuration -SN_CLUSTERS = $(shell $(FLOO_GEN) -c $(FLOO_CFG) --query endpoints.cluster.num 2>/dev/null) -L2_TILES = $(shell $(FLOO_GEN) -c $(FLOO_CFG) --query endpoints.l2_spm.num 2>/dev/null) +SN_CLUSTERS = $(shell $(FLOO_GEN) query -c $(FLOO_CFG) endpoints.cluster.num 2>/dev/null) +L2_TILES = $(shell $(FLOO_GEN) query -c $(FLOO_CFG) endpoints.l2_spm.num 2>/dev/null) # Bender prerequisites BENDER_YML = $(PB_ROOT)/Bender.yml @@ -48,7 +48,7 @@ SIM_TARGS += -t simulation -t test -t idma_test # systemRDL # ############# -PB_RDL_ALL += $(PB_GEN_DIR)/picobello.rdl +PB_RDL_ALL += $(PB_GEN_DIR)/picobello_addrmap.rdl PB_RDL_ALL += $(PB_GEN_DIR)/fll.rdl $(PB_GEN_DIR)/pb_chip_regs.rdl PB_RDL_ALL += $(PB_GEN_DIR)/snitch_cluster.rdl PB_RDL_ALL += $(wildcard $(PB_ROOT)/cfg/rdl/*.rdl) @@ -61,14 +61,14 @@ $(PB_GEN_DIR)/pb_soc_regs.sv: $(PB_GEN_DIR)/pb_soc_regs_pkg.sv $(PB_GEN_DIR)/pb_soc_regs_pkg.sv: $(PB_ROOT)/cfg/rdl/pb_soc_regs.rdl $(PEAKRDL) regblock $< -o $(PB_GEN_DIR) --cpuif apb4-flat --default-reset arst_n -P Num_Clusters=$(SN_CLUSTERS) -P Num_Mem_Tiles=$(L2_TILES) -$(PB_GEN_DIR)/picobello.rdl: $(FLOO_CFG) - $(FLOO_GEN) 
-c $(FLOO_CFG) -o $(PB_GEN_DIR) --rdl --rdl-as-mem --rdl-memwidth=32 +$(PB_GEN_DIR)/picobello_addrmap.rdl: $(FLOO_CFG) + $(FLOO_GEN) rdl -c $(FLOO_CFG) -o $(PB_GEN_DIR) --as-mem --memwidth=32 # Those are dummy RDL files, for generation without access to the PD repository. $(PB_GEN_DIR)/fll.rdl $(PB_GEN_DIR)/pb_chip_regs.rdl: | $(PB_GEN_DIR) @touch $@ -$(PB_GEN_DIR)/pb_addrmap.h: $(PB_GEN_DIR)/picobello.rdl $(PB_RDL_ALL) +$(PB_GEN_DIR)/pb_addrmap.h: $(PB_GEN_DIR)/picobello_addrmap.rdl $(PB_RDL_ALL) $(PEAKRDL) c-header $< $(PEAKRDL_INCLUDES) $(PEAKRDL_DEFINES) -o $@ -i -b ltoh $(PB_GEN_DIR)/pb_addrmap.svh: $(PB_RDL_ALL) @@ -133,12 +133,12 @@ endif floo-hw-all: $(PB_GEN_DIR)/floo_picobello_noc_pkg.sv $(PB_GEN_DIR)/floo_picobello_noc_pkg.sv: $(FLOO_CFG) | $(PB_GEN_DIR) - $(FLOO_GEN) -c $(FLOO_CFG) -o $(PB_GEN_DIR) --only-pkg $(FLOO_GEN_FLAGS) + $(FLOO_GEN) pkg -c $(FLOO_CFG) -o $(PB_GEN_DIR) $(FLOO_GEN_FLAGS) floo-clean: rm -f $(PB_GEN_DIR)/floo_picobello_noc_pkg.sv - rm -f $(PB_GEN_DIR)/picobello.rdl + rm -f $(PB_GEN_DIR)/picobello_addrmap.rdl ################### # Physical Design # From 657ea524ac8f47dec7b598991cece1c0e9ae465a Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Wed, 4 Mar 2026 15:58:50 +0100 Subject: [PATCH 74/77] Remove unused cfg par --- Bender.lock | 2 +- hw/picobello_pkg.sv | 20 -------------------- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/Bender.lock b/Bender.lock index f7967a9b..a92ca3dc 100644 --- a/Bender.lock +++ b/Bender.lock @@ -207,7 +207,7 @@ packages: Path: .deps/fhg_spu_cluster dependencies: [] floo_noc: - revision: 8c73a97386dadc6b8186b5162db440752fd5c715 + revision: 930bbf2f61e88b699c70963ab39a4679d79ec1a3 version: null source: Git: https://github.com/pulp-platform/FlooNoC.git diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index bacbd54a..746ca11e 100644 --- a/hw/picobello_pkg.sv +++ b/hw/picobello_pkg.sv @@ -422,41 +422,21 @@ package picobello_pkg; localparam floo_pkg::wide_rw_decouple_e WideRwDecouple = 
floo_pkg::Vc; // Configurations for the Reductions - // Stupid asolution which allows me to overwrite the Reduction confiuration without endagering everything - // ATTENTION: - // @GENERIC Implementation: RdPartialBufferSize Needs to be bigger than the "RdPipelineDepth" otherwise we can build a deadlock - // @STALLING Implementation: min(ceil(log2(NumRoutes)), RdPipelineDepth+1) - // TODO: Add Assertion to check this - // raroth - overwrite benchmark autotest - start localparam reduction_cfg_t WideReductionCfg = '{ - RdControllConf: ControllerGeneric, - RdFifoFallThrough: 1'b1, - RdFifoDepth: 0, RdPipelineDepth: 5, - RdPartialBufferSize: 6, - RdTagBits: 5, RdSupportAxi: 1'b1, - RdEnableBypass: 1'b1, RdSupportLoopback: 1'b1, CutOffloadIntf: 1'b1 }; - // raroth - overwrite benchmark autotest - end localparam reduction_cfg_t NarrowReductionCfg = '{ - RdControllConf: ControllerGeneric, - RdFifoFallThrough: 1'b1, - RdFifoDepth: 0, RdPipelineDepth: 1, - RdPartialBufferSize: 3, - RdTagBits: 5, RdSupportAxi: 1'b1, - RdEnableBypass: 1'b1, RdSupportLoopback: 1'b1, CutOffloadIntf: 1'b0 }; localparam reduction_cfg_t ResponseReductionCfg = '{ - RdEnableBypass: 1'b1, RdSupportLoopback: 1'b1, default: '0 }; From 616b0018df99c0dab514af55a43c39a433a1e6a5 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Wed, 4 Mar 2026 17:34:38 +0100 Subject: [PATCH 75/77] Bump floonoc without coll loopback option --- Bender.lock | 2 +- hw/picobello_pkg.sv | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/Bender.lock b/Bender.lock index a92ca3dc..a7b0bdcb 100644 --- a/Bender.lock +++ b/Bender.lock @@ -207,7 +207,7 @@ packages: Path: .deps/fhg_spu_cluster dependencies: [] floo_noc: - revision: 930bbf2f61e88b699c70963ab39a4679d79ec1a3 + revision: eafe97883c547bbdeead959b21270b4da5c5e390 version: null source: Git: https://github.com/pulp-platform/FlooNoC.git diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index 746ca11e..d5c1b544 100644 --- a/hw/picobello_pkg.sv +++ 
b/hw/picobello_pkg.sv @@ -347,8 +347,6 @@ package picobello_pkg; floo_pkg::route_cfg_t ret = floo_picobello_noc_pkg::RouteCfg; // Disable multicast for non-cluster tiles ret.CollectiveCfg = CollectiveDefaultCfg; - // DIsbale loop back - ret.CollectiveCfg.RedCfg.RdSupportLoopback = 1'b0; return ret; endfunction @@ -424,20 +422,15 @@ package picobello_pkg; // Configurations for the Reductions localparam reduction_cfg_t WideReductionCfg = '{ RdPipelineDepth: 5, - RdSupportAxi: 1'b1, - RdSupportLoopback: 1'b1, CutOffloadIntf: 1'b1 }; localparam reduction_cfg_t NarrowReductionCfg = '{ RdPipelineDepth: 1, - RdSupportAxi: 1'b1, - RdSupportLoopback: 1'b1, CutOffloadIntf: 1'b0 }; localparam reduction_cfg_t ResponseReductionCfg = '{ - RdSupportLoopback: 1'b1, default: '0 }; From 574ee6545e6793a9d000a590e5424ec12f9d30c0 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Thu, 5 Mar 2026 18:49:11 +0100 Subject: [PATCH 76/77] hw: Align with new offload interface --- Bender.lock | 2 +- floo_picobello_noc_pkg_REDUCTION.sv | 6 +++ hw/cheshire_tile.sv | 20 ++------ hw/cluster_tile.sv | 80 ++++++++++------------------- hw/dummy_tile.sv | 20 ++------ hw/fhg_spu_tile.sv | 20 ++------ hw/mem_tile.sv | 20 ++------ hw/picobello_pkg.sv | 4 -- hw/spm_tile.sv | 20 ++------ 9 files changed, 55 insertions(+), 137 deletions(-) diff --git a/Bender.lock b/Bender.lock index a7b0bdcb..49bb7b8c 100644 --- a/Bender.lock +++ b/Bender.lock @@ -207,7 +207,7 @@ packages: Path: .deps/fhg_spu_cluster dependencies: [] floo_noc: - revision: eafe97883c547bbdeead959b21270b4da5c5e390 + revision: d19db129e9fff8ab783317e7218547b1ea2912de version: null source: Git: https://github.com/pulp-platform/FlooNoC.git diff --git a/floo_picobello_noc_pkg_REDUCTION.sv b/floo_picobello_noc_pkg_REDUCTION.sv index baa39271..986177be 100644 --- a/floo_picobello_noc_pkg_REDUCTION.sv +++ b/floo_picobello_noc_pkg_REDUCTION.sv @@ -6,6 +6,7 @@ `include "axi/typedef.svh" `include "floo_noc/typedef.svh" +`include 
"reduction/typedef.svh" package floo_picobello_noc_pkg; @@ -337,7 +338,12 @@ package floo_picobello_noc_pkg; `FLOO_TYPEDEF_NW_CHAN_ALL(axi, req, rsp, wide, axi_narrow_in, axi_wide_in, AxiCfgN, AxiCfgW, hdr_t) + //TODO (lleone): all this must be generated `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 2, 1) + typedef logic [AxiCfgW.DataWidth-1:0] data_wide_width; + typedef logic [AxiCfgN.DataWidth-1:0] data_narrow_width; + `RED_TYPEDEF_REQ_RSP_LINK(wide, data_wide_width, wide_req, wide_rsp) + `RED_TYPEDEF_REQ_RSP_LINK(narrow, data_narrow_width, narrow_req, narrow_rsp) endpackage diff --git a/hw/cheshire_tile.sv b/hw/cheshire_tile.sv index 10935d95..e2af0086 100644 --- a/hw/cheshire_tile.sv +++ b/hw/cheshire_tile.sv @@ -137,23 +137,11 @@ module cheshire_tile .floo_wide_i (router_floo_wide_in), .floo_wide_o (router_floo_wide_out), // Wide Reduction offload port - .offload_wide_req_op_o (), - .offload_wide_req_operand1_o (), - .offload_wide_req_operand2_o (), - .offload_wide_req_valid_o (), - .offload_wide_req_ready_i ('0), - .offload_wide_resp_result_i ('0), - .offload_wide_resp_valid_i ('0), - .offload_wide_resp_ready_o (), + .offload_wide_req_o ( ), + .offload_wide_rsp_i ( '0 ), // Narrow Reduction offload port - .offload_narrow_req_op_o (), - .offload_narrow_req_operand1_o (), - .offload_narrow_req_operand2_o (), - .offload_narrow_req_valid_o (), - .offload_narrow_req_ready_i ('0), - .offload_narrow_resp_result_i ('0), - .offload_narrow_resp_valid_i ('0), - .offload_narrow_resp_ready_o () + .offload_narrow_req_o ( ), + .offload_narrow_rsp_i ('0 ) ); assign floo_req_west_o = router_floo_req_out[West]; diff --git a/hw/cluster_tile.sv b/hw/cluster_tile.sv index 96ab3127..af101f30 100644 --- a/hw/cluster_tile.sv +++ b/hw/cluster_tile.sv @@ -39,12 +39,6 @@ module cluster_tile input floo_wide_t [ West:North] floo_wide_i ); - /// TODO(lleone): Offload typedef to be generated by Floogen or removed form - /// the hardware after more optimization - 
typedef logic[AxiCfgW.DataWidth-1:0] RdDataWide_t; - typedef logic[AxiCfgN.DataWidth-1:0] RdDataNarrow_t; - - // Tile-specific reset and clock signals logic tile_clk; logic tile_rst_n; @@ -99,13 +93,8 @@ module cluster_tile snitch_cluster_pkg::dca_rsp_t offload_dca_rsp, offload_dca_rsp_cut; // Signals to connect the NW router to the wide parser - RdDataWide_t [1:0] offload_wide_req_operand; - collect_op_e offload_wide_req_operation; - logic offload_wide_req_valid; - logic offload_wide_req_ready; - RdDataWide_t offload_wide_resp_data; - logic offload_wide_resp_valid; - logic offload_wide_resp_ready; + red_wide_req_t offload_wide_req; + red_wide_rsp_t offload_wide_rsp; // Parse the Wide request from the reouter to the one from the snitch cluster! // TODO(raroth): possible to remove this decode from the picobello repo and move it inside the @@ -113,8 +102,8 @@ module cluster_tile // file. Maybe do the same for the FPU if(is_en_wide_reduction(RouteCfg.CollectiveCfg.OpCfg)) begin : gen_wide_offload_reduction // Connect the DCA Request - assign offload_dca_req.q_valid = offload_wide_req_valid; - assign offload_wide_req_ready = offload_dca_rsp.q_ready; + assign offload_dca_req.q_valid = offload_wide_req.valid; + assign offload_wide_rsp.ready = offload_dca_rsp.q_ready; // Parse the FPU Request always_comb begin @@ -131,31 +120,31 @@ module cluster_tile offload_dca_req.q.op = fpnew_pkg::ADD; // Define the operation we want to execute on the FPU - unique casez (offload_wide_req_operation) + unique casez (offload_wide_req.req.op) (floo_pkg::F_Add) : begin offload_dca_req.q.op = fpnew_pkg::ADD; offload_dca_req.q.operands[0] = '0; - offload_dca_req.q.operands[1] = offload_wide_req_operand[0]; - offload_dca_req.q.operands[2] = offload_wide_req_operand[1]; + offload_dca_req.q.operands[1] = offload_wide_req.req.operand1; + offload_dca_req.q.operands[2] = offload_wide_req.req.operand2; end (floo_pkg::F_Mul) : begin offload_dca_req.q.op = fpnew_pkg::MUL; - 
offload_dca_req.q.operands[0] = offload_wide_req_operand[0]; - offload_dca_req.q.operands[1] = offload_wide_req_operand[1]; + offload_dca_req.q.operands[0] = offload_wide_req.req.operand1; + offload_dca_req.q.operands[1] = offload_wide_req.req.operand2; offload_dca_req.q.operands[2] = '0; end (floo_pkg::F_Max) : begin offload_dca_req.q.op = fpnew_pkg::MINMAX; offload_dca_req.q.rnd_mode = fpnew_pkg::RNE; - offload_dca_req.q.operands[0] = offload_wide_req_operand[0]; - offload_dca_req.q.operands[1] = offload_wide_req_operand[1]; + offload_dca_req.q.operands[0] = offload_wide_req.req.operand1; + offload_dca_req.q.operands[1] = offload_wide_req.req.operand2; offload_dca_req.q.operands[2] = '0; end (floo_pkg::F_Min) : begin offload_dca_req.q.op = fpnew_pkg::MINMAX; offload_dca_req.q.rnd_mode = fpnew_pkg::RTZ; - offload_dca_req.q.operands[0] = offload_wide_req_operand[0]; - offload_dca_req.q.operands[1] = offload_wide_req_operand[1]; + offload_dca_req.q.operands[0] = offload_wide_req.req.operand1; + offload_dca_req.q.operands[1] = offload_wide_req.req.operand2; offload_dca_req.q.operands[2] = '0; end default : begin @@ -184,17 +173,17 @@ module cluster_tile .mst_rsp_i (offload_dca_rsp_cut) ); // Connect the Response - assign offload_wide_resp_valid = offload_dca_rsp.p_valid; - assign offload_dca_req.p_ready = offload_wide_resp_ready; - assign offload_wide_resp_data = offload_dca_rsp.p.result; + assign offload_wide_rsp.valid = offload_dca_rsp.p_valid; + assign offload_dca_req.p_ready = offload_wide_req.ready; + assign offload_wide_rsp.rsp.result = offload_dca_rsp.p.result; // No Wide Reduction supported end else begin : gen_no_wide_reduction assign offload_dca_req_cut = '0; assign offload_dca_rsp = '0; - assign offload_wide_req_ready = '0; - assign offload_wide_resp_data = '0; - assign offload_wide_resp_valid = '0; + assign offload_wide_rsp.ready = '0; + assign offload_wide_rsp.rsp.result = '0; + assign offload_wide_rsp.valid = '0; end // TODO(lleone): Add teh narrow ALU 
reduction unit and connections here @@ -370,14 +359,13 @@ module cluster_tile .floo_req_t (floo_req_t), .floo_rsp_t (floo_rsp_t), .floo_wide_t (floo_wide_t), - .RdWideOperation_t (floo_pkg::collect_op_e), - .RdNarrowOperation_t (floo_pkg::collect_op_e), - .RdWideData_t (RdDataWide_t), - .RdNarrowData_t (RdDataNarrow_t), + .red_wide_req_t (red_wide_req_t), + .red_wide_rsp_t (red_wide_rsp_t), + .red_narrow_req_t (red_narrow_req_t), + .red_narrow_rsp_t (red_narrow_rsp_t), .CollectiveOpCfg (RouteCfg.CollectiveCfg.OpCfg), .RdWideCfg (WideReductionCfg), - .RdNarrowCfg (NarrowReductionCfg), - .RdRespCfg (ResponseReductionCfg) + .RdNarrowCfg (NarrowReductionCfg) ) i_router ( .clk_i, .rst_ni, @@ -391,23 +379,11 @@ module cluster_tile .floo_wide_i (router_floo_wide_in), .floo_wide_o (router_floo_wide_out), // Wide Reduction offload port - .offload_wide_req_op_o (offload_wide_req_operation), - .offload_wide_req_operand1_o (offload_wide_req_operand[0]), - .offload_wide_req_operand2_o (offload_wide_req_operand[1]), - .offload_wide_req_valid_o (offload_wide_req_valid), - .offload_wide_req_ready_i (offload_wide_req_ready), - .offload_wide_resp_result_i (offload_wide_resp_data), - .offload_wide_resp_valid_i (offload_wide_resp_valid), - .offload_wide_resp_ready_o (offload_wide_resp_ready), + .offload_wide_req_o ( offload_wide_req ), + .offload_wide_rsp_i ( offload_wide_rsp ), // Narrow Reduction offload port - .offload_narrow_req_op_o (), - .offload_narrow_req_operand1_o (), - .offload_narrow_req_operand2_o (), - .offload_narrow_req_valid_o (), - .offload_narrow_req_ready_i ('0), - .offload_narrow_resp_result_i ('0), - .offload_narrow_resp_valid_i ('0), - .offload_narrow_resp_ready_o () + .offload_narrow_req_o ( ), + .offload_narrow_rsp_i ('0 ) ); assign floo_req_o = router_floo_req_out[West:North]; diff --git a/hw/dummy_tile.sv b/hw/dummy_tile.sv index cfe19817..2dafd6a9 100644 --- a/hw/dummy_tile.sv +++ b/hw/dummy_tile.sv @@ -60,23 +60,11 @@ module dummy_tile .floo_wide_i 
(router_floo_wide_in), .floo_wide_o (router_floo_wide_out), // Wide Reduction offload port - .offload_wide_req_op_o (), - .offload_wide_req_operand1_o (), - .offload_wide_req_operand2_o (), - .offload_wide_req_valid_o (), - .offload_wide_req_ready_i ('0), - .offload_wide_resp_result_i ('0), - .offload_wide_resp_valid_i ('0), - .offload_wide_resp_ready_o (), + .offload_wide_req_o ( ), + .offload_wide_rsp_i ( '0 ), // Narrow Reduction offload port - .offload_narrow_req_op_o (), - .offload_narrow_req_operand1_o (), - .offload_narrow_req_operand2_o (), - .offload_narrow_req_valid_o (), - .offload_narrow_req_ready_i ('0), - .offload_narrow_resp_result_i ('0), - .offload_narrow_resp_valid_i ('0), - .offload_narrow_resp_ready_o () + .offload_narrow_req_o ( ), + .offload_narrow_rsp_i ('0 ) ); assign floo_req_o = router_floo_req_out[West:North]; diff --git a/hw/fhg_spu_tile.sv b/hw/fhg_spu_tile.sv index 5bd200e6..5db4fa91 100644 --- a/hw/fhg_spu_tile.sv +++ b/hw/fhg_spu_tile.sv @@ -78,23 +78,11 @@ module fhg_spu_tile .floo_wide_i (router_floo_wide_in), .floo_wide_o (router_floo_wide_out), // Wide Reduction offload port - .offload_wide_req_op_o (), - .offload_wide_req_operand1_o (), - .offload_wide_req_operand2_o (), - .offload_wide_req_valid_o (), - .offload_wide_req_ready_i ('0), - .offload_wide_resp_result_i ('0), - .offload_wide_resp_valid_i ('0), - .offload_wide_resp_ready_o (), + .offload_wide_req_o ( ), + .offload_wide_rsp_i ( '0 ), // Narrow Reduction offload port - .offload_narrow_req_op_o (), - .offload_narrow_req_operand1_o (), - .offload_narrow_req_operand2_o (), - .offload_narrow_req_valid_o (), - .offload_narrow_req_ready_i ('0), - .offload_narrow_resp_result_i ('0), - .offload_narrow_resp_valid_i ('0), - .offload_narrow_resp_ready_o () + .offload_narrow_req_o ( ), + .offload_narrow_rsp_i ('0 ) ); assign floo_req_west_o = router_floo_req_out[West]; diff --git a/hw/mem_tile.sv b/hw/mem_tile.sv index 09852e93..f615891f 100644 --- a/hw/mem_tile.sv +++ 
b/hw/mem_tile.sv @@ -77,23 +77,11 @@ module mem_tile .floo_wide_i (router_floo_wide_in), .floo_wide_o (router_floo_wide_out), // Wide Reduction offload port - .offload_wide_req_op_o (), - .offload_wide_req_operand1_o (), - .offload_wide_req_operand2_o (), - .offload_wide_req_valid_o (), - .offload_wide_req_ready_i ('0), - .offload_wide_resp_result_i ('0), - .offload_wide_resp_valid_i ('0), - .offload_wide_resp_ready_o (), + .offload_wide_req_o ( ), + .offload_wide_rsp_i ( '0 ), // Narrow Reduction offload port - .offload_narrow_req_op_o (), - .offload_narrow_req_operand1_o (), - .offload_narrow_req_operand2_o (), - .offload_narrow_req_valid_o (), - .offload_narrow_req_ready_i ('0), - .offload_narrow_resp_result_i ('0), - .offload_narrow_resp_valid_i ('0), - .offload_narrow_resp_ready_o () + .offload_narrow_req_o ( ), + .offload_narrow_rsp_i ('0 ) ); assign floo_req_o = router_floo_req_out[West:North]; diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index d5c1b544..9d3bf86b 100644 --- a/hw/picobello_pkg.sv +++ b/hw/picobello_pkg.sv @@ -430,10 +430,6 @@ localparam reduction_cfg_t NarrowReductionCfg = '{ CutOffloadIntf: 1'b0 }; - localparam reduction_cfg_t ResponseReductionCfg = '{ - default: '0 - }; - //////////////// // Cheshire // //////////////// diff --git a/hw/spm_tile.sv b/hw/spm_tile.sv index f4cf337d..a0980e8a 100644 --- a/hw/spm_tile.sv +++ b/hw/spm_tile.sv @@ -113,23 +113,11 @@ module spm_tile .floo_wide_i (router_floo_wide_in), .floo_wide_o (router_floo_wide_out), // Wide Reduction offload port - .offload_wide_req_op_o (), - .offload_wide_req_operand1_o (), - .offload_wide_req_operand2_o (), - .offload_wide_req_valid_o (), - .offload_wide_req_ready_i ('0), - .offload_wide_resp_result_i ('0), - .offload_wide_resp_valid_i ('0), - .offload_wide_resp_ready_o (), + .offload_wide_req_o ( ), + .offload_wide_rsp_i ( '0 ), // Narrow Reduction offload port - .offload_narrow_req_op_o (), - .offload_narrow_req_operand1_o (), - .offload_narrow_req_operand2_o (), 
- .offload_narrow_req_valid_o (), - .offload_narrow_req_ready_i ('0), - .offload_narrow_resp_result_i ('0), - .offload_narrow_resp_valid_i ('0), - .offload_narrow_resp_ready_o () + .offload_narrow_req_o ( ), + .offload_narrow_rsp_i ('0 ) ); assign floo_req_o = router_floo_req_out[West:North]; From 2b4f86b9fa6b9d2bc9ec92fffcaa70a60ad1a454 Mon Sep 17 00:00:00 2001 From: Lorenzo Leone Date: Fri, 6 Mar 2026 13:41:56 +0100 Subject: [PATCH 77/77] hw: Enable multi phys for performance boost --- Bender.lock | 2 +- floo_picobello_noc_pkg_REDUCTION.sv | 2 +- hw/picobello_pkg.sv | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Bender.lock b/Bender.lock index 49bb7b8c..10fa0b86 100644 --- a/Bender.lock +++ b/Bender.lock @@ -207,7 +207,7 @@ packages: Path: .deps/fhg_spu_cluster dependencies: [] floo_noc: - revision: d19db129e9fff8ab783317e7218547b1ea2912de + revision: 8d19d926fbc9174984a67d76f349d82ea117c443 version: null source: Git: https://github.com/pulp-platform/FlooNoC.git diff --git a/floo_picobello_noc_pkg_REDUCTION.sv b/floo_picobello_noc_pkg_REDUCTION.sv index 986177be..cf82fdde 100644 --- a/floo_picobello_noc_pkg_REDUCTION.sv +++ b/floo_picobello_noc_pkg_REDUCTION.sv @@ -339,7 +339,7 @@ package floo_picobello_noc_pkg; hdr_t) //TODO (lleone): all this must be generated - `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 2, 1) + `FLOO_TYPEDEF_NW_VIRT_CHAN_LINK_ALL(req, rsp, wide, req, rsp, wide, 2, 2) typedef logic [AxiCfgW.DataWidth-1:0] data_wide_width; typedef logic [AxiCfgN.DataWidth-1:0] data_narrow_width; `RED_TYPEDEF_REQ_RSP_LINK(wide, data_wide_width, wide_req, wide_rsp) diff --git a/hw/picobello_pkg.sv b/hw/picobello_pkg.sv index 9d3bf86b..909f2ee2 100644 --- a/hw/picobello_pkg.sv +++ b/hw/picobello_pkg.sv @@ -417,7 +417,7 @@ package picobello_pkg; localparam int unsigned NumWideVirtChannels = 2; localparam int unsigned NumWidePhysChannels = 1; localparam floo_pkg::vc_impl_e VcImplementation = floo_pkg::VcPreemptValid; - 
localparam floo_pkg::wide_rw_decouple_e WideRwDecouple = floo_pkg::Vc; + localparam floo_pkg::wide_rw_decouple_e WideRwDecouple = floo_pkg::Phys; // Configurations for the Reductions localparam reduction_cfg_t WideReductionCfg = '{