From ad26d179da1cef7f2927ab9c48c295bb7cac2028 Mon Sep 17 00:00:00 2001
From: Luca Balboni <lucabalbo2001@gmail.com>
Date: Wed, 11 Mar 2026 18:29:35 +0100
Subject: [PATCH] Bumped Spatz Core Complex

---
 .gitignore                                    |   27 +-
 Bender.local                                  |    2 +
 Bender.yml                                    |   47 +-
 Makefile                                      |  102 +-
 bender_sim.mk                                 |    7 +-
 bender_synth.mk                               |   14 +-
 hw/mesh/magia_pkg.sv                          |    4 +-
 hw/mesh/noc/floo_axi_mesh_16x16_noc.sv        |    8 +-
 hw/mesh/noc/floo_axi_mesh_2x2_noc.sv          |    8 +-
 hw/mesh/noc/floo_axi_mesh_32x32_noc.sv        |    8 +-
 hw/mesh/noc/floo_axi_mesh_4x4_noc.sv          |    8 +-
 hw/mesh/noc/floo_axi_mesh_8x8_noc.sv          |    8 +-
 .../floonoc_axi_mesh_16x16_config.yml         |    4 +-
 .../floonoc_axi_mesh_2x2_config.yml           |    4 +-
 .../floonoc_axi_mesh_32x32_config.yml         |    4 +-
 .../floonoc_axi_mesh_4x4_config.yml           |    4 +-
 .../floonoc_axi_mesh_8x8_config.yml           |    4 +-
 hw/tile/converters/reqrsp2obi.sv              |   90 +
 hw/tile/converters/reqrsp64_to_obi32.sv       |  104 +
 hw/tile/converters/tcdm2hci.sv                |   66 +
 hw/tile/converters/tcdm2hci_atomic.sv         |  166 ++
 hw/tile/converters/tcdm2obi.sv                |   84 +
 hw/tile/converters/tcdm64_to_dual_hci32.sv    |  103 +
 .../converters/tcdm64_to_dual_hci32_atomic.sv |  136 ++
 hw/tile/converters/tcdm64_to_dual_tcdm32.sv   |  188 ++
 hw/tile/eu_direct_cut.sv                      |  105 +
 hw/tile/magia_event_unit.sv                   |   46 +-
 hw/tile/magia_tile.sv                         |  421 +++-
 hw/tile/magia_tile_pkg.sv                     |  329 ++-
 hw/tile/obi_slave_ctrl_spatz.sv               |  179 ++
 hw/tile/spatz_bootrom.sv                      |   39 +
 hw/tile/spatz_cc_wrapper.sv                   |  489 +++++
 scripts/bin2header.py                         |   67 +
 scripts/generate_spatz_bootrom.py             |  100 +
 setup_env.sh                                  |    4 +-
 spatz/README.md                               |  360 ++++
 spatz/bootrom/Makefile                        |   96 +
 spatz/bootrom/spatz_init.S                    |   36 +
 spatz/bootrom/spatz_init.ld                   |   62 +
 spatz/sw/Makefile                             |  193 ++
 spatz/sw/kernel/spatz_crt0.S                  |  169 ++
 spatz/sw/kernel/spatz_program.ld              |   86 +
 spatz/sw/spatz_utils/spatz_workers.h          | 1760 +++++++++++++++++
 spatz/sw/tasks/atomic_remote_task.c           |   66 +
 spatz/sw/tasks/dotp_task.c                    |   57 +
 spatz/sw/tasks/hello_task.c                   |   47 +
 spatz/sw/tasks/hello_world_simple_task.c      |   26 +
 spatz/sw/tasks/idma_simple_task.c             |   96 +
 .../sw/tasks/instructions_sweep_simple_task.c |  426 ++++
 spatz/sw/tasks/interconnect32_simple_task.c   |  131 ++
 spatz/sw/tasks/interconnect64_simple_task.c   |  174 ++
 spatz/sw/tasks/matmul16_task.c                |   60 +
 spatz/sw/tasks/matvec16_task.c                |  207 ++
 spatz/sw/tasks/matvectrans16_task.c           |  206 ++
 spatz/sw/tasks/vecsum16_task.c                |   78 +
 sw/kernel/link.ld                             |   15 +-
 sw/tests/spatz_tests/atomic_mesh_spatz_test.c |   95 +
 sw/tests/spatz_tests/dotp_mesh_spatz_test.c   |  137 ++
 sw/tests/spatz_tests/hello_mesh_spatz_test.c  |   61 +
 .../spatz_tests/matmul_compare_spatz_test.c   |  195 ++
 sw/tests/spatz_tests/simple_spatz_test.c      |   66 +
 sw/utils/event_unit_utils.h                   |   35 +
 sw/utils/magia_spatz_utils.h                  |   88 +
 sw/utils/magia_tile_utils.h                   |   77 +-
 sw/utils/redmule_mm_utils.h                   |   10 +-
 sw/utils/tinyprintf.h                         |    3 +-
 target/sim/src/tile/floo_axi_mesh_1x2_pkg.sv  |    8 +-
 67 files changed, 7798 insertions(+), 307 deletions(-)
 create mode 100644 hw/tile/converters/reqrsp2obi.sv
 create mode 100644 hw/tile/converters/reqrsp64_to_obi32.sv
 create mode 100644 hw/tile/converters/tcdm2hci.sv
 create mode 100644 hw/tile/converters/tcdm2hci_atomic.sv
 create mode 100644 hw/tile/converters/tcdm2obi.sv
 create mode 100644 hw/tile/converters/tcdm64_to_dual_hci32.sv
 create mode 100644 hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv
 create mode 100644 hw/tile/converters/tcdm64_to_dual_tcdm32.sv
 create mode 100644 hw/tile/eu_direct_cut.sv
 create mode 100644 hw/tile/obi_slave_ctrl_spatz.sv
 create mode 100644 hw/tile/spatz_bootrom.sv
 create mode 100644 hw/tile/spatz_cc_wrapper.sv
 create mode 100644 scripts/bin2header.py
 create mode 100644 scripts/generate_spatz_bootrom.py
 create mode 100644 spatz/README.md
 create mode 100644 spatz/bootrom/Makefile
 create mode 100644 spatz/bootrom/spatz_init.S
 create mode 100644 spatz/bootrom/spatz_init.ld
 create mode 100644 spatz/sw/Makefile
 create mode 100644 spatz/sw/kernel/spatz_crt0.S
 create mode 100644 spatz/sw/kernel/spatz_program.ld
 create mode 100644 spatz/sw/spatz_utils/spatz_workers.h
 create mode 100644 spatz/sw/tasks/atomic_remote_task.c
 create mode 100644 spatz/sw/tasks/dotp_task.c
 create mode 100644 spatz/sw/tasks/hello_task.c
 create mode 100644 spatz/sw/tasks/hello_world_simple_task.c
 create mode 100644 spatz/sw/tasks/idma_simple_task.c
 create mode 100644 spatz/sw/tasks/instructions_sweep_simple_task.c
 create mode 100644 spatz/sw/tasks/interconnect32_simple_task.c
 create mode 100644 spatz/sw/tasks/interconnect64_simple_task.c
 create mode 100644 spatz/sw/tasks/matmul16_task.c
 create mode 100644 spatz/sw/tasks/matvec16_task.c
 create mode 100644 spatz/sw/tasks/matvectrans16_task.c
 create mode 100644 spatz/sw/tasks/vecsum16_task.c
 create mode 100644 sw/tests/spatz_tests/atomic_mesh_spatz_test.c
 create mode 100644 sw/tests/spatz_tests/dotp_mesh_spatz_test.c
 create mode 100644 sw/tests/spatz_tests/hello_mesh_spatz_test.c
 create mode 100644 sw/tests/spatz_tests/matmul_compare_spatz_test.c
 create mode 100644 sw/tests/spatz_tests/simple_spatz_test.c
 create mode 100644 sw/utils/magia_spatz_utils.h

diff --git a/.gitignore b/.gitignore
index 7c8a8fa..d275390 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,29 @@ build-hw.log
 profile-ips.log
 magia_venv/
 modelsim.ini
-# sw/tests/*/
+
+sw/tests/*/build/
+sw/tests/*/logs/
+sw/tests/*/work/
+sw/tests/*/work
+sw/tests/*/transcript
+sw/tests/*/*.log
+
+sw/tests/*/*/build/
+sw/tests/*/*/logs/
+sw/tests/*/*/work/
+sw/tests/*/*/work
+sw/tests/*/*/transcript
+sw/tests/*/*/*.log
+
+spatz/bootrom/*.elf
+spatz/bootrom/*.bin
+spatz/bootrom/*.dump
+spatz/bootrom/*.hex
+
+spatz/sw/bin/*.elf
+spatz/sw/bin/*.bin
+spatz/sw/bin/*.dump
+spatz/sw/bin/*.o
+
+spatz/sw/headers_bin/*.h
\ No newline at end of file
diff --git a/Bender.local b/Bender.local
index a7bab39..dc33deb 100644
--- a/Bender.local
+++ b/Bender.local
@@ -7,3 +7,5 @@ overrides:
   register_interface : { git: "https://github.com/pulp-platform/register_interface.git", rev: e25b36670ff7aab3402f40efcc2b11ee0f31cf19 }
   idma               : { git: "https://github.com/pulp-platform/iDMA.git"              , rev: c12caf59bb482fe44b27361f6924ad346b2d22fe }
   tech_cells_generic : { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.13                               }
+  cluster_icache     : { git: "https://github.com/pulp-platform/cluster_icache.git"    , rev: f88227d287251812b54c77fbfa608aa6dcd92b31 }
+  axi_riscv_atomics  : { git: "https://github.com/pulp-platform/axi_riscv_atomics.git" , version: 0.8.2                                }
\ No newline at end of file
diff --git a/Bender.yml b/Bender.yml
index b96784d..a7f1435 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -26,12 +26,13 @@ package:
 dependencies:
   redmule           : { git: "https://github.com/pulp-platform/redmule.git"           , rev: 944d4a4d45fe05147cfbf7f872af677578f3b15c } # branch: fc/ooo-mux
   cv32e40x          : { git: "https://github.com/pulp-platform/cv32e40x.git"          , rev: a90101211048ba1a16cedbe4db963ab6e12569d7 } # branch: vi/redmule_scaleup
-  cv32e40p          : { git: "https://github.com/pulp-platform/cv32e40p.git"          , rev: 37a82d337ba60129c333d104c29e816d0698b53b } 
+  cv32e40p          : { git: "https://github.com/pulp-platform/cv32e40p.git"          , rev: f5241403d5d65dbe1fffacd7035dd7ae1359c8ef } # branch: lb/magia_core
+  spatz             : { git: "https://github.com/pulp-platform/spatz.git"             , rev: 2a524191c8a6472d83fe69ce3a24f90bee5b4ef0 } # branch: lb/magia-spatz_cc
   idma              : { git: "https://github.com/pulp-platform/iDMA.git"              , rev: a6b190c7991331432afa9a2899d032bc1b176830 } # branch: vi/redmule_scaleup
   hwpe-stream       : { git: "https://github.com/pulp-platform/hwpe-stream.git"       , version: 1.6                                  }
   hwpe-ctrl         : { git: "https://github.com/pulp-platform/hwpe-ctrl.git"         , version: 3.0.0                                }
   hci               : { git: "https://github.com/pulp-platform/hci.git"               , version: 2.3.0                                }
-  cluster_icache    : { git: "https://github.com/pulp-platform/cluster_icache.git"    , rev: 917ecbf908bdaa22c5713bbcff277d142506bb16 } # branch: michaero/astral
+  cluster_icache    : { git: "https://github.com/pulp-platform/cluster_icache.git"    , rev: f88227d287251812b54c77fbfa608aa6dcd92b31 } # branch: lb/magia-spatz
   fpnew             : { git: "https://github.com/pulp-platform/cvfpu.git"             , rev: "pulp-v0.1.3"                            }
   fpu_ss            : { git: "https://github.com/pulp-platform/fpu_ss.git"            , rev: 8e2eff774d9d38a1e17a46bd56a0936dac9522f0 } # branch: vi/bender_manifest
   obi               : { git: "https://github.com/pulp-platform/obi.git"               , rev: 528dc65303d5ffb02fbc254324c6b53eac0dd6e5 } # branch: lb/fix_atop_resolver                       
@@ -69,6 +70,14 @@ sources:
       - hw/tile/converters/hci2obi.sv
       - hw/tile/converters/xif_if2struct.sv
       - hw/tile/converters/obi2hwpe_ctrl.sv
+      - hw/tile/converters/tcdm2hci.sv
+      - hw/tile/converters/tcdm2obi.sv
+      - hw/tile/converters/tcdm2hci_atomic.sv
+      - hw/tile/converters/reqrsp2obi.sv
+      - hw/tile/converters/tcdm64_to_dual_hci32.sv
+      - hw/tile/converters/tcdm64_to_dual_tcdm32.sv
+      - hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv
+      - hw/tile/converters/reqrsp64_to_obi32.sv
       - hw/tile/xbar_periph_bus_if.sv
       - hw/tile/cluster_event_map.sv
       - hw/tile/magia_event_unit.sv
@@ -82,7 +91,11 @@ sources:
       - hw/tile/idma_ctrl_mm.sv
       - hw/tile/fractal_sync_xif_inst_decoder.sv
       - hw/tile/obi_slave_fsync.sv
+      - hw/tile/obi_slave_ctrl_spatz.sv
+      - hw/tile/spatz_bootrom.sv
+      - hw/tile/spatz_cc_wrapper.sv
       - hw/tile/core_data_demux_eu_direct.sv
+      - hw/tile/eu_direct_cut.sv
       - hw/tile/magia_redmule_wrap.sv
       - hw/tile/magia_tile.sv
       # MAGIA DV
@@ -119,6 +132,14 @@ sources:
       - hw/tile/converters/hci2obi.sv
       - hw/tile/converters/xif_if2struct.sv
       - hw/tile/converters/obi2hwpe_ctrl.sv
+      - hw/tile/converters/tcdm2hci.sv
+      - hw/tile/converters/tcdm2obi.sv
+      - hw/tile/converters/tcdm2hci_atomic.sv
+      - hw/tile/converters/reqrsp2obi.sv
+      - hw/tile/converters/tcdm64_to_dual_hci32.sv
+      - hw/tile/converters/tcdm64_to_dual_tcdm32.sv
+      - hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv
+      - hw/tile/converters/reqrsp64_to_obi32.sv
       - hw/tile/xbar_periph_bus_if.sv
       - hw/tile/cluster_event_map.sv
       - hw/tile/magia_event_unit.sv
@@ -132,7 +153,11 @@ sources:
       - hw/tile/idma_ctrl_mm.sv
       - hw/tile/fractal_sync_xif_inst_decoder.sv
       - hw/tile/obi_slave_fsync.sv
+      - hw/tile/obi_slave_ctrl_spatz.sv
+      - hw/tile/spatz_bootrom.sv
+      - hw/tile/spatz_cc_wrapper.sv
       - hw/tile/core_data_demux_eu_direct.sv
+      - hw/tile/eu_direct_cut.sv
       - hw/tile/magia_redmule_wrap.sv
       - hw/tile/magia_tile.sv
       # MAGIA
@@ -153,6 +178,7 @@ sources:
       - hw/mesh/noc/floo_axi_mesh_8x8_noc.sv
       - hw/mesh/noc/floo_axi_mesh_16x16_noc.sv
       - hw/mesh/noc/floo_axi_mesh_32x32_noc.sv
+      - target/sim/src/tile/floo_axi_mesh_1x2_pkg.sv
       # MAGIA Packages
       - hw/mesh/magia_pkg.sv
       - hw/tile/magia_tile_pkg.sv
@@ -168,6 +194,14 @@ sources:
       - hw/tile/converters/hci2obi.sv
       - hw/tile/converters/xif_if2struct.sv
       - hw/tile/converters/obi2hwpe_ctrl.sv
+      - hw/tile/converters/tcdm2hci.sv
+      - hw/tile/converters/tcdm2obi.sv
+      - hw/tile/converters/tcdm2hci_atomic.sv
+      - hw/tile/converters/reqrsp2obi.sv
+      - hw/tile/converters/tcdm64_to_dual_hci32.sv
+      - hw/tile/converters/tcdm64_to_dual_tcdm32.sv
+      - hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv
+      - hw/tile/converters/reqrsp64_to_obi32.sv
       - hw/tile/xbar_periph_bus_if.sv
       - hw/tile/cluster_event_map.sv
       - hw/tile/magia_event_unit.sv
@@ -181,15 +215,14 @@ sources:
       - hw/tile/idma_ctrl_mm.sv
       - hw/tile/fractal_sync_xif_inst_decoder.sv
       - hw/tile/obi_slave_fsync.sv
+      - hw/tile/obi_slave_ctrl_spatz.sv
+      - hw/tile/spatz_bootrom.sv
+      - hw/tile/spatz_cc_wrapper.sv
       - hw/tile/core_data_demux_eu_direct.sv
+      - hw/tile/eu_direct_cut.sv
       - hw/tile/magia_redmule_wrap.sv
       - hw/tile/magia_tile.sv
       # MAGIA
-      - hw/mesh/noc/floo_axi_mesh_2x2_noc.sv
-      - hw/mesh/noc/floo_axi_mesh_4x4_noc.sv
-      - hw/mesh/noc/floo_axi_mesh_8x8_noc.sv
-      - hw/mesh/noc/floo_axi_mesh_16x16_noc.sv
-      - hw/mesh/noc/floo_axi_mesh_32x32_noc.sv
       - hw/mesh/magia.sv
       # Tech
       - pd/sourcecode/tc_sram.sv
diff --git a/Makefile b/Makefile
index dbeb6b6..c4c277a 100644
--- a/Makefile
+++ b/Makefile
@@ -21,7 +21,7 @@
 
 # Paths to folders
 ROOT_DIR       := $(abspath $(dir $(lastword $(MAKEFILE_LIST))))
-core           ?= CV32E40X
+core           ?= CV32E40P
 
 MAGIA_DIR  ?= $(shell pwd)
 
@@ -43,7 +43,7 @@ XLEN           ?= 32
 ifeq ($(core), CV32E40X)
   XTEN         = imafc
 else
-  XTEN         = imfcxpulpv2
+  XTEN         = imcxgap9
 endif
 ABI            ?= ilp
 XABI           ?= f
@@ -55,7 +55,9 @@ XABI           ?= f
 #endif
 
 TEST_DIR  := sw/tests
-TEST_SRCS  = $(TEST_DIR)/$(test).c
+# Auto-detect test location in any subdirectory
+TEST_SUBDIR = $(filter-out .,$(shell find $(TEST_DIR) -name "$(test).c" -printf "%P\n" 2>/dev/null | head -1 | xargs dirname 2>/dev/null))
+TEST_SRCS  = $(TEST_DIR)/$(if $(TEST_SUBDIR),$(TEST_SUBDIR)/)$(test).c
 
 compile_script       ?= scripts/compile.tcl
 compile_script_synth ?= scripts/synth_compile.tcl
@@ -104,6 +106,7 @@ endif
 INC += -Isw
 INC += -Isw/inc
 INC += -Isw/utils
+INC += -Ispatz/sw/headers_bin
 
 BOOTSCRIPT := sw/kernel/crt0.S
 LINKSCRIPT := sw/kernel/link.ld
@@ -114,15 +117,23 @@ OBJDUMP=$(ISA)$(XLEN)-unknown-elf-objdump
 CC_OPTS=-march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XLEN)$(XABI) -D__$(ISA)__ -O2 -g -Wextra -Wall -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wundef -fdata-sections -ffunction-sections -MMD -MP
 LD_OPTS=-march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XLEN)$(XABI) -D__$(ISA)__ -MMD -MP -nostartfiles -nostdlib -Wl,--gc-sections
 
+# Spatz embedded binary support (via header)
+SPATZ_SW_DIR   := spatz/sw
+
+# Auto-detect which Spatz tasks are used by looking for *_TASK symbols in CV32 code
+# Example: HELLO_WORLD_TASK → hello_world_task
+SPATZ_TASKS := $(shell grep -oP '\b(?!SPATZ_)[A-Z][A-Z0-9_]*_TASK\b' $(TEST_SRCS) 2>/dev/null | tr '[:upper:]' '[:lower:]' | awk '!seen[$$0]++')
+
 # Setup build object dirs
-CRT=$(TEST_DIR)/$(test)/build/crt0.o
-OBJ=$(TEST_DIR)/$(test)/build/verif.o
-BIN=$(TEST_DIR)/$(test)/build/verif
-DUMP=$(TEST_DIR)/$(test)/build/verif.dump
-ODUMP=$(TEST_DIR)/$(test)/build/verif.objdump
-ITB=$(TEST_DIR)/$(test)/build/verif.itb
-STIM_INSTR=$(TEST_DIR)/$(test)/build/stim_instr.txt
-STIM_DATA=$(TEST_DIR)/$(test)/build/stim_data.txt
+TEST_BUILD_DIR = $(TEST_DIR)/$(if $(TEST_SUBDIR),$(TEST_SUBDIR)/)$(test)
+CRT=$(TEST_BUILD_DIR)/build/crt0.o
+OBJ=$(TEST_BUILD_DIR)/build/verif.o
+BIN=$(TEST_BUILD_DIR)/build/verif
+DUMP=$(TEST_BUILD_DIR)/build/verif.dump
+ODUMP=$(TEST_BUILD_DIR)/build/verif.objdump
+ITB=$(TEST_BUILD_DIR)/build/verif.itb
+STIM_INSTR=$(TEST_BUILD_DIR)/build/stim_instr.txt
+STIM_DATA=$(TEST_BUILD_DIR)/build/stim_data.txt
 VSIM_INI=modelsim.ini
 VSIM_LIBS=work
 
@@ -131,25 +142,40 @@ $(STIM_INSTR) $(STIM_DATA): $(BIN)
 	objcopy --srec-len 1 --output-target=srec $(BIN) $(BIN).s19 &&	\
 	scripts/parse_s19.pl $(BIN).s19 > $(BIN).txt &&					\
 	$(BASE_PYTHON) scripts/s19tomem.py $(BIN).txt $(STIM_INSTR) $(STIM_DATA)
-	cd $(TEST_DIR)/$(test) &&										\
+	cd $(TEST_BUILD_DIR) &&										\
 	ln -sfn $(ROOT_DIR)/$(INI_PATH) $(VSIM_INI) &&						\
 	ln -sfn $(ROOT_DIR)/$(WORK_PATH) $(VSIM_LIBS)
 
+# Build Spatz binary with auto-detected tasks (only if tasks are used)
+# Generate test-specific header: test_name_task_bin.h
+.PHONY: spatz-header
+spatz-header:
+	@if [ -n "$(SPATZ_TASKS)" ]; then \
+		echo "[SPATZ] Auto-detected tasks: $(SPATZ_TASKS)"; \
+		$(MAKE) -C $(SPATZ_SW_DIR) task="$(SPATZ_TASKS)" TEST_NAME=$(test) SPATZ_RVD=$(SPATZ_RVD) SPATZ_VLEN=$(SPATZ_VLEN) SPATZ_N_IPU=$(SPATZ_N_IPU) SPATZ_N_FPU=$(SPATZ_N_FPU) SPATZ_XDIVSQRT=$(SPATZ_XDIVSQRT) SPATZ_XDMA=$(SPATZ_XDMA) SPATZ_RVF=$(SPATZ_RVF) SPATZ_RVV=$(SPATZ_RVV) all; \
+	else \
+		echo "[SPATZ] No Spatz tasks detected - skipping Spatz compilation"; \
+	fi
+
 $(BIN): $(CRT) $(OBJ)
+	@if [ -n "$(SPATZ_TASKS)" ]; then \
+		echo "[CV32-LINK] Linking with embedded Spatz binary (tasks: $(SPATZ_TASKS))"; \
+	else \
+		echo "[CV32-LINK] Linking without Spatz binary"; \
+	fi
 	$(LD) $(LD_OPTS) -o $(BIN) $(CRT) $(OBJ) -T$(LINKSCRIPT)
 
 $(CRT):
-	cd $(TEST_DIR) &&							\
-	mkdir -p $(test) &&							\
-	cd $(test) &&								\
-	mkdir -p build								
+	mkdir -p $(TEST_BUILD_DIR)/build
 	$(CC) $(CC_OPTS) -c $(BOOTSCRIPT) -o $(CRT)
 
+# Compile CV32 test (depends on spatz-header only if tasks detected)
+ifneq ($(SPATZ_TASKS),)
+$(OBJ): spatz-header
+endif
+
 $(OBJ):
-	cd $(TEST_DIR) &&											\
-	mkdir -p $(test) &&											\
-	cd $(test) &&												\
-	mkdir -p build												
+	mkdir -p $(TEST_BUILD_DIR)/build
 	$(CC) $(CC_OPTS) -c $(TEST_SRCS) $(FLAGS) $(INC) -o $(OBJ)
 
 SHELL := /bin/bash
@@ -176,7 +202,7 @@ all: $(STIM_INSTR) $(STIM_DATA) dis objdump itb
 # Run the simulation
 run: $(CRT)
 ifeq ($(gui), 0)
-	cd $(TEST_DIR)/$(test);                                                                		 \
+	cd $(TEST_BUILD_DIR);                                                                		 \
 	$(QUESTA) vsim -c vopt_tb $(questa_run_fast_flag) -l transcript -do "run -a"                 \
 	+INST_HEX=$(inst_hex_name)                                                                   \
 	+DATA_HEX=$(data_hex_name)                                                                   \
@@ -188,7 +214,7 @@ ifeq ($(gui), 0)
 	)                                                                                            \
 	+itb_file=$(itb_file)
 else
-	cd $(TEST_DIR)/$(test);                                                                		 \
+	cd $(TEST_BUILD_DIR);                                                                		 \
 	$(QUESTA) vsim vopt_tb $(questa_run_flag) -l transcript                                      \
 	-do "add log -r sim:/$(tb)/*"                                                                \
 	-do "source $(WAVES)"                                                                        \
@@ -234,6 +260,7 @@ bender_targs += -t cv32e40p_include_tracer
 # Targets needed to avoid error even though the module is not used
 bender_targs += -t snitch_cluster
 bender_targs += -t idma_test
+bender_targs += -t spatz
 
 #ifeq ($(REDMULE_COMPLEX),1)
 #	tb := redmule_complex_tb
@@ -258,6 +285,31 @@ else
   bender_targs += -t redmule_hwpe
 endif
 
+# Define for Spatz target. NOTE: keep '#' glued to each default value below: GNU Make keeps whitespace between a value and a trailing comment as part of the value.
+bender_defs  += -D TARGET_SPATZ
+SPATZ_RVD      ?= 0# 0: 32-bit TCDM (ELEN=32), 1: 64-bit TCDM (ELEN=64)
+SPATZ_VLEN     ?= 256# Vector length in bits (128, 256, 512, ...)
+SPATZ_NRVREG   ?= 32# Number of vector registers (RISC-V standard=32)
+SPATZ_NR_VRF_BANKS ?= 4# Number of VRF banks (banking parallelism: 2, 4, 8)
+SPATZ_N_IPU    ?= 1# Number of Integer Processing Units (1-8)
+SPATZ_N_FPU    ?= 4# Number of Floating Point Units (1-8)
+SPATZ_NR_PARALLEL_INSTR ?= 4# Number of parallel vector instructions (scoreboard depth)
+SPATZ_XDIVSQRT ?= 0# 0: FP div/sqrt disabled, 1: enabled (increases area)
+SPATZ_XDMA     ?= 0# 0: DMA disabled, 1: enabled
+SPATZ_RVF      ?= 1# 0: single-precision FP disabled, 1: enabled
+SPATZ_RVV      ?= 1# 0: vector extension disabled, 1: enabled
+bender_defs    += -D SPATZ_RVD=$(SPATZ_RVD)
+bender_defs    += -D SPATZ_VLEN=$(SPATZ_VLEN)
+bender_defs    += -D SPATZ_NRVREG=$(SPATZ_NRVREG)
+bender_defs    += -D SPATZ_NR_VRF_BANKS=$(SPATZ_NR_VRF_BANKS)
+bender_defs    += -D SPATZ_N_IPU=$(SPATZ_N_IPU)
+bender_defs    += -D SPATZ_N_FPU=$(SPATZ_N_FPU)
+bender_defs    += -D SPATZ_NR_PARALLEL_INSTR=$(SPATZ_NR_PARALLEL_INSTR)
+bender_defs    += -D SPATZ_XDIVSQRT=$(SPATZ_XDIVSQRT)
+bender_defs    += -D SPATZ_XDMA=$(SPATZ_XDMA)
+bender_defs    += -D SPATZ_RVF=$(SPATZ_RVF)
+bender_defs    += -D SPATZ_RVV=$(SPATZ_RVV)
+
 update-ips:
 	$(BENDER) update
 	$(BENDER) script vsim          \
@@ -308,7 +360,11 @@ clean-sdk:
 	rm -rf $(SW)/pulp-sdk
 
 clean:
-	rm -rf $(TEST_DIR)/$(test)
+	rm -rf $(TEST_BUILD_DIR)
+	@if [ -d "$(SPATZ_SW_DIR)" ]; then \
+		echo "[CLEAN] Cleaning Spatz..."; \
+		$(MAKE) -C $(SPATZ_SW_DIR) clean; \
+	fi
 
 dis:
 	$(OBJDUMP) -d -S $(BIN) > $(DUMP)
diff --git a/bender_sim.mk b/bender_sim.mk
index 86e58f5..0d2ab4f 100644
--- a/bender_sim.mk
+++ b/bender_sim.mk
@@ -16,12 +16,7 @@
 sim_targs += -t rtl
 sim_targs += -t test
 sim_targs += -t idma_test
-
-#ifeq ($(REDMULE_COMPLEX),1)
-#	sim_targs += -t redmule_test_complex
-#else
-#	sim_targs += -t redmule_test_hwpe
-#endif
+sim_targs += -t redmule_hwpe
 
 ifeq ($(mesh_dv), 0)
 	sim_targs += -t standalone_tile
diff --git a/bender_synth.mk b/bender_synth.mk
index 4bd5430..46db632 100644
--- a/bender_synth.mk
+++ b/bender_synth.mk
@@ -14,15 +14,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
 synth_targs += -t rtl
-synth_targs += -t redmule_complex
+synth_targs += -t redmule_hwpe
 synth_targs += -t asic
 synth_targs += -t synopsys
+synth_targs += -t spatz
 
-#ifeq ($(REDMULE_COMPLEX),1)
-#	synth_defs += -D REDMULE_COMPLEX_SYNTH
-#else
-#	synth_defs += -D REDMULE_HWPE_SYNTH
-#endif
+ifeq ($(mesh_dv), 0)
+synth_defs += -D TARGET_STANDALONE_TILE
+endif
 
-synth_defs += -D REDMULE_COMPLEX_SYNTH
+synth_defs += -D REDMULE_HWPE_SYNTH
 synth_defs += -D SYNTHESIS
+synth_defs += -D TARGET_SPATZ
diff --git a/hw/mesh/magia_pkg.sv b/hw/mesh/magia_pkg.sv
index 88852a9..5f35a15 100644
--- a/hw/mesh/magia_pkg.sv
+++ b/hw/mesh/magia_pkg.sv
@@ -44,11 +44,11 @@ package magia_pkg;
   localparam int unsigned USR_W            = 1;                               // Default User Width
 
   // Parameters used by the NoC
-  parameter int unsigned AXI_NOC_ID_W      = 4;                                // AXI NoC ID Width: 2 bits on slave port + 2 bits to select from i$, iDMA and core data
+  parameter int unsigned AXI_NOC_ID_W      = 6;                                // AXI NoC ID Width: matches slave side id_width (6 bits)
   parameter int unsigned AXI_NOC_U_W       = USR_W;
 
   // Parameters used by the L2
-  parameter int unsigned L2_ID_W           = 2;                                // The ID Width reflects the ID Width of the Tile AXI XBAR
+  parameter int unsigned L2_ID_W           = 3;                                // The ID Width reflects the slave ID Width of the Tile AXI XBAR (for 5 ports: ceil(log2(5))=3)
   parameter int unsigned L2_U_W            = 1;
 
   // Parameter used for the Fractal Sync network
diff --git a/hw/mesh/noc/floo_axi_mesh_16x16_noc.sv b/hw/mesh/noc/floo_axi_mesh_16x16_noc.sv
index 22c3f43..2fd557e 100644
--- a/hw/mesh/noc/floo_axi_mesh_16x16_noc.sv
+++ b/hw/mesh/noc/floo_axi_mesh_16x16_noc.sv
@@ -603,7 +603,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{
   typedef logic[31:0] axi_data_mst_addr_t;
 typedef logic[31:0] axi_data_mst_data_t;
 typedef logic[3:0] axi_data_mst_strb_t;
-typedef logic[1:0] axi_data_mst_id_t;
+typedef logic[2:0] axi_data_mst_id_t;
 typedef logic[0:0] axi_data_mst_user_t;
 `AXI_TYPEDEF_ALL_CT(axi_data_mst,             axi_data_mst_req_t,             axi_data_mst_rsp_t,             axi_data_mst_addr_t,             axi_data_mst_id_t,             axi_data_mst_data_t,             axi_data_mst_strb_t,             axi_data_mst_user_t)
 
@@ -611,7 +611,7 @@ typedef logic[0:0] axi_data_mst_user_t;
   typedef logic[31:0] axi_data_slv_addr_t;
 typedef logic[31:0] axi_data_slv_data_t;
 typedef logic[3:0] axi_data_slv_strb_t;
-typedef logic[3:0] axi_data_slv_id_t;
+typedef logic[5:0] axi_data_slv_id_t;
 typedef logic[0:0] axi_data_slv_user_t;
 `AXI_TYPEDEF_ALL_CT(axi_data_slv,             axi_data_slv_req_t,             axi_data_slv_rsp_t,             axi_data_slv_addr_t,             axi_data_slv_id_t,             axi_data_slv_data_t,             axi_data_slv_strb_t,             axi_data_slv_user_t)
 
@@ -621,8 +621,8 @@ typedef logic[0:0] axi_data_slv_user_t;
   localparam axi_cfg_t AxiCfg = '{    AddrWidth: 32,
     DataWidth: 32,
     UserWidth: 1,
-    InIdWidth: 4,
-    OutIdWidth: 2};
+    InIdWidth: 6,
+    OutIdWidth: 3};
 `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_data_slv, AxiCfg, hdr_t)
 
 `FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp)
diff --git a/hw/mesh/noc/floo_axi_mesh_2x2_noc.sv b/hw/mesh/noc/floo_axi_mesh_2x2_noc.sv
index a1a73f3..9a62e0c 100644
--- a/hw/mesh/noc/floo_axi_mesh_2x2_noc.sv
+++ b/hw/mesh/noc/floo_axi_mesh_2x2_noc.sv
@@ -71,7 +71,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{
   typedef logic[31:0] axi_data_mst_addr_t;
 typedef logic[31:0] axi_data_mst_data_t;
 typedef logic[3:0] axi_data_mst_strb_t;
-typedef logic[1:0] axi_data_mst_id_t;
+typedef logic[2:0] axi_data_mst_id_t;
 typedef logic[0:0] axi_data_mst_user_t;
 `AXI_TYPEDEF_ALL_CT(axi_data_mst,             axi_data_mst_req_t,             axi_data_mst_rsp_t,             axi_data_mst_addr_t,             axi_data_mst_id_t,             axi_data_mst_data_t,             axi_data_mst_strb_t,             axi_data_mst_user_t)
 
@@ -79,7 +79,7 @@ typedef logic[0:0] axi_data_mst_user_t;
   typedef logic[31:0] axi_data_slv_addr_t;
 typedef logic[31:0] axi_data_slv_data_t;
 typedef logic[3:0] axi_data_slv_strb_t;
-typedef logic[3:0] axi_data_slv_id_t;
+typedef logic[5:0] axi_data_slv_id_t;
 typedef logic[0:0] axi_data_slv_user_t;
 `AXI_TYPEDEF_ALL_CT(axi_data_slv,             axi_data_slv_req_t,             axi_data_slv_rsp_t,             axi_data_slv_addr_t,             axi_data_slv_id_t,             axi_data_slv_data_t,             axi_data_slv_strb_t,             axi_data_slv_user_t)
 
@@ -89,8 +89,8 @@ typedef logic[0:0] axi_data_slv_user_t;
   localparam axi_cfg_t AxiCfg = '{    AddrWidth: 32,
     DataWidth: 32,
     UserWidth: 1,
-    InIdWidth: 4,
-    OutIdWidth: 2};
+    InIdWidth: 6,
+    OutIdWidth: 3};
 `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_data_slv, AxiCfg, hdr_t)
 
 `FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp)
diff --git a/hw/mesh/noc/floo_axi_mesh_32x32_noc.sv b/hw/mesh/noc/floo_axi_mesh_32x32_noc.sv
index 05e14d1..be0bca2 100644
--- a/hw/mesh/noc/floo_axi_mesh_32x32_noc.sv
+++ b/hw/mesh/noc/floo_axi_mesh_32x32_noc.sv
@@ -2171,7 +2171,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{
   typedef logic[31:0] axi_data_mst_addr_t;
 typedef logic[31:0] axi_data_mst_data_t;
 typedef logic[3:0] axi_data_mst_strb_t;
-typedef logic[1:0] axi_data_mst_id_t;
+typedef logic[2:0] axi_data_mst_id_t;
 typedef logic[0:0] axi_data_mst_user_t;
 `AXI_TYPEDEF_ALL_CT(axi_data_mst,             axi_data_mst_req_t,             axi_data_mst_rsp_t,             axi_data_mst_addr_t,             axi_data_mst_id_t,             axi_data_mst_data_t,             axi_data_mst_strb_t,             axi_data_mst_user_t)
 
@@ -2179,7 +2179,7 @@ typedef logic[0:0] axi_data_mst_user_t;
   typedef logic[31:0] axi_data_slv_addr_t;
 typedef logic[31:0] axi_data_slv_data_t;
 typedef logic[3:0] axi_data_slv_strb_t;
-typedef logic[3:0] axi_data_slv_id_t;
+typedef logic[5:0] axi_data_slv_id_t;
 typedef logic[0:0] axi_data_slv_user_t;
 `AXI_TYPEDEF_ALL_CT(axi_data_slv,             axi_data_slv_req_t,             axi_data_slv_rsp_t,             axi_data_slv_addr_t,             axi_data_slv_id_t,             axi_data_slv_data_t,             axi_data_slv_strb_t,             axi_data_slv_user_t)
 
@@ -2189,8 +2189,8 @@ typedef logic[0:0] axi_data_slv_user_t;
   localparam axi_cfg_t AxiCfg = '{    AddrWidth: 32,
     DataWidth: 32,
     UserWidth: 1,
-    InIdWidth: 4,
-    OutIdWidth: 2};
+    InIdWidth: 6,
+    OutIdWidth: 3};
 `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_data_slv, AxiCfg, hdr_t)
 
 `FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp)
diff --git a/hw/mesh/noc/floo_axi_mesh_4x4_noc.sv b/hw/mesh/noc/floo_axi_mesh_4x4_noc.sv
index 38b4156..688d4e4 100644
--- a/hw/mesh/noc/floo_axi_mesh_4x4_noc.sv
+++ b/hw/mesh/noc/floo_axi_mesh_4x4_noc.sv
@@ -99,7 +99,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{
   typedef logic[31:0] axi_data_mst_addr_t;
 typedef logic[31:0] axi_data_mst_data_t;
 typedef logic[3:0] axi_data_mst_strb_t;
-typedef logic[1:0] axi_data_mst_id_t;
+typedef logic[2:0] axi_data_mst_id_t;
 typedef logic[0:0] axi_data_mst_user_t;
 `AXI_TYPEDEF_ALL_CT(axi_data_mst,             axi_data_mst_req_t,             axi_data_mst_rsp_t,             axi_data_mst_addr_t,             axi_data_mst_id_t,             axi_data_mst_data_t,             axi_data_mst_strb_t,             axi_data_mst_user_t)
 
@@ -107,7 +107,7 @@ typedef logic[0:0] axi_data_mst_user_t;
   typedef logic[31:0] axi_data_slv_addr_t;
 typedef logic[31:0] axi_data_slv_data_t;
 typedef logic[3:0] axi_data_slv_strb_t;
-typedef logic[3:0] axi_data_slv_id_t;
+typedef logic[5:0] axi_data_slv_id_t;
 typedef logic[0:0] axi_data_slv_user_t;
 `AXI_TYPEDEF_ALL_CT(axi_data_slv,             axi_data_slv_req_t,             axi_data_slv_rsp_t,             axi_data_slv_addr_t,             axi_data_slv_id_t,             axi_data_slv_data_t,             axi_data_slv_strb_t,             axi_data_slv_user_t)
 
@@ -117,8 +117,8 @@ typedef logic[0:0] axi_data_slv_user_t;
   localparam axi_cfg_t AxiCfg = '{    AddrWidth: 32,
     DataWidth: 32,
     UserWidth: 1,
-    InIdWidth: 4,
-    OutIdWidth: 2};
+    InIdWidth: 6,
+    OutIdWidth: 3};
 `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_data_slv, AxiCfg, hdr_t)
 
 `FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp)
diff --git a/hw/mesh/noc/floo_axi_mesh_8x8_noc.sv b/hw/mesh/noc/floo_axi_mesh_8x8_noc.sv
index 90483b5..5f7b3bf 100644
--- a/hw/mesh/noc/floo_axi_mesh_8x8_noc.sv
+++ b/hw/mesh/noc/floo_axi_mesh_8x8_noc.sv
@@ -203,7 +203,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{
   typedef logic[31:0] axi_data_mst_addr_t;
 typedef logic[31:0] axi_data_mst_data_t;
 typedef logic[3:0] axi_data_mst_strb_t;
-typedef logic[1:0] axi_data_mst_id_t;
+typedef logic[2:0] axi_data_mst_id_t;
 typedef logic[0:0] axi_data_mst_user_t;
 `AXI_TYPEDEF_ALL_CT(axi_data_mst,             axi_data_mst_req_t,             axi_data_mst_rsp_t,             axi_data_mst_addr_t,             axi_data_mst_id_t,             axi_data_mst_data_t,             axi_data_mst_strb_t,             axi_data_mst_user_t)
 
@@ -211,7 +211,7 @@ typedef logic[0:0] axi_data_mst_user_t;
   typedef logic[31:0] axi_data_slv_addr_t;
 typedef logic[31:0] axi_data_slv_data_t;
 typedef logic[3:0] axi_data_slv_strb_t;
-typedef logic[3:0] axi_data_slv_id_t;
+typedef logic[5:0] axi_data_slv_id_t;
 typedef logic[0:0] axi_data_slv_user_t;
 `AXI_TYPEDEF_ALL_CT(axi_data_slv,             axi_data_slv_req_t,             axi_data_slv_rsp_t,             axi_data_slv_addr_t,             axi_data_slv_id_t,             axi_data_slv_data_t,             axi_data_slv_strb_t,             axi_data_slv_user_t)
 
@@ -221,8 +221,8 @@ typedef logic[0:0] axi_data_slv_user_t;
   localparam axi_cfg_t AxiCfg = '{    AddrWidth: 32,
     DataWidth: 32,
     UserWidth: 1,
-    InIdWidth: 4,
-    OutIdWidth: 2};
+    InIdWidth: 6,
+    OutIdWidth: 3};
 `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_data_slv, AxiCfg, hdr_t)
 
 `FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp)
diff --git a/hw/mesh/noc_configs/floonoc_axi_mesh_16x16_config.yml b/hw/mesh/noc_configs/floonoc_axi_mesh_16x16_config.yml
index b85ac94..44e68a4 100644
--- a/hw/mesh/noc_configs/floonoc_axi_mesh_16x16_config.yml
+++ b/hw/mesh/noc_configs/floonoc_axi_mesh_16x16_config.yml
@@ -11,13 +11,13 @@ protocols:
     protocol: "AXI4"
     data_width: 32
     addr_width: 32
-    id_width: 2
+    id_width: 3
     user_width: 1
   - name: "data_slv"
     protocol: "AXI4"
     data_width: 32
     addr_width: 32
-    id_width: 4
+    id_width: 6
     user_width: 1
 
 endpoints:
diff --git a/hw/mesh/noc_configs/floonoc_axi_mesh_2x2_config.yml b/hw/mesh/noc_configs/floonoc_axi_mesh_2x2_config.yml
index 69c4628..ea375b2 100644
--- a/hw/mesh/noc_configs/floonoc_axi_mesh_2x2_config.yml
+++ b/hw/mesh/noc_configs/floonoc_axi_mesh_2x2_config.yml
@@ -11,13 +11,13 @@ protocols:
     protocol: "AXI4"
     data_width: 32
     addr_width: 32
-    id_width: 2
+    id_width: 3
     user_width: 1
   - name: "data_slv"
     protocol: "AXI4"
     data_width: 32
     addr_width: 32
-    id_width: 4
+    id_width: 6
     user_width: 1
 
 endpoints:
diff --git a/hw/mesh/noc_configs/floonoc_axi_mesh_32x32_config.yml b/hw/mesh/noc_configs/floonoc_axi_mesh_32x32_config.yml
index 44efa46..65bd166 100644
--- a/hw/mesh/noc_configs/floonoc_axi_mesh_32x32_config.yml
+++ b/hw/mesh/noc_configs/floonoc_axi_mesh_32x32_config.yml
@@ -11,13 +11,13 @@ protocols:
     protocol: "AXI4"
     data_width: 32
     addr_width: 32
-    id_width: 2
+    id_width: 3
     user_width: 1
   - name: "data_slv"
     protocol: "AXI4"
     data_width: 32
     addr_width: 32
-    id_width: 4
+    id_width: 6
     user_width: 1
 
 endpoints:
diff --git a/hw/mesh/noc_configs/floonoc_axi_mesh_4x4_config.yml b/hw/mesh/noc_configs/floonoc_axi_mesh_4x4_config.yml
index b13e59c..92aaef6 100644
--- a/hw/mesh/noc_configs/floonoc_axi_mesh_4x4_config.yml
+++ b/hw/mesh/noc_configs/floonoc_axi_mesh_4x4_config.yml
@@ -11,13 +11,13 @@ protocols:
     protocol: "AXI4"
     data_width: 32
     addr_width: 32
-    id_width: 2
+    id_width: 3
     user_width: 1
   - name: "data_slv"
     protocol: "AXI4"
     data_width: 32
     addr_width: 32
-    id_width: 4
+    id_width: 6
     user_width: 1
 
 endpoints:
diff --git a/hw/mesh/noc_configs/floonoc_axi_mesh_8x8_config.yml b/hw/mesh/noc_configs/floonoc_axi_mesh_8x8_config.yml
index 945921d..7246ee0 100644
--- a/hw/mesh/noc_configs/floonoc_axi_mesh_8x8_config.yml
+++ b/hw/mesh/noc_configs/floonoc_axi_mesh_8x8_config.yml
@@ -11,13 +11,13 @@ protocols:
     protocol: "AXI4"
     data_width: 32
     addr_width: 32
-    id_width: 2
+    id_width: 3
     user_width: 1
   - name: "data_slv"
     protocol: "AXI4"
     data_width: 32
     addr_width: 32
-    id_width: 4
+    id_width: 6
     user_width: 1
 
 endpoints:
diff --git a/hw/tile/converters/reqrsp2obi.sv b/hw/tile/converters/reqrsp2obi.sv
new file mode 100644
index 0000000..d604a08
--- /dev/null
+++ b/hw/tile/converters/reqrsp2obi.sv
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * Convert reqrsp to OBI protocol - Direct conversion without AXI intermediate step
+ */
+
+module reqrsp2obi
+  import reqrsp_pkg::*;
+#(
+  parameter obi_pkg::obi_cfg_t ObiCfg = obi_pkg::ObiDefaultConfig,  // not referenced inside this module
+  parameter type obi_req_t = logic,     // OBI request struct: req + address channel "a"
+  parameter type obi_rsp_t = logic,     // OBI response struct: gnt, rvalid + response channel "r"
+  parameter type reqrsp_req_t = logic,  // reqrsp request struct: q_valid + payload "q"
+  parameter type reqrsp_rsp_t = logic   // reqrsp response struct: q_ready, p_valid + payload "p"
+) (
+  input  logic        clk_i,   // not used: every path below is combinational
+  input  logic        rst_ni,  // not used: every path below is combinational
+
+  input  reqrsp_req_t reqrsp_req_i,
+  output reqrsp_rsp_t reqrsp_rsp_o,
+
+  output obi_req_t    obi_req_o,
+  input  obi_rsp_t    obi_rsp_i
+);
+
+  // Map a reqrsp AMO opcode to the 6-bit OBI atop code; unlisted opcodes (AMONone included) give 6'h00
+  function automatic logic [5:0] amo_to_atop(reqrsp_pkg::amo_op_e amo);
+    case (amo)
+      reqrsp_pkg::AMOAdd:   amo_to_atop = 6'h20;
+      reqrsp_pkg::AMOSwap:  amo_to_atop = 6'h21;
+      reqrsp_pkg::AMOLR:    amo_to_atop = 6'h22;
+      reqrsp_pkg::AMOSC:    amo_to_atop = 6'h23;
+      reqrsp_pkg::AMOXor:   amo_to_atop = 6'h24;
+      reqrsp_pkg::AMOOr:    amo_to_atop = 6'h28;
+      reqrsp_pkg::AMOAnd:   amo_to_atop = 6'h2C;
+      reqrsp_pkg::AMOMin:   amo_to_atop = 6'h30;
+      reqrsp_pkg::AMOMax:   amo_to_atop = 6'h34;
+      reqrsp_pkg::AMOMinu:  amo_to_atop = 6'h38;
+      reqrsp_pkg::AMOMaxu:  amo_to_atop = 6'h3C;
+      default:              amo_to_atop = 6'h00;
+    endcase
+  endfunction
+
+  /*******************************************************************/
+  /*                 reqrsp request → OBI A channel                  */
+  /*******************************************************************/
+
+  // Mandatory fields; any AMO (AMOLR included) sets we - NOTE(review): confirm against the OBI atop rules for LR
+  assign obi_req_o.req     = reqrsp_req_i.q_valid;
+  assign obi_req_o.a.we    = reqrsp_req_i.q.write || (reqrsp_req_i.q.amo != reqrsp_pkg::AMONone);
+  assign obi_req_o.a.addr  = reqrsp_req_i.q.addr;
+  assign obi_req_o.a.be    = reqrsp_req_i.q.strb;
+  assign obi_req_o.a.wdata = reqrsp_req_i.q.data;
+  assign obi_req_o.a.aid   = '0;
+
+  // Optional address-phase fields: atop carries the AMO code, all others are constants
+  assign obi_req_o.a.a_optional.atop    = amo_to_atop(reqrsp_req_i.q.amo);
+  assign obi_req_o.a.a_optional.auser   = '0;
+  assign obi_req_o.a.a_optional.wuser   = '0;
+  assign obi_req_o.a.a_optional.mid     = '0;
+  assign obi_req_o.a.a_optional.achk    = '0;
+  assign obi_req_o.a.a_optional.memtype = 2'b00;
+  assign obi_req_o.a.a_optional.prot    = 3'b000;
+  assign obi_req_o.a.a_optional.dbg     = 1'b0;
+
+  /*******************************************************************/
+  /*                 OBI R channel → reqrsp response                 */
+  /*******************************************************************/
+  assign reqrsp_rsp_o.q_ready = obi_rsp_i.gnt;
+  assign reqrsp_rsp_o.p_valid = obi_rsp_i.rvalid;
+  assign reqrsp_rsp_o.p.error = obi_rsp_i.r.err;
+  assign reqrsp_rsp_o.p.data  = obi_rsp_i.r.rdata;
+
+endmodule
+
diff --git a/hw/tile/converters/reqrsp64_to_obi32.sv b/hw/tile/converters/reqrsp64_to_obi32.sv
new file mode 100644
index 0000000..8ba2cb7
--- /dev/null
+++ b/hw/tile/converters/reqrsp64_to_obi32.sv
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ * 
+ * REQRSP 64-bit to OBI 32-bit Protocol Converter
+ * 
+ */
+
+module reqrsp64_to_obi32  // Narrows a 64-bit reqrsp port to a 32-bit OBI port, forwarding one 32-bit half per request
+  import obi_pkg::*;
+#(
+  parameter type reqrsp_req_t = logic,  // 64-bit reqrsp request type
+  parameter type reqrsp_rsp_t = logic,  // 64-bit reqrsp response type
+  parameter type obi_req_t    = logic,  // 32-bit OBI request type
+  parameter type obi_rsp_t    = logic,  // 32-bit OBI response type
+  parameter obi_pkg::obi_cfg_t ObiCfg = obi_pkg::ObiDefaultConfig  // not referenced in the module body
+)(
+  input  logic         clk_i,   // clocks the strobe-tracking register req_strb_q
+  input  logic         rst_ni,  // asynchronous active-low reset of req_strb_q
+  
+  // REQRSP side (64-bit from Snitch RV64)
+  input  reqrsp_req_t  reqrsp_req_i,
+  output reqrsp_rsp_t  reqrsp_rsp_o,
+  
+  // OBI side (32-bit to peripherals/L2)
+  output obi_req_t     obi_req_o,
+  input  obi_rsp_t     obi_rsp_i
+);
+
+  /*******************************************************************/
+  /*            REQRSP64 → OBI32 Request Conversion                  */
+  /*******************************************************************/
+  // Any strobe bit set in strb[7:4] selects the upper word; otherwise the lower word is forwarded
+  logic upper_word_req;
+  assign upper_word_req = |reqrsp_req_i.q.strb[7:4];
+  // NOTE(review): strobes in both halves (e.g. strb = 8'hFF) forward the upper half only - confirm this cannot occur
+  // Write data taken from the selected 32-bit half
+  logic [31:0] wdata_32bit;
+  assign wdata_32bit = upper_word_req ? reqrsp_req_i.q.data[63:32] : reqrsp_req_i.q.data[31:0];
+  
+  logic [3:0] be_32bit;  // byte enables of the selected half
+  assign be_32bit = upper_word_req ? reqrsp_req_i.q.strb[7:4] : reqrsp_req_i.q.strb[3:0];
+  
+  // OBI request: valid, address and write flag pass through unchanged; data/be come from the selected half
+  assign obi_req_o.req                  = reqrsp_req_i.q_valid;
+  assign obi_req_o.a.addr               = reqrsp_req_i.q.addr;
+  assign obi_req_o.a.we                 = reqrsp_req_i.q.write;
+  assign obi_req_o.a.wdata              = wdata_32bit;
+  assign obi_req_o.a.be                 = be_32bit;
+  assign obi_req_o.a.aid                = '0;
+  assign obi_req_o.a.a_optional.auser   = '0;
+  assign obi_req_o.a.a_optional.wuser   = '0;
+  assign obi_req_o.a.a_optional.atop    = 6'h00;  // atop tied to 0: reqrsp_req_i.q.amo is not read by this module
+  assign obi_req_o.a.a_optional.memtype = 2'b00;
+  assign obi_req_o.a.a_optional.mid     = '0;
+  assign obi_req_o.a.a_optional.prot    = 3'b000;
+  assign obi_req_o.a.a_optional.dbg     = 1'b0;
+  assign obi_req_o.a.a_optional.achk    = '0;
+  
+  /*******************************************************************/
+  /*            OBI32 → REQRSP64 Response Conversion                 */
+  /*******************************************************************/
+  
+  // Response handshaking: OBI gnt/rvalid drive reqrsp q_ready/p_valid directly
+  assign reqrsp_rsp_o.q_ready = obi_rsp_i.gnt;
+  assign reqrsp_rsp_o.p_valid = obi_rsp_i.rvalid;
+  
+  // Strobes of the most recently granted request, used to pick the response lane
+  logic [7:0] req_strb_q;
+  // NOTE(review): only the latest grant is remembered - confirm a response can never lag behind a newer grant
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      req_strb_q <= 8'h0;
+    end else if (reqrsp_req_i.q_valid && obi_rsp_i.gnt) begin  // request accepted this cycle
+      req_strb_q <= reqrsp_req_i.q.strb;
+    end
+  end
+  
+  // Lane selection mirrors the request side: any recorded strobe in [7:4] means the upper word
+  logic upper_word_access;
+  assign upper_word_access = |req_strb_q[7:4];
+  
+  assign reqrsp_rsp_o.p.data = upper_word_access ? 
+                                {obi_rsp_i.r.rdata, 32'h0000_0000} :  // rdata in [63:32], [31:0] zeroed
+                                {32'h0000_0000, obi_rsp_i.r.rdata};   // rdata in [31:0], [63:32] zeroed
+  
+  // Error flag forwarded unchanged
+  assign reqrsp_rsp_o.p.error = obi_rsp_i.r.err;
+
+endmodule : reqrsp64_to_obi32
diff --git a/hw/tile/converters/tcdm2hci.sv b/hw/tile/converters/tcdm2hci.sv
new file mode 100644
index 0000000..364e56a
--- /dev/null
+++ b/hw/tile/converters/tcdm2hci.sv
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ * 
+ * TCDM to HCI Protocol Converter (Direct - No Atomics)
+ */
+
+module tcdm2hci #(
+  parameter type tcdm_req_t = logic,  // TCDM request struct: q_valid + payload "q"
+  parameter type tcdm_rsp_t = logic,  // TCDM response struct: q_ready, p_valid + payload "p"
+  parameter type hci_req_t  = logic,  // HCI request struct
+  parameter type hci_rsp_t  = logic   // HCI response struct: gnt, r_valid, r_data
+)(
+  input  logic       clk_i,   // not used: the mapping is combinational
+  input  logic       rst_ni,  // not used: the mapping is combinational
+
+  // TCDM port: request in, response out (Spatz vector ports)
+  input  tcdm_req_t  tcdm_req_i,
+  output tcdm_rsp_t  tcdm_rsp_o,
+
+  // HCI port: request out, response in (L1 SPM interconnect)
+  output hci_req_t   hci_req_o,
+  input  hci_rsp_t   hci_rsp_i
+);
+
+  /*******************************************************************/
+  /*                    Request path: TCDM → HCI                     */
+  /*******************************************************************/
+
+  // Fields taken from the TCDM request
+  assign hci_req_o.req  = tcdm_req_i.q_valid;
+  assign hci_req_o.wen  = ~tcdm_req_i.q.write;  // driven with the complement of the TCDM write flag
+  assign hci_req_o.add  = tcdm_req_i.q.addr;
+  assign hci_req_o.be   = tcdm_req_i.q.strb;
+  assign hci_req_o.data = tcdm_req_i.q.data;
+  assign hci_req_o.user = tcdm_req_i.q.user;
+
+  // Fields without a TCDM source: tied to constants (r_ready is held high)
+  assign hci_req_o.r_ready  = 1'b1;
+  assign hci_req_o.id       = '0;
+  assign hci_req_o.ecc      = '0;
+  assign hci_req_o.ereq     = '0;
+  assign hci_req_o.r_eready = '0;
+
+  /*******************************************************************/
+  /*                    Response path: HCI → TCDM                    */
+  /*******************************************************************/
+
+  assign tcdm_rsp_o.p.data  = hci_rsp_i.r_data;
+  assign tcdm_rsp_o.p_valid = hci_rsp_i.r_valid;
+  assign tcdm_rsp_o.q_ready = hci_rsp_i.gnt;
+endmodule : tcdm2hci
diff --git a/hw/tile/converters/tcdm2hci_atomic.sv b/hw/tile/converters/tcdm2hci_atomic.sv
new file mode 100644
index 0000000..ce3aac2
--- /dev/null
+++ b/hw/tile/converters/tcdm2hci_atomic.sv
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * TCDM to HCI Converter with Atomic Operation Support
+ * 
+ * Conversion chain: TCDM (with AMO) → OBI (with atop) → OBI (resolved) → optional OBI cut → HCI
+ * 
+ */
+
+module tcdm2hci_atomic  // TCDM→HCI bridge: tcdm2obi, obi_atop_resolver, optional obi_cut, then OBI↔HCI adapters
+  import obi_pkg::*;
+  import magia_tile_pkg::*;
+#(
+  parameter type tcdm_req_t = logic,  // TCDM request struct (input side)
+  parameter type tcdm_rsp_t = logic,  // TCDM response struct (input side)
+  parameter type obi_req_t  = logic,  // OBI request struct shared by every internal OBI stage
+  parameter type obi_rsp_t  = logic,  // OBI response struct shared by every internal OBI stage
+  parameter type hci_req_t  = logic,  // HCI request struct (output side)
+  parameter type hci_rsp_t  = logic,  // HCI response struct (output side)
+  
+  // OBI channel types (passed to obi_cut only)
+  parameter type obi_a_chan_t = logic,
+  parameter type obi_r_chan_t = logic,
+  
+  // OBI optional-field types (passed to the atomic resolver's manager port)
+  parameter type obi_a_optional_t = logic,
+  parameter type obi_r_optional_t = logic,
+  
+  // OBI configurations: both go to the atomic resolver, MgrPortObiCfg also configures obi_cut
+  parameter obi_pkg::obi_cfg_t SbrPortObiCfg = obi_pkg::ObiDefaultConfig,
+  parameter obi_pkg::obi_cfg_t MgrPortObiCfg = obi_pkg::ObiDefaultConfig,
+  
+  // Pipeline cut control: 1 wires the resolver straight to the OBI→HCI stage, 0 instantiates obi_cut
+  parameter bit BypassCut = 1'b0
+)(
+  input  logic      clk_i,
+  input  logic      rst_ni,
+  
+  // TCDM slave port (with atomic operations)
+  input  tcdm_req_t tcdm_req_i,
+  output tcdm_rsp_t tcdm_rsp_o,
+  
+  // HCI master port (atomic-free)
+  output hci_req_t  hci_req_o,
+  input  hci_rsp_t  hci_rsp_i
+);
+
+  // OBI link between tcdm2obi and the atomic resolver
+  obi_req_t obi_req;
+  obi_rsp_t obi_rsp;
+  
+  // OBI link on the manager side of the atomic resolver
+  obi_req_t obi_resolved_req;
+  obi_rsp_t obi_resolved_rsp;
+  
+  // OBI link feeding the OBI↔HCI adapters (after obi_cut, or wired through when BypassCut is set)
+  obi_req_t obi_cut_req;
+  obi_rsp_t obi_cut_rsp;
+  
+  /*******************************************************************/
+  /*  Stage 1: TCDM → OBI (with atomic operations preserved)        */
+  /*******************************************************************/
+  
+  tcdm2obi #(
+    .tcdm_req_t ( tcdm_req_t ),
+    .tcdm_rsp_t ( tcdm_rsp_t ),
+    .obi_req_t  ( obi_req_t  ),
+    .obi_rsp_t  ( obi_rsp_t  )
+  ) i_tcdm2obi (
+    .clk_i      ( clk_i      ),
+    .rst_ni     ( rst_ni     ),
+    .tcdm_req_i ( tcdm_req_i ),
+    .tcdm_rsp_o ( tcdm_rsp_o ),
+    .obi_req_o  ( obi_req    ),
+    .obi_rsp_i  ( obi_rsp    )
+  );
+  
+  /*******************************************************************/
+  /*  Stage 2: OBI Atomic Resolution (atop → RMW sequences)         */
+  /*******************************************************************/
+  
+  obi_atop_resolver #(
+    .SbrPortObiCfg             ( SbrPortObiCfg              ),
+    .MgrPortObiCfg             ( MgrPortObiCfg              ),
+    .sbr_port_obi_req_t        ( obi_req_t                  ),  // same struct types on both resolver ports
+    .sbr_port_obi_rsp_t        ( obi_rsp_t                  ),
+    .mgr_port_obi_req_t        ( obi_req_t                  ),
+    .mgr_port_obi_rsp_t        ( obi_rsp_t                  ),
+    .mgr_port_obi_a_optional_t ( obi_a_optional_t           ),
+    .mgr_port_obi_r_optional_t ( obi_r_optional_t           ),
+    .LrScEnable                ( 1'b1                       ),  // hard-wired to 1 for every instance
+    .RegisterAmo               ( magia_tile_pkg::RegisterAmo)   // tile-wide setting from magia_tile_pkg
+  ) i_obi_atop_resolver (
+    .clk_i          ( clk_i            ),
+    .rst_ni         ( rst_ni           ),
+    .testmode_i     ( 1'b0             ),  // test mode input tied low
+    .sbr_port_req_i ( obi_req          ),
+    .sbr_port_rsp_o ( obi_rsp          ),
+    .mgr_port_req_o ( obi_resolved_req ),
+    .mgr_port_rsp_i ( obi_resolved_rsp )
+  );
+  
+  /*******************************************************************/
+  /*  Stage 3: OBI Cut (optional - controlled by BypassCut param)   */
+  /*******************************************************************/
+  
+  generate
+    if (BypassCut) begin : gen_bypass_cut  // no cut: request and response are wired straight through
+      assign obi_cut_req = obi_resolved_req;
+      assign obi_resolved_rsp = obi_cut_rsp;
+    end else begin : gen_enable_cut        // cut instantiated between resolver and OBI↔HCI adapters
+      obi_cut #(
+        .ObiCfg       ( MgrPortObiCfg ),
+        .obi_a_chan_t ( obi_a_chan_t  ),
+        .obi_r_chan_t ( obi_r_chan_t  ),
+        .obi_req_t    ( obi_req_t     ),
+        .obi_rsp_t    ( obi_rsp_t     )
+      ) i_obi_cut_post_atomic (
+        .clk_i           ( clk_i            ),
+        .rst_ni          ( rst_ni           ),
+        .sbr_port_req_i  ( obi_resolved_req ),
+        .sbr_port_rsp_o  ( obi_resolved_rsp ),
+        .mgr_port_req_o  ( obi_cut_req      ),
+        .mgr_port_rsp_i  ( obi_cut_rsp      )
+      );
+    end
+  endgenerate
+  
+  /*******************************************************************/
+  /*  Stage 4: OBI → HCI conversion (request and response)          */
+  /*******************************************************************/
+  
+  // OBI request → HCI request (instantiated without clock/reset connections)
+  obi2hci_req #(
+    .obi_req_t ( obi_req_t ),
+    .hci_req_t ( hci_req_t )
+  ) i_obi2hci_req (
+    .obi_req_i ( obi_cut_req ),
+    .hci_req_o ( hci_req_o   )
+  );
+  
+  // HCI response → OBI response (instantiated without clock/reset connections)
+  hci2obi_rsp #(
+    .hci_rsp_t ( hci_rsp_t ),
+    .obi_rsp_t ( obi_rsp_t )
+  ) i_hci2obi_rsp (
+    .hci_rsp_i ( hci_rsp_i   ),
+    .obi_rsp_o ( obi_cut_rsp )
+  );
+
+endmodule : tcdm2hci_atomic
diff --git a/hw/tile/converters/tcdm2obi.sv b/hw/tile/converters/tcdm2obi.sv
new file mode 100644
index 0000000..43fc5ce
--- /dev/null
+++ b/hw/tile/converters/tcdm2obi.sv
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51 
+ * (the "License"); you may not use this file except in compliance 
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ * 
+ * TCDM to OBI Protocol Converter - Combinatorial conversion
+ *
+ */
+
+module tcdm2obi
+  import reqrsp_pkg::*;
+#(
+  parameter type tcdm_req_t = logic,  // TCDM request struct: q_valid + payload "q"
+  parameter type tcdm_rsp_t = logic,  // TCDM response struct: q_ready, p_valid + payload "p"
+  parameter type obi_req_t  = logic,  // OBI request struct: req + address channel "a"
+  parameter type obi_rsp_t  = logic   // OBI response struct: gnt, rvalid + response channel "r"
+)(
+  input  logic       clk_i,   // not used: the conversion is combinational
+  input  logic       rst_ni,  // not used: the conversion is combinational
+
+  input  tcdm_req_t  tcdm_req_i,
+  output tcdm_rsp_t  tcdm_rsp_o,
+
+  output obi_req_t   obi_req_o,
+  input  obi_rsp_t   obi_rsp_i
+);
+
+  // TCDM AMO opcode → 6-bit OBI atop code; opcodes not listed (AMONone included) yield 6'h00
+  function automatic logic [5:0] amo_to_atop(reqrsp_pkg::amo_op_e amo);
+    case (amo)
+      reqrsp_pkg::AMOAdd:   return 6'h20;  // add
+      reqrsp_pkg::AMOSwap:  return 6'h21;  // swap
+      reqrsp_pkg::AMOLR:    return 6'h22;  // load-reserved
+      reqrsp_pkg::AMOSC:    return 6'h23;  // store-conditional
+      reqrsp_pkg::AMOXor:   return 6'h24;  // xor
+      reqrsp_pkg::AMOOr:    return 6'h28;  // or
+      reqrsp_pkg::AMOAnd:   return 6'h2C;  // and
+      reqrsp_pkg::AMOMin:   return 6'h30;  // signed minimum
+      reqrsp_pkg::AMOMax:   return 6'h34;  // signed maximum
+      reqrsp_pkg::AMOMinu:  return 6'h38;  // unsigned minimum
+      reqrsp_pkg::AMOMaxu:  return 6'h3C;  // unsigned maximum
+      default:              return 6'h00;  // no atomic operation
+    endcase
+  endfunction
+
+  logic is_amo;  // high for every opcode other than AMONone
+  assign is_amo = (tcdm_req_i.q.amo != reqrsp_pkg::AMONone);
+
+  // Mandatory fields; any AMO (AMOLR included) sets we - NOTE(review): confirm against the OBI atop rules for LR
+  assign obi_req_o.req     = tcdm_req_i.q_valid;
+  assign obi_req_o.a.we    = tcdm_req_i.q.write || is_amo;
+  assign obi_req_o.a.addr  = tcdm_req_i.q.addr;
+  assign obi_req_o.a.be    = tcdm_req_i.q.strb;
+  assign obi_req_o.a.wdata = tcdm_req_i.q.data;
+  assign obi_req_o.a.aid   = '0;
+
+  // Request: optional address-phase fields (only atop is live, the rest are constants)
+  assign obi_req_o.a.a_optional.atop    = amo_to_atop(tcdm_req_i.q.amo);
+  assign obi_req_o.a.a_optional.auser   = '0;
+  assign obi_req_o.a.a_optional.wuser   = '0;
+  assign obi_req_o.a.a_optional.mid     = '0;
+  assign obi_req_o.a.a_optional.achk    = '0;
+  assign obi_req_o.a.a_optional.memtype = 2'b00;
+  assign obi_req_o.a.a_optional.prot    = 3'b000;
+  assign obi_req_o.a.a_optional.dbg     = 1'b0;
+
+  // Response: OBI gnt/rvalid/rdata mapped back onto the TCDM response struct
+  assign tcdm_rsp_o.p.data  = obi_rsp_i.r.rdata;
+  assign tcdm_rsp_o.p_valid = obi_rsp_i.rvalid;
+  assign tcdm_rsp_o.q_ready = obi_rsp_i.gnt;
+endmodule : tcdm2obi
diff --git a/hw/tile/converters/tcdm64_to_dual_hci32.sv b/hw/tile/converters/tcdm64_to_dual_hci32.sv
new file mode 100644
index 0000000..9a355e4
--- /dev/null
+++ b/hw/tile/converters/tcdm64_to_dual_hci32.sv
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ * 
+ * TCDM 64-bit to Dual HCI 32-bit Protocol Converter
+ *
+ */
+
+module tcdm64_to_dual_hci32 #(  // Chains tcdm64_to_dual_tcdm32 with one tcdm2hci per 32-bit half
+  parameter type tcdm64_req_t = logic,  // 64-bit TCDM request type
+  parameter type tcdm64_rsp_t = logic,  // 64-bit TCDM response type
+  parameter type tcdm32_req_t = logic,  // 32-bit TCDM request type
+  parameter type tcdm32_rsp_t = logic,  // 32-bit TCDM response type
+  parameter type hci_req_t    = logic,  // 32-bit HCI request type
+  parameter type hci_rsp_t    = logic   // 32-bit HCI response type
+)(
+  input  logic         clk_i,
+  input  logic         rst_ni,
+  
+  // TCDM side (Spatz 64-bit vector port)
+  input  tcdm64_req_t  tcdm_req_i,
+  output tcdm64_rsp_t  tcdm_rsp_o,
+  
+  // HCI side (Two 32-bit L1 SPM interconnect ports)
+  output hci_req_t   hci_req_lo_o,  // Lower 32-bit word (bits [31:0])
+  input  hci_rsp_t   hci_rsp_lo_i,
+  output hci_req_t   hci_req_hi_o,  // Upper 32-bit word (bits [63:32])
+  input  hci_rsp_t   hci_rsp_hi_i
+);
+
+  // 32-bit TCDM links between the splitter and the two tcdm2hci instances
+  tcdm32_req_t tcdm32_req_lo, tcdm32_req_hi;
+  tcdm32_rsp_t tcdm32_rsp_lo, tcdm32_rsp_hi;
+
+  /*******************************************************************/
+  /*    Step 1: TCDM64 → 2× TCDM32 split (tcdm64_to_dual_tcdm32)    */
+  /*******************************************************************/
+  
+  tcdm64_to_dual_tcdm32 #(
+    .tcdm64_req_t ( tcdm64_req_t  ),
+    .tcdm64_rsp_t ( tcdm64_rsp_t  ),
+    .tcdm32_req_t ( tcdm32_req_t  ),
+    .tcdm32_rsp_t ( tcdm32_rsp_t  )
+  ) i_tcdm64_to_dual_tcdm32 (
+    .clk_i          ( clk_i          ),
+    .rst_ni         ( rst_ni         ),
+    .tcdm_req_i     ( tcdm_req_i     ),
+    .tcdm_rsp_o     ( tcdm_rsp_o     ),
+    .tcdm_req_lo_o  ( tcdm32_req_lo  ),
+    .tcdm_rsp_lo_i  ( tcdm32_rsp_lo  ),
+    .tcdm_req_hi_o  ( tcdm32_req_hi  ),
+    .tcdm_rsp_hi_i  ( tcdm32_rsp_hi  )
+  );
+
+  /*******************************************************************/
+  /*           Step 2: TCDM32 → HCI32 (2× tcdm2hci)                 */
+  /*******************************************************************/
+  
+  // Lower half: splitter "lo" port → hci_req_lo_o / hci_rsp_lo_i
+  tcdm2hci #(
+    .tcdm_req_t ( tcdm32_req_t ),
+    .tcdm_rsp_t ( tcdm32_rsp_t ),
+    .hci_req_t  ( hci_req_t    ),
+    .hci_rsp_t  ( hci_rsp_t    )
+  ) i_tcdm2hci_lo (
+    .clk_i      ( clk_i          ),
+    .rst_ni     ( rst_ni         ),
+    .tcdm_req_i ( tcdm32_req_lo  ),
+    .tcdm_rsp_o ( tcdm32_rsp_lo  ),
+    .hci_req_o  ( hci_req_lo_o   ),
+    .hci_rsp_i  ( hci_rsp_lo_i   )
+  );
+
+  // Upper half: splitter "hi" port → hci_req_hi_o / hci_rsp_hi_i
+  tcdm2hci #(
+    .tcdm_req_t ( tcdm32_req_t ),
+    .tcdm_rsp_t ( tcdm32_rsp_t ),
+    .hci_req_t  ( hci_req_t    ),
+    .hci_rsp_t  ( hci_rsp_t    )
+  ) i_tcdm2hci_hi (
+    .clk_i      ( clk_i          ),
+    .rst_ni     ( rst_ni         ),
+    .tcdm_req_i ( tcdm32_req_hi  ),
+    .tcdm_rsp_o ( tcdm32_rsp_hi  ),
+    .hci_req_o  ( hci_req_hi_o   ),
+    .hci_rsp_i  ( hci_rsp_hi_i   )
+  );
+
+endmodule : tcdm64_to_dual_hci32
diff --git a/hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv b/hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv
new file mode 100644
index 0000000..452b509
--- /dev/null
+++ b/hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ * 
+ * TCDM 64-bit to Dual HCI 32-bit Protocol Converter with Atomic Support
+ *
+ */
+
+module tcdm64_to_dual_hci32_atomic  // Chains tcdm64_to_dual_tcdm32 with one tcdm2hci_atomic per 32-bit half
+  import obi_pkg::*;
+#(
+  parameter type tcdm64_req_t = logic,  // 64-bit TCDM request type (with AMO)
+  parameter type tcdm64_rsp_t = logic,  // 64-bit TCDM response type
+  parameter type tcdm32_req_t = logic,  // 32-bit TCDM request type (with AMO)
+  parameter type tcdm32_rsp_t = logic,  // 32-bit TCDM response type
+  parameter type obi32_req_t  = logic,  // 32-bit OBI request type
+  parameter type obi32_rsp_t  = logic,  // 32-bit OBI response type
+  parameter type obi32_a_chan_t = logic, // 32-bit OBI address channel type
+  parameter type obi32_r_chan_t = logic, // 32-bit OBI response channel type
+  parameter type hci_req_t    = logic,  // 32-bit HCI request type
+  parameter type hci_rsp_t    = logic,  // 32-bit HCI response type
+  
+  // OBI optional-field types, forwarded unchanged to both tcdm2hci_atomic instances
+  parameter type obi32_a_optional_t = logic,
+  parameter type obi32_r_optional_t = logic,
+  
+  // OBI configurations and cut bypass, forwarded unchanged to both tcdm2hci_atomic instances
+  parameter obi_pkg::obi_cfg_t SbrPortObiCfg = obi_pkg::ObiDefaultConfig,
+  parameter obi_pkg::obi_cfg_t MgrPortObiCfg = obi_pkg::ObiDefaultConfig,
+  parameter bit BypassCut = 1'b0
+)(
+  input  logic       clk_i,
+  input  logic       rst_ni,
+  
+  // TCDM side (Snitch RV64 port with AMO)
+  input  tcdm64_req_t  tcdm_req_i,
+  output tcdm64_rsp_t  tcdm_rsp_o,
+  
+  // HCI side (Two 32-bit L1 SPM interconnect ports)
+  output hci_req_t   hci_req_lo_o,  // Lower 32-bit word (bits [31:0])
+  input  hci_rsp_t   hci_rsp_lo_i,
+  output hci_req_t   hci_req_hi_o,  // Upper 32-bit word (bits [63:32])
+  input  hci_rsp_t   hci_rsp_hi_i
+);
+
+  // 32-bit TCDM links between the splitter and the two tcdm2hci_atomic instances
+  tcdm32_req_t tcdm32_req_lo, tcdm32_req_hi;
+  tcdm32_rsp_t tcdm32_rsp_lo, tcdm32_rsp_hi;
+  
+  /*******************************************************************/
+  /*    Step 1: TCDM64 → 2× TCDM32 split (tcdm64_to_dual_tcdm32)    */
+  /*******************************************************************/
+  
+  tcdm64_to_dual_tcdm32 #(
+    .tcdm64_req_t ( tcdm64_req_t  ),
+    .tcdm64_rsp_t ( tcdm64_rsp_t  ),
+    .tcdm32_req_t ( tcdm32_req_t  ),
+    .tcdm32_rsp_t ( tcdm32_rsp_t  )
+  ) i_tcdm64_to_dual_tcdm32 (
+    .clk_i          ( clk_i          ),
+    .rst_ni         ( rst_ni         ),
+    .tcdm_req_i     ( tcdm_req_i     ),
+    .tcdm_rsp_o     ( tcdm_rsp_o     ),
+    .tcdm_req_lo_o  ( tcdm32_req_lo  ),
+    .tcdm_rsp_lo_i  ( tcdm32_rsp_lo  ),
+    .tcdm_req_hi_o  ( tcdm32_req_hi  ),
+    .tcdm_rsp_hi_i  ( tcdm32_rsp_hi  )
+  );
+  
+  /*******************************************************************/
+  /*      Step 2: TCDM32 → HCI32 with atomics (2× tcdm2hci_atomic)  */
+  /*******************************************************************/
+  
+  // Lower half: splitter "lo" port → hci_req_lo_o / hci_rsp_lo_i
+  tcdm2hci_atomic #(
+    .tcdm_req_t       ( tcdm32_req_t        ),
+    .tcdm_rsp_t       ( tcdm32_rsp_t        ),
+    .obi_req_t        ( obi32_req_t         ),
+    .obi_rsp_t        ( obi32_rsp_t         ),
+    .obi_a_chan_t     ( obi32_a_chan_t      ),
+    .obi_r_chan_t     ( obi32_r_chan_t      ),
+    .hci_req_t        ( hci_req_t           ),
+    .hci_rsp_t        ( hci_rsp_t           ),
+    .obi_a_optional_t ( obi32_a_optional_t  ),
+    .obi_r_optional_t ( obi32_r_optional_t  ),
+    .SbrPortObiCfg    ( SbrPortObiCfg       ),
+    .MgrPortObiCfg    ( MgrPortObiCfg       ),
+    .BypassCut        ( BypassCut           )
+  ) i_tcdm2hci_atomic_lo (
+    .clk_i       ( clk_i           ),
+    .rst_ni      ( rst_ni          ),
+    .tcdm_req_i  ( tcdm32_req_lo   ),
+    .tcdm_rsp_o  ( tcdm32_rsp_lo   ),
+    .hci_req_o   ( hci_req_lo_o    ),
+    .hci_rsp_i   ( hci_rsp_lo_i    )
+  );
+  
+  // Upper half: splitter "hi" port → hci_req_hi_o / hci_rsp_hi_i (same parameters as the lower half)
+  tcdm2hci_atomic #(
+    .tcdm_req_t       ( tcdm32_req_t        ),
+    .tcdm_rsp_t       ( tcdm32_rsp_t        ),
+    .obi_req_t        ( obi32_req_t         ),
+    .obi_rsp_t        ( obi32_rsp_t         ),
+    .obi_a_chan_t     ( obi32_a_chan_t      ),
+    .obi_r_chan_t     ( obi32_r_chan_t      ),
+    .hci_req_t        ( hci_req_t           ),
+    .hci_rsp_t        ( hci_rsp_t           ),
+    .obi_a_optional_t ( obi32_a_optional_t  ),
+    .obi_r_optional_t ( obi32_r_optional_t  ),
+    .SbrPortObiCfg    ( SbrPortObiCfg       ),
+    .MgrPortObiCfg    ( MgrPortObiCfg       ),
+    .BypassCut        ( BypassCut           )
+  ) i_tcdm2hci_atomic_hi (
+    .clk_i       ( clk_i           ),
+    .rst_ni      ( rst_ni          ),
+    .tcdm_req_i  ( tcdm32_req_hi   ),
+    .tcdm_rsp_o  ( tcdm32_rsp_hi   ),
+    .hci_req_o   ( hci_req_hi_o    ),
+    .hci_rsp_i   ( hci_rsp_hi_i    )
+  );
+
+endmodule : tcdm64_to_dual_hci32_atomic
diff --git a/hw/tile/converters/tcdm64_to_dual_tcdm32.sv b/hw/tile/converters/tcdm64_to_dual_tcdm32.sv
new file mode 100644
index 0000000..f51e422
--- /dev/null
+++ b/hw/tile/converters/tcdm64_to_dual_tcdm32.sv
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ * 
+ * TCDM 64-bit to Dual TCDM 32-bit Protocol Converter
+ *
+ */
+
+module tcdm64_to_dual_tcdm32 #(
+  parameter type tcdm64_req_t = logic,  // 64-bit TCDM request type (q_valid, q.{addr,write,amo,data,strb,user})
+  parameter type tcdm64_rsp_t = logic,  // 64-bit TCDM response type (q_ready, p_valid, p.data)
+  parameter type tcdm32_req_t = logic,  // 32-bit TCDM request type (same fields, 32-bit data / 4-bit strb)
+  parameter type tcdm32_rsp_t = logic   // 32-bit TCDM response type (q_ready, p_valid, p.data)
+)(
+  input  logic         clk_i,
+  input  logic         rst_ni,
+  
+  // TCDM 64-bit side (Snitch data port)
+  input  tcdm64_req_t  tcdm_req_i,
+  output tcdm64_rsp_t  tcdm_rsp_o,
+  
+  // TCDM 32-bit side (Two parallel TCDM ports)
+  output tcdm32_req_t  tcdm_req_lo_o,  // Lower 32-bit word (bits [31:0])
+  input  tcdm32_rsp_t  tcdm_rsp_lo_i,
+  output tcdm32_req_t  tcdm_req_hi_o,  // Upper 32-bit word (bits [63:32])
+  input  tcdm32_rsp_t  tcdm_rsp_hi_i
+);
+  // Fans a 64-bit TCDM request out to two 32-bit ports and merges the two 32-bit responses.
+  /*******************************************************************/
+  /*   TCDM64 → Dual TCDM32: Minimal State (2 Registers Only)       */
+  /*******************************************************************/
+  // State: access_lo_q/access_hi_q remember which banks the last accepted request targeted.
+  // Request analysis signals
+  logic strb_is_zero;
+  logic is_load;
+  logic access_lo, access_hi;
+  
+  // Decode request type
+  assign strb_is_zero = (tcdm_req_i.q.strb == 8'h00);
+  assign is_load = !tcdm_req_i.q.write;
+  
+  // Determine which banks to access:
+  // - Normal operation: access banks based on strobe bits
+  // - Load with strb=0: access BOTH banks (full 64-bit read)
+  // - Store with strb=0: access NO banks (no-op)
+  assign access_lo = |tcdm_req_i.q.strb[3:0] || (strb_is_zero && is_load);
+  assign access_hi = |tcdm_req_i.q.strb[7:4] || (strb_is_zero && is_load);
+  
+  logic access_lo_q, access_hi_q;
+  
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      access_lo_q <= 1'b0;
+      access_hi_q <= 1'b0;
+    end else if (tcdm_req_i.q_valid && tcdm_rsp_o.q_ready) begin
+      // Latch access pattern for all requests
+      // For loads with strb=0, this will be both banks (full read)
+      access_lo_q <= access_lo;
+      access_hi_q <= access_hi;
+    end
+  end
+  
+  // Response is signalled when every bank flagged in access_*_q asserts p_valid in the same cycle
+  logic rsp_arriving;
+  always_comb begin
+    rsp_arriving = 1'b0;
+    // With neither flag set (e.g. after a strb=0 store) no response is ever signalled
+    if (access_lo_q && access_hi_q) begin
+      rsp_arriving = tcdm_rsp_lo_i.p_valid && tcdm_rsp_hi_i.p_valid;
+    end else if (access_lo_q) begin
+      rsp_arriving = tcdm_rsp_lo_i.p_valid;
+    end else if (access_hi_q) begin
+      rsp_arriving = tcdm_rsp_hi_i.p_valid;
+    end
+  end
+  
+  // Address generation for dual banks
+  logic [31:0] addr_base;
+  logic [31:0] addr_lo, addr_hi;
+  
+  assign addr_base = {tcdm_req_i.q.addr[31:3], 3'b000};  // Align to 8-byte boundary
+  assign addr_lo = addr_base;                             // Lower 32-bit word [31:0]
+  assign addr_hi = addr_base + 32'd4;                     // Upper 32-bit word [63:32]
+  
+  /*******************************************************************/
+  /*                  TCDM32 Request Outputs                         */
+  /*******************************************************************/
+  
+  // Lower bank (LO) request generation: idle defaults, overridden when the low word is targeted
+  always_comb begin
+    tcdm_req_lo_o.q_valid = 1'b0;
+    tcdm_req_lo_o.q.addr  = '0;
+    tcdm_req_lo_o.q.write = 1'b0;
+    tcdm_req_lo_o.q.amo   = reqrsp_pkg::AMONone;
+    tcdm_req_lo_o.q.data  = '0;
+    tcdm_req_lo_o.q.strb  = '0;
+    tcdm_req_lo_o.q.user  = '0;
+    
+    if (access_lo) begin
+      tcdm_req_lo_o.q_valid = tcdm_req_i.q_valid;
+      tcdm_req_lo_o.q.addr  = addr_lo;
+      tcdm_req_lo_o.q.write = tcdm_req_i.q.write;
+      tcdm_req_lo_o.q.amo   = tcdm_req_i.q.amo;
+      tcdm_req_lo_o.q.data  = tcdm_req_i.q.data[31:0];
+      tcdm_req_lo_o.q.user  = tcdm_req_i.q.user;
+      
+      // Strobe handling: load with strb=0 reads full 32-bit word
+      if (strb_is_zero && is_load)
+        tcdm_req_lo_o.q.strb = 4'hF;
+      else
+        tcdm_req_lo_o.q.strb = tcdm_req_i.q.strb[3:0];
+    end
+  end
+  
+  // Upper bank (HI) request generation: idle defaults, overridden when the high word is targeted
+  always_comb begin
+    tcdm_req_hi_o.q_valid = 1'b0;
+    tcdm_req_hi_o.q.addr  = '0;
+    tcdm_req_hi_o.q.write = 1'b0;
+    tcdm_req_hi_o.q.amo   = reqrsp_pkg::AMONone;
+    tcdm_req_hi_o.q.data  = '0;
+    tcdm_req_hi_o.q.strb  = '0;
+    tcdm_req_hi_o.q.user  = '0;
+    
+    if (access_hi) begin
+      tcdm_req_hi_o.q_valid = tcdm_req_i.q_valid;
+      tcdm_req_hi_o.q.addr  = addr_hi;
+      tcdm_req_hi_o.q.write = tcdm_req_i.q.write;
+      tcdm_req_hi_o.q.amo   = tcdm_req_i.q.amo;
+      tcdm_req_hi_o.q.data  = tcdm_req_i.q.data[63:32];
+      tcdm_req_hi_o.q.user  = tcdm_req_i.q.user;
+      
+      // Strobe handling: load with strb=0 reads full 32-bit word
+      if (strb_is_zero && is_load)
+        tcdm_req_hi_o.q.strb = 4'hF;
+      else
+        tcdm_req_hi_o.q.strb = tcdm_req_i.q.strb[7:4];
+    end
+  end
+  
+  /*******************************************************************/
+  /*              TCDM64 Response & Grant                            */
+  /*******************************************************************/
+  // Ready signal: accept the 64-bit request only when every targeted bank is ready.
+  // NOTE(review): a bank that is ready alone still sees q_valid and may take the request twice - TODO confirm.
+  always_comb begin
+    tcdm_rsp_o.q_ready = 1'b0;
+    
+    if (access_lo && access_hi) begin
+      // Both banks accessed: need both ready
+      tcdm_rsp_o.q_ready = tcdm_rsp_lo_i.q_ready && tcdm_rsp_hi_i.q_ready;
+    end else if (access_lo) begin
+      // Only LO bank accessed
+      tcdm_rsp_o.q_ready = tcdm_rsp_lo_i.q_ready;
+    end else if (access_hi) begin
+      // Only HI bank accessed
+      tcdm_rsp_o.q_ready = tcdm_rsp_hi_i.q_ready;
+    end else if (strb_is_zero && !is_load) begin
+      // Store with strb=0: no memory access needed, accept immediately
+      tcdm_rsp_o.q_ready = 1'b1;
+    end
+  end
+  
+  // Response valid: asserted when all accessed banks have responded
+  assign tcdm_rsp_o.p_valid = rsp_arriving;
+  
+  // Response data: recombine 32-bit responses into 64-bit word
+  // Words of banks not flagged in access_*_q read as zero; skewed bank responses are not buffered
+  assign tcdm_rsp_o.p.data = {
+    access_hi_q ? tcdm_rsp_hi_i.p.data : 32'h0,
+    access_lo_q ? tcdm_rsp_lo_i.p.data : 32'h0
+  };
+
+endmodule : tcdm64_to_dual_tcdm32
diff --git a/hw/tile/eu_direct_cut.sv b/hw/tile/eu_direct_cut.sv
new file mode 100644
index 0000000..b962e8b
--- /dev/null
+++ b/hw/tile/eu_direct_cut.sv
@@ -0,0 +1,105 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51 
+ * (the "License"); you may not use this file except in compliance 
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ * 
+ * EU Direct Link Cut
+ */
+
+module eu_direct_cut
+  import magia_tile_pkg::*;
+#(
+  parameter type eu_direct_req_t = logic,   // fields used here: req, addr, wen, wdata, be
+  parameter type eu_direct_rsp_t = logic,   // fields used here: gnt, rvalid, rdata, err
+  parameter bit  Bypass          = 1'b0,    // default for both per-direction switches below
+  parameter bit  BypassReq       = Bypass,  // forwarded to the request spill register
+  parameter bit  BypassRsp       = Bypass   // forwarded to the response spill register
+)(
+  input  logic           clk_i,
+  input  logic           rst_ni,
+  // Subordinate side: request in, response out
+  input  eu_direct_req_t sbr_req_i,
+  output eu_direct_rsp_t sbr_rsp_o,
+  // Manager side: request out, response in
+  output eu_direct_req_t mgr_req_o,
+  input  eu_direct_rsp_t mgr_rsp_i
+);
+
+  // Payload structs hold everything except the handshake bits (req/gnt, rvalid), which go to valid/ready
+  typedef struct packed {
+    logic[magia_pkg::ADDR_W-1:0] addr;
+    logic                        wen;
+    logic[magia_pkg::DATA_W-1:0] wdata;
+    logic[3:0]                   be;
+  } eu_req_payload_t;
+
+  typedef struct packed {
+    logic[magia_pkg::DATA_W-1:0] rdata;
+    logic                        err;
+  } eu_rsp_payload_t;
+
+  eu_req_payload_t req_pld_in, req_pld_out;
+  eu_rsp_payload_t rsp_pld_in, rsp_pld_out;
+
+  // Request direction: pack at the subordinate port, unpack at the manager port
+  assign req_pld_in = '{
+    addr:  sbr_req_i.addr,
+    wen:   sbr_req_i.wen,
+    wdata: sbr_req_i.wdata,
+    be:    sbr_req_i.be
+  };
+  assign mgr_req_o.addr  = req_pld_out.addr;
+  assign mgr_req_o.wen   = req_pld_out.wen;
+  assign mgr_req_o.wdata = req_pld_out.wdata;
+  assign mgr_req_o.be    = req_pld_out.be;
+
+  spill_register #(
+    .T      ( eu_req_payload_t ),
+    .Bypass ( BypassReq        )
+  ) i_req_spill (
+    .clk_i   ( clk_i         ),
+    .rst_ni  ( rst_ni        ),
+    .valid_i ( sbr_req_i.req ),
+    .ready_o ( sbr_rsp_o.gnt ),
+    .data_i  ( req_pld_in    ),
+    .valid_o ( mgr_req_o.req ),
+    .ready_i ( mgr_rsp_i.gnt ),
+    .data_o  ( req_pld_out   )
+  );
+
+  // Response direction: pack at the manager port, unpack at the subordinate port
+  assign rsp_pld_in = '{
+    rdata: mgr_rsp_i.rdata,
+    err:   mgr_rsp_i.err
+  };
+  assign sbr_rsp_o.rdata = rsp_pld_out.rdata;
+  assign sbr_rsp_o.err   = rsp_pld_out.err;
+
+  // No back-pressure on responses: ready_i is tied high and ready_o is left open
+  spill_register #(
+    .T      ( eu_rsp_payload_t ),
+    .Bypass ( BypassRsp        )
+  ) i_rsp_spill (
+    .clk_i   ( clk_i            ),
+    .rst_ni  ( rst_ni           ),
+    .valid_i ( mgr_rsp_i.rvalid ),
+    .ready_o (                  ),
+    .data_i  ( rsp_pld_in       ),
+    .valid_o ( sbr_rsp_o.rvalid ),
+    .ready_i ( 1'b1             ),
+    .data_o  ( rsp_pld_out      )
+  );
+endmodule : eu_direct_cut
diff --git a/hw/tile/magia_event_unit.sv b/hw/tile/magia_event_unit.sv
index 32050a2..76e6b37 100644
--- a/hw/tile/magia_event_unit.sv
+++ b/hw/tile/magia_event_unit.sv
@@ -25,10 +25,10 @@ import magia_tile_pkg::*;
   // MAGIA Event Unit Parameters - Optimized for single-core system
   parameter int unsigned NB_CORES = 1,              // Single core system
   parameter int unsigned NB_SW_EVT = 1,             // Minimal SW events for basic functionality
-  parameter int unsigned NB_BARR  = 0,              // Barrier units disabled (no sync needed)
-  parameter int unsigned NB_HW_MUT = 0,             // Hardware mutexes disabled (no contention)
+  parameter int unsigned NB_BARR  = 2,              // Minimum 2 barriers (workaround: $clog2(1)=0 causes issues)
+  parameter int unsigned NB_HW_MUT = 2,             // Minimum 2 mutexes (workaround: $clog2(1)=0 causes issues)
   parameter int unsigned MUTEX_MSG_W = 32,          // Mutex message width (unused but kept for compatibility)
-  parameter int unsigned DISP_FIFO_DEPTH = 0,       // Task dispatcher disabled (no distribution)
+  parameter int unsigned DISP_FIFO_DEPTH = 1,       // Minimal dispatcher FIFO (workaround for synthesis)
   parameter int unsigned EVNT_WIDTH = 8,            // SOC event width (external events)
   parameter int unsigned SOC_FIFO_DEPTH = 8         // SOC event FIFO depth (external events)
 )
@@ -67,12 +67,16 @@ import magia_tile_pkg::*;
   output logic                      eu_direct_gnt_o,
   output logic                      eu_direct_rvalid_o,
   output logic [31:0]               eu_direct_rdata_o,
-  output logic                      eu_direct_err_o
+  output logic                      eu_direct_err_o,
+
+   // OBI slave connection
+  input  core_obi_data_req_t        obi_req_i,
+  output core_obi_data_rsp_t        obi_rsp_o
 );
 
   // Create internal interface instances
   XBAR_PERIPH_BUS #(.ID_WIDTH(NB_CORES+1)) eu_direct_link[NB_CORES-1:0]();
-  XBAR_PERIPH_BUS #(.ID_WIDTH(NB_CORES+1)) speriph_slave();  // Tied off
+  XBAR_PERIPH_BUS #(.ID_WIDTH(NB_CORES+1)) speriph_slave();
 
   // Internal signals
   logic soc_periph_evt_ready_internal;
@@ -93,15 +97,29 @@ import magia_tile_pkg::*;
   assign eu_direct_rdata_o  = eu_direct_link[0].r_rdata;
   assign eu_direct_err_o    = eu_direct_link[0].r_opc;  // r_opc: 0=OK, 1=ERROR
 
-  // Tie off speriph_slave (not used anymore)
-  assign speriph_slave.req   = 1'b0;
-  assign speriph_slave.add   = '0;
-  assign speriph_slave.wen   = 1'b1;
-  assign speriph_slave.wdata = '0;
-  assign speriph_slave.be    = '0;
-  assign speriph_slave.id    = '0;
-
+  // Address range check (both bounds inclusive) and offset of the access from the Event Unit base
+  localparam logic [magia_pkg::ADDR_W-1:0] EU_BASE_ADDR = magia_tile_pkg::EVENT_UNIT_ADDR_START;
+  logic addr_in_range;
+  logic [magia_pkg::ADDR_W-1:0] addr_offset;
+  // NOTE(review): EVENT_UNIT_ADDR_END is inclusive here - confirm it matches the OBI xbar rule's end_addr convention
+  assign addr_in_range = (obi_req_i.a.addr >= magia_tile_pkg::EVENT_UNIT_ADDR_START) && 
+                         (obi_req_i.a.addr <= magia_tile_pkg::EVENT_UNIT_ADDR_END);
+  assign addr_offset   = obi_req_i.a.addr - EU_BASE_ADDR;
+  // Out-of-range requests are masked; NOTE(review): they would presumably never see gnt - confirm only in-range addresses are routed here
+  // OBI to XBAR_PERIPH_BUS conversion - pass RELATIVE address (offset from base)
+  assign speriph_slave.req   = obi_req_i.req && addr_in_range;
+  assign speriph_slave.add   = addr_offset;           // offset relative to EU_BASE_ADDR
+  assign speriph_slave.wen   = ~obi_req_i.a.we;       // periph-bus wen is the inverse of OBI we
+  assign speriph_slave.wdata = obi_req_i.a.wdata;
+  assign speriph_slave.be    = obi_req_i.a.be;
+  assign speriph_slave.id    = '0;                   // ID tied to zero
 
+  // Direct response mapping - no mux needed
+  assign obi_rsp_o.gnt         = speriph_slave.gnt;
+  assign obi_rsp_o.rvalid      = speriph_slave.r_valid;
+  assign obi_rsp_o.r.rdata     = speriph_slave.r_rdata;
+  assign obi_rsp_o.r.err       = speriph_slave.r_opc;  // r_opc: 0=OK, 1=ERROR
+  // NOTE(review): only r.rdata and r.err are driven; confirm core_obi_data_rsp_t has no other r fields (e.g. an ID) left undriven
 
   // Event Unit Flex instantiation
   event_unit_top #(
@@ -133,7 +151,7 @@ import magia_tile_pkg::*;
     .soc_periph_evt_valid_i   ( 1'b0                          ),
     .soc_periph_evt_ready_o   ( soc_periph_evt_ready_internal ),
     .soc_periph_evt_data_i    ( '0                            ),
-    .speriph_slave            ( speriph_slave.Slave           ),
+    .speriph_slave            ( speriph_slave                 ),
     .eu_direct_link           ( eu_direct_link                )
   );
  
diff --git a/hw/tile/magia_tile.sv b/hw/tile/magia_tile.sv
index a9fcbf2..e1a2bee 100644
--- a/hw/tile/magia_tile.sv
+++ b/hw/tile/magia_tile.sv
@@ -135,6 +135,8 @@ module magia_tile
   logic[magia_pkg::ADDR_W-1:0] tile_fsync_ctrl_end_addr;
   logic[magia_pkg::ADDR_W-1:0] tile_event_unit_start_addr;
   logic[magia_pkg::ADDR_W-1:0] tile_event_unit_end_addr;
+  logic[magia_pkg::ADDR_W-1:0] tile_spatz_ctrl_start_addr;
+  logic[magia_pkg::ADDR_W-1:0] tile_spatz_ctrl_end_addr;
   
   magia_tile_pkg::redmule_data_req_t redmule_data_req;
   magia_tile_pkg::redmule_data_rsp_t redmule_data_rsp;
@@ -148,20 +150,20 @@ module magia_tile
   magia_tile_pkg::core_obi_data_req_t core_obi_data_req;
   magia_tile_pkg::core_obi_data_rsp_t core_obi_data_rsp;
 
-  magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_req; // cv32e40x: Index 0 -> L2, Index 1 -> L1SPM; cv32e40p: Index 2 -> RedMulE_ctrl, Index 3 -> iDMA_ctrl, Index 4 -> FSync_ctrl
-  magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_rsp; // cv32e40x: Index 0 -> L2, Index 1 -> L1SPM; cv32e40p: Index 2 -> RedMulE_ctrl, Index 3 -> iDMA_ctrl, Index 4 -> FSync_ctrl
+  magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_req;
+  magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_rsp;
 
-  magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_cut_req; // Index 0 -> L2, Index 1 -> L1SPM
-  magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_cut_rsp; // Index 0 -> L2, Index 1 -> L1SPM
+  magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_cut_req;
+  magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_cut_rsp;
 
   magia_tile_pkg::core_obi_data_req_t core_l1_data_amo_req;
   magia_tile_pkg::core_obi_data_rsp_t core_l1_data_amo_rsp;
 
-  magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_req; // Index 0 -> core request, Index 1 -> ext request
-  magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_rsp; // Index 0 -> core request, Index 1 -> ext request
+  magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_req; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request
+  magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_rsp; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request
 
-  magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_cut_req; // Index 0 -> core request, Index 1 -> ext request
-  magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_cut_rsp; // Index 0 -> core request, Index 1 -> ext request
+  magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_cut_req; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request
+  magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_cut_rsp; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request
 
   magia_tile_pkg::core_obi_data_req_t ext_obi_data_req;
   magia_tile_pkg::core_obi_data_rsp_t ext_obi_data_rsp;
@@ -332,6 +334,33 @@ module magia_tile
   magia_tile_pkg::eu_direct_req_t eu_direct_req;
   magia_tile_pkg::eu_direct_rsp_t eu_direct_rsp;
 
+  // EU direct with pipeline cut
+  magia_tile_pkg::eu_direct_req_t eu_direct_req_cut;
+  magia_tile_pkg::eu_direct_rsp_t eu_direct_rsp_cut;
+
+  // Spatz CC signals
+  snitch_pkg::interrupts_t spatz_irq;
+  magia_tile_pkg::core_hci_data_req_t [magia_tile_pkg::SPATZ_HCI_PORTS-1:0] spatz_hci_req;
+  magia_tile_pkg::core_hci_data_rsp_t [magia_tile_pkg::SPATZ_HCI_PORTS-1:0] spatz_hci_rsp;
+  magia_tile_pkg::core_obi_data_req_t spatz_obi_req;
+  magia_tile_pkg::core_obi_data_rsp_t spatz_obi_rsp;
+  logic        spatz_inst_req;         
+  logic [31:0] spatz_inst_addr;        
+  logic        spatz_inst_cacheable;    
+  logic        spatz_flush_i_valid;     
+  logic [31:0] spatz_inst_data;         
+  logic        spatz_inst_ready;        
+  logic        spatz_inst_error;        
+  logic        spatz_flush_i_ready;  
+  logic spatz_enable_prefetching;
+  snitch_pkg::core_events_t spatz_core_events;   
+  magia_tile_pkg::core_axi_instr_req_t  spatz_icache_axi_req;
+  magia_tile_pkg::core_axi_instr_rsp_t  spatz_icache_axi_rsp;
+  logic spatz_start;
+  logic spatz_done;
+  logic spatz_clk_en;     
+  logic spatz_clk;  
+
 /*******************************************************/
 /**          Internal Signal Definitions End          **/
 /*******************************************************/
@@ -346,15 +375,19 @@ module magia_tile
   assign tile_fsync_ctrl_end_addr     = magia_tile_pkg::FSYNC_CTRL_ADDR_END;
   assign tile_event_unit_start_addr   = magia_tile_pkg::EVENT_UNIT_ADDR_START;
   assign tile_event_unit_end_addr     = magia_tile_pkg::EVENT_UNIT_ADDR_END;
-  assign tile_l1_start_addr           = magia_tile_pkg::L1_ADDR_START       + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET;
-  assign tile_l1_end_addr             = magia_tile_pkg::L1_ADDR_END         + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET;
+  assign tile_spatz_ctrl_start_addr   = magia_tile_pkg::SPATZ_CTRL_ADDR_START;
+  assign tile_spatz_ctrl_end_addr     = magia_tile_pkg::SPATZ_CTRL_ADDR_END;
   assign tile_reserved_start_addr     = magia_tile_pkg::RESERVED_ADDR_START + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET;
   assign tile_reserved_end_addr       = magia_tile_pkg::RESERVED_ADDR_END   + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET;
+  assign tile_l1_start_addr           = magia_tile_pkg::L1_ADDR_START       + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET;
+  assign tile_l1_end_addr             = magia_tile_pkg::L1_ADDR_END         + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET;
 
-  assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_L2_IDX]           = '{idx: 32'd0, start_addr: magia_tile_pkg::L2_ADDR_START,    end_addr: magia_tile_pkg::L2_ADDR_END    };
-  assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_L1SPM_IDX]        = '{idx: 32'd1, start_addr: tile_l1_start_addr,               end_addr: tile_l1_end_addr               };
-  assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_RESERVED_IDX]     = '{idx: 32'd1, start_addr: tile_reserved_start_addr,         end_addr: tile_reserved_end_addr         };
-  assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_STACK_IDX]        = '{idx: 32'd1, start_addr: magia_tile_pkg::STACK_ADDR_START, end_addr: magia_tile_pkg::STACK_ADDR_END };
+  assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_L2_IDX]           = '{idx: 32'd0, start_addr: magia_tile_pkg::L2_ADDR_START,    end_addr: magia_tile_pkg::L2_ADDR_END     };
+  assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_L1SPM_IDX]        = '{idx: 32'd1, start_addr: tile_l1_start_addr,               end_addr: tile_l1_end_addr                };
+  assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_RESERVED_IDX]     = '{idx: 32'd1, start_addr: tile_reserved_start_addr,         end_addr: tile_reserved_end_addr          };
+  assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_STACK_IDX]        = '{idx: 32'd1, start_addr: magia_tile_pkg::STACK_ADDR_START, end_addr: magia_tile_pkg::STACK_ADDR_END  };
+  assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_EVENT_UNIT_IDX]            = '{idx: 32'd5, start_addr: tile_event_unit_start_addr,     end_addr: tile_event_unit_end_addr };
+  assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_SPATZ_CTRL_IDX]            = '{idx: 32'd6, start_addr: tile_spatz_ctrl_start_addr,  end_addr: tile_spatz_ctrl_end_addr    };
 `ifndef CV32E40X
   assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_REDMULE_CTRL_IDX] = '{idx: 32'd2, start_addr: tile_redmule_ctrl_start_addr,     end_addr: tile_redmule_ctrl_end_addr     };
   assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_IDMA_IDX]         = '{idx: 32'd3, start_addr: tile_idma_ctrl_start_addr,        end_addr: tile_idma_ctrl_end_addr        };
@@ -364,6 +397,7 @@ module magia_tile
   assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_L2_IDX]       = '{idx: 32'd0, start_addr: magia_tile_pkg::L2_ADDR_START, end_addr: magia_tile_pkg::L2_ADDR_END };
   assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_L1SPM_IDX]    = '{idx: 32'd1, start_addr: tile_l1_start_addr,            end_addr: tile_l1_end_addr            };
   assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_RESERVED_IDX] = '{idx: 32'd1, start_addr: tile_reserved_start_addr,      end_addr: tile_reserved_end_addr      };
+  assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_BOOTROM_IDX]  = '{idx: 32'd2, start_addr: magia_tile_pkg::SPATZ_BOOT_ADDR, end_addr: magia_tile_pkg::SPATZ_BOOT_ADDR + magia_tile_pkg::SPATZ_BOOTROM_SIZE};
   
   assign obi_xbar_en_default_idx = '1; // Routing to the AXI Xbar all requests with an address outside the range of the internal L1 and the external L2
   assign obi_xbar_default_idx    = '0;
@@ -374,11 +408,15 @@ module magia_tile
   assign core_l2_data_rsp                                         = axi_xbar_data_in_rsp[magia_tile_pkg::AXI_CORE_DATA_IDX];
   assign axi_xbar_data_in_req[magia_tile_pkg::AXI_CORE_INSTR_IDX] = core_l2_instr_req;
   assign core_l2_instr_rsp                                        = axi_xbar_data_in_rsp[magia_tile_pkg::AXI_CORE_INSTR_IDX];
-
+  assign axi_xbar_data_in_req[magia_tile_pkg::AXI_SPATZ_INSTR_IDX] = spatz_icache_axi_req;
+  assign spatz_icache_axi_rsp                                      = axi_xbar_data_in_rsp[magia_tile_pkg::AXI_SPATZ_INSTR_IDX];
+  
   assign obi_xbar_slv_req[magia_tile_pkg::OBI_CORE_IDX] = core_obi_data_req;
   assign core_obi_data_rsp                              = obi_xbar_slv_rsp[magia_tile_pkg::OBI_CORE_IDX];
   assign obi_xbar_slv_req[magia_tile_pkg::OBI_EXT_IDX]  = ext_obi_data_req;
   assign ext_obi_data_rsp                               = obi_xbar_slv_rsp[magia_tile_pkg::OBI_EXT_IDX];
+  assign obi_xbar_slv_req[magia_tile_pkg::OBI_SPATZ_IDX] = spatz_obi_req;
+  assign spatz_obi_rsp                                   = obi_xbar_slv_rsp[magia_tile_pkg::OBI_SPATZ_IDX];
 
   assign axi_data_user     = '0;
   assign obi_rsp_data_user = '0;
@@ -661,6 +699,22 @@ module magia_tile
     .eu_direct_rsp_i    ( eu_direct_rsp           )
   );
 
+  // EU direct pipeline cut: spill registers between eu_direct_req/rsp and eu_direct_req_cut/rsp_cut
+  eu_direct_cut #(
+    .eu_direct_req_t ( magia_tile_pkg::eu_direct_req_t ),
+    .eu_direct_rsp_t ( magia_tile_pkg::eu_direct_rsp_t ),
+    .Bypass          ( 1'b0                            ),
+    .BypassReq       ( 1'b0                            ), // same value the Bypass default would give
+    .BypassRsp       ( 1'b0                            )  // same value the Bypass default would give
+  ) i_eu_direct_cut (
+    .clk_i       ( sys_clk            ),
+    .rst_ni      ( rst_ni             ),
+    .sbr_req_i   ( eu_direct_req      ),
+    .sbr_rsp_o   ( eu_direct_rsp      ),
+    .mgr_req_o   ( eu_direct_req_cut  ),
+    .mgr_rsp_i   ( eu_direct_rsp_cut  )
+  );
+
+
 /*******************************************************/
 /**                Core Data Demux End                **/
 /*******************************************************/
@@ -689,6 +743,13 @@ module magia_tile
     .clk_o     ( core_clk    )
   );
 
+  // Spatz clock gating controlled by obi_slave_ctrl_spatz
+  tc_clk_gating spatz_clock_gating (
+    .clk_i     ( sys_clk       ),
+    .en_i      ( spatz_clk_en  ),
+    .test_en_i ( test_mode_i   ),
+    .clk_o     ( spatz_clk     )
+  );
 /*******************************************************/
 /**                  Clock Gating End                 **/
 /*******************************************************/
@@ -789,6 +850,11 @@ module magia_tile
 /*******************************************************/
 
   `HCI_ASSIGN_TO_INTF(hci_core_if[0],                                   core_l1_data_req,   core_l1_data_rsp)   // Only 1 core supported
+   generate
+    for (genvar i = 0; i < magia_tile_pkg::SPATZ_HCI_PORTS; i++) begin : gen_spatz_hci_assign
+      `HCI_ASSIGN_TO_INTF(hci_core_if[i+1], spatz_hci_req[i], spatz_hci_rsp[i])                                 // Spatz CC HCI ports
+    end
+  endgenerate
   `HCI_ASSIGN_TO_INTF(hci_redmule_if[0],                                redmule_data_req,   redmule_data_rsp)   // Only 1 RedMulE supported
   `HCI_ASSIGN_TO_INTF(hci_dma_if[magia_tile_pkg::HCI_DMA_CH_READ_IDX],  idma_hci_read_req,  idma_hci_read_rsp)  // iDMA HCI read channel
   `HCI_ASSIGN_TO_INTF(hci_dma_if[magia_tile_pkg::HCI_DMA_CH_WRITE_IDX], idma_hci_write_req, idma_hci_write_rsp) // iDMA HCI write channel
@@ -968,7 +1034,7 @@ module magia_tile
     .wu_wfe_i            
   );
 `else
-  // flex-v core with integrated FPU and tracer
+  // RI5CY core with integrated FPU and tracer
   riscv_core #(
     .N_EXT_PERF_COUNTERS ( magia_tile_pkg::N_EXT_PERF_COUNTERS ),
     .INSTR_RDATA_WIDTH   ( magia_tile_pkg::INSTR_RDATA_WIDTH   ),
@@ -1081,32 +1147,8 @@ module magia_tile
 /*******************************************************/
 /**      Core Data Demuxing (OBI XBAR) Beginning      **/
 /*******************************************************/
-
-  obi_atop_resolver #(
-    .SbrPortObiCfg             ( magia_tile_pkg::obi_amo_cfg                ),
-    .MgrPortObiCfg             ( obi_pkg::ObiDefaultConfig                  ),
-    .sbr_port_obi_req_t        ( magia_tile_pkg::core_obi_data_req_t        ),
-    .sbr_port_obi_rsp_t        ( magia_tile_pkg::core_obi_data_rsp_t        ),
-    .mgr_port_obi_req_t        (                                            ),
-    .mgr_port_obi_rsp_t        (                                            ),
-    .mgr_port_obi_a_optional_t ( magia_tile_pkg::core_data_obi_a_optional_t ),
-    .mgr_port_obi_r_optional_t ( magia_tile_pkg::core_data_obi_r_optional_t ),
-    .LrScEnable                (                                            ),
-    .RegisterAmo               ( magia_tile_pkg::RegisterAmo                )
-  ) i_obi_atomics (
-    .clk_i          ( sys_clk                                               ),
-    .rst_ni         ( rst_ni                                                ),
-    .testmode_i     ( test_mode_i                                           ),
-    .sbr_port_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_L1SPM_IDX] ),
-    .sbr_port_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_L1SPM_IDX] ),
-    .mgr_port_req_o ( core_l1_data_amo_req                                  ),
-    .mgr_port_rsp_i ( core_l1_data_amo_rsp                                  )
-  );
-
-  // Cut only external paths comming from the AXI XBAR
-  assign obi_xbar_slv_cut_req[0] = obi_xbar_slv_req[0];
-  assign obi_xbar_slv_rsp[0]     = obi_xbar_slv_cut_rsp[0];
-  for (genvar i = 1; i < magia_tile_pkg::N_MGR; i++) begin: gen_obi_xbar_sbr_cut
+  // OBI cuts on the EXT and SPATZ ports (assumes OBI_EXT_IDX..OBI_SPATZ_IDX is a contiguous range); the CORE port bypasses the cut
+  for (genvar i = magia_tile_pkg::OBI_EXT_IDX; i <= magia_tile_pkg::OBI_SPATZ_IDX; i++) begin : gen_obi_cut
     obi_cut #(
       .ObiCfg       ( magia_tile_pkg::obi_amo_cfg            ),
       .obi_a_chan_t ( magia_tile_pkg::core_data_obi_a_chan_t ),
@@ -1114,15 +1156,18 @@ module magia_tile
       .obi_req_t    ( magia_tile_pkg::core_obi_data_req_t    ),
       .obi_rsp_t    ( magia_tile_pkg::core_obi_data_rsp_t    )
     ) i_obi_cut_sbr (
-      .clk_i          ( sys_clk                 ),
-      .rst_ni         ( rst_ni                  ),
-      .sbr_port_req_i ( obi_xbar_slv_req[i]     ),
-      .sbr_port_rsp_o ( obi_xbar_slv_rsp[i]     ),
-      .mgr_port_req_o ( obi_xbar_slv_cut_req[i] ),
-      .mgr_port_rsp_i ( obi_xbar_slv_cut_rsp[i] )
+      .clk_i          ( sys_clk                        ),
+      .rst_ni         ( rst_ni                         ),
+      .sbr_port_req_i ( obi_xbar_slv_req[i]            ),
+      .sbr_port_rsp_o ( obi_xbar_slv_rsp[i]            ),
+      .mgr_port_req_o ( obi_xbar_slv_cut_req[i]        ),
+      .mgr_port_rsp_i ( obi_xbar_slv_cut_rsp[i]        )
     );
   end
   
+  assign obi_xbar_slv_cut_req[magia_tile_pkg::OBI_CORE_IDX]  = obi_xbar_slv_req[magia_tile_pkg::OBI_CORE_IDX];
+  assign obi_xbar_slv_rsp[magia_tile_pkg::OBI_CORE_IDX]      = obi_xbar_slv_cut_rsp[magia_tile_pkg::OBI_CORE_IDX];
+
   obi_xbar #(
     .SbrPortObiCfg      ( magia_tile_pkg::obi_amo_cfg            ),
     .MgrPortObiCfg      (                                        ),
@@ -1169,6 +1214,27 @@ module magia_tile
     );
    end
 
+  obi_atop_resolver #(
+    .SbrPortObiCfg             ( magia_tile_pkg::obi_amo_cfg                ),
+    .MgrPortObiCfg             ( obi_pkg::ObiDefaultConfig                  ),
+    .sbr_port_obi_req_t        ( magia_tile_pkg::core_obi_data_req_t        ),
+    .sbr_port_obi_rsp_t        ( magia_tile_pkg::core_obi_data_rsp_t        ),
+    .mgr_port_obi_req_t        (                                            ),
+    .mgr_port_obi_rsp_t        (                                            ),
+    .mgr_port_obi_a_optional_t ( magia_tile_pkg::core_data_obi_a_optional_t ),
+    .mgr_port_obi_r_optional_t ( magia_tile_pkg::core_data_obi_r_optional_t ),
+    .LrScEnable                (                                            ),
+    .RegisterAmo               ( magia_tile_pkg::RegisterAmo                )
+  ) i_obi_atomics (
+    .clk_i          ( sys_clk                                               ),
+    .rst_ni         ( rst_ni                                                ),
+    .testmode_i     ( test_mode_i                                           ),
+    .sbr_port_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_L1SPM_IDX] ),
+    .sbr_port_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_L1SPM_IDX] ),
+    .mgr_port_req_o ( core_l1_data_amo_req                                  ),
+    .mgr_port_rsp_i ( core_l1_data_amo_rsp                                  )
+  );
+
 /*******************************************************/
 /**         Core Data Demuxing (OBI XBAR) End         **/
 /*******************************************************/
@@ -1646,14 +1712,32 @@ module magia_tile
 /*******************************************************/
 /**              Floating-Point Unit End              **/
 /*******************************************************/
+/**              Spatz Control Slave Beginning        **/
+/*******************************************************/
+
+  obi_slave_ctrl_spatz #(
+    .BaseAddr  ( magia_tile_pkg::SPATZ_CTRL_ADDR_START                                       )
+  ) i_spatz_ctrl (
+    .clk_i     ( sys_clk                                                                     ),
+    .rst_ni    ( rst_ni                                                                      ),
+    .obi_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_SPATZ_CTRL_IDX]                  ),  
+    .obi_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_SPATZ_CTRL_IDX]                  ),
+    .clk_en_o  ( spatz_clk_en                                                                ),  
+    .start_o   ( spatz_start                                                                 ),  
+    .done_o    ( spatz_done                                                                  )  
+  );
+
+/*******************************************************/
+/**              Spatz Control Slave End              **/
+/*******************************************************/
 /**                Event Unit Beginning               **/
 /*******************************************************/
 
   // Event array assignments for proper 2D array structure
-  assign acc_events_array[0]   = {redmule_evt[0][1], redmule_evt[0][0], redmule_busy, 1'b0};
-  assign dma_events_array[0]   = {idma_obi2axi_done, idma_axi2obi_done};
-  assign timer_events_array[0] = 2'b00;
-  assign other_events_array[0] = {idma_obi2axi_busy, idma_axi2obi_busy, idma_obi2axi_start, idma_axi2obi_start, idma_obi2axi_error, idma_axi2obi_error, fsync_error, fsync_done, 24'b0};  // iDMA status events [31:28]|idma_obi2axi_error, idma_axi2obi_error, iDMA error events [27:26]|fsync_error, fsync_done, Fsync events [25:24], Reserved [23:0] - SW events are INTERNAL to Event Unit!
+  assign acc_events_array[0]     = {redmule_evt[0][1], redmule_evt[0][0], redmule_busy, spatz_done};
+  assign dma_events_array[0]     = {idma_obi2axi_done, idma_axi2obi_done};
+  assign timer_events_array[0]   = 2'b00;
+  assign other_events_array[0]   = {idma_obi2axi_busy, idma_axi2obi_busy, idma_obi2axi_start, idma_axi2obi_start, idma_obi2axi_error, idma_axi2obi_error, fsync_error, fsync_done, spatz_start, 23'b0};  // iDMA status events [31:28]|iDMA errors [27:26]|Fsync [25:24]|Spatz start [23]|Reserved [22:0]
 
 `ifdef CV32E40X
   assign eu_core_irq_ack    = eu_core_irq_req;
@@ -1662,54 +1746,215 @@ module magia_tile
   assign core_busy_o = !core_sleep_o;
 `endif
   
-  magia_event_unit #(
-    .NB_CORES         ( 1  ), // Single core system
-    .NB_SW_EVT        ( 1  ), // Minimum 1 SW event to avoid indexing issues (unused but required)
-    .NB_BARR          ( 0  ), // No barriers needed with single core
-    .NB_HW_MUT        ( 0  ), // No mutexes needed with single core
-    .MUTEX_MSG_W      ( 32 ), // Keep default even if unused
-    .DISP_FIFO_DEPTH  ( 0  ), // No task dispatcher needed
-    .EVNT_WIDTH       ( 8  ), // SOC event width (keep default)
-    .SOC_FIFO_DEPTH   ( 8  )  // SOC FIFO depth (keep default)
+ magia_event_unit #(
+    .NB_CORES         ( 1                                          ),
+    .NB_SW_EVT        ( 1                                          ), 
+    .NB_BARR          ( 2                                          ), 
+    .NB_HW_MUT        ( 1                                          ), 
+    .MUTEX_MSG_W      ( 32                                         ), 
+    .DISP_FIFO_DEPTH  ( 1                                          ), 
+    .EVNT_WIDTH       ( 8                                          ), 
+    .SOC_FIFO_DEPTH   ( 8                                          )  
   ) i_magia_event_unit (
-    .clk_i              ( sys_clk              ),
-    .rst_ni             ( rst_ni               ),
-    .test_mode_i        ( test_mode_i          ),
+    .clk_i            ( sys_clk                                    ),
+    .rst_ni           ( rst_ni                                     ),
+    .test_mode_i      ( test_mode_i                                ),
 
     // Event inputs - single core arrays
-    .acc_events_i       ( acc_events_array     ),  // Accelerator events
-    .dma_events_i       ( dma_events_array     ),  // iDMA completion events  
-    .timer_events_i     ( timer_events_array   ),
-    .other_events_i     ( other_events_array   ),  // Combined events
+    .acc_events_i     ( acc_events_array                           ),                   
+    .dma_events_i     ( dma_events_array                           ),                      
+    .timer_events_i   ( timer_events_array                         ),
+    .other_events_i   ( other_events_array                         ),                   
 
     // Core IRQ interface
-    .core_irq_req_o     ( eu_core_irq_req      ),
-    .core_irq_id_o      ( eu_core_irq_id       ),
-    .core_irq_ack_i     ( eu_core_irq_ack      ),
-    .core_irq_ack_id_i  ( eu_core_irq_ack_id   ),
+    .core_irq_req_o   ( eu_core_irq_req                            ),
+    .core_irq_id_o    ( eu_core_irq_id                             ),
+    .core_irq_ack_i   ( eu_core_irq_ack                            ),
+    .core_irq_ack_id_i( eu_core_irq_ack_id                         ),
 
     // Core control
-    .core_busy_i        ( core_busy_o          ),
-    .core_clock_en_o    ( eu_core_clk_en       ),
+    .core_busy_i      ( core_busy_o                                ),
+    .core_clock_en_o  ( eu_core_clk_en                             ),
 
     // Debug
-    .dbg_req_i          ( debug_req_i          ),
-    .core_dbg_req_o     ( eu_core_dbg_req      ),
-
-    // EU Direct Link Interface - abstract types
-    .eu_direct_req_i    ( eu_direct_req.req    ),
-    .eu_direct_addr_i   ( eu_direct_req.addr   ),
-    .eu_direct_wen_i    ( eu_direct_req.wen    ),
-    .eu_direct_wdata_i  ( eu_direct_req.wdata  ),
-    .eu_direct_be_i     ( eu_direct_req.be     ),
-    .eu_direct_gnt_o    ( eu_direct_rsp.gnt    ),
-    .eu_direct_rvalid_o ( eu_direct_rsp.rvalid ),
-    .eu_direct_rdata_o  ( eu_direct_rsp.rdata  ),
-    .eu_direct_err_o    ( eu_direct_rsp.err    )
+    .dbg_req_i        ( debug_req_i                                ),
+    .core_dbg_req_o   ( eu_core_dbg_req                            ),
+
+    // EU Direct Link Interface (with cut for timing)
+    .eu_direct_req_i      ( eu_direct_req_cut.req                  ),
+    .eu_direct_addr_i     ( eu_direct_req_cut.addr                 ),
+    .eu_direct_wen_i      ( eu_direct_req_cut.wen                  ),
+    .eu_direct_wdata_i    ( eu_direct_req_cut.wdata                ),
+    .eu_direct_be_i       ( eu_direct_req_cut.be                   ),
+    .eu_direct_gnt_o      ( eu_direct_rsp_cut.gnt                  ),
+    .eu_direct_rvalid_o   ( eu_direct_rsp_cut.rvalid               ),
+    .eu_direct_rdata_o    ( eu_direct_rsp_cut.rdata                ),
+    .eu_direct_err_o      ( eu_direct_rsp_cut.err                  ),
+    
+    // OBI Peripheral Slave Interface
+    .obi_req_i        ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_EVENT_UNIT_IDX] ),
+    .obi_rsp_o        ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_EVENT_UNIT_IDX] )
   );
 
 /*******************************************************/
-/**                   Event Unit End                  **/
+/**                    Event Unit End                 **/
+/*******************************************************/
+
+/*******************************************************/
+/**                  Spatz CC Beginning               **/
+/*******************************************************/
+  // Spatz interrupt signals: only meip is driven; every other source is tied low
+  assign spatz_irq.msip      = 1'b0;              // Machine software interrupt (unused - no multi-core)
+  assign spatz_irq.mtip      = 1'b0;              // Machine timer interrupt (unused)
+  assign spatz_irq.meip      = spatz_start;       // Machine external interrupt - driven by start_o of i_spatz_ctrl (obi_slave_ctrl_spatz)
+  assign spatz_irq.mcip      = 1'b0;              // Machine cluster-local interrupt (unused - no cluster)
+  assign spatz_irq.debug     = 1'b0;              // Debug request tied low (debug_req_i is not forwarded to Spatz here)
+
+  spatz_cc_wrapper #(
+    .AddrWidth         ( magia_pkg::ADDR_W                       ),
+    .DataWidth         ( magia_tile_pkg::SPATZ_TCDM_DATA_WIDTH   ),
+    .NumSpatzFPUs      ( SPATZ_NUM_FPU                           ),
+    .NumSpatzIPUs      ( SPATZ_NUM_IPU                           ),
+    .BootAddr          ( magia_tile_pkg::SPATZ_BOOT_ADDR         ),
+    .RVF               ( magia_tile_pkg::SPATZ_RVF_PARAM         ),
+    .RVD               ( magia_tile_pkg::SPATZ_RVD_PARAM         ),
+    .RVV               ( magia_tile_pkg::SPATZ_RVV_PARAM         ),
+    .XDivSqrt          ( magia_tile_pkg::SPATZ_XDIVSQRT_PARAM    ),
+    .FPUImplementation ( magia_tile_pkg::SPATZ_FPUImplementation )
+  ) i_spatz_cc_core (
+    .clk_i             ( spatz_clk                               ),  // Use gated clock
+    .rst_ni            ( rst_ni                                  ),
+    .test_mode_i       ( test_mode_i                             ),
+    
+    // Hart ID
+    .hart_id_i         ( mhartid_i                               ),  // Same as CV32
+    .tcdm_addr_base_i  ( tile_l1_start_addr                      ),  // Dynamic L1 base per tile
+    
+    // Interrupts
+    .irq_i             ( spatz_irq                               ),
+    
+    // HCI Master Interface - Connect to HCI interconnect
+    .hci_master_req_o  ( spatz_hci_req                           ),
+    .hci_master_rsp_i  ( spatz_hci_rsp                           ),
+    
+    // OBI Master Interface - Connect to OBI crossbar
+    .obi_master_req_o  ( spatz_obi_req                           ),
+    .obi_master_rsp_i  ( spatz_obi_rsp                           ),
+    
+    // Instruction Cache Interface
+    .inst_req_o        ( spatz_inst_req                          ),
+    .inst_addr_o       ( spatz_inst_addr                         ),
+    .inst_cacheable_o  ( spatz_inst_cacheable                    ),
+    .flush_i_valid_o   ( spatz_flush_i_valid                     ),
+    .inst_data_i       ( spatz_inst_data                         ),
+    .inst_ready_i      ( spatz_inst_ready                        ),
+    .inst_error_i      ( spatz_inst_error                        ),
+    .flush_i_ready_i   ( spatz_flush_i_ready                     ),
+    
+    // Events and status
+    .core_events_o     ( spatz_core_events                       )
+  );
+
+/*******************************************************/
+/**                    Spatz CC End                   **/
+/*******************************************************/
+/**              Spatz ICache Beginning               **/
+/*******************************************************/
+
+  assign spatz_enable_prefetching = 1'b0;  
+
+  snitch_icache #(
+    .NR_FETCH_PORTS     ( 1                                                  ), // Single Spatz CC core
+    .L0_LINE_COUNT      ( 8                                                  ), // L0 cache lines
+    .LINE_WIDTH         ( magia_tile_pkg::SPATZ_ICACHE_LINE_WIDTH            ), // 256 bits
+    .LINE_COUNT         ( magia_tile_pkg::SPATZ_ICACHE_LINE_COUNT            ), // 32 lines
+    .SET_COUNT          ( magia_tile_pkg::SPATZ_ICACHE_WAYS                  ), // 2-way set associative
+    .FETCH_AW           ( magia_pkg::ADDR_W                                  ), // Address width
+    .FETCH_DW           ( 32                                                 ), // 32-bit instructions
+    .FILL_AW            ( magia_pkg::ADDR_W                                  ), // AXI address width
+    .FILL_DW            ( magia_pkg::DATA_W                                  ), // AXI data width
+    .SERIAL_LOOKUP      ( 0                                                  ),
+    .L1_TAG_SCM         ( 0                                                  ),
+    .NUM_AXI_OUTSTANDING( 2                                                  ),
+    .EARLY_LATCH        ( 0                                                  ),
+    .L0_EARLY_TAG_WIDTH ( magia_tile_pkg::SPATZ_L0_EARLY_TAG_W               ),
+    .ISO_CROSSING       ( 1'b0                                               ),
+    .axi_req_t          ( magia_tile_pkg::core_axi_instr_req_t               ),
+    .axi_rsp_t          ( magia_tile_pkg::core_axi_instr_rsp_t               )
+  ) i_spatz_cc_icache (
+    .clk_i                ( spatz_clk                                        ), 
+    .clk_d2_i             ( spatz_clk                                        ),  
+    .rst_ni               ( rst_ni                                           ),
+    .enable_prefetching_i ( spatz_enable_prefetching                         ),
+    .icache_l0_events_o   (                                                  ),
+    .icache_l1_events_o   (                                                  ), 
+    .flush_valid_i        ( spatz_flush_i_valid                              ),
+    .flush_ready_o        ( spatz_flush_i_ready                              ),
+    .inst_addr_i          ( spatz_inst_addr                                  ),
+    .inst_cacheable_i     ( spatz_inst_cacheable                             ),
+    .inst_data_o          ( spatz_inst_data                                  ),
+    .inst_valid_i         ( spatz_inst_req                                   ),  
+    .inst_ready_o         ( spatz_inst_ready                                 ),  
+    .inst_error_o         ( spatz_inst_error                                 ),
+    .sram_cfg_tag_i       ( '0                                               ),
+    .sram_cfg_data_i      ( '0                                               ),
+    .axi_req_o            ( spatz_icache_axi_req                             ),
+    .axi_rsp_i            ( spatz_icache_axi_rsp                             )
+  );
+
+/*******************************************************/
+/**                Spatz ICache End                   **/
+/*******************************************************/
+/**            Spatz Bootrom Beginning                **/
+/*******************************************************/
+  // AXI to regbus converter for bootrom
+  magia_tile_pkg::reg_dma_req_t bootrom_reg_req;
+  magia_tile_pkg::reg_dma_rsp_t bootrom_reg_rsp;
+  logic [magia_pkg::AXI_NOC_ID_W-1:0] bootrom_reg_id;
+  logic bootrom_busy;
+
+  axi_to_reg_v2 #(
+    .AxiAddrWidth ( magia_pkg::ADDR_W             ),
+    .AxiDataWidth ( magia_pkg::DATA_W             ),
+    .AxiIdWidth   ( magia_pkg::AXI_NOC_ID_W       ),
+    .AxiUserWidth ( magia_pkg::AXI_NOC_U_W        ),
+    .RegDataWidth ( magia_pkg::DATA_W             ),
+    .axi_req_t    ( magia_pkg::axi_xbar_mst_req_t ),
+    .axi_rsp_t    ( magia_pkg::axi_xbar_mst_rsp_t ),
+    .reg_req_t    ( magia_tile_pkg::reg_dma_req_t ),
+    .reg_rsp_t    ( magia_tile_pkg::reg_dma_rsp_t )
+  ) i_axi_to_reg_bootrom (
+    .clk_i      ( sys_clk                                                     ),
+    .rst_ni     ( rst_ni                                                      ),
+    .axi_req_i  ( axi_xbar_mst_req[magia_tile_pkg::AXI_XBAR_MST_BOOTROM_IDX]  ),
+    .axi_rsp_o  ( axi_xbar_mst_rsp[magia_tile_pkg::AXI_XBAR_MST_BOOTROM_IDX]  ),
+    .reg_req_o  ( bootrom_reg_req                                             ),
+    .reg_rsp_i  ( bootrom_reg_rsp                                             ),
+    .reg_id_o   ( bootrom_reg_id                                              ),
+    .busy_o     ( bootrom_busy                                                )
+  );
+
+  // Spatz bootrom module (generated from spatz_init.S)
+  spatz_bootrom i_spatz_bootrom (
+    .clk_i   ( sys_clk                 ),
+    .req_i   ( bootrom_reg_req.valid   ),
+    .addr_i  ( bootrom_reg_req.addr    ),
+    .rdata_o ( bootrom_reg_rsp.rdata   )
+  );
+  
+  // Regbus response handling instead of macro: ready is bootrom_reg_req.valid delayed by one sys_clk cycle. NOTE(review): ready is therefore still high in the cycle after a handshake - confirm a back-to-back request cannot be acknowledged with stale rdata
+  always_ff @(posedge sys_clk or negedge rst_ni) begin
+    if (!rst_ni) begin
+      bootrom_reg_rsp.ready <= 1'b0;
+    end else begin
+      bootrom_reg_rsp.ready <= bootrom_reg_req.valid;
+    end
+  end
+  
+  assign bootrom_reg_rsp.error = 1'b0;
+
+/*******************************************************/
+/**              Spatz Bootrom End                    **/
 /*******************************************************/
 
 endmodule: magia_tile
\ No newline at end of file
diff --git a/hw/tile/magia_tile_pkg.sv b/hw/tile/magia_tile_pkg.sv
index 4f06773..e7d135b 100644
--- a/hw/tile/magia_tile_pkg.sv
+++ b/hw/tile/magia_tile_pkg.sv
@@ -29,6 +29,8 @@ package magia_tile_pkg;
   `include "register_interface/typedef.svh"
   `include "idma/typedef.svh"
   `include "fractal_sync/typedef.svh"
+  `include "reqrsp_interface/typedef.svh"
+  `include "tcdm_interface/typedef.svh"
 
   `include "../include/alias.svh"
 
@@ -50,34 +52,182 @@ package magia_tile_pkg;
 
   // Address map
   localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_ADDR_START = 32'h0000_0100;
-  localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_SIZE       = 32'h0000_0100; 
+  localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_SIZE       = 32'h0000_00FF; 
   localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_ADDR_END   = REDMULE_CTRL_ADDR_START + REDMULE_CTRL_SIZE;
-  localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_ADDR_START    = REDMULE_CTRL_ADDR_END;
-  localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_SIZE          = 32'h0000_0400;
+  localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_ADDR_START    = REDMULE_CTRL_ADDR_END + 1;
+  localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_SIZE          = 32'h0000_03FF;
   localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_ADDR_END      = IDMA_CTRL_ADDR_START + IDMA_CTRL_SIZE;
-  localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_ADDR_START   = IDMA_CTRL_ADDR_END;
-  localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_SIZE         = 32'h0000_0100; 
+  localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_ADDR_START   = IDMA_CTRL_ADDR_END + 1;
+  localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_SIZE         = 32'h0000_00FF; 
   localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_ADDR_END     = FSYNC_CTRL_ADDR_START + FSYNC_CTRL_SIZE;
-  localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_ADDR_START   = FSYNC_CTRL_ADDR_END;
-  localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_SIZE         = 32'h0000_1000; 
+  localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_ADDR_START   = FSYNC_CTRL_ADDR_END + 1;
+  localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_SIZE         = 32'h0000_0FFF; 
   localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_ADDR_END     = EVENT_UNIT_ADDR_START + EVENT_UNIT_SIZE;
-  localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_ADDR_START     = EVENT_UNIT_ADDR_END;
-  localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_SIZE           = 32'h0000_E900;  // Calculated to make RESERVED_ADDR_END = 0x0001_0000
+  localparam logic [magia_pkg::ADDR_W-1:0] SPATZ_CTRL_ADDR_START   = EVENT_UNIT_ADDR_END + 1;
+  localparam logic [magia_pkg::ADDR_W-1:0] SPATZ_CTRL_SIZE         = 32'h0000_00FF; 
+  localparam logic [magia_pkg::ADDR_W-1:0] SPATZ_CTRL_ADDR_END     = SPATZ_CTRL_ADDR_START + SPATZ_CTRL_SIZE;
+  localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_ADDR_START     = SPATZ_CTRL_ADDR_END + 1;
+  localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_SIZE           = 32'h0000_E7FF;  // Size minus one: 0x0000_1800 + 0xE7FF makes the inclusive RESERVED_ADDR_END = 0x0000_FFFF (stack then starts at 0x0001_0000)
   localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_ADDR_END       = RESERVED_ADDR_START + RESERVED_SIZE;
-  localparam logic [magia_pkg::ADDR_W-1:0] STACK_ADDR_START        = RESERVED_ADDR_END;
-  localparam logic [magia_pkg::ADDR_W-1:0] STACK_SIZE              = 32'h0001_0000;
+  localparam logic [magia_pkg::ADDR_W-1:0] STACK_ADDR_START        = RESERVED_ADDR_END +1;
+  localparam logic [magia_pkg::ADDR_W-1:0] STACK_SIZE              = 32'h0000_FFFF;
   localparam logic [magia_pkg::ADDR_W-1:0] STACK_ADDR_END          = STACK_ADDR_START + STACK_SIZE;
-  localparam logic [magia_pkg::ADDR_W-1:0] L1_ADDR_START           = STACK_ADDR_END;
-  localparam logic [magia_pkg::ADDR_W-1:0] L1_SIZE                 = 32'h000E_0000;
+  localparam logic [magia_pkg::ADDR_W-1:0] L1_ADDR_START           = STACK_ADDR_END + 1;
+  localparam logic [magia_pkg::ADDR_W-1:0] L1_SIZE                 = 32'h000D_FFFF;
   localparam logic [magia_pkg::ADDR_W-1:0] L1_ADDR_END             = L1_ADDR_START + L1_SIZE;
   localparam logic [magia_pkg::ADDR_W-1:0] L1_TILE_OFFSET          = 32'h0010_0000;
   localparam logic [magia_pkg::ADDR_W-1:0] L2_ADDR_START           = 32'hC000_0000;
-  localparam logic [magia_pkg::ADDR_W-1:0] L2_SIZE                 = 32'h4000_0000;
+  localparam logic [magia_pkg::ADDR_W-1:0] L2_SIZE                 = 32'h3FFF_FFFF;
   localparam logic [magia_pkg::ADDR_W-1:0] L2_ADDR_END             = L2_ADDR_START + L2_SIZE;
 
+  // Instruction region for Spatz code (cacheable region); the base lies inside the L2 range that starts at L2_ADDR_START = 0xC000_0000
+  localparam logic [magia_pkg::ADDR_W-1:0] INSTRRAM_ADDR_START      = 32'hCC00_0000;
+  localparam logic [magia_pkg::ADDR_W-1:0] INSTRRAM_SIZE            = 32'h0000_8000;  // 32KB, a true byte count (the *_SIZE values of the address map above hold size minus one)
+
+  localparam logic [magia_pkg::ADDR_W-1:0] INSTRRAM_PMA_MASK        = 32'hFFFF_8000;  // Low 15 bits clear (32KB granularity); used as the mask of cached_regions[0] below
+
+  // Snitch PMA Configuration - defines cacheable regions for instruction fetches
+  function automatic snitch_pma_pkg::rule_t [snitch_pma_pkg::NrMaxRules-1:0] get_snitch_cached_regions();
+    automatic snitch_pma_pkg::rule_t [snitch_pma_pkg::NrMaxRules-1:0] cached_regions;
+    cached_regions = '{default: '0};
+    cached_regions[0] = '{base: INSTRRAM_ADDR_START, mask: INSTRRAM_PMA_MASK};
+    return cached_regions;
+  endfunction
+
+  localparam snitch_pma_pkg::snitch_pma_t SPATZ_SNITCH_PMA_CFG = '{
+    NrCachedRegionRules: 1,
+    CachedRegion: get_snitch_cached_regions(),
+    default: 0
+  };
+
+  //SPATZ PARAMETERS from Makefile defines
+  //SPATZ_RVD (FP64 support: enables the FP64 FPU units and selects the 64-bit TCDM data width)
+  `ifdef SPATZ_RVD
+    localparam bit SPATZ_RVD_PARAM = `SPATZ_RVD;
+  `else
+    localparam bit SPATZ_RVD_PARAM = 1'b0;
+  `endif
+  
+  //SPATZ_N_IPU and SPATZ_N_FPU
+  `ifdef SPATZ_N_IPU
+    localparam int unsigned SPATZ_NUM_IPU = `SPATZ_N_IPU;
+  `else
+    localparam int unsigned SPATZ_NUM_IPU = 1;
+  `endif
+  
+  `ifdef SPATZ_N_FPU
+    localparam int unsigned SPATZ_NUM_FPU = `SPATZ_N_FPU;
+  `else
+    localparam int unsigned SPATZ_NUM_FPU = 4;
+  `endif
+  
+  //SPATZ_XDIVSQRT (FP division/sqrt enable)
+  `ifdef SPATZ_XDIVSQRT
+    localparam bit SPATZ_XDIVSQRT_PARAM = `SPATZ_XDIVSQRT;
+  `else
+    localparam bit SPATZ_XDIVSQRT_PARAM = 1'b0;
+  `endif
+  
+  //SPATZ_XDMA (DMA inside Spatz_cc)
+  `ifdef SPATZ_XDMA
+    localparam bit SPATZ_XDMA_PARAM = `SPATZ_XDMA;
+  `else
+    localparam bit SPATZ_XDMA_PARAM = 1'b0;
+  `endif
+  
+  //SPATZ_RVF (Single-precision FP support)
+  `ifdef SPATZ_RVF
+    localparam bit SPATZ_RVF_PARAM = `SPATZ_RVF;
+  `else
+    localparam bit SPATZ_RVF_PARAM = 1'b1;
+  `endif
+  
+  //SPATZ_RVV (Vector extension support)
+  `ifdef SPATZ_RVV
+    localparam bit SPATZ_RVV_PARAM = `SPATZ_RVV;
+  `else
+    localparam bit SPATZ_RVV_PARAM = 1'b1;
+  `endif
+  
+  // Spatz CC parameters (must be defined before HCI parameters)
+  localparam int unsigned SPATZ_NUM_FU            = (SPATZ_NUM_FPU > SPATZ_NUM_IPU) ? SPATZ_NUM_FPU : SPATZ_NUM_IPU;  // Max of FPU and IPU
+  localparam int unsigned SPATZ_TCDM_PORTS        = SPATZ_NUM_FU + 1;  // N_FU + 1 TCDM ports (N_FU vector + 1 snitch)
+  localparam int unsigned SPATZ_HCI_PORTS         = SPATZ_RVD_PARAM ? (SPATZ_TCDM_PORTS * 2) : SPATZ_TCDM_PORTS;  // RVD=1: 2xTCDM HCI32, RVD=0: 1xTCDM HCI32
+
+  // Spatz CC outstanding transactions and timing parameters
+  parameter int unsigned SPATZ_NUM_INT_OUTSTANDING_LOADS   = 1;   // Snitch core outstanding loads
+  parameter int unsigned SPATZ_NUM_INT_OUTSTANDING_MEM     = 4;   // Snitch core outstanding memory ops
+  parameter int unsigned SPATZ_NUM_SPATZ_OUTSTANDING_LOADS = 4;   // Spatz vector unit outstanding loads
+  parameter bit          SPATZ_XDIVSQRT                    = SPATZ_XDIVSQRT_PARAM; // From Makefile define (0=disabled, 1=enabled)
+  parameter bit          SPATZ_XDMA                        = SPATZ_XDMA_PARAM;     // From Makefile define (0=disabled, 1=enabled)
+  parameter bit          SPATZ_RVF                         = SPATZ_RVF_PARAM;      // From Makefile define (single-precision FP)
+  parameter bit          SPATZ_RVV                         = SPATZ_RVV_PARAM;      // From Makefile define (vector extension)
+  parameter bit          SPATZ_REGISTER_OFFLOAD_RSP        = 1'b0; // Pipeline register on offload response
+  parameter bit          SPATZ_REGISTER_CORE_REQ           = 1'b1; // Pipeline register on core request
+  parameter bit          SPATZ_REGISTER_CORE_RSP           = 1'b1; // Pipeline register on core response
+  
+  // Spatz FPU implementation configuration
+  localparam fpnew_pkg::fpu_implementation_t SPATZ_FPUImplementation = '{
+    PipeRegs: // FMA Block
+              '{
+                '{  3,                           // FP32 FMA
+                    SPATZ_RVD_PARAM ? 3 : 0,     // FP64 FMA
+                    3,                           // FP16
+                    3,                           // FP8
+                    3,                           // FP16alt
+                    3                            // FP8alt
+                  },
+                '{1, 1, 1, 1, 1, 1},             // DIVSQRT (all formats)
+                '{1, 1, 1, 1, 1, 1},             // NONCOMP (all formats)
+                '{2, 2, 2, 2, 2, 2},             // CONV (all formats)
+                '{5, 5, 5, 5, 5, 5}              // DOTP
+                },
+  UnitTypes: '{'{ fpnew_pkg::MERGED,
+                  SPATZ_RVD_PARAM ? fpnew_pkg::MERGED : fpnew_pkg::DISABLED,
+                  fpnew_pkg::MERGED,
+                  fpnew_pkg::MERGED,
+                  fpnew_pkg::MERGED,
+                  fpnew_pkg::MERGED},           // FMA 
+                '{fpnew_pkg::DISABLED,
+                  fpnew_pkg::DISABLED,
+                  fpnew_pkg::DISABLED,
+                  fpnew_pkg::DISABLED,
+                  fpnew_pkg::DISABLED,
+                  fpnew_pkg::DISABLED},          // DIVSQRT 
+                '{fpnew_pkg::PARALLEL,
+                  SPATZ_RVD_PARAM ? fpnew_pkg::PARALLEL : fpnew_pkg::DISABLED,
+                  fpnew_pkg::PARALLEL,
+                  fpnew_pkg::PARALLEL,
+                  fpnew_pkg::PARALLEL,
+                  fpnew_pkg::PARALLEL},          // NONCOMP 
+                '{fpnew_pkg::MERGED,
+                  SPATZ_RVD_PARAM ? fpnew_pkg::MERGED : fpnew_pkg::DISABLED,
+                  fpnew_pkg::MERGED,
+                  fpnew_pkg::MERGED,
+                  fpnew_pkg::MERGED,
+                  fpnew_pkg::MERGED},            // CONV 
+                '{fpnew_pkg::MERGED,
+                  SPATZ_RVD_PARAM ? fpnew_pkg::MERGED : fpnew_pkg::DISABLED,
+                  fpnew_pkg::MERGED,
+                  fpnew_pkg::MERGED,
+                  fpnew_pkg::MERGED,
+                  fpnew_pkg::MERGED}},           // DOTP 
+    PipeConfig: fpnew_pkg::BEFORE 
+  };
+
+  // Spatz bootrom parameters
+  parameter logic [31:0] SPATZ_BOOT_ADDR          = 32'h1000_0000;  // Spatz bootrom base address
+  parameter logic [31:0] SPATZ_BOOTROM_SIZE       = 32'h0000_00FF;
+  
+  // Spatz TCDM parameters
+  parameter int unsigned SPATZ_TCDM_ADDR_WIDTH = $clog2(magia_pkg::N_MEM_BANKS * magia_pkg::N_WORDS_BANK * magia_pkg::DATA_W / 8);  
+  parameter int unsigned SPATZ_TCDM_DATA_WIDTH = SPATZ_RVD_PARAM ? 64 : 32;                  // Spatz TCDM data width
+  parameter int unsigned SPATZ_TCDM_STRB_WIDTH = SPATZ_RVD_PARAM ? 8 : 4;                    // Spatz TCDM strobe width
+
+
   // Parameters used by the HCI
   parameter int unsigned N_HWPE  = 1;                                                   // Number of HWPEs attached to the port
-  parameter int unsigned N_CORE  = 1;                                                   // Number of Core ports
+  parameter int unsigned N_CORE  = 1 + SPATZ_HCI_PORTS;                                 // Number of Core ports: 1 CV32 + Spatz HCI ports (RVD=1: 11 total, RVD=0: 6 total)
   parameter int unsigned N_DMA   = 2;                                                   // Number of DMA ports (1 for the read channel and 1 for the write channel)
   typedef enum logic{
     HCI_DMA_CH_READ_IDX  = 1'b0,
@@ -125,7 +275,7 @@ package magia_tile_pkg;
   parameter bit          USE_PMP             = 1'b1;                                    // Enable PMP
   parameter bit          PULP_CLUSTER        = 1'b1;                                    // PULP cluster mode
   parameter bit          FPU                 = 1'b1;                                    // Enable FPU (main feature)
-  parameter bit          ZFINX               = 1'b0;                                    // Zfinx extension (integer FP in GPR) - Must be 0 for standard FPU
+  parameter bit          ZFINX               = 1'b1;                                    // Zfinx extension (integer FP in GPR) - Must be 1 for CV32E40P with FP16/FP16alt
   parameter bit          FP_DIVSQRT          = 1'b1;                                    // FP division and square root
   parameter bit          SHARED_FP           = 1'b0;                                    // Shared FP unit
   parameter bit          SHARED_DSP_MULT     = 1'b0;                                    // Shared DSP multiplier
@@ -167,23 +317,24 @@ package magia_tile_pkg;
   parameter int unsigned MID_WIDTH    = 1;                                              // Width of the mid   signal (manager identifier, see OBI documentation)
   parameter int unsigned OBI_ID_WIDTH = 1;                                              // Width of the id - configuration
 `ifdef CV32E40X
-  parameter int unsigned N_SBR        = 2;                                              // Number of slaves (HCI, AXI XBAR)
+  parameter int unsigned N_SBR        = 4;                                              // Number of slaves (HCI, AXI XBAR, Event_Unit, Spatz_Ctrl)
 `else
-  parameter int unsigned N_SBR        = 5;                                              // Number of slaves (HCI, AXI XBAR, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl) - Event_Unit now via eu_direct_link
+  parameter int unsigned N_SBR        = 7;                                              // Number of OBI slaves (HCI, AXI XBAR, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl, Event_Unit, Spatz_Ctrl)
 `endif  
-  parameter int unsigned N_MGR        = 2;                                              // Number of masters (Core, AXI XBAR)
+  parameter int unsigned N_MGR        = 3;                                              // Number of masters (Core, AXI XBAR, Spatz CC)
   parameter int unsigned N_MAX_TRAN   = 1;                                              // Number of maximum outstanding transactions
 `ifdef CV32E40X
-  parameter int unsigned N_ADDR_RULE  = 4;                                              // Number of address rules (L2, L1, Stack, Reserved)
+  parameter int unsigned N_ADDR_RULE  = 6;                                              // Number of address rules (L2, L1, Stack, Reserved, Event_Unit, Spatz_Ctrl)
 `else
-  parameter int unsigned N_ADDR_RULE  = 7;                                              // Number of address rules (L2, L1, Stack, Reserved, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl) - Event_Unit now via eu_direct_link
+  parameter int unsigned N_ADDR_RULE  = 9;                                              // Number of OBI address rules (L2, L1, Stack, Reserved, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl, Event_Unit, Spatz_Ctrl)
 `endif  
   localparam int unsigned N_BIT_SBR   = $clog2(N_SBR);                                  // Number of bits required to identify each slave
 
   // Parameters used by AXI
-  parameter int unsigned AXI_DATA_ID_W  = 2;                                            // Width of the AXI Data ID (2 bits: Core, iDMA, I$, ext)
-  parameter int unsigned AXI_INSTR_ID_W = 1;                                            // Width of the AXI Instruction ID (0 bits: direct Core - I$ connection)
-  parameter int unsigned AXI_ID_W       = 2;                                            // Width of the AXI Unified Communication Channel ID
+  parameter int unsigned AXI_DATA_ID_W  = 3;                                            // Width of the AXI Data ID (3 bits for 5 slave ports on crossbar: 2^3=8)
+  parameter int unsigned AXI_INSTR_ID_W = 3;                                            // Width of the AXI Instruction ID (3 bits for 5 slave ports on crossbar)
+  parameter int unsigned AXI_ID_W       = 3;                                            // Width of the AXI Unified Communication Channel ID (3 bits for 5 slave ports)
+  localparam int unsigned AXI_MST_ID_W  = 6;                                            // Width of master port ID (slave 3b + prepend 3b for 5 ports)
   parameter int unsigned AXI_DATA_U_W   = magia_pkg::USR_W;                             // Width of the AXI Data User
   parameter int unsigned AXI_INSTR_U_W  = magia_pkg::USR_W;                             // Width of the AXI Instruction User
   parameter int unsigned AXI_U_W        = magia_pkg::USR_W;                             // Width of the AXI Unified Communication Channel User
@@ -307,8 +458,8 @@ package magia_tile_pkg;
   parameter bit          FSYNC_STALL               = 1;                                 // Fractal Sync Stall during synchronization
 
   // Parameters of the AXI XBAR
-  parameter int unsigned AxiXbarNoSlvPorts     = 4;                                     // Number of Slave Ports (iDMA, Core Data, Core I$ and ext)
-  parameter int unsigned AxiXbarNoMstPorts     = 2;                                     // Number of Master Ports (to ext and to internal L1 from ext)
+  parameter int unsigned AxiXbarNoSlvPorts     = 5;                                     // Number of Slave Ports (ext, iDMA, Core Data, CV32 I$, Spatz I$)
+  parameter int unsigned AxiXbarNoMstPorts     = 3;                                     // Number of Master Ports (to ext, to internal L1, to Spatz bootrom)
   localparam int unsigned AxiXbarSlvAxiIDWidth = AXI_DATA_ID_W;                         // Number of bits to indentify each Slave Port
   parameter int unsigned AxiXbarMaxWTrans      = 16;                                    // Maximum number of outstanding transactions per write
   parameter int unsigned AxiXbarMaxMstTrans    = AxiXbarMaxWTrans;                      // Maximum number of outstanding transactions per master
@@ -332,6 +483,12 @@ package magia_tile_pkg;
   parameter int unsigned FETCH_DW       = magia_pkg::DATA_W;                            // i$ Fetch interface data width. Power of two; >= 8.
   parameter int unsigned FILL_AW        = magia_pkg::ADDR_W;                            // i$ Fill interface address width. Same as FILL_AW; >= 1.
   parameter int unsigned FILL_DW        = magia_pkg::DATA_W;                            // i$ Fill interface data width. Power of two; >= 8.
+
+  // Spatz ICache parameters (dedicated icache for Spatz CC)
+  parameter int unsigned SPATZ_ICACHE_LINE_WIDTH = 256;                                 // Spatz i$ cache line width (should be investigated which is the best value)
+  parameter int unsigned SPATZ_ICACHE_LINE_COUNT = 32;                                  // Spatz i$ number of cache lines
+  parameter int unsigned SPATZ_ICACHE_WAYS       = 2;                                   // Spatz i$ number of ways (2-way set associative)
+  localparam int unsigned SPATZ_L0_EARLY_TAG_W   = snitch_pkg::PAGE_SHIFT - $clog2(SPATZ_ICACHE_LINE_WIDTH/8); // L0 early tag width
   
   // Parameters used by the FPU
   parameter bit                             FPU_ZFINX          = 0;                     // FPU use Zfinx extension instead of the F ISA extention
@@ -364,9 +521,10 @@ package magia_tile_pkg;
     logic[magia_pkg::ADDR_W-1:0] end_addr;
   } obi_xbar_rule_t;
 
-  typedef enum logic{
-    OBI_EXT_IDX  = 1,
-    OBI_CORE_IDX = 0
+  typedef enum logic[1:0]{
+    OBI_SPATZ_IDX = 2,
+    OBI_EXT_IDX   = 1,
+    OBI_CORE_IDX  = 0
   } obi_xbar_idx_e;
 
   typedef struct packed {
@@ -450,16 +608,20 @@ package magia_tile_pkg;
   } core_cache_instr_rsp_t;
 
 `ifdef CV32E40X
-  typedef enum logic[1:0]{
-    OBI_XBAR_STACK_IDX    = 3,
-    OBI_XBAR_RESERVED_IDX = 2,
-    OBI_XBAR_L1SPM_IDX    = 1,
-    OBI_XBAR_L2_IDX       = 0
+  typedef enum logic[2:0]{
+    OBI_XBAR_STACK_IDX        = 5,
+    OBI_XBAR_SPATZ_CTRL_IDX   = 4,
+    OBI_XBAR_EVENT_UNIT_IDX   = 3,
+    OBI_XBAR_RESERVED_IDX     = 2,
+    OBI_XBAR_L1SPM_IDX        = 1,
+    OBI_XBAR_L2_IDX           = 0
   } obi_mem_array_idx_e;
 `else
-  typedef enum logic[2:0]{
-    OBI_XBAR_STACK_IDX        = 6,
-    OBI_XBAR_RESERVED_IDX     = 5,
+  typedef enum logic[3:0]{
+    OBI_XBAR_STACK_IDX        = 8,
+    OBI_XBAR_RESERVED_IDX     = 7,
+    OBI_XBAR_SPATZ_CTRL_IDX   = 6,
+    OBI_XBAR_EVENT_UNIT_IDX   = 5,
     OBI_XBAR_FSYNC_CTRL_IDX   = 4,
     OBI_XBAR_IDMA_IDX         = 3,
     OBI_XBAR_REDMULE_CTRL_IDX = 2,
@@ -468,20 +630,30 @@ package magia_tile_pkg;
   } obi_mem_array_idx_e;
 `endif
 
-  typedef enum logic[1:0]{
-    AXI_XBAR_STACK_IDX    = 3,
-    AXI_XBAR_RESERVED_IDX = 2,
-    AXI_XBAR_L1SPM_IDX    = 1,
+  typedef enum logic[2:0]{
+    AXI_XBAR_STACK_IDX    = 4,
+    AXI_XBAR_RESERVED_IDX = 3,
+    AXI_XBAR_L1SPM_IDX    = 2,
+    AXI_XBAR_BOOTROM_IDX  = 1, 
     AXI_XBAR_L2_IDX       = 0
   } axi_mem_array_idx_e;
 
-  typedef enum logic[1:0]{
-    AXI_EXT_IDX        = 3,
-    AXI_IDMA_IDX       = 2,
-    AXI_CORE_DATA_IDX  = 1,
-    AXI_CORE_INSTR_IDX = 0
+
+  typedef enum logic[2:0]{
+    AXI_SPATZ_INSTR_IDX =  4,
+    AXI_EXT_IDX          = 3,
+    AXI_IDMA_IDX         = 2,
+    AXI_CORE_DATA_IDX    = 1,
+    AXI_CORE_INSTR_IDX   = 0
   } axi_xbar_idx_e;
 
+  
+  typedef enum logic[1:0]{
+    AXI_XBAR_MST_EXT_IDX     = 0,
+    AXI_XBAR_MST_INT_IDX     = 1,
+    AXI_XBAR_MST_BOOTROM_IDX = 2
+  } axi_xbar_mst_idx_e;
+
   typedef struct packed {
     logic[N_SIGN-1:0][SIGN_W-1:0] sign_list;
   } xif_inst_rule_t;
@@ -495,7 +667,15 @@ package magia_tile_pkg;
   `HCI_TYPEDEF_RSP_T(redmule_data_rsp_t, logic[DWH-1:0], logic[UWH-1:0], logic[0:0], logic[0:0], logic[0:0])
 
   localparam obi_pkg::obi_optional_cfg_t obi_amo_optional_cfg = obi_pkg::obi_all_optional_config(AUSER_WIDTH, WUSER_WIDTH, RUSER_WIDTH, MID_WIDTH, ACHK_WIDTH, RCHK_WIDTH);
-  localparam obi_pkg::obi_cfg_t          obi_amo_cfg          = obi_pkg::obi_default_cfg(magia_pkg::ADDR_W, magia_pkg::DATA_W, OBI_ID_WIDTH, obi_amo_optional_cfg);
+  localparam obi_pkg::obi_optional_cfg_t obi_no_amo_optional_cfg = '{UseAtop: 1'b0, UseMemtype: 1'b0, UseProt: 1'b0, UseDbg: 1'b0, AUserWidth: AUSER_WIDTH, WUserWidth: WUSER_WIDTH, RUserWidth: RUSER_WIDTH, MidWidth: MID_WIDTH, AChkWidth: ACHK_WIDTH, RChkWidth: RCHK_WIDTH};
+  
+  // OBI full configurations - 32-bit (default)
+  localparam obi_pkg::obi_cfg_t obi_amo_cfg = obi_pkg::obi_default_cfg(magia_pkg::ADDR_W, magia_pkg::DATA_W, OBI_ID_WIDTH, obi_amo_optional_cfg);
+  localparam obi_pkg::obi_cfg_t obi_no_amo_cfg = obi_pkg::obi_default_cfg(magia_pkg::ADDR_W, magia_pkg::DATA_W, OBI_ID_WIDTH, obi_no_amo_optional_cfg);
+  
+  // OBI full configurations - 64-bit
+  localparam obi_pkg::obi_cfg_t obi_amo_cfg_64 = obi_pkg::obi_default_cfg(magia_pkg::ADDR_W, SPATZ_TCDM_DATA_WIDTH, OBI_ID_WIDTH, obi_amo_optional_cfg);
+  localparam obi_pkg::obi_cfg_t obi_no_amo_cfg_64 = obi_pkg::obi_default_cfg(magia_pkg::ADDR_W, SPATZ_TCDM_DATA_WIDTH, OBI_ID_WIDTH, obi_no_amo_optional_cfg);
   localparam bit                         RegisterAmo          = 1;
   
   `OBI_TYPEDEF_ALL_A_OPTIONAL(core_data_obi_a_optional_t, AUSER_WIDTH, WUSER_WIDTH, MID_WIDTH, ACHK_WIDTH)
@@ -518,6 +698,7 @@ package magia_tile_pkg;
   `AXI_TYPEDEF_ALL_CT(core_axi_data, core_axi_data_req_t, core_axi_data_rsp_t, logic[magia_pkg::ADDR_W-1:0], logic[AXI_ID_W-1:0], logic[magia_pkg::DATA_W-1:0], logic[magia_pkg::STRB_W-1:0], logic[AXI_U_W-1:0])
   `AXI_TYPEDEF_ALL_CT(core_axi_instr, core_axi_instr_req_t, core_axi_instr_rsp_t, logic[magia_pkg::ADDR_W-1:0], logic[AXI_ID_W-1:0], logic[magia_pkg::DATA_W-1:0], logic[magia_pkg::STRB_W-1:0], logic[AXI_U_W-1:0])
 
+  `REG_BUS_TYPEDEF_ALL(reg_dma, logic[magia_pkg::ADDR_W-1:0], logic[magia_pkg::DATA_W-1:0], logic[magia_pkg::STRB_W-1:0])
   `REG_BUS_TYPEDEF_ALL(idma_fe_reg, logic[magia_pkg::ADDR_W-1:0], logic[magia_pkg::DATA_W-1:0], logic[magia_pkg::STRB_W-1:0])
 
   `IDMA_TYPEDEF_FULL_REQ_T(idma_be_req_t, logic[iDMA_AxiIdWidth-1:0], idma_addr_t, logic[iDMA_TFLenWidth-1:0])
@@ -557,9 +738,9 @@ package magia_tile_pkg;
       idma_obi_a_chan_t a_chan;
     } obi;
   } idma_write_meta_channel_t;
-
-  `AXI_ALIAS(core_axi_data, axi_xbar_slv, core_axi_data_req_t, axi_xbar_slv_req_t, core_axi_data_rsp_t, axi_xbar_slv_rsp_t)
-  `AXI_ALIAS(core_axi_data, axi_xbar_mst, core_axi_data_req_t, axi_xbar_mst_req_t, core_axi_data_rsp_t, axi_xbar_mst_rsp_t)
+ 
+  `AXI_TYPEDEF_ALL_CT(axi_xbar_slv, axi_xbar_slv_req_t, axi_xbar_slv_rsp_t, logic[magia_pkg::ADDR_W-1:0], logic[AXI_ID_W-1:0], logic[magia_pkg::DATA_W-1:0], logic[magia_pkg::STRB_W-1:0], logic[AXI_U_W-1:0])
+  `AXI_TYPEDEF_ALL_CT(axi_xbar_mst, axi_xbar_mst_req_t, axi_xbar_mst_rsp_t, logic[magia_pkg::ADDR_W-1:0], logic[AXI_MST_ID_W-1:0], logic[magia_pkg::DATA_W-1:0], logic[magia_pkg::STRB_W-1:0], logic[AXI_U_W-1:0])
 
   `HCI_TYPEDEF_REQ_T(idma_hci_req_t, logic[AWC-1:0], logic[DW_LIC-1:0], logic[SW_LIC-1:0], logic[UWH-1:0], logic[0:0], logic[0:0], logic[0:0])
   `HCI_TYPEDEF_RSP_T(idma_hci_rsp_t, logic[DW_LIC-1:0], logic[UWH-1:0], logic[0:0], logic[0:0], logic[0:0])
@@ -577,7 +758,7 @@ package magia_tile_pkg;
     UniqueIds           : 1'b0,
     AxiAddrWidth        : magia_pkg::ADDR_W,
     AxiDataWidth        : magia_pkg::DATA_W,
-    NoAddrRules         : 3
+    NoAddrRules         : 4
   };
 
   `FSYNC_TYPEDEF_ALL(ht_tile_fsync, logic[FSYNC_AGGR_W-1:0], logic[FSYNC_LVL_W-1:0], logic[FSYNC_ID_W-1:0])
@@ -585,4 +766,50 @@ package magia_tile_pkg;
   `FSYNC_TYPEDEF_ALL(hn_tile_fsync, logic[FSYNC_NBR_AGGR_W-1:0], logic[FSYNC_NBR_LVL_W-1:0], logic[FSYNC_NBR_ID_W-1:0])
   `FSYNC_TYPEDEF_ALL(vn_tile_fsync, logic[FSYNC_NBR_AGGR_W-1:0], logic[FSYNC_NBR_LVL_W-1:0], logic[FSYNC_NBR_ID_W-1:0])
 
+  /*******************************************************************/
+  /*              Spatz Core Complex Wrapper Types                   */
+  /*******************************************************************/
+  
+  // Base types for Spatz TCDM and reqrsp
+  typedef logic [magia_pkg::ADDR_W-1:0]    spatz_addr_t;
+  typedef logic [magia_pkg::DATA_W-1:0]    spatz_data_t;
+  typedef logic [magia_pkg::DATA_W/8-1:0]  spatz_strb_t;
+  typedef logic                            spatz_tcdm_user_t;
+  typedef logic [magia_tile_pkg::SPATZ_TCDM_ADDR_WIDTH-1:0] spatz_tcdm_addr_t;
+  
+  // 64-bit types for TCDM and reqrsp interfaces when RVD=1
+  typedef logic [SPATZ_TCDM_DATA_WIDTH-1:0]     spatz_data64_t;
+  typedef logic [SPATZ_TCDM_STRB_WIDTH-1:0]     spatz_strb64_t;
+  
+  `TCDM_TYPEDEF_ALL(spatz_tcdm, spatz_tcdm_addr_t, spatz_data_t, spatz_strb_t, spatz_tcdm_user_t)
+  `REQRSP_TYPEDEF_ALL(spatz_reqrsp, spatz_addr_t, spatz_data_t, spatz_strb_t)        // 32-bit for Snitch data port (RVD=0, DataWidth=32)
+  `REQRSP_TYPEDEF_ALL(spatz_reqrsp64, spatz_addr_t, spatz_data64_t, spatz_strb64_t)  // 64-bit for Snitch data port (RVD=1, DataWidth=64)
+
+  // TCDM64 types with AMO support (using TCDM_TYPEDEF_ALL macro which includes amo field)
+  typedef logic [magia_pkg::ADDR_W-1:0]        spatz_tcdm64_addr_t;
+  typedef logic [SPATZ_TCDM_DATA_WIDTH-1:0]    spatz_tcdm64_data_t;
+  typedef logic [SPATZ_TCDM_STRB_WIDTH-1:0]    spatz_tcdm64_strb_t;
+  typedef logic                                 spatz_tcdm64_user_t;
+  
+  `TCDM_TYPEDEF_ALL(spatz_tcdm64, spatz_tcdm64_addr_t, spatz_tcdm64_data_t, spatz_tcdm64_strb_t, spatz_tcdm64_user_t)
+  
+  // Alias for backward compatibility with spatz_cc instantiation
+  typedef spatz_tcdm64_req_chan_t spatz_tcdm64_payload_t;
+
+  // TCDM32 types with AMO support (for modular atomic resolver architecture)
+  typedef logic [magia_pkg::ADDR_W-1:0]        spatz_tcdm32_addr_t;
+  typedef logic [31:0]                          spatz_tcdm32_data_t;
+  typedef logic [3:0]                           spatz_tcdm32_strb_t;
+  typedef logic                                 spatz_tcdm32_user_t;
+  
+  `TCDM_TYPEDEF_ALL(spatz_tcdm32, spatz_tcdm32_addr_t, spatz_tcdm32_data_t, spatz_tcdm32_strb_t, spatz_tcdm32_user_t)
+  
+  // OBI 32-bit types for modular atomic architecture  
+  `OBI_TYPEDEF_ALL_A_OPTIONAL(spatz_obi32_a_optional_t, AUSER_WIDTH, WUSER_WIDTH, MID_WIDTH, ACHK_WIDTH)
+  `OBI_TYPEDEF_ALL_R_OPTIONAL(spatz_obi32_r_optional_t, RUSER_WIDTH, RCHK_WIDTH)
+  `OBI_TYPEDEF_A_CHAN_T(spatz_obi32_a_chan_t, magia_pkg::ADDR_W, 32, AID_WIDTH, spatz_obi32_a_optional_t)
+  `OBI_TYPEDEF_R_CHAN_T(spatz_obi32_r_chan_t, 32, RID_WIDTH, spatz_obi32_r_optional_t)
+  `OBI_TYPEDEF_DEFAULT_REQ_T(spatz_obi32_req_t, spatz_obi32_a_chan_t)
+  `OBI_TYPEDEF_RSP_T(spatz_obi32_rsp_t, spatz_obi32_r_chan_t)
+
 endpackage: magia_tile_pkg
\ No newline at end of file
diff --git a/hw/tile/obi_slave_ctrl_spatz.sv b/hw/tile/obi_slave_ctrl_spatz.sv
new file mode 100644
index 0000000..d90cf98
--- /dev/null
+++ b/hw/tile/obi_slave_ctrl_spatz.sv
@@ -0,0 +1,179 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51 
+ * (the "License"); you may not use this file except in compliance 
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * OBI Slave Control Registers for Spatz Core Complex
+ *
+ */
+
+module obi_slave_ctrl_spatz
+  import magia_tile_pkg::*;
+#(
+  parameter logic [31:0] BaseAddr  = 32'h00001700  // Byte address of the first register (CLK_EN)
+) (
+  input  logic              clk_i,
+  input  logic              rst_ni,    // Asynchronous reset, active low
+
+  // OBI slave interface
+  input  core_obi_data_req_t    obi_req_i,
+  output core_obi_data_rsp_t    obi_rsp_o,
+
+  // Control outputs
+  output logic              clk_en_o,  // Bit 0 of the CLK_EN register
+  output logic              start_o,   // Bit 0 of the START register
+  output logic              done_o     // High for one cycle after a 1 is written to DONE
+);
+
+  // Register byte offsets from BaseAddr (same width as addr_offset)
+  localparam logic [4:0] CLK_EN_OFFSET   = 5'h00; // +0x00
+  localparam logic [4:0] READY_OFFSET    = 5'h04; // +0x04
+  localparam logic [4:0] START_OFFSET    = 5'h08; // +0x08
+  localparam logic [4:0] TASKBIN_OFFSET  = 5'h0C; // +0x0C
+  localparam logic [4:0] DATA_OFFSET     = 5'h10; // +0x10
+  localparam logic [4:0] RETURN_OFFSET   = 5'h14; // +0x14
+  localparam logic [4:0] DONE_OFFSET     = 5'h18; // +0x18
+
+  // Registers
+  logic        clk_en_q;
+  logic        start_q;
+  logic        ready_q;
+  logic [31:0] taskbin_q;
+  logic [31:0] data_q;
+  logic [31:0] return_q;
+  logic        done_q;
+
+  // Response pipeline: the R phase is driven the cycle after the granted A phase
+  logic        rvalid_q, rvalid_d;
+  logic [31:0] rdata_q, rdata_d;
+  logic [$bits(obi_rsp_o.r.rid)-1:0] rid_q;  // aid captured together with the request it answers
+
+  // Address decode (offset from base)
+  logic [4:0]  addr_offset;
+  logic        addr_valid;
+
+  assign addr_offset = obi_req_i.a.addr[4:0] - BaseAddr[4:0];  // Also correct if BaseAddr is not 32-byte aligned
+  // Check if address is in valid range
+  assign addr_valid = (obi_req_i.a.addr >= BaseAddr) &&
+                      (obi_req_i.a.addr < (BaseAddr + 28));  // 7 registers * 4 bytes
+
+  // Grant in the request cycle; requests outside the register window are never granted
+  assign obi_rsp_o.gnt = obi_req_i.req && addr_valid;
+  assign obi_rsp_o.r.err = 1'b0;
+
+  // ============================================
+  // Register write logic (combinational)
+  // ============================================
+  logic        clk_en_d;
+  logic        start_d;
+  logic        ready_d;
+  logic [31:0] taskbin_d;
+  logic [31:0] data_d;
+  logic [31:0] return_d;
+  logic        done_d;
+
+  always_comb begin
+    // Default: keep current values
+    clk_en_d  = clk_en_q;
+    start_d   = start_q;
+    ready_d   = ready_q;
+    taskbin_d = taskbin_q;
+    data_d    = data_q;
+    return_d  = return_q;
+    done_d    = 1'b0;  // Done is a pulse, auto-clears
+
+    // Update registers on write only if address is valid
+    if (obi_req_i.req && obi_req_i.a.we && addr_valid) begin
+      case (addr_offset)
+        CLK_EN_OFFSET:  clk_en_d  = obi_req_i.a.wdata[0];
+        READY_OFFSET:   ready_d   = obi_req_i.a.wdata[0];  // Spatz sets to 1 when ready for interrupts
+        START_OFFSET:   start_d   = obi_req_i.a.wdata[0];  // CV32 sets to 1, Spatz clears to 0
+        TASKBIN_OFFSET: taskbin_d = obi_req_i.a.wdata;     // Preserved register
+        DATA_OFFSET:    data_d    = obi_req_i.a.wdata;     // Preserved register
+        RETURN_OFFSET:  return_d  = obi_req_i.a.wdata;     // Preserved register
+        DONE_OFFSET:    done_d    = obi_req_i.a.wdata[0];  // Spatz writes 1, creates pulse
+      endcase
+    end
+  end
+
+  // ============================================
+  // Register sequential logic
+  // ============================================
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      clk_en_q  <= 1'b0;
+      start_q   <= 1'b0;
+      ready_q   <= 1'b0;
+      taskbin_q <= 32'h0;
+      data_q    <= 32'h0;
+      return_q  <= 32'h0;
+      done_q    <= 1'b0;
+    end else begin
+      clk_en_q  <= clk_en_d;
+      start_q   <= start_d;
+      ready_q   <= ready_d;
+      taskbin_q <= taskbin_d;
+      data_q    <= data_d;
+      return_q  <= return_d;
+      done_q    <= done_d;
+    end
+  end
+
+  // ============================================
+  // OBI read response logic (combinational)
+  // ============================================
+  always_comb begin
+    rdata_d  = 32'h0;
+    rvalid_d = obi_req_i.req && addr_valid;  // Writes are acknowledged with rvalid too
+
+    if (obi_req_i.req && !obi_req_i.a.we && addr_valid) begin
+      case (addr_offset)
+        CLK_EN_OFFSET:  rdata_d = {31'h0, clk_en_q};
+        READY_OFFSET:   rdata_d = {31'h0, ready_q};
+        START_OFFSET:   rdata_d = {31'h0, start_q};
+        TASKBIN_OFFSET: rdata_d = taskbin_q;
+        DATA_OFFSET:    rdata_d = data_q;
+        RETURN_OFFSET:  rdata_d = return_q;
+        DONE_OFFSET:    rdata_d = {31'h0, done_q};
+        default:        rdata_d = 32'hDEADBEEF;  // In-window but not a word-aligned register offset
+      endcase
+    end
+  end
+
+  // ============================================
+  // OBI response sequential logic
+  // ============================================
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      rvalid_q <= 1'b0;
+      rdata_q  <= 32'h0;
+      rid_q    <= '0;
+    end else begin
+      rvalid_q <= rvalid_d;
+      rdata_q  <= rdata_d;
+      rid_q    <= obi_req_i.a.aid;  // Keep the ID aligned with rvalid/rdata
+    end
+  end
+
+  // Output assignments
+  assign obi_rsp_o.rvalid = rvalid_q;
+  assign obi_rsp_o.r.rdata  = rdata_q;
+  assign obi_rsp_o.r.rid = rid_q;  // Registered: the request's aid may change once it has been granted
+  assign obi_rsp_o.r.r_optional = '0;
+  assign clk_en_o = clk_en_q;
+  assign start_o  = start_q;
+  assign done_o   = done_q;
+endmodule
diff --git a/hw/tile/spatz_bootrom.sv b/hw/tile/spatz_bootrom.sv
new file mode 100644
index 0000000..7bbac14
--- /dev/null
+++ b/hw/tile/spatz_bootrom.sv
@@ -0,0 +1,39 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+//
+// Auto-generated bootrom for Spatz core complex
+// Generated from: spatz_init.bin
+// Size: 16 bytes (4 words)
+
+module spatz_bootrom #(
+  parameter int unsigned DataWidth = 32,
+  parameter int unsigned AddrWidth = 32
+) (
+  input  logic                  clk_i,
+  input  logic                  req_i,    // When high, addr_i is captured on the rising clock edge
+  input  logic [AddrWidth-1:0]  addr_i,   // Byte address; bits [1:0] are not used
+  output logic [DataWidth-1:0]  rdata_o   // ROM word selected by the last captured address
+);
+  localparam int RomSize = 4;
+  localparam int AddrBits = RomSize > 1 ? $clog2(RomSize) : 1;
+
+  // ROM image. The concatenation fills from the top index down, so the last literal is word 0.
+  // Decoded, the code jumps to the address stored at 0x170C (NOTE(review): looks like TASKBIN - confirm).
+  const logic [RomSize-1:0][DataWidth-1:0] mem = {
+    32'h00030067,  // word 3: jalr x0, 0(t1)
+    32'h0002a303,  // word 2: lw   t1, 0(t0)
+    32'h70c28293,  // word 1: addi t0, t0, 0x70c
+    32'h000012b7   // word 0: lui  t0, 0x1
+  };
+
+  // Word index latched from the most recent request (no reset; holds while req_i is low)
+  logic [AddrBits-1:0] word_idx_q;
+
+  always_ff @(posedge clk_i) begin
+    if (req_i) word_idx_q <= addr_i[2 +: AddrBits];
+  end
+
+  // Read data follows the latched index; an index outside the image reads as zero
+  assign rdata_o = (word_idx_q < RomSize) ? mem[word_idx_q] : '0;
+endmodule
diff --git a/hw/tile/spatz_cc_wrapper.sv b/hw/tile/spatz_cc_wrapper.sv
new file mode 100644
index 0000000..5171fcd
--- /dev/null
+++ b/hw/tile/spatz_cc_wrapper.sv
@@ -0,0 +1,489 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51 
+ * (the "License"); you may not use this file except in compliance 
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ * 
+ * Spatz Core Complex Wrapper for MAGIA Tile
+ * 
+ */
+
+`include "snitch_vm/typedef.svh"
+
+module spatz_cc_wrapper
+  import magia_tile_pkg::*;
+  import magia_pkg::*;
+  import snitch_pkg::*;
+  import obi_pkg::*;
+  import fpnew_pkg::*;
+#(
+  // Spatz Core Complex Parameters
+  parameter int unsigned AddrWidth                = magia_pkg::ADDR_W,
+  parameter int unsigned DataWidth                = magia_tile_pkg::SPATZ_TCDM_DATA_WIDTH,
+  parameter int unsigned TCDMAddrWidth            = magia_tile_pkg::SPATZ_TCDM_ADDR_WIDTH,
+  parameter int unsigned NumSpatzFPUs             = magia_tile_pkg::SPATZ_NUM_FPU,
+  parameter int unsigned NumSpatzIPUs             = magia_tile_pkg::SPATZ_NUM_IPU,
+  parameter int unsigned NumIntOutstandingLoads   = magia_tile_pkg::SPATZ_NUM_INT_OUTSTANDING_LOADS,
+  parameter int unsigned NumIntOutstandingMem     = magia_tile_pkg::SPATZ_NUM_INT_OUTSTANDING_MEM,
+  parameter int unsigned NumSpatzOutstandingLoads = magia_tile_pkg::SPATZ_NUM_SPATZ_OUTSTANDING_LOADS,
+  parameter bit          Xdma                     = magia_tile_pkg::SPATZ_XDMA,
+  parameter bit          RVF                      = magia_tile_pkg::SPATZ_RVF,
+  parameter bit          RVD                      = magia_tile_pkg::SPATZ_RVD_PARAM,
+  parameter bit          RVV                      = magia_tile_pkg::SPATZ_RVV,
+  parameter bit          XDivSqrt                 = magia_tile_pkg::SPATZ_XDIVSQRT,
+  parameter bit          RegisterOffloadRsp       = magia_tile_pkg::SPATZ_REGISTER_OFFLOAD_RSP,
+  parameter bit          RegisterCoreReq          = magia_tile_pkg::SPATZ_REGISTER_CORE_REQ,
+  parameter bit          RegisterCoreRsp          = magia_tile_pkg::SPATZ_REGISTER_CORE_RSP,
+  parameter logic [31:0] BootAddr                 = magia_tile_pkg::SPATZ_BOOT_ADDR,
+  
+  parameter fpnew_pkg::fpu_implementation_t FPUImplementation = magia_tile_pkg::SPATZ_FPUImplementation,
+  
+  // Derived parameters - computed dynamically from the configurable N_IPU and N_FPU
+  localparam int unsigned NumSpatzFUs         = (NumSpatzFPUs > NumSpatzIPUs) ? NumSpatzFPUs : NumSpatzIPUs,
+  localparam int unsigned NumMemPortsPerSpatz = NumSpatzFUs,
+  localparam int unsigned TCDMPorts           = RVV ? NumMemPortsPerSpatz + 1 : 1,  // N_FU + 1 TCDM ports
+  localparam int unsigned HCIMasterPorts      = RVD ? (TCDMPorts * 2) : TCDMPorts   // RVD=1: 2 HCI32/TCDM, RVD=0: 1 HCI32/TCDM
+)(
+  input  logic                                    clk_i,
+  input  logic                                    rst_ni,
+  input  logic                                    test_mode_i,
+  
+  // Core configuration
+  input  logic [31:0]                             hart_id_i,
+  input  logic [AddrWidth-1:0]                    tcdm_addr_base_i,  // TCDM base address for this tile
+  
+  // Interrupts
+  input  snitch_pkg::interrupts_t                 irq_i,
+  
+  // HCI Master Interface(s) - Connect to MAGIA HCI Interconnect for L1 SPM access
+  output magia_tile_pkg::core_hci_data_req_t [HCIMasterPorts-1:0] hci_master_req_o,
+  input  magia_tile_pkg::core_hci_data_rsp_t [HCIMasterPorts-1:0] hci_master_rsp_i,
+  
+  // OBI Master Interface - Single port for Snitch core
+  output magia_tile_pkg::core_obi_data_req_t      obi_master_req_o,
+  input  magia_tile_pkg::core_obi_data_rsp_t      obi_master_rsp_i,
+  
+  // Core -> ICache signals (from hive_req)
+  output logic                                    inst_req_o,        // Core wants instruction (inst_valid)
+  output logic [AddrWidth-1:0]                    inst_addr_o,       // Instruction address
+  output logic                                    inst_cacheable_o,  // Cacheable flag
+  output logic                                    flush_i_valid_o,   // Flush request
+  // ICache -> Core signals (to hive_rsp)
+  input  logic [31:0]                             inst_data_i,       // Instruction data
+  input  logic                                    inst_ready_i,      // ICache has data ready
+  input  logic                                    inst_error_i,      // Fetch error
+  input  logic                                    flush_i_ready_i,   // Flush complete
+  
+  // Events and status
+  output snitch_pkg::core_events_t                core_events_o
+);
+
+  /*******************************************************************/
+  /*                     Internal Type Definitions                   */
+  /*******************************************************************/
+  
+  // Local type aliases
+  typedef logic [AddrWidth-1:0] addr_t;
+  typedef logic [DataWidth-1:0] data_t;
+  
+  // Accelerator types
+  typedef struct packed {
+    logic [31:0] addr;
+    logic [5:0]  id;
+    logic [31:0] data_op;
+    data_t       data_arga;
+    data_t       data_argb;
+    addr_t       data_argc;
+  } acc_issue_req_t;
+  
+  typedef struct packed {
+    logic accept;
+    logic writeback;
+    logic loadstore;
+    logic exception;
+    logic isfloat;
+  } acc_issue_rsp_t;
+  
+  typedef struct packed {
+    logic [5:0] id;
+    logic       error;
+    data_t      data;
+  } acc_rsp_t;
+  
+  // Virtual memory types - use macro from snitch_vm/typedef.svh
+  `SNITCH_VM_TYPEDEF(AddrWidth)
+  
+  // Hive types (instruction fetch) - matching spatz_cluster.sv
+  typedef struct packed {
+    logic flush_i_valid;
+    addr_t inst_addr;
+    logic inst_cacheable;
+    logic inst_valid;
+    acc_issue_req_t acc_req;
+    logic acc_qvalid;
+    logic acc_pready;
+    logic [1:0] ptw_valid;   
+    va_t [1:0] ptw_va;        
+    pa_t [1:0] ptw_ppn;        
+  } hive_req_t;
+  
+  typedef struct packed {
+    logic flush_i_ready;
+    logic [31:0] inst_data;
+    logic inst_ready;
+    logic inst_error;
+    logic acc_qready;
+    acc_rsp_t acc_resp;
+    logic acc_pvalid;
+    logic [1:0] ptw_ready;     
+    l0_pte_t [1:0] ptw_pte;     
+    logic [1:0] ptw_is_4mega;    
+  } hive_rsp_t;
+  
+  // DMA types - simplified for MAGIA (DMA disabled)
+  typedef struct packed {
+    logic start;
+    logic complete;
+    logic error;
+  } dma_events_t;
+  
+  /*******************************************************************/
+  /*                     Internal Signal Declarations                */
+  /*******************************************************************/
+  
+  // Spatz_cc internal signals
+  hive_req_t                                  hive_req;
+  hive_rsp_t                                  hive_rsp;
+  
+  // Conditional signal declarations based on RVD parameter
+  generate
+    if (RVD) begin : gen_signals
+      // RVD=1: 64-bit signal types
+      magia_tile_pkg::spatz_reqrsp64_req_t        data_req;
+      magia_tile_pkg::spatz_reqrsp64_rsp_t        data_rsp;
+      magia_tile_pkg::spatz_tcdm64_req_t [TCDMPorts-1:0] tcdm_req;
+      magia_tile_pkg::spatz_tcdm64_rsp_t [TCDMPorts-1:0] tcdm_rsp;
+    end else begin : gen_signals
+      // RVD=0: 32-bit signal types
+      magia_tile_pkg::spatz_reqrsp_req_t          data_req;
+      magia_tile_pkg::spatz_reqrsp_rsp_t          data_rsp;
+      magia_tile_pkg::spatz_tcdm_req_t [TCDMPorts-1:0] tcdm_req;
+      magia_tile_pkg::spatz_tcdm_rsp_t [TCDMPorts-1:0] tcdm_rsp;
+    end
+  endgenerate
+  
+  /*******************************************************************/
+  /*                     Spatz Core Complex Instance                 */
+  /*******************************************************************/
+  
+  generate
+    if (RVD) begin : gen_spatz_cc_rvd
+      // RVD=1: 64-bit configuration
+      spatz_cc #(
+        .AddrWidth                ( AddrWidth                 ),
+        .DataWidth                ( DataWidth                 ),  // 64-bit
+        .TCDMAddrWidth            ( TCDMAddrWidth             ),
+        .dreq_t                   ( magia_tile_pkg::spatz_reqrsp64_req_t     ),
+        .drsp_t                   ( magia_tile_pkg::spatz_reqrsp64_rsp_t     ),
+        .tcdm_req_t               ( magia_tile_pkg::spatz_tcdm64_req_t       ),
+        .tcdm_req_chan_t          ( magia_tile_pkg::spatz_tcdm64_payload_t   ),
+        .tcdm_rsp_t               ( magia_tile_pkg::spatz_tcdm64_rsp_t       ),
+        .tcdm_rsp_chan_t          ( magia_tile_pkg::spatz_tcdm64_rsp_chan_t  ),
+        .axi_req_t                ( magia_tile_pkg::idma_axi_req_t        ),
+        .axi_ar_chan_t            ( magia_tile_pkg::idma_axi_ar_chan_t    ),
+        .axi_aw_chan_t            ( magia_tile_pkg::idma_axi_aw_chan_t    ),
+        .axi_rsp_t                ( magia_tile_pkg::idma_axi_rsp_t        ),
+        .hive_req_t               ( hive_req_t                ),
+        .hive_rsp_t               ( hive_rsp_t                ),
+        .acc_issue_req_t          ( acc_issue_req_t           ),
+        .acc_issue_rsp_t          ( acc_issue_rsp_t           ),
+        .acc_rsp_t                ( acc_rsp_t                 ),
+        .dma_events_t             ( dma_events_t                          ),
+        .dma_perf_t               ( logic                                 ),        
+        .SnitchPMACfg             ( magia_tile_pkg::SPATZ_SNITCH_PMA_CFG  ),        
+        .FPUImplementation        ( FPUImplementation         ),
+        .BootAddr                 ( BootAddr                  ),
+        .RVE                      ( 1'b0                      ),
+        .RVF                      ( RVF                       ),
+        .RVD                      ( 1'b1                      ),
+        .XDivSqrt                 ( XDivSqrt                  ),
+        .XF8                      ( 1'b1                      ),
+        .XF16                     ( 1'b1                      ),
+        .XF16ALT                  ( 1'b1                      ),
+        .XF8ALT                   ( 1'b1                      ),
+        .Xdma                     ( Xdma                      ),
+        .NumIntOutstandingLoads   ( NumIntOutstandingLoads    ),
+        .NumIntOutstandingMem     ( NumIntOutstandingMem      ),
+        .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads  ),
+        .RVV                      ( RVV                       ),
+        .NumSpatzFPUs             ( NumSpatzFPUs              ),
+        .NumSpatzIPUs             ( NumSpatzIPUs              ),
+        .IsoCrossing              ( 1'b0                      ),
+        .RegisterOffloadRsp       ( RegisterOffloadRsp        ),
+        .RegisterCoreReq          ( RegisterCoreReq           ),
+        .RegisterCoreRsp          ( RegisterCoreRsp           )
+      ) i_spatz_cc (
+        .clk_i                    ( clk_i                     ),
+        .clk_d2_i                 ( clk_i                     ),
+        .rst_ni                   ( rst_ni                    ),
+        .testmode_i               ( test_mode_i               ),
+        .hart_id_i                ( hart_id_i                 ),
+        .irq_i                    ( irq_i                     ),
+        .tcdm_addr_base_i         ( tcdm_addr_base_i          ),
+        .hive_req_o               ( hive_req                  ),
+        .hive_rsp_i               ( hive_rsp                  ),
+        .data_req_o               ( gen_signals.data_req      ),
+        .data_rsp_i               ( gen_signals.data_rsp      ),
+        .tcdm_req_o               ( gen_signals.tcdm_req      ),
+        .tcdm_rsp_i               ( gen_signals.tcdm_rsp      ),
+        .axi_dma_req_o            ( /* DMA disabled */        ),
+        .axi_dma_res_i            ( '0                        ),
+        .axi_dma_busy_o           ( /* DMA disabled */        ),
+        .axi_dma_perf_o           ( /* DMA disabled */        ),
+        .axi_dma_events_o         ( /* DMA disabled */        ),
+        .core_events_o            ( core_events_o             )
+      );
+    end else begin : gen_spatz_cc_no_rvd
+      // RVD=0: 32-bit configuration
+      spatz_cc #(
+        .AddrWidth                ( AddrWidth                 ),
+        .DataWidth                ( DataWidth                 ),  // 32-bit
+        .TCDMAddrWidth            ( TCDMAddrWidth             ),
+        .dreq_t                   ( magia_tile_pkg::spatz_reqrsp_req_t     ),
+        .drsp_t                   ( magia_tile_pkg::spatz_reqrsp_rsp_t     ),
+        .tcdm_req_t               ( magia_tile_pkg::spatz_tcdm_req_t       ),
+        .tcdm_req_chan_t          ( magia_tile_pkg::spatz_tcdm_req_chan_t  ),
+        .tcdm_rsp_t               ( magia_tile_pkg::spatz_tcdm_rsp_t       ),
+        .tcdm_rsp_chan_t          ( magia_tile_pkg::spatz_tcdm_rsp_chan_t  ),
+        .axi_req_t                ( magia_tile_pkg::idma_axi_req_t        ),
+        .axi_ar_chan_t            ( magia_tile_pkg::idma_axi_ar_chan_t    ),
+        .axi_aw_chan_t            ( magia_tile_pkg::idma_axi_aw_chan_t    ),
+        .axi_rsp_t                ( magia_tile_pkg::idma_axi_rsp_t        ),
+        .hive_req_t               ( hive_req_t                ),
+        .hive_rsp_t               ( hive_rsp_t                ),
+        .acc_issue_req_t          ( acc_issue_req_t           ),
+        .acc_issue_rsp_t          ( acc_issue_rsp_t           ),
+        .acc_rsp_t                ( acc_rsp_t                 ),
+        .dma_events_t             ( dma_events_t                          ),
+        .dma_perf_t               ( logic                                 ),
+        .SnitchPMACfg             ( magia_tile_pkg::SPATZ_SNITCH_PMA_CFG  ),
+        .FPUImplementation        ( FPUImplementation         ),
+        .BootAddr                 ( BootAddr                  ),
+        .RVE                      ( 1'b0                      ),
+        .RVF                      ( RVF                       ),
+        .RVD                      ( 1'b0                      ),
+        .XDivSqrt                 ( XDivSqrt                  ),
+        .XF8                      ( 1'b1                      ),
+        .XF16                     ( 1'b1                      ),
+        .XF16ALT                  ( 1'b1                      ),
+        .XF8ALT                   ( 1'b1                      ),
+        .Xdma                     ( Xdma                      ),
+        .NumIntOutstandingLoads   ( NumIntOutstandingLoads    ),
+        .NumIntOutstandingMem     ( NumIntOutstandingMem      ),
+        .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads  ),
+        .RVV                      ( RVV                       ),
+        .NumSpatzFPUs             ( NumSpatzFPUs              ),
+        .NumSpatzIPUs             ( NumSpatzIPUs              ),
+        .IsoCrossing              ( 1'b0                      ),
+        .RegisterOffloadRsp       ( RegisterOffloadRsp        ),
+        .RegisterCoreReq          ( RegisterCoreReq           ),
+        .RegisterCoreRsp          ( RegisterCoreRsp           )
+      ) i_spatz_cc (
+        .clk_i                    ( clk_i                     ),
+        .clk_d2_i                 ( clk_i                     ),
+        .rst_ni                   ( rst_ni                    ),
+        .testmode_i               ( test_mode_i               ),
+        .hart_id_i                ( hart_id_i                 ),
+        .irq_i                    ( irq_i                     ),
+        .tcdm_addr_base_i         ( tcdm_addr_base_i          ),
+        .hive_req_o               ( hive_req                  ),
+        .hive_rsp_i               ( hive_rsp                  ),
+        .data_req_o               ( gen_signals.data_req      ),
+        .data_rsp_i               ( gen_signals.data_rsp      ),
+        .tcdm_req_o               ( gen_signals.tcdm_req      ),
+        .tcdm_rsp_i               ( gen_signals.tcdm_rsp      ),
+        .axi_dma_req_o            ( /* DMA disabled */        ),
+        .axi_dma_res_i            ( '0                        ),
+        .axi_dma_busy_o           ( /* DMA disabled */        ),
+        .axi_dma_perf_o           ( /* DMA disabled */        ),
+        .axi_dma_events_o         ( /* DMA disabled */        ),
+        .core_events_o            ( core_events_o             )
+      );
+    end
+  endgenerate
+  
+  /*******************************************************************/
+  /*          Instruction Cache Interface (Hive) Connection         */
+  /*******************************************************************/
+  
+  // Map hive request to external icache ports (Core → ICache)
+  assign inst_req_o        = hive_req.inst_valid;
+  assign inst_addr_o       = hive_req.inst_addr;
+  assign inst_cacheable_o  = hive_req.inst_cacheable;
+  assign flush_i_valid_o   = hive_req.flush_i_valid;
+  
+  // Map external icache response to hive response (ICache → Core)
+  assign hive_rsp = '{
+    inst_data    : inst_data_i,
+    inst_ready   : inst_ready_i,
+    inst_error   : inst_error_i,
+    flush_i_ready: flush_i_ready_i,
+    default      : '0
+  };
+  
+  /*******************************************************************/
+  /*         Protocol Conversion: reqrsp (Snitch) → OBI32           */
+  /*******************************************************************/
+  
+  generate
+    if (RVD) begin : gen_reqrsp64_to_obi32
+      // RVD=1: Snitch data port 64-bit reqrsp → 32-bit OBI (for L2/peripherals)
+      reqrsp64_to_obi32 #(
+        .ObiCfg       ( magia_tile_pkg::obi_amo_cfg              ),
+        .reqrsp_req_t ( magia_tile_pkg::spatz_reqrsp64_req_t    ),
+        .reqrsp_rsp_t ( magia_tile_pkg::spatz_reqrsp64_rsp_t    ),
+        .obi_req_t    ( magia_tile_pkg::core_obi_data_req_t      ),
+        .obi_rsp_t    ( magia_tile_pkg::core_obi_data_rsp_t      )
+      ) i_snitch_reqrsp_to_obi32 (
+        .clk_i        ( clk_i                              ),
+        .rst_ni       ( rst_ni                             ),
+        .reqrsp_req_i ( gen_signals.data_req               ),
+        .reqrsp_rsp_o ( gen_signals.data_rsp               ),
+        .obi_req_o    ( obi_master_req_o                   ),
+        .obi_rsp_i    ( obi_master_rsp_i                   )
+      );
+    end else begin : gen_reqrsp32_to_obi32
+      // RVD=0: Snitch data port 32-bit reqrsp → 32-bit OBI (for L2/peripherals)
+      reqrsp2obi #(
+        .ObiCfg       ( magia_tile_pkg::obi_amo_cfg             ),
+        .reqrsp_req_t ( magia_tile_pkg::spatz_reqrsp_req_t     ),
+        .reqrsp_rsp_t ( magia_tile_pkg::spatz_reqrsp_rsp_t     ),
+        .obi_req_t    ( magia_tile_pkg::core_obi_data_req_t     ),
+        .obi_rsp_t    ( magia_tile_pkg::core_obi_data_rsp_t     )
+      ) i_snitch_reqrsp_to_obi32 (
+        .clk_i        ( clk_i                              ),
+        .rst_ni       ( rst_ni                             ),
+        .reqrsp_req_i ( gen_signals.data_req               ),
+        .reqrsp_rsp_o ( gen_signals.data_rsp               ),
+        .obi_req_o    ( obi_master_req_o                   ),
+        .obi_rsp_i    ( obi_master_rsp_i                   )
+      );
+    end
+  endgenerate
+  
+  /*******************************************************************/
+  /*        Protocol Conversion: TCDM (Spatz) → HCI32               */
+  /*******************************************************************/
+  
+  generate
+    if (RVD) begin : gen_tcdm64_to_hci
+      // RVD=1: Each 64-bit TCDM port splits into 2×32-bit HCI ports
+      
+      // Ports 0..NumMemPortsPerSpatz-1: Spatz vector memory ports (64-bit TCDM port i → 32-bit HCI ports 2*i and 2*i+1)
+      for (genvar i = 0; i < NumMemPortsPerSpatz; i++) begin : gen_tcdm64_to_dual_hci32_spatz
+        tcdm64_to_dual_hci32 #(
+          .tcdm64_req_t  ( magia_tile_pkg::spatz_tcdm64_req_t  ),
+          .tcdm64_rsp_t  ( magia_tile_pkg::spatz_tcdm64_rsp_t  ),
+          .tcdm32_req_t  ( magia_tile_pkg::spatz_tcdm32_req_t  ),
+          .tcdm32_rsp_t  ( magia_tile_pkg::spatz_tcdm32_rsp_t  ),
+          .hci_req_t     ( magia_tile_pkg::core_hci_data_req_t ),
+          .hci_rsp_t     ( magia_tile_pkg::core_hci_data_rsp_t )
+        ) i_tcdm64_to_dual_hci32_spatz (
+          .clk_i         ( clk_i                              ),
+          .rst_ni        ( rst_ni                             ),
+          .tcdm_req_i    ( gen_signals.tcdm_req[i]            ),
+          .tcdm_rsp_o    ( gen_signals.tcdm_rsp[i]            ),
+          .hci_req_lo_o  ( hci_master_req_o[i*2]              ),
+          .hci_rsp_lo_i  ( hci_master_rsp_i[i*2]              ),
+          .hci_req_hi_o  ( hci_master_req_o[i*2+1]            ),
+          .hci_rsp_hi_i  ( hci_master_rsp_i[i*2+1]            )
+        );
+      end
+      
+      // Port NumMemPortsPerSpatz: Snitch 64-bit TCDM (with AMO) → dual HCI32 (HCI ports 2*NumMemPortsPerSpatz and 2*NumMemPortsPerSpatz+1)
+      tcdm64_to_dual_hci32_atomic #(
+        .tcdm64_req_t         ( magia_tile_pkg::spatz_tcdm64_req_t         ),
+        .tcdm64_rsp_t         ( magia_tile_pkg::spatz_tcdm64_rsp_t         ),
+        .tcdm32_req_t         ( magia_tile_pkg::spatz_tcdm32_req_t         ),
+        .tcdm32_rsp_t         ( magia_tile_pkg::spatz_tcdm32_rsp_t         ),
+        .obi32_req_t          ( magia_tile_pkg::spatz_obi32_req_t          ),
+        .obi32_rsp_t          ( magia_tile_pkg::spatz_obi32_rsp_t          ),
+        .obi32_a_chan_t       ( magia_tile_pkg::spatz_obi32_a_chan_t       ),
+        .obi32_r_chan_t       ( magia_tile_pkg::spatz_obi32_r_chan_t       ),
+        .hci_req_t            ( magia_tile_pkg::core_hci_data_req_t        ),
+        .hci_rsp_t            ( magia_tile_pkg::core_hci_data_rsp_t        ),
+        .obi32_a_optional_t   ( magia_tile_pkg::spatz_obi32_a_optional_t  ),
+        .obi32_r_optional_t   ( magia_tile_pkg::spatz_obi32_r_optional_t  ),
+        .SbrPortObiCfg        ( magia_tile_pkg::obi_amo_cfg                ),
+        .MgrPortObiCfg        ( magia_tile_pkg::obi_no_amo_cfg             ),
+        .BypassCut            ( 1'b0                                       )
+      ) i_tcdm64_to_dual_hci32_atomic_snitch (
+        .clk_i         ( clk_i                                      ),
+        .rst_ni        ( rst_ni                                     ),
+        .tcdm_req_i    ( gen_signals.tcdm_req[NumMemPortsPerSpatz]  ),
+        .tcdm_rsp_o    ( gen_signals.tcdm_rsp[NumMemPortsPerSpatz]  ),
+        .hci_req_lo_o  ( hci_master_req_o[NumMemPortsPerSpatz*2]        ),
+        .hci_rsp_lo_i  ( hci_master_rsp_i[NumMemPortsPerSpatz*2]        ),
+        .hci_req_hi_o  ( hci_master_req_o[NumMemPortsPerSpatz*2+1]      ),
+        .hci_rsp_hi_i  ( hci_master_rsp_i[NumMemPortsPerSpatz*2+1]      )
+      );
+      
+    end else begin : gen_tcdm32_to_hci
+      // RVD=0: Each 32-bit TCDM port maps directly to 1×32-bit HCI port
+      
+      // Ports 0..NumMemPortsPerSpatz-1: Spatz vector memory ports (32-bit TCDM port i → 32-bit HCI port i)
+      for (genvar i = 0; i < NumMemPortsPerSpatz; i++) begin : gen_tcdm32_to_hci32_spatz
+        tcdm2hci #(
+          .tcdm_req_t  ( magia_tile_pkg::spatz_tcdm_req_t     ),
+          .tcdm_rsp_t  ( magia_tile_pkg::spatz_tcdm_rsp_t     ),
+          .hci_req_t   ( magia_tile_pkg::core_hci_data_req_t  ),
+          .hci_rsp_t   ( magia_tile_pkg::core_hci_data_rsp_t  )
+        ) i_tcdm32_to_hci32_spatz (
+          .clk_i       ( clk_i                            ),
+          .rst_ni      ( rst_ni                           ),
+          .tcdm_req_i  ( gen_signals.tcdm_req[i]          ),
+          .tcdm_rsp_o  ( gen_signals.tcdm_rsp[i]          ),
+          .hci_req_o   ( hci_master_req_o[i]              ),
+          .hci_rsp_i   ( hci_master_rsp_i[i]              )
+        );
+      end
+      
+      // Port NumMemPortsPerSpatz: Snitch 32-bit TCDM (with AMO) → HCI32 (HCI port NumMemPortsPerSpatz)
+      tcdm2hci_atomic #(
+        .tcdm_req_t         ( magia_tile_pkg::spatz_tcdm_req_t         ),
+        .tcdm_rsp_t         ( magia_tile_pkg::spatz_tcdm_rsp_t         ),
+        .obi_req_t          ( magia_tile_pkg::core_obi_data_req_t      ),
+        .obi_rsp_t          ( magia_tile_pkg::core_obi_data_rsp_t      ),
+        .obi_a_chan_t       ( magia_tile_pkg::core_data_obi_a_chan_t   ),
+        .obi_r_chan_t       ( magia_tile_pkg::core_data_obi_r_chan_t   ),
+        .hci_req_t          ( magia_tile_pkg::core_hci_data_req_t      ),
+        .hci_rsp_t          ( magia_tile_pkg::core_hci_data_rsp_t      ),
+        .obi_a_optional_t   ( magia_tile_pkg::core_data_obi_a_optional_t ),
+        .obi_r_optional_t   ( magia_tile_pkg::core_data_obi_r_optional_t ),
+        .SbrPortObiCfg      ( magia_tile_pkg::obi_amo_cfg              ),
+        .MgrPortObiCfg      ( magia_tile_pkg::obi_no_amo_cfg           ),
+        .BypassCut          ( 1'b0                                     )
+      ) i_tcdm32_to_hci32_atomic_snitch (
+        .clk_i       ( clk_i                                      ),
+        .rst_ni      ( rst_ni                                     ),
+        .tcdm_req_i  ( gen_signals.tcdm_req[NumMemPortsPerSpatz]  ),
+        .tcdm_rsp_o  ( gen_signals.tcdm_rsp[NumMemPortsPerSpatz]  ),
+        .hci_req_o   ( hci_master_req_o[NumMemPortsPerSpatz]             ),
+        .hci_rsp_i   ( hci_master_rsp_i[NumMemPortsPerSpatz]             )
+      );
+    end
+  endgenerate
+
+endmodule : spatz_cc_wrapper
diff --git a/scripts/bin2header.py b/scripts/bin2header.py
new file mode 100644
index 0000000..4195577
--- /dev/null
+++ b/scripts/bin2header.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+"""
+Convert binary file to C header with embedded array.
+Generates a header with the binary as uint32_t array in custom linker section.
+"""
+
+import argparse
+
+def bytes_to_words(data):
+    """Return data as a list of "0x%08X"-formatted strings, one per little-endian 32-bit word."""
+    words = []
+    for i in range(0, len(data), 4):
+        # A final chunk shorter than 4 bytes only fills the low bytes, so the word is zero-padded
+        chunk = data[i:i+4]
+        word = sum(byte << (8 * j) for j, byte in enumerate(chunk))
+        words.append(f"0x{word:08X}")
+    return words
+
+def generate_header(binary_path, output_path, array_name, section_name, address):
+    """Generate C header with binary array in custom section."""
+    
+    # Read binary file
+    with open(binary_path, 'rb') as f:
+        data = f.read()
+    
+    # Convert to words
+    words = bytes_to_words(data)
+    guard = f"__{array_name.upper()}_H__"
+    
+    # Write header file
+    with open(output_path, 'w') as f:
+        # Header guard and includes
+        f.write(f"/* Auto-generated from {binary_path} ({len(data)} bytes) */\n")
+        f.write(f"#ifndef {guard}\n")
+        f.write(f"#define {guard}\n\n")
+        f.write(f"#include <stdint.h>\n\n")
+        
+        # Binary array with section attribute
+        f.write(f"/* Binary array in {section_name} section (target: {address}) */\n")
+        f.write(f"const uint32_t {array_name}[] ")
+        f.write(f"__attribute__((section(\"{section_name}\"), aligned(4), used)) = {{\n")
+        
+        # Write words (8 per line for readability)
+        for i in range(0, len(words), 8):
+            line = words[i:i+8]
+            f.write("    " + ", ".join(line))
+            f.write(",\n" if i + 8 < len(words) else "\n")
+        
+        f.write("};\n\n")
+        f.write(f"#endif /* {guard} */\n")
+    
+    print(f"Generated: {output_path} ({len(data)} bytes = {len(words)} words)")
+
+def main():  # CLI entry point: parse the arguments and emit the header
+    parser = argparse.ArgumentParser(description='Convert binary to C header')
+    parser.add_argument('binary', help='Input binary file')
+    parser.add_argument('output', help='Output header file')
+    parser.add_argument('--name', default='spatz_program', help='Array name')
+    parser.add_argument('--section', default='.spatz_binary', help='Linker section name')
+    parser.add_argument('--address', default='dynamic (_spatz_binary_start)', 
+                       help='Target address (informational)')
+    # --address is forwarded only so it can be echoed in a comment of the generated header
+    args = parser.parse_args()
+    generate_header(args.binary, args.output, args.name, args.section, args.address)
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/generate_spatz_bootrom.py b/scripts/generate_spatz_bootrom.py
new file mode 100644
index 0000000..4fd641d
--- /dev/null
+++ b/scripts/generate_spatz_bootrom.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+# Copyright 2024 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+#
+# Generate SystemVerilog bootrom module from binary file
+
+import argparse
+import sys
+
+def generate_bootrom_sv(bin_file, output_file):
+    """Generate SystemVerilog bootrom module from binary file."""
+    
+    # Read binary file
+    with open(bin_file, 'rb') as f:
+        data = f.read()
+    
+    # Pad to 32-bit word boundary
+    if len(data) % 4 != 0:
+        padding = 4 - (len(data) % 4)
+        data = data + b'\x00' * padding
+    
+    num_bytes = len(data)
+    num_words = num_bytes // 4
+    
+    # Convert bytes to 32-bit words (little-endian, in address order)
+    words = []
+    for i in range(0, num_bytes, 4):
+        word = int.from_bytes(data[i:i+4], byteorder='little')
+        words.append(word)
+    
+    # Generate SystemVerilog module (like spatz_cluster)
+    sv_content = f'''// Copyright 2024 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+//
+// Auto-generated bootrom for Spatz core complex
+// Generated from: {bin_file}
+// Size: {num_bytes} bytes ({num_words} words)
+
+module spatz_bootrom #(
+  parameter int unsigned DataWidth = 32,
+  parameter int unsigned AddrWidth = 32
+) (
+  input  logic                  clk_i,
+  input  logic                  req_i,
+  input  logic [AddrWidth-1:0]  addr_i,
+  output logic [DataWidth-1:0]  rdata_o
+);
+  localparam int RomSize = {num_words};
+  localparam int AddrBits = RomSize > 1 ? $clog2(RomSize) : 1;
+
+  const logic [RomSize-1:0][DataWidth-1:0] mem = {{
+'''
+    
+    # Add ROM content in REVERSE order (high index first)
+    # This is because packed arrays fill from high to low index
+    for i in range(num_words - 1, -1, -1):
+        sv_content += f"    32'h{words[i]:08x}"
+        if i > 0:
+            sv_content += ","
+        sv_content += "\n"
+    
+    sv_content += '''  };
+
+  // Sequential read with registered address
+  logic [AddrBits-1:0] addr_q;
+
+  always_ff @(posedge clk_i) begin
+    if (req_i) begin
+      addr_q <= addr_i[AddrBits-1+2:2];
+    end
+  end
+  
+  // Return data based on registered address
+  assign rdata_o = (addr_q < RomSize) ? mem[addr_q] : '0;
+endmodule
+'''
+    
+    # Write output file
+    with open(output_file, 'w') as f:
+        f.write(sv_content)
+    
+    print(f"Generated {output_file}: {num_words} words ({num_bytes} bytes)")
+
+def main():  # CLI entry point: parse the arguments and generate the bootrom module
+    parser = argparse.ArgumentParser(description='Generate SystemVerilog bootrom from binary')
+    parser.add_argument('binary', help='Input binary file')
+    parser.add_argument('-o', '--output', required=True, help='Output SystemVerilog file')
+    
+    args = parser.parse_args()
+    # Report any failure on stderr and exit with status 1 instead of printing a traceback
+    try:
+        generate_bootrom_sv(args.binary, args.output)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
diff --git a/setup_env.sh b/setup_env.sh
index ed0bd50..d588cee 100644
--- a/setup_env.sh
+++ b/setup_env.sh
@@ -15,8 +15,8 @@ export PATH=/usr/local/anaconda3-2023.07/condabin:$PATH
 export PATH=/home/visachi/.local/bin:$PATH
 export XLEN=32
 if [[ "$core" == "CV32E40P" ]]; then
-  echo "Exporting ISA extentions: I, M, F, C, XPULP_V2"
-  export XTEN=imfcxpulpv2
+  echo "Exporting ISA extentions: I, M, C, GAP9"
+  export XTEN=imcxgap9
 else
   echo "Exporting ISA extentions: I, M, A, F, C"
   export XTEN=imafc
diff --git a/spatz/README.md b/spatz/README.md
new file mode 100644
index 0000000..d90ba1f
--- /dev/null
+++ b/spatz/README.md
@@ -0,0 +1,360 @@
+# Spatz Vector Accelerator in MAGIA
+[![License](https://img.shields.io/badge/license-Apache--2.0-green)](../LICENSE.APACHE)
+[![SHL-0.51 license](https://img.shields.io/badge/license-SHL--0.51-green)](../LICENSE.SHL)
+
+The Spatz Core Complex (Spatz CC) is a **RISC-V Vector Extension 1.0 (Zve32f/Zve64d) core** integrated into the MAGIA tile architecture. It consists of **Snitch** (a compact 32-bit RISC-V scalar core) and **Spatz** (a vector coprocessor with multiple FPU lanes). The CV32 host core controls Spatz CC through memory-mapped registers and interrupt-driven task dispatching, following an accelerator-style programming model.
+
+**Key Features:**
+- RISC-V Vector Extension 1.0 (Zve32f/Zve64d) support (RVV)
+- Single and double precision floating-point (RVF, RVD)
+- Configurable vector length (VLEN: 128-512 bits)
+- Up to 8 vector FPU lanes
+- Clock-gated, interrupt-driven operation
+- Transparent binary embedding in CV32 firmware
+
+## ⚙️ Hardware Integration
+
+### OBI Slave Control Interface
+
+Spatz CC is controlled through an **OBI slave interface** (`obi_slave_ctrl_spatz`) that exposes memory-mapped control registers. This interface allows the CV32 to:
+- Enable/disable the Spatz clock
+- Set task entry points
+- Trigger task execution via interrupt
+- Read task completion status and exit codes
+
+### Control Registers
+
+The OBI slave exposes the following registers (base address: `SPATZ_CTRL_BASE = 0x00001700`):
+
+| Register Name    | Offset | Address    | Description                                                                                              |
+|------------------|--------|------------|----------------------------------------------------------------------------------------------------------|
+| `SPATZ_CLK_EN`   | 0x00   | 0x00001700 | Clock enable (write 1 to enable, 0 to disable Spatz CC clock)                                           |
+| `SPATZ_READY`    | 0x04   | 0x00001704 | Ready flag: Spatz writes 1 when ready for interrupts, 0 after first task; CV32 polls before triggering  |
+| `SPATZ_START`    | 0x08   | 0x00001708 | Trigger register: write 1 to send interrupt to Spatz CC, Snitch clears it to 0                          |
+| `SPATZ_TASKBIN`  | 0x0C   | 0x0000170C | Task binary address: CV32 writes task entry point address, Snitch reads it                              |
+| `SPATZ_DATA`     | 0x10   | 0x00001710 | Data/parameter pointer: CV32 writes pointer to task parameters, Snitch reads it                          |
+| `SPATZ_RETURN`   | 0x14   | 0x00001714 | Return value: Snitch writes task exit code (0=success, non-zero=error/exception), CV32 reads it         |
+| `SPATZ_DONE`     | 0x18   | 0x00001718 | Done flag: Snitch writes 1 when task completes, generates 1-cycle pulse to Event Unit, auto-clears      |
+
+**Register Behavior:**
+- **CLK_EN, START, READY, TASKBIN, DATA, RETURN**: Preserved registers - values persist until overwritten
+- **DONE**: Pulse register - auto-clears after one cycle, used to trigger Event Unit
+
+**Signal Flow:**
+1. CV32 writes task address to `SPATZ_TASKBIN`
+2. CV32 enables clock via `SPATZ_CLK_EN = 1`
+3. CV32 waits for `SPATZ_READY` to become 1 (Spatz initialization complete, in WFI loop)
+4. CV32 writes parameter pointer to `SPATZ_DATA` (optional, only if task needs parameters)
+5. CV32 writes 1 to `SPATZ_START` → triggers external interrupt to Spatz CC (Snitch core)
+6. Snitch core wakes from WFI, reads task address from `SPATZ_TASKBIN`, clears `SPATZ_START` to 0
+7. CV32 waits for `SPATZ_START` to become 0 (Snitch has saved task address)
+8. Snitch executes task, which reads parameters from address in `SPATZ_DATA`
+9. Task completes and writes exit code to `SPATZ_RETURN`, then writes 1 to `SPATZ_DONE`
+10. `SPATZ_DONE` generates 1-cycle pulse to Event Unit, CV32 wakes from WFE/polling
+11. CV32 reads exit code from `SPATZ_RETURN` (0 = success)
+
+## 🚀 Quick Start
+
+### Typical CV32 Test with Spatz Tasks
+```c
+#include "magia_spatz_utils.h"
+#include "event_unit_utils.h"
+#include "my_test_task_bin.h"  // REQUIRED - Auto-generated header with SPATZ_BINARY_START and task symbols
+
+int main(void) {
+    // Initialize Event Unit and Spatz
+    eu_init();
+    eu_enable_events(EU_SPATZ_DONE_MASK);
+
+    //Task with parameters via SPATZ_DATA register (optional)
+    typedef struct { uint32_t addr; uint32_t size; } params_t;
+    params_t params = {.addr = DATA_BASE, .size = 1024};
+
+    spatz_init(SPATZ_BINARY_START);  // Uses address from auto-generated header
+
+    spatz_pass_params((uint32_t)&params);  // Optional: pass params pointer via SPATZ_DATA
+    
+    //Simple task (no parameters via SPATZ_DATA)
+    spatz_run_task(MY_VECTOR_TASK);  // MY_VECTOR_TASK defined in auto-generated header
+    eu_wait_spatz_wfe(EU_SPATZ_DONE_MASK);
+    
+    if (spatz_get_exit_code() != 0) {
+        printf("Task failed: 0x%03lx\n", spatz_get_exit_code());
+        return 1;
+    }
+    
+    spatz_clk_dis();  // Power saving
+    return 0;
+}
+```
+
+**Header Naming Convention:**
+- Test file: `sw/tests/my_test.c`
+- Generated header: `spatz/sw/headers_bin/my_test_task_bin.h`
+- Include in your test: `#include "my_test_task_bin.h"`
+
+### Writing a Spatz Task
+
+Create `spatz/sw/tasks/my_vector_task.c`:
+
+```c
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+
+// Define parameter structure (optional, if task needs parameters)
+typedef struct {
+    uint32_t data_addr;
+    uint32_t size;
+} my_params_t;
+
+int my_vector_task(void) {
+    // Read parameter pointer from SPATZ_DATA register (if task uses parameters)
+    uint32_t params_ptr = mmio32(SPATZ_DATA);
+    volatile my_params_t *params = (volatile my_params_t *)params_ptr;
+    
+    uint32_t data_addr = params->data_addr;
+    uint32_t size = params->size;
+    
+    // Your vector code here using RVV intrinsics or assembly
+    asm volatile("vsetvli zero, %0, e32, m4, ta, ma" ::"r"(32));
+    // ... vector operations ...
+    
+    return 0;  // Exit code: 0 = success, non-zero = error
+}
+```
+
+**Naming Convention (Required):**
+- File: `*_task.c` (lowercase, ends with `_task`)
+- Function: `int *_task(void)` (matches filename)
+- Symbol: `*_TASK` (uppercase in generated header)
+
+**Reading Parameters:**
+- Parameters are passed via pointer in `SPATZ_DATA` register
+- Task reads `SPATZ_DATA` to get pointer to parameter structure in L1
+- CV32 must ensure parameters are written to L1 before triggering task
+
+---
+
+## 🔧 Configuration Parameters
+### Main Parameters (Makefile)
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `SPATZ_RVD` | 0 | 0: 32-bit TCDM (ELEN=32); 1: 64-bit TCDM (ELEN=64, double precision) |
+| `SPATZ_VLEN` | 256 | Vector register length in bits (128, 256, 512) |
+| `SPATZ_N_FPU` | 4 | Number of FPU lanes (1-8) |
+| `SPATZ_N_IPU` | 1 | Number of integer processing units (1-8) |
+| `SPATZ_RVF` | 1 | Enable single-precision floating-point |
+| `SPATZ_RVV` | 1 | Enable RISC-V Vector extension |
+| `SPATZ_XDIVSQRT` | 0 | Enable FP div/sqrt |
+
+### Advanced Parameters
+
+Defined in `hw/tile/magia_tile_pkg.sv`. See [Spatz documentation](https://github.com/pulp-platform/spatz) for details.
+
+**Memory & Transactions:**
+- `SPATZ_NUM_INT_OUTSTANDING_LOADS`, `SPATZ_NUM_INT_OUTSTANDING_MEM`
+- `SPATZ_NUM_SPATZ_OUTSTANDING_LOADS`
+
+**Pipeline:**
+- `SPATZ_REGISTER_OFFLOAD_RSP`, `SPATZ_REGISTER_CORE_REQ/RSP`
+- `SPATZ_FPU_PIPE_*` (FMA, DIVSQRT, CONV, DOTP stages)
+
+---
+
+## 🕸️ Interconnects
+
+1. **HCI Ports (High-Performance L1/TCDM Access):**
+   - Spatz CC connects to the **HCI interconnect** with multiple ports (5-10 ports depending on `N_FPU` and `RVD`)
+   - Each port corresponds to a TCDM request channel: N_FU ports from Spatz vector unit + 1 port from Snitch scalar core
+   - When `RVD=1`: HCI ports are doubled (2x32-bit ports per TCDM port) to support 64-bit data transfers
+   - **Destination:** L1 SPM and TCDM banks (local tile memory)
+
+2. **OBI Master Port (General Memory Access):**
+   - Spatz CC (Snitch core) connects to the **OBI crossbar as a master** through a req/rsp port
+   - Used for accesses **outside L1/TCDM**: peripherals, L2 memory, cross-tile communication
+   - Handles scalar memory operations from Snitch and accesses to non-TCDM address ranges
+   - Routed through OBI interconnect to reach all tile resources
+
+3. **OBI Slave Interface (Control Registers):**
+   - Spatz CC exposes an **OBI slave interface** (`obi_slave_ctrl_spatz`) for CV32 control
+   - Base address: `0x00001700`
+   - Contains 7 control registers: `SPATZ_CLK_EN`, `SPATZ_READY`, `SPATZ_START`, `SPATZ_TASKBIN`, `SPATZ_DATA`, `SPATZ_RETURN`, `SPATZ_DONE`
+   - CV32 accesses these registers via OBI to command Spatz CC operation
+---
+
+## 🖥️ Programming API
+
+### C Functions (`sw/utils/magia_spatz_utils.h`)
+
+```c
+// Clock control
+void spatz_clk_en(void);                                 // Enable Spatz clock
+void spatz_clk_dis(void);                                // Disable Spatz clock
+
+// Initialization and task control
+void spatz_init(uint32_t addr);                          // Initialize: writes addr to TASKBIN, enables clock, waits for READY
+void spatz_run_task(uint32_t addr);                      // Write task address to TASKBIN and trigger START=1
+void spatz_pass_params(uint32_t params_ptr);             // Write parameter pointer to DATA (waits for START=0)
+void spatz_run_task_with_params(uint32_t addr, uint32_t params_ptr);  // Combined: run task + pass params
+uint32_t spatz_get_exit_code(void);                      // Read exit code from RETURN register
+```
+---
+
+## 🏗️ Boot and Initialization
+The bootROM is **extremely minimal** (only 3 instructions) to minimize hardware impact and ROM area. It performs only:
+```assembly
+// Read dispatcher entry point from TASKBIN register (set by CV32)
+li      t0, TASKBIN_ADDR      // 0x0000170C
+lw      t1, 0(t0)
+jr      t1  // Jump to spatz_crt0.S _start
+```
+
+The CV32 must write the Spatz CC binary's `_start` address to `SPATZ_TASKBIN` **before enabling the Spatz CC clock** for the first time.
+
+#### 2. Runtime Initialization (`spatz/sw/kernel/spatz_crt0.S`)
+
+The **spatz_crt0.S** runtime performs complete initialization and enters the task dispatcher loop. It is designed to be **completely transparent** to the programmer.
+
+**Initialization Steps:**
+1. **Stack Setup:** `sp = 0x0001FFF8` (128KB - 8 bytes, stack at top of Snitch local memory)
+2. **Register Clearing:** All integer registers `x1-x31` cleared (except `sp`)
+3. **Vector Extension Enable:** Set `mstatus.VS` to *Initial* (`VS = 01`, i.e. bit mask `0x200` in `mstatus`) to enable the Spatz vector coprocessor
+4. **BSS Clearing:** Zero-initialize `.bss` section
+5. **Trap Handler Setup:** Install `_trap_handler` at `mtvec`
+6. **Interrupt Enable:** Enable external interrupts (`MIE.MEIE`) and global interrupts (`mstatus.MIE`)
+7. **Signal READY:** Write 1 to `SPATZ_READY` register to signal CV32 that initialization is complete
+
+**Dispatcher Loop:**
+```assembly
+    wfi                      // First WFI - wait for first interrupt
+    
+    // Deassert READY after first wakeup (only once)
+    li      t0, READY_ADDR
+    sw      zero, 0(t0)
+    
+dispatcher_loop:
+    wfi                      // Wait for subsequent interrupts from CV32
+    j       dispatcher_loop  // Loop back after task completion
+```
+
+**READY Synchronization:**
+- After completing initialization, Spatz writes `READY = 1`
+- CV32's `spatz_init()` polls `READY` until it becomes 1 before proceeding
+- After the first WFI wakeup, Spatz clears `READY = 0` (only once)
+- This prevents race conditions where CV32 might trigger an interrupt before Spatz enters WFI
+
+**Trap Handler (`_trap_handler`):**
+- **Interrupts** (mcause[31]=1): 
+  - **Reads and saves task address** from `TASKBIN` register into a register
+  - Clears `START` register to 0 to deassert interrupt signal (signals CV32 it's safe to write to `DATA`)
+  - Jumps to saved task address via `jalr ra, 0(t1)`
+  - After task returns, writes exit code 0 (success) to `RETURN` register
+  - Sets `DONE = 1` (generates 1-cycle pulse to Event Unit)
+  
+- **Exceptions** (mcause[31]=0):
+  - Writes exception code | 0x100 to `RETURN` register (bit 8 distinguishes exceptions from task errors)
+  - Sets `DONE = 1` (generates 1-cycle pulse to Event Unit)
+  - Halts in infinite WFI loop
+
+**Exit Code Convention:**
+- `0x000`: Task completed successfully
+- `0x1XX`: Exception occurred (XX = mcause[7:0])
+
+### Workflow
+
+**Programmer writes task** → **Makefile auto-detects** → **Builds Spatz binary** → **Converts to header** → **Embeds in CV32 firmware**
+
+**Task Naming Convention:**
+- File: `*_task.c` (e.g., `my_vector_task.c`)
+- Function: `int *_task(void)`
+- Symbol: `*_TASK` (uppercase in header)
+
+### Auto-Detection and Build
+
+Top-level Makefile automatically:
+1. Scans test code for `*_TASK` symbols
+2. Triggers Spatz binary build
+3. Generates header with task addresses
+
+```makefile
+# Automatic in MAGIA-SPATZ/Makefile
+SPATZ_TASKS := $(shell grep -oP '\b[A-Z][A-Z0-9_]*_TASK\b' $(TEST_SRCS))
+$(OBJ): spatz-header  # Header generated before CV32 compilation
+```
+
+### Generated Header Example
+
+The build system automatically generates a header file with the Spatz binary and task entry points:
+
+```c
+// Auto-generated: test_name_task_bin.h
+const uint32_t test_name_task_bin[] __attribute__((section(".spatz_binary"), aligned(4), used)) = {
+    0x00020137, ...  // Spatz binary data as little-endian 32-bit words
+};
+
+#define SPATZ_BINARY_START ((uint32_t)&_spatz_binary_start)
+#define MY_VECTOR_TASK     (SPATZ_BINARY_START + 0x00000120)  // Task offset
+#define ANOTHER_TASK       (SPATZ_BINARY_START + 0x00000240)  // Another task offset
+
+```
+
+**You MUST include this header in your CV32 test:**
+```c
+#include "my_test_task_bin.h"  // Required for SPATZ_BINARY_START and task symbols
+```
+
+
+## ⚡ Complete Build and Run
+
+### Example Test with Spatz
+
+**File: `sw/tests/my_test.c`**
+```c
+#include "magia_tile_utils.h"
+#include "magia_utils.h"
+#include "magia_spatz_utils.h"
+#include "event_unit_utils.h"
+#include "my_test_task_bin.h"
+
+int main(void) {
+    eu_init();
+    eu_enable_events(EU_SPATZ_DONE_MASK);
+    spatz_init(SPATZ_BINARY_START);
+    
+    spatz_run_task(MY_VECTOR_TASK);
+    eu_wait_spatz_wfe(EU_SPATZ_DONE_MASK);
+    
+    return spatz_get_exit_code();
+}
+```
+
+### Build Commands
+
+```bash
+# 1. Setup environment
+make python_venv
+source setup_env.sh
+make python_deps
+make bender
+make update-ips > update-ips.log
+make floonoc-patch
+
+# 2. Build hardware (QuestaSim)
+module load questasim/2024.3
+make build-hw mesh_dv=0 fast_sim=1 > build-hw.log
+
+# 3. Build software (auto-detects Spatz tasks)
+module load pulp-gcc7/1.0.16
+make clean all test='my_test' mesh_dv=0
+
+# 4. Run simulation
+make run test='my_test' mesh_dv=0 gui=0
+```
+
+**Note:** Build flow is **identical** whether your test uses Spatz or not — the system automatically detects and handles Spatz tasks!
+
+---
+
+## 🔏 License
+All `software` sources are licensed under the Apache License 2.0 ([`LICENSE.APACHE`](../LICENSE.APACHE)). All `hardware` sources are licensed under the Solderpad Hardware License 0.51 ([`LICENSE.SHL`](../LICENSE.SHL)).
diff --git a/spatz/bootrom/Makefile b/spatz/bootrom/Makefile
new file mode 100644
index 0000000..d2f5648
--- /dev/null
+++ b/spatz/bootrom/Makefile
@@ -0,0 +1,96 @@
+# Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+#
+# Licensed under the Solderpad Hardware License, Version 0.51 
+# (the "License"); you may not use this file except in compliance 
+# with the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# SPDX-License-Identifier: SHL-0.51
+#
+# Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+# Spatz Initialization ROM Makefile for MAGIA Tile
+
+# Using Spatz LLVM toolchain
+LLVM_PATH := /sw/scratch/luca.balboni10/SPATZ/install/llvm
+
+# Configuration from top-level Makefile
+SPATZ_RVD      ?= 0   # 0: ELEN=32, 1: ELEN=64
+SPATZ_VLEN     ?= 256 # Vector length in bits
+SPATZ_N_IPU    ?= 1   # Number of Integer Processing Units (1-8)
+SPATZ_N_FPU    ?= 4   # Number of Floating Point Units (1-8)
+SPATZ_XDIVSQRT ?= 0   # 0: FP div/sqrt disabled, 1: enabled
+SPATZ_XDMA     ?= 0   # 0: DMA disabled, 1: enabled
+SPATZ_RVF      ?= 1   # 0: single-precision FP disabled, 1: enabled
+SPATZ_RVV      ?= 1   # 0: vector extension disabled, 1: enabled
+
+ISA      ?= riscv
+ARCH     ?= rv
+XLEN     ?= 32
+
+# Build ISA string from the configuration flags. $(strip ...) is required: the '?=' defaults above keep the blanks that precede their inline '#' comments, so an unstripped "1   " never compares equal to "1".
+XTEN := ima
+ifeq ($(strip $(SPATZ_RVD)),1)
+    XTEN := $(XTEN)fd
+    XABI ?= 32d
+else ifeq ($(strip $(SPATZ_RVF)),1)
+    XTEN := $(XTEN)f
+    XABI ?= 32f
+else
+    XABI ?= 32
+endif
+ifeq ($(strip $(SPATZ_RVV)),1)
+    XTEN := $(XTEN)v
+endif
+XTEN := $(XTEN)_zfh
+ABI  ?= ilp
+
+CC      = $(LLVM_PATH)/bin/clang
+OBJCOPY = $(LLVM_PATH)/bin/llvm-objcopy
+OBJDUMP = $(LLVM_PATH)/bin/llvm-objdump
+
+# Compiler flags (RVD=$(SPATZ_RVD), VLEN=$(SPATZ_VLEN), N_IPU=$(SPATZ_N_IPU), N_FPU=$(SPATZ_N_FPU), XDivSqrt=$(SPATZ_XDIVSQRT), Xdma=$(SPATZ_XDMA), RVF=$(SPATZ_RVF), RVV=$(SPATZ_RVV))
+CFLAGS  = --target=$(ISA)$(XLEN)-unknown-elf -march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XABI)
+CFLAGS += -DSPATZ_RVD=$(SPATZ_RVD) -DSPATZ_VLEN=$(SPATZ_VLEN) -DSPATZ_N_IPU=$(SPATZ_N_IPU) -DSPATZ_N_FPU=$(SPATZ_N_FPU) -DSPATZ_XDIVSQRT=$(SPATZ_XDIVSQRT) -DSPATZ_XDMA=$(SPATZ_XDMA) -DSPATZ_RVF=$(SPATZ_RVF) -DSPATZ_RVV=$(SPATZ_RVV)
+CFLAGS += -static -nostartfiles -nostdlib -O2 -g -Wall -Wextra
+
+# Linker flags
+LDFLAGS = -T spatz_init.ld -fuse-ld=lld
+
+# Files
+SRC     = spatz_init.S
+ELF     = spatz_init.elf
+BIN     = spatz_init.bin
+DUMP    = spatz_init.dump
+
+.PHONY: all clean bin sv
+
+# Build binary and dump files
+bin: $(BIN) $(DUMP)
+
+# Generate hardware bootrom SystemVerilog module from binary
+sv: $(BIN)
+	@echo "Generating hardware bootrom: ../../hw/tile/spatz_bootrom.sv"
+	python3 ../../scripts/generate_spatz_bootrom.py $(BIN) -o ../../hw/tile/spatz_bootrom.sv
+
+# Complete build: clean, compile binary, generate hardware module
+all: clean bin sv
+
+$(ELF): $(SRC) spatz_init.ld
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(SRC)
+
+$(BIN): $(ELF)
+	$(OBJCOPY) -O binary $< $@
+
+$(DUMP): $(ELF)
+	$(OBJDUMP) -D $< > $@
+
+clean:
+	rm -f $(ELF) $(BIN) $(DUMP)
+
+.PRECIOUS: $(ELF) $(BIN)
diff --git a/spatz/bootrom/spatz_init.S b/spatz/bootrom/spatz_init.S
new file mode 100644
index 0000000..3559298
--- /dev/null
+++ b/spatz/bootrom/spatz_init.S
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2018-2019 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * 
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *  
+ * All initialization (stack, regs, vector, BSS) done in spatz_crt0.S  
+ * Minimal Boot ROM - only read TASKBIN_ADDR and jump
+ */
+
+#define TASKBIN_ADDR 0x0000170C  // OBI slave ctrl exchange register
+
+  .section .text.bootrom
+  .global _start
+  .option norvc
+
+_start:
+  // Load the Spatz entry address that CV32 placed in TASKBIN_ADDR
+  li      t0, TASKBIN_ADDR
+  lw      t1, 0(t0)
+  // No initialization is done here and the loaded address is not checked
+  // Jump to the loaded address (expected to be _start of the Spatz binary, which does all init)
+  jr      t1
diff --git a/spatz/bootrom/spatz_init.ld b/spatz/bootrom/spatz_init.ld
new file mode 100644
index 0000000..851b5c0
--- /dev/null
+++ b/spatz/bootrom/spatz_init.ld
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51 
+ * (the "License"); you may not use this file except in compliance 
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Spatz Boot ROM Linker Script for MAGIA Tile
+ * 
+ * Maps bootrom to 0x10000000 (outside L2 range to avoid AXI address decode overlap).
+ * The bootrom is synthesized as a hardware module (spatz_bootrom.sv) mapped via AXI slave.
+ * 
+ * Boot sequence (as implemented in spatz_init.S):
+ *   1. Spatz resets with PC = 0x10000000 (SPATZ_BOOT_ADDR in magia_tile.sv)
+ *   2. Bootrom loads the entry address from TASKBIN_ADDR (0x170C), the OBI
+ *      slave ctrl exchange register set by CV32
+ *   3. Bootrom jumps to that address; all further initialization (stack,
+ *      registers, vector unit, BSS, interrupts) is done by spatz_crt0.S
+ *      in the Spatz binary
+ */
+
+SEARCH_DIR(.)
+OUTPUT_ARCH(riscv)
+ENTRY(_start)
+
+MEMORY
+{
+    bootrom : ORIGIN = 0x10000000, LENGTH = 0x100  /* 256B at 0x10000000 (outside L2 range to avoid AXI overlap) */
+}
+
+SECTIONS
+{
+    .text.bootrom : {
+        KEEP(*(.text.bootrom))
+        *(.text)
+        *(.text.*)
+    } > bootrom
+
+    .rodata : {
+        *(.rodata)
+        *(.rodata.*)
+    } > bootrom
+
+    /* Discard all other sections */
+    /DISCARD/ : {
+        *(.data)
+        *(.data.*)
+        *(.bss)
+        *(.bss.*)
+        *(.comment)
+        *(.note.*)
+    }
+}
diff --git a/spatz/sw/Makefile b/spatz/sw/Makefile
new file mode 100644
index 0000000..d22595f
--- /dev/null
+++ b/spatz/sw/Makefile
@@ -0,0 +1,193 @@
+# Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+#
+# Licensed under the Solderpad Hardware License, Version 0.51 
+# (the "License"); you may not use this file except in compliance 
+# with the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# SPDX-License-Identifier: SHL-0.51
+#
+# Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+# Spatz Programs Makefile for MAGIA Tile
+
+LLVM_PATH := /sw/scratch/luca.balboni10/SPATZ/install/llvm
+
+# Configuration from top-level Makefile
+SPATZ_RVD      ?= 0   # 0: ELEN=32, 1: ELEN=64
+SPATZ_VLEN     ?= 256 # Vector length in bits
+SPATZ_N_IPU    ?= 1   # Number of Integer Processing Units (1-8)
+SPATZ_N_FPU    ?= 4   # Number of Floating Point Units (1-8)
+SPATZ_XDIVSQRT ?= 0   # 0: FP div/sqrt disabled, 1: enabled
+SPATZ_XDMA     ?= 0   # 0: DMA disabled, 1: enabled
+SPATZ_RVF      ?= 1   # 0: single-precision FP disabled, 1: enabled
+SPATZ_RVV      ?= 1   # 0: vector extension disabled, 1: enabled
+
+# Test name for generating specific header (passed from top Makefile)
+TEST_NAME      ?= base_spatz_test
+
+ISA      ?= riscv
+ARCH     ?= rv
+XLEN     ?= 32
+
+# Build ISA string from the configuration flags. $(strip ...) is required: the '?=' defaults above keep the blanks that precede their inline '#' comments, so an unstripped "1   " never compares equal to "1".
+XTEN := ima
+ifeq ($(strip $(SPATZ_RVD)),1)
+    XTEN := $(XTEN)fd
+    XABI ?= 32d
+else ifeq ($(strip $(SPATZ_RVF)),1)
+    XTEN := $(XTEN)f
+    XABI ?= 32f
+else
+    XABI ?= 32
+endif
+ifeq ($(strip $(SPATZ_RVV)),1)
+    XTEN := $(XTEN)v
+endif
+XTEN := $(XTEN)_zfh
+ABI  ?= ilp
+
+# Toolchain paths
+CC      = $(LLVM_PATH)/bin/clang
+OBJCOPY = $(LLVM_PATH)/bin/llvm-objcopy
+OBJDUMP = $(LLVM_PATH)/bin/llvm-objdump
+
+# Architecture flags  
+ARCH_FLAGS   = --target=riscv32 -march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XABI)
+ARCH_FLAGS  += -menable-experimental-extensions
+
+# Compiler flags (RVD=$(SPATZ_RVD), VLEN=$(SPATZ_VLEN), N_IPU=$(SPATZ_N_IPU), N_FPU=$(SPATZ_N_FPU), XDivSqrt=$(SPATZ_XDIVSQRT), Xdma=$(SPATZ_XDMA), RVF=$(SPATZ_RVF), RVV=$(SPATZ_RVV))
+CFLAGS       = $(ARCH_FLAGS) -static -nostartfiles -nostdlib
+CFLAGS      += -O3 -g -Wall -Wextra
+CFLAGS      += -fno-common -ffunction-sections -fdata-sections -fPIC
+CFLAGS      += -fno-builtin-memset -fno-builtin-memcpy
+CFLAGS      += -mcmodel=medany
+CFLAGS      += -DSPATZ_RVD=$(SPATZ_RVD) -DSPATZ_VLEN=$(SPATZ_VLEN) -DSPATZ_N_IPU=$(SPATZ_N_IPU) -DSPATZ_N_FPU=$(SPATZ_N_FPU) -DSPATZ_XDIVSQRT=$(SPATZ_XDIVSQRT) -DSPATZ_XDMA=$(SPATZ_XDMA) -DSPATZ_RVF=$(SPATZ_RVF) -DSPATZ_RVV=$(SPATZ_RVV)
+CFLAGS      += -I../../sw/utils -I../../hw/include -Ispatz_utils
+
+# Linker flags
+LDFLAGS      = -T $(KERNEL_DIR)/spatz_program.ld -fuse-ld=lld
+LDFLAGS     += -Wl,-z,norelro
+LDFLAGS     += -Wl,--allow-multiple-definition
+LDFLAGS     += $(ARCH_FLAGS)
+
+# Single binary with selected tests (passed via TESTS variable)
+# Binary and header names based on TEST_NAME
+BINARY_NAME  = $(TEST_NAME)_task_bin
+HEADER_NAME  = $(TEST_NAME)_task_bin
+
+# Directories
+BIN_DIR      = bin
+HEADER_DIR   = headers_bin
+TESTS_DIR    = tasks
+KERNEL_DIR   = kernel
+
+# Tasks to include (lowercase 'task' variable)
+# Usage: make all task="general_task" or task="general_task hello_world_task"
+# MUST be specified explicitly - no default
+task         ?=
+
+# Source files for selected tasks
+TASK_SRCS    = $(addprefix $(TESTS_DIR)/,$(addsuffix .c,$(task)))
+
+# Source and output files
+CRT0         = $(KERNEL_DIR)/spatz_crt0.S
+CRT0_OBJ     = $(BIN_DIR)/spatz_crt0.o
+ELF          = $(BIN_DIR)/$(BINARY_NAME).elf
+BIN          = $(BIN_DIR)/$(BINARY_NAME).bin
+DUMP         = $(BIN_DIR)/$(BINARY_NAME).dump
+HEADER       = $(HEADER_DIR)/$(HEADER_NAME).h
+
+# Python script for bin2header conversion
+BIN2HEADER   = ../../scripts/bin2header.py
+
+.PHONY: all clean clean-test bin header dirs
+
+# Complete build: compile binary and generate header
+# Force header regeneration when TASKS changes (always delete and rebuild)
+all: bin
+	@rm -f $(HEADER)
+	@$(MAKE) header
+
+# Build binary (ELF + BIN + DUMP)
+bin: dirs $(BIN) $(DUMP)
+
+# Generate header from binary (always regenerated by 'all' target)
+header: dirs $(HEADER)
+
+# Create directories
+dirs:
+	@mkdir -p $(BIN_DIR) $(HEADER_DIR)
+
+# Compile crt0.S
+$(CRT0_OBJ): $(CRT0)
+	@echo "[SPATZ] Compiling crt0..."
+	$(CC) $(ARCH_FLAGS) -c -o $@ $(CRT0)
+
+# Build ELF executable with crt0 + selected tasks
+# Tasks are launched by CV32 via interrupt-driven task switching
+$(ELF): $(CRT0_OBJ) $(TASK_SRCS) $(KERNEL_DIR)/spatz_program.ld
+	@mkdir -p $(BIN_DIR)
+	@if [ -z "$(TASK_SRCS)" ]; then \
+		echo "[SPATZ] ERROR: No task sources found for task=$(task)"; \
+		exit 1; \
+	fi
+	@echo "[SPATZ] Included tasks: $(task)"
+	@echo "[SPATZ] Tasks will be launched by CV32 via spatz_set_func() and interrupt"
+	@FIRST_TASK=$(word 1,$(task)); \
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(CRT0_OBJ) $(TASK_SRCS) \
+		-Wl,--defsym,__first_task=$${FIRST_TASK}
+
+# Generate binary from ELF
+$(BIN): $(ELF)
+	@echo "[SPATZ] Generating binary..."
+	$(OBJCOPY) -O binary $< $@
+
+# Generate disassembly
+$(DUMP): $(ELF)
+	@echo "[SPATZ] Generating disassembly..."
+	$(OBJDUMP) -D -S $< > $@
+
+# Generate header with binary array + extract function symbols
+$(HEADER): $(BIN) $(ELF)
+	@mkdir -p $(HEADER_DIR)
+	@echo "[SPATZ] Generating header with binary array..."
+	@python3 $(BIN2HEADER) $(BIN) $(HEADER) \
+		--name $(HEADER_NAME) \
+		--section .spatz_binary \
+		--address "dynamic (_spatz_binary_start)"
+	@echo "[SPATZ] Adding binary start + dispatcher + task symbols..."
+	@GUARD_NAME=$$(echo "$(HEADER_NAME)" | tr 'a-z' 'A-Z'); \
+	sed -i "/#endif \/\* __$${GUARD_NAME}_H__ \*\//d" $(HEADER)
+	@echo "" >> $(HEADER)
+	@echo "/* Binary start address - defined by CV32 linker */" >> $(HEADER)
+	@echo "extern uint32_t _spatz_binary_start; /* Linker symbol from CV32 link.ld */" >> $(HEADER)
+	@echo "#define SPATZ_BINARY_START ((uint32_t)&_spatz_binary_start)" >> $(HEADER)
+	@echo "" >> $(HEADER)
+	@echo "/* Dispatcher loop address - for spatz_run() */" >> $(HEADER)
+	@DISP_ADDR=$$($(OBJDUMP) -t $(ELF) | grep "dispatcher_loop$$" | awk '{print $$1}'); \
+	echo "#define SPATZ_DISPATCHER_LOOP (SPATZ_BINARY_START + 0x$${DISP_ADDR})" >> $(HEADER)
+	@echo "" >> $(HEADER)
+	@echo "/* Task function entry points - OFFSETS from SPATZ_BINARY_START */" >> $(HEADER)
+	@$(OBJDUMP) -t $(ELF) | grep "_task$$" | grep -v "__first_task$$" | awk '{print $$1, $$NF}' | while read addr name; do \
+		TASK_NAME=$$(echo $$name | tr 'a-z' 'A-Z'); \
+		echo "#define $${TASK_NAME} (SPATZ_BINARY_START + 0x$$addr)" >> $(HEADER); \
+	done
+	@echo "" >> $(HEADER)
+	@GUARD_NAME=$$(echo "$(HEADER_NAME)" | tr 'a-z' 'A-Z'); \
+	echo "#endif /* __$${GUARD_NAME}_H__ */" >> $(HEADER)
+
+clean:
+	rm -rf $(BIN_DIR)/*.elf $(BIN_DIR)/*.bin $(BIN_DIR)/*.dump $(BIN_DIR)/*.o
+	rm -rf $(HEADER_DIR)/*.h
+
+# Clean only current test artifacts (preserves other headers)
+clean-test:
+	rm -f $(ELF) $(BIN) $(DUMP) $(CRT0_OBJ) $(HEADER)
+
+.PRECIOUS: $(ELF) $(BIN)
diff --git a/spatz/sw/kernel/spatz_crt0.S b/spatz/sw/kernel/spatz_crt0.S
new file mode 100644
index 0000000..03382f8
--- /dev/null
+++ b/spatz/sw/kernel/spatz_crt0.S
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2018-2019 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * Spatz Runtime
+ * Entry: the boot ROM (spatz_init.S) jumps here
+ *
+ */
+
+#define READY_ADDR        0x00001704   // written 1 once interrupts are enabled, cleared after the first wake-up
+#define START_ADDR        0x00001708   // cleared by the trap handler right before a task is called
+#define TASKBIN_ADDR      0x0000170C   // read by the trap handler: address of the task to call
+#define DATA_ADDR         0x00001710   // not referenced in this file
+#define RETURN_ADDR       0x00001714   // exit code: 0 after a task returns, 0x100 + mcause[7:0] on exception
+#define DONE_ADDR         0x00001718   // written 1 after a task returns or on exception
+#define MSTATUS_VS        0x200        // mstatus bit 9: Vector Status VS[10:9] = 01 (Initial)
+#define MSTATUS_MIE       0x8          // Machine Interrupt Enable
+#define MIE_MEIE          0x800        // External interrupt enable
+
+.section .text.start, "ax"
+.global _start
+
+_start:
+    /* Init stack pointer (hard-coded; same value as _stack_start in spatz_program.ld) */
+    li      sp, 0x0001FFF8
+
+    /* Clear all integer registers except sp (x2), which was just set */
+    mv x1,  x0
+    // x2 = sp just initialized
+    mv x3,  x0
+    mv x4,  x0
+    mv x5,  x0
+    mv x6,  x0
+    mv x7,  x0
+    mv x8,  x0
+    mv x9,  x0
+    mv x10, x0
+    mv x11, x0
+    mv x12, x0
+    mv x13, x0
+    mv x14, x0
+    mv x15, x0
+    mv x16, x0
+    mv x17, x0
+    mv x18, x0
+    mv x19, x0
+    mv x20, x0
+    mv x21, x0
+    mv x22, x0
+    mv x23, x0
+    mv x24, x0
+    mv x25, x0
+    mv x26, x0
+    mv x27, x0
+    mv x28, x0
+    mv x29, x0
+    mv x30, x0
+    mv x31, x0
+
+    /* Enable vector extension (VS=Initial for Spatz) */
+    li      t0, MSTATUS_VS
+    csrs    mstatus, t0
+
+    /* Clear BSS section: store zero to every word in [__bss_start, __bss_end) */
+    la      a0, __bss_start
+    la      a1, __bss_end
+1:  bgeu    a0, a1, 2f
+    sw      zero, 0(a0)
+    addi    a0, a0, 4
+    j       1b
+2:
+    /* Install _trap_handler in mtvec (it serves both interrupts and exceptions) */
+    la      t0, _trap_handler
+    csrw    mtvec, t0
+    
+    /* Enable external interrupts (MEIE) */
+    li      t0, MIE_MEIE
+    csrs    mie, t0
+    
+    /* Enable global interrupts (MIE bit in mstatus) */
+    li      t0, MSTATUS_MIE
+    csrs    mstatus, t0
+    
+    /* Signal ready to CV32 - now ready to receive interrupts */
+    li      t0, READY_ADDR
+    li      t1, 1
+    sw      t1, 0(t0)
+
+    /* First WFI - the first interrupt is served by _trap_handler; READY is deasserted afterwards */
+    wfi
+    
+    /* Deassert READY after first wakeup (only once: dispatcher_loop never comes back here) */
+    li      t0, READY_ADDR
+    sw      zero, 0(t0)
+    
+    /* Fall through to main dispatcher loop */
+
+dispatcher_loop:
+    /* Idle until the next interrupt from CV32 */
+    wfi
+    
+    /* The task itself is called from _trap_handler, so by the time execution */
+    /* resumes here it has already run; just go back to waiting */
+    j       dispatcher_loop
+
+/* Trap handler - distinguishes interrupts from exceptions. Uses t0, t1 and ra without saving them. */
+_trap_handler:
+    csrr    t0, mcause
+    bltz    t0, _handle_interrupt   // mcause[31]=1 means interrupt (value is negative)
+    
+    /* EXCEPTION: Write error code to RETURN_ADDR, signal DONE, halt */
+_handle_exception:
+    csrr    t1, mcause      // Read full mcause
+    andi    t1, t1, 0xFF    // Extract exception code [7:0]
+    ori     t1, t1, 0x100   // Set bit 8 to distinguish from task errors
+    li      t0, RETURN_ADDR
+    sw      t1, 0(t0)
+    
+    li      t0, DONE_ADDR
+    li      t1, 1
+    sw      t1, 0(t0)
+    
+    /* Halt: no mret on this path, loop on wfi forever */
+1:  wfi
+    j       1b
+
+_handle_interrupt:
+    /* Check if external interrupt: cause code 11 (machine external); anything else just returns */
+    andi    t0, t0, 0x7FF
+    li      t1, 11
+    bne     t0, t1, _trap_ret
+    
+    /* Read task address from TASKBIN_ADDR and save it in t1; a zero address is ignored */
+    li      t0, TASKBIN_ADDR
+    lw      t1, 0(t0)
+    beqz    t1, _trap_ret
+    
+    /* Clear START */
+    li      t0, START_ADDR
+    sw      zero, 0(t0)
+    
+    /* Call the task from inside the handler; ra is overwritten and mepc is not saved */
+    jalr    ra, 0(t1)
+    
+    /* Task completed - always write exit code 0; the value returned in a0 is not forwarded */
+    li      t0, RETURN_ADDR
+    sw      zero, 0(t0)
+    
+    /* Signal done */
+    li      t0, DONE_ADDR
+    li      t1, 1
+    sw      t1, 0(t0)
+
+_trap_ret:
+    mret
diff --git a/spatz/sw/kernel/spatz_program.ld b/spatz/sw/kernel/spatz_program.ld
new file mode 100644
index 0000000..b946b61
--- /dev/null
+++ b/spatz/sw/kernel/spatz_program.ld
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2018-2019 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Memory Layout (when embedded in CV32 instrram):
+ *   0xCC000000: CV32 .text
+ *   0xCC000000 + CV32_size: .spatz_binary (Spatz code inline)
+ * 
+ * Stack: Spatz uses remaining 16KB after CV32 (0x0001C000-0x00020000)
+ *   - CV32 uses: 0x00010000-0x0001C000 (48KB)
+ *   - Spatz uses: 0x0001C000-0x00020000 (16KB)
+ */
+
+OUTPUT_ARCH(riscv)
+ENTRY(_start)
+
+MEMORY
+{
+    prog (rwxa)  : ORIGIN = 0x00000000, LENGTH = 0x10000  /* 64KB max - relative addressing */
+}
+
+/* Stack configuration - Spatz uses remaining 16KB after CV32 */
+/* CV32 uses: 0x00010000-0x0001C000 (48KB, from CV32 link.ld) */
+/* Spatz uses: 0x0001C000-0x00020000 (16KB) */
+_stack_start = 0x0001FFF8;  /* Top of Spatz stack (grows downward) */
+
+SECTIONS
+{
+    .text : {
+        . = ALIGN(4);
+        KEEP(*(.text.start))
+        *(.text)
+        *(.text.*)
+        . = ALIGN(4);
+    } > prog
+
+    .rodata : {
+        . = ALIGN(4);
+        *(.rodata)
+        *(.rodata.*)
+        . = ALIGN(4);
+    } > prog
+
+    .data : {
+        . = ALIGN(4);
+        *(.data)
+        *(.data.*)
+        *(.sdata)
+        *(.sdata.*)
+        . = ALIGN(4);
+    } > prog
+
+    .bss : {
+        . = ALIGN(4);
+        __bss_start = .;
+        *(.bss)
+        *(.bss.*)
+        *(.sbss)
+        *(.sbss.*)
+        *(COMMON)
+        . = ALIGN(4);
+        __bss_end = .;
+    } > prog
+
+    /* Stack symbol (alias of _stack_start) */
+    /* NOTE: spatz_crt0.S does not reference it; it loads sp with: li sp, 0x0001FFF8 */
+    stack = _stack_start;
+
+    /* Discard debug sections */
+    /DISCARD/ : {
+        *(.comment)
+        *(.note.*)
+    }
+}
diff --git a/spatz/sw/spatz_utils/spatz_workers.h b/spatz/sw/spatz_utils/spatz_workers.h
new file mode 100644
index 0000000..9da8680
--- /dev/null
+++ b/spatz/sw/spatz_utils/spatz_workers.h
@@ -0,0 +1,1760 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Spatz Vector Utilities - Based on Spatz Benchmarks
+ * Provides optimized vector kernels for various operations
+ */
+
+#ifndef _SPATZ_UTILS_H_
+#define _SPATZ_UTILS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+// FP16 type alias
+typedef uint16_t fp16_t;
+typedef _Float16 float16_t;
+
+// =============================================================================
+// Dot Product Kernels (from spatzBenchmarks/dp-fdotp)
+// =============================================================================
+
+// 64-bit dot product: returns sum(a[i] * b[i]) for i in [0, avl), or 0.0 when avl == 0.
+static inline double fdotp_v64b(const double *a, const double *b, unsigned int avl) {
+  const unsigned int orig_avl = avl;
+  unsigned int vl;
+  double red;
+
+  // Empty input: with vl == 0 none of the vector instructions below writes v0[0],
+  // so the value read back at the end would be stale. Return the empty sum instead.
+  if (avl == 0) {
+    return 0.0;
+  }
+
+  // Clean the accumulator
+  asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(avl));
+  asm volatile("vmv.s.x v0, zero");
+  // Stripmine and accumulate a partial reduced vector in v24
+  do {
+    asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(avl));
+    // "memory" clobber: the loads read a[] and b[] without the compiler knowing.
+    asm volatile("vle64.v v8,  (%0)" ::"r"(a) : "memory");
+    asm volatile("vle64.v v16, (%0)" ::"r"(b) : "memory");
+    // The first chunk initializes v24, later chunks accumulate into it.
+    if (avl == orig_avl) {
+      asm volatile("vfmul.vv v24, v8, v16");
+    } else {
+      asm volatile("vfmacc.vv v24, v8, v16");
+    }
+    a += vl;
+    b += vl;
+    avl -= vl;
+  } while (avl > 0);
+
+  // Reduce and return. NOTE(review): when avl > VLMAX and the last chunk is shorter, this reads
+  // v24 tail elements that "ta" allows vfmacc to overwrite; confirm Spatz leaves them undisturbed.
+  asm volatile("vsetvli zero, %0, e64, m8, ta, ma" ::"r"(orig_avl));
+  asm volatile("vfredusum.vs v0, v24, v0");
+  asm volatile("vfmv.f.s %0, v0" : "=f"(red));
+  return red;
+}
+
+// 32-bit dot product: returns sum(a[i] * b[i]) for i in [0, avl), or 0.0f when avl == 0.
+static inline float fdotp_v32b(const float *a, const float *b, unsigned int avl) {
+  const unsigned int orig_avl = avl;
+  unsigned int vl;
+  float red;
+
+  // Empty input: with vl == 0 nothing below writes v0[0]; return the empty sum, not a stale value.
+  if (avl == 0) {
+    return 0.0f;
+  }
+
+  asm volatile("vsetvli %0, %1, e32, m8, ta, ma" : "=r"(vl) : "r"(avl));
+  asm volatile("vmv.s.x v0, zero");
+  // Stripmine and accumulate a partial reduced vector in v24
+  do {
+    asm volatile("vsetvli %0, %1, e32, m8, ta, ma" : "=r"(vl) : "r"(avl));
+    // "memory" clobber: the loads read a[] and b[] without the compiler knowing.
+    asm volatile("vle32.v v8,  (%0)" ::"r"(a) : "memory");
+    asm volatile("vle32.v v16, (%0)" ::"r"(b) : "memory");
+    // The first chunk initializes v24, later chunks accumulate into it.
+    if (avl == orig_avl) {
+      asm volatile("vfmul.vv v24, v8, v16");
+    } else {
+      asm volatile("vfmacc.vv v24, v8, v16");
+    }
+    a += vl;
+    b += vl;
+    avl -= vl;
+  } while (avl > 0);
+
+  // Reduce and return. NOTE(review): when avl > VLMAX and the last chunk is shorter, this reads
+  // v24 tail elements that "ta" allows vfmacc to overwrite; confirm Spatz leaves them undisturbed.
+  asm volatile("vsetvli zero, %0, e32, m8, ta, ma" ::"r"(orig_avl));
+  asm volatile("vfredusum.vs v0, v24, v0");
+  asm volatile("vfmv.f.s %0, v0" : "=f"(red));
+  return red;
+}
+
+// 16-bit dot product: returns sum(a[i] * b[i]) for i in [0, avl), or 0 when avl == 0.
+static inline float16_t fdotp_v16b(const float16_t *a, const float16_t *b, unsigned int avl) {
+  const unsigned int orig_avl = avl;
+  unsigned int vl;
+  float16_t red;
+
+  // Empty input: with vl == 0 nothing below writes v0[0]; return the empty sum, not a stale value.
+  if (avl == 0) {
+    return (float16_t)0.0f;
+  }
+
+  asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+  asm volatile("vmv.s.x v0, zero");
+  // Stripmine and accumulate a partial reduced vector in v24
+  do {
+    asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+    // "memory" clobber: the loads read a[] and b[] without the compiler knowing.
+    asm volatile("vle16.v v8,  (%0)" ::"r"(a) : "memory");
+    asm volatile("vle16.v v16, (%0)" ::"r"(b) : "memory");
+    // The first chunk initializes v24, later chunks accumulate into it.
+    if (avl == orig_avl) {
+      asm volatile("vfmul.vv v24, v8, v16");
+    } else {
+      asm volatile("vfmacc.vv v24, v8, v16");
+    }
+    a += vl;
+    b += vl;
+    avl -= vl;
+  } while (avl > 0);
+
+  // Reduce and return. NOTE(review): when avl > VLMAX and the last chunk is shorter, this reads
+  // v24 tail elements that "ta" allows vfmacc to overwrite; confirm Spatz leaves them undisturbed.
+  asm volatile("vsetvli zero, %0, e16, m8, ta, ma" ::"r"(orig_avl));
+  asm volatile("vfredusum.vs v0, v24, v0");
+  asm volatile("vfmv.f.s %0, v0" : "=f"(red));
+  return red;
+}
+
+// =============================================================================
+// AXPY Kernels (from spatzBenchmarks/dp-faxpy): y = a * x + y
+// =============================================================================
+
+// 64-bit AXPY: y[i] = a * x[i] + y[i] for i in [0, avl). y is written despite its const qualifier.
+static inline void faxpy_v64b(const double a, const double *x, const double *y, unsigned int avl) {
+  unsigned int vl;
+
+  // Stripmine over the vectors, one vl-sized chunk per iteration. The "memory" clobbers tell
+  // the compiler that the asm reads x[] and y[] and writes y[], which it cannot see otherwise.
+  do {
+    asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(avl));
+
+    // Load vectors
+    asm volatile("vle64.v v0, (%0)" ::"r"(x) : "memory");
+    asm volatile("vle64.v v8, (%0)" ::"r"(y) : "memory");
+
+    // Multiply-accumulate
+    asm volatile("vfmacc.vf v8, %0, v0" ::"f"(a));
+
+    // Store results
+    asm volatile("vse64.v v8, (%0)" ::"r"(y) : "memory");
+
+    // Bump pointers
+    x += vl;
+    y += vl;
+    avl -= vl;
+  } while (avl > 0);
+}
+
+// 32-bit AXPY: y[i] = a * x[i] + y[i] for i in [0, avl). y is written despite its const qualifier.
+static inline void faxpy_v32b(const float a, const float *x, const float *y, unsigned int avl) {
+  unsigned int vl;
+
+  // Stripmine over the vectors, one vl-sized chunk per iteration. The "memory" clobbers tell
+  // the compiler that the asm reads x[] and y[] and writes y[], which it cannot see otherwise.
+  do {
+    asm volatile("vsetvli %0, %1, e32, m8, ta, ma" : "=r"(vl) : "r"(avl));
+
+    // Load vectors
+    asm volatile("vle32.v v0, (%0)" ::"r"(x) : "memory");
+    asm volatile("vle32.v v8, (%0)" ::"r"(y) : "memory");
+
+    // Multiply-accumulate
+    asm volatile("vfmacc.vf v8, %0, v0" ::"f"(a));
+
+    // Store results
+    asm volatile("vse32.v v8, (%0)" ::"r"(y) : "memory");
+
+    // Bump pointers
+    x += vl;
+    y += vl;
+    avl -= vl;
+  } while (avl > 0);
+}
+
+// 16-bit AXPY: y[i] = a * x[i] + y[i] for i in [0, avl). y is written despite its const qualifier.
+static inline void faxpy_v16b(const float16_t a, const float16_t *x, const float16_t *y, unsigned int avl) {
+  unsigned int vl;
+
+  // Stripmine over the vectors, one vl-sized chunk per iteration. The "memory" clobbers tell
+  // the compiler that the asm reads x[] and y[] and writes y[], which it cannot see otherwise.
+  do {
+    asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+
+    // Load vectors
+    asm volatile("vle16.v v0, (%0)" ::"r"(x) : "memory");
+    asm volatile("vle16.v v8, (%0)" ::"r"(y) : "memory");
+
+    // Multiply-accumulate
+    asm volatile("vfmacc.vf v8, %0, v0" ::"f"(a));
+
+    // Store results
+    asm volatile("vse16.v v8, (%0)" ::"r"(y) : "memory");
+
+    // Bump pointers
+    x += vl;
+    y += vl;
+    avl -= vl;
+  } while (avl > 0);
+}
+
+// =============================================================================
+// Matrix Multiplication Kernels (from spatzBenchmarks/dp-fmatmul)
+// =============================================================================
+
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+// Forward declarations
+static inline void fmatmul_v64b_2xVL(double *c, const double *a, const double *b,
+                                      const unsigned int m_start, const unsigned int m_end,
+                                      const unsigned int N, const unsigned int P,
+                                      const unsigned int p_start, const unsigned int p_end);
+static inline void fmatmul_v64b_4xVL(double *c, const double *a, const double *b,
+                                      const unsigned int m_start, const unsigned int m_end,
+                                      const unsigned int N, const unsigned int P,
+                                      const unsigned int p_start, const unsigned int p_end);
+static inline void fmatmul_v64b_8xVL(double *c, const double *a, const double *b,
+                                      const unsigned int m_start, const unsigned int m_end,
+                                      const unsigned int N, const unsigned int P,
+                                      const unsigned int p_start, const unsigned int p_end);
+
+// Entry point for C = A x B (C: MxP, A: MxN, B: NxP, row-major doubles) over the full matrices.
+// Dispatches on M: 8xVL kernel for M > 8, 4xVL kernel for M in 5..8, 2xVL kernel for M <= 4.
+static inline void fmatmul_v64b(double *c, const double *a, const double *b,
+                                const unsigned int M, const unsigned int N,
+                                const unsigned int P) {
+  if (M > 8) {
+    fmatmul_v64b_8xVL(c, a, b, 0, M, N, P, 0, P);
+  } else if (M > 4) {
+    fmatmul_v64b_4xVL(c, a, b, 0, M, N, P, 0, P);
+  } else {
+    fmatmul_v64b_2xVL(c, a, b, 0, M, N, P, 0, P);
+  }
+}
+
+// 2xVL variant - optimized for small matrices (M <= 4). Computes rows [m_start, m_end) and columns [p_start, p_end) of C = A x B (row-major), two rows of C per pass.
+static inline void fmatmul_v64b_2xVL(double *c, const double *a, const double *b,
+                                      const unsigned int m_start, const unsigned int m_end,
+                                      const unsigned int N, const unsigned int P,
+                                      const unsigned int p_start, const unsigned int p_end) {
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Calculate the vl: number of columns handled in this pass (at most p_end - p)
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e64, m8, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(p_end - p));
+
+    const double *b_ = b + p;
+    double *c_ = c + p;
+    // NOTE(review): rows are handled in pairs (m, m + 1); this looks like it assumes an even row count - confirm.
+    for (unsigned int m = m_start; m < m_end; m += 2) {
+      const double *a_ = a + m * N;
+      const double *a__ = a_;
+      // Preload row 0 of B (this column strip) into v16
+      asm volatile("vle64.v v16, (%0);" ::"r"(b_));
+      const double *b__ = b_ + P;
+
+      double *c__ = c_ + m * P;
+
+      double t0, t1;
+      // t0 / t1: current A element of row m / row m + 1
+      t0 = *a__;
+      a__ += N;
+      t1 = *a__;
+
+      unsigned int n = 0;
+      // Unrolled by two over n, alternating B rows between v16 and v24. NOTE(review): the n == N exit below is only taken when N is even - confirm callers never pass an odd N.
+      while (n < N) {
+        a__ = a_ + ++n;
+
+        asm volatile("vle64.v v24, (%0);" ::"r"(b__));
+        b__ += P;
+        // The first step initializes the accumulators v0 / v8, later steps accumulate into them
+        if (n == 1) {
+          asm volatile("vfmul.vf v0, v16, %0" ::"f"(t0));
+          t0 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v8, v16, %0" ::"f"(t1));
+          t1 = *a__;
+        } else {
+          asm volatile("vfmacc.vf v0, %0, v16" ::"f"(t0));
+          t0 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v8, %0, v16" ::"f"(t1));
+          t1 = *a__;
+        }
+
+        a__ = a_ + ++n;
+
+        if (n == N)
+          break;
+
+        asm volatile("vle64.v v16, (%0);" ::"r"(b__));
+        b__ += P;
+
+        asm volatile("vfmacc.vf v0, %0, v24" ::"f"(t0));
+        t0 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v8, %0, v24" ::"f"(t1));
+        t1 = *a__;
+      }
+      // Apply the product still pending from the loop, then store the two result rows
+      asm volatile("vfmacc.vf v0, %0, v24" ::"f"(t0));
+      asm volatile("vse64.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v8, %0, v24" ::"f"(t1));
+      asm volatile("vse64.v v8, (%0);" ::"r"(c__));
+    }
+
+    p += gvl;
+  }
+}
+
+// 4xVL variant - optimized for medium matrices (M <= 8): FP64 C = A x B, four rows of C per inner pass (LMUL=4)
+static inline void fmatmul_v64b_4xVL(double *c, const double *a, const double *b,  // c: output; a, b: inputs (row-major indexing)
+                                      const unsigned int m_start, const unsigned int m_end,  // rows [m_start, m_end) of c, taken 4 at a time
+                                      const unsigned int N, const unsigned int P,  // N: row stride of a; P: row stride of b and c
+                                      const unsigned int p_start, const unsigned int p_end) {  // columns [p_start, p_end) of c
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Request a vector length for the remaining columns (64-bit elements, LMUL=4); gvl receives the granted length
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e64, m4, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(p_end - p));
+    // Column strip of b and c covered by this pass
+    const double *b_ = b + p;
+    double *c_ = c + p;
+    // NOTE(review): rows m .. m+3 are all read and written, so (m_end - m_start) presumably must be a multiple of 4 - confirm
+    for (unsigned int m = m_start; m < m_end; m += 4) {
+      const double *a_ = a + m * N;  // start of row m of a
+      const double *a__ = a_;        // running read pointer into a
+
+      asm volatile("vle64.v v16, (%0);" ::"r"(b_));  // v16 <- first row of the b strip
+      const double *b__ = b_ + P;                    // next row of b to load
+
+      double *c__ = c_ + m * P;  // destination of row m in the c strip
+
+      double t0, t1, t2, t3;  // scalars from column n of rows m .. m+3 of a
+
+      t0 = *a__;
+      a__ += N;
+      t1 = *a__;
+      a__ += N;
+      t2 = *a__;
+      a__ += N;
+      t3 = *a__;
+      // NOTE(review): the n == N exit below is only tested on even n, so N is presumably expected to be even - confirm
+      unsigned int n = 0;
+
+      while (n < N) {
+        asm volatile("vle64.v v20, (%0);" ::"r"(b__));  // load the following row of b into v20 while v16 is consumed
+        b__ += P;
+
+        a__ = a_ + ++n;  // next column of row m, read after the multiply below
+
+        if (n == 1) {  // first step: vfmul initialises the accumulators v0 / v4 / v8 / v12
+          asm volatile("vfmul.vf v0, v16, %0" ::"f"(t0));
+          t0 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v4, v16, %0" ::"f"(t1));
+          t1 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v8, v16, %0" ::"f"(t2));
+          t2 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v12, v16, %0" ::"f"(t3));
+          t3 = *a__;
+        } else {  // later steps: accumulate scalar * v16
+          asm volatile("vfmacc.vf v0, %0, v16" ::"f"(t0));
+          t0 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v4, %0, v16" ::"f"(t1));
+          t1 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v8, %0, v16" ::"f"(t2));
+          t2 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v12, %0, v16" ::"f"(t3));
+          t3 = *a__;
+        }
+
+        a__ = a_ + ++n;
+
+        if (n == N)  // row of a fully read; the pending v20 product is applied after the loop
+          break;
+
+        asm volatile("vle64.v v16, (%0);" ::"r"(b__));  // refill v16 with the next row of b while v20 is consumed
+        b__ += P;
+
+        asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));
+        t0 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t1));
+        t1 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t2));
+        t2 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t3));
+        t3 = *a__;
+      }
+
+      asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));  // last accumulation per row, each followed by its store
+      asm volatile("vse64.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t1));
+      asm volatile("vse64.v v4, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t2));
+      asm volatile("vse64.v v8, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t3));
+      asm volatile("vse64.v v12, (%0);" ::"r"(c__));
+    }
+
+    p += gvl;  // advance by the number of columns just produced
+  }
+}
+
+// 8xVL variant - optimized for larger matrices (M > 8): FP64 C = A x B, eight rows of C per inner pass (LMUL=2)
+static inline void fmatmul_v64b_8xVL(double *c, const double *a, const double *b,  // c: output; a, b: inputs (row-major indexing)
+                                      const unsigned int m_start, const unsigned int m_end,  // rows [m_start, m_end) of c, taken 8 at a time
+                                      const unsigned int N, const unsigned int P,  // N: row stride of a; P: row stride of b and c
+                                      const unsigned int p_start, const unsigned int p_end) {  // columns [p_start, p_end) of c
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Request a vector length for the remaining columns (64-bit elements, LMUL=2); gvl receives the granted length
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e64, m2, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(p_end - p));
+    // Column strip of b and c covered by this pass
+    const double *b_ = b + p;
+    double *c_ = c + p;
+    // NOTE(review): rows m .. m+7 are all read and written, so (m_end - m_start) presumably must be a multiple of 8 - confirm
+    for (unsigned int m = m_start; m < m_end; m += 8) {
+      const double *a_ = a + m * N;  // start of row m of a
+      const double *a__ = a_;        // running read pointer into a
+
+      asm volatile("vle64.v v18, (%0);" ::"r"(b_));  // v18 <- first row of the b strip
+      const double *b__ = b_ + P;                    // next row of b to load
+
+      double *c__ = c_ + m * P;  // destination of row m in the c strip
+
+      double t0, t1, t2, t3, t4, t5, t6, t7;  // scalars from column n of rows m .. m+7 of a
+
+      t0 = *a__;
+      a__ += N;
+      t1 = *a__;
+      a__ += N;
+      t2 = *a__;
+      a__ += N;
+      t3 = *a__;
+      a__ += N;
+      t4 = *a__;
+      a__ += N;
+      t5 = *a__;
+      a__ += N;
+      t6 = *a__;
+      a__ += N;
+      t7 = *a__;
+      // NOTE(review): the n == N exit below is only tested on even n, so N is presumably expected to be even - confirm
+      unsigned int n = 0;
+
+      while (n < N) {
+        a__ = a_ + ++n;  // next column of row m, read after the multiply below
+
+        asm volatile("vle64.v v20, (%0);" ::"r"(b__));  // load the following row of b into v20 while v18 is consumed
+        b__ += P;
+
+        if (n == 1) {  // first step: vfmul initialises the accumulators v0, v2, ..., v14
+          asm volatile("vfmul.vf v0, v18, %0" ::"f"(t0));
+          t0 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v2, v18, %0" ::"f"(t1));
+          t1 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v4, v18, %0" ::"f"(t2));
+          t2 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v6, v18, %0" ::"f"(t3));
+          t3 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v8, v18, %0" ::"f"(t4));
+          t4 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v10, v18, %0" ::"f"(t5));
+          t5 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v12, v18, %0" ::"f"(t6));
+          t6 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v14, v18, %0" ::"f"(t7));
+          t7 = *a__;
+        } else {  // later steps: accumulate scalar * v18
+          asm volatile("vfmacc.vf v0, %0, v18" ::"f"(t0));
+          t0 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v2, %0, v18" ::"f"(t1));
+          t1 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v4, %0, v18" ::"f"(t2));
+          t2 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v6, %0, v18" ::"f"(t3));
+          t3 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v8, %0, v18" ::"f"(t4));
+          t4 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v10, %0, v18" ::"f"(t5));
+          t5 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v12, %0, v18" ::"f"(t6));
+          t6 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v14, %0, v18" ::"f"(t7));
+          t7 = *a__;
+        }
+
+        a__ = a_ + ++n;
+
+        if (n == N)  // row of a fully read; the pending v20 product is applied after the loop
+          break;
+
+        asm volatile("vle64.v v18, (%0);" ::"r"(b__));  // refill v18 with the next row of b while v20 is consumed
+        b__ += P;
+
+        asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));
+        t0 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v2, %0, v20" ::"f"(t1));
+        t1 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t2));
+        t2 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v6, %0, v20" ::"f"(t3));
+        t3 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t4));
+        t4 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v10, %0, v20" ::"f"(t5));
+        t5 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t6));
+        t6 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v14, %0, v20" ::"f"(t7));
+        t7 = *a__;
+      }
+
+      asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));  // last accumulation per row, each followed by its store
+      asm volatile("vse64.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v2, %0, v20" ::"f"(t1));
+      asm volatile("vse64.v v2, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t2));
+      asm volatile("vse64.v v4, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v6, %0, v20" ::"f"(t3));
+      asm volatile("vse64.v v6, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t4));
+      asm volatile("vse64.v v8, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v10, %0, v20" ::"f"(t5));
+      asm volatile("vse64.v v10, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t6));
+      asm volatile("vse64.v v12, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v14, %0, v20" ::"f"(t7));
+      asm volatile("vse64.v v14, (%0);" ::"r"(c__));
+    }
+
+    p += gvl;  // advance by the number of columns just produced
+  }
+}
+
+// =============================================================================
+// FP32 Matrix Multiplication Kernels (from spatzBenchmarks/sp-fmatmul)
+// =============================================================================
+
+// Forward declarations for the FP32 matmul kernels: the fmatmul_v32b wrapper calls them before they are defined
+static inline void fmatmul_v32b_2xVL(float *c, const float *a, const float *b,
+                                      const unsigned int m_start, const unsigned int m_end,
+                                      const unsigned int N, const unsigned int P,
+                                      const unsigned int p_start, const unsigned int p_end);
+static inline void fmatmul_v32b_4xVL(float *c, const float *a, const float *b,
+                                      const unsigned int m_start, const unsigned int m_end,
+                                      const unsigned int N, const unsigned int P,
+                                      const unsigned int p_start, const unsigned int p_end);
+static inline void fmatmul_v32b_8xVL(float *c, const float *a, const float *b,
+                                      const unsigned int m_start, const unsigned int m_end,
+                                      const unsigned int N, const unsigned int P,
+                                      const unsigned int p_start, const unsigned int p_end);
+
+// FP32 matrix multiplication wrapper - picks the widest row-unrolled kernel (8/4/2 rows) whose group size divides M
+// Computes C = A × B where C is MxP, A is MxN, B is NxP; M should be even (the 2-row kernel is the smallest step)
+static inline void fmatmul_v32b(float *c, const float *a, const float *b,
+                                const unsigned int M, const unsigned int N,
+                                const unsigned int P) {
+  if (M > 8 && M % 8 == 0) {  // large M made of whole groups of 8 rows
+    fmatmul_v32b_8xVL(c, a, b, 0, M, N, P, 0, P);
+  } else if (M > 4 && M % 4 == 0) {  // M == 8, or larger M that only splits into groups of 4
+    fmatmul_v32b_4xVL(c, a, b, 0, M, N, P, 0, P);
+  } else {  // M <= 4, or M only divisible by 2: a wider kernel would read and write rows past M
+    fmatmul_v32b_2xVL(c, a, b, 0, M, N, P, 0, P);
+  }
+}
+
+// 2xVL variant - optimized for small matrices (M <= 4): FP32 C = A x B, two rows of C per inner pass (LMUL=8)
+static inline void fmatmul_v32b_2xVL(float *c, const float *a, const float *b,  // c: output; a, b: inputs (row-major indexing)
+                                      const unsigned int m_start, const unsigned int m_end,  // rows [m_start, m_end) of c, taken 2 at a time
+                                      const unsigned int N, const unsigned int P,  // N: row stride of a; P: row stride of b and c
+                                      const unsigned int p_start, const unsigned int p_end) {  // columns [p_start, p_end) of c
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Request a vector length for the remaining columns (32-bit elements, LMUL=8); gvl receives the granted length
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e32, m8, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(p_end - p));
+    // Column strip of b and c covered by this pass
+    const float *b_ = b + p;
+    float *c_ = c + p;
+    // NOTE(review): rows m and m+1 are both read and written, so (m_end - m_start) presumably must be even - confirm
+    for (unsigned int m = m_start; m < m_end; m += 2) {
+      const float *a_ = a + m * N;  // start of row m of a
+      const float *a__ = a_;        // running read pointer into a
+
+      asm volatile("vle32.v v16, (%0);" ::"r"(b_));  // v16 <- first row of the b strip
+      const float *b__ = b_ + P;                     // next row of b to load
+
+      float *c__ = c_ + m * P;  // destination of row m in the c strip
+
+      float t0, t1;  // scalars from column n of rows m and m+1 of a
+
+      t0 = *a__;
+      a__ += N;
+      t1 = *a__;
+      // NOTE(review): the n == N exit below is only tested on even n, so N is presumably expected to be even - confirm
+      unsigned int n = 0;
+
+      while (n < N) {
+        a__ = a_ + ++n;  // next column of row m, read after the multiply below
+
+        asm volatile("vle32.v v24, (%0);" ::"r"(b__));  // load the following row of b into v24 while v16 is consumed
+        b__ += P;
+
+        if (n == 1) {  // first step: vfmul initialises the accumulators v0 / v8
+          asm volatile("vfmul.vf v0, v16, %0" ::"f"(t0));
+          t0 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v8, v16, %0" ::"f"(t1));
+          t1 = *a__;
+        } else {  // later steps: accumulate scalar * v16
+          asm volatile("vfmacc.vf v0, %0, v16" ::"f"(t0));
+          t0 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v8, %0, v16" ::"f"(t1));
+          t1 = *a__;
+        }
+
+        a__ = a_ + ++n;
+
+        if (n == N)  // row of a fully read; the pending v24 product is applied after the loop
+          break;
+
+        asm volatile("vle32.v v16, (%0);" ::"r"(b__));  // refill v16 with the next row of b while v24 is consumed
+        b__ += P;
+
+        asm volatile("vfmacc.vf v0, %0, v24" ::"f"(t0));
+        t0 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v8, %0, v24" ::"f"(t1));
+        t1 = *a__;
+      }
+
+      asm volatile("vfmacc.vf v0, %0, v24" ::"f"(t0));  // last accumulation for row m, then store it
+      asm volatile("vse32.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v8, %0, v24" ::"f"(t1));  // same for row m+1
+      asm volatile("vse32.v v8, (%0);" ::"r"(c__));
+    }
+
+    p += gvl;  // advance by the number of columns just produced
+  }
+}
+
+// 4xVL variant - optimized for medium matrices (M <= 8): FP32 C = A x B, four rows of C per inner pass (LMUL=4)
+static inline void fmatmul_v32b_4xVL(float *c, const float *a, const float *b,  // c: output; a, b: inputs (row-major indexing)
+                                      const unsigned int m_start, const unsigned int m_end,  // rows [m_start, m_end) of c, taken 4 at a time
+                                      const unsigned int N, const unsigned int P,  // N: row stride of a; P: row stride of b and c
+                                      const unsigned int p_start, const unsigned int p_end) {  // columns [p_start, p_end) of c
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Request a vector length for the remaining columns (32-bit elements, LMUL=4); gvl receives the granted length
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e32, m4, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(p_end - p));
+    // Column strip of b and c covered by this pass
+    const float *b_ = b + p;
+    float *c_ = c + p;
+    // NOTE(review): rows m .. m+3 are all read and written, so (m_end - m_start) presumably must be a multiple of 4 - confirm
+    for (unsigned int m = m_start; m < m_end; m += 4) {
+      const float *a_ = a + m * N;  // start of row m of a
+      const float *a__ = a_;        // running read pointer into a
+
+      asm volatile("vle32.v v16, (%0);" ::"r"(b_));  // v16 <- first row of the b strip
+      const float *b__ = b_ + P;                     // next row of b to load
+
+      float *c__ = c_ + m * P;  // destination of row m in the c strip
+
+      float t0, t1, t2, t3;  // scalars from column n of rows m .. m+3 of a
+
+      t0 = *a__;
+      a__ += N;
+      t1 = *a__;
+      a__ += N;
+      t2 = *a__;
+      a__ += N;
+      t3 = *a__;
+      // NOTE(review): the n == N exit below is only tested on even n, so N is presumably expected to be even - confirm
+      unsigned int n = 0;
+
+      while (n < N) {
+        asm volatile("vle32.v v20, (%0);" ::"r"(b__));  // load the following row of b into v20 while v16 is consumed
+        b__ += P;
+
+        a__ = a_ + ++n;  // next column of row m, read after the multiply below
+
+        if (n == 1) {  // first step: vfmul initialises the accumulators v0 / v4 / v8 / v12
+          asm volatile("vfmul.vf v0, v16, %0" ::"f"(t0));
+          t0 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v4, v16, %0" ::"f"(t1));
+          t1 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v8, v16, %0" ::"f"(t2));
+          t2 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v12, v16, %0" ::"f"(t3));
+          t3 = *a__;
+        } else {  // later steps: accumulate scalar * v16
+          asm volatile("vfmacc.vf v0, %0, v16" ::"f"(t0));
+          t0 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v4, %0, v16" ::"f"(t1));
+          t1 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v8, %0, v16" ::"f"(t2));
+          t2 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v12, %0, v16" ::"f"(t3));
+          t3 = *a__;
+        }
+
+        a__ = a_ + ++n;
+
+        if (n == N)  // row of a fully read; the pending v20 product is applied after the loop
+          break;
+
+        asm volatile("vle32.v v16, (%0);" ::"r"(b__));  // refill v16 with the next row of b while v20 is consumed
+        b__ += P;
+
+        asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));
+        t0 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t1));
+        t1 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t2));
+        t2 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t3));
+        t3 = *a__;
+      }
+
+      asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));  // last accumulation per row, each followed by its store
+      asm volatile("vse32.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t1));
+      asm volatile("vse32.v v4, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t2));
+      asm volatile("vse32.v v8, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t3));
+      asm volatile("vse32.v v12, (%0);" ::"r"(c__));
+    }
+
+    p += gvl;  // advance by the number of columns just produced
+  }
+}
+
+// 8xVL variant - optimized for larger matrices (M > 8): FP32 C = A x B, eight rows of C per inner pass (LMUL=2)
+static inline void fmatmul_v32b_8xVL(float *c, const float *a, const float *b,  // c: output; a, b: inputs (row-major indexing)
+                                      const unsigned int m_start, const unsigned int m_end,  // rows [m_start, m_end) of c, taken 8 at a time
+                                      const unsigned int N, const unsigned int P,  // N: row stride of a; P: row stride of b and c
+                                      const unsigned int p_start, const unsigned int p_end) {  // columns [p_start, p_end) of c
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Request a vector length for the remaining columns (32-bit elements, LMUL=2); gvl receives the granted length
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e32, m2, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(p_end - p));
+    // Column strip of b and c covered by this pass
+    const float *b_ = b + p;
+    float *c_ = c + p;
+    // NOTE(review): rows m .. m+7 are all read and written, so (m_end - m_start) presumably must be a multiple of 8 - confirm
+    for (unsigned int m = m_start; m < m_end; m += 8) {
+      const float *a_ = a + m * N;  // start of row m of a
+      const float *a__ = a_;        // running read pointer into a
+
+      asm volatile("vle32.v v18, (%0);" ::"r"(b_));  // v18 <- first row of the b strip
+      const float *b__ = b_ + P;                     // next row of b to load
+
+      float *c__ = c_ + m * P;  // destination of row m in the c strip
+
+      float t0, t1, t2, t3, t4, t5, t6, t7;  // scalars from column n of rows m .. m+7 of a
+
+      t0 = *a__;
+      a__ += N;
+      t1 = *a__;
+      a__ += N;
+      t2 = *a__;
+      a__ += N;
+      t3 = *a__;
+      a__ += N;
+      t4 = *a__;
+      a__ += N;
+      t5 = *a__;
+      a__ += N;
+      t6 = *a__;
+      a__ += N;
+      t7 = *a__;
+      // NOTE(review): the n == N exit below is only tested on even n, so N is presumably expected to be even - confirm
+      unsigned int n = 0;
+
+      while (n < N) {
+        a__ = a_ + ++n;  // next column of row m, read after the multiply below
+
+        asm volatile("vle32.v v20, (%0);" ::"r"(b__));  // load the following row of b into v20 while v18 is consumed
+        b__ += P;
+
+        if (n == 1) {  // first step: vfmul initialises the accumulators v0, v2, ..., v14
+          asm volatile("vfmul.vf v0, v18, %0" ::"f"(t0));
+          t0 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v2, v18, %0" ::"f"(t1));
+          t1 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v4, v18, %0" ::"f"(t2));
+          t2 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v6, v18, %0" ::"f"(t3));
+          t3 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v8, v18, %0" ::"f"(t4));
+          t4 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v10, v18, %0" ::"f"(t5));
+          t5 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v12, v18, %0" ::"f"(t6));
+          t6 = *a__;
+          a__ += N;
+          asm volatile("vfmul.vf v14, v18, %0" ::"f"(t7));
+          t7 = *a__;
+        } else {  // later steps: accumulate scalar * v18
+          asm volatile("vfmacc.vf v0, %0, v18" ::"f"(t0));
+          t0 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v2, %0, v18" ::"f"(t1));
+          t1 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v4, %0, v18" ::"f"(t2));
+          t2 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v6, %0, v18" ::"f"(t3));
+          t3 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v8, %0, v18" ::"f"(t4));
+          t4 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v10, %0, v18" ::"f"(t5));
+          t5 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v12, %0, v18" ::"f"(t6));
+          t6 = *a__;
+          a__ += N;
+          asm volatile("vfmacc.vf v14, %0, v18" ::"f"(t7));
+          t7 = *a__;
+        }
+
+        a__ = a_ + ++n;
+
+        if (n == N)  // row of a fully read; the pending v20 product is applied after the loop
+          break;
+
+        asm volatile("vle32.v v18, (%0);" ::"r"(b__));  // refill v18 with the next row of b while v20 is consumed
+        b__ += P;
+
+        asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));
+        t0 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v2, %0, v20" ::"f"(t1));
+        t1 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t2));
+        t2 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v6, %0, v20" ::"f"(t3));
+        t3 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t4));
+        t4 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v10, %0, v20" ::"f"(t5));
+        t5 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t6));
+        t6 = *a__;
+        a__ += N;
+        asm volatile("vfmacc.vf v14, %0, v20" ::"f"(t7));
+        t7 = *a__;
+      }
+
+      asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));  // last accumulation per row, each followed by its store
+      asm volatile("vse32.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v2, %0, v20" ::"f"(t1));
+      asm volatile("vse32.v v2, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t2));
+      asm volatile("vse32.v v4, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v6, %0, v20" ::"f"(t3));
+      asm volatile("vse32.v v6, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t4));
+      asm volatile("vse32.v v8, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v10, %0, v20" ::"f"(t5));
+      asm volatile("vse32.v v10, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t6));
+      asm volatile("vse32.v v12, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v14, %0, v20" ::"f"(t7));
+      asm volatile("vse32.v v14, (%0);" ::"r"(c__));
+    }
+
+    p += gvl;  // advance by the number of columns just produced
+  }
+}
+
+// =============================================================================
+// FP16 Matrix Multiplication Kernels (from spatzBenchmarks/hp-fmatmul)
+// =============================================================================
+
+// Forward declarations for the FP16 SDOTP matmul kernels: fmatmul_v16b_sdotp calls them before they are defined
+static inline void matmul_2xVL(float16_t *c, const float16_t *a, const float16_t *b,
+                               const unsigned int m_start, const unsigned int m_end,
+                               const unsigned int N, const unsigned int P,
+                               const unsigned int p_start, const unsigned int p_end);
+static inline void matmul_4xVL(float16_t *c, const float16_t *a, const float16_t *b,
+                               const unsigned int m_start, const unsigned int m_end,
+                               const unsigned int N, const unsigned int P,
+                               const unsigned int p_start, const unsigned int p_end);
+static inline void matmul_8xVL(float16_t *c, const float16_t *a, const float16_t *b,
+                               const unsigned int m_start, const unsigned int m_end,
+                               const unsigned int N, const unsigned int P,
+                               const unsigned int p_start, const unsigned int p_end);
+
+static inline void fmatmul_v16b_sdotp(float16_t *c, const float16_t *a, const float16_t *b,  // FP16 matmul wrapper over the SDOTP kernels
+                          const unsigned int M, const unsigned int N,  // M: rows of c (should be even); N: row stride of a
+                          const unsigned int P) {  // P: columns of c; all rows and columns are processed
+  if (M > 8 && M % 8 == 0) {  // large M made of whole groups of 8 rows
+    matmul_8xVL(c, a, b, 0, M, N, P, 0, P);
+  } else if (M > 4 && M % 4 == 0) {  // M == 8, or larger M that only splits into groups of 4
+    matmul_4xVL(c, a, b, 0, M, N, P, 0, P);
+  } else {  // M <= 4, or M only divisible by 2: a wider kernel would read and write rows past M
+    matmul_2xVL(c, a, b, 0, M, N, P, 0, P);
+  }
+}
+
+// 2xVL variant - optimized for small matrices (M <= 4): FP16 matmul, two rows of c per inner pass (LMUL=8)
+static inline void matmul_2xVL(float16_t *c, const float16_t *a, const float16_t *b,  // c: output; a, b: inputs
+                               const unsigned int m_start, const unsigned int m_end,  // rows [m_start, m_end) of c, taken 2 at a time
+                               const unsigned int N, const unsigned int P,  // N: row stride of a; P: row stride of c (b advances by 2 * P)
+                               const unsigned int p_start, const unsigned int p_end) {  // columns [p_start, p_end) of c
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Calculate the vl - processes 2 columns at a time
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e16, m8, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(2 * (p_end - p)));
+    // NOTE(review): b is indexed with 2 * p and stepped by 2 * P, so it presumably holds two inner-dimension values per column - confirm
+    const float16_t *b_ = b + 2 * p;
+    float16_t *c_ = c + p;
+
+    // Account for the used operands: gvl fp16 inputs yield gvl / 2 output columns
+    p += gvl / 2;
+    // NOTE(review): rows m and m+1 are both read and written, so (m_end - m_start) presumably must be even - confirm
+    for (unsigned int m = m_start; m < m_end; m += 2) {
+      asm volatile("vsetvli zero, %0, e16, m8, ta, ma" ::"r"(gvl));  // restore the full vl; it is halved before the stores below
+
+      const float16_t *a_ = a + m * N;  // start of row m of a
+      const float16_t *a__ = a_;        // running read pointer into a
+
+      asm volatile("vle16.v v16, (%0);" ::"r"(b_));  // v16 <- first block of the b strip
+      const float16_t *b__ = b_ + 2 * P;             // next block of b to load
+
+      float16_t *c__ = c_ + m * P;  // destination of row m in the c strip
+
+      double t0, t1;  // FP registers used as 32-bit containers: each flw below loads two adjacent fp16 values of a
+
+      asm volatile("vmv.v.x v0, zero");  // clear the accumulator for row m
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("vmv.v.x v8, zero");  // clear the accumulator for row m+1
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+      // NOTE(review): n advances by 2 twice before the n == N test, so N is presumably expected to be a multiple of 4 - confirm
+      unsigned int n = 0;
+
+      while (n < N) {
+        n += 2;
+        a__ = a_ + n;  // next fp16 pair of row m, read after the dot product below
+
+        asm volatile("vle16.v v24, (%0);" ::"r"(b__));  // load the following block of b into v24 while v16 is consumed
+        b__ += 2 * P;
+
+        // NOTE(review): vfwdotp.vf is not a standard RVV instruction; presumably a widening fp16 dot-product accumulate - confirm
+        asm volatile("vfwdotp.vf v0, %0, v16" ::"f"(t0));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v8, %0, v16" ::"f"(t1));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+
+        n += 2;
+        a__ = a_ + n;
+
+        if (n == N)  // row of a fully read; the pending v24 product is applied after the loop
+          break;
+
+        asm volatile("vle16.v v16, (%0);" ::"r"(b__));  // refill v16 with the next block of b while v24 is consumed
+        b__ += 2 * P;
+
+        asm volatile("vfwdotp.vf v0, %0, v24" ::"f"(t0));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v8, %0, v24" ::"f"(t1));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+      }
+
+      asm volatile("vfwdotp.vf v0, %0, v24" ::"f"(t0));  // last accumulation for rows m and m+1
+      asm volatile("vfwdotp.vf v8, %0, v24" ::"f"(t1));
+
+      asm volatile("vsetvli zero, %0, e16, m8, ta, ma" ::"r"(gvl / 2));  // one 16-bit result per output column from here on
+
+      asm volatile("vfncvt.f.f.w v0, v0");  // narrow the 32-bit accumulator to fp16 before storing
+      asm volatile("vse16.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfncvt.f.f.w v8, v8");
+      asm volatile("vse16.v v8, (%0);" ::"r"(c__));
+    }
+  }
+}
+
+// 4xVL variant - optimized for medium matrices (M <= 8): FP16 matmul, four rows of c per inner pass (LMUL=4)
+static inline void matmul_4xVL(float16_t *c, const float16_t *a, const float16_t *b,  // c: output; a, b: inputs
+                               const unsigned int m_start, const unsigned int m_end,  // rows [m_start, m_end) of c, taken 4 at a time
+                               const unsigned int N, const unsigned int P,  // N: row stride of a; P: row stride of c (b advances by 2 * P)
+                               const unsigned int p_start, const unsigned int p_end) {  // columns [p_start, p_end) of c
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Calculate the vl - processes 2 columns at a time
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e16, m4, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(2 * (p_end - p)));
+    // NOTE(review): b is indexed with 2 * p and stepped by 2 * P, so it presumably holds two inner-dimension values per column - confirm
+    const float16_t *b_ = b + 2 * p;
+    float16_t *c_ = c + p;
+
+    // Account for the used operands: gvl fp16 inputs yield gvl / 2 output columns
+    p += gvl / 2;
+    // NOTE(review): rows m .. m+3 are all read and written, so (m_end - m_start) presumably must be a multiple of 4 - confirm
+    for (unsigned int m = m_start; m < m_end; m += 4) {
+      asm volatile("vsetvli zero, %0, e16, m4, ta, ma" ::"r"(gvl));  // restore the full vl; it is halved before the stores below
+
+      const float16_t *a_ = a + m * N;  // start of row m of a
+      const float16_t *a__ = a_;        // running read pointer into a
+
+      asm volatile("vle16.v v16, (%0);" ::"r"(b_));  // v16 <- first block of the b strip
+      const float16_t *b__ = b_ + 2 * P;             // next block of b to load
+
+      float16_t *c__ = c_ + m * P;  // destination of row m in the c strip
+
+      double t0, t1, t2, t3;  // FP registers used as 32-bit containers: each flw below loads two adjacent fp16 values of a
+
+      asm volatile("vmv.v.x v0, zero");  // clear each row accumulator, interleaved with the first scalar loads
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("vmv.v.x v4, zero");
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("vmv.v.x v8, zero");
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("vmv.v.x v12, zero");
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+      // NOTE(review): n advances by 2 twice before the n == N test, so N is presumably expected to be a multiple of 4 - confirm
+      unsigned int n = 0;
+
+      while (n < N) {
+        n += 2;
+        a__ = a_ + n;  // next fp16 pair of row m, read after the dot product below
+
+        asm volatile("vle16.v v20, (%0);" ::"r"(b__));  // load the following block of b into v20 while v16 is consumed
+        b__ += 2 * P;
+
+        // NOTE(review): vfwdotp.vf is not a standard RVV instruction; presumably a widening fp16 dot-product accumulate - confirm
+        asm volatile("vfwdotp.vf v0, %0, v16" ::"f"(t0));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v4, %0, v16" ::"f"(t1));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v8, %0, v16" ::"f"(t2));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v12, %0, v16" ::"f"(t3));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+
+        n += 2;
+        a__ = a_ + n;
+
+        if (n == N)  // row of a fully read; the pending v20 product is applied after the loop
+          break;
+
+        asm volatile("vle16.v v16, (%0);" ::"r"(b__));  // refill v16 with the next block of b while v20 is consumed
+        b__ += 2 * P;
+
+        asm volatile("vfwdotp.vf v0, %0, v20" ::"f"(t0));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v4, %0, v20" ::"f"(t1));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v8, %0, v20" ::"f"(t2));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v12, %0, v20" ::"f"(t3));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+      }
+
+      asm volatile("vfwdotp.vf v0, %0, v20" ::"f"(t0));  // last accumulation for rows m .. m+3
+      asm volatile("vfwdotp.vf v4, %0, v20" ::"f"(t1));
+      asm volatile("vfwdotp.vf v8, %0, v20" ::"f"(t2));
+      asm volatile("vfwdotp.vf v12, %0, v20" ::"f"(t3));
+
+      asm volatile("vsetvli zero, %0, e16, m4, ta, ma" ::"r"(gvl / 2));  // one 16-bit result per output column from here on
+
+      asm volatile("vfncvt.f.f.w v0, v0");  // narrow each 32-bit accumulator to fp16 before storing
+      asm volatile("vse16.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfncvt.f.f.w v4, v4");
+      asm volatile("vse16.v v4, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfncvt.f.f.w v8, v8");
+      asm volatile("vse16.v v8, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfncvt.f.f.w v12, v12");
+      asm volatile("vse16.v v12, (%0);" ::"r"(c__));
+    }
+  }
+}
+
+//8xVL variant - optimized for larger matrices (M > 8)
+static inline void matmul_8xVL(float16_t *c, const float16_t *a, const float16_t *b,
+                               const unsigned int m_start, const unsigned int m_end,
+                               const unsigned int N, const unsigned int P,
+                               const unsigned int p_start, const unsigned int p_end) {
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Calculate the vl - processes 2 columns at a time
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e16, m2, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(2 * (p_end - p)));
+
+    const float16_t *b_ = b + 2 * p;
+    float16_t *c_ = c + p;
+
+    // Account for the used operands
+    p += gvl / 2;
+
+    for (unsigned int m = m_start; m < m_end; m += 8) {
+      asm volatile("vsetvli zero, %0, e16, m2, ta, ma" ::"r"(gvl));
+
+      const float16_t *a_ = a + m * N;
+      const float16_t *a__ = a_;
+
+      asm volatile("vle16.v v18, (%0);" ::"r"(b_));
+      const float16_t *b__ = b_ + 2 * P;
+
+      float16_t *c__ = c_ + m * P;
+
+      double t0, t1, t2, t3, t4, t5, t6, t7;
+
+      asm volatile("vmv.v.x v0, zero");
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("vmv.v.x v2, zero");
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("vmv.v.x v4, zero");
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("vmv.v.x v6, zero");
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("vmv.v.x v8, zero");
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("vmv.v.x v10, zero");
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("vmv.v.x v12, zero");
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("vmv.v.x v14, zero");
+      asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__));
+
+      unsigned int n = 0;
+
+      while (n < N) {
+        n += 2;
+        a__ = a_ + n;
+
+        asm volatile("vle16.v v20, (%0);" ::"r"(b__));
+        b__ += 2 * P;
+
+        asm volatile("vfwdotp.vf v0, %0, v18" ::"f"(t0));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v2, %0, v18" ::"f"(t1));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v4, %0, v18" ::"f"(t2));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v6, %0, v18" ::"f"(t3));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v8, %0, v18" ::"f"(t4));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v10, %0, v18" ::"f"(t5));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v12, %0, v18" ::"f"(t6));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v14, %0, v18" ::"f"(t7));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__));
+
+        n += 2;
+        a__ = a_ + n;
+
+        if (n == N)
+          break;
+
+        asm volatile("vle16.v v18, (%0);" ::"r"(b__));
+        b__ += 2 * P;
+
+        asm volatile("vfwdotp.vf v0, %0, v20" ::"f"(t0));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v2, %0, v20" ::"f"(t1));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v4, %0, v20" ::"f"(t2));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v6, %0, v20" ::"f"(t3));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v8, %0, v20" ::"f"(t4));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v10, %0, v20" ::"f"(t5));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v12, %0, v20" ::"f"(t6));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfwdotp.vf v14, %0, v20" ::"f"(t7));
+        asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__));
+      }
+
+      asm volatile("vfwdotp.vf v0, %0, v20" ::"f"(t0));
+      asm volatile("vfwdotp.vf v2, %0, v20" ::"f"(t1));
+      asm volatile("vfwdotp.vf v4, %0, v20" ::"f"(t2));
+      asm volatile("vfwdotp.vf v6, %0, v20" ::"f"(t3));
+      asm volatile("vfwdotp.vf v8, %0, v20" ::"f"(t4));
+      asm volatile("vfwdotp.vf v10, %0, v20" ::"f"(t5));
+      asm volatile("vfwdotp.vf v12, %0, v20" ::"f"(t6));
+      asm volatile("vfwdotp.vf v14, %0, v20" ::"f"(t7));
+
+      asm volatile("vsetvli zero, %0, e16, m2, ta, ma" ::"r"(gvl / 2));
+
+      asm volatile("vfncvt.f.f.w v0, v0");
+      asm volatile("vse16.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfncvt.f.f.w v2, v2");
+      asm volatile("vse16.v v2, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfncvt.f.f.w v4, v4");
+      asm volatile("vse16.v v4, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfncvt.f.f.w v6, v6");
+      asm volatile("vse16.v v6, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfncvt.f.f.w v8, v8");
+      asm volatile("vse16.v v8, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfncvt.f.f.w v10, v10");
+      asm volatile("vse16.v v10, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfncvt.f.f.w v12, v12");
+      asm volatile("vse16.v v12, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfncvt.f.f.w v14, v14");
+      asm volatile("vse16.v v14, (%0);" ::"r"(c__));
+    }
+  }
+}
+
+// From spatzBenchmarks/hp-fmatmul
+// Forward declarations of the FP16 matmul kernels: fmatmul_v16b calls them before their definitions appear
+static inline void fmatmul_v16b_2xVL(float16_t *c, const float16_t *a, const float16_t *b, // 2 rows of c per pass
+                                      const unsigned int m_start, const unsigned int m_end,
+                                      const unsigned int N, const unsigned int P,
+                                      const unsigned int p_start, const unsigned int p_end);
+static inline void fmatmul_v16b_4xVL(float16_t *c, const float16_t *a, const float16_t *b, // 4 rows of c per pass
+                                      const unsigned int m_start, const unsigned int m_end,
+                                      const unsigned int N, const unsigned int P,
+                                      const unsigned int p_start, const unsigned int p_end);
+static inline void fmatmul_v16b_8xVL(float16_t *c, const float16_t *a, const float16_t *b, // 8 rows of c per pass
+                                      const unsigned int m_start, const unsigned int m_end,
+                                      const unsigned int N, const unsigned int P,
+                                      const unsigned int p_start, const unsigned int p_end);
+// FP16 matmul entry point: runs the row-unrolled kernel with the widest unroll factor that divides M.
+static inline void fmatmul_v16b(float16_t *c, const float16_t *a, const float16_t *b,
+                                const unsigned int M, const unsigned int N, // a is MxN, b is NxP
+                                const unsigned int P) {                     // c is MxP, all row-major
+  if (M > 8 && M % 8 == 0) {
+    fmatmul_v16b_8xVL(c, a, b, 0, M, N, P, 0, P);
+  } else if (M > 4 && M % 4 == 0) { // e.g. M = 12 must not use the 8-row kernel: it would touch rows 12..15
+    fmatmul_v16b_4xVL(c, a, b, 0, M, N, P, 0, P);
+  } else { // M <= 4, or M not a multiple of 4; the 2-row kernel still requires an even M
+    fmatmul_v16b_2xVL(c, a, b, 0, M, N, P, 0, P);
+  }
+}
+
+// 2xVL variant (M <= 4): computes two rows of c = a * b per pass on FP16 data, strip-mined over columns.
+static inline void fmatmul_v16b_2xVL(float16_t *c, const float16_t *a, const float16_t *b,
+                                      const unsigned int m_start, const unsigned int m_end, // rows [m_start, m_end), taken 2 at a time
+                                      const unsigned int N, const unsigned int P,           // row length of a / row length of b and c
+                                      const unsigned int p_start, const unsigned int p_end) { // columns [p_start, p_end)
+  // Elements are 16-bit, so SEW, loads and stores all use e16 (as the 8xVL kernel does). N must be even.
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Calculate the vl: how many of the remaining FP16 columns this strip covers
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e16, m8, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(p_end - p));
+
+    const float16_t *b_ = b + p;
+    float16_t *c_ = c + p;
+
+    for (unsigned int m = m_start; m < m_end; m += 2) {
+      const float16_t *a_ = a + m * N;
+      const float16_t *a__ = a_;
+      // v16 and v24 alternately hold the row of b in use and the next one being loaded
+      asm volatile("vle16.v v16, (%0);" ::"r"(b_));
+      const float16_t *b__ = b_ + P;
+
+      float16_t *c__ = c_ + m * P;
+
+      float t0, t1;
+      // Preload a[m][0] and a[m+1][0]
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+
+      unsigned int n = 0;
+
+      while (n < N) {
+        a__ = a_ + ++n;
+
+        asm volatile("vle16.v v24, (%0);" ::"r"(b__));
+        b__ += P;
+        // The first product initializes the accumulators (v0, v8); later ones accumulate into them
+        if (n == 1) {
+          asm volatile("vfmul.vf v0, v16, %0" ::"f"(t0));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v8, v16, %0" ::"f"(t1));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+        } else {
+          asm volatile("vfmacc.vf v0, %0, v16" ::"f"(t0));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v8, %0, v16" ::"f"(t1));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+        }
+
+        a__ = a_ + ++n;
+        // The last row of b is already in v24: leave and finish outside the loop
+        if (n == N)
+          break;
+
+        asm volatile("vle16.v v16, (%0);" ::"r"(b__));
+        b__ += P;
+
+        asm volatile("vfmacc.vf v0, %0, v24" ::"f"(t0));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v8, %0, v24" ::"f"(t1));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+      }
+      // Final accumulation, then store the two finished rows of c
+      asm volatile("vfmacc.vf v0, %0, v24" ::"f"(t0));
+      asm volatile("vse16.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v8, %0, v24" ::"f"(t1));
+      asm volatile("vse16.v v8, (%0);" ::"r"(c__));
+    }
+
+    p += gvl;
+  }
+}
+
+// 4xVL variant (M <= 8): computes four rows of c = a * b per pass on FP16 data, strip-mined over columns.
+static inline void fmatmul_v16b_4xVL(float16_t *c, const float16_t *a, const float16_t *b,
+                                      const unsigned int m_start, const unsigned int m_end, // rows [m_start, m_end), taken 4 at a time
+                                      const unsigned int N, const unsigned int P,           // row length of a / row length of b and c
+                                      const unsigned int p_start, const unsigned int p_end) { // columns [p_start, p_end)
+  // Elements are 16-bit, so SEW, loads and stores all use e16 (as the 8xVL kernel does). N must be even.
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Calculate the vl: how many of the remaining FP16 columns this strip covers
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e16, m4, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(p_end - p));
+
+    const float16_t *b_ = b + p;
+    float16_t *c_ = c + p;
+
+    for (unsigned int m = m_start; m < m_end; m += 4) {
+      const float16_t *a_ = a + m * N;
+      const float16_t *a__ = a_;
+      // v16 and v20 alternately hold the row of b in use and the next one being loaded
+      asm volatile("vle16.v v16, (%0);" ::"r"(b_));
+      const float16_t *b__ = b_ + P;
+
+      float16_t *c__ = c_ + m * P;
+
+      float t0, t1, t2, t3;
+      // Preload column 0 of the four rows of a
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+
+      unsigned int n = 0;
+
+      while (n < N) {
+        asm volatile("vle16.v v20, (%0);" ::"r"(b__));
+        b__ += P;
+
+        a__ = a_ + ++n;
+        // The first product initializes the accumulators (v0, v4, v8, v12); later ones accumulate
+        if (n == 1) {
+          asm volatile("vfmul.vf v0, v16, %0" ::"f"(t0));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v4, v16, %0" ::"f"(t1));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v8, v16, %0" ::"f"(t2));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v12, v16, %0" ::"f"(t3));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+        } else {
+          asm volatile("vfmacc.vf v0, %0, v16" ::"f"(t0));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v4, %0, v16" ::"f"(t1));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v8, %0, v16" ::"f"(t2));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v12, %0, v16" ::"f"(t3));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+        }
+
+        a__ = a_ + ++n;
+        // The last row of b is already in v20: leave and finish outside the loop
+        if (n == N)
+          break;
+
+        asm volatile("vle16.v v16, (%0);" ::"r"(b__));
+        b__ += P;
+
+        asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t1));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t2));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t3));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+      }
+      // Final accumulation, then store the four finished rows of c
+      asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));
+      asm volatile("vse16.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t1));
+      asm volatile("vse16.v v4, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t2));
+      asm volatile("vse16.v v8, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t3));
+      asm volatile("vse16.v v12, (%0);" ::"r"(c__));
+    }
+
+    p += gvl;
+  }
+}
+
+// 8xVL variant - optimized for larger matrices (M > 8): eight rows of c = a * b per pass, strip-mined over columns
+static inline void fmatmul_v16b_8xVL(float16_t *c, const float16_t *a, const float16_t *b,
+                                      const unsigned int m_start, const unsigned int m_end, // rows [m_start, m_end), taken 8 at a time
+                                      const unsigned int N, const unsigned int P,           // row length of a / row length of b and c
+                                      const unsigned int p_start, const unsigned int p_end) { // columns [p_start, p_end)
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Calculate the vl: how many of the remaining FP16 columns this strip covers
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e16, m2, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(p_end - p));
+
+    const float16_t *b_ = b + p;
+    float16_t *c_ = c + p;
+    // Each pass reads and writes rows m .. m+7 unconditionally, so the row count should be a multiple of 8
+    for (unsigned int m = m_start; m < m_end; m += 8) {
+      const float16_t *a_ = a + m * N;
+      const float16_t *a__ = a_;
+      // v18 and v20 alternately hold the row of b in use and the next one being loaded
+      asm volatile("vle16.v v18, (%0);" ::"r"(b_));
+      const float16_t *b__ = b_ + P;
+
+      float16_t *c__ = c_ + m * P;
+
+      float t0, t1, t2, t3, t4, t5, t6, t7;
+      // Preload column 0 of the eight rows of a (rows are N elements apart)
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__));
+
+      unsigned int n = 0;
+      // NOTE(review): n advances twice per pass and the only exit test is n == N, so N is assumed even
+      while (n < N) {
+        a__ = a_ + ++n;
+
+        asm volatile("vle16.v v20, (%0);" ::"r"(b__));
+        b__ += P;
+        // The first product initializes the accumulators v0..v14; later ones accumulate into them
+        if (n == 1) {
+          asm volatile("vfmul.vf v0, v18, %0" ::"f"(t0));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v2, v18, %0" ::"f"(t1));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v4, v18, %0" ::"f"(t2));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v6, v18, %0" ::"f"(t3));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v8, v18, %0" ::"f"(t4));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v10, v18, %0" ::"f"(t5));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v12, v18, %0" ::"f"(t6));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v14, v18, %0" ::"f"(t7));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__));
+        } else {
+          asm volatile("vfmacc.vf v0, %0, v18" ::"f"(t0));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v2, %0, v18" ::"f"(t1));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v4, %0, v18" ::"f"(t2));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v6, %0, v18" ::"f"(t3));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v8, %0, v18" ::"f"(t4));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v10, %0, v18" ::"f"(t5));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v12, %0, v18" ::"f"(t6));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v14, %0, v18" ::"f"(t7));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__));
+        }
+
+        a__ = a_ + ++n;
+        // The last row of b is already in v20: leave and finish outside the loop
+        if (n == N)
+          break;
+
+        asm volatile("vle16.v v18, (%0);" ::"r"(b__));
+        b__ += P;
+
+        asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v2, %0, v20" ::"f"(t1));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t2));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v6, %0, v20" ::"f"(t3));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t4));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v10, %0, v20" ::"f"(t5));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t6));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v14, %0, v20" ::"f"(t7));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__));
+      }
+      // Final accumulation with the last row of b, then store the eight finished rows of c (P elements apart)
+      asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));
+      asm volatile("vse16.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v2, %0, v20" ::"f"(t1));
+      asm volatile("vse16.v v2, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t2));
+      asm volatile("vse16.v v4, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v6, %0, v20" ::"f"(t3));
+      asm volatile("vse16.v v6, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t4));
+      asm volatile("vse16.v v8, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v10, %0, v20" ::"f"(t5));
+      asm volatile("vse16.v v10, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t6));
+      asm volatile("vse16.v v12, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v14, %0, v20" ::"f"(t7));
+      asm volatile("vse16.v v14, (%0);" ::"r"(c__));
+    }
+    // Advance by the number of columns the hardware granted for this strip
+    p += gvl;
+  }
+}
+
+#endif // _SPATZ_UTILS_H_
diff --git a/spatz/sw/tasks/atomic_remote_task.c b/spatz/sw/tasks/atomic_remote_task.c
new file mode 100644
index 0000000..dd9da10
--- /dev/null
+++ b/spatz/sw/tasks/atomic_remote_task.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * Atomic Remote Task for Spatz
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "spatz_workers.h"
+
+#define DATA_ADDR (SPATZ_DATA)  // Location read to obtain the pointer to atomic_params_t
+#define NUM_ITER       2  // Rounds of increments performed by each hart
+#define NUM_TARGETS    3  // Self + 2 remote tiles
+// Task parameters, read through the pointer found at DATA_ADDR
+typedef struct {
+    uint32_t hartid;        // Index of the calling hart; selects the first target counter
+    uint32_t counter_addr;  // Address polled until it reads NUM_ITER * NUM_TARGETS
+    uint32_t sync_base;     // Counter base: target t lives at sync_base + t * L1_TILE_OFFSET
+} atomic_params_t;
+
+int atomic_remote_task(void) {
+  // The DATA register carries the address of this hart's parameter block.
+  volatile atomic_params_t *cfg = (volatile atomic_params_t *)mmio32(DATA_ADDR);
+  const uint32_t self = cfg->hartid;
+  const uint32_t own_counter = cfg->counter_addr;
+  const uint32_t counters = cfg->sync_base;
+  const uint32_t expected = NUM_ITER * NUM_TARGETS;
+
+  // Every round bumps the counters of tiles self, (self+1)%16 and (self+2)%16.
+  for (int round = 0; round < NUM_ITER; ++round) {
+    for (int k = 0; k < NUM_TARGETS; ++k) {
+      const uint32_t tile = (self + k) % 16;
+      const uint32_t counter = counters + tile * L1_TILE_OFFSET;
+      // Atomic +1 on the target counter; the old value returned in t2 is discarded.
+      asm volatile(
+        "addi t0, %0, 0     \n"
+        "li   t1, 1         \n"
+        "amoadd.w t2, t1, (t0) \n"
+        :
+        : "r"(counter)
+        : "t0", "t1", "t2", "memory"
+      );
+    }
+  }
+
+  // Spin until the word at counter_addr reads NUM_ITER * NUM_TARGETS.
+  while (mmio32(own_counter) != expected) {
+  }
+  return 0;
+}
diff --git a/spatz/sw/tasks/dotp_task.c b/spatz/sw/tasks/dotp_task.c
new file mode 100644
index 0000000..23ae320
--- /dev/null
+++ b/spatz/sw/tasks/dotp_task.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * FP16 Dot Product using RISC-V Vector Extension
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "spatz_workers.h"
+
+// Address of the DATA register, which holds the pointer to the dotp_params_t block
+#define DATA_ADDR (SPATZ_DATA)
+
+// Parameter structure in shared L1 memory; dotp_task casts the 32-bit addresses to pointers
+typedef struct {
+    uint32_t a_addr;        // Address of FP16 vector A
+    uint32_t b_addr;        // Address of FP16 vector B
+    uint32_t result_addr;   // Address where the FP16 scalar result is written
+    uint32_t n_elements;    // Element count passed to fdotp_v16b
+} dotp_params_t;
+
+// Main entry point
+int dotp_task(void) {
+    // The DATA register holds the address of the dotp_params_t block.
+    volatile dotp_params_t *cfg = (volatile dotp_params_t *)mmio32(DATA_ADDR);
+
+    // Turn the 32-bit addresses into typed pointers. The result slot is
+    // addressed as FP16 because that is the width actually stored below.
+    float16_t *vec_a = (float16_t *)cfg->a_addr;
+    float16_t *vec_b = (float16_t *)cfg->b_addr;
+    float16_t *out = (float16_t *)cfg->result_addr;
+    const uint32_t count = cfg->n_elements;
+
+    // Run the FP16 dot-product kernel over both vectors.
+    const float16_t dot = fdotp_v16b(vec_a, vec_b, count);
+
+    // Publish the scalar at the caller-provided result address.
+    *out = dot;
+
+    return 0;
+}
diff --git a/spatz/sw/tasks/hello_task.c b/spatz/sw/tasks/hello_task.c
new file mode 100644
index 0000000..e80334e
--- /dev/null
+++ b/spatz/sw/tasks/hello_task.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * Simple Hello World Task for Spatz
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "spatz_workers.h"
+
+#define DATA_ADDR (SPATZ_DATA)  // Location read to obtain the pointer to hello_params_t
+
+// Parameter structure handed to hello_task through the DATA register
+typedef struct {
+    uint32_t tile_id;  // Tile number printed in the greeting
+} hello_params_t;
+
+// Main entry point
+int hello_task(void) {
+    // Prints one greeting containing the tile id from the parameter block; always returns 0.
+    // Read parameter structure pointer from DATA register
+    uint32_t params_ptr = mmio32(DATA_ADDR);
+    volatile hello_params_t *params = (volatile hello_params_t *)params_ptr;
+
+    uint32_t tile_id = params->tile_id;
+
+    // %d consumes an int: convert explicitly instead of pushing a uint32_t through varargs.
+    printf("*** Hello from Spatz task on Tile %d! ***\n", (int)tile_id);
+
+    return 0;
+}
diff --git a/spatz/sw/tasks/hello_world_simple_task.c b/spatz/sw/tasks/hello_world_simple_task.c
new file mode 100644
index 0000000..54a38f7
--- /dev/null
+++ b/spatz/sw/tasks/hello_world_simple_task.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * Hello World Task for Spatz_cc
+ */
+#include "magia_tile_utils.h"
+// Minimal task: prints one fixed greeting line and returns 0.
+int hello_world_simple_task(void) {
+    printf("[SNITCH] Hello World from Spatz!\n");  // printf presumably comes from magia_tile_utils.h - TODO confirm
+    return 0;  // Constant result; nothing in this task can fail
+}
diff --git a/spatz/sw/tasks/idma_simple_task.c b/spatz/sw/tasks/idma_simple_task.c
new file mode 100644
index 0000000..c3120aa
--- /dev/null
+++ b/spatz/sw/tasks/idma_simple_task.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * iDMA Task for MAGIA Tile - Demonstrates iDMA Transfers with Event Unit controlled by Spatz_cc
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+#include "event_unit_utils.h"
+#include "idma_mm_utils.h"
+
+#define L1_DMA_SRC  ((volatile uint32_t *)(L1_BASE + 0x1000))  // Source words written by the task before the L1->L2 copy
+#define L1_DMA_DST  ((volatile uint32_t *)(L1_BASE + 0x2000))  // Destination of the L2->L1 copy, checked at the end
+#define L2_DMA_BUF  ((volatile uint32_t *)(L2_BASE + 0x1000))  // Intermediate buffer used by both transfers
+#define DMA_SIZE    64  // 64 bytes = 16 words
+// Round-trips DMA_SIZE bytes L1 -> L2 -> L1 through the iDMA; returns the number of failed checks (0 = all passed).
+int idma_simple_task(void) {
+    printf("[SNITCH] \n========================================\n");
+    printf("[SNITCH] IDMA TASK: DMA Transfers + Event Unit\n");
+    printf("[SNITCH] ========================================\n");
+    int errors = 0;  // missed completion events + mismatching words
+
+    // Enable iDMA events (keep EU_SPATZ_DONE enabled for CV32)
+    printf("[SNITCH] Enabling iDMA events...\n");
+    eu_enable_events(EU_IDMA_ALL_DONE_MASK);
+
+    // 1. Test iDMA transfer L1 -> L2
+    printf("[SNITCH] Testing iDMA L1 to L2 transfer...\n");
+
+    // Initialize source data in L1 with a recognizable per-word pattern
+    for (int i = 0; i < (DMA_SIZE/4); i++) {
+        L1_DMA_SRC[i] = 0xA0000000 + i;
+    }
+
+    // Launch DMA transfer (L1 -> L2); the return value is not needed here
+    (void)idma_L1ToL2((uint32_t)L1_DMA_SRC, (uint32_t)L2_DMA_BUF, DMA_SIZE);
+
+    // Poll for O2A completion; a result without the DONE bit is reported as a timeout
+    uint32_t events = eu_idma_wait_o2a_completion(EU_WAIT_MODE_POLLING);
+
+    if (events & EU_IDMA_O2A_DONE_MASK) {
+        printf("[SNITCH] DMA L1 to L2 complete (O2A event detected)\n");
+    } else {
+        printf("[SNITCH] DMA L1 to L2 TIMEOUT\n");
+        errors++;
+    }
+
+    // 2. Test iDMA transfer L2 -> L1
+    printf("[SNITCH] Testing iDMA L2 to L1 transfer...\n");
+
+    // Clear destination so stale data cannot pass the check below
+    for (int i = 0; i < (DMA_SIZE/4); i++) {
+        L1_DMA_DST[i] = 0;
+    }
+    // Launch DMA transfer (L2 -> L1)
+    (void)idma_L2ToL1((uint32_t)L2_DMA_BUF, (uint32_t)L1_DMA_DST, DMA_SIZE);
+
+    // Poll for A2O completion; a result without the DONE bit is reported as a timeout
+    events = eu_idma_wait_a2o_completion(EU_WAIT_MODE_POLLING);
+
+    if (events & EU_IDMA_A2O_DONE_MASK) {
+        printf("[SNITCH] DMA L2 to L1 complete (A2O event detected)\n");
+    } else {
+        printf("[SNITCH] DMA L2 to L1 TIMEOUT\n");
+        errors++;
+    }
+
+    // Verify transferred data (using mmio32): every word must match the source pattern
+    printf("[SNITCH] Verifying transferred data...\n");
+    for (int i = 0; i < (DMA_SIZE/4); i++) {
+        uint32_t expected = 0xA0000000 + i;
+        uint32_t computed = mmio32((uint32_t)L1_DMA_DST + 4*i);
+        if (computed != expected) {
+            printf("[SNITCH] [%d] FAIL: computed=0x%08x, expected=0x%08x\n",
+                   i, computed, expected);
+            errors++;
+        }
+    }
+    if (errors == 0) printf("[SNITCH] Data verification: OK\n");
+    return errors;  // non-zero reports failed checks to the caller instead of hiding them
+}
diff --git a/spatz/sw/tasks/instructions_sweep_simple_task.c b/spatz/sw/tasks/instructions_sweep_simple_task.c
new file mode 100644
index 0000000..d042ca3
--- /dev/null
+++ b/spatz/sw/tasks/instructions_sweep_simple_task.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * Instructions Sweep Task for Spatz_cc
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+
+#define VEC_SIZE 16  // vl for the e32 sections; the e16 and e8 data loops use VEC_SIZE*2 and VEC_SIZE*4 elements
+
+// Memory layout in L1: three vector buffers spaced 0x200 bytes apart, followed by a scalar slot
+#define VEC_A_BASE   (L1_BASE + 0x00005000)
+#define VEC_B_BASE   (L1_BASE + 0x00005200)
+#define VEC_C_BASE   (L1_BASE + 0x00005400)
+#define SCALAR_BASE  (L1_BASE + 0x00005600)
+// Float views of the regions above (the integer sections re-cast the same base addresses)
+#define vec_a ((volatile float *)VEC_A_BASE)
+#define vec_b ((volatile float *)VEC_B_BASE)
+#define vec_c ((volatile float *)VEC_C_BASE)
+#define scalar_mem ((volatile float *)SCALAR_BASE)
+
+int instructions_sweep_simple_task(void) {
+    // Smoke test: issues each listed instruction once and prints its mnemonic; results are not checked, returns 0.
+    printf("[SNITCH] 32-bit Instructions Test - Spatz Full Coverage\n");
+    printf("[SNITCH] Testing: RVF, Vector Int/FP (e8/e16/e32)\n");
+    printf("[SNITCH] Using L1 memory at 0x%08x\n", L1_BASE);
+    printf("[SNITCH] ========================================\n");
+    
+    float a_f, b_f, c_f, result_f;
+    int32_t iresult;
+    
+    // ==================== SCALAR RVF (Single Precision) ====================
+    printf("[SNITCH] Testing Scalar RVF Instructions:\n");
+    a_f = 3.5f; b_f = 2.0f; c_f = 1.5f;
+    
+    // Arithmetic
+    asm volatile("fadd.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf("  FADD.S\n");
+    asm volatile("fsub.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf("  FSUB.S\n");
+    asm volatile("fmul.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf("  FMUL.S\n");
+    asm volatile("fmin.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf("  FMIN.S\n");
+    asm volatile("fmax.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf("  FMAX.S\n");
+    
+    // Sign injection
+    asm volatile("fsgnj.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf("  FSGNJ.S\n");
+    asm volatile("fsgnjn.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf("  FSGNJN.S\n");
+    asm volatile("fsgnjx.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf("  FSGNJX.S\n");
+    
+    // Comparisons
+    asm volatile("feq.s %0, %1, %2" : "=r"(iresult) : "f"(a_f), "f"(b_f)); printf("  FEQ.S\n");
+    asm volatile("flt.s %0, %1, %2" : "=r"(iresult) : "f"(a_f), "f"(b_f)); printf("  FLT.S\n");
+    asm volatile("fle.s %0, %1, %2" : "=r"(iresult) : "f"(a_f), "f"(b_f)); printf("  FLE.S\n");
+    asm volatile("fclass.s %0, %1" : "=r"(iresult) : "f"(a_f)); printf("  FCLASS.S\n");
+    
+    // FMA
+    asm volatile("fmadd.s %0, %1, %2, %3" : "=f"(result_f) : "f"(a_f), "f"(b_f), "f"(c_f)); printf("  FMADD.S\n");
+    asm volatile("fmsub.s %0, %1, %2, %3" : "=f"(result_f) : "f"(a_f), "f"(b_f), "f"(c_f)); printf("  FMSUB.S\n");
+    asm volatile("fnmadd.s %0, %1, %2, %3" : "=f"(result_f) : "f"(a_f), "f"(b_f), "f"(c_f)); printf("  FNMADD.S\n");
+    asm volatile("fnmsub.s %0, %1, %2, %3" : "=f"(result_f) : "f"(a_f), "f"(b_f), "f"(c_f)); printf("  FNMSUB.S\n");
+    
+    // Conversions
+    asm volatile("fcvt.w.s %0, %1, rtz" : "=r"(iresult) : "f"(a_f)); printf("  FCVT.W.S\n");
+    asm volatile("fcvt.wu.s %0, %1, rtz" : "=r"(iresult) : "f"(a_f)); printf("  FCVT.WU.S\n");
+    asm volatile("fcvt.s.w %0, %1" : "=f"(result_f) : "r"(iresult)); printf("  FCVT.S.W\n");
+    asm volatile("fcvt.s.wu %0, %1" : "=f"(result_f) : "r"(iresult)); printf("  FCVT.S.WU\n");
+    
+    // Load/Store
+    scalar_mem[0] = 42.0f;
+    asm volatile("flw %0, 0(%1)" : "=f"(result_f) : "r"(scalar_mem)); printf("  FLW\n");
+    asm volatile("fsw %0, 0(%1)" :: "f"(result_f), "r"(scalar_mem)); printf("  FSW\n");
+    
+    
+    // ==================== SCALAR M-EXTENSION (Integer Mul/Div) ====================
+    printf("[SNITCH] Testing Scalar M-Extension Instructions:\n");
+    int32_t a_i = 15, b_i = 4, result_i;
+    
+    asm volatile("mul %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf("  MUL\n");
+    asm volatile("mulh %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf("  MULH\n");
+    asm volatile("mulhu %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf("  MULHU\n");
+    asm volatile("mulhsu %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf("  MULHSU\n");
+    asm volatile("div %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf("  DIV\n");
+    asm volatile("divu %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf("  DIVU\n");
+    asm volatile("rem %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf("  REM\n");
+    asm volatile("remu %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf("  REMU\n");
+    
+    // ==================== VECTOR INTEGER (e32) ====================
+    printf("[SNITCH] Testing Vector Integer Instructions (e32):\n");
+    
+    volatile int32_t *vec_ia = (volatile int32_t *)VEC_A_BASE;
+    volatile int32_t *vec_ib = (volatile int32_t *)VEC_B_BASE;
+    volatile int32_t *vec_ic = (volatile int32_t *)VEC_C_BASE;
+    
+    for (int i = 0; i < VEC_SIZE; i++) {
+        vec_ia[i] = i + 1;
+        vec_ib[i] = i * 2;
+    }
+    
+    asm volatile("vsetvli zero, %0, e32, m4, ta, ma" ::"r"(VEC_SIZE)); printf("  VSETVLI (e32)\n");
+    asm volatile("vle32.v v0, (%0)" ::"r"(vec_ia)); printf("  VLE32.V\n");
+    asm volatile("vle32.v v4, (%0)" ::"r"(vec_ib)); printf("  VLE32.V\n");
+    
+    // Arithmetic
+    asm volatile("vadd.vv v8, v0, v4"); printf("  VADD.VV\n");
+    asm volatile("vsub.vv v8, v0, v4"); printf("  VSUB.VV\n");
+    asm volatile("vmul.vv v8, v0, v4"); printf("  VMUL.VV\n");
+    asm volatile("vdivu.vv v8, v0, v4"); printf("  VDIVU.VV\n");
+    asm volatile("vdiv.vv v8, v0, v4"); printf("  VDIV.VV\n");
+    asm volatile("vremu.vv v8, v0, v4"); printf("  VREMU.VV\n");
+    asm volatile("vrem.vv v8, v0, v4"); printf("  VREM.VV\n");
+    
+    int32_t scalar_i = 5;
+    asm volatile("vadd.vx v8, v0, %0" ::"r"(scalar_i)); printf("  VADD.VX\n");
+    asm volatile("vsub.vx v8, v0, %0" ::"r"(scalar_i)); printf("  VSUB.VX\n");
+    asm volatile("vrsub.vx v8, v0, %0" ::"r"(scalar_i)); printf("  VRSUB.VX\n");
+    asm volatile("vmul.vx v8, v0, %0" ::"r"(scalar_i)); printf("  VMUL.VX\n");
+    
+    // Logical
+    asm volatile("vand.vv v8, v0, v4"); printf("  VAND.VV\n");
+    asm volatile("vor.vv v8, v0, v4"); printf("  VOR.VV\n");
+    asm volatile("vxor.vv v8, v0, v4"); printf("  VXOR.VV\n");
+    asm volatile("vand.vx v8, v0, %0" ::"r"(scalar_i)); printf("  VAND.VX\n");
+    asm volatile("vor.vx v8, v0, %0" ::"r"(scalar_i)); printf("  VOR.VX\n");
+    asm volatile("vxor.vx v8, v0, %0" ::"r"(scalar_i)); printf("  VXOR.VX\n");
+    
+    // Shifts
+    asm volatile("vsll.vv v8, v0, v4"); printf("  VSLL.VV\n");
+    asm volatile("vsrl.vv v8, v0, v4"); printf("  VSRL.VV\n");
+    asm volatile("vsra.vv v8, v0, v4"); printf("  VSRA.VV\n");
+    asm volatile("vsll.vx v8, v0, %0" ::"r"(scalar_i)); printf("  VSLL.VX\n");
+    asm volatile("vsrl.vx v8, v0, %0" ::"r"(scalar_i)); printf("  VSRL.VX\n");
+    asm volatile("vsra.vx v8, v0, %0" ::"r"(scalar_i)); printf("  VSRA.VX\n");
+    
+    // Min/Max
+    asm volatile("vmin.vv v8, v0, v4"); printf("  VMIN.VV\n");
+    asm volatile("vmax.vv v8, v0, v4"); printf("  VMAX.VV\n");
+    asm volatile("vminu.vv v8, v0, v4"); printf("  VMINU.VV\n");
+    asm volatile("vmaxu.vv v8, v0, v4"); printf("  VMAXU.VV\n");
+    
+    // High multiply
+    asm volatile("vmulh.vv v8, v0, v4"); printf("  VMULH.VV\n");
+    asm volatile("vmulhu.vv v8, v0, v4"); printf("  VMULHU.VV\n");
+    asm volatile("vmulhsu.vv v8, v0, v4"); printf("  VMULHSU.VV\n");
+    asm volatile("vmulh.vx v8, v0, %0" ::"r"(scalar_i)); printf("  VMULH.VX\n");
+    
+    // MAC
+    asm volatile("vmacc.vv v8, v0, v4"); printf("  VMACC.VV\n");
+    asm volatile("vnmsac.vv v8, v0, v4"); printf("  VNMSAC.VV\n");
+    asm volatile("vmadd.vv v8, v0, v4"); printf("  VMADD.VV\n");
+    asm volatile("vnmsub.vv v8, v0, v4"); printf("  VNMSUB.VV\n");
+    
+    // Carry/Borrow (the trailing v0 of the .vvm forms is the carry-in/borrow-in mask operand)
+    asm volatile("vadc.vvm v8, v0, v4, v0"); printf("  VADC.VVM\n");
+    asm volatile("vmadc.vv v0, v4, v8"); printf("  VMADC.VV\n");
+    asm volatile("vsbc.vvm v8, v0, v4, v0"); printf("  VSBC.VVM\n");
+    asm volatile("vmsbc.vv v0, v4, v8"); printf("  VMSBC.VV\n");
+    
+    // Comparisons (mask operations)
+    asm volatile("vmseq.vv v0, v4, v8"); printf("  VMSEQ.VV\n");
+    asm volatile("vmsne.vv v0, v4, v8"); printf("  VMSNE.VV\n");
+    asm volatile("vmslt.vv v0, v4, v8"); printf("  VMSLT.VV\n");
+    asm volatile("vmsltu.vv v0, v4, v8"); printf("  VMSLTU.VV\n");
+    asm volatile("vmsle.vv v0, v4, v8"); printf("  VMSLE.VV\n");
+    asm volatile("vmsleu.vv v0, v4, v8"); printf("  VMSLEU.VV\n");
+    asm volatile("vmsltu.vx v0, v4, %0" ::"r"(scalar_i)); printf("  VMSLTU.VX\n");
+    asm volatile("vmslt.vx v0, v4, %0" ::"r"(scalar_i)); printf("  VMSLT.VX\n");
+    
+    // Integer immediate operations
+    asm volatile("vadd.vi v8, v0, 5"); printf("  VADD.VI\n");
+    asm volatile("vrsub.vi v8, v0, 10"); printf("  VRSUB.VI\n");
+    asm volatile("vand.vi v8, v0, 15"); printf("  VAND.VI\n");
+    asm volatile("vor.vi v8, v0, 7"); printf("  VOR.VI\n");
+    asm volatile("vxor.vi v8, v0, 3"); printf("  VXOR.VI\n");
+    asm volatile("vsll.vi v8, v0, 2"); printf("  VSLL.VI\n");
+    asm volatile("vsrl.vi v8, v0, 2"); printf("  VSRL.VI\n");
+    asm volatile("vsra.vi v8, v0, 2"); printf("  VSRA.VI\n");
+    asm volatile("vmseq.vi v0, v4, 5"); printf("  VMSEQ.VI\n");
+    asm volatile("vmsne.vi v0, v4, 5"); printf("  VMSNE.VI\n");
+    asm volatile("vmsleu.vi v0, v4, 10"); printf("  VMSLEU.VI\n");
+    asm volatile("vmsle.vi v0, v4, 10"); printf("  VMSLE.VI\n");
+    asm volatile("vmsgtu.vi v0, v4, 3"); printf("  VMSGTU.VI\n");
+    asm volatile("vmsgt.vi v0, v4, 3"); printf("  VMSGT.VI\n");
+    
+    // Reductions
+    asm volatile("vredsum.vs v8, v0, v4"); printf("  VREDSUM.VS\n");
+    asm volatile("vredand.vs v8, v0, v4"); printf("  VREDAND.VS\n");
+    asm volatile("vredor.vs v8, v0, v4"); printf("  VREDOR.VS\n");
+    asm volatile("vredxor.vs v8, v0, v4"); printf("  VREDXOR.VS\n");
+    asm volatile("vredmin.vs v8, v0, v4"); printf("  VREDMIN.VS\n");
+    asm volatile("vredminu.vs v8, v0, v4"); printf("  VREDMINU.VS\n");
+    asm volatile("vredmax.vs v8, v0, v4"); printf("  VREDMAX.VS\n");
+    asm volatile("vredmaxu.vs v8, v0, v4"); printf("  VREDMAXU.VS\n");
+    
+    // Move
+    asm volatile("vmv.v.v v8, v0"); printf("  VMV.V.V\n");
+    asm volatile("vmv.v.x v8, %0" ::"r"(scalar_i)); printf("  VMV.V.X\n");
+    asm volatile("vmv.s.x v8, %0" ::"r"(scalar_i)); printf("  VMV.S.X\n");
+    asm volatile("vmv.x.s %0, v0" :"=r"(result_i)); printf("  VMV.X.S\n");
+    
+    // Slide
+    asm volatile("vslideup.vx v8, v0, %0" ::"r"(2)); printf("  VSLIDEUP.VX\n");
+    asm volatile("vslidedown.vx v8, v0, %0" ::"r"(2)); printf("  VSLIDEDOWN.VX\n");
+    asm volatile("vslide1up.vx v8, v0, %0" ::"r"(scalar_i)); printf("  VSLIDE1UP.VX\n");
+    asm volatile("vslide1down.vx v8, v0, %0" ::"r"(scalar_i)); printf("  VSLIDE1DOWN.VX\n");
+    
+    // Widening (e16->e32)
+    asm volatile("vsetvli zero, %0, e16, m2, ta, ma" ::"r"(VEC_SIZE*2));
+    volatile int16_t *vec_i16a = (volatile int16_t *)VEC_A_BASE;
+    volatile int16_t *vec_i16b = (volatile int16_t *)VEC_B_BASE;
+    for (int i = 0; i < VEC_SIZE*2; i++) {
+        vec_i16a[i] = i + 1;
+        vec_i16b[i] = i * 2;
+    }
+    asm volatile("vle16.v v0, (%0)" ::"r"(vec_i16a));
+    asm volatile("vle16.v v2, (%0)" ::"r"(vec_i16b));
+    asm volatile("vwadd.vv v8, v0, v2"); printf("  VWADD.VV (e16->e32)\n");
+    asm volatile("vwaddu.vv v8, v0, v2"); printf("  VWADDU.VV (e16->e32)\n");
+    asm volatile("vwsub.vv v8, v0, v2"); printf("  VWSUB.VV (e16->e32)\n");
+    asm volatile("vwsubu.vv v8, v0, v2"); printf("  VWSUBU.VV (e16->e32)\n");
+    asm volatile("vwmul.vv v8, v0, v2"); printf("  VWMUL.VV (e16->e32)\n");
+    asm volatile("vwmulu.vv v8, v0, v2"); printf("  VWMULU.VV (e16->e32)\n");
+    asm volatile("vwmulsu.vv v8, v0, v2"); printf("  VWMULSU.VV (e16->e32)\n");
+    
+    // Widening with scalar (e16->e32)
+    int16_t scalar_i16 = 3;
+    asm volatile("vwadd.vx v8, v0, %0" ::"r"(scalar_i16)); printf("  VWADD.VX (e16->e32)\n");
+    asm volatile("vwaddu.vx v8, v0, %0" ::"r"(scalar_i16)); printf("  VWADDU.VX (e16->e32)\n");
+    asm volatile("vwsub.vx v8, v0, %0" ::"r"(scalar_i16)); printf("  VWSUB.VX (e16->e32)\n");
+    asm volatile("vwsubu.vx v8, v0, %0" ::"r"(scalar_i16)); printf("  VWSUBU.VX (e16->e32)\n");
+    asm volatile("vwmul.vx v8, v0, %0" ::"r"(scalar_i16)); printf("  VWMUL.VX (e16->e32)\n");
+    asm volatile("vwmulu.vx v8, v0, %0" ::"r"(scalar_i16)); printf("  VWMULU.VX (e16->e32)\n");
+    asm volatile("vwmulsu.vx v8, v0, %0" ::"r"(scalar_i16)); printf("  VWMULSU.VX (e16->e32)\n");
+    
+    // Widening MAC (e16->e32)
+    asm volatile("vwmacc.vv v8, v0, v2"); printf("  VWMACC.VV (e16->e32)\n");
+    asm volatile("vwmaccu.vv v8, v0, v2"); printf("  VWMACCU.VV (e16->e32)\n");
+    asm volatile("vwmaccsu.vv v8, v0, v2"); printf("  VWMACCSU.VV (e16->e32)\n");
+    
+    // Widening MAC with scalar (e16->e32)
+    asm volatile("vwmacc.vx v8, %0, v2" ::"r"(scalar_i16)); printf("  VWMACC.VX (e16->e32)\n");
+    asm volatile("vwmaccu.vx v8, %0, v2" ::"r"(scalar_i16)); printf("  VWMACCU.VX (e16->e32)\n");
+    asm volatile("vwmaccsu.vx v8, %0, v2" ::"r"(scalar_i16)); printf("  VWMACCSU.VX (e16->e32)\n");
+    asm volatile("vwmaccus.vx v8, %0, v2" ::"r"(scalar_i16)); printf("  VWMACCUS.VX (e16->e32)\n");
+    
+    // Note: Integer narrowing (vnsrl/vnsra) requires ELEN=64 (RVD extension)
+    // Skipped for 32-bit only configuration
+    
+    // Load/Store
+    asm volatile("vsetvli zero, %0, e32, m4, ta, ma" ::"r"(VEC_SIZE));
+    asm volatile("vse32.v v0, (%0)" ::"r"(vec_ic)); printf("  VSE32.V\n");
+    asm volatile("vle32.v v4, (%0)" ::"r"(vec_ic)); printf("  VLE32.V\n");
+    asm volatile("vlse32.v v4, (%0), %1" ::"r"(vec_ic), "r"(8)); printf("  VLSE32.V (strided)\n");
+    asm volatile("vsse32.v v4, (%0), %1" ::"r"(vec_ic), "r"(8)); printf("  VSSE32.V (strided)\n");
+    
+    // Indexed load/store (v0 holds the byte offsets i*4 written to vec_ia just below)
+    for (int i = 0; i < VEC_SIZE; i++) vec_ia[i] = i * 4;
+    asm volatile("vle32.v v0, (%0)" ::"r"(vec_ia));
+    asm volatile("vluxei32.v v4, (%0), v0" ::"r"(vec_ib)); printf("  VLUXEI32.V (indexed)\n");
+    asm volatile("vsuxei32.v v4, (%0), v0" ::"r"(vec_ic)); printf("  VSUXEI32.V (indexed)\n");
+    
+    // ==================== VECTOR INTEGER (e16) ====================
+    printf("[SNITCH] Testing Vector Integer Instructions (e16):\n");
+    asm volatile("vsetvli zero, %0, e16, m2, ta, ma" ::"r"(VEC_SIZE*2));
+    for (int i = 0; i < VEC_SIZE*2; i++) {
+        vec_i16a[i] = i + 1;
+        vec_i16b[i] = i * 2;
+    }
+    asm volatile("vle16.v v0, (%0)" ::"r"(vec_i16a)); printf("  VLE16.V\n");
+    asm volatile("vle16.v v2, (%0)" ::"r"(vec_i16b));
+    asm volatile("vadd.vv v4, v0, v2"); printf("  VADD.VV (e16)\n");
+    asm volatile("vmul.vv v4, v0, v2"); printf("  VMUL.VV (e16)\n");
+    asm volatile("vse16.v v4, (%0)" ::"r"(vec_i16a)); printf("  VSE16.V\n");
+    
+    // ==================== VECTOR INTEGER (e8) ====================
+    printf("[SNITCH] Testing Vector Integer Instructions (e8):\n");
+    asm volatile("vsetvli zero, %0, e8, m2, ta, ma" ::"r"(VEC_SIZE*4));
+    volatile int8_t *vec_i8a = (volatile int8_t *)VEC_A_BASE;
+    volatile int8_t *vec_i8b = (volatile int8_t *)VEC_B_BASE;
+    for (int i = 0; i < VEC_SIZE*4; i++) {
+        vec_i8a[i] = i + 1;
+        vec_i8b[i] = i * 2;
+    }
+    asm volatile("vle8.v v0, (%0)" ::"r"(vec_i8a)); printf("  VLE8.V\n");
+    asm volatile("vle8.v v2, (%0)" ::"r"(vec_i8b));
+    asm volatile("vadd.vv v4, v0, v2"); printf("  VADD.VV (e8)\n");
+    asm volatile("vmul.vv v4, v0, v2"); printf("  VMUL.VV (e8)\n");
+    asm volatile("vse8.v v4, (%0)" ::"r"(vec_i8a)); printf("  VSE8.V\n");
+    
+    // Reset to e32 for FP tests
+    asm volatile("vsetvli zero, %0, e32, m4, ta, ma" ::"r"(VEC_SIZE));
+    
+    // ==================== VECTOR FP (e32 - single precision) ====================
+    printf("[SNITCH] Testing Vector FP Instructions (e32):\n");
+    
+    // Reuse the same L1 buffers
+    for (int i = 0; i < VEC_SIZE; i++) {
+        vec_a[i] = (float)(i + 1);
+        vec_b[i] = (float)(i * 2);
+    }
+    
+    asm volatile("vle32.v v0, (%0)" ::"r"(vec_a));
+    asm volatile("vle32.v v4, (%0)" ::"r"(vec_b));
+    
+    // Arithmetic VV
+    asm volatile("vfadd.vv v8, v0, v4"); printf("  VFADD.VV (e32)\n");
+    asm volatile("vfsub.vv v8, v0, v4"); printf("  VFSUB.VV (e32)\n");
+    asm volatile("vfmul.vv v8, v0, v4"); printf("  VFMUL.VV (e32)\n");
+    
+    // Arithmetic VF
+    float scalar_f = 2.5f;
+    asm volatile("vfadd.vf v8, v0, %0" ::"f"(scalar_f)); printf("  VFADD.VF (e32)\n");
+    asm volatile("vfsub.vf v8, v0, %0" ::"f"(scalar_f)); printf("  VFSUB.VF (e32)\n");
+    asm volatile("vfrsub.vf v8, v0, %0" ::"f"(scalar_f)); printf("  VFRSUB.VF (e32)\n");
+    asm volatile("vfmul.vf v8, v0, %0" ::"f"(scalar_f)); printf("  VFMUL.VF (e32)\n");
+    
+    // Min/Max
+    asm volatile("vfmin.vv v8, v0, v4"); printf("  VFMIN.VV (e32)\n");
+    asm volatile("vfmax.vv v8, v0, v4"); printf("  VFMAX.VV (e32)\n");
+    asm volatile("vfmin.vf v8, v0, %0" ::"f"(scalar_f)); printf("  VFMIN.VF (e32)\n");
+    asm volatile("vfmax.vf v8, v0, %0" ::"f"(scalar_f)); printf("  VFMAX.VF (e32)\n");
+    
+    // Sign injection
+    asm volatile("vfsgnj.vv v8, v0, v4"); printf("  VFSGNJ.VV (e32)\n");
+    asm volatile("vfsgnjn.vv v8, v0, v4"); printf("  VFSGNJN.VV (e32)\n");
+    asm volatile("vfsgnjx.vv v8, v0, v4"); printf("  VFSGNJX.VV (e32)\n");
+    asm volatile("vfsgnj.vf v8, v0, %0" ::"f"(scalar_f)); printf("  VFSGNJ.VF (e32)\n");
+    asm volatile("vfsgnjn.vf v8, v0, %0" ::"f"(scalar_f)); printf("  VFSGNJN.VF (e32)\n");
+    asm volatile("vfsgnjx.vf v8, v0, %0" ::"f"(scalar_f)); printf("  VFSGNJX.VF (e32)\n");
+    
+    // FMA (the destination v0 is overwritten by each variant, so it is reloaded from vec_a in between)
+    asm volatile("vle32.v v8, (%0)" ::"r"(vec_b));
+    asm volatile("vfmadd.vv v0, v4, v8"); printf("  VFMADD.VV (e32)\n");
+    asm volatile("vle32.v v0, (%0)" ::"r"(vec_a));
+    asm volatile("vfmsub.vv v0, v4, v8"); printf("  VFMSUB.VV (e32)\n");
+    asm volatile("vle32.v v0, (%0)" ::"r"(vec_a));
+    asm volatile("vfnmadd.vv v0, v4, v8"); printf("  VFNMADD.VV (e32)\n");
+    asm volatile("vle32.v v0, (%0)" ::"r"(vec_a));
+    asm volatile("vfnmsub.vv v0, v4, v8"); printf("  VFNMSUB.VV (e32)\n");
+    
+    // MACC (the accumulator v8 is reloaded from vec_b before each variant)
+    asm volatile("vle32.v v0, (%0)" ::"r"(vec_a));
+    asm volatile("vle32.v v8, (%0)" ::"r"(vec_b));
+    asm volatile("vfmacc.vv v8, v0, v4"); printf("  VFMACC.VV (e32)\n");
+    asm volatile("vle32.v v8, (%0)" ::"r"(vec_b));
+    asm volatile("vfmsac.vv v8, v0, v4"); printf("  VFMSAC.VV (e32)\n");
+    asm volatile("vle32.v v8, (%0)" ::"r"(vec_b));
+    asm volatile("vfnmacc.vv v8, v0, v4"); printf("  VFNMACC.VV (e32)\n");
+    asm volatile("vle32.v v8, (%0)" ::"r"(vec_b));
+    asm volatile("vfnmsac.vv v8, v0, v4"); printf("  VFNMSAC.VV (e32)\n");
+    
+    // Reductions
+    asm volatile("vle32.v v0, (%0)" ::"r"(vec_a));
+    asm volatile("vfmv.s.f v12, %0" ::"f"(0.0f));
+    asm volatile("vfredosum.vs v12, v0, v12"); printf("  VFREDOSUM.VS (e32)\n");
+    asm volatile("vfredusum.vs v12, v0, v12"); printf("  VFREDUSUM.VS (e32)\n");
+    asm volatile("vfredmin.vs v12, v0, v12"); printf("  VFREDMIN.VS (e32)\n");
+    asm volatile("vfredmax.vs v12, v0, v12"); printf("  VFREDMAX.VS (e32)\n");
+    
+    // Conversions
+    asm volatile("vfcvt.xu.f.v v8, v0"); printf("  VFCVT.XU.F.V (e32)\n");
+    asm volatile("vfcvt.x.f.v v8, v0"); printf("  VFCVT.X.F.V (e32)\n");
+    asm volatile("vfcvt.rtz.xu.f.v v8, v0"); printf("  VFCVT.RTZ.XU.F.V (e32)\n");
+    asm volatile("vfcvt.rtz.x.f.v v8, v0"); printf("  VFCVT.RTZ.X.F.V (e32)\n");
+    asm volatile("vfcvt.f.xu.v v0, v8"); printf("  VFCVT.F.XU.V (e32)\n");
+    asm volatile("vfcvt.f.x.v v0, v8"); printf("  VFCVT.F.X.V (e32)\n");
+    
+    // Move/Slide
+    asm volatile("vfmv.v.f v8, %0" ::"f"(scalar_f)); printf("  VFMV.V.F (e32)\n");
+    asm volatile("vfmv.s.f v8, %0" ::"f"(scalar_f)); printf("  VFMV.S.F (e32)\n");
+    asm volatile("vfmv.f.s %0, v0" :"=f"(result_f)); printf("  VFMV.F.S (e32)\n");
+    
+    // Reload fresh data for slide operations
+    asm volatile("vle32.v v0, (%0)" ::"r"(vec_a));
+    asm volatile("vfslide1up.vf v8, v0, %0" ::"f"(scalar_f)); printf("  VFSLIDE1UP.VF (e32)\n");
+    asm volatile("vfslide1down.vf v8, v0, %0" ::"f"(scalar_f)); printf("  VFSLIDE1DOWN.VF (e32)\n");
+
+    printf("  [SKIPPED] Vector FP comparisons and VFCLASS.V not supported\n");
+    
+    // ==================== VECTOR FP (e16 - half precision) ====================
+    printf("[SNITCH] Testing Vector FP Instructions (e16):\n");
+    asm volatile("vsetvli zero, %0, e16, m2, ta, ma" ::"r"(VEC_SIZE*2));
+    volatile __fp16 *vec_h = (volatile __fp16 *)VEC_A_BASE;
+    for (int i = 0; i < VEC_SIZE*2; i++) vec_h[i] = (__fp16)(i + 1);
+    asm volatile("vle16.v v0, (%0)" ::"r"(vec_h));
+    asm volatile("vle16.v v2, (%0)" ::"r"(vec_h));
+    asm volatile("vfadd.vv v4, v0, v2"); printf("  VFADD.VV (e16)\n");
+    asm volatile("vfmul.vv v4, v0, v2"); printf("  VFMUL.VV (e16)\n");
+    asm volatile("vfmacc.vv v4, v0, v2"); printf("  VFMACC.VV (e16)\n");
+    
+    // Widening FP16->FP32
+    asm volatile("vfwadd.vv v8, v0, v2"); printf("  VFWADD.VV (e16->e32)\n");
+    asm volatile("vfwsub.vv v8, v0, v2"); printf("  VFWSUB.VV (e16->e32)\n");
+    asm volatile("vfwmul.vv v8, v0, v2"); printf("  VFWMUL.VV (e16->e32)\n");
+    asm volatile("vfwmacc.vv v8, v0, v2"); printf("  VFWMACC.VV (e16->e32)\n");
+    asm volatile("vfwnmacc.vv v8, v0, v2"); printf("  VFWNMACC.VV (e16->e32)\n");
+    asm volatile("vfwmsac.vv v8, v0, v2"); printf("  VFWMSAC.VV (e16->e32)\n");
+    asm volatile("vfwnmsac.vv v8, v0, v2"); printf("  VFWNMSAC.VV (e16->e32)\n");
+    
+    // Narrowing FP32->FP16: the source group is loaded as e32/m4 with vl = VEC_SIZE, and the
+    // conversion keeps vl = VEC_SIZE so that only the elements actually loaded are converted
+    asm volatile("vsetvli zero, %0, e32, m4, ta, ma" ::"r"(VEC_SIZE));  // Load e32 data
+    asm volatile("vle32.v v0, (%0)" ::"r"(vec_a));
+    asm volatile("vsetvli zero, %0, e16, m2, ta, ma" ::"r"(VEC_SIZE)); // e16 destination format, same vl as the load
+    asm volatile("vfncvt.f.f.w v8, v0"); printf("  VFNCVT.F.F.W (e32->e16)\n");
+    asm volatile("vfncvt.xu.f.w v8, v0"); printf("  VFNCVT.XU.F.W (e32->e16)\n");
+    asm volatile("vfncvt.x.f.w v8, v0"); printf("  VFNCVT.X.F.W (e32->e16)\n");
+    asm volatile("vfncvt.rtz.xu.f.w v8, v0"); printf("  VFNCVT.RTZ.XU.F.W (e32->e16)\n");
+    asm volatile("vfncvt.rtz.x.f.w v8, v0"); printf("  VFNCVT.RTZ.X.F.W (e32->e16)\n");
+    
+    printf("[SNITCH] ==== Test completed ====\n");
+    return 0;
+}
diff --git a/spatz/sw/tasks/interconnect32_simple_task.c b/spatz/sw/tasks/interconnect32_simple_task.c
new file mode 100644
index 0000000..b535fdc
--- /dev/null
+++ b/spatz/sw/tasks/interconnect32_simple_task.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * 32-bit Interconnect Access Test Task for Spatz_cc
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+
+// Scratch region in L1 (L1_BASE + 0x3000), accessed as an array of volatile 32-bit words
+#define TEST_MEM32_BASE (L1_BASE + 0x00003000)
+#define test_mem32 ((volatile uint32_t *)TEST_MEM32_BASE)
+
+int interconnect32_simple_task(void) {
+    // Scalar and vector 32-bit accesses to the L1 scratch region; mismatches are only printed, always returns 0.
+    printf("[SNITCH] Testing 32-bit accesses\n");
+    printf("[SNITCH] Using L1 memory at 0x%08x\n", TEST_MEM32_BASE);
+    
+    // =============================================
+    // Test 1: Sequential 32-bit stores and loads
+    // =============================================
+    printf("[SNITCH] Sequential 32-bit store/load\n");
+    
+    for (int i = 0; i < 16; i++) {
+        test_mem32[i] = 0x10000000 + i;
+    }
+    
+    for (int i = 0; i < 16; i++) {
+        if (test_mem32[i] != (uint32_t)(0x10000000 + i)) {
+            printf("[SNITCH] ERROR: 32-bit store/load failed at [%d]\n", i);
+        }
+    }
+    
+    // =============================================
+    // Test 2: Strided 32-bit access pattern
+    // =============================================
+    printf("[SNITCH] Strided 32-bit access\n");
+    
+    test_mem32[0] = 0xAAAAAAAA;
+    test_mem32[2] = 0xBBBBBBBB;
+    test_mem32[4] = 0xCCCCCCCC;
+    test_mem32[6] = 0xDDDDDDDD;
+    if (test_mem32[0] != 0xAAAAAAAA || test_mem32[2] != 0xBBBBBBBB ||
+        test_mem32[4] != 0xCCCCCCCC || test_mem32[6] != 0xDDDDDDDD) {
+        printf("[SNITCH] ERROR: Strided access failed\n");
+    }
+    
+    // =============================================
+    // Test 3: Reverse order stores
+    // =============================================
+    printf("[SNITCH] Reverse order stores\n");
+    
+    for (int i = 7; i >= 0; i--) {
+        test_mem32[i] = 0x30000000 + (7 - i);
+    }
+    
+    if (test_mem32[0] != 0x30000007 || test_mem32[7] != 0x30000000) {
+        printf("[SNITCH] ERROR: Reverse order failed\n");
+    }
+    
+    // =============================================
+    // Test 4: Alternating read/write pattern
+    // =============================================
+    printf("[SNITCH] Alternating read/write\n");
+    
+    uint32_t accumulator = 0;
+    for (int i = 0; i < 8; i++) {
+        test_mem32[i] = 0x40000000 + i;
+        accumulator += test_mem32[i];
+    }
+    // Unsigned constants: the expected sum wraps modulo 2^32 (8 * 0x40000000 would overflow a signed int)
+    if (accumulator != 0x0000001Cu + (8u * 0x40000000u)) {
+        printf("[SNITCH] ERROR: Alternating pattern failed\n");
+    }
+    
+    // =============================================
+    // Test 5: Vector load/store 32-bit (vle32.v/vse32.v)
+    // =============================================
+    printf("[SNITCH] Vector load/store e32\n");
+    
+    // Initialize pattern for vector operations
+    for (int i = 0; i < 16; i++) {
+        test_mem32[i] = 0x70000000 + (i * 0x11);
+    }
+    
+    // Configure vector: vtype = e32, m1, ta, ma (vl=8)
+    asm volatile(
+        "li t0, 8\n"
+        "vsetvli zero, t0, e32, m1, ta, ma\n"
+        ::: "t0"
+    );
+    
+    // Vector load from test_mem32[0:7] into v1
+    asm volatile(
+        "vle32.v v1, (%0)\n"
+        :: "r"(&test_mem32[0])
+        : "memory"
+    );
+    
+    // Vector store from v1 to test_mem32[16:23]
+    asm volatile(
+        "vse32.v v1, (%0)\n"
+        :: "r"(&test_mem32[16])
+        : "memory"
+    );
+    
+    // Verify vector store
+    for (int i = 0; i < 8; i++) {
+        if (test_mem32[16 + i] != test_mem32[i]) {
+            printf("[SNITCH] ERROR: Vector store failed at [%d], got 0x%08x expected 0x%08x\n",
+                   i, test_mem32[16 + i], test_mem32[i]);
+        }
+    }
+    
+    return 0;
+}
diff --git a/spatz/sw/tasks/interconnect64_simple_task.c b/spatz/sw/tasks/interconnect64_simple_task.c
new file mode 100644
index 0000000..319d4c7
--- /dev/null
+++ b/spatz/sw/tasks/interconnect64_simple_task.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * 64-bit Interconnect Access Test Task for Spatz_cc 
+ * !!! This task requires Spatz with RVD extension enabled !!!
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+
+// Scratch region in L1 (L1_BASE + 0x4000), viewed both as 64-bit words and as 32-bit words
+#define TEST_MEM64_BASE (L1_BASE + 0x00004000)
+#define test_mem64 ((volatile uint64_t *)TEST_MEM64_BASE)
+#define test_mem32 ((volatile uint32_t *)TEST_MEM64_BASE)
+
+int interconnect64_simple_task(void) {
+    // Mixed 32/64-bit scalar and e64 vector accesses to the L1 scratch region; mismatches are only printed, always returns 0.
+    printf("[SNITCH] Testing 64 bit accesses\n");
+    printf("[SNITCH] Using L1 memory at 0x%08x\n", TEST_MEM64_BASE);
+    
+    // =============================================
+    // Test 1: 64-bit aligned stores and loads
+    // =============================================
+    printf("[SNITCH] 64-bit aligned store/load\n");
+    
+    test_mem64[0] = 0x123456789ABCDEF0ULL;
+    test_mem64[1] = 0xFEDCBA9876543210ULL;
+    test_mem64[2] = 0xAAAAAAAABBBBBBBBULL;
+    test_mem64[3] = 0xCCCCCCCCDDDDDDDDULL;
+    const uint64_t aligned_expected[4] = {0x123456789ABCDEF0ULL, 0xFEDCBA9876543210ULL,
+                                          0xAAAAAAAABBBBBBBBULL, 0xCCCCCCCCDDDDDDDDULL};
+    for (int i = 0; i < 4; i++) {  // read back every word that was written
+        if (test_mem64[i] != aligned_expected[i]) {
+            printf("[SNITCH] ERROR: 64-bit store/load failed at [%d]\n", i);
+        }
+    }
+    
+    // =============================================
+    // Test 2: 32-bit stores to 64-bit memory
+    // =============================================
+    printf("[SNITCH] 32-bit stores to 64-bit region\n");
+    
+    test_mem32[0] = 0x11111111;
+    test_mem32[1] = 0x22222222;
+    test_mem32[2] = 0x33333333;
+    test_mem32[3] = 0x44444444;
+    
+    if (test_mem64[0] != 0x2222222211111111ULL) {
+        printf("[SNITCH] ERROR: 32-bit stores failed, got 0x%08x%08x\n", 
+               (uint32_t)(test_mem64[0] >> 32), (uint32_t)test_mem64[0]);
+    }
+    
+    // =============================================
+    // Test 3: Mixed 32/64 access patterns
+    // =============================================
+    printf("[SNITCH] Mixed 32/64-bit access pattern\n");
+    
+    test_mem64[4] = 0xDEADBEEFCAFEBABEULL;
+    uint32_t low_word = test_mem32[8];
+    uint32_t high_word = test_mem32[9];
+    
+    if (low_word != 0xCAFEBABE || high_word != 0xDEADBEEF) {
+        printf("[SNITCH] ERROR: Mixed access failed, low=0x%08x high=0x%08x\n", 
+               low_word, high_word);
+    }
+    
+    // =============================================
+    // Test 4: Sequential 64-bit loads (stress reqrsp64toobi32)
+    // =============================================
+    printf("[SNITCH] Sequential 64-bit loads\n");
+    
+    for (int i = 0; i < 8; i++) {
+        test_mem64[i] = (uint64_t)i * 0x0101010101010101ULL;
+    }
+    
+    uint64_t sum = 0;
+    for (int i = 0; i < 8; i++) {
+        sum += test_mem64[i];
+    }
+    
+    uint64_t expected_sum = 0x1C1C1C1C1C1C1C1CULL;
+    if (sum != expected_sum) {
+        printf("[SNITCH] ERROR: Sequential load sum mismatch\n");
+    }
+    
+    // =============================================
+    // Test 5: Alternating 32/64 stores (stress tcdm64todualtcdm32)
+    // =============================================
+    printf("[SNITCH] Alternating 32/64-bit stores\n");
+    
+    test_mem64[0] = 0x0000000000000000ULL;
+    test_mem32[0] = 0xAAAAAAAA;  // Writes to low word of test_mem64[0]
+    test_mem64[1] = 0x5555555555555555ULL;
+    test_mem32[2] = 0xBBBBBBBB;  // Writes to low word of test_mem64[1]
+    
+    if (test_mem64[0] != 0x00000000AAAAAAAAULL) {
+        printf("[SNITCH] ERROR: Alternating store failed at [0], got 0x%08x%08x\n", 
+               (uint32_t)(test_mem64[0] >> 32), (uint32_t)test_mem64[0]);
+    }
+    if (test_mem64[1] != 0x55555555BBBBBBBBULL) {
+        printf("[SNITCH] ERROR: Alternating store failed at [1], got 0x%08x%08x\n",
+               (uint32_t)(test_mem64[1] >> 32), (uint32_t)test_mem64[1]);
+    }
+    
+    // =============================================
+    // Test 6: 32-bit accesses at odd word offsets (4-byte aligned, not 8-byte aligned)
+    // =============================================
+    printf("[SNITCH] Unaligned 32-bit accesses\n");
+    
+    test_mem32[1] = 0x12345678;
+    test_mem32[3] = 0x9ABCDEF0;
+    test_mem32[5] = 0xFEDCBA98;
+    // Check all three words written above, i.e. byte offsets 4, 12 and 20 of the region
+    if (test_mem32[1] != 0x12345678 || test_mem32[3] != 0x9ABCDEF0 || test_mem32[5] != 0xFEDCBA98) {
+        printf("[SNITCH] ERROR: Unaligned access failed\n");
+    }
+    
+    // =============================================
+    // Test 7: Vector load/store 64-bit (vle64.v/vse64.v)
+    // =============================================
+    printf("[SNITCH] Vector load/store e64\n");
+    
+    // Initialize pattern for vector operations
+    for (int i = 0; i < 8; i++) {
+        test_mem64[i] = 0x7000000000000000ULL + ((uint64_t)i * 0x1111111111111111ULL);
+    }
+    
+    // Configure vector: vtype = e64, m1, ta, ma (vl=4)
+    asm volatile(
+        "li t0, 4\n"
+        "vsetvli zero, t0, e64, m1, ta, ma\n"
+        ::: "t0"
+    );
+    
+    // Vector load from test_mem64[0:3] into v2
+    asm volatile(
+        "vle64.v v2, (%0)\n"
+        :: "r"(&test_mem64[0])
+        : "memory"
+    );
+    
+    // Vector store from v2 to test_mem64[8:11]
+    asm volatile(
+        "vse64.v v2, (%0)\n"
+        :: "r"(&test_mem64[8])
+        : "memory"
+    );
+    
+    // Verify vector store
+    for (int i = 0; i < 4; i++) {
+        if (test_mem64[8 + i] != test_mem64[i]) {
+            printf("[SNITCH] ERROR: Vector store failed at [%d], got 0x%08x%08x expected 0x%08x%08x\n",
+                   i, (uint32_t)(test_mem64[8 + i] >> 32), (uint32_t)test_mem64[8 + i],
+                   (uint32_t)(test_mem64[i] >> 32), (uint32_t)test_mem64[i]);
+        }
+    }
+    
+    return 0;
+}
diff --git a/spatz/sw/tasks/matmul16_task.c b/spatz/sw/tasks/matmul16_task.c
new file mode 100644
index 0000000..93c4a08
--- /dev/null
+++ b/spatz/sw/tasks/matmul16_task.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * FP16 Matrix Multiplication using RISC-V Vector Extension
+ * Computes C = A × B where C is M×K, A is M×N, B is N×K
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "spatz_workers.h"
+
+// DATA register address for reading parameter pointer
+#define DATA_ADDR (SPATZ_DATA)
+
+// Parameter structure in shared L1 memory
+typedef struct {
+    uint32_t a_addr;        // Matrix A address
+    uint32_t b_addr;        // Matrix B address  
+    uint32_t c_addr;        // Matrix C address (result)
+    uint32_t m_size;        // Number of rows in A and C
+    uint32_t n_size;        // Number of columns in A, rows in B
+    uint32_t k_size;        // Number of columns in B and C
+} matmul_params_t;
+
+// Main entry point
+int matmul16_task(void) {
+    // The DATA register holds the address of the parameter block.
+    const uint32_t param_block = mmio32(DATA_ADDR);
+    volatile matmul_params_t *args = (volatile matmul_params_t *)param_block;
+
+    // Snapshot every field once, in declaration order (volatile reads).
+    float16_t *mat_a = (float16_t *)args->a_addr;
+    float16_t *mat_b = (float16_t *)args->b_addr;
+    float16_t *mat_c = (float16_t *)args->c_addr;
+    const uint32_t rows = args->m_size;
+    const uint32_t inner = args->n_size;
+    const uint32_t cols = args->k_size;
+
+    // Delegate to the Spatz benchmark kernel: output matrix first, then both
+    // inputs and the three dimensions, i.e. C[M×K] = A[M×N] × B[N×K].
+    fmatmul_v16b(mat_c, mat_a, mat_b, rows, inner, cols);
+
+    return 0;
+}
diff --git a/spatz/sw/tasks/matvec16_task.c b/spatz/sw/tasks/matvec16_task.c
new file mode 100644
index 0000000..690154f
--- /dev/null
+++ b/spatz/sw/tasks/matvec16_task.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * FP16 Matrix-Vector Multiplication (GEMV)
+ * Based on PLAY library: https://github.com/FondazioneChipsIT/PLAY
+ * source/linalg_gemv/arch/linalg_gemv_spatz.c
+ * 
+ * Computes: dst = α·mat·vec_x + β·vec_y
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "spatz_workers.h"
+
+#define DATA_ADDR (SPATZ_DATA)
+
+typedef struct __attribute__((packed)) {  // GEMV parameter block read by matvec16_task
+    uint32_t mat_addr;    // Address of the matrix (used as float16_t *)
+    uint32_t vec_x_addr;  // Address of input vector x (used as float16_t *)
+    uint32_t vec_y_addr;  // Address of input vector y (used as float16_t *)
+    uint32_t dst_addr;    // Address of the output vector (used as float16_t *)
+    uint16_t alpha;       // Raw 16-bit pattern, reinterpreted as float16_t by the task
+    uint16_t beta;        // Raw 16-bit pattern, reinterpreted as float16_t by the task
+    uint32_t m_size;      // Passed on as dim_M (matrix rows / output length)
+    uint32_t n_size;      // Passed on as dim_N (matrix columns / length of x)
+} matvec_params_t;
+
+/* Helper functions adapted from PLAY library */
+
+static inline int vector_set_all_spatz(float16_t *vec, const float16_t val, const int len)
+{
+    /* Fill vec[0..len) with val; a non-positive len stores nothing. Returns 0. */
+    float16_t *out = vec;
+    size_t avl = (len > 0) ? (size_t)len : 0; /* negative len would wrap to a huge count */
+    size_t vl;
+
+    /* vsetvli first: vfmv.v.f splats val under the vtype/vl configured here. */
+    asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+    asm volatile ("vfmv.v.f v0, %0" :: "f"(val));
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        /* "memory": the asm writes through out, which the compiler cannot see. */
+        asm volatile ("vse16.v v0, (%0)" :: "r"(out) : "memory");
+        out += vl;
+    }
+
+    return 0;
+}
+
+static inline int vector_memcpy_spatz(const float16_t *src, float16_t *dst, const int len)
+{
+    /* Copy len FP16 elements from src to dst, one e16/m8 strip at a time. */
+    /* A non-positive len copies nothing. Always returns 0. */
+    const float16_t *in = src;
+    float16_t *out = dst;
+    size_t avl = (len > 0) ? (size_t)len : 0; /* negative len would wrap to a huge count */
+    size_t vl;
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        /* "memory" clobbers: the compiler cannot see these loads and stores, */
+        /* so it must not cache or reorder C-level accesses around them. */
+        asm volatile ("vle16.v v0, (%0)" :: "r"(in) : "memory");
+        asm volatile ("vse16.v v0, (%0)" :: "r"(out) : "memory");
+        in += vl;
+        out += vl;
+    }
+
+    return 0;
+}
+
+static inline int vector_scale_spatz(const float16_t *src, const float16_t val, float16_t *dst, const int len)
+{
+    /* dst[i] = src[i] * val for i in [0, len), one e16/m8 strip at a time. */
+    /* A non-positive len processes nothing. Always returns 0. */
+    const float16_t *in = src;
+    float16_t *out = dst;
+    size_t avl = (len > 0) ? (size_t)len : 0; /* negative len would wrap to a huge count */
+    size_t vl;
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        /* "memory" clobbers: the compiler cannot see these loads and stores, */
+        /* so it must not cache or reorder C-level accesses around them. */
+        asm volatile ("vle16.v v0, (%0)" :: "r"(in) : "memory");
+        asm volatile ("vfmul.vf v8, v0, %0" :: "f"(val));
+        asm volatile ("vse16.v v8, (%0)" :: "r"(out) : "memory");
+        in += vl;
+        out += vl;
+    }
+
+    return 0;
+}
+
+/* Main GEMV function - direct translation from PLAY linalg_gemv_spatz.c */
+
+static int linalg_gemv_spatz_serial(
+    const float16_t *mat,
+    const float16_t *vec_x,
+    const float16_t *vec_y,
+    const float16_t alpha,
+    const float16_t beta,
+    float16_t *dst,
+    const int dim_M,
+    const int dim_N)
+{
+    /*
+     * Computes dst[0..dim_M) = alpha * mat * vec_x + beta * vec_y, where mat is
+     * read as a row-major dim_M x dim_N matrix (row stride = dim_N elements).
+     * Output rows are processed in e16/m8 strips. Always returns 0.
+     */
+    const float16_t ZERO_f = 0.0f;
+    const uint16_t zero_bits = 0x0000; /* +0.0 in FP16; -0.0 takes the general path */
+    const uint16_t one_bits = 0x3C00;  /* 1.0 in FP16 */
+
+    /* Compare raw FP16 encodings, not FP values: a direct alpha == ZERO_f test
+     * was found to fail (FP16 vs FP32 comparison). The union reads the bits
+     * without accessing a float16_t through an incompatible pointer type. */
+    union { float16_t f; uint16_t u; } alpha_u, beta_u;
+    alpha_u.f = alpha;
+    beta_u.f = beta;
+
+    /* Step 1: dst = beta * vec_y, with shortcuts for beta == 0 and beta == 1. */
+    if (beta_u.u == zero_bits)
+        vector_set_all_spatz(dst, ZERO_f, dim_M);
+    else if (beta_u.u == one_bits)
+        vector_memcpy_spatz(vec_y, dst, dim_M);
+    else
+        vector_scale_spatz(vec_y, beta, dst, dim_M);
+
+    /* alpha == +0.0: the matrix term contributes nothing, dst is final. */
+    if (alpha_u.u == zero_bits)
+        return 0;
+
+    /* Step 2: dst += alpha * (mat * vec_x), one strip of rows at a time. */
+    const size_t stride = (size_t)dim_N * sizeof(float16_t); /* bytes between rows */
+    const float16_t *row_base = mat; /* first matrix row of the current strip */
+    float16_t *p_dst = dst;
+    size_t avl = (dim_M > 0) ? (size_t)dim_M : 0;
+    size_t vl;
+
+    for (; avl > 0; avl -= vl) {
+        /* "memory" on the vector loads/stores: the compiler cannot see those
+         * accesses and must not cache or reorder C accesses around them. */
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        asm volatile ("vfmv.v.f v0, %0" :: "f"(ZERO_f));
+        asm volatile ("vle16.v v24, (%0)" :: "r"(p_dst) : "memory");
+
+        for (int n = 0; n < dim_N; n++) {
+            const float16_t *col_m = row_base + n; /* column n within this strip's rows */
+            const float16_t elem_x = vec_x[n];
+            asm volatile ("vlse16.v v8, (%0), %1" :: "r"(col_m), "r"(stride) : "memory");
+            asm volatile ("vfmacc.vf v0, %0, v8" :: "f"(elem_x));
+        }
+
+        asm volatile ("vfmul.vf v0, v0, %0" :: "f"(alpha));
+        asm volatile ("vfadd.vv v0, v24, v0");
+        asm volatile ("vse16.v v0, (%0)" :: "r"(p_dst) : "memory");
+
+        /* Next strip: skip the vl outputs AND the vl matrix rows just consumed. */
+        p_dst += vl;
+        row_base += vl * (size_t)dim_N;
+    }
+
+    return 0;
+}
+
+/* Task entry point */
+
+int matvec16_task(void) {
+    // The DATA register holds the address of the packed parameter block.
+    const uint32_t param_block = mmio32(DATA_ADDR);
+    volatile matvec_params_t *args = (volatile matvec_params_t *)param_block;
+
+    float16_t *matrix = (float16_t *)args->mat_addr;
+    float16_t *x_vec = (float16_t *)args->vec_x_addr;
+    float16_t *y_vec = (float16_t *)args->vec_y_addr;
+    float16_t *result = (float16_t *)args->dst_addr;
+    const uint32_t rows = args->m_size;
+    const uint32_t cols = args->n_size;
+
+    // alpha/beta arrive as raw 16-bit patterns; reinterpret them as float16_t.
+    union { uint16_t u; float16_t f; } a_scale, b_scale;
+    a_scale.u = args->alpha;
+    b_scale.u = args->beta;
+
+    linalg_gemv_spatz_serial(matrix, x_vec, y_vec, a_scale.f, b_scale.f, result, rows, cols);
+
+    return 0;
+}
diff --git a/spatz/sw/tasks/matvectrans16_task.c b/spatz/sw/tasks/matvectrans16_task.c
new file mode 100644
index 0000000..4babd22
--- /dev/null
+++ b/spatz/sw/tasks/matvectrans16_task.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * FP16 Transposed Matrix-Vector Multiplication (GEMV Transposed)
+ * Based on PLAY library: https://github.com/FondazioneChipsIT/PLAY
+ * source/linalg_gemv_trans/arch/linalg_gemv_trans_spatz.c
+ * 
+ * Computes: dst = α·mat^T·vec_x + β·vec_y
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "spatz_workers.h"
+
+#define DATA_ADDR (SPATZ_DATA)
+
+typedef struct __attribute__((packed)) {  // Transposed-GEMV parameter block read by matvectrans16_task
+    uint32_t mat_addr;    // Address of the matrix (used as float16_t *)
+    uint32_t vec_x_addr;  // Address of input vector x (used as float16_t *)
+    uint32_t vec_y_addr;  // Address of input vector y (used as float16_t *)
+    uint32_t dst_addr;    // Address of the output vector (used as float16_t *)
+    uint16_t alpha;       // Raw 16-bit pattern, reinterpreted as float16_t by the task
+    uint16_t beta;        // Raw 16-bit pattern, reinterpreted as float16_t by the task
+    uint32_t m_size;      // Passed on as dim_M (matrix rows / length of x)
+    uint32_t n_size;      // Passed on as dim_N (matrix columns / output length)
+} matvectrans_params_t;
+
+/* Helper functions adapted from PLAY library */
+
+static inline int vector_set_all_spatz(float16_t *vec, const float16_t val, const int len)
+{
+    /* Fill vec[0..len) with val; a non-positive len stores nothing. Returns 0. */
+    float16_t *out = vec;
+    size_t avl = (len > 0) ? (size_t)len : 0; /* negative len would wrap to a huge count */
+    size_t vl;
+
+    /* vsetvli first: vfmv.v.f splats val under the vtype/vl configured here. */
+    asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+    asm volatile ("vfmv.v.f v0, %0" :: "f"(val));
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        /* "memory": the asm writes through out, which the compiler cannot see. */
+        asm volatile ("vse16.v v0, (%0)" :: "r"(out) : "memory");
+        out += vl;
+    }
+
+    return 0;
+}
+
+static inline int vector_memcpy_spatz(const float16_t *src, float16_t *dst, const int len)
+{
+    /* Copy len FP16 elements from src to dst, one e16/m8 strip at a time. */
+    /* A non-positive len copies nothing. Always returns 0. */
+    const float16_t *in = src;
+    float16_t *out = dst;
+    size_t avl = (len > 0) ? (size_t)len : 0; /* negative len would wrap to a huge count */
+    size_t vl;
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        /* "memory" clobbers: the compiler cannot see these loads and stores, */
+        /* so it must not cache or reorder C-level accesses around them. */
+        asm volatile ("vle16.v v0, (%0)" :: "r"(in) : "memory");
+        asm volatile ("vse16.v v0, (%0)" :: "r"(out) : "memory");
+        in += vl;
+        out += vl;
+    }
+
+    return 0;
+}
+
+static inline int vector_scale_spatz(const float16_t *src, const float16_t val, float16_t *dst, const int len)
+{
+    /* dst[i] = src[i] * val for i in [0, len), one e16/m8 strip at a time. */
+    /* A non-positive len processes nothing. Always returns 0. */
+    const float16_t *in = src;
+    float16_t *out = dst;
+    size_t avl = (len > 0) ? (size_t)len : 0; /* negative len would wrap to a huge count */
+    size_t vl;
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        /* "memory" clobbers: the compiler cannot see these loads and stores, */
+        /* so it must not cache or reorder C-level accesses around them. */
+        asm volatile ("vle16.v v0, (%0)" :: "r"(in) : "memory");
+        asm volatile ("vfmul.vf v8, v0, %0" :: "f"(val));
+        asm volatile ("vse16.v v8, (%0)" :: "r"(out) : "memory");
+        in += vl;
+        out += vl;
+    }
+
+    return 0;
+}
+
+/* Main GEMV Transposed function - direct translation from PLAY linalg_gemv_trans_spatz.c */
+
+static int linalg_gemv_trans_spatz_serial(
+    const float16_t *mat,
+    const float16_t *vec_x,
+    const float16_t *vec_y,
+    const float16_t alpha,
+    const float16_t beta,
+    float16_t *dst,
+    const int dim_M,
+    const int dim_N)
+{
+    /*
+     * Computes dst[0..dim_N) = alpha * mat^T * vec_x + beta * vec_y, where mat
+     * is read as a row-major dim_M x dim_N matrix (row m starts at m * dim_N).
+     * Output columns are processed in e16/m8 strips. Always returns 0.
+     */
+    const float16_t ZERO_f = 0.0f;
+    const uint16_t zero_bits = 0x0000; /* +0.0 in FP16; -0.0 takes the general path */
+    const uint16_t one_bits = 0x3C00;  /* 1.0 in FP16 */
+
+    /* Compare raw FP16 encodings, not FP values: a direct alpha == ZERO_f test
+     * was found to fail (FP16 vs FP32 comparison). The union reads the bits
+     * without accessing a float16_t through an incompatible pointer type. */
+    union { float16_t f; uint16_t u; } alpha_u, beta_u;
+    alpha_u.f = alpha;
+    beta_u.f = beta;
+
+    /* Step 1: dst = beta * vec_y, with shortcuts for beta == 0 and beta == 1. */
+    if (beta_u.u == zero_bits)
+        vector_set_all_spatz(dst, ZERO_f, dim_N);
+    else if (beta_u.u == one_bits)
+        vector_memcpy_spatz(vec_y, dst, dim_N);
+    else
+        vector_scale_spatz(vec_y, beta, dst, dim_N);
+
+    /* alpha == +0.0: the matrix term contributes nothing, dst is final. */
+    if (alpha_u.u == zero_bits)
+        return 0;
+
+    /* Step 2: dst += alpha * (mat^T * vec_x), one strip of columns at a time. */
+    const float16_t *col_base = mat; /* first matrix column of the current strip */
+    float16_t *p_dst = dst;
+    size_t avl = (dim_N > 0) ? (size_t)dim_N : 0;
+    size_t vl;
+
+    for (; avl > 0; avl -= vl) {
+        /* "memory" on the vector loads/stores: the compiler cannot see those
+         * accesses and must not cache or reorder C accesses around them. */
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        asm volatile ("vfmv.v.f v0, %0" :: "f"(ZERO_f));
+        asm volatile ("vle16.v v24, (%0)" :: "r"(p_dst) : "memory");
+
+        for (int m = 0; m < dim_M; m++) {
+            const float16_t *row_m = col_base + (size_t)m * (size_t)dim_N; /* row m, this strip */
+            const float16_t elem_x = vec_x[m];
+            asm volatile ("vle16.v v8, (%0)" :: "r"(row_m) : "memory");
+            asm volatile ("vfmacc.vf v0, %0, v8" :: "f"(elem_x));
+        }
+
+        asm volatile ("vfmul.vf v0, v0, %0" :: "f"(alpha));
+        asm volatile ("vfadd.vv v0, v24, v0");
+        asm volatile ("vse16.v v0, (%0)" :: "r"(p_dst) : "memory");
+
+        /* Next strip: skip the vl outputs AND the vl matrix columns just consumed. */
+        p_dst += vl;
+        col_base += vl;
+    }
+
+    return 0;
+}
+
+/* Task entry point */
+
+int matvectrans16_task(void) {
+    // The DATA register holds the address of the packed parameter block.
+    const uint32_t param_block = mmio32(DATA_ADDR);
+    volatile matvectrans_params_t *args = (volatile matvectrans_params_t *)param_block;
+
+    float16_t *matrix = (float16_t *)args->mat_addr;
+    float16_t *x_vec = (float16_t *)args->vec_x_addr;
+    float16_t *y_vec = (float16_t *)args->vec_y_addr;
+    float16_t *result = (float16_t *)args->dst_addr;
+    const uint32_t rows = args->m_size;
+    const uint32_t cols = args->n_size;
+
+    // alpha/beta arrive as raw 16-bit patterns; reinterpret them as float16_t.
+    union { uint16_t u; float16_t f; } a_scale, b_scale;
+    a_scale.u = args->alpha;
+    b_scale.u = args->beta;
+
+    linalg_gemv_trans_spatz_serial(matrix, x_vec, y_vec, a_scale.f, b_scale.f, result, rows, cols);
+
+    return 0;
+}
diff --git a/spatz/sw/tasks/vecsum16_task.c b/spatz/sw/tasks/vecsum16_task.c
new file mode 100644
index 0000000..fda1201
--- /dev/null
+++ b/spatz/sw/tasks/vecsum16_task.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ * 
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * FP16 Vector-Vector Sum using RISC-V Vector Extension
+ * Computes Z = X + Y where X, Y, Z are N×1 vectors
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "spatz_workers.h"
+
+// DATA register address for reading parameter pointer
+#define DATA_ADDR (SPATZ_DATA)
+
+// Parameter structure in shared L1 memory
+typedef struct {
+    uint32_t x_addr;        // Vector X address [N×1]
+    uint32_t y_addr;        // Vector Y address [N×1]
+    uint32_t z_addr;        // Result Z address [N×1]
+    uint32_t n_size;        // Vector length N
+} vecsum_params_t;
+
+// Main entry point
+int vecsum16_task(void) {
+    // Computes Z[i] = X[i] + Y[i] for i in [0, N) on FP16 data; returns 0.
+    // The DATA register holds the address of the parameter block.
+    uint32_t params_ptr = mmio32(DATA_ADDR);
+    volatile vecsum_params_t *params = (volatile vecsum_params_t *)params_ptr;
+
+    const float16_t *X = (const float16_t *)params->x_addr;
+    const float16_t *Y = (const float16_t *)params->y_addr;
+    float16_t *Z = (float16_t *)params->z_addr;
+    uint32_t N = params->n_size;
+
+    // Strip-mine: each pass handles the vl elements granted by vsetvli.
+    // With N == 0 the single pass runs with vl == 0 (presumably a no-op).
+    unsigned int avl = N;
+    unsigned int vl;
+
+    do {
+        // Request up to avl elements (e16, LMUL=8); vl is what was granted
+        asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+
+        // Load X and Y strips. "memory": the compiler cannot see these accesses
+        asm volatile("vle16.v v0, (%0)" ::"r"(X) : "memory");
+        asm volatile("vle16.v v8, (%0)" ::"r"(Y) : "memory");
+
+        // Vector floating-point add: v16 = v0 + v8
+        asm volatile("vfadd.vv v16, v0, v8");
+
+        // Store the sum strip. "memory": the asm writes through Z
+        asm volatile("vse16.v v16, (%0)" ::"r"(Z) : "memory");
+
+        // Advance all three pointers and the remaining count by vl
+        X += vl;
+        Y += vl;
+        Z += vl;
+        avl -= vl;
+    } while (avl > 0);
+
+    return 0;
+}
diff --git a/sw/kernel/link.ld b/sw/kernel/link.ld
index bad56d5..420dd19 100644
--- a/sw/kernel/link.ld
+++ b/sw/kernel/link.ld
@@ -22,7 +22,7 @@ MEMORY
 {
     instrram    : ORIGIN = 0xcc000000, LENGTH = 0x8000
     dataram     : ORIGIN = 0xcc010000, LENGTH = 0xF00000
-    stack       : ORIGIN = 0x00010000, LENGTH = 0x10000 /* 64K, maybe it's a bit too much (?) */
+    stack       : ORIGIN = 0x00010000, LENGTH = 0xC000 /* 48K - CV32 stack only */
 }
 
 /* Stack information variables */
@@ -61,6 +61,19 @@ SECTIONS
         _endtext = .;
     }  > instrram
 
+    /* Spatz embedded binary - inlined from header file */
+    /* Positioned right after CV32 .text in instrram */
+    
+    . = ALIGN(4);
+    _spatz_binary_start = .;
+    
+    .spatz_binary : {
+        KEEP(*(.spatz_binary))
+    } > instrram
+    
+    _spatz_binary_end = .;
+    _spatz_binary_size = _spatz_binary_end - _spatz_binary_start;
+
     /*--------------------------------------------------------------------*/
     /* Global constructor/destructor segment                              */
     /*--------------------------------------------------------------------*/
diff --git a/sw/tests/spatz_tests/atomic_mesh_spatz_test.c b/sw/tests/spatz_tests/atomic_mesh_spatz_test.c
new file mode 100644
index 0000000..2036d78
--- /dev/null
+++ b/sw/tests/spatz_tests/atomic_mesh_spatz_test.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ * 
+ * MAGIA Mesh Atomic Test with Spatz (Snitch)
+ * 
+ */
+
+#include "magia_tile_utils.h"
+#include "magia_utils.h"
+#include "magia_spatz_utils.h"
+#include "event_unit_utils.h"
+#include "atomic_mesh_spatz_test_task_bin.h"
+
+#define SYNC_BASE      (L1_BASE + 0x00010000)  // L1 base where counters are stored
+#define PARAM_BASE     (L1_BASE + 0x00010100)  // Parameters area (separate from counter)
+
+// Parameter structure for Spatz task
+typedef struct {            // Parameter block handed to the Spatz atomic task
+    uint32_t hartid;        // Hart id of the tile that launches the task
+    uint32_t counter_addr;  // Address of this tile's counter (SYNC_BASE + hartid * L1_TILE_OFFSET)
+    uint32_t sync_base;     // SYNC_BASE, the un-offset counter base address
+} atomic_params_t;
+
+int main(void) {
+  uint32_t hartid = get_hartid();
+  // Per-tile driver: launch the atomic task on Spatz, wait for it, then check exit code and counter.
+  printf("[Tile %d] Starting Mesh Atomic Test with Spatz\n", hartid);
+  
+  // Initialize Event Unit
+  eu_init();
+  eu_enable_events(EU_SPATZ_DONE_MASK);
+  
+  // Zero this tile's own counter only (the slot at SYNC_BASE + hartid * L1_TILE_OFFSET)
+  printf("[Tile %d] Initializing counter to 0\n", hartid);
+  uint32_t my_counter_addr = SYNC_BASE + hartid * L1_TILE_OFFSET;
+  mmio32(my_counter_addr) = 0;
+  // NOTE(review): no barrier is visible here; if tasks on other tiles can hit this counter before the store above, increments may be lost. Confirm.
+  // Setup parameters in L1 - each tile uses its own L1 region
+  uint32_t param_base = PARAM_BASE + hartid * L1_TILE_OFFSET;
+  volatile atomic_params_t *params = (volatile atomic_params_t *)param_base;
+  params->hartid = hartid;
+  params->counter_addr = my_counter_addr;
+  params->sync_base = SYNC_BASE;
+  
+  printf("[Tile %d] Initializing Spatz with atomic task\n", hartid);
+  
+  // Initialize Spatz with atomic remote task binary
+  spatz_init(SPATZ_BINARY_START);
+  
+  printf("[Tile %d] Starting Spatz task\n", hartid);
+  
+  spatz_pass_params(param_base);
+  spatz_run_task(ATOMIC_REMOTE_TASK);
+  
+  // The parameter address is passed first, then the task is started.
+  printf("[Tile %d] Waiting for Spatz completion...\n", hartid);
+  
+  // Wait for Spatz done event using WFE
+  eu_wait_spatz_wfe(EU_SPATZ_DONE_MASK);
+  
+  printf("[Tile %d] Spatz completed! Checking result...\n", hartid);
+  
+  // Check Spatz exit code (0 = pass, 1 = fail)
+  uint32_t exit_code = spatz_get_exit_code();
+  
+  // Read final counter value
+  uint32_t final_count = mmio32(my_counter_addr);
+  uint32_t expected = 6;  // NUM_ITER (2) * NUM_TARGETS (3) from task
+  
+  printf("[Tile %d] Counter: %d (expected %d), exit_code: %d\n", 
+         hartid, final_count, expected, exit_code);
+  // Pass requires both a zero exit code and the expected counter value; returns 0 on pass, 1 on fail.
+  if (exit_code == 0 && final_count == expected) {
+    printf("[Tile %d] [PASS] Test completed successfully!\n", hartid);
+    return 0;
+  } else {
+    printf("[Tile %d] [FAIL] Test failed!\n", hartid);
+    return 1;
+  }
+}
diff --git a/sw/tests/spatz_tests/dotp_mesh_spatz_test.c b/sw/tests/spatz_tests/dotp_mesh_spatz_test.c
new file mode 100644
index 0000000..d8a97f5
--- /dev/null
+++ b/sw/tests/spatz_tests/dotp_mesh_spatz_test.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ * 
+ * MAGIA Mesh Test - Spatz Dot Product
+ * 
+ */
+
+#include "magia_tile_utils.h"
+#include "magia_utils.h"
+#include "magia_spatz_utils.h"
+#include "event_unit_utils.h"
+#include "idma_mm_utils.h"
+#include "dotp_mesh_spatz_test_task_bin.h"
+
+// Test configuration  
+#define DOTP_SIZE 1024  // Elements per tile
+#define DIFF_TH (0x0011)  // Threshold for FP16 comparison
+
+// FP16 test pattern: A[i] = 1.0, B[i] = 2.0
+// Expected result: 1024 * 1.0 * 2.0 = 2048.0
+typedef uint16_t fp16_t;
+static const fp16_t dotp_a_value = 0x3C00;  // 1.0 in FP16
+static const fp16_t dotp_b_value = 0x4000;  // 2.0 in FP16
+static const fp16_t dotp_golden = 0x6800;   // 2048.0 in FP16
+
+#define A_VEC_BASE      (L1_BASE + 0x00010000)
+#define B_VEC_BASE      (L1_BASE + 0x00012000)
+#define RESULT_BASE     (L1_BASE + 0x00014000)
+
+#define T_BASE          (L2_BASE + 0x0004A000)  
+#define L2_RESULTS_BASE (L2_BASE + 0x00050000)
+#define MHARTID_OFFSET  (0x00010000)  
+
+#define DOTP_PARAM_BASE (L1_BASE + 0x00015000)
+
+// Parameter structure for Spatz task
+typedef struct {
+    uint32_t a_addr;        // Vector A address
+    uint32_t b_addr;        // Vector B address  
+    uint32_t result_addr;   // Result address (scalar)
+    uint32_t n_elements;    // Number of elements
+} dotp_params_t;
+
+int main(void) {
+    // Per-tile FP16 dot-product test; returns the number of errors (0 = pass).
+    uint32_t my_hartid = get_hartid();
+    uint32_t l2_temp = T_BASE + my_hartid * MHARTID_OFFSET;
+    uint32_t transfer_id;
+    unsigned int num_errors = 0;
+
+    // Stage vector A in this tile's L2 scratch area, then copy it to L1 through iDMA
+    printf("Initializing vector A through iDMA...\n");
+    for (unsigned int i = 0; i < DOTP_SIZE; i++) {
+        mmio16(l2_temp + 2*i) = dotp_a_value;
+    }
+    transfer_id = idma_L2ToL1(l2_temp, A_VEC_BASE + my_hartid * L1_TILE_OFFSET, DOTP_SIZE * 2);
+    dma_wait(transfer_id);
+
+    // Same for vector B; the L2 scratch area is reused once the A transfer has completed
+    printf("Initializing vector B through iDMA...\n");
+    for (unsigned int i = 0; i < DOTP_SIZE; i++) {
+        mmio16(l2_temp + 2*i) = dotp_b_value;
+    }
+    transfer_id = idma_L2ToL1(l2_temp, B_VEC_BASE + my_hartid * L1_TILE_OFFSET, DOTP_SIZE * 2);
+    dma_wait(transfer_id);
+
+    // Initialize Event Unit
+    printf("Initializing Event Unit and Spatz...\n");
+    eu_init();
+    eu_enable_events(EU_SPATZ_DONE_MASK);
+
+    // Initialize Spatz BEFORE writing parameters
+    spatz_init(SPATZ_BINARY_START);
+
+    // Write parameters to tile-specific L1 using L1_TILE_OFFSET
+    uint32_t param_base = DOTP_PARAM_BASE + my_hartid * L1_TILE_OFFSET;
+    volatile dotp_params_t *params = (volatile dotp_params_t *)param_base;
+    params->a_addr = A_VEC_BASE + my_hartid * L1_TILE_OFFSET;
+    params->b_addr = B_VEC_BASE + my_hartid * L1_TILE_OFFSET;
+    params->result_addr = RESULT_BASE + my_hartid * L1_TILE_OFFSET;
+    params->n_elements = DOTP_SIZE;
+
+    printf("Starting Spatz dot product...\n");
+    spatz_pass_params(param_base);
+    spatz_run_task(DOTP_TASK);
+    // Wait for Spatz completion
+    eu_wait_spatz_wfe(EU_SPATZ_DONE_MASK);
+    printf("Spatz task completed.\n");
+
+    // Check Spatz exit code; a non-zero code counts as an error so the test cannot pass silently
+    uint32_t exit_code = spatz_get_exit_code();
+    if (exit_code != 0) {
+        num_errors++;
+        printf("ERROR: Spatz exit code = 0x%08x\n", exit_code);
+    }
+
+    // Move result from L1 to L2 using iDMA
+    printf("Moving result to L2 through iDMA...\n");
+    uint32_t l1_result_addr = RESULT_BASE + my_hartid * L1_TILE_OFFSET;
+    uint32_t l2_result_addr = L2_RESULTS_BASE + my_hartid * MHARTID_OFFSET;
+    transfer_id = idma_L1ToL2(l1_result_addr, l2_result_addr, 4);
+    dma_wait(transfer_id);
+
+    // Read result from L2 (only the low 16 bits of the 4-byte transfer are used)
+    fp16_t result = mmio16(l2_result_addr);
+
+    uint16_t computed = result;
+    uint16_t expected = dotp_golden;
+    uint16_t diff = (computed > expected) ? (computed - expected) : (expected - computed);
+
+    if (diff > DIFF_TH) {
+        num_errors++;
+        printf("ERROR: Tile %d result (=0x%04x) != Golden (=0x%04x), diff=0x%04x\n", 
+               my_hartid, computed, expected, diff);
+    } else {
+        printf("OK: Tile %d result matches golden value\n", my_hartid);
+    }
+
+    printf("\nFinished test with %u error(s)\n", num_errors);
+
+    return num_errors;
+}
diff --git a/sw/tests/spatz_tests/hello_mesh_spatz_test.c b/sw/tests/spatz_tests/hello_mesh_spatz_test.c
new file mode 100644
index 0000000..bc29189
--- /dev/null
+++ b/sw/tests/spatz_tests/hello_mesh_spatz_test.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ * 
+ * MAGIA Hello Mesh Spatz Test
+ * Simple test running Hello World on Spatz across multiple tiles
+ * 
+ */
+
+#include "magia_tile_utils.h"
+#include "magia_utils.h"
+#include "magia_spatz_utils.h"
+#include "event_unit_utils.h"
+#include "hello_mesh_spatz_test_task_bin.h"
+
+// Parameter structure
+typedef struct {
+    uint32_t tile_id;
+} hello_params_t;
+
+int main(void) {
+  // Per-tile driver: publish tile_id in L1, start HELLO_TASK on Spatz and wait for the done event.
+  eu_init();
+  eu_enable_events(EU_SPATZ_DONE_MASK);
+
+  // Write parameters in L1 - each tile uses its own L1 region with offset
+  uint32_t my_hartid = get_hartid();
+  uint32_t param_base = L1_BASE + 0x00010000 + my_hartid * L1_TILE_OFFSET;
+  volatile hello_params_t *params = (volatile hello_params_t *)param_base;
+  params->tile_id = my_hartid;
+  
+  spatz_init(SPATZ_BINARY_START);
+  
+  printf("Starting Spatz Hello World task (tile_id=%d)...\n", my_hartid);
+  
+  spatz_pass_params(param_base);
+  spatz_run_task(HELLO_TASK);
+  
+  // Wait for Spatz completion using WFE
+  printf("Waiting for Spatz completion...\n");
+  eu_wait_spatz_wfe(EU_SPATZ_DONE_MASK);
+  // NOTE(review): the Spatz exit code is never read here, so success is reported unconditionally. Confirm this is intended.
+  printf("Spatz task completed successfully!\n");
+
+  return 0;
+}
+
diff --git a/sw/tests/spatz_tests/matmul_compare_spatz_test.c b/sw/tests/spatz_tests/matmul_compare_spatz_test.c
new file mode 100644
index 0000000..c6dc381
--- /dev/null
+++ b/sw/tests/spatz_tests/matmul_compare_spatz_test.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ * 
+ * Matrix Multiplication Comparison Test: RedMule vs Spatz
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "redmule_mm_utils.h"
+#include "event_unit_utils.h"
+#include "matmul_compare_spatz_test_task_bin.h"
+
+// Golden model test data headers
+#include "x_input.h"
+#include "w_input.h"
+#include "y_input.h"
+#include "z_output.h"
+
+// Matrix dimensions (from golden model headers)
+#define M_SIZE (96)
+#define N_SIZE (64)
+#define K_SIZE (64)
+
+// Memory layout in L1
+#define X_BASE (L1_BASE + 0x00012048)
+#define W_BASE (L1_BASE + 0x00016048)
+#define Y_BASE (L1_BASE + 0x0001A048)
+#define Z_BASE (L2_BASE + 0x00042000)
+
+#define A_BASE X_BASE
+#define B_BASE W_BASE
+#define C_REDMULE_BASE Y_BASE
+#define C_SPATZ_BASE   (L1_BASE + 0x00023000)
+#define C_SCALAR_BASE  Z_BASE
+
+// Spatz parameter area in shared L1
+#define MATMUL_PARAM_BASE (L1_BASE + 0x00010000)
+
+// Parameter block filled by main() for the Spatz task; field layout matches matmul16_task.c
+typedef struct {
+    uint32_t a_addr;        // Matrix A address
+    uint32_t b_addr;        // Matrix B address
+    uint32_t c_addr;        // Matrix C address (result)
+    uint32_t m_size;        // Number of rows in A and C
+    uint32_t n_size;        // Number of columns in A, rows in B
+    uint32_t k_size;        // Number of columns in B and C
+} matmul_params_t;
+
+#define DIFF_TH (0x0011)
+
+typedef uint16_t fp16_t;
+
+int main(void) {
+    // Runs one M_SIZE x N_SIZE x K_SIZE FP16 product on RedMulE and on Spatz, timing and comparing both.
+    // Enable cycle counter (generic for both CV32E40P and CV32E40X)
+    ccount_en();
+
+    // Initialize input matrices from golden model headers
+    printf("Initializing matrices from golden model...\n");
+
+    // X (matrix A)
+    for (int i = 0; i < M_SIZE*N_SIZE; i++)
+        mmio16(X_BASE + 2*i) = x_inp[i];
+
+    // W (matrix B)
+    for (int i = 0; i < N_SIZE*K_SIZE; i++)
+        mmio16(W_BASE + 2*i) = w_inp[i];
+
+    // Y (RedMule output) - initialize to ZERO (no accumulation, same as Spatz)
+    for (int i = 0; i < M_SIZE*K_SIZE; i++)
+        mmio16(Y_BASE + 2*i) = 0;
+
+    printf("Matrices initialized.\n\n");
+
+    // =====================================================
+    // Test 1: RedMule HWPE Accelerator
+    // =====================================================
+
+    // Initialize and configure RedMulE
+    hwpe_cg_enable();
+    hwpe_soft_clear();
+
+    // Acquire RedMule job (retry while the returned job id is negative)
+    int offload_id_tmp;
+    while ((offload_id_tmp = hwpe_acquire_job()) < 0);
+
+    // Configure RedMulE with matrices
+    redmule_cfg((unsigned int)A_BASE, (unsigned int)B_BASE, (unsigned int)C_REDMULE_BASE,
+                M_SIZE, N_SIZE, K_SIZE,
+                (uint8_t)gemm_ops, (uint8_t)Float16, (uint8_t)Float16);
+
+    printf("Starting RedMulE computation...\n");
+
+    uint32_t redmule_start = get_cycle();
+
+    // Trigger job
+    hwpe_trigger_job();
+
+    // Wait for HWPE completion using the polling method
+    hwpe_wait_for_completion();
+
+    uint32_t redmule_cycles = get_cycle() - redmule_start;
+
+    printf("RedMule cycles: %u\n", redmule_cycles);
+    printf("RedMule completed.\n\n");
+
+    // =====================================================
+    // Test 2: Spatz Vector Processor
+    // =====================================================
+
+    // Initialize Event Unit and Spatz
+    eu_init();
+    eu_enable_events(EU_SPATZ_DONE_MASK);
+
+    printf("Initializing Spatz...\n");
+    spatz_init(SPATZ_BINARY_START);
+
+    // Write parameters to shared L1 memory for Spatz task
+    volatile matmul_params_t *params = (volatile matmul_params_t *)MATMUL_PARAM_BASE;
+    params->a_addr = A_BASE;
+    params->b_addr = B_BASE;
+    params->c_addr = C_SPATZ_BASE;
+    params->m_size = M_SIZE;
+    params->n_size = N_SIZE;
+    params->k_size = K_SIZE;
+
+    printf("Initializing Spatz output matrix to zero...\n");
+    for (uint32_t i = 0; i < M_SIZE * K_SIZE; i++) {
+        mmio16(C_SPATZ_BASE + 2*i) = 0;
+    }
+
+    printf("Starting Spatz computation...\n");
+
+    uint32_t spatz_start = get_cycle();
+    spatz_pass_params(MATMUL_PARAM_BASE);
+    spatz_run_task(MATMUL16_TASK);
+
+    eu_wait_spatz_polling(EU_SPATZ_DONE_MASK);
+
+    uint32_t spatz_cycles = get_cycle() - spatz_start;
+
+    printf("Spatz cycles: %u\n\n", spatz_cycles);
+
+    // =====================================================
+    // Result Verification
+    // =====================================================
+
+    unsigned int mismatch_errors = 0;
+    uint16_t redmule_val, spatz_val, diff;
+
+    // Compare the raw 16-bit words; a pair is a mismatch only when they differ by more than DIFF_TH
+    printf("Comparing RedMule vs Spatz results...\n");
+    for(int i = 0; i < M_SIZE*K_SIZE; i++){
+      redmule_val = mmio16(C_REDMULE_BASE + 2*i);
+      spatz_val = mmio16(C_SPATZ_BASE + 2*i);
+      diff = (redmule_val > spatz_val) ? (redmule_val - spatz_val) : (spatz_val - redmule_val);
+      if(diff > DIFF_TH){
+        mismatch_errors++;
+        if(mismatch_errors <= 10) // Print first 10 errors only
+          printf("  ERROR: RedMule[%d](=0x%04x) != Spatz[%d](=0x%04x), diff=0x%04x\n",
+                 i, redmule_val, i, spatz_val, diff);
+      }
+    }
+
+    printf("\n");
+    if (mismatch_errors == 0) {
+      printf("SUCCESS: RedMule and Spatz produce IDENTICAL results!\n");
+    } else {
+      printf("FAILURE: RedMule and Spatz differ in %u elements\n", mismatch_errors); // %u: counter is unsigned
+    }
+
+    printf("\n");
+    printf("Performance Summary:\n");
+    printf("RedMule cycles: %u\n", redmule_cycles);
+    printf("Spatz cycles:   %u\n", spatz_cycles);
+
+  return (int)mismatch_errors;  // non-zero exit status = number of mismatching elements
+
+}
diff --git a/sw/tests/spatz_tests/simple_spatz_test.c b/sw/tests/spatz_tests/simple_spatz_test.c
new file mode 100644
index 0000000..bfe5c7f
--- /dev/null
+++ b/sw/tests/spatz_tests/simple_spatz_test.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ *
+ * Spatz Test - CV32 launches Spatz and waits with WFE
+ * 
+ */
+
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "event_unit_utils.h"
+#include "simple_spatz_test_task_bin.h"
+
+int main(void) {
+    // Number of failed checks; this is also the value returned from main().
+    int fail_count = 0;
+
+    printf("[CV32] Spatz Test:\n");
+
+    /* ---- Bring-up: Event Unit first, then Spatz ---- */
+
+    // Initialize the Event Unit and enable the Spatz DONE event.
+    eu_init();
+    eu_enable_events(EU_SPATZ_DONE_MASK);
+
+    // Enable the Spatz clock and wait until it reports ready (bootrom jumps
+    // to _start, which does the full init).
+    printf("\n[CV32] Initializing Spatz...\n");
+    spatz_init(SPATZ_BINARY_START);
+
+    /* ---- Run the task and wait for the DONE event ---- */
+
+    printf("\n[CV32] Launching SPATZ Task\n");
+    spatz_run_task(HELLO_WORLD_SIMPLE_TASK);
+    eu_wait_spatz_polling(EU_SPATZ_DONE_MASK);
+
+    /* ---- Check the exit code reported through the Spatz control registers ---- */
+
+    if (spatz_get_exit_code() == 0) {
+        printf("[CV32] SPATZ TASK ENDED successfully\n");
+    } else {
+        // The exit code register is read a second time for the message.
+        printf("[CV32] SPATZ TASK ENDED with exit code: 0x%03x\n", spatz_get_exit_code());
+        fail_count += 1;
+    }
+
+    // Disable the Spatz clock now that the task is over.
+    spatz_clk_dis();
+
+    // Zero means the test passed.
+    return fail_count;
+}
\ No newline at end of file
diff --git a/sw/utils/event_unit_utils.h b/sw/utils/event_unit_utils.h
index 10ee2ea..a3156fa 100644
--- a/sw/utils/event_unit_utils.h
+++ b/sw/utils/event_unit_utils.h
@@ -111,6 +111,13 @@
 #define EU_FSYNC_ERROR_MASK          (1 << EU_FSYNC_ERROR_BIT)
 #define EU_FSYNC_ALL_MASK            (EU_FSYNC_DONE_MASK | EU_FSYNC_ERROR_MASK)
 
+// Spatz events (accelerator events [8] + cluster events [23])
+#define EU_SPATZ_DONE_BIT            8    // accelerator-event bit for Spatz done
+#define EU_SPATZ_START_BIT           23   // cluster-event bit for Spatz start (NOTE(review): source of this event is not visible here)
+#define EU_SPATZ_DONE_MASK           (1 << EU_SPATZ_DONE_BIT)
+#define EU_SPATZ_START_MASK          (1 << EU_SPATZ_START_BIT)
+#define EU_SPATZ_ALL_MASK            (EU_SPATZ_DONE_MASK | EU_SPATZ_START_MASK)
+
 // Wait modes
 typedef enum {
     EU_WAIT_MODE_POLLING = 0,
@@ -321,6 +328,34 @@ static inline uint32_t eu_fsync_has_error(void) {
     return eu_check_events(EU_FSYNC_ERROR_MASK);
 }
 
+//=============================================================================
+// SPATZ FUNCTIONS
+//=============================================================================
+
+static inline void eu_spatz_init(void) {
+    eu_clear_events(0xFFFFFFFF);           // clear with an all-ones mask: every event bit, not only the Spatz ones
+    eu_enable_events(EU_SPATZ_DONE_MASK);  // then enable only the Spatz DONE event
+}
+
+static inline uint32_t eu_spatz_is_done(void) {
+    return eu_check_events(EU_SPATZ_DONE_MASK);  // check only: this helper does not call eu_clear_events()
+}
+
+
+static inline void eu_wait_spatz_wfe(uint32_t event_mask) {
+    // Block in eu_evt_wait() until eu_check_events(event_mask) is non-zero, then clear those events.
+    while (eu_check_events(event_mask) == 0)
+        eu_evt_wait();
+    eu_clear_events(event_mask);
+}
+
+static inline void eu_wait_spatz_polling(uint32_t event_mask) {
+    // Busy-poll, calling wait_nop(10) between checks, until eu_check_events(event_mask) is non-zero; then clear.
+    while (eu_check_events(event_mask) == 0)
+        wait_nop(10);
+    eu_clear_events(event_mask);
+}
+
 //=============================================================================
 // MULTI-ACCELERATOR FUNCTIONS
 //=============================================================================
diff --git a/sw/utils/magia_spatz_utils.h b/sw/utils/magia_spatz_utils.h
new file mode 100644
index 0000000..835deb1
--- /dev/null
+++ b/sw/utils/magia_spatz_utils.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni <luca.balboni10@studio.unibo.it>
+ * 
+ *  Spatz Utility Functions
+ */
+#ifndef MAGIA_SPATZ_UTILS_H
+#define MAGIA_SPATZ_UTILS_H
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+
+#define SPATZ_CLK_EN   (SPATZ_CTRL_BASE + 0x00)  // written 1/0 by spatz_clk_en()/spatz_clk_dis()
+#define SPATZ_READY    (SPATZ_CTRL_BASE + 0x04)  // polled by spatz_init() until it reads non-zero
+#define SPATZ_START    (SPATZ_CTRL_BASE + 0x08)  // written 1 to launch; spatz_run_task() waits for it to read 0
+#define SPATZ_TASKBIN  (SPATZ_CTRL_BASE + 0x0C)  // entry/task address written by spatz_set_func()
+#define SPATZ_DATA     (SPATZ_CTRL_BASE + 0x10)  // parameter pointer written by spatz_pass_params()
+#define SPATZ_RETURN   (SPATZ_CTRL_BASE + 0x14)  // exit code read by spatz_get_exit_code()
+#define SPATZ_DONE     (SPATZ_CTRL_BASE + 0x18)  // written 1 by spatz_done()
+
+#define mmio32(x) (*(volatile uint32_t *)(x))
+
+static inline void spatz_clk_en(void) {
+    mmio32(SPATZ_CLK_EN) = 1;  // 1 = Spatz clock enabled
+}
+
+static inline void spatz_clk_dis(void) {
+    mmio32(SPATZ_CLK_EN) = 0;  // 0 = Spatz clock disabled
+}
+
+static inline void spatz_set_func(uint32_t addr) {
+    mmio32(SPATZ_TASKBIN) = addr;  // used for the boot entry in spatz_init() and the task entry in spatz_run_task()
+}
+
+static inline void spatz_trigger_en_irq(void) {
+    mmio32(SPATZ_START) = 1;  // raise the start request; spatz_run_task() then waits for it to read 0
+}
+
+static inline void spatz_trigger_dis_irq(void) {
+    mmio32(SPATZ_START) = 0;  // write 0 to the start register (withdraws the request)
+}
+
+static inline void spatz_done(void) {
+    mmio32(SPATZ_DONE) = 1;  // NOTE(review): presumably called from the Spatz side to signal completion - confirm
+}
+
+static inline uint32_t spatz_get_exit_code(void) {
+    return mmio32(SPATZ_RETURN);  // raw 32-bit register value; each call performs a fresh read
+}
+
+static inline void spatz_run_task(uint32_t spatz_task_addr) {
+    mmio32(SPATZ_TASKBIN) = spatz_task_addr;  // task entry point
+    mmio32(SPATZ_START) = 1;                  // raise the start request
+    while (mmio32(SPATZ_START)) { /* spin until START reads back 0 */ }
+}
+
+static inline void spatz_pass_params(uint32_t params_ptr) {
+    mmio32(SPATZ_DATA) = params_ptr;  // address of the caller-prepared parameter block
+}
+
+static inline void spatz_run_task_with_params(uint32_t spatz_task_addr, uint32_t params_ptr) {
+    mmio32(SPATZ_DATA) = params_ptr;  // publish the parameter pointer before the task is started
+    spatz_run_task(spatz_task_addr);  // returns once SPATZ_START reads back 0
+}
+
+static inline void spatz_init(uint32_t spatz_start_addr) {  // boot Spatz and block until SPATZ_READY != 0
+    spatz_set_func(spatz_start_addr);  // entry point is programmed before the clock is enabled
+    spatz_clk_en();
+    for (int announced = 0; mmio32(SPATZ_READY) == 0; announced = 1) {
+        if (!announced) printf("Waiting for Spatz to be ready...\n");  // log once, not on every poll
+    }
+}
+
+#endif
diff --git a/sw/utils/magia_tile_utils.h b/sw/utils/magia_tile_utils.h
index 533bd3f..88783ad 100644
--- a/sw/utils/magia_tile_utils.h
+++ b/sw/utils/magia_tile_utils.h
@@ -25,6 +25,7 @@
 #include <stdint.h>
 #include "tinyprintf.h"
 
+
 #define NUM_L1_BANKS (32)
 #define WORDS_BANK   (8192)
 #define BITS_WORD    (32)
@@ -38,7 +39,9 @@
 #define FSYNC_END       (0x000006FF)
 #define EVENT_UNIT_BASE (0x00000700)
 #define EVENT_UNIT_END  (0x000016FF)
-#define RESERVED_START  (0x00001700)   
+#define SPATZ_CTRL_BASE (0x00001700)
+#define SPATZ_CTRL_END  (0x000017FF)
+#define RESERVED_START  (0x00001800)   
 #define RESERVED_END    (0x0000FFFF)   
 #define STACK_START     (0x00010000)
 #define STACK_END       (0x0001FFFF)
@@ -126,8 +129,8 @@ static inline void ccount_en(){
 #ifdef CV32E40X
     asm volatile("csrrci zero, 0x320, 0x1" ::);
 #else
-    uint32_t pcmr = 1;
-    asm volatile("csrw 0x7e1, %0" ::"r"(pcmr));
+    asm volatile("csrw 0x7E0, %0" :: "r"(0x1));
+    asm volatile("csrw 0x7E1, %0" :: "r"(0x1));
 #endif
 }
 
@@ -135,8 +138,7 @@ static inline void ccount_dis(){
 #ifdef CV32E40X
     asm volatile("csrrsi zero, 0x320, 0x1" ::);
 #else
-    uint32_t pcmr = 0;
-    asm volatile("csrw 0x7e1, %0" ::"r"(pcmr));
+    asm volatile("csrw 0x7E1, %0" :: "r"(0x0)); 
 #endif
 }
 
@@ -146,8 +148,7 @@ static inline uint32_t get_cyclel(){
     asm volatile("csrr %0, cycle"
                  :"=r"(cyclel):);
 #else
-    asm volatile("csrr %0, 0x780"
-                 :"=r"(cyclel):);
+    asm volatile("csrr %0, 0x780" : "=r"(cyclel));
 #endif
     return cyclel;
 }
@@ -192,66 +193,4 @@ uint32_t get_time(){
     return timel;
 }
 
-// Additional Flex-V CSR access functions based on CSR table
-static inline uint32_t get_mstatus(){
-    uint32_t mstatus;
-    asm volatile("csrr %0, 0x300" :"=r"(mstatus):); // MSTATUS (0x300)
-    return mstatus;
-}
-
-static inline void set_mstatus(uint32_t value){
-    asm volatile("csrw 0x300, %0" ::"r"(value)); // MSTATUS (0x300)
-}
-
-static inline uint32_t get_mtvec(){
-    uint32_t mtvec;
-    asm volatile("csrr %0, 0x305" :"=r"(mtvec):); // MTVEC (0x305)
-    return mtvec;
-}
-
-static inline void set_mtvec(uint32_t value){
-    asm volatile("csrw 0x305, %0" ::"r"(value)); // MTVEC (0x305)
-}
-
-static inline uint32_t get_mepc(){
-    uint32_t mepc;
-    asm volatile("csrr %0, 0x341" :"=r"(mepc):); // MEPC (0x341)
-    return mepc;
-}
-
-static inline void set_mepc(uint32_t value){
-    asm volatile("csrw 0x341, %0" ::"r"(value)); // MEPC (0x341)
-}
-
-static inline uint32_t get_mcause(){
-    uint32_t mcause;
-    asm volatile("csrr %0, 0x342" :"=r"(mcause):); // MCAUSE (0x342)
-    return mcause;
-}
-
-static inline uint32_t get_privlv(){
-    uint32_t privlv;
-    asm volatile("csrr %0, 0xc10" :"=r"(privlv):); // PRIVLV (0xC10)
-    return privlv;
-}
-
-static inline uint32_t get_uhartid(){
-    uint32_t uhartid;
-    asm volatile("csrr %0, 0x014" :"=r"(uhartid):); // UHARTID (0x014)
-    return uhartid;
-}
-
-// Flex-V performance counter control
-static inline void perf_counter_enable(){
-    uint32_t pcer = 3; // Enable cycles (bit 0) and instruction count (bit 1)
-    uint32_t pcmr = 1; // Enable global performance counter
-    asm volatile("csrw 0x7e0, %0" ::"r"(pcer)); // PCER_MACHINE (0x7E0)
-    asm volatile("csrw 0x7e1, %0" ::"r"(pcmr)); // PCMR_MACHINE (0x7E1)
-}
-
-static inline void perf_counter_disable(){
-    uint32_t pcmr = 0; // Disable global performance counter
-    asm volatile("csrw 0x7e1, %0" ::"r"(pcmr)); // PCMR_MACHINE (0x7E1)
-}
-
 #endif /*MAGIA_TILE_UTILS_H*/
diff --git a/sw/utils/redmule_mm_utils.h b/sw/utils/redmule_mm_utils.h
index c08a728..cb06f0b 100644
--- a/sw/utils/redmule_mm_utils.h
+++ b/sw/utils/redmule_mm_utils.h
@@ -52,12 +52,20 @@
 #define REDMULE_REG_MOPCNT  0x38
 
 /* Operations and formats */
-#define gemm_ops    0x1
 #define Float16     0x1
 #define Float16Alt  0x2
 #define Float8      0x3
 #define Float8Alt   0x4
 
+#define gemm_ops    0x1
+#define matmul_ops  0x0
+#define addmax      0x2
+#define addmin      0x3
+#define mulmax      0x4
+#define mulmin      0x5
+#define maxmin      0x6
+#define minmax      0x7
+
 /* HWPE Register Access Functions */
 static inline void redmule_x_add_set(unsigned int value) {
   HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_MARITH0);
diff --git a/sw/utils/tinyprintf.h b/sw/utils/tinyprintf.h
index 914a675..df789e7 100644
--- a/sw/utils/tinyprintf.h
+++ b/sw/utils/tinyprintf.h
@@ -139,7 +139,8 @@ void putf(char *null, char c) {
 /* Optional external types dependencies */
 
 #if TINYPRINTF_DEFINE_TFP_SPRINTF
-# include <sys/types.h>  /* size_t */
+/* size_t: <stddef.h> is compiler-provided and exists even where sys/types.h does not */
+# include <stddef.h>  /* size_t */
 #endif
 
 /* Declarations */
diff --git a/target/sim/src/tile/floo_axi_mesh_1x2_pkg.sv b/target/sim/src/tile/floo_axi_mesh_1x2_pkg.sv
index dcb55bc..46b795f 100644
--- a/target/sim/src/tile/floo_axi_mesh_1x2_pkg.sv
+++ b/target/sim/src/tile/floo_axi_mesh_1x2_pkg.sv
@@ -69,7 +69,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{
   typedef logic[31:0] axi_data_mst_addr_t;
 typedef logic[31:0] axi_data_mst_data_t;
 typedef logic[3:0] axi_data_mst_strb_t;
-typedef logic[1:0] axi_data_mst_id_t;
+typedef logic[2:0] axi_data_mst_id_t;
 typedef logic[0:0] axi_data_mst_user_t;
 `AXI_TYPEDEF_ALL_CT(axi_data_mst,             axi_data_mst_req_t,             axi_data_mst_rsp_t,             axi_data_mst_addr_t,             axi_data_mst_id_t,             axi_data_mst_data_t,             axi_data_mst_strb_t,             axi_data_mst_user_t)
 
@@ -77,7 +77,7 @@ typedef logic[0:0] axi_data_mst_user_t;
   typedef logic[31:0] axi_data_slv_addr_t;
 typedef logic[31:0] axi_data_slv_data_t;
 typedef logic[3:0] axi_data_slv_strb_t;
-typedef logic[3:0] axi_data_slv_id_t;
+typedef logic[5:0] axi_data_slv_id_t;
 typedef logic[0:0] axi_data_slv_user_t;
 `AXI_TYPEDEF_ALL_CT(axi_data_slv,             axi_data_slv_req_t,             axi_data_slv_rsp_t,             axi_data_slv_addr_t,             axi_data_slv_id_t,             axi_data_slv_data_t,             axi_data_slv_strb_t,             axi_data_slv_user_t)
 
@@ -87,8 +87,8 @@ typedef logic[0:0] axi_data_slv_user_t;
   localparam axi_cfg_t AxiCfg = '{    AddrWidth: 32,
     DataWidth: 32,
     UserWidth: 1,
-    InIdWidth: 4,
-    OutIdWidth: 2};
+    InIdWidth: 6,
+    OutIdWidth: 3};
 `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_data_slv, AxiCfg, hdr_t)
 
 `FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp)