From ad26d179da1cef7f2927ab9c48c295bb7cac2028 Mon Sep 17 00:00:00 2001 From: Luca Balboni Date: Wed, 11 Mar 2026 18:29:35 +0100 Subject: [PATCH] Bumped Spatz Core Complex --- .gitignore | 27 +- Bender.local | 2 + Bender.yml | 47 +- Makefile | 102 +- bender_sim.mk | 7 +- bender_synth.mk | 14 +- hw/mesh/magia_pkg.sv | 4 +- hw/mesh/noc/floo_axi_mesh_16x16_noc.sv | 8 +- hw/mesh/noc/floo_axi_mesh_2x2_noc.sv | 8 +- hw/mesh/noc/floo_axi_mesh_32x32_noc.sv | 8 +- hw/mesh/noc/floo_axi_mesh_4x4_noc.sv | 8 +- hw/mesh/noc/floo_axi_mesh_8x8_noc.sv | 8 +- .../floonoc_axi_mesh_16x16_config.yml | 4 +- .../floonoc_axi_mesh_2x2_config.yml | 4 +- .../floonoc_axi_mesh_32x32_config.yml | 4 +- .../floonoc_axi_mesh_4x4_config.yml | 4 +- .../floonoc_axi_mesh_8x8_config.yml | 4 +- hw/tile/converters/reqrsp2obi.sv | 90 + hw/tile/converters/reqrsp64_to_obi32.sv | 104 + hw/tile/converters/tcdm2hci.sv | 66 + hw/tile/converters/tcdm2hci_atomic.sv | 166 ++ hw/tile/converters/tcdm2obi.sv | 84 + hw/tile/converters/tcdm64_to_dual_hci32.sv | 103 + .../converters/tcdm64_to_dual_hci32_atomic.sv | 136 ++ hw/tile/converters/tcdm64_to_dual_tcdm32.sv | 188 ++ hw/tile/eu_direct_cut.sv | 105 + hw/tile/magia_event_unit.sv | 46 +- hw/tile/magia_tile.sv | 421 +++- hw/tile/magia_tile_pkg.sv | 329 ++- hw/tile/obi_slave_ctrl_spatz.sv | 179 ++ hw/tile/spatz_bootrom.sv | 39 + hw/tile/spatz_cc_wrapper.sv | 489 +++++ scripts/bin2header.py | 67 + scripts/generate_spatz_bootrom.py | 100 + setup_env.sh | 4 +- spatz/README.md | 360 ++++ spatz/bootrom/Makefile | 96 + spatz/bootrom/spatz_init.S | 36 + spatz/bootrom/spatz_init.ld | 62 + spatz/sw/Makefile | 193 ++ spatz/sw/kernel/spatz_crt0.S | 169 ++ spatz/sw/kernel/spatz_program.ld | 86 + spatz/sw/spatz_utils/spatz_workers.h | 1760 +++++++++++++++++ spatz/sw/tasks/atomic_remote_task.c | 66 + spatz/sw/tasks/dotp_task.c | 57 + spatz/sw/tasks/hello_task.c | 47 + spatz/sw/tasks/hello_world_simple_task.c | 26 + spatz/sw/tasks/idma_simple_task.c | 96 + 
.../sw/tasks/instructions_sweep_simple_task.c | 426 ++++ spatz/sw/tasks/interconnect32_simple_task.c | 131 ++ spatz/sw/tasks/interconnect64_simple_task.c | 174 ++ spatz/sw/tasks/matmul16_task.c | 60 + spatz/sw/tasks/matvec16_task.c | 207 ++ spatz/sw/tasks/matvectrans16_task.c | 206 ++ spatz/sw/tasks/vecsum16_task.c | 78 + sw/kernel/link.ld | 15 +- sw/tests/spatz_tests/atomic_mesh_spatz_test.c | 95 + sw/tests/spatz_tests/dotp_mesh_spatz_test.c | 137 ++ sw/tests/spatz_tests/hello_mesh_spatz_test.c | 61 + .../spatz_tests/matmul_compare_spatz_test.c | 195 ++ sw/tests/spatz_tests/simple_spatz_test.c | 66 + sw/utils/event_unit_utils.h | 35 + sw/utils/magia_spatz_utils.h | 88 + sw/utils/magia_tile_utils.h | 77 +- sw/utils/redmule_mm_utils.h | 10 +- sw/utils/tinyprintf.h | 3 +- target/sim/src/tile/floo_axi_mesh_1x2_pkg.sv | 8 +- 67 files changed, 7798 insertions(+), 307 deletions(-) create mode 100644 hw/tile/converters/reqrsp2obi.sv create mode 100644 hw/tile/converters/reqrsp64_to_obi32.sv create mode 100644 hw/tile/converters/tcdm2hci.sv create mode 100644 hw/tile/converters/tcdm2hci_atomic.sv create mode 100644 hw/tile/converters/tcdm2obi.sv create mode 100644 hw/tile/converters/tcdm64_to_dual_hci32.sv create mode 100644 hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv create mode 100644 hw/tile/converters/tcdm64_to_dual_tcdm32.sv create mode 100644 hw/tile/eu_direct_cut.sv create mode 100644 hw/tile/obi_slave_ctrl_spatz.sv create mode 100644 hw/tile/spatz_bootrom.sv create mode 100644 hw/tile/spatz_cc_wrapper.sv create mode 100644 scripts/bin2header.py create mode 100644 scripts/generate_spatz_bootrom.py create mode 100644 spatz/README.md create mode 100644 spatz/bootrom/Makefile create mode 100644 spatz/bootrom/spatz_init.S create mode 100644 spatz/bootrom/spatz_init.ld create mode 100644 spatz/sw/Makefile create mode 100644 spatz/sw/kernel/spatz_crt0.S create mode 100644 spatz/sw/kernel/spatz_program.ld create mode 100644 spatz/sw/spatz_utils/spatz_workers.h create 
mode 100644 spatz/sw/tasks/atomic_remote_task.c create mode 100644 spatz/sw/tasks/dotp_task.c create mode 100644 spatz/sw/tasks/hello_task.c create mode 100644 spatz/sw/tasks/hello_world_simple_task.c create mode 100644 spatz/sw/tasks/idma_simple_task.c create mode 100644 spatz/sw/tasks/instructions_sweep_simple_task.c create mode 100644 spatz/sw/tasks/interconnect32_simple_task.c create mode 100644 spatz/sw/tasks/interconnect64_simple_task.c create mode 100644 spatz/sw/tasks/matmul16_task.c create mode 100644 spatz/sw/tasks/matvec16_task.c create mode 100644 spatz/sw/tasks/matvectrans16_task.c create mode 100644 spatz/sw/tasks/vecsum16_task.c create mode 100644 sw/tests/spatz_tests/atomic_mesh_spatz_test.c create mode 100644 sw/tests/spatz_tests/dotp_mesh_spatz_test.c create mode 100644 sw/tests/spatz_tests/hello_mesh_spatz_test.c create mode 100644 sw/tests/spatz_tests/matmul_compare_spatz_test.c create mode 100644 sw/tests/spatz_tests/simple_spatz_test.c create mode 100644 sw/utils/magia_spatz_utils.h diff --git a/.gitignore b/.gitignore index 7c8a8fa..d275390 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,29 @@ build-hw.log profile-ips.log magia_venv/ modelsim.ini -# sw/tests/*/ + +sw/tests/*/build/ +sw/tests/*/logs/ +sw/tests/*/work/ +sw/tests/*/work +sw/tests/*/transcript +sw/tests/*/*.log + +sw/tests/*/*/build/ +sw/tests/*/*/logs/ +sw/tests/*/*/work/ +sw/tests/*/*/work +sw/tests/*/*/transcript +sw/tests/*/*/*.log + +spatz/bootrom/*.elf +spatz/bootrom/*.bin +spatz/bootrom/*.dump +spatz/bootrom/*.hex + +spatz/sw/bin/*.elf +spatz/sw/bin/*.bin +spatz/sw/bin/*.dump +spatz/sw/bin/*.o + +spatz/sw/headers_bin/*.h \ No newline at end of file diff --git a/Bender.local b/Bender.local index a7bab39..dc33deb 100644 --- a/Bender.local +++ b/Bender.local @@ -7,3 +7,5 @@ overrides: register_interface : { git: "https://github.com/pulp-platform/register_interface.git", rev: e25b36670ff7aab3402f40efcc2b11ee0f31cf19 } idma : { git: 
"https://github.com/pulp-platform/iDMA.git" , rev: c12caf59bb482fe44b27361f6924ad346b2d22fe } tech_cells_generic : { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.13 } + cluster_icache : { git: "https://github.com/pulp-platform/cluster_icache.git" , rev: f88227d287251812b54c77fbfa608aa6dcd92b31 } + axi_riscv_atomics : { git: "https://github.com/pulp-platform/axi_riscv_atomics.git" , version: 0.8.2 } \ No newline at end of file diff --git a/Bender.yml b/Bender.yml index b96784d..a7f1435 100644 --- a/Bender.yml +++ b/Bender.yml @@ -26,12 +26,13 @@ package: dependencies: redmule : { git: "https://github.com/pulp-platform/redmule.git" , rev: 944d4a4d45fe05147cfbf7f872af677578f3b15c } # branch: fc/ooo-mux cv32e40x : { git: "https://github.com/pulp-platform/cv32e40x.git" , rev: a90101211048ba1a16cedbe4db963ab6e12569d7 } # branch: vi/redmule_scaleup - cv32e40p : { git: "https://github.com/pulp-platform/cv32e40p.git" , rev: 37a82d337ba60129c333d104c29e816d0698b53b } + cv32e40p : { git: "https://github.com/pulp-platform/cv32e40p.git" , rev: f5241403d5d65dbe1fffacd7035dd7ae1359c8ef } # branch: lb/magia_core + spatz : { git: "https://github.com/pulp-platform/spatz.git" , rev: 2a524191c8a6472d83fe69ce3a24f90bee5b4ef0 } # branch: lb/magia-spatz_cc idma : { git: "https://github.com/pulp-platform/iDMA.git" , rev: a6b190c7991331432afa9a2899d032bc1b176830 } # branch: vi/redmule_scaleup hwpe-stream : { git: "https://github.com/pulp-platform/hwpe-stream.git" , version: 1.6 } hwpe-ctrl : { git: "https://github.com/pulp-platform/hwpe-ctrl.git" , version: 3.0.0 } hci : { git: "https://github.com/pulp-platform/hci.git" , version: 2.3.0 } - cluster_icache : { git: "https://github.com/pulp-platform/cluster_icache.git" , rev: 917ecbf908bdaa22c5713bbcff277d142506bb16 } # branch: michaero/astral + cluster_icache : { git: "https://github.com/pulp-platform/cluster_icache.git" , rev: f88227d287251812b54c77fbfa608aa6dcd92b31 } # branch: lb/magia-spatz fpnew : { git: 
"https://github.com/pulp-platform/cvfpu.git" , rev: "pulp-v0.1.3" } fpu_ss : { git: "https://github.com/pulp-platform/fpu_ss.git" , rev: 8e2eff774d9d38a1e17a46bd56a0936dac9522f0 } # branch: vi/bender_manifest obi : { git: "https://github.com/pulp-platform/obi.git" , rev: 528dc65303d5ffb02fbc254324c6b53eac0dd6e5 } # branch: lb/fix_atop_resolver @@ -69,6 +70,14 @@ sources: - hw/tile/converters/hci2obi.sv - hw/tile/converters/xif_if2struct.sv - hw/tile/converters/obi2hwpe_ctrl.sv + - hw/tile/converters/tcdm2hci.sv + - hw/tile/converters/tcdm2obi.sv + - hw/tile/converters/tcdm2hci_atomic.sv + - hw/tile/converters/reqrsp2obi.sv + - hw/tile/converters/tcdm64_to_dual_hci32.sv + - hw/tile/converters/tcdm64_to_dual_tcdm32.sv + - hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv + - hw/tile/converters/reqrsp64_to_obi32.sv - hw/tile/xbar_periph_bus_if.sv - hw/tile/cluster_event_map.sv - hw/tile/magia_event_unit.sv @@ -82,7 +91,11 @@ sources: - hw/tile/idma_ctrl_mm.sv - hw/tile/fractal_sync_xif_inst_decoder.sv - hw/tile/obi_slave_fsync.sv + - hw/tile/obi_slave_ctrl_spatz.sv + - hw/tile/spatz_bootrom.sv + - hw/tile/spatz_cc_wrapper.sv - hw/tile/core_data_demux_eu_direct.sv + - hw/tile/eu_direct_cut.sv - hw/tile/magia_redmule_wrap.sv - hw/tile/magia_tile.sv # MAGIA DV @@ -119,6 +132,14 @@ sources: - hw/tile/converters/hci2obi.sv - hw/tile/converters/xif_if2struct.sv - hw/tile/converters/obi2hwpe_ctrl.sv + - hw/tile/converters/tcdm2hci.sv + - hw/tile/converters/tcdm2obi.sv + - hw/tile/converters/tcdm2hci_atomic.sv + - hw/tile/converters/reqrsp2obi.sv + - hw/tile/converters/tcdm64_to_dual_hci32.sv + - hw/tile/converters/tcdm64_to_dual_tcdm32.sv + - hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv + - hw/tile/converters/reqrsp64_to_obi32.sv - hw/tile/xbar_periph_bus_if.sv - hw/tile/cluster_event_map.sv - hw/tile/magia_event_unit.sv @@ -132,7 +153,11 @@ sources: - hw/tile/idma_ctrl_mm.sv - hw/tile/fractal_sync_xif_inst_decoder.sv - hw/tile/obi_slave_fsync.sv + - 
hw/tile/obi_slave_ctrl_spatz.sv + - hw/tile/spatz_bootrom.sv + - hw/tile/spatz_cc_wrapper.sv - hw/tile/core_data_demux_eu_direct.sv + - hw/tile/eu_direct_cut.sv - hw/tile/magia_redmule_wrap.sv - hw/tile/magia_tile.sv # MAGIA @@ -153,6 +178,7 @@ sources: - hw/mesh/noc/floo_axi_mesh_8x8_noc.sv - hw/mesh/noc/floo_axi_mesh_16x16_noc.sv - hw/mesh/noc/floo_axi_mesh_32x32_noc.sv + - target/sim/src/tile/floo_axi_mesh_1x2_pkg.sv # MAGIA Packages - hw/mesh/magia_pkg.sv - hw/tile/magia_tile_pkg.sv @@ -168,6 +194,14 @@ sources: - hw/tile/converters/hci2obi.sv - hw/tile/converters/xif_if2struct.sv - hw/tile/converters/obi2hwpe_ctrl.sv + - hw/tile/converters/tcdm2hci.sv + - hw/tile/converters/tcdm2obi.sv + - hw/tile/converters/tcdm2hci_atomic.sv + - hw/tile/converters/reqrsp2obi.sv + - hw/tile/converters/tcdm64_to_dual_hci32.sv + - hw/tile/converters/tcdm64_to_dual_tcdm32.sv + - hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv + - hw/tile/converters/reqrsp64_to_obi32.sv - hw/tile/xbar_periph_bus_if.sv - hw/tile/cluster_event_map.sv - hw/tile/magia_event_unit.sv @@ -181,15 +215,14 @@ sources: - hw/tile/idma_ctrl_mm.sv - hw/tile/fractal_sync_xif_inst_decoder.sv - hw/tile/obi_slave_fsync.sv + - hw/tile/obi_slave_ctrl_spatz.sv + - hw/tile/spatz_bootrom.sv + - hw/tile/spatz_cc_wrapper.sv - hw/tile/core_data_demux_eu_direct.sv + - hw/tile/eu_direct_cut.sv - hw/tile/magia_redmule_wrap.sv - hw/tile/magia_tile.sv # MAGIA - - hw/mesh/noc/floo_axi_mesh_2x2_noc.sv - - hw/mesh/noc/floo_axi_mesh_4x4_noc.sv - - hw/mesh/noc/floo_axi_mesh_8x8_noc.sv - - hw/mesh/noc/floo_axi_mesh_16x16_noc.sv - - hw/mesh/noc/floo_axi_mesh_32x32_noc.sv - hw/mesh/magia.sv # Tech - pd/sourcecode/tc_sram.sv diff --git a/Makefile b/Makefile index dbeb6b6..c4c277a 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ # Paths to folders ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))) -core ?= CV32E40X +core ?= CV32E40P MAGIA_DIR ?= $(shell pwd) @@ -43,7 +43,7 @@ XLEN ?= 32 ifeq ($(core), CV32E40X) XTEN = 
imafc else - XTEN = imfcxpulpv2 + XTEN = imcxgap9 endif ABI ?= ilp XABI ?= f @@ -55,7 +55,9 @@ XABI ?= f #endif TEST_DIR := sw/tests -TEST_SRCS = $(TEST_DIR)/$(test).c +# Auto-detect test location in any subdirectory +TEST_SUBDIR = $(filter-out .,$(shell find $(TEST_DIR) -name "$(test).c" -printf "%P\n" 2>/dev/null | head -1 | xargs dirname 2>/dev/null)) +TEST_SRCS = $(TEST_DIR)/$(if $(TEST_SUBDIR),$(TEST_SUBDIR)/)$(test).c compile_script ?= scripts/compile.tcl compile_script_synth ?= scripts/synth_compile.tcl @@ -104,6 +106,7 @@ endif INC += -Isw INC += -Isw/inc INC += -Isw/utils +INC += -Ispatz/sw/headers_bin BOOTSCRIPT := sw/kernel/crt0.S LINKSCRIPT := sw/kernel/link.ld @@ -114,15 +117,23 @@ OBJDUMP=$(ISA)$(XLEN)-unknown-elf-objdump CC_OPTS=-march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XLEN)$(XABI) -D__$(ISA)__ -O2 -g -Wextra -Wall -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wundef -fdata-sections -ffunction-sections -MMD -MP LD_OPTS=-march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XLEN)$(XABI) -D__$(ISA)__ -MMD -MP -nostartfiles -nostdlib -Wl,--gc-sections +# Spatz embedded binary support (via header) +SPATZ_SW_DIR := spatz/sw + +# Auto-detect which Spatz tasks are used by looking for *_TASK symbols in CV32 code +# Example: HELLO_WORLD_TASK → hello_world_task +SPATZ_TASKS := $(shell grep -oP '\b(?!SPATZ_)[A-Z][A-Z0-9_]*_TASK\b' $(TEST_SRCS) 2>/dev/null | tr '[:upper:]' '[:lower:]' | awk '!seen[$$0]++') + # Setup build object dirs -CRT=$(TEST_DIR)/$(test)/build/crt0.o -OBJ=$(TEST_DIR)/$(test)/build/verif.o -BIN=$(TEST_DIR)/$(test)/build/verif -DUMP=$(TEST_DIR)/$(test)/build/verif.dump -ODUMP=$(TEST_DIR)/$(test)/build/verif.objdump -ITB=$(TEST_DIR)/$(test)/build/verif.itb -STIM_INSTR=$(TEST_DIR)/$(test)/build/stim_instr.txt -STIM_DATA=$(TEST_DIR)/$(test)/build/stim_data.txt +TEST_BUILD_DIR = $(TEST_DIR)/$(if $(TEST_SUBDIR),$(TEST_SUBDIR)/)$(test) +CRT=$(TEST_BUILD_DIR)/build/crt0.o +OBJ=$(TEST_BUILD_DIR)/build/verif.o 
+BIN=$(TEST_BUILD_DIR)/build/verif +DUMP=$(TEST_BUILD_DIR)/build/verif.dump +ODUMP=$(TEST_BUILD_DIR)/build/verif.objdump +ITB=$(TEST_BUILD_DIR)/build/verif.itb +STIM_INSTR=$(TEST_BUILD_DIR)/build/stim_instr.txt +STIM_DATA=$(TEST_BUILD_DIR)/build/stim_data.txt VSIM_INI=modelsim.ini VSIM_LIBS=work @@ -131,25 +142,40 @@ $(STIM_INSTR) $(STIM_DATA): $(BIN) objcopy --srec-len 1 --output-target=srec $(BIN) $(BIN).s19 && \ scripts/parse_s19.pl $(BIN).s19 > $(BIN).txt && \ $(BASE_PYTHON) scripts/s19tomem.py $(BIN).txt $(STIM_INSTR) $(STIM_DATA) - cd $(TEST_DIR)/$(test) && \ + cd $(TEST_BUILD_DIR) && \ ln -sfn $(ROOT_DIR)/$(INI_PATH) $(VSIM_INI) && \ ln -sfn $(ROOT_DIR)/$(WORK_PATH) $(VSIM_LIBS) +# Build Spatz binary with auto-detected tasks (only if tasks are used) +# Generate test-specific header: test_name_task_bin.h +.PHONY: spatz-header +spatz-header: + @if [ -n "$(SPATZ_TASKS)" ]; then \ + echo "[SPATZ] Auto-detected tasks: $(SPATZ_TASKS)"; \ + $(MAKE) -C $(SPATZ_SW_DIR) task="$(SPATZ_TASKS)" TEST_NAME=$(test) SPATZ_RVD=$(SPATZ_RVD) SPATZ_VLEN=$(SPATZ_VLEN) SPATZ_N_IPU=$(SPATZ_N_IPU) SPATZ_N_FPU=$(SPATZ_N_FPU) SPATZ_XDIVSQRT=$(SPATZ_XDIVSQRT) SPATZ_XDMA=$(SPATZ_XDMA) SPATZ_RVF=$(SPATZ_RVF) SPATZ_RVV=$(SPATZ_RVV) all; \ + else \ + echo "[SPATZ] No Spatz tasks detected - skipping Spatz compilation"; \ + fi + $(BIN): $(CRT) $(OBJ) + @if [ -n "$(SPATZ_TASKS)" ]; then \ + echo "[CV32-LINK] Linking with embedded Spatz binary (tasks: $(SPATZ_TASKS))"; \ + else \ + echo "[CV32-LINK] Linking without Spatz binary"; \ + fi $(LD) $(LD_OPTS) -o $(BIN) $(CRT) $(OBJ) -T$(LINKSCRIPT) $(CRT): - cd $(TEST_DIR) && \ - mkdir -p $(test) && \ - cd $(test) && \ - mkdir -p build + mkdir -p $(TEST_BUILD_DIR)/build $(CC) $(CC_OPTS) -c $(BOOTSCRIPT) -o $(CRT) +# Compile CV32 test (depends on spatz-header only if tasks detected) +ifneq ($(SPATZ_TASKS),) +$(OBJ): spatz-header +endif + $(OBJ): - cd $(TEST_DIR) && \ - mkdir -p $(test) && \ - cd $(test) && \ - mkdir -p build + mkdir -p 
$(TEST_BUILD_DIR)/build $(CC) $(CC_OPTS) -c $(TEST_SRCS) $(FLAGS) $(INC) -o $(OBJ) SHELL := /bin/bash @@ -176,7 +202,7 @@ all: $(STIM_INSTR) $(STIM_DATA) dis objdump itb # Run the simulation run: $(CRT) ifeq ($(gui), 0) - cd $(TEST_DIR)/$(test); \ + cd $(TEST_BUILD_DIR); \ $(QUESTA) vsim -c vopt_tb $(questa_run_fast_flag) -l transcript -do "run -a" \ +INST_HEX=$(inst_hex_name) \ +DATA_HEX=$(data_hex_name) \ @@ -188,7 +214,7 @@ ifeq ($(gui), 0) ) \ +itb_file=$(itb_file) else - cd $(TEST_DIR)/$(test); \ + cd $(TEST_BUILD_DIR); \ $(QUESTA) vsim vopt_tb $(questa_run_flag) -l transcript \ -do "add log -r sim:/$(tb)/*" \ -do "source $(WAVES)" \ @@ -234,6 +260,7 @@ bender_targs += -t cv32e40p_include_tracer # Targets needed to avoid error even though the module is not used bender_targs += -t snitch_cluster bender_targs += -t idma_test +bender_targs += -t spatz #ifeq ($(REDMULE_COMPLEX),1) # tb := redmule_complex_tb @@ -258,6 +285,31 @@ else bender_targs += -t redmule_hwpe endif +# Define for Spatz target +bender_defs += -D TARGET_SPATZ +SPATZ_RVD ?= 0 # 0: 32-bit TCDM (ELEN=32), 1: 64-bit TCDM (ELEN=64) +SPATZ_VLEN ?= 256 # Vector length in bits (128, 256, 512, ...) 
+SPATZ_NRVREG ?= 32 # Number of vector registers (RISC-V standard=32) +SPATZ_NR_VRF_BANKS ?= 4 # Number of VRF banks (banking parallelism: 2, 4, 8) +SPATZ_N_IPU ?= 1 # Number of Integer Processing Units (1-8) +SPATZ_N_FPU ?= 4 # Number of Floating Point Units (1-8) +SPATZ_NR_PARALLEL_INSTR ?= 4 # Number of parallel vector instructions (scoreboard depth) +SPATZ_XDIVSQRT ?= 0 # 0: FP div/sqrt disabled, 1: enabled (increases area) +SPATZ_XDMA ?= 0 # 0: DMA disabled, 1: enabled +SPATZ_RVF ?= 1 # 0: single-precision FP disabled, 1: enabled +SPATZ_RVV ?= 1 # 0: vector extension disabled, 1: enabled +bender_defs += -D SPATZ_RVD=$(SPATZ_RVD) +bender_defs += -D SPATZ_VLEN=$(SPATZ_VLEN) +bender_defs += -D SPATZ_NRVREG=$(SPATZ_NRVREG) +bender_defs += -D SPATZ_NR_VRF_BANKS=$(SPATZ_NR_VRF_BANKS) +bender_defs += -D SPATZ_N_IPU=$(SPATZ_N_IPU) +bender_defs += -D SPATZ_N_FPU=$(SPATZ_N_FPU) +bender_defs += -D SPATZ_NR_PARALLEL_INSTR=$(SPATZ_NR_PARALLEL_INSTR) +bender_defs += -D SPATZ_XDIVSQRT=$(SPATZ_XDIVSQRT) +bender_defs += -D SPATZ_XDMA=$(SPATZ_XDMA) +bender_defs += -D SPATZ_RVF=$(SPATZ_RVF) +bender_defs += -D SPATZ_RVV=$(SPATZ_RVV) + update-ips: $(BENDER) update $(BENDER) script vsim \ @@ -308,7 +360,11 @@ clean-sdk: rm -rf $(SW)/pulp-sdk clean: - rm -rf $(TEST_DIR)/$(test) + rm -rf $(TEST_BUILD_DIR) + @if [ -d "$(SPATZ_SW_DIR)" ]; then \ + echo "[CLEAN] Cleaning Spatz..."; \ + $(MAKE) -C $(SPATZ_SW_DIR) clean; \ + fi dis: $(OBJDUMP) -d -S $(BIN) > $(DUMP) diff --git a/bender_sim.mk b/bender_sim.mk index 86e58f5..0d2ab4f 100644 --- a/bender_sim.mk +++ b/bender_sim.mk @@ -16,12 +16,7 @@ sim_targs += -t rtl sim_targs += -t test sim_targs += -t idma_test - -#ifeq ($(REDMULE_COMPLEX),1) -# sim_targs += -t redmule_test_complex -#else -# sim_targs += -t redmule_test_hwpe -#endif +sim_targs += -t redmule_hwpe ifeq ($(mesh_dv), 0) sim_targs += -t standalone_tile diff --git a/bender_synth.mk b/bender_synth.mk index 4bd5430..46db632 100644 --- a/bender_synth.mk +++ b/bender_synth.mk @@ 
-14,15 +14,15 @@ # SPDX-License-Identifier: Apache-2.0 synth_targs += -t rtl -synth_targs += -t redmule_complex +synth_targs += -t redmule_hwpe synth_targs += -t asic synth_targs += -t synopsys +synth_targs += -t spatz -#ifeq ($(REDMULE_COMPLEX),1) -# synth_defs += -D REDMULE_COMPLEX_SYNTH -#else -# synth_defs += -D REDMULE_HWPE_SYNTH -#endif +ifeq ($(mesh_dv), 0) +synth_defs += -D TARGET_STANDALONE_TILE +endif -synth_defs += -D REDMULE_COMPLEX_SYNTH +synth_defs += -D REDMULE_HWPE_SYNTH synth_defs += -D SYNTHESIS +synth_defs += -D TARGET_SPATZ diff --git a/hw/mesh/magia_pkg.sv b/hw/mesh/magia_pkg.sv index 88852a9..5f35a15 100644 --- a/hw/mesh/magia_pkg.sv +++ b/hw/mesh/magia_pkg.sv @@ -44,11 +44,11 @@ package magia_pkg; localparam int unsigned USR_W = 1; // Default User Width // Parameters used by the NoC - parameter int unsigned AXI_NOC_ID_W = 4; // AXI NoC ID Width: 2 bits on slave port + 2 bits to select from i$, iDMA and core data + parameter int unsigned AXI_NOC_ID_W = 6; // AXI NoC ID Width: matches slave side id_width (6 bits) parameter int unsigned AXI_NOC_U_W = USR_W; // Parameters used by the L2 - parameter int unsigned L2_ID_W = 2; // The ID Width reflects the ID Width of the Tile AXI XBAR + parameter int unsigned L2_ID_W = 3; // The ID Width reflects the slave ID Width of the Tile AXI XBAR (for 5 ports: log2(5)=3) parameter int unsigned L2_U_W = 1; // Parameter used for the Fractal Sync network diff --git a/hw/mesh/noc/floo_axi_mesh_16x16_noc.sv b/hw/mesh/noc/floo_axi_mesh_16x16_noc.sv index 22c3f43..2fd557e 100644 --- a/hw/mesh/noc/floo_axi_mesh_16x16_noc.sv +++ b/hw/mesh/noc/floo_axi_mesh_16x16_noc.sv @@ -603,7 +603,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{ typedef logic[31:0] axi_data_mst_addr_t; typedef logic[31:0] axi_data_mst_data_t; typedef logic[3:0] axi_data_mst_strb_t; -typedef logic[1:0] axi_data_mst_id_t; +typedef logic[2:0] axi_data_mst_id_t; typedef logic[0:0] axi_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_data_mst, 
axi_data_mst_req_t, axi_data_mst_rsp_t, axi_data_mst_addr_t, axi_data_mst_id_t, axi_data_mst_data_t, axi_data_mst_strb_t, axi_data_mst_user_t) @@ -611,7 +611,7 @@ typedef logic[0:0] axi_data_mst_user_t; typedef logic[31:0] axi_data_slv_addr_t; typedef logic[31:0] axi_data_slv_data_t; typedef logic[3:0] axi_data_slv_strb_t; -typedef logic[3:0] axi_data_slv_id_t; +typedef logic[5:0] axi_data_slv_id_t; typedef logic[0:0] axi_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_data_slv, axi_data_slv_req_t, axi_data_slv_rsp_t, axi_data_slv_addr_t, axi_data_slv_id_t, axi_data_slv_data_t, axi_data_slv_strb_t, axi_data_slv_user_t) @@ -621,8 +621,8 @@ typedef logic[0:0] axi_data_slv_user_t; localparam axi_cfg_t AxiCfg = '{ AddrWidth: 32, DataWidth: 32, UserWidth: 1, - InIdWidth: 4, - OutIdWidth: 2}; + InIdWidth: 6, + OutIdWidth: 3}; `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_data_slv, AxiCfg, hdr_t) `FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp) diff --git a/hw/mesh/noc/floo_axi_mesh_2x2_noc.sv b/hw/mesh/noc/floo_axi_mesh_2x2_noc.sv index a1a73f3..9a62e0c 100644 --- a/hw/mesh/noc/floo_axi_mesh_2x2_noc.sv +++ b/hw/mesh/noc/floo_axi_mesh_2x2_noc.sv @@ -71,7 +71,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{ typedef logic[31:0] axi_data_mst_addr_t; typedef logic[31:0] axi_data_mst_data_t; typedef logic[3:0] axi_data_mst_strb_t; -typedef logic[1:0] axi_data_mst_id_t; +typedef logic[2:0] axi_data_mst_id_t; typedef logic[0:0] axi_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_data_mst, axi_data_mst_req_t, axi_data_mst_rsp_t, axi_data_mst_addr_t, axi_data_mst_id_t, axi_data_mst_data_t, axi_data_mst_strb_t, axi_data_mst_user_t) @@ -79,7 +79,7 @@ typedef logic[0:0] axi_data_mst_user_t; typedef logic[31:0] axi_data_slv_addr_t; typedef logic[31:0] axi_data_slv_data_t; typedef logic[3:0] axi_data_slv_strb_t; -typedef logic[3:0] axi_data_slv_id_t; +typedef logic[5:0] axi_data_slv_id_t; typedef logic[0:0] axi_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_data_slv, axi_data_slv_req_t, 
axi_data_slv_rsp_t, axi_data_slv_addr_t, axi_data_slv_id_t, axi_data_slv_data_t, axi_data_slv_strb_t, axi_data_slv_user_t) @@ -89,8 +89,8 @@ typedef logic[0:0] axi_data_slv_user_t; localparam axi_cfg_t AxiCfg = '{ AddrWidth: 32, DataWidth: 32, UserWidth: 1, - InIdWidth: 4, - OutIdWidth: 2}; + InIdWidth: 6, + OutIdWidth: 3}; `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_data_slv, AxiCfg, hdr_t) `FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp) diff --git a/hw/mesh/noc/floo_axi_mesh_32x32_noc.sv b/hw/mesh/noc/floo_axi_mesh_32x32_noc.sv index 05e14d1..be0bca2 100644 --- a/hw/mesh/noc/floo_axi_mesh_32x32_noc.sv +++ b/hw/mesh/noc/floo_axi_mesh_32x32_noc.sv @@ -2171,7 +2171,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{ typedef logic[31:0] axi_data_mst_addr_t; typedef logic[31:0] axi_data_mst_data_t; typedef logic[3:0] axi_data_mst_strb_t; -typedef logic[1:0] axi_data_mst_id_t; +typedef logic[2:0] axi_data_mst_id_t; typedef logic[0:0] axi_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_data_mst, axi_data_mst_req_t, axi_data_mst_rsp_t, axi_data_mst_addr_t, axi_data_mst_id_t, axi_data_mst_data_t, axi_data_mst_strb_t, axi_data_mst_user_t) @@ -2179,7 +2179,7 @@ typedef logic[0:0] axi_data_mst_user_t; typedef logic[31:0] axi_data_slv_addr_t; typedef logic[31:0] axi_data_slv_data_t; typedef logic[3:0] axi_data_slv_strb_t; -typedef logic[3:0] axi_data_slv_id_t; +typedef logic[5:0] axi_data_slv_id_t; typedef logic[0:0] axi_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_data_slv, axi_data_slv_req_t, axi_data_slv_rsp_t, axi_data_slv_addr_t, axi_data_slv_id_t, axi_data_slv_data_t, axi_data_slv_strb_t, axi_data_slv_user_t) @@ -2189,8 +2189,8 @@ typedef logic[0:0] axi_data_slv_user_t; localparam axi_cfg_t AxiCfg = '{ AddrWidth: 32, DataWidth: 32, UserWidth: 1, - InIdWidth: 4, - OutIdWidth: 2}; + InIdWidth: 6, + OutIdWidth: 3}; `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_data_slv, AxiCfg, hdr_t) `FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp) diff --git a/hw/mesh/noc/floo_axi_mesh_4x4_noc.sv 
b/hw/mesh/noc/floo_axi_mesh_4x4_noc.sv index 38b4156..688d4e4 100644 --- a/hw/mesh/noc/floo_axi_mesh_4x4_noc.sv +++ b/hw/mesh/noc/floo_axi_mesh_4x4_noc.sv @@ -99,7 +99,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{ typedef logic[31:0] axi_data_mst_addr_t; typedef logic[31:0] axi_data_mst_data_t; typedef logic[3:0] axi_data_mst_strb_t; -typedef logic[1:0] axi_data_mst_id_t; +typedef logic[2:0] axi_data_mst_id_t; typedef logic[0:0] axi_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_data_mst, axi_data_mst_req_t, axi_data_mst_rsp_t, axi_data_mst_addr_t, axi_data_mst_id_t, axi_data_mst_data_t, axi_data_mst_strb_t, axi_data_mst_user_t) @@ -107,7 +107,7 @@ typedef logic[0:0] axi_data_mst_user_t; typedef logic[31:0] axi_data_slv_addr_t; typedef logic[31:0] axi_data_slv_data_t; typedef logic[3:0] axi_data_slv_strb_t; -typedef logic[3:0] axi_data_slv_id_t; +typedef logic[5:0] axi_data_slv_id_t; typedef logic[0:0] axi_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_data_slv, axi_data_slv_req_t, axi_data_slv_rsp_t, axi_data_slv_addr_t, axi_data_slv_id_t, axi_data_slv_data_t, axi_data_slv_strb_t, axi_data_slv_user_t) @@ -117,8 +117,8 @@ typedef logic[0:0] axi_data_slv_user_t; localparam axi_cfg_t AxiCfg = '{ AddrWidth: 32, DataWidth: 32, UserWidth: 1, - InIdWidth: 4, - OutIdWidth: 2}; + InIdWidth: 6, + OutIdWidth: 3}; `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_data_slv, AxiCfg, hdr_t) `FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp) diff --git a/hw/mesh/noc/floo_axi_mesh_8x8_noc.sv b/hw/mesh/noc/floo_axi_mesh_8x8_noc.sv index 90483b5..5f7b3bf 100644 --- a/hw/mesh/noc/floo_axi_mesh_8x8_noc.sv +++ b/hw/mesh/noc/floo_axi_mesh_8x8_noc.sv @@ -203,7 +203,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{ typedef logic[31:0] axi_data_mst_addr_t; typedef logic[31:0] axi_data_mst_data_t; typedef logic[3:0] axi_data_mst_strb_t; -typedef logic[1:0] axi_data_mst_id_t; +typedef logic[2:0] axi_data_mst_id_t; typedef logic[0:0] axi_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_data_mst, 
axi_data_mst_req_t, axi_data_mst_rsp_t, axi_data_mst_addr_t, axi_data_mst_id_t, axi_data_mst_data_t, axi_data_mst_strb_t, axi_data_mst_user_t) @@ -211,7 +211,7 @@ typedef logic[0:0] axi_data_mst_user_t; typedef logic[31:0] axi_data_slv_addr_t; typedef logic[31:0] axi_data_slv_data_t; typedef logic[3:0] axi_data_slv_strb_t; -typedef logic[3:0] axi_data_slv_id_t; +typedef logic[5:0] axi_data_slv_id_t; typedef logic[0:0] axi_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_data_slv, axi_data_slv_req_t, axi_data_slv_rsp_t, axi_data_slv_addr_t, axi_data_slv_id_t, axi_data_slv_data_t, axi_data_slv_strb_t, axi_data_slv_user_t) @@ -221,8 +221,8 @@ typedef logic[0:0] axi_data_slv_user_t; localparam axi_cfg_t AxiCfg = '{ AddrWidth: 32, DataWidth: 32, UserWidth: 1, - InIdWidth: 4, - OutIdWidth: 2}; + InIdWidth: 6, + OutIdWidth: 3}; `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_data_slv, AxiCfg, hdr_t) `FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp) diff --git a/hw/mesh/noc_configs/floonoc_axi_mesh_16x16_config.yml b/hw/mesh/noc_configs/floonoc_axi_mesh_16x16_config.yml index b85ac94..44e68a4 100644 --- a/hw/mesh/noc_configs/floonoc_axi_mesh_16x16_config.yml +++ b/hw/mesh/noc_configs/floonoc_axi_mesh_16x16_config.yml @@ -11,13 +11,13 @@ protocols: protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 2 + id_width: 3 user_width: 1 - name: "data_slv" protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 4 + id_width: 6 user_width: 1 endpoints: diff --git a/hw/mesh/noc_configs/floonoc_axi_mesh_2x2_config.yml b/hw/mesh/noc_configs/floonoc_axi_mesh_2x2_config.yml index 69c4628..ea375b2 100644 --- a/hw/mesh/noc_configs/floonoc_axi_mesh_2x2_config.yml +++ b/hw/mesh/noc_configs/floonoc_axi_mesh_2x2_config.yml @@ -11,13 +11,13 @@ protocols: protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 2 + id_width: 3 user_width: 1 - name: "data_slv" protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 4 + id_width: 6 user_width: 1 endpoints: diff --git 
a/hw/mesh/noc_configs/floonoc_axi_mesh_32x32_config.yml b/hw/mesh/noc_configs/floonoc_axi_mesh_32x32_config.yml index 44efa46..65bd166 100644 --- a/hw/mesh/noc_configs/floonoc_axi_mesh_32x32_config.yml +++ b/hw/mesh/noc_configs/floonoc_axi_mesh_32x32_config.yml @@ -11,13 +11,13 @@ protocols: protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 2 + id_width: 3 user_width: 1 - name: "data_slv" protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 4 + id_width: 6 user_width: 1 endpoints: diff --git a/hw/mesh/noc_configs/floonoc_axi_mesh_4x4_config.yml b/hw/mesh/noc_configs/floonoc_axi_mesh_4x4_config.yml index b13e59c..92aaef6 100644 --- a/hw/mesh/noc_configs/floonoc_axi_mesh_4x4_config.yml +++ b/hw/mesh/noc_configs/floonoc_axi_mesh_4x4_config.yml @@ -11,13 +11,13 @@ protocols: protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 2 + id_width: 3 user_width: 1 - name: "data_slv" protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 4 + id_width: 6 user_width: 1 endpoints: diff --git a/hw/mesh/noc_configs/floonoc_axi_mesh_8x8_config.yml b/hw/mesh/noc_configs/floonoc_axi_mesh_8x8_config.yml index 945921d..7246ee0 100644 --- a/hw/mesh/noc_configs/floonoc_axi_mesh_8x8_config.yml +++ b/hw/mesh/noc_configs/floonoc_axi_mesh_8x8_config.yml @@ -11,13 +11,13 @@ protocols: protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 2 + id_width: 3 user_width: 1 - name: "data_slv" protocol: "AXI4" data_width: 32 addr_width: 32 - id_width: 4 + id_width: 6 user_width: 1 endpoints: diff --git a/hw/tile/converters/reqrsp2obi.sv b/hw/tile/converters/reqrsp2obi.sv new file mode 100644 index 0000000..d604a08 --- /dev/null +++ b/hw/tile/converters/reqrsp2obi.sv @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2024 ETH Zurich and University of Bologna + * + * Licensed under the Solderpad Hardware License, Version 0.51 + * (the "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni
+ *
+ * Convert reqrsp to OBI protocol - Direct conversion without AXI intermediate step
+ */
+
+module reqrsp2obi
+  import reqrsp_pkg::*;
+#(
+  parameter obi_pkg::obi_cfg_t ObiCfg       = obi_pkg::ObiDefaultConfig,
+  parameter type               obi_req_t    = logic,
+  parameter type               obi_rsp_t    = logic,
+  parameter type               reqrsp_req_t = logic,
+  parameter type               reqrsp_rsp_t = logic
+) (
+  input  logic        clk_i,
+  input  logic        rst_ni,
+
+  input  reqrsp_req_t reqrsp_req_i,
+  output reqrsp_rsp_t reqrsp_rsp_o,
+
+  output obi_req_t    obi_req_o,
+  input  obi_rsp_t    obi_rsp_i
+);
+
+  // Translate a reqrsp AMO opcode into the 6-bit value driven on the OBI
+  // `atop` field. Opcodes without an explicit entry yield 6'h00, the same
+  // value used for AMONone.
+  function automatic logic [5:0] amo_to_atop(reqrsp_pkg::amo_op_e amo);
+    logic [5:0] atop_code;
+    atop_code = 6'h00;
+    case (amo)
+      reqrsp_pkg::AMONone: atop_code = 6'h00;
+      reqrsp_pkg::AMOSwap: atop_code = 6'h21;
+      reqrsp_pkg::AMOAdd:  atop_code = 6'h20;
+      reqrsp_pkg::AMOAnd:  atop_code = 6'h2C;
+      reqrsp_pkg::AMOOr:   atop_code = 6'h28;
+      reqrsp_pkg::AMOXor:  atop_code = 6'h24;
+      reqrsp_pkg::AMOMax:  atop_code = 6'h34;
+      reqrsp_pkg::AMOMaxu: atop_code = 6'h3C;
+      reqrsp_pkg::AMOMin:  atop_code = 6'h30;
+      reqrsp_pkg::AMOMinu: atop_code = 6'h38;
+      reqrsp_pkg::AMOLR:   atop_code = 6'h22;
+      reqrsp_pkg::AMOSC:   atop_code = 6'h23;
+      default:             atop_code = 6'h00;
+    endcase
+    return atop_code;
+  endfunction
+
+  /*******************************************************************/
+  /* reqrsp request -> OBI address channel (purely combinational)    */
+  /*******************************************************************/
+
+  // Any opcode other than AMONone is issued on OBI with we = 1, even when
+  // q.write itself is low.
+  logic is_amo;
+  assign is_amo = (reqrsp_req_i.q.amo != reqrsp_pkg::AMONone);
+
+  // Mandatory address-channel fields
+  assign obi_req_o.req     = reqrsp_req_i.q_valid;
+  assign obi_req_o.a.addr  = reqrsp_req_i.q.addr;
+  assign obi_req_o.a.we    = reqrsp_req_i.q.write || is_amo;
+  assign obi_req_o.a.wdata = reqrsp_req_i.q.data;
+  assign obi_req_o.a.be    = reqrsp_req_i.q.strb;
+  assign obi_req_o.a.aid   = '0;
+
+  // Optional address-channel fields: only `atop` carries information,
+  // everything else is driven to a constant.
+  assign obi_req_o.a.a_optional.atop    = amo_to_atop(reqrsp_req_i.q.amo);
+  assign obi_req_o.a.a_optional.auser   = '0;
+  assign obi_req_o.a.a_optional.wuser   = '0;
+  assign obi_req_o.a.a_optional.memtype = 2'b00;
+  assign obi_req_o.a.a_optional.mid     = '0;
+  assign obi_req_o.a.a_optional.prot    = 3'b000;
+  assign obi_req_o.a.a_optional.dbg     = 1'b0;
+  assign obi_req_o.a.a_optional.achk    = '0;
+
+  /*******************************************************************/
+  /* OBI response -> reqrsp response                                 */
+  /*******************************************************************/
+
+  assign reqrsp_rsp_o.q_ready = obi_rsp_i.gnt;
+  assign reqrsp_rsp_o.p_valid = obi_rsp_i.rvalid;
+  assign reqrsp_rsp_o.p.data  = obi_rsp_i.r.rdata;
+  assign reqrsp_rsp_o.p.error = obi_rsp_i.r.err;
+
+endmodule
+
+
diff --git a/hw/tile/converters/reqrsp64_to_obi32.sv b/hw/tile/converters/reqrsp64_to_obi32.sv
new file mode 100644
index 0000000..8ba2cb7
--- /dev/null
+++ b/hw/tile/converters/reqrsp64_to_obi32.sv
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni
+ *
+ * REQRSP 64-bit to OBI 32-bit Protocol Converter
+ *
+ */
+
+module reqrsp64_to_obi32
+  import obi_pkg::*;
+#(
+  parameter type               reqrsp_req_t = logic, // 64-bit reqrsp request
+  parameter type               reqrsp_rsp_t = logic, // 64-bit reqrsp response
+  parameter type               obi_req_t    = logic, // 32-bit OBI request
+  parameter type               obi_rsp_t    = logic, // 32-bit OBI response
+  parameter obi_pkg::obi_cfg_t ObiCfg       = obi_pkg::ObiDefaultConfig
+)(
+  input  logic        clk_i,
+  input  logic        rst_ni,
+
+  // 64-bit reqrsp port (from Snitch RV64)
+  input  reqrsp_req_t reqrsp_req_i,
+  output reqrsp_rsp_t reqrsp_rsp_o,
+
+  // 32-bit OBI port (to peripherals/L2)
+  output obi_req_t    obi_req_o,
+  input  obi_rsp_t    obi_rsp_i
+);
+
+  /*******************************************************************/
+  /* Request path: pick one 32-bit half of the 64-bit request        */
+  /*******************************************************************/
+
+  // The upper half is selected as soon as any of strb[7:4] is set; the lower
+  // half is used otherwise. A request with strobes in both halves therefore
+  // forwards only its upper half.
+  logic sel_upper;
+  assign sel_upper = |reqrsp_req_i.q.strb[7:4];
+
+  logic [31:0] half_wdata;
+  logic [3:0]  half_be;
+  assign half_wdata = sel_upper ? reqrsp_req_i.q.data[63:32] : reqrsp_req_i.q.data[31:0];
+  assign half_be    = sel_upper ? reqrsp_req_i.q.strb[7:4]   : reqrsp_req_i.q.strb[3:0];
+
+  // Mandatory address-channel fields (the address is forwarded unmodified)
+  assign obi_req_o.req     = reqrsp_req_i.q_valid;
+  assign obi_req_o.a.addr  = reqrsp_req_i.q.addr;
+  assign obi_req_o.a.we    = reqrsp_req_i.q.write;
+  assign obi_req_o.a.wdata = half_wdata;
+  assign obi_req_o.a.be    = half_be;
+  assign obi_req_o.a.aid   = '0;
+
+  // Optional address-channel fields are all constant; q.amo is not looked at,
+  // so atop is always 6'h00 on this path.
+  assign obi_req_o.a.a_optional.atop    = 6'h00;
+  assign obi_req_o.a.a_optional.auser   = '0;
+  assign obi_req_o.a.a_optional.wuser   = '0;
+  assign obi_req_o.a.a_optional.memtype = 2'b00;
+  assign obi_req_o.a.a_optional.mid     = '0;
+  assign obi_req_o.a.a_optional.prot    = 3'b000;
+  assign obi_req_o.a.a_optional.dbg     = 1'b0;
+  assign obi_req_o.a.a_optional.achk    = '0;
+
+  /*******************************************************************/
+  /* Response path: place the 32-bit read data in the right half     */
+  /*******************************************************************/
+
+  assign reqrsp_rsp_o.q_ready = obi_rsp_i.gnt;
+  assign reqrsp_rsp_o.p_valid = obi_rsp_i.rvalid;
+  assign reqrsp_rsp_o.p.error = obi_rsp_i.r.err;
+
+  // Half selection of the most recently granted request, captured on the
+  // q_valid && gnt handshake and held until the next one.
+  // NOTE(review): only one request is remembered, so this presumably relies on
+  // at most one OBI transaction being outstanding - confirm against the master.
+  logic sel_upper_q;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      sel_upper_q <= 1'b0;
+    end else if (reqrsp_req_i.q_valid && obi_rsp_i.gnt) begin
+      sel_upper_q <= sel_upper;
+    end
+  end
+
+  // The half that was not requested is returned as zeros.
+  assign reqrsp_rsp_o.p.data = sel_upper_q ?
+    {obi_rsp_i.r.rdata, 32'h0000_0000} :  // upper half -> bits [63:32]
+    {32'h0000_0000, obi_rsp_i.r.rdata};   // lower half -> bits [31:0]
+
+endmodule : reqrsp64_to_obi32
diff --git a/hw/tile/converters/tcdm2hci.sv b/hw/tile/converters/tcdm2hci.sv
new file mode 100644
index 0000000..364e56a
--- /dev/null
+++ b/hw/tile/converters/tcdm2hci.sv
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni
+ *
+ * TCDM to HCI Protocol Converter (Direct - No Atomics)
+ */
+
+module tcdm2hci #(
+  parameter type tcdm_req_t = logic,
+  parameter type tcdm_rsp_t = logic,
+  parameter type hci_req_t  = logic,
+  parameter type hci_rsp_t  = logic
+)(
+  input  logic      clk_i,
+  input  logic      rst_ni,
+
+  // TCDM side (Spatz vector ports)
+  input  tcdm_req_t tcdm_req_i,
+  output tcdm_rsp_t tcdm_rsp_o,
+
+  // HCI side (L1 SPM interconnect)
+  output hci_req_t  hci_req_o,
+  input  hci_rsp_t  hci_rsp_i
+);
+
+  /*******************************************************************/
+  /* Request: TCDM -> HCI                                            */
+  /*******************************************************************/
+
+  // Fields taken from the TCDM request. `wen` is the inverse of q.write.
+  // q.amo is not forwarded by this converter.
+  assign hci_req_o.req  = tcdm_req_i.q_valid;
+  assign hci_req_o.add  = tcdm_req_i.q.addr;
+  assign hci_req_o.wen  = ~tcdm_req_i.q.write;
+  assign hci_req_o.data = tcdm_req_i.q.data;
+  assign hci_req_o.be   = tcdm_req_i.q.strb;
+  assign hci_req_o.user = tcdm_req_i.q.user;
+
+  // Fields driven to constants: r_ready is held at 1, the rest at 0.
+  assign hci_req_o.r_ready  = 1'b1;
+  assign hci_req_o.id       = '0;
+  assign hci_req_o.ecc      = '0;
+  assign hci_req_o.ereq     = '0;
+  assign hci_req_o.r_eready = '0;
+
+  /*******************************************************************/
+  /* Response: HCI -> TCDM                                           */
+  /*******************************************************************/
+
+  assign tcdm_rsp_o.q_ready = hci_rsp_i.gnt;
+  assign tcdm_rsp_o.p_valid = hci_rsp_i.r_valid;
+  assign tcdm_rsp_o.p.data  = hci_rsp_i.r_data;
+
+endmodule : tcdm2hci
diff --git a/hw/tile/converters/tcdm2hci_atomic.sv b/hw/tile/converters/tcdm2hci_atomic.sv
new file mode 100644
index 0000000..ce3aac2
--- /dev/null
+++ b/hw/tile/converters/tcdm2hci_atomic.sv
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni
+ *
+ * TCDM to HCI Converter with Atomic Operation Support
+ *
+ * Conversion chain: TCDM (with AMO) → OBI (with atop) → OBI (resolved) → HCI
+ *
+ */
+
+module tcdm2hci_atomic
+  import obi_pkg::*;
+  import magia_tile_pkg::*;
+#(
+  // Request/response struct types of the three protocols involved
+  parameter type tcdm_req_t = logic,
+  parameter type tcdm_rsp_t = logic,
+  parameter type obi_req_t  = logic,
+  parameter type obi_rsp_t  = logic,
+  parameter type hci_req_t  = logic,
+  parameter type hci_rsp_t  = logic,
+
+  // OBI A/R channel types, handed to obi_cut
+  parameter type obi_a_chan_t = logic,
+  parameter type obi_r_chan_t = logic,
+
+  // OBI optional-field types, handed to obi_atop_resolver
+  parameter type obi_a_optional_t = logic,
+  parameter type obi_r_optional_t = logic,
+
+  // OBI configurations of the resolver's subordinate and manager ports
+  parameter obi_pkg::obi_cfg_t SbrPortObiCfg = obi_pkg::ObiDefaultConfig,
+  parameter obi_pkg::obi_cfg_t MgrPortObiCfg = obi_pkg::ObiDefaultConfig,
+
+  // 1: wire the resolver output straight through, 0: insert an obi_cut
+  parameter bit BypassCut = 1'b0
+)(
+  input  logic      clk_i,
+  input  logic      rst_ni,
+
+  // TCDM slave port (with atomic operations)
+  input  tcdm_req_t tcdm_req_i,
+  output tcdm_rsp_t tcdm_rsp_o,
+
+  // HCI master port (atomic-free)
+  output hci_req_t  hci_req_o,
+  input  hci_rsp_t  hci_rsp_i
+);
+
+  // OBI links between the stages:
+  //   obi_req/rsp           tcdm2obi          -> obi_atop_resolver
+  //   obi_resolved_req/rsp  obi_atop_resolver -> cut (or bypass)
+  //   obi_cut_req/rsp       cut (or bypass)   -> OBI/HCI converters
+  obi_req_t obi_req, obi_resolved_req, obi_cut_req;
+  obi_rsp_t obi_rsp, obi_resolved_rsp, obi_cut_rsp;
+
+  /*******************************************************************/
+  /* Stage 1: TCDM -> OBI, AMO opcode carried in the atop field      */
+  /*******************************************************************/
+
+  tcdm2obi #(
+    .tcdm_req_t ( tcdm_req_t ),
+    .tcdm_rsp_t ( tcdm_rsp_t ),
+    .obi_req_t  ( obi_req_t  ),
+    .obi_rsp_t  ( obi_rsp_t  )
+  ) i_tcdm2obi (
+    .clk_i      ( clk_i      ),
+    .rst_ni     ( rst_ni     ),
+    .tcdm_req_i ( tcdm_req_i ),
+    .tcdm_rsp_o ( tcdm_rsp_o ),
+    .obi_req_o  ( obi_req    ),
+    .obi_rsp_i  ( obi_rsp    )
+  );
+
+  /*******************************************************************/
+  /* Stage 2: atomic resolution (LR/SC enabled, test mode tied off)  */
+  /*******************************************************************/
+
+  obi_atop_resolver #(
+    .SbrPortObiCfg             ( SbrPortObiCfg               ),
+    .MgrPortObiCfg             ( MgrPortObiCfg               ),
+    .sbr_port_obi_req_t        ( obi_req_t                   ),
+    .sbr_port_obi_rsp_t        ( obi_rsp_t                   ),
+    .mgr_port_obi_req_t        ( obi_req_t                   ),
+    .mgr_port_obi_rsp_t        ( obi_rsp_t                   ),
+    .mgr_port_obi_a_optional_t ( obi_a_optional_t            ),
+    .mgr_port_obi_r_optional_t ( obi_r_optional_t            ),
+    .LrScEnable                ( 1'b1                        ),
+    .RegisterAmo               ( magia_tile_pkg::RegisterAmo )
+  ) i_obi_atop_resolver (
+    .clk_i          ( clk_i            ),
+    .rst_ni         ( rst_ni           ),
+    .testmode_i     ( 1'b0             ),
+    .sbr_port_req_i ( obi_req          ),
+    .sbr_port_rsp_o ( obi_rsp          ),
+    .mgr_port_req_o ( obi_resolved_req ),
+    .mgr_port_rsp_i ( obi_resolved_rsp )
+  );
+
+  /*******************************************************************/
+  /* Stage 3: optional OBI cut, selected by BypassCut                */
+  /*******************************************************************/
+
+  if (!BypassCut) begin : gen_enable_cut
+    obi_cut #(
+      .ObiCfg       ( MgrPortObiCfg ),
+      .obi_a_chan_t ( obi_a_chan_t  ),
+      .obi_r_chan_t ( obi_r_chan_t  ),
+      .obi_req_t    ( obi_req_t     ),
+      .obi_rsp_t    ( obi_rsp_t     )
+    ) i_obi_cut_post_atomic (
+      .clk_i          ( clk_i            ),
+      .rst_ni         ( rst_ni           ),
+      .sbr_port_req_i ( obi_resolved_req ),
+      .sbr_port_rsp_o ( obi_resolved_rsp ),
+      .mgr_port_req_o ( obi_cut_req      ),
+      .mgr_port_rsp_i ( obi_cut_rsp      )
+    );
+  end else begin : gen_bypass_cut
+    // No register stage: connect both directions directly.
+    assign obi_cut_req      = obi_resolved_req;
+    assign obi_resolved_rsp = obi_cut_rsp;
+  end
+
+  /*******************************************************************/
+  /* Stage 4: OBI -> HCI request, HCI -> OBI response                */
+  /*******************************************************************/
+
+  obi2hci_req #(
+    .obi_req_t ( obi_req_t ),
+    .hci_req_t ( hci_req_t )
+  ) i_obi2hci_req (
+    .obi_req_i ( obi_cut_req ),
+    .hci_req_o ( hci_req_o   )
+  );
+
+  hci2obi_rsp #(
+    .hci_rsp_t ( hci_rsp_t ),
+    .obi_rsp_t ( obi_rsp_t )
+  ) i_hci2obi_rsp (
+    .hci_rsp_i ( hci_rsp_i   ),
+    .obi_rsp_o ( obi_cut_rsp )
+  );
+
+endmodule : tcdm2hci_atomic
diff --git a/hw/tile/converters/tcdm2obi.sv b/hw/tile/converters/tcdm2obi.sv
new file mode 100644
index 0000000..43fc5ce
--- /dev/null
+++ b/hw/tile/converters/tcdm2obi.sv
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni
+ *
+ * TCDM to OBI Protocol Converter - Combinatorial conversion
+ *
+ */
+
+module tcdm2obi
+  import reqrsp_pkg::*;
+#(
+  parameter type tcdm_req_t = logic,
+  parameter type tcdm_rsp_t = logic,
+  parameter type obi_req_t  = logic,
+  parameter type obi_rsp_t  = logic
+)(
+  input  logic      clk_i,
+  input  logic      rst_ni,
+
+  input  tcdm_req_t tcdm_req_i,
+  output tcdm_rsp_t tcdm_rsp_o,
+
+  output obi_req_t  obi_req_o,
+  input  obi_rsp_t  obi_rsp_i
+);
+
+  // Translate a TCDM AMO opcode into the 6-bit value driven on the OBI `atop`
+  // field. Opcodes without an explicit entry yield 6'h00, the same value used
+  // for AMONone.
+  function automatic logic [5:0] amo_to_atop(reqrsp_pkg::amo_op_e amo);
+    logic [5:0] atop_code;
+    atop_code = 6'h00;
+    case (amo)
+      reqrsp_pkg::AMONone: atop_code = 6'h00; // no atomic
+      reqrsp_pkg::AMOSwap: atop_code = 6'h21; // swap
+      reqrsp_pkg::AMOAdd:  atop_code = 6'h20; // add
+      reqrsp_pkg::AMOAnd:  atop_code = 6'h2C; // and
+      reqrsp_pkg::AMOOr:   atop_code = 6'h28; // or
+      reqrsp_pkg::AMOXor:  atop_code = 6'h24; // xor
+      reqrsp_pkg::AMOMax:  atop_code = 6'h34; // max, signed
+      reqrsp_pkg::AMOMaxu: atop_code = 6'h3C; // max, unsigned
+      reqrsp_pkg::AMOMin:  atop_code = 6'h30; // min, signed
+      reqrsp_pkg::AMOMinu: atop_code = 6'h38; // min, unsigned
+      reqrsp_pkg::AMOLR:   atop_code = 6'h22; // load-reserved
+      reqrsp_pkg::AMOSC:   atop_code = 6'h23; // store-conditional
+      default:             atop_code = 6'h00; // fall back to "no atomic"
+    endcase
+    return atop_code;
+  endfunction
+
+  // Any opcode other than AMONone is issued on OBI with we = 1, even when
+  // q.write itself is low.
+  logic is_amo;
+  assign is_amo = (tcdm_req_i.q.amo != reqrsp_pkg::AMONone);
+
+  // Request: mandatory address-channel fields
+  assign obi_req_o.req     = tcdm_req_i.q_valid;
+  assign obi_req_o.a.addr  = tcdm_req_i.q.addr;
+  assign obi_req_o.a.we    = tcdm_req_i.q.write || is_amo;
+  assign obi_req_o.a.wdata = tcdm_req_i.q.data;
+  assign obi_req_o.a.be    = tcdm_req_i.q.strb;
+  assign obi_req_o.a.aid   = '0;
+
+  // Request: optional fields - only `atop` carries information
+  assign obi_req_o.a.a_optional.atop    = amo_to_atop(tcdm_req_i.q.amo);
+  assign obi_req_o.a.a_optional.auser   = '0;
+  assign obi_req_o.a.a_optional.wuser   = '0;
+  assign obi_req_o.a.a_optional.memtype = 2'b00;
+  assign obi_req_o.a.a_optional.mid     = '0;
+  assign obi_req_o.a.a_optional.prot    = 3'b000;
+  assign obi_req_o.a.a_optional.dbg     = 1'b0;
+  assign obi_req_o.a.a_optional.achk    = '0;
+
+  // Response: grant, valid and read data are forwarded; the OBI error flag
+  // is not read by this converter.
+  assign tcdm_rsp_o.q_ready = obi_rsp_i.gnt;
+  assign tcdm_rsp_o.p_valid = obi_rsp_i.rvalid;
+  assign tcdm_rsp_o.p.data  = obi_rsp_i.r.rdata;
+
+endmodule : tcdm2obi
diff --git a/hw/tile/converters/tcdm64_to_dual_hci32.sv b/hw/tile/converters/tcdm64_to_dual_hci32.sv
new file mode 100644
index 0000000..9a355e4
--- /dev/null
+++ b/hw/tile/converters/tcdm64_to_dual_hci32.sv
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni
+ *
+ * TCDM 64-bit to Dual HCI 32-bit Protocol Converter
+ *
+ */
+
+module tcdm64_to_dual_hci32 #(
+  parameter type tcdm64_req_t = logic, // 64-bit TCDM request
+  parameter type tcdm64_rsp_t = logic, // 64-bit TCDM response
+  parameter type tcdm32_req_t = logic, // 32-bit TCDM request
+  parameter type tcdm32_rsp_t = logic, // 32-bit TCDM response
+  parameter type hci_req_t    = logic, // 32-bit HCI request
+  parameter type hci_rsp_t    = logic  // 32-bit HCI response
+)(
+  input  logic        clk_i,
+  input  logic        rst_ni,
+
+  // TCDM side (Spatz 64-bit vector port)
+  input  tcdm64_req_t tcdm_req_i,
+  output tcdm64_rsp_t tcdm_rsp_o,
+
+  // HCI side: one 32-bit port per word half
+  output hci_req_t    hci_req_lo_o, // bits [31:0]
+  input  hci_rsp_t    hci_rsp_lo_i,
+  output hci_req_t    hci_req_hi_o, // bits [63:32]
+  input  hci_rsp_t    hci_rsp_hi_i
+);
+
+  // 32-bit TCDM links between the splitter and the two HCI converters
+  tcdm32_req_t lo_req;
+  tcdm32_rsp_t lo_rsp;
+  tcdm32_req_t hi_req;
+  tcdm32_rsp_t hi_rsp;
+
+  /*******************************************************************/
+  /* Stage 1: split the 64-bit TCDM port into two 32-bit TCDM ports  */
+  /*******************************************************************/
+
+  tcdm64_to_dual_tcdm32 #(
+    .tcdm64_req_t ( tcdm64_req_t ),
+    .tcdm64_rsp_t ( tcdm64_rsp_t ),
+    .tcdm32_req_t ( tcdm32_req_t ),
+    .tcdm32_rsp_t ( tcdm32_rsp_t )
+  ) i_tcdm64_to_dual_tcdm32 (
+    .clk_i         ( clk_i      ),
+    .rst_ni        ( rst_ni     ),
+    .tcdm_req_i    ( tcdm_req_i ),
+    .tcdm_rsp_o    ( tcdm_rsp_o ),
+    .tcdm_req_lo_o ( lo_req     ),
+    .tcdm_rsp_lo_i ( lo_rsp     ),
+    .tcdm_req_hi_o ( hi_req     ),
+    .tcdm_rsp_hi_i ( hi_rsp     )
+  );
+
+  /*******************************************************************/
+  /* Stage 2: one tcdm2hci converter per half (no atomics)           */
+  /*******************************************************************/
+
+  // Lower half
+  tcdm2hci #(
+    .tcdm_req_t ( tcdm32_req_t ),
+    .tcdm_rsp_t ( tcdm32_rsp_t ),
+    .hci_req_t  ( hci_req_t    ),
+    .hci_rsp_t  ( hci_rsp_t    )
+  ) i_tcdm2hci_lo (
+    .clk_i      ( clk_i        ),
+    .rst_ni     ( rst_ni       ),
+    .tcdm_req_i ( lo_req       ),
+    .tcdm_rsp_o ( lo_rsp       ),
+    .hci_req_o  ( hci_req_lo_o ),
+    .hci_rsp_i  ( hci_rsp_lo_i )
+  );
+
+  // Upper half
+  tcdm2hci #(
+    .tcdm_req_t ( tcdm32_req_t ),
+    .tcdm_rsp_t ( tcdm32_rsp_t ),
+    .hci_req_t  ( hci_req_t    ),
+    .hci_rsp_t  ( hci_rsp_t    )
+  ) i_tcdm2hci_hi (
+    .clk_i      ( clk_i        ),
+    .rst_ni     ( rst_ni       ),
+    .tcdm_req_i ( hi_req       ),
+    .tcdm_rsp_o ( hi_rsp       ),
+    .hci_req_o  ( hci_req_hi_o ),
+    .hci_rsp_i  ( hci_rsp_hi_i )
+  );
+
+endmodule : tcdm64_to_dual_hci32
diff --git a/hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv b/hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv
new file mode 100644
index 0000000..452b509
--- /dev/null
+++ b/hw/tile/converters/tcdm64_to_dual_hci32_atomic.sv
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni
+ *
+ * TCDM 64-bit to Dual HCI 32-bit Protocol Converter with Atomic Support
+ *
+ */
+
+module tcdm64_to_dual_hci32_atomic
+  import obi_pkg::*;
+#(
+  parameter type tcdm64_req_t   = logic, // 64-bit TCDM request (with AMO)
+  parameter type tcdm64_rsp_t   = logic, // 64-bit TCDM response
+  parameter type tcdm32_req_t   = logic, // 32-bit TCDM request (with AMO)
+  parameter type tcdm32_rsp_t   = logic, // 32-bit TCDM response
+  parameter type obi32_req_t    = logic, // 32-bit OBI request
+  parameter type obi32_rsp_t    = logic, // 32-bit OBI response
+  parameter type obi32_a_chan_t = logic, // 32-bit OBI address channel
+  parameter type obi32_r_chan_t = logic, // 32-bit OBI response channel
+  parameter type hci_req_t      = logic, // 32-bit HCI request
+  parameter type hci_rsp_t      = logic, // 32-bit HCI response
+
+  // OBI optional-field types, forwarded to both tcdm2hci_atomic instances
+  parameter type obi32_a_optional_t = logic,
+  parameter type obi32_r_optional_t = logic,
+
+  // OBI configurations and cut control, forwarded to both instances
+  parameter obi_pkg::obi_cfg_t SbrPortObiCfg = obi_pkg::ObiDefaultConfig,
+  parameter obi_pkg::obi_cfg_t MgrPortObiCfg = obi_pkg::ObiDefaultConfig,
+  parameter bit                BypassCut     = 1'b0
+)(
+  input  logic        clk_i,
+  input  logic        rst_ni,
+
+  // TCDM side (Snitch RV64 port with AMO)
+  input  tcdm64_req_t tcdm_req_i,
+  output tcdm64_rsp_t tcdm_rsp_o,
+
+  // HCI side: one 32-bit port per word half
+  output hci_req_t    hci_req_lo_o, // bits [31:0]
+  input  hci_rsp_t    hci_rsp_lo_i,
+  output hci_req_t    hci_req_hi_o, // bits [63:32]
+  input  hci_rsp_t    hci_rsp_hi_i
+);
+
+  // 32-bit TCDM links between the splitter and the two atomic converters
+  tcdm32_req_t lo_req;
+  tcdm32_rsp_t lo_rsp;
+  tcdm32_req_t hi_req;
+  tcdm32_rsp_t hi_rsp;
+
+  /*******************************************************************/
+  /* Stage 1: split the 64-bit TCDM port into two 32-bit TCDM ports  */
+  /*******************************************************************/
+
+  tcdm64_to_dual_tcdm32 #(
+    .tcdm64_req_t ( tcdm64_req_t ),
+    .tcdm64_rsp_t ( tcdm64_rsp_t ),
+    .tcdm32_req_t ( tcdm32_req_t ),
+    .tcdm32_rsp_t ( tcdm32_rsp_t )
+  ) i_tcdm64_to_dual_tcdm32 (
+    .clk_i         ( clk_i      ),
+    .rst_ni        ( rst_ni     ),
+    .tcdm_req_i    ( tcdm_req_i ),
+    .tcdm_rsp_o    ( tcdm_rsp_o ),
+    .tcdm_req_lo_o ( lo_req     ),
+    .tcdm_rsp_lo_i ( lo_rsp     ),
+    .tcdm_req_hi_o ( hi_req     ),
+    .tcdm_rsp_hi_i ( hi_rsp     )
+  );
+
+  /*******************************************************************/
+  /* Stage 2: one tcdm2hci_atomic converter per half                 */
+  /*******************************************************************/
+
+  // Lower half
+  tcdm2hci_atomic #(
+    .tcdm_req_t       ( tcdm32_req_t       ),
+    .tcdm_rsp_t       ( tcdm32_rsp_t       ),
+    .obi_req_t        ( obi32_req_t        ),
+    .obi_rsp_t        ( obi32_rsp_t        ),
+    .obi_a_chan_t     ( obi32_a_chan_t     ),
+    .obi_r_chan_t     ( obi32_r_chan_t     ),
+    .hci_req_t        ( hci_req_t          ),
+    .hci_rsp_t        ( hci_rsp_t          ),
+    .obi_a_optional_t ( obi32_a_optional_t ),
+    .obi_r_optional_t ( obi32_r_optional_t ),
+    .SbrPortObiCfg    ( SbrPortObiCfg      ),
+    .MgrPortObiCfg    ( MgrPortObiCfg      ),
+    .BypassCut        ( BypassCut          )
+  ) i_tcdm2hci_atomic_lo (
+    .clk_i      ( clk_i        ),
+    .rst_ni     ( rst_ni       ),
+    .tcdm_req_i ( lo_req       ),
+    .tcdm_rsp_o ( lo_rsp       ),
+    .hci_req_o  ( hci_req_lo_o ),
+    .hci_rsp_i  ( hci_rsp_lo_i )
+  );
+
+  // Upper half
+  tcdm2hci_atomic #(
+    .tcdm_req_t       ( tcdm32_req_t       ),
+    .tcdm_rsp_t       ( tcdm32_rsp_t       ),
+    .obi_req_t        ( obi32_req_t        ),
+    .obi_rsp_t        ( obi32_rsp_t        ),
+    .obi_a_chan_t     ( obi32_a_chan_t     ),
+    .obi_r_chan_t     ( obi32_r_chan_t     ),
+    .hci_req_t        ( hci_req_t          ),
+    .hci_rsp_t        ( hci_rsp_t          ),
+    .obi_a_optional_t ( obi32_a_optional_t ),
+    .obi_r_optional_t ( obi32_r_optional_t ),
+    .SbrPortObiCfg    ( SbrPortObiCfg      ),
+    .MgrPortObiCfg    ( MgrPortObiCfg      ),
+    .BypassCut        ( BypassCut          )
+  ) i_tcdm2hci_atomic_hi (
+    .clk_i      ( clk_i        ),
+    .rst_ni     ( rst_ni       ),
+    .tcdm_req_i ( hi_req       ),
+    .tcdm_rsp_o ( hi_rsp       ),
+    .hci_req_o  ( hci_req_hi_o ),
+    .hci_rsp_i  ( hci_rsp_hi_i )
+  );
+
+endmodule : tcdm64_to_dual_hci32_atomic
diff --git a/hw/tile/converters/tcdm64_to_dual_tcdm32.sv b/hw/tile/converters/tcdm64_to_dual_tcdm32.sv
new file mode 100644
index 0000000..f51e422
--- /dev/null
+++ b/hw/tile/converters/tcdm64_to_dual_tcdm32.sv
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni
+ *
+ * TCDM 64-bit to Dual TCDM 32-bit Protocol Converter
+ *
+ * Splits one 64-bit TCDM request into up to two 32-bit requests (LO at the
+ * 8-byte-aligned address, HI at that address + 4) and merges the two 32-bit
+ * responses back into one 64-bit response. Which halves are used is decided
+ * from the byte strobes of the incoming request.
+ */
+
+module tcdm64_to_dual_tcdm32 #(
+  parameter type tcdm64_req_t = logic, // 64-bit TCDM request type
+  parameter type tcdm64_rsp_t = logic, // 64-bit TCDM response type
+  parameter type tcdm32_req_t = logic, // 32-bit TCDM request type
+  parameter type tcdm32_rsp_t = logic // 32-bit TCDM response type
+)(
+  input logic clk_i,
+  input logic rst_ni,
+
+  // TCDM 64-bit side (Snitch data port)
+  input tcdm64_req_t tcdm_req_i,
+  output tcdm64_rsp_t tcdm_rsp_o,
+
+  // TCDM 32-bit side (Two parallel TCDM ports)
+  output tcdm32_req_t tcdm_req_lo_o, // Lower 32-bit word (bits [31:0])
+  input tcdm32_rsp_t tcdm_rsp_lo_i,
+  output tcdm32_req_t tcdm_req_hi_o, // Upper 32-bit word (bits [63:32])
+  input tcdm32_rsp_t tcdm_rsp_hi_i
+);
+
+  /*******************************************************************/
+  /* TCDM64 → Dual TCDM32: Minimal State (2 Registers Only) */
+  /*******************************************************************/
+
+  // Request analysis signals
+  logic strb_is_zero;
+  logic is_load;
+  logic access_lo, access_hi;
+
+  // Decode request type
+  assign strb_is_zero = (tcdm_req_i.q.strb == 8'h00);
+  assign is_load = !tcdm_req_i.q.write;
+
+  // Determine which banks to access:
+  // - Normal operation: access banks based on strobe bits
+  // - Load with strb=0: access BOTH banks (full 64-bit read)
+  // - Store with strb=0: access NO banks (no-op)
+  assign access_lo = |tcdm_req_i.q.strb[3:0] || (strb_is_zero && is_load);
+  assign access_hi = |tcdm_req_i.q.strb[7:4] || (strb_is_zero && is_load);
+
+  // Bank selection of the most recently accepted request; used to decide
+  // which bank responses to wait for and how to assemble the read data.
+  // NOTE(review): only one request's pattern is stored, so this presumably
+  // assumes at most one request is outstanding at a time - confirm.
+  logic access_lo_q, access_hi_q;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      access_lo_q <= 1'b0;
+      access_hi_q <= 1'b0;
+    end else if (tcdm_req_i.q_valid && tcdm_rsp_o.q_ready) begin
+      // Latch access pattern for all requests
+      // For loads with strb=0, this will be both banks (full read)
+      // For stores with strb=0, this will be no bank, so no response is
+      // generated for them
+      access_lo_q <= access_lo;
+      access_hi_q <= access_hi;
+    end
+  end
+
+  // Response detection. When both banks were accessed, the response is
+  // signalled only in a cycle where both p_valid inputs are high together.
+  // NOTE(review): there is no buffering of an early response, so this assumes
+  // both banks answer in the same cycle - confirm for every downstream path.
+  logic rsp_arriving;
+  always_comb begin
+    rsp_arriving = 1'b0;
+
+    if (access_lo_q && access_hi_q) begin
+      rsp_arriving = tcdm_rsp_lo_i.p_valid && tcdm_rsp_hi_i.p_valid;
+    end else if (access_lo_q) begin
+      rsp_arriving = tcdm_rsp_lo_i.p_valid;
+    end else if (access_hi_q) begin
+      rsp_arriving = tcdm_rsp_hi_i.p_valid;
+    end
+  end
+
+  // Address generation for dual banks
+  logic [31:0] addr_base;
+  logic [31:0] addr_lo, addr_hi;
+
+  assign addr_base = {tcdm_req_i.q.addr[31:3], 3'b000}; // Align to 8-byte boundary
+  assign addr_lo = addr_base; // Lower 32-bit word [31:0]
+  assign addr_hi = addr_base + 32'd4; // Upper 32-bit word [63:32]
+
+  /*******************************************************************/
+  /* TCDM32 Request Outputs */
+  /*******************************************************************/
+
+  // Lower bank (LO) request generation
+  // All fields default to idle values and are overridden only when the LO
+  // bank is selected. write, amo and user are copied unchanged from the
+  // 64-bit request.
+  always_comb begin
+    tcdm_req_lo_o.q_valid = 1'b0;
+    tcdm_req_lo_o.q.addr = '0;
+    tcdm_req_lo_o.q.write = 1'b0;
+    tcdm_req_lo_o.q.amo = reqrsp_pkg::AMONone;
+    tcdm_req_lo_o.q.data = '0;
+    tcdm_req_lo_o.q.strb = '0;
+    tcdm_req_lo_o.q.user = '0;
+
+    if (access_lo) begin
+      tcdm_req_lo_o.q_valid = tcdm_req_i.q_valid;
+      tcdm_req_lo_o.q.addr = addr_lo;
+      tcdm_req_lo_o.q.write = tcdm_req_i.q.write;
+      tcdm_req_lo_o.q.amo = tcdm_req_i.q.amo;
+      tcdm_req_lo_o.q.data = tcdm_req_i.q.data[31:0];
+      tcdm_req_lo_o.q.user = tcdm_req_i.q.user;
+
+      // Strobe handling: load with strb=0 reads full 32-bit word
+      if (strb_is_zero && is_load)
+        tcdm_req_lo_o.q.strb = 4'hF;
+      else
+        tcdm_req_lo_o.q.strb = tcdm_req_i.q.strb[3:0];
+    end
+  end
+
+  // Upper bank (HI) request generation
+  // Mirrors the LO bank logic, using data[63:32], strb[7:4] and addr_hi.
+  always_comb begin
+    tcdm_req_hi_o.q_valid = 1'b0;
+    tcdm_req_hi_o.q.addr = '0;
+    tcdm_req_hi_o.q.write = 1'b0;
+    tcdm_req_hi_o.q.amo = reqrsp_pkg::AMONone;
+    tcdm_req_hi_o.q.data = '0;
+    tcdm_req_hi_o.q.strb = '0;
+    tcdm_req_hi_o.q.user = '0;
+
+    if (access_hi) begin
+      tcdm_req_hi_o.q_valid = tcdm_req_i.q_valid;
+      tcdm_req_hi_o.q.addr = addr_hi;
+      tcdm_req_hi_o.q.write = tcdm_req_i.q.write;
+      tcdm_req_hi_o.q.amo = tcdm_req_i.q.amo;
+      tcdm_req_hi_o.q.data = tcdm_req_i.q.data[63:32];
+      tcdm_req_hi_o.q.user = tcdm_req_i.q.user;
+
+      // Strobe handling: load with strb=0 reads full 32-bit word
+      if (strb_is_zero && is_load)
+        tcdm_req_hi_o.q.strb = 4'hF;
+      else
+        tcdm_req_hi_o.q.strb = tcdm_req_i.q.strb[7:4];
+    end
+  end
+
+  /*******************************************************************/
+  /* TCDM64 Response & Grant */
+  /*******************************************************************/
+
+  // Ready signal: accept request when target banks are ready
+  // NOTE(review): each bank's q_valid above does not depend on the other
+  // bank's q_ready. In the dual-bank case, if only one bank is ready, that
+  // bank presumably completes a handshake while the 64-bit side is stalled
+  // and will be offered the same request again on the retry - confirm that
+  // both banks always grant together or that a repeated access is harmless
+  // (it would not be for AMOs or other side-effecting accesses).
+  always_comb begin
+    tcdm_rsp_o.q_ready = 1'b0;
+
+    if (access_lo && access_hi) begin
+      // Both banks accessed: need both ready
+      tcdm_rsp_o.q_ready = tcdm_rsp_lo_i.q_ready && tcdm_rsp_hi_i.q_ready;
+    end else if (access_lo) begin
+      // Only LO bank accessed
+      tcdm_rsp_o.q_ready = tcdm_rsp_lo_i.q_ready;
+    end else if (access_hi) begin
+      // Only HI bank accessed
+      tcdm_rsp_o.q_ready = tcdm_rsp_hi_i.q_ready;
+    end else if (strb_is_zero && !is_load) begin
+      // Store with strb=0: no memory access needed, accept immediately
+      tcdm_rsp_o.q_ready = 1'b1;
+    end
+  end
+
+  // Response valid: asserted when all accessed banks have responded
+  assign tcdm_rsp_o.p_valid = rsp_arriving;
+
+  // Response data: recombine 32-bit responses into 64-bit word
+  // A half whose bank was not part of the latched access pattern reads as 0.
+  assign tcdm_rsp_o.p.data = {
+    access_hi_q ? tcdm_rsp_hi_i.p.data : 32'h0,
+    access_lo_q ? tcdm_rsp_lo_i.p.data : 32'h0
+  };
+
+endmodule : tcdm64_to_dual_tcdm32
diff --git a/hw/tile/eu_direct_cut.sv b/hw/tile/eu_direct_cut.sv
new file mode 100644
index 0000000..b962e8b
--- /dev/null
+++ b/hw/tile/eu_direct_cut.sv
@@ -0,0 +1,105 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Solderpad Hardware License, Version 0.51
+ * (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: SHL-0.51
+ *
+ * Authors: Luca Balboni
+ *
+ * EU Direct Link Cut
+ */
+
+module eu_direct_cut
+  import magia_tile_pkg::*;
+#(
+  parameter type eu_direct_req_t = logic,
+  parameter type eu_direct_rsp_t = logic,
+  parameter bit Bypass = 1'b0,
+  parameter bit BypassReq = Bypass,
+  parameter bit BypassRsp = Bypass
+)(
+  input logic clk_i,
+  input logic rst_ni,
+
+  input eu_direct_req_t sbr_req_i,
+  output eu_direct_rsp_t sbr_rsp_o,
+
+  output eu_direct_req_t mgr_req_o,
+  input eu_direct_rsp_t mgr_rsp_i
+);
+
+  // Request payload (exclude req signal for valid/ready)
+  typedef struct packed {
+    logic[magia_pkg::ADDR_W-1:0] addr;
+    logic wen;
+    logic[magia_pkg::DATA_W-1:0] wdata;
+    logic[3:0] be;
+  } eu_req_payload_t;
+
+  eu_req_payload_t sbr_req_payload, mgr_req_payload;
+
+  assign sbr_req_payload.addr = sbr_req_i.addr;
+  assign sbr_req_payload.wen = sbr_req_i.wen;
+  assign sbr_req_payload.wdata = sbr_req_i.wdata;
+  assign sbr_req_payload.be = sbr_req_i.be;
+
+  assign mgr_req_o.addr = mgr_req_payload.addr;
+  assign mgr_req_o.wen = mgr_req_payload.wen;
+  assign
mgr_req_o.wdata = mgr_req_payload.wdata; + assign mgr_req_o.be = mgr_req_payload.be; + + // Request spill register + spill_register #( + .T ( eu_req_payload_t ), + .Bypass ( BypassReq ) + ) i_req_spill ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .valid_i ( sbr_req_i.req ), + .ready_o ( sbr_rsp_o.gnt ), + .data_i ( sbr_req_payload ), + .valid_o ( mgr_req_o.req ), + .ready_i ( mgr_rsp_i.gnt ), + .data_o ( mgr_req_payload ) + ); + + // Response payload (exclude rvalid for valid/ready) + typedef struct packed { + logic[magia_pkg::DATA_W-1:0] rdata; + logic err; + } eu_rsp_payload_t; + + eu_rsp_payload_t sbr_rsp_payload, mgr_rsp_payload; + + assign mgr_rsp_payload.rdata = mgr_rsp_i.rdata; + assign mgr_rsp_payload.err = mgr_rsp_i.err; + + assign sbr_rsp_o.rdata = sbr_rsp_payload.rdata; + assign sbr_rsp_o.err = sbr_rsp_payload.err; + + // Response spill register + spill_register #( + .T ( eu_rsp_payload_t ), + .Bypass ( BypassRsp ) + ) i_rsp_spill ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .valid_i ( mgr_rsp_i.rvalid ), + .ready_o ( ), + .data_i ( mgr_rsp_payload ), + .valid_o ( sbr_rsp_o.rvalid ), + .ready_i ( 1'b1 ), + .data_o ( sbr_rsp_payload ) + ); + +endmodule diff --git a/hw/tile/magia_event_unit.sv b/hw/tile/magia_event_unit.sv index 32050a2..76e6b37 100644 --- a/hw/tile/magia_event_unit.sv +++ b/hw/tile/magia_event_unit.sv @@ -25,10 +25,10 @@ import magia_tile_pkg::*; // MAGIA Event Unit Parameters - Optimized for single-core system parameter int unsigned NB_CORES = 1, // Single core system parameter int unsigned NB_SW_EVT = 1, // Minimal SW events for basic functionality - parameter int unsigned NB_BARR = 0, // Barrier units disabled (no sync needed) - parameter int unsigned NB_HW_MUT = 0, // Hardware mutexes disabled (no contention) + parameter int unsigned NB_BARR = 2, // Minimum 2 barriers (workaround: $clog2(1)=0 causes issues) + parameter int unsigned NB_HW_MUT = 2, // Minimum 2 mutexes (workaround: $clog2(1)=0 causes issues) parameter int unsigned 
MUTEX_MSG_W = 32, // Mutex message width (unused but kept for compatibility) - parameter int unsigned DISP_FIFO_DEPTH = 0, // Task dispatcher disabled (no distribution) + parameter int unsigned DISP_FIFO_DEPTH = 1, // Minimal dispatcher FIFO (workaround for synthesis) parameter int unsigned EVNT_WIDTH = 8, // SOC event width (external events) parameter int unsigned SOC_FIFO_DEPTH = 8 // SOC event FIFO depth (external events) ) @@ -67,12 +67,16 @@ import magia_tile_pkg::*; output logic eu_direct_gnt_o, output logic eu_direct_rvalid_o, output logic [31:0] eu_direct_rdata_o, - output logic eu_direct_err_o + output logic eu_direct_err_o, + + // OBI slave connection + input core_obi_data_req_t obi_req_i, + output core_obi_data_rsp_t obi_rsp_o ); // Create internal interface instances XBAR_PERIPH_BUS #(.ID_WIDTH(NB_CORES+1)) eu_direct_link[NB_CORES-1:0](); - XBAR_PERIPH_BUS #(.ID_WIDTH(NB_CORES+1)) speriph_slave(); // Tied off + XBAR_PERIPH_BUS #(.ID_WIDTH(NB_CORES+1)) speriph_slave(); // Internal signals logic soc_periph_evt_ready_internal; @@ -93,15 +97,29 @@ import magia_tile_pkg::*; assign eu_direct_rdata_o = eu_direct_link[0].r_rdata; assign eu_direct_err_o = eu_direct_link[0].r_opc; // r_opc: 0=OK, 1=ERROR - // Tie off speriph_slave (not used anymore) - assign speriph_slave.req = 1'b0; - assign speriph_slave.add = '0; - assign speriph_slave.wen = 1'b1; - assign speriph_slave.wdata = '0; - assign speriph_slave.be = '0; - assign speriph_slave.id = '0; - + // Address range check and offset calculation + localparam logic [magia_pkg::ADDR_W-1:0] EU_BASE_ADDR = magia_tile_pkg::EVENT_UNIT_ADDR_START; + logic addr_in_range; + logic [magia_pkg::ADDR_W-1:0] addr_offset; + + assign addr_in_range = (obi_req_i.a.addr >= magia_tile_pkg::EVENT_UNIT_ADDR_START) && + (obi_req_i.a.addr <= magia_tile_pkg::EVENT_UNIT_ADDR_END); + assign addr_offset = obi_req_i.a.addr - EU_BASE_ADDR; + + // OBI to XBAR_PERIPH_BUS conversion - pass RELATIVE address (offset from base) + assign 
speriph_slave.req = obi_req_i.req && addr_in_range; + assign speriph_slave.add = addr_offset; + assign speriph_slave.wen = ~obi_req_i.a.we; + assign speriph_slave.wdata = obi_req_i.a.wdata; + assign speriph_slave.be = obi_req_i.a.be; + assign speriph_slave.id = '0; + // Direct response mapping - no mux needed + assign obi_rsp_o.gnt = speriph_slave.gnt; + assign obi_rsp_o.rvalid = speriph_slave.r_valid; + assign obi_rsp_o.r.rdata = speriph_slave.r_rdata; + assign obi_rsp_o.r.err = speriph_slave.r_opc; // r_opc: 0=OK, 1=ERROR + // Event Unit Flex instantiation event_unit_top #( @@ -133,7 +151,7 @@ import magia_tile_pkg::*; .soc_periph_evt_valid_i ( 1'b0 ), .soc_periph_evt_ready_o ( soc_periph_evt_ready_internal ), .soc_periph_evt_data_i ( '0 ), - .speriph_slave ( speriph_slave.Slave ), + .speriph_slave ( speriph_slave ), .eu_direct_link ( eu_direct_link ) ); diff --git a/hw/tile/magia_tile.sv b/hw/tile/magia_tile.sv index a9fcbf2..e1a2bee 100644 --- a/hw/tile/magia_tile.sv +++ b/hw/tile/magia_tile.sv @@ -135,6 +135,8 @@ module magia_tile logic[magia_pkg::ADDR_W-1:0] tile_fsync_ctrl_end_addr; logic[magia_pkg::ADDR_W-1:0] tile_event_unit_start_addr; logic[magia_pkg::ADDR_W-1:0] tile_event_unit_end_addr; + logic[magia_pkg::ADDR_W-1:0] tile_spatz_ctrl_start_addr; + logic[magia_pkg::ADDR_W-1:0] tile_spatz_ctrl_end_addr; magia_tile_pkg::redmule_data_req_t redmule_data_req; magia_tile_pkg::redmule_data_rsp_t redmule_data_rsp; @@ -148,20 +150,20 @@ module magia_tile magia_tile_pkg::core_obi_data_req_t core_obi_data_req; magia_tile_pkg::core_obi_data_rsp_t core_obi_data_rsp; - magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_req; // cv32e40x: Index 0 -> L2, Index 1 -> L1SPM; cv32e40p: Index 2 -> RedMulE_ctrl, Index 3 -> iDMA_ctrl, Index 4 -> FSync_ctrl - magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_rsp; // cv32e40x: Index 0 -> L2, Index 1 -> L1SPM; cv32e40p: Index 2 -> RedMulE_ctrl, Index 3 -> iDMA_ctrl, Index 4 -> 
FSync_ctrl + magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_req; + magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_rsp; - magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_cut_req; // Index 0 -> L2, Index 1 -> L1SPM - magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_cut_rsp; // Index 0 -> L2, Index 1 -> L1SPM + magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_cut_req; + magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_cut_rsp; magia_tile_pkg::core_obi_data_req_t core_l1_data_amo_req; magia_tile_pkg::core_obi_data_rsp_t core_l1_data_amo_rsp; - magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_req; // Index 0 -> core request, Index 1 -> ext request - magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_rsp; // Index 0 -> core request, Index 1 -> ext request + magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_req; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request + magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_rsp; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request - magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_cut_req; // Index 0 -> core request, Index 1 -> ext request - magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_cut_rsp; // Index 0 -> core request, Index 1 -> ext request + magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_cut_req; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request + magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_MGR-1:0] obi_xbar_slv_cut_rsp; // Index 0 -> core request, Index 1 -> ext request, Index 2 -> Spatz request magia_tile_pkg::core_obi_data_req_t ext_obi_data_req; magia_tile_pkg::core_obi_data_rsp_t 
ext_obi_data_rsp; @@ -332,6 +334,33 @@ module magia_tile magia_tile_pkg::eu_direct_req_t eu_direct_req; magia_tile_pkg::eu_direct_rsp_t eu_direct_rsp; + // EU direct with pipeline cut + magia_tile_pkg::eu_direct_req_t eu_direct_req_cut; + magia_tile_pkg::eu_direct_rsp_t eu_direct_rsp_cut; + + // Spatz CC signals + snitch_pkg::interrupts_t spatz_irq; + magia_tile_pkg::core_hci_data_req_t [magia_tile_pkg::SPATZ_HCI_PORTS-1:0] spatz_hci_req; + magia_tile_pkg::core_hci_data_rsp_t [magia_tile_pkg::SPATZ_HCI_PORTS-1:0] spatz_hci_rsp; + magia_tile_pkg::core_obi_data_req_t spatz_obi_req; + magia_tile_pkg::core_obi_data_rsp_t spatz_obi_rsp; + logic spatz_inst_req; + logic [31:0] spatz_inst_addr; + logic spatz_inst_cacheable; + logic spatz_flush_i_valid; + logic [31:0] spatz_inst_data; + logic spatz_inst_ready; + logic spatz_inst_error; + logic spatz_flush_i_ready; + logic spatz_enable_prefetching; + snitch_pkg::core_events_t spatz_core_events; + magia_tile_pkg::core_axi_instr_req_t spatz_icache_axi_req; + magia_tile_pkg::core_axi_instr_rsp_t spatz_icache_axi_rsp; + logic spatz_start; + logic spatz_done; + logic spatz_clk_en; + logic spatz_clk; + /*******************************************************/ /** Internal Signal Definitions End **/ /*******************************************************/ @@ -346,15 +375,19 @@ module magia_tile assign tile_fsync_ctrl_end_addr = magia_tile_pkg::FSYNC_CTRL_ADDR_END; assign tile_event_unit_start_addr = magia_tile_pkg::EVENT_UNIT_ADDR_START; assign tile_event_unit_end_addr = magia_tile_pkg::EVENT_UNIT_ADDR_END; - assign tile_l1_start_addr = magia_tile_pkg::L1_ADDR_START + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; - assign tile_l1_end_addr = magia_tile_pkg::L1_ADDR_END + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; + assign tile_spatz_ctrl_start_addr = magia_tile_pkg::SPATZ_CTRL_ADDR_START; + assign tile_spatz_ctrl_end_addr = magia_tile_pkg::SPATZ_CTRL_ADDR_END; assign tile_reserved_start_addr = magia_tile_pkg::RESERVED_ADDR_START + 
mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; assign tile_reserved_end_addr = magia_tile_pkg::RESERVED_ADDR_END + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; + assign tile_l1_start_addr = magia_tile_pkg::L1_ADDR_START + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; + assign tile_l1_end_addr = magia_tile_pkg::L1_ADDR_END + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; - assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_L2_IDX] = '{idx: 32'd0, start_addr: magia_tile_pkg::L2_ADDR_START, end_addr: magia_tile_pkg::L2_ADDR_END }; - assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_L1SPM_IDX] = '{idx: 32'd1, start_addr: tile_l1_start_addr, end_addr: tile_l1_end_addr }; - assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_RESERVED_IDX] = '{idx: 32'd1, start_addr: tile_reserved_start_addr, end_addr: tile_reserved_end_addr }; - assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_STACK_IDX] = '{idx: 32'd1, start_addr: magia_tile_pkg::STACK_ADDR_START, end_addr: magia_tile_pkg::STACK_ADDR_END }; + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_L2_IDX] = '{idx: 32'd0, start_addr: magia_tile_pkg::L2_ADDR_START, end_addr: magia_tile_pkg::L2_ADDR_END }; + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_L1SPM_IDX] = '{idx: 32'd1, start_addr: tile_l1_start_addr, end_addr: tile_l1_end_addr }; + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_RESERVED_IDX] = '{idx: 32'd1, start_addr: tile_reserved_start_addr, end_addr: tile_reserved_end_addr }; + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_STACK_IDX] = '{idx: 32'd1, start_addr: magia_tile_pkg::STACK_ADDR_START, end_addr: magia_tile_pkg::STACK_ADDR_END }; + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_EVENT_UNIT_IDX] = '{idx: 32'd5, start_addr: tile_event_unit_start_addr, end_addr: tile_event_unit_end_addr }; + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_SPATZ_CTRL_IDX] = '{idx: 32'd6, start_addr: tile_spatz_ctrl_start_addr, end_addr: tile_spatz_ctrl_end_addr }; `ifndef CV32E40X assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_REDMULE_CTRL_IDX] = '{idx: 32'd2, 
start_addr: tile_redmule_ctrl_start_addr, end_addr: tile_redmule_ctrl_end_addr }; assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_IDMA_IDX] = '{idx: 32'd3, start_addr: tile_idma_ctrl_start_addr, end_addr: tile_idma_ctrl_end_addr }; @@ -364,6 +397,7 @@ module magia_tile assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_L2_IDX] = '{idx: 32'd0, start_addr: magia_tile_pkg::L2_ADDR_START, end_addr: magia_tile_pkg::L2_ADDR_END }; assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_L1SPM_IDX] = '{idx: 32'd1, start_addr: tile_l1_start_addr, end_addr: tile_l1_end_addr }; assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_RESERVED_IDX] = '{idx: 32'd1, start_addr: tile_reserved_start_addr, end_addr: tile_reserved_end_addr }; + assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_BOOTROM_IDX] = '{idx: 32'd2, start_addr: magia_tile_pkg::SPATZ_BOOT_ADDR, end_addr: magia_tile_pkg::SPATZ_BOOT_ADDR + magia_tile_pkg::SPATZ_BOOTROM_SIZE}; assign obi_xbar_en_default_idx = '1; // Routing to the AXI Xbar all requests with an address outside the range of the internal L1 and the external L2 assign obi_xbar_default_idx = '0; @@ -374,11 +408,15 @@ module magia_tile assign core_l2_data_rsp = axi_xbar_data_in_rsp[magia_tile_pkg::AXI_CORE_DATA_IDX]; assign axi_xbar_data_in_req[magia_tile_pkg::AXI_CORE_INSTR_IDX] = core_l2_instr_req; assign core_l2_instr_rsp = axi_xbar_data_in_rsp[magia_tile_pkg::AXI_CORE_INSTR_IDX]; - + assign axi_xbar_data_in_req[magia_tile_pkg::AXI_SPATZ_INSTR_IDX] = spatz_icache_axi_req; + assign spatz_icache_axi_rsp = axi_xbar_data_in_rsp[magia_tile_pkg::AXI_SPATZ_INSTR_IDX]; + assign obi_xbar_slv_req[magia_tile_pkg::OBI_CORE_IDX] = core_obi_data_req; assign core_obi_data_rsp = obi_xbar_slv_rsp[magia_tile_pkg::OBI_CORE_IDX]; assign obi_xbar_slv_req[magia_tile_pkg::OBI_EXT_IDX] = ext_obi_data_req; assign ext_obi_data_rsp = obi_xbar_slv_rsp[magia_tile_pkg::OBI_EXT_IDX]; + assign obi_xbar_slv_req[magia_tile_pkg::OBI_SPATZ_IDX] = spatz_obi_req; + assign spatz_obi_rsp = 
obi_xbar_slv_rsp[magia_tile_pkg::OBI_SPATZ_IDX]; assign axi_data_user = '0; assign obi_rsp_data_user = '0; @@ -661,6 +699,22 @@ module magia_tile .eu_direct_rsp_i ( eu_direct_rsp ) ); + // EU direct pipeline cut + eu_direct_cut #( + .eu_direct_req_t ( magia_tile_pkg::eu_direct_req_t ), + .eu_direct_rsp_t ( magia_tile_pkg::eu_direct_rsp_t ), + .Bypass ( 1'b0 ), + .BypassReq ( 1'b0 ), + .BypassRsp ( 1'b0 ) + ) i_eu_direct_cut ( + .clk_i ( sys_clk ), + .rst_ni ( rst_ni ), + .sbr_req_i ( eu_direct_req ), + .sbr_rsp_o ( eu_direct_rsp ), + .mgr_req_o ( eu_direct_req_cut ), + .mgr_rsp_i ( eu_direct_rsp_cut ) + ); + /*******************************************************/ /** Core Data Demux End **/ /*******************************************************/ @@ -689,6 +743,13 @@ module magia_tile .clk_o ( core_clk ) ); + // Spatz clock gating controlled by obi_slave_ctrl_spatz + tc_clk_gating spatz_clock_gating ( + .clk_i ( sys_clk ), + .en_i ( spatz_clk_en ), + .test_en_i ( test_mode_i ), + .clk_o ( spatz_clk ) + ); /*******************************************************/ /** Clock Gating End **/ /*******************************************************/ @@ -789,6 +850,11 @@ module magia_tile /*******************************************************/ `HCI_ASSIGN_TO_INTF(hci_core_if[0], core_l1_data_req, core_l1_data_rsp) // Only 1 core supported + generate + for (genvar i = 0; i < magia_tile_pkg::SPATZ_HCI_PORTS; i++) begin : gen_spatz_hci_assign + `HCI_ASSIGN_TO_INTF(hci_core_if[i+1], spatz_hci_req[i], spatz_hci_rsp[i]) // Spatz CC HCI ports + end + endgenerate `HCI_ASSIGN_TO_INTF(hci_redmule_if[0], redmule_data_req, redmule_data_rsp) // Only 1 RedMulE supported `HCI_ASSIGN_TO_INTF(hci_dma_if[magia_tile_pkg::HCI_DMA_CH_READ_IDX], idma_hci_read_req, idma_hci_read_rsp) // iDMA HCI read channel `HCI_ASSIGN_TO_INTF(hci_dma_if[magia_tile_pkg::HCI_DMA_CH_WRITE_IDX], idma_hci_write_req, idma_hci_write_rsp) // iDMA HCI write channel @@ -968,7 +1034,7 @@ module magia_tile .wu_wfe_i 
); `else - // flex-v core with integrated FPU and tracer + // RI5CY core with integrated FPU and tracer riscv_core #( .N_EXT_PERF_COUNTERS ( magia_tile_pkg::N_EXT_PERF_COUNTERS ), .INSTR_RDATA_WIDTH ( magia_tile_pkg::INSTR_RDATA_WIDTH ), @@ -1081,32 +1147,8 @@ module magia_tile /*******************************************************/ /** Core Data Demuxing (OBI XBAR) Beginning **/ /*******************************************************/ - - obi_atop_resolver #( - .SbrPortObiCfg ( magia_tile_pkg::obi_amo_cfg ), - .MgrPortObiCfg ( obi_pkg::ObiDefaultConfig ), - .sbr_port_obi_req_t ( magia_tile_pkg::core_obi_data_req_t ), - .sbr_port_obi_rsp_t ( magia_tile_pkg::core_obi_data_rsp_t ), - .mgr_port_obi_req_t ( ), - .mgr_port_obi_rsp_t ( ), - .mgr_port_obi_a_optional_t ( magia_tile_pkg::core_data_obi_a_optional_t ), - .mgr_port_obi_r_optional_t ( magia_tile_pkg::core_data_obi_r_optional_t ), - .LrScEnable ( ), - .RegisterAmo ( magia_tile_pkg::RegisterAmo ) - ) i_obi_atomics ( - .clk_i ( sys_clk ), - .rst_ni ( rst_ni ), - .testmode_i ( test_mode_i ), - .sbr_port_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_L1SPM_IDX] ), - .sbr_port_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_L1SPM_IDX] ), - .mgr_port_req_o ( core_l1_data_amo_req ), - .mgr_port_rsp_i ( core_l1_data_amo_rsp ) - ); - - // Cut only external paths comming from the AXI XBAR - assign obi_xbar_slv_cut_req[0] = obi_xbar_slv_req[0]; - assign obi_xbar_slv_rsp[0] = obi_xbar_slv_cut_rsp[0]; - for (genvar i = 1; i < magia_tile_pkg::N_MGR; i++) begin: gen_obi_xbar_sbr_cut + // OBI cut instances for EXT and SPATZ master ports + for (genvar i = magia_tile_pkg::OBI_EXT_IDX; i <= magia_tile_pkg::OBI_SPATZ_IDX; i++) begin : gen_obi_cut obi_cut #( .ObiCfg ( magia_tile_pkg::obi_amo_cfg ), .obi_a_chan_t ( magia_tile_pkg::core_data_obi_a_chan_t ), @@ -1114,15 +1156,18 @@ module magia_tile .obi_req_t ( magia_tile_pkg::core_obi_data_req_t ), .obi_rsp_t ( magia_tile_pkg::core_obi_data_rsp_t ) ) i_obi_cut_sbr ( - 
.clk_i ( sys_clk ), - .rst_ni ( rst_ni ), - .sbr_port_req_i ( obi_xbar_slv_req[i] ), - .sbr_port_rsp_o ( obi_xbar_slv_rsp[i] ), - .mgr_port_req_o ( obi_xbar_slv_cut_req[i] ), - .mgr_port_rsp_i ( obi_xbar_slv_cut_rsp[i] ) + .clk_i ( sys_clk ), + .rst_ni ( rst_ni ), + .sbr_port_req_i ( obi_xbar_slv_req[i] ), + .sbr_port_rsp_o ( obi_xbar_slv_rsp[i] ), + .mgr_port_req_o ( obi_xbar_slv_cut_req[i] ), + .mgr_port_rsp_i ( obi_xbar_slv_cut_rsp[i] ) ); end + assign obi_xbar_slv_cut_req[magia_tile_pkg::OBI_CORE_IDX] = obi_xbar_slv_req[magia_tile_pkg::OBI_CORE_IDX]; + assign obi_xbar_slv_rsp[magia_tile_pkg::OBI_CORE_IDX] = obi_xbar_slv_cut_rsp[magia_tile_pkg::OBI_CORE_IDX]; + obi_xbar #( .SbrPortObiCfg ( magia_tile_pkg::obi_amo_cfg ), .MgrPortObiCfg ( ), @@ -1169,6 +1214,27 @@ module magia_tile ); end + obi_atop_resolver #( + .SbrPortObiCfg ( magia_tile_pkg::obi_amo_cfg ), + .MgrPortObiCfg ( obi_pkg::ObiDefaultConfig ), + .sbr_port_obi_req_t ( magia_tile_pkg::core_obi_data_req_t ), + .sbr_port_obi_rsp_t ( magia_tile_pkg::core_obi_data_rsp_t ), + .mgr_port_obi_req_t ( ), + .mgr_port_obi_rsp_t ( ), + .mgr_port_obi_a_optional_t ( magia_tile_pkg::core_data_obi_a_optional_t ), + .mgr_port_obi_r_optional_t ( magia_tile_pkg::core_data_obi_r_optional_t ), + .LrScEnable ( ), + .RegisterAmo ( magia_tile_pkg::RegisterAmo ) + ) i_obi_atomics ( + .clk_i ( sys_clk ), + .rst_ni ( rst_ni ), + .testmode_i ( test_mode_i ), + .sbr_port_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_L1SPM_IDX] ), + .sbr_port_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_L1SPM_IDX] ), + .mgr_port_req_o ( core_l1_data_amo_req ), + .mgr_port_rsp_i ( core_l1_data_amo_rsp ) + ); + /*******************************************************/ /** Core Data Demuxing (OBI XBAR) End **/ /*******************************************************/ @@ -1646,14 +1712,32 @@ module magia_tile /*******************************************************/ /** Floating-Point Unit End **/ 
/*******************************************************/ +/** Spatz Control Slave Beginning **/ +/*******************************************************/ + + obi_slave_ctrl_spatz #( + .BaseAddr ( magia_tile_pkg::SPATZ_CTRL_ADDR_START ) + ) i_spatz_ctrl ( + .clk_i ( sys_clk ), + .rst_ni ( rst_ni ), + .obi_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_SPATZ_CTRL_IDX] ), + .obi_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_SPATZ_CTRL_IDX] ), + .clk_en_o ( spatz_clk_en ), + .start_o ( spatz_start ), + .done_o ( spatz_done ) + ); + +/*******************************************************/ +/** Spatz Control Slave End **/ +/*******************************************************/ /** Event Unit Beginning **/ /*******************************************************/ // Event array assignments for proper 2D array structure - assign acc_events_array[0] = {redmule_evt[0][1], redmule_evt[0][0], redmule_busy, 1'b0}; - assign dma_events_array[0] = {idma_obi2axi_done, idma_axi2obi_done}; - assign timer_events_array[0] = 2'b00; - assign other_events_array[0] = {idma_obi2axi_busy, idma_axi2obi_busy, idma_obi2axi_start, idma_axi2obi_start, idma_obi2axi_error, idma_axi2obi_error, fsync_error, fsync_done, 24'b0}; // iDMA status events [31:28]|idma_obi2axi_error, idma_axi2obi_error, iDMA error events [27:26]|fsync_error, fsync_done, Fsync events [25:24], Reserved [23:0] - SW events are INTERNAL to Event Unit! 
+ assign acc_events_array[0] = {redmule_evt[0][1], redmule_evt[0][0], redmule_busy, spatz_done}; + assign dma_events_array[0] = {idma_obi2axi_done, idma_axi2obi_done}; + assign timer_events_array[0] = 2'b00; + assign other_events_array[0] = {idma_obi2axi_busy, idma_axi2obi_busy, idma_obi2axi_start, idma_axi2obi_start, idma_obi2axi_error, idma_axi2obi_error, fsync_error, fsync_done, spatz_start, 23'b0}; // iDMA status events [31:28]|iDMA errors [27:26]|Fsync [25:24]|Spatz start [23]|Reserved [22:0] `ifdef CV32E40X assign eu_core_irq_ack = eu_core_irq_req; @@ -1662,54 +1746,215 @@ module magia_tile assign core_busy_o = !core_sleep_o; `endif - magia_event_unit #( - .NB_CORES ( 1 ), // Single core system - .NB_SW_EVT ( 1 ), // Minimum 1 SW event to avoid indexing issues (unused but required) - .NB_BARR ( 0 ), // No barriers needed with single core - .NB_HW_MUT ( 0 ), // No mutexes needed with single core - .MUTEX_MSG_W ( 32 ), // Keep default even if unused - .DISP_FIFO_DEPTH ( 0 ), // No task dispatcher needed - .EVNT_WIDTH ( 8 ), // SOC event width (keep default) - .SOC_FIFO_DEPTH ( 8 ) // SOC FIFO depth (keep default) + magia_event_unit #( + .NB_CORES ( 1 ), + .NB_SW_EVT ( 1 ), + .NB_BARR ( 2 ), + .NB_HW_MUT ( 1 ), + .MUTEX_MSG_W ( 32 ), + .DISP_FIFO_DEPTH ( 1 ), + .EVNT_WIDTH ( 8 ), + .SOC_FIFO_DEPTH ( 8 ) ) i_magia_event_unit ( - .clk_i ( sys_clk ), - .rst_ni ( rst_ni ), - .test_mode_i ( test_mode_i ), + .clk_i ( sys_clk ), + .rst_ni ( rst_ni ), + .test_mode_i ( test_mode_i ), // Event inputs - single core arrays - .acc_events_i ( acc_events_array ), // Accelerator events - .dma_events_i ( dma_events_array ), // iDMA completion events - .timer_events_i ( timer_events_array ), - .other_events_i ( other_events_array ), // Combined events + .acc_events_i ( acc_events_array ), + .dma_events_i ( dma_events_array ), + .timer_events_i ( timer_events_array ), + .other_events_i ( other_events_array ), // Core IRQ interface - .core_irq_req_o ( eu_core_irq_req ), - 
.core_irq_id_o ( eu_core_irq_id ), - .core_irq_ack_i ( eu_core_irq_ack ), - .core_irq_ack_id_i ( eu_core_irq_ack_id ), + .core_irq_req_o ( eu_core_irq_req ), + .core_irq_id_o ( eu_core_irq_id ), + .core_irq_ack_i ( eu_core_irq_ack ), + .core_irq_ack_id_i( eu_core_irq_ack_id ), // Core control - .core_busy_i ( core_busy_o ), - .core_clock_en_o ( eu_core_clk_en ), + .core_busy_i ( core_busy_o ), + .core_clock_en_o ( eu_core_clk_en ), // Debug - .dbg_req_i ( debug_req_i ), - .core_dbg_req_o ( eu_core_dbg_req ), - - // EU Direct Link Interface - abstract types - .eu_direct_req_i ( eu_direct_req.req ), - .eu_direct_addr_i ( eu_direct_req.addr ), - .eu_direct_wen_i ( eu_direct_req.wen ), - .eu_direct_wdata_i ( eu_direct_req.wdata ), - .eu_direct_be_i ( eu_direct_req.be ), - .eu_direct_gnt_o ( eu_direct_rsp.gnt ), - .eu_direct_rvalid_o ( eu_direct_rsp.rvalid ), - .eu_direct_rdata_o ( eu_direct_rsp.rdata ), - .eu_direct_err_o ( eu_direct_rsp.err ) + .dbg_req_i ( debug_req_i ), + .core_dbg_req_o ( eu_core_dbg_req ), + + // EU Direct Link Interface (with cut for timing) + .eu_direct_req_i ( eu_direct_req_cut.req ), + .eu_direct_addr_i ( eu_direct_req_cut.addr ), + .eu_direct_wen_i ( eu_direct_req_cut.wen ), + .eu_direct_wdata_i ( eu_direct_req_cut.wdata ), + .eu_direct_be_i ( eu_direct_req_cut.be ), + .eu_direct_gnt_o ( eu_direct_rsp_cut.gnt ), + .eu_direct_rvalid_o ( eu_direct_rsp_cut.rvalid ), + .eu_direct_rdata_o ( eu_direct_rsp_cut.rdata ), + .eu_direct_err_o ( eu_direct_rsp_cut.err ), + + // OBI Peripheral Slave Interface + .obi_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_EVENT_UNIT_IDX] ), + .obi_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_EVENT_UNIT_IDX] ) ); /*******************************************************/ -/** Event Unit End **/ +/** Event Unit End **/ +/*******************************************************/ + +/*******************************************************/ +/** Spatz CC Beginning **/ 
+/*******************************************************/ + // Spatz interrupt signals + assign spatz_irq.msip = 1'b0; // Machine software interrupt (unused - no multi-core) + assign spatz_irq.mtip = 1'b0; // Machine timer interrupt (unused) + assign spatz_irq.meip = spatz_start; // Machine external interrupt - from obi_slave_ctrl_spatz + assign spatz_irq.mcip = 1'b0; // Machine cluster-local interrupt (unused - no cluster) + assign spatz_irq.debug = 1'b0; // Debug request from external debugger + + spatz_cc_wrapper #( + .AddrWidth ( magia_pkg::ADDR_W ), + .DataWidth ( magia_tile_pkg::SPATZ_TCDM_DATA_WIDTH ), + .NumSpatzFPUs ( SPATZ_NUM_FPU ), + .NumSpatzIPUs ( SPATZ_NUM_IPU ), + .BootAddr ( magia_tile_pkg::SPATZ_BOOT_ADDR ), + .RVF ( magia_tile_pkg::SPATZ_RVF_PARAM ), + .RVD ( magia_tile_pkg::SPATZ_RVD_PARAM ), + .RVV ( magia_tile_pkg::SPATZ_RVV_PARAM ), + .XDivSqrt ( magia_tile_pkg::SPATZ_XDIVSQRT_PARAM ), + .FPUImplementation ( magia_tile_pkg::SPATZ_FPUImplementation ) + ) i_spatz_cc_core ( + .clk_i ( spatz_clk ), // Use gated clock + .rst_ni ( rst_ni ), + .test_mode_i ( test_mode_i ), + + // Hart ID + .hart_id_i ( mhartid_i ), // Same as CV32 + .tcdm_addr_base_i ( tile_l1_start_addr ), // Dynamic L1 base per tile + + // Interrupts + .irq_i ( spatz_irq ), + + // HCI Master Interface - Connect to HCI interconnect + .hci_master_req_o ( spatz_hci_req ), + .hci_master_rsp_i ( spatz_hci_rsp ), + + // OBI Master Interface - Connect to OBI crossbar + .obi_master_req_o ( spatz_obi_req ), + .obi_master_rsp_i ( spatz_obi_rsp ), + + // Instruction Cache Interface + .inst_req_o ( spatz_inst_req ), + .inst_addr_o ( spatz_inst_addr ), + .inst_cacheable_o ( spatz_inst_cacheable ), + .flush_i_valid_o ( spatz_flush_i_valid ), + .inst_data_i ( spatz_inst_data ), + .inst_ready_i ( spatz_inst_ready ), + .inst_error_i ( spatz_inst_error ), + .flush_i_ready_i ( spatz_flush_i_ready ), + + // Events and status + .core_events_o ( spatz_core_events ) + ); + 
+/*******************************************************/ +/** Spatz CC End **/ +/*******************************************************/ +/** Spatz ICache Beginning **/ +/*******************************************************/ + + assign spatz_enable_prefetching = 1'b0; + + snitch_icache #( + .NR_FETCH_PORTS ( 1 ), // Single Spatz CC core + .L0_LINE_COUNT ( 8 ), // L0 cache lines + .LINE_WIDTH ( magia_tile_pkg::SPATZ_ICACHE_LINE_WIDTH ), // 256 bits + .LINE_COUNT ( magia_tile_pkg::SPATZ_ICACHE_LINE_COUNT ), // 32 lines + .SET_COUNT ( magia_tile_pkg::SPATZ_ICACHE_WAYS ), // 2-way set associative + .FETCH_AW ( magia_pkg::ADDR_W ), // Address width + .FETCH_DW ( 32 ), // 32-bit instructions + .FILL_AW ( magia_pkg::ADDR_W ), // AXI address width + .FILL_DW ( magia_pkg::DATA_W ), // AXI data width + .SERIAL_LOOKUP ( 0 ), + .L1_TAG_SCM ( 0 ), + .NUM_AXI_OUTSTANDING( 2 ), + .EARLY_LATCH ( 0 ), + .L0_EARLY_TAG_WIDTH ( magia_tile_pkg::SPATZ_L0_EARLY_TAG_W ), + .ISO_CROSSING ( 1'b0 ), + .axi_req_t ( magia_tile_pkg::core_axi_instr_req_t ), + .axi_rsp_t ( magia_tile_pkg::core_axi_instr_rsp_t ) + ) i_spatz_cc_icache ( + .clk_i ( spatz_clk ), + .clk_d2_i ( spatz_clk ), + .rst_ni ( rst_ni ), + .enable_prefetching_i ( spatz_enable_prefetching ), + .icache_l0_events_o ( ), + .icache_l1_events_o ( ), + .flush_valid_i ( spatz_flush_i_valid ), + .flush_ready_o ( spatz_flush_i_ready ), + .inst_addr_i ( spatz_inst_addr ), + .inst_cacheable_i ( spatz_inst_cacheable ), + .inst_data_o ( spatz_inst_data ), + .inst_valid_i ( spatz_inst_req ), + .inst_ready_o ( spatz_inst_ready ), + .inst_error_o ( spatz_inst_error ), + .sram_cfg_tag_i ( '0 ), + .sram_cfg_data_i ( '0 ), + .axi_req_o ( spatz_icache_axi_req ), + .axi_rsp_i ( spatz_icache_axi_rsp ) + ); + +/*******************************************************/ +/** Spatz ICache End **/ +/*******************************************************/ +/** Spatz Bootrom Beginning **/ +/*******************************************************/ + // 
AXI to regbus converter for bootrom + magia_tile_pkg::reg_dma_req_t bootrom_reg_req; + magia_tile_pkg::reg_dma_rsp_t bootrom_reg_rsp; + logic [magia_pkg::AXI_NOC_ID_W-1:0] bootrom_reg_id; + logic bootrom_busy; + + axi_to_reg_v2 #( + .AxiAddrWidth ( magia_pkg::ADDR_W ), + .AxiDataWidth ( magia_pkg::DATA_W ), + .AxiIdWidth ( magia_pkg::AXI_NOC_ID_W ), + .AxiUserWidth ( magia_pkg::AXI_NOC_U_W ), + .RegDataWidth ( magia_pkg::DATA_W ), + .axi_req_t ( magia_pkg::axi_xbar_mst_req_t ), + .axi_rsp_t ( magia_pkg::axi_xbar_mst_rsp_t ), + .reg_req_t ( magia_tile_pkg::reg_dma_req_t ), + .reg_rsp_t ( magia_tile_pkg::reg_dma_rsp_t ) + ) i_axi_to_reg_bootrom ( + .clk_i ( sys_clk ), + .rst_ni ( rst_ni ), + .axi_req_i ( axi_xbar_mst_req[magia_tile_pkg::AXI_XBAR_MST_BOOTROM_IDX] ), + .axi_rsp_o ( axi_xbar_mst_rsp[magia_tile_pkg::AXI_XBAR_MST_BOOTROM_IDX] ), + .reg_req_o ( bootrom_reg_req ), + .reg_rsp_i ( bootrom_reg_rsp ), + .reg_id_o ( bootrom_reg_id ), + .busy_o ( bootrom_busy ) + ); + + // Spatz bootrom module (generated from spatz_init.S) + spatz_bootrom i_spatz_bootrom ( + .clk_i ( sys_clk ), + .req_i ( bootrom_reg_req.valid ), + .addr_i ( bootrom_reg_req.addr ), + .rdata_o ( bootrom_reg_rsp.rdata ) + ); + + // Regbus response handling instead of macro + always_ff @(posedge sys_clk or negedge rst_ni) begin + if (!rst_ni) begin + bootrom_reg_rsp.ready <= 1'b0; + end else begin + bootrom_reg_rsp.ready <= bootrom_reg_req.valid; + end + end + + assign bootrom_reg_rsp.error = 1'b0; + +/*******************************************************/ +/** Spatz Bootrom End **/ /*******************************************************/ endmodule: magia_tile \ No newline at end of file diff --git a/hw/tile/magia_tile_pkg.sv b/hw/tile/magia_tile_pkg.sv index 4f06773..e7d135b 100644 --- a/hw/tile/magia_tile_pkg.sv +++ b/hw/tile/magia_tile_pkg.sv @@ -29,6 +29,8 @@ package magia_tile_pkg; `include "register_interface/typedef.svh" `include "idma/typedef.svh" `include "fractal_sync/typedef.svh" + 
`include "reqrsp_interface/typedef.svh" + `include "tcdm_interface/typedef.svh" `include "../include/alias.svh" @@ -50,34 +52,182 @@ package magia_tile_pkg; // Address map localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_ADDR_START = 32'h0000_0100; - localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_SIZE = 32'h0000_0100; + localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_SIZE = 32'h0000_00FF; localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_ADDR_END = REDMULE_CTRL_ADDR_START + REDMULE_CTRL_SIZE; - localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_ADDR_START = REDMULE_CTRL_ADDR_END; - localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_SIZE = 32'h0000_0400; + localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_ADDR_START = REDMULE_CTRL_ADDR_END + 1; + localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_SIZE = 32'h0000_03FF; localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_ADDR_END = IDMA_CTRL_ADDR_START + IDMA_CTRL_SIZE; - localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_ADDR_START = IDMA_CTRL_ADDR_END; - localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_SIZE = 32'h0000_0100; + localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_ADDR_START = IDMA_CTRL_ADDR_END + 1; + localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_SIZE = 32'h0000_00FF; localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_ADDR_END = FSYNC_CTRL_ADDR_START + FSYNC_CTRL_SIZE; - localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_ADDR_START = FSYNC_CTRL_ADDR_END; - localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_SIZE = 32'h0000_1000; + localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_ADDR_START = FSYNC_CTRL_ADDR_END + 1; + localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_SIZE = 32'h0000_0FFF; localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_ADDR_END = EVENT_UNIT_ADDR_START + EVENT_UNIT_SIZE; - localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_ADDR_START = EVENT_UNIT_ADDR_END; - localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_SIZE = 32'h0000_E900; // Calculated to 
make RESERVED_ADDR_END = 0x0001_0000 + localparam logic [magia_pkg::ADDR_W-1:0] SPATZ_CTRL_ADDR_START = EVENT_UNIT_ADDR_END + 1; + localparam logic [magia_pkg::ADDR_W-1:0] SPATZ_CTRL_SIZE = 32'h0000_00FF; + localparam logic [magia_pkg::ADDR_W-1:0] SPATZ_CTRL_ADDR_END = SPATZ_CTRL_ADDR_START + SPATZ_CTRL_SIZE; + localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_ADDR_START = SPATZ_CTRL_ADDR_END + 1; + localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_SIZE = 32'h0000_E7FF; // Calculated to make RESERVED_ADDR_END = 0x0000_FFFF (inclusive end; STACK starts at 0x0001_0000) localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_ADDR_END = RESERVED_ADDR_START + RESERVED_SIZE; - localparam logic [magia_pkg::ADDR_W-1:0] STACK_ADDR_START = RESERVED_ADDR_END; - localparam logic [magia_pkg::ADDR_W-1:0] STACK_SIZE = 32'h0001_0000; + localparam logic [magia_pkg::ADDR_W-1:0] STACK_ADDR_START = RESERVED_ADDR_END +1; + localparam logic [magia_pkg::ADDR_W-1:0] STACK_SIZE = 32'h0000_FFFF; localparam logic [magia_pkg::ADDR_W-1:0] STACK_ADDR_END = STACK_ADDR_START + STACK_SIZE; - localparam logic [magia_pkg::ADDR_W-1:0] L1_ADDR_START = STACK_ADDR_END; - localparam logic [magia_pkg::ADDR_W-1:0] L1_SIZE = 32'h000E_0000; + localparam logic [magia_pkg::ADDR_W-1:0] L1_ADDR_START = STACK_ADDR_END + 1; + localparam logic [magia_pkg::ADDR_W-1:0] L1_SIZE = 32'h000D_FFFF; localparam logic [magia_pkg::ADDR_W-1:0] L1_ADDR_END = L1_ADDR_START + L1_SIZE; localparam logic [magia_pkg::ADDR_W-1:0] L1_TILE_OFFSET = 32'h0010_0000; localparam logic [magia_pkg::ADDR_W-1:0] L2_ADDR_START = 32'hC000_0000; - localparam logic [magia_pkg::ADDR_W-1:0] L2_SIZE = 32'h4000_0000; + localparam logic [magia_pkg::ADDR_W-1:0] L2_SIZE = 32'h3FFF_FFFF; localparam logic [magia_pkg::ADDR_W-1:0] L2_ADDR_END = L2_ADDR_START + L2_SIZE; + // Instruction region for Spatz code (cacheable region) + localparam logic [magia_pkg::ADDR_W-1:0] INSTRRAM_ADDR_START = 32'hCC00_0000; + localparam logic [magia_pkg::ADDR_W-1:0] INSTRRAM_SIZE = 32'h0000_8000; // 32KB + + localparam logic
[magia_pkg::ADDR_W-1:0] INSTRRAM_PMA_MASK = 32'hFFFF_8000; + + // Snitch PMA Configuration - defines cacheable regions for instruction fetches + function automatic snitch_pma_pkg::rule_t [snitch_pma_pkg::NrMaxRules-1:0] get_snitch_cached_regions(); + automatic snitch_pma_pkg::rule_t [snitch_pma_pkg::NrMaxRules-1:0] cached_regions; + cached_regions = '{default: '0}; + cached_regions[0] = '{base: INSTRRAM_ADDR_START, mask: INSTRRAM_PMA_MASK}; + return cached_regions; + endfunction + + localparam snitch_pma_pkg::snitch_pma_t SPATZ_SNITCH_PMA_CFG = '{ + NrCachedRegionRules: 1, + CachedRegion: get_snitch_cached_regions(), + default: 0 + }; + + //SPATZ PARAMETERS from Makefile defines + //SPATZ_RVD (Double-width vector extension support) + `ifdef SPATZ_RVD + localparam bit SPATZ_RVD_PARAM = `SPATZ_RVD; + `else + localparam bit SPATZ_RVD_PARAM = 1'b0; + `endif + + //SPATZ_N_IPU and SPATZ_N_FPU + `ifdef SPATZ_N_IPU + localparam int unsigned SPATZ_NUM_IPU = `SPATZ_N_IPU; + `else + localparam int unsigned SPATZ_NUM_IPU = 1; + `endif + + `ifdef SPATZ_N_FPU + localparam int unsigned SPATZ_NUM_FPU = `SPATZ_N_FPU; + `else + localparam int unsigned SPATZ_NUM_FPU = 4; + `endif + + //SPATZ_XDIVSQRT (FP division/sqrt enable) + `ifdef SPATZ_XDIVSQRT + localparam bit SPATZ_XDIVSQRT_PARAM = `SPATZ_XDIVSQRT; + `else + localparam bit SPATZ_XDIVSQRT_PARAM = 1'b0; + `endif + + //SPATZ_XDMA (DMA inside Spatz_cc) + `ifdef SPATZ_XDMA + localparam bit SPATZ_XDMA_PARAM = `SPATZ_XDMA; + `else + localparam bit SPATZ_XDMA_PARAM = 1'b0; + `endif + + //SPATZ_RVF (Single-precision FP support) + `ifdef SPATZ_RVF + localparam bit SPATZ_RVF_PARAM = `SPATZ_RVF; + `else + localparam bit SPATZ_RVF_PARAM = 1'b1; + `endif + + //SPATZ_RVV (Vector extension support) + `ifdef SPATZ_RVV + localparam bit SPATZ_RVV_PARAM = `SPATZ_RVV; + `else + localparam bit SPATZ_RVV_PARAM = 1'b1; + `endif + + // Spatz CC parameters (must be defined before HCI parameters) + localparam int unsigned SPATZ_NUM_FU = (SPATZ_NUM_FPU > 
SPATZ_NUM_IPU) ? SPATZ_NUM_FPU : SPATZ_NUM_IPU; // Max of FPU and IPU + localparam int unsigned SPATZ_TCDM_PORTS = SPATZ_NUM_FU + 1; // N_FU + 1 TCDM ports (N_FU vector + 1 snitch) + localparam int unsigned SPATZ_HCI_PORTS = SPATZ_RVD_PARAM ? (SPATZ_TCDM_PORTS * 2) : SPATZ_TCDM_PORTS; // RVD=1: 2xTCDM HCI32, RVD=0: 1xTCDM HCI32 + + // Spatz CC outstanding transactions and timing parameters + parameter int unsigned SPATZ_NUM_INT_OUTSTANDING_LOADS = 1; // Snitch core outstanding loads + parameter int unsigned SPATZ_NUM_INT_OUTSTANDING_MEM = 4; // Snitch core outstanding memory ops + parameter int unsigned SPATZ_NUM_SPATZ_OUTSTANDING_LOADS = 4; // Spatz vector unit outstanding loads + parameter bit SPATZ_XDIVSQRT = SPATZ_XDIVSQRT_PARAM; // From Makefile define (0=disabled, 1=enabled) + parameter bit SPATZ_XDMA = SPATZ_XDMA_PARAM; // From Makefile define (0=disabled, 1=enabled) + parameter bit SPATZ_RVF = SPATZ_RVF_PARAM; // From Makefile define (single-precision FP) + parameter bit SPATZ_RVV = SPATZ_RVV_PARAM; // From Makefile define (vector extension) + parameter bit SPATZ_REGISTER_OFFLOAD_RSP = 1'b0; // Pipeline register on offload response + parameter bit SPATZ_REGISTER_CORE_REQ = 1'b1; // Pipeline register on core request + parameter bit SPATZ_REGISTER_CORE_RSP = 1'b1; // Pipeline register on core response + + // Spatz FPU implementation configuration + localparam fpnew_pkg::fpu_implementation_t SPATZ_FPUImplementation = '{ + PipeRegs: // FMA Block + '{ + '{ 3, // FP32 FMA + SPATZ_RVD_PARAM ? 3 : 0, // FP64 FMA + 3, // FP16 + 3, // FP8 + 3, // FP16alt + 3 // FP8alt + }, + '{1, 1, 1, 1, 1, 1}, // DIVSQRT (all formats) + '{1, 1, 1, 1, 1, 1}, // NONCOMP (all formats) + '{2, 2, 2, 2, 2, 2}, // CONV (all formats) + '{5, 5, 5, 5, 5, 5} // DOTP + }, + UnitTypes: '{'{ fpnew_pkg::MERGED, + SPATZ_RVD_PARAM ? 
fpnew_pkg::MERGED : fpnew_pkg::DISABLED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // FMA + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}, // DIVSQRT + '{fpnew_pkg::PARALLEL, + SPATZ_RVD_PARAM ? fpnew_pkg::PARALLEL : fpnew_pkg::DISABLED, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL}, // NONCOMP + '{fpnew_pkg::MERGED, + SPATZ_RVD_PARAM ? fpnew_pkg::MERGED : fpnew_pkg::DISABLED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // CONV + '{fpnew_pkg::MERGED, + SPATZ_RVD_PARAM ? fpnew_pkg::MERGED : fpnew_pkg::DISABLED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}}, // DOTP + PipeConfig: fpnew_pkg::BEFORE + }; + + // Spatz bootrom parameters + parameter logic [31:0] SPATZ_BOOT_ADDR = 32'h1000_0000; // Spatz bootrom base address + parameter logic [31:0] SPATZ_BOOTROM_SIZE = 32'h0000_00FF; + + // Spatz TCDM parameters + parameter int unsigned SPATZ_TCDM_ADDR_WIDTH = $clog2(magia_pkg::N_MEM_BANKS * magia_pkg::N_WORDS_BANK * magia_pkg::DATA_W / 8); + parameter int unsigned SPATZ_TCDM_DATA_WIDTH = SPATZ_RVD_PARAM ? 64 : 32; // Spatz TCDM data width + parameter int unsigned SPATZ_TCDM_STRB_WIDTH = SPATZ_RVD_PARAM ? 
8 : 4; // Spatz TCDM strobe width + + // Parameters used by the HCI parameter int unsigned N_HWPE = 1; // Number of HWPEs attached to the port - parameter int unsigned N_CORE = 1; // Number of Core ports + parameter int unsigned N_CORE = 1 + SPATZ_HCI_PORTS; // Number of Core ports: 1 CV32 + Spatz HCI ports (RVD=1: 11 total, RVD=0: 6 total) parameter int unsigned N_DMA = 2; // Number of DMA ports (1 for the read channel and 1 for the write channel) typedef enum logic{ HCI_DMA_CH_READ_IDX = 1'b0, @@ -125,7 +275,7 @@ package magia_tile_pkg; parameter bit USE_PMP = 1'b1; // Enable PMP parameter bit PULP_CLUSTER = 1'b1; // PULP cluster mode parameter bit FPU = 1'b1; // Enable FPU (main feature) - parameter bit ZFINX = 1'b0; // Zfinx extension (integer FP in GPR) - Must be 0 for standard FPU + parameter bit ZFINX = 1'b1; // Zfinx extension (integer FP in GPR) - Must be 1 for CV32E40P with FP16/FP16alt parameter bit FP_DIVSQRT = 1'b1; // FP division and square root parameter bit SHARED_FP = 1'b0; // Shared FP unit parameter bit SHARED_DSP_MULT = 1'b0; // Shared DSP multiplier @@ -167,23 +317,24 @@ package magia_tile_pkg; parameter int unsigned MID_WIDTH = 1; // Width of the mid signal (manager identifier, see OBI documentation) parameter int unsigned OBI_ID_WIDTH = 1; // Width of the id - configuration `ifdef CV32E40X - parameter int unsigned N_SBR = 2; // Number of slaves (HCI, AXI XBAR) + parameter int unsigned N_SBR = 4; // Number of slaves (HCI, AXI XBAR, Event_Unit, Spatz_Ctrl) `else - parameter int unsigned N_SBR = 5; // Number of slaves (HCI, AXI XBAR, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl) - Event_Unit now via eu_direct_link + parameter int unsigned N_SBR = 7; // Number of OBI slaves (HCI, AXI XBAR, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl, Event_Unit, Spatz_Ctrl) `endif - parameter int unsigned N_MGR = 2; // Number of masters (Core, AXI XBAR) + parameter int unsigned N_MGR = 3; // Number of masters (Core, AXI XBAR, Spatz CC) parameter int unsigned N_MAX_TRAN = 1; // 
Number of maximum outstanding transactions `ifdef CV32E40X - parameter int unsigned N_ADDR_RULE = 4; // Number of address rules (L2, L1, Stack, Reserved) + parameter int unsigned N_ADDR_RULE = 6; // Number of address rules (L2, L1, Stack, Reserved, Event_Unit, Spatz_Ctrl) `else - parameter int unsigned N_ADDR_RULE = 7; // Number of address rules (L2, L1, Stack, Reserved, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl) - Event_Unit now via eu_direct_link + parameter int unsigned N_ADDR_RULE = 9; // Number of OBI address rules (L2, L1, Stack, Reserved, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl, Event_Unit, Spatz_Ctrl) `endif localparam int unsigned N_BIT_SBR = $clog2(N_SBR); // Number of bits required to identify each slave // Parameters used by AXI - parameter int unsigned AXI_DATA_ID_W = 2; // Width of the AXI Data ID (2 bits: Core, iDMA, I$, ext) - parameter int unsigned AXI_INSTR_ID_W = 1; // Width of the AXI Instruction ID (0 bits: direct Core - I$ connection) - parameter int unsigned AXI_ID_W = 2; // Width of the AXI Unified Communication Channel ID + parameter int unsigned AXI_DATA_ID_W = 3; // Width of the AXI Data ID (3 bits for 5 slave ports on crossbar: 2^3=8) + parameter int unsigned AXI_INSTR_ID_W = 3; // Width of the AXI Instruction ID (3 bits for 5 slave ports on crossbar) + parameter int unsigned AXI_ID_W = 3; // Width of the AXI Unified Communication Channel ID (3 bits for 5 slave ports) + localparam int unsigned AXI_MST_ID_W = 6; // Width of master port ID (slave 3b + prepend 3b for 5 ports) parameter int unsigned AXI_DATA_U_W = magia_pkg::USR_W; // Width of the AXI Data User parameter int unsigned AXI_INSTR_U_W = magia_pkg::USR_W; // Width of the AXI Instruction User parameter int unsigned AXI_U_W = magia_pkg::USR_W; // Width of the AXI Unified Communication Channel User @@ -307,8 +458,8 @@ package magia_tile_pkg; parameter bit FSYNC_STALL = 1; // Fractal Sync Stall during synchronization // Parameters of the AXI XBAR - parameter int unsigned AxiXbarNoSlvPorts = 4; 
// Number of Slave Ports (iDMA, Core Data, Core I$ and ext) - parameter int unsigned AxiXbarNoMstPorts = 2; // Number of Master Ports (to ext and to internal L1 from ext) + parameter int unsigned AxiXbarNoSlvPorts = 5; // Number of Slave Ports (ext, iDMA, Core Data, CV32 I$, Spatz I$) + parameter int unsigned AxiXbarNoMstPorts = 3; // Number of Master Ports (to ext, to internal L1, to Spatz bootrom) localparam int unsigned AxiXbarSlvAxiIDWidth = AXI_DATA_ID_W; // Number of bits to indentify each Slave Port parameter int unsigned AxiXbarMaxWTrans = 16; // Maximum number of outstanding transactions per write parameter int unsigned AxiXbarMaxMstTrans = AxiXbarMaxWTrans; // Maximum number of outstanding transactions per master @@ -332,6 +483,12 @@ package magia_tile_pkg; parameter int unsigned FETCH_DW = magia_pkg::DATA_W; // i$ Fetch interface data width. Power of two; >= 8. parameter int unsigned FILL_AW = magia_pkg::ADDR_W; // i$ Fill interface address width. Same as FILL_AW; >= 1. parameter int unsigned FILL_DW = magia_pkg::DATA_W; // i$ Fill interface data width. Power of two; >= 8. 
+ + // Spatz ICache parameters (dedicated icache for Spatz CC) + parameter int unsigned SPATZ_ICACHE_LINE_WIDTH = 256; // Spatz i$ cache line width (should be investigated which is the best value) + parameter int unsigned SPATZ_ICACHE_LINE_COUNT = 32; // Spatz i$ number of cache lines + parameter int unsigned SPATZ_ICACHE_WAYS = 2; // Spatz i$ number of ways (2-way set associative) + localparam int unsigned SPATZ_L0_EARLY_TAG_W = snitch_pkg::PAGE_SHIFT - $clog2(SPATZ_ICACHE_LINE_WIDTH/8); // L0 early tag width // Parameters used by the FPU parameter bit FPU_ZFINX = 0; // FPU use Zfinx extension instead of the F ISA extention @@ -364,9 +521,10 @@ package magia_tile_pkg; logic[magia_pkg::ADDR_W-1:0] end_addr; } obi_xbar_rule_t; - typedef enum logic{ - OBI_EXT_IDX = 1, - OBI_CORE_IDX = 0 + typedef enum logic[1:0]{ + OBI_SPATZ_IDX = 2, + OBI_EXT_IDX = 1, + OBI_CORE_IDX = 0 } obi_xbar_idx_e; typedef struct packed { @@ -450,16 +608,20 @@ package magia_tile_pkg; } core_cache_instr_rsp_t; `ifdef CV32E40X - typedef enum logic[1:0]{ - OBI_XBAR_STACK_IDX = 3, - OBI_XBAR_RESERVED_IDX = 2, - OBI_XBAR_L1SPM_IDX = 1, - OBI_XBAR_L2_IDX = 0 + typedef enum logic[2:0]{ + OBI_XBAR_STACK_IDX = 5, + OBI_XBAR_SPATZ_CTRL_IDX = 4, + OBI_XBAR_EVENT_UNIT_IDX = 3, + OBI_XBAR_RESERVED_IDX = 2, + OBI_XBAR_L1SPM_IDX = 1, + OBI_XBAR_L2_IDX = 0 } obi_mem_array_idx_e; `else - typedef enum logic[2:0]{ - OBI_XBAR_STACK_IDX = 6, - OBI_XBAR_RESERVED_IDX = 5, + typedef enum logic[3:0]{ + OBI_XBAR_STACK_IDX = 8, + OBI_XBAR_RESERVED_IDX = 7, + OBI_XBAR_SPATZ_CTRL_IDX = 6, + OBI_XBAR_EVENT_UNIT_IDX = 5, OBI_XBAR_FSYNC_CTRL_IDX = 4, OBI_XBAR_IDMA_IDX = 3, OBI_XBAR_REDMULE_CTRL_IDX = 2, @@ -468,20 +630,30 @@ package magia_tile_pkg; } obi_mem_array_idx_e; `endif - typedef enum logic[1:0]{ - AXI_XBAR_STACK_IDX = 3, - AXI_XBAR_RESERVED_IDX = 2, - AXI_XBAR_L1SPM_IDX = 1, + typedef enum logic[2:0]{ + AXI_XBAR_STACK_IDX = 4, + AXI_XBAR_RESERVED_IDX = 3, + AXI_XBAR_L1SPM_IDX = 2, + AXI_XBAR_BOOTROM_IDX = 1, 
AXI_XBAR_L2_IDX = 0 } axi_mem_array_idx_e; - typedef enum logic[1:0]{ - AXI_EXT_IDX = 3, - AXI_IDMA_IDX = 2, - AXI_CORE_DATA_IDX = 1, - AXI_CORE_INSTR_IDX = 0 + + typedef enum logic[2:0]{ + AXI_SPATZ_INSTR_IDX = 4, + AXI_EXT_IDX = 3, + AXI_IDMA_IDX = 2, + AXI_CORE_DATA_IDX = 1, + AXI_CORE_INSTR_IDX = 0 } axi_xbar_idx_e; + + typedef enum logic[1:0]{ + AXI_XBAR_MST_EXT_IDX = 0, + AXI_XBAR_MST_INT_IDX = 1, + AXI_XBAR_MST_BOOTROM_IDX = 2 + } axi_xbar_mst_idx_e; + typedef struct packed { logic[N_SIGN-1:0][SIGN_W-1:0] sign_list; } xif_inst_rule_t; @@ -495,7 +667,15 @@ package magia_tile_pkg; `HCI_TYPEDEF_RSP_T(redmule_data_rsp_t, logic[DWH-1:0], logic[UWH-1:0], logic[0:0], logic[0:0], logic[0:0]) localparam obi_pkg::obi_optional_cfg_t obi_amo_optional_cfg = obi_pkg::obi_all_optional_config(AUSER_WIDTH, WUSER_WIDTH, RUSER_WIDTH, MID_WIDTH, ACHK_WIDTH, RCHK_WIDTH); - localparam obi_pkg::obi_cfg_t obi_amo_cfg = obi_pkg::obi_default_cfg(magia_pkg::ADDR_W, magia_pkg::DATA_W, OBI_ID_WIDTH, obi_amo_optional_cfg); + localparam obi_pkg::obi_optional_cfg_t obi_no_amo_optional_cfg = '{UseAtop: 1'b0, UseMemtype: 1'b0, UseProt: 1'b0, UseDbg: 1'b0, AUserWidth: AUSER_WIDTH, WUserWidth: WUSER_WIDTH, RUserWidth: RUSER_WIDTH, MidWidth: MID_WIDTH, AChkWidth: ACHK_WIDTH, RChkWidth: RCHK_WIDTH}; + + // OBI full configurations - 32-bit (default) + localparam obi_pkg::obi_cfg_t obi_amo_cfg = obi_pkg::obi_default_cfg(magia_pkg::ADDR_W, magia_pkg::DATA_W, OBI_ID_WIDTH, obi_amo_optional_cfg); + localparam obi_pkg::obi_cfg_t obi_no_amo_cfg = obi_pkg::obi_default_cfg(magia_pkg::ADDR_W, magia_pkg::DATA_W, OBI_ID_WIDTH, obi_no_amo_optional_cfg); + + // OBI full configurations - 64-bit + localparam obi_pkg::obi_cfg_t obi_amo_cfg_64 = obi_pkg::obi_default_cfg(magia_pkg::ADDR_W, SPATZ_TCDM_DATA_WIDTH, OBI_ID_WIDTH, obi_amo_optional_cfg); + localparam obi_pkg::obi_cfg_t obi_no_amo_cfg_64 = obi_pkg::obi_default_cfg(magia_pkg::ADDR_W, SPATZ_TCDM_DATA_WIDTH, OBI_ID_WIDTH, obi_no_amo_optional_cfg); 
localparam bit RegisterAmo = 1; `OBI_TYPEDEF_ALL_A_OPTIONAL(core_data_obi_a_optional_t, AUSER_WIDTH, WUSER_WIDTH, MID_WIDTH, ACHK_WIDTH) @@ -518,6 +698,7 @@ package magia_tile_pkg; `AXI_TYPEDEF_ALL_CT(core_axi_data, core_axi_data_req_t, core_axi_data_rsp_t, logic[magia_pkg::ADDR_W-1:0], logic[AXI_ID_W-1:0], logic[magia_pkg::DATA_W-1:0], logic[magia_pkg::STRB_W-1:0], logic[AXI_U_W-1:0]) `AXI_TYPEDEF_ALL_CT(core_axi_instr, core_axi_instr_req_t, core_axi_instr_rsp_t, logic[magia_pkg::ADDR_W-1:0], logic[AXI_ID_W-1:0], logic[magia_pkg::DATA_W-1:0], logic[magia_pkg::STRB_W-1:0], logic[AXI_U_W-1:0]) + `REG_BUS_TYPEDEF_ALL(reg_dma, logic[magia_pkg::ADDR_W-1:0], logic[magia_pkg::DATA_W-1:0], logic[magia_pkg::STRB_W-1:0]) `REG_BUS_TYPEDEF_ALL(idma_fe_reg, logic[magia_pkg::ADDR_W-1:0], logic[magia_pkg::DATA_W-1:0], logic[magia_pkg::STRB_W-1:0]) `IDMA_TYPEDEF_FULL_REQ_T(idma_be_req_t, logic[iDMA_AxiIdWidth-1:0], idma_addr_t, logic[iDMA_TFLenWidth-1:0]) @@ -557,9 +738,9 @@ package magia_tile_pkg; idma_obi_a_chan_t a_chan; } obi; } idma_write_meta_channel_t; - - `AXI_ALIAS(core_axi_data, axi_xbar_slv, core_axi_data_req_t, axi_xbar_slv_req_t, core_axi_data_rsp_t, axi_xbar_slv_rsp_t) - `AXI_ALIAS(core_axi_data, axi_xbar_mst, core_axi_data_req_t, axi_xbar_mst_req_t, core_axi_data_rsp_t, axi_xbar_mst_rsp_t) + + `AXI_TYPEDEF_ALL_CT(axi_xbar_slv, axi_xbar_slv_req_t, axi_xbar_slv_rsp_t, logic[magia_pkg::ADDR_W-1:0], logic[AXI_ID_W-1:0], logic[magia_pkg::DATA_W-1:0], logic[magia_pkg::STRB_W-1:0], logic[AXI_U_W-1:0]) + `AXI_TYPEDEF_ALL_CT(axi_xbar_mst, axi_xbar_mst_req_t, axi_xbar_mst_rsp_t, logic[magia_pkg::ADDR_W-1:0], logic[AXI_MST_ID_W-1:0], logic[magia_pkg::DATA_W-1:0], logic[magia_pkg::STRB_W-1:0], logic[AXI_U_W-1:0]) `HCI_TYPEDEF_REQ_T(idma_hci_req_t, logic[AWC-1:0], logic[DW_LIC-1:0], logic[SW_LIC-1:0], logic[UWH-1:0], logic[0:0], logic[0:0], logic[0:0]) `HCI_TYPEDEF_RSP_T(idma_hci_rsp_t, logic[DW_LIC-1:0], logic[UWH-1:0], logic[0:0], logic[0:0], logic[0:0]) @@ -577,7 +758,7 @@ 
package magia_tile_pkg; UniqueIds : 1'b0, AxiAddrWidth : magia_pkg::ADDR_W, AxiDataWidth : magia_pkg::DATA_W, - NoAddrRules : 3 + NoAddrRules : 4 }; `FSYNC_TYPEDEF_ALL(ht_tile_fsync, logic[FSYNC_AGGR_W-1:0], logic[FSYNC_LVL_W-1:0], logic[FSYNC_ID_W-1:0]) @@ -585,4 +766,50 @@ package magia_tile_pkg; `FSYNC_TYPEDEF_ALL(hn_tile_fsync, logic[FSYNC_NBR_AGGR_W-1:0], logic[FSYNC_NBR_LVL_W-1:0], logic[FSYNC_NBR_ID_W-1:0]) `FSYNC_TYPEDEF_ALL(vn_tile_fsync, logic[FSYNC_NBR_AGGR_W-1:0], logic[FSYNC_NBR_LVL_W-1:0], logic[FSYNC_NBR_ID_W-1:0]) + /*******************************************************************/ + /* Spatz Core Complex Wrapper Types */ + /*******************************************************************/ + + // Base types for Spatz TCDM and reqrsp + typedef logic [magia_pkg::ADDR_W-1:0] spatz_addr_t; + typedef logic [magia_pkg::DATA_W-1:0] spatz_data_t; + typedef logic [magia_pkg::DATA_W/8-1:0] spatz_strb_t; + typedef logic spatz_tcdm_user_t; + typedef logic [magia_tile_pkg::SPATZ_TCDM_ADDR_WIDTH-1:0] spatz_tcdm_addr_t; + + // 64-bit types for TCDM and reqrsp interfaces when RVD=1 + typedef logic [SPATZ_TCDM_DATA_WIDTH-1:0] spatz_data64_t; + typedef logic [SPATZ_TCDM_STRB_WIDTH-1:0] spatz_strb64_t; + + `TCDM_TYPEDEF_ALL(spatz_tcdm, spatz_tcdm_addr_t, spatz_data_t, spatz_strb_t, spatz_tcdm_user_t) + `REQRSP_TYPEDEF_ALL(spatz_reqrsp, spatz_addr_t, spatz_data_t, spatz_strb_t) // 32-bit for Snitch data port (RVD=0, DataWidth=32) + `REQRSP_TYPEDEF_ALL(spatz_reqrsp64, spatz_addr_t, spatz_data64_t, spatz_strb64_t) // 64-bit for Snitch data port (RVD=1, DataWidth=64) + + // TCDM64 types with AMO support (using TCDM_TYPEDEF_ALL macro which includes amo field) + typedef logic [magia_pkg::ADDR_W-1:0] spatz_tcdm64_addr_t; + typedef logic [SPATZ_TCDM_DATA_WIDTH-1:0] spatz_tcdm64_data_t; + typedef logic [SPATZ_TCDM_STRB_WIDTH-1:0] spatz_tcdm64_strb_t; + typedef logic spatz_tcdm64_user_t; + + `TCDM_TYPEDEF_ALL(spatz_tcdm64, spatz_tcdm64_addr_t, spatz_tcdm64_data_t, 
spatz_tcdm64_strb_t, spatz_tcdm64_user_t) + + // Alias for backward compatibility with spatz_cc instantiation + typedef spatz_tcdm64_req_chan_t spatz_tcdm64_payload_t; + + // TCDM32 types with AMO support (for modular atomic resolver architecture) + typedef logic [magia_pkg::ADDR_W-1:0] spatz_tcdm32_addr_t; + typedef logic [31:0] spatz_tcdm32_data_t; + typedef logic [3:0] spatz_tcdm32_strb_t; + typedef logic spatz_tcdm32_user_t; + + `TCDM_TYPEDEF_ALL(spatz_tcdm32, spatz_tcdm32_addr_t, spatz_tcdm32_data_t, spatz_tcdm32_strb_t, spatz_tcdm32_user_t) + + // OBI 32-bit types for modular atomic architecture + `OBI_TYPEDEF_ALL_A_OPTIONAL(spatz_obi32_a_optional_t, AUSER_WIDTH, WUSER_WIDTH, MID_WIDTH, ACHK_WIDTH) + `OBI_TYPEDEF_ALL_R_OPTIONAL(spatz_obi32_r_optional_t, RUSER_WIDTH, RCHK_WIDTH) + `OBI_TYPEDEF_A_CHAN_T(spatz_obi32_a_chan_t, magia_pkg::ADDR_W, 32, AID_WIDTH, spatz_obi32_a_optional_t) + `OBI_TYPEDEF_R_CHAN_T(spatz_obi32_r_chan_t, 32, RID_WIDTH, spatz_obi32_r_optional_t) + `OBI_TYPEDEF_DEFAULT_REQ_T(spatz_obi32_req_t, spatz_obi32_a_chan_t) + `OBI_TYPEDEF_RSP_T(spatz_obi32_rsp_t, spatz_obi32_r_chan_t) + endpackage: magia_tile_pkg \ No newline at end of file diff --git a/hw/tile/obi_slave_ctrl_spatz.sv b/hw/tile/obi_slave_ctrl_spatz.sv new file mode 100644 index 0000000..d90cf98 --- /dev/null +++ b/hw/tile/obi_slave_ctrl_spatz.sv @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2024 ETH Zurich and University of Bologna + * + * Licensed under the Solderpad Hardware License, Version 0.51 + * (the "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: SHL-0.51 + * + * Authors: Luca Balboni + * + * OBI Slave Control Registers for Spatz Core Complex + * Seven registers at 4-byte offsets from BaseAddr (CLK_EN, READY, START, TASKBIN, DATA, RETURN, DONE); a granted request is answered with rvalid one cycle later. + */ + +module obi_slave_ctrl_spatz + import magia_tile_pkg::*; +#( + parameter logic [31:0] BaseAddr = 32'h00001700 // Base address for control registers +) ( + input logic clk_i, + input logic rst_ni, + + // OBI slave interface + input core_obi_data_req_t obi_req_i, + output core_obi_data_rsp_t obi_rsp_o, + + // Control outputs + output logic clk_en_o, // CLK_EN register, bit 0 + output logic start_o, // START register, bit 0 + output logic done_o // high in the cycle after a write of 1 to DONE +); + + // Register offsets + localparam logic [3:0] CLK_EN_OFFSET = 4'h0; // +0x00 + localparam logic [3:0] READY_OFFSET = 4'h4; // +0x04 + localparam logic [3:0] START_OFFSET = 4'h8; // +0x08 + localparam logic [3:0] TASKBIN_OFFSET = 4'hC; // +0x0C + localparam logic [4:0] DATA_OFFSET = 5'h10; // +0x10 + localparam logic [4:0] RETURN_OFFSET = 5'h14; // +0x14 + localparam logic [4:0] DONE_OFFSET = 5'h18; // +0x18 + + // Registers + logic clk_en_q; + logic start_q; + logic ready_q; + logic [31:0] taskbin_q; + logic [31:0] data_q; + logic [31:0] return_q; + logic done_q; + + // Response pipeline + logic rvalid_q, rvalid_d; + logic [31:0] rdata_q, rdata_d; + logic [$bits(obi_req_i.a.aid)-1:0] rid_q; // aid captured in the address phase, returned as rid with the response + // Address decode (offset from base) + logic [4:0] addr_offset; + logic addr_valid; + + assign addr_offset = obi_req_i.a.addr[4:0] - BaseAddr[4:0]; // low 5 bits of (addr - BaseAddr); equals addr[4:0] for a 32-byte-aligned base such as the default + + // Check if address is in valid range + assign addr_valid = (obi_req_i.a.addr >= BaseAddr) && + (obi_req_i.a.addr < (BaseAddr + 28)); // 7 registers * 4 bytes + + // Grant only if address is valid. NOTE(review): a request outside the 28-byte window is never granted; confirm the interconnect cannot route such an address here + assign obi_rsp_o.gnt = obi_req_i.req && addr_valid; + assign obi_rsp_o.r.err = 1'b0; + + // ============================================ + // Register write logic (combinational) + // ============================================ + logic clk_en_d; + logic start_d; + logic ready_d; + logic [31:0] taskbin_d; + logic [31:0] data_d; + logic [31:0] return_d; +
logic done_d; + + always_comb begin + // Default: keep current values + clk_en_d = clk_en_q; + start_d = start_q; + ready_d = ready_q; + taskbin_d = taskbin_q; + data_d = data_q; + return_d = return_q; + done_d = 1'b0; // Done is a pulse, auto-clears + + // Update registers on write only if address is valid + if (obi_req_i.req && obi_req_i.a.we && addr_valid) begin + case (addr_offset) + CLK_EN_OFFSET: clk_en_d = obi_req_i.a.wdata[0]; + READY_OFFSET: ready_d = obi_req_i.a.wdata[0]; // Spatz sets to 1 when ready for interrupts + START_OFFSET: start_d = obi_req_i.a.wdata[0]; // CV32 sets to 1, Spatz clears to 0 + TASKBIN_OFFSET: taskbin_d = obi_req_i.a.wdata; // Preserved register + DATA_OFFSET: data_d = obi_req_i.a.wdata; // Preserved register + RETURN_OFFSET: return_d = obi_req_i.a.wdata; // Preserved register + DONE_OFFSET: done_d = obi_req_i.a.wdata[0]; // Spatz writes 1, creates pulse + endcase // no default: a write to any other offset inside the window changes nothing + end + end + + // ============================================ + // Register sequential logic + // ============================================ + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + clk_en_q <= 1'b0; + start_q <= 1'b0; + ready_q <= 1'b0; + taskbin_q <= 32'h0; + data_q <= 32'h0; + return_q <= 32'h0; + done_q <= 1'b0; + end else begin + clk_en_q <= clk_en_d; + start_q <= start_d; + ready_q <= ready_d; + taskbin_q <= taskbin_d; + data_q <= data_d; + return_q <= return_d; + done_q <= done_d; + end + end + + // ============================================ + // OBI read response logic (combinational) + // ============================================ + always_comb begin + rdata_d = 32'h0; + rvalid_d = obi_req_i.req && addr_valid; // every granted request, read or write, is answered in the next cycle + + if (obi_req_i.req && !obi_req_i.a.we && addr_valid) begin + case (addr_offset) + CLK_EN_OFFSET: rdata_d = {31'h0, clk_en_q}; + READY_OFFSET: rdata_d = {31'h0, ready_q}; + START_OFFSET: rdata_d = {31'h0, start_q}; + TASKBIN_OFFSET: rdata_d = taskbin_q; + DATA_OFFSET: rdata_d = data_q; + RETURN_OFFSET: rdata_d =
return_q; + DONE_OFFSET: rdata_d = {31'h0, done_q}; + default: rdata_d = 32'hDEADBEEF; // offset inside the window that is not a register offset + endcase + end + end + + // ============================================ + // OBI response sequential logic + // ============================================ + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + rvalid_q <= 1'b0; + rdata_q <= 32'h0; + rid_q <= '0; + end else begin + rvalid_q <= rvalid_d; + rdata_q <= rdata_d; + rid_q <= obi_req_i.a.aid; // keep the ID of the request granted this cycle + end + end + + // Output assignments + assign obi_rsp_o.rvalid = rvalid_q; + assign obi_rsp_o.r.rdata = rdata_q; + assign obi_rsp_o.r.rid = rid_q; // registered with rvalid/rdata so the ID matches the answered request + assign obi_rsp_o.r.r_optional = '0; + assign clk_en_o = clk_en_q; + assign start_o = start_q; + assign done_o = done_q; +endmodule diff --git a/hw/tile/spatz_bootrom.sv b/hw/tile/spatz_bootrom.sv new file mode 100644 index 0000000..7bbac14 --- /dev/null +++ b/hw/tile/spatz_bootrom.sv @@ -0,0 +1,39 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Auto-generated bootrom for Spatz core complex +// Generated from: spatz_init.bin +// Size: 16 bytes (4 words) + +module spatz_bootrom #( + parameter int unsigned DataWidth = 32, + parameter int unsigned AddrWidth = 32 +) ( + input logic clk_i, + input logic req_i, + input logic [AddrWidth-1:0] addr_i, + output logic [DataWidth-1:0] rdata_o +); + localparam int RomSize = 4; + localparam int AddrBits = RomSize > 1 ? $clog2(RomSize) : 1; + + const logic [RomSize-1:0][DataWidth-1:0] mem = { + 32'h00030067, + 32'h0002a303, + 32'h70c28293, + 32'h000012b7 + }; + + // Sequential read with registered address + logic [AddrBits-1:0] addr_q; + + always_ff @(posedge clk_i) begin + if (req_i) begin + addr_q <= addr_i[AddrBits-1+2:2]; + end + end + + // Return data based on registered address + assign rdata_o = (addr_q < RomSize) ?
mem[addr_q] : '0; +endmodule diff --git a/hw/tile/spatz_cc_wrapper.sv b/hw/tile/spatz_cc_wrapper.sv new file mode 100644 index 0000000..5171fcd --- /dev/null +++ b/hw/tile/spatz_cc_wrapper.sv @@ -0,0 +1,489 @@ +/* + * Copyright (C) 2024 ETH Zurich and University of Bologna + * + * Licensed under the Solderpad Hardware License, Version 0.51 + * (the "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: SHL-0.51 + * + * Authors: Luca Balboni + * + * Spatz Core Complex Wrapper for MAGIA Tile + * + */ + +`include "snitch_vm/typedef.svh" + +module spatz_cc_wrapper + import magia_tile_pkg::*; + import magia_pkg::*; + import snitch_pkg::*; + import obi_pkg::*; + import fpnew_pkg::*; +#( + // Spatz Core Complex Parameters + parameter int unsigned AddrWidth = magia_pkg::ADDR_W, + parameter int unsigned DataWidth = magia_tile_pkg::SPATZ_TCDM_DATA_WIDTH, + parameter int unsigned TCDMAddrWidth = magia_tile_pkg::SPATZ_TCDM_ADDR_WIDTH, + parameter int unsigned NumSpatzFPUs = magia_tile_pkg::SPATZ_NUM_FPU, + parameter int unsigned NumSpatzIPUs = magia_tile_pkg::SPATZ_NUM_IPU, + parameter int unsigned NumIntOutstandingLoads = magia_tile_pkg::SPATZ_NUM_INT_OUTSTANDING_LOADS, + parameter int unsigned NumIntOutstandingMem = magia_tile_pkg::SPATZ_NUM_INT_OUTSTANDING_MEM, + parameter int unsigned NumSpatzOutstandingLoads = magia_tile_pkg::SPATZ_NUM_SPATZ_OUTSTANDING_LOADS, + parameter bit Xdma = magia_tile_pkg::SPATZ_XDMA, + parameter bit RVF = magia_tile_pkg::SPATZ_RVF, + parameter bit RVD = 
magia_tile_pkg::SPATZ_RVD_PARAM, + parameter bit RVV = magia_tile_pkg::SPATZ_RVV, + parameter bit XDivSqrt = magia_tile_pkg::SPATZ_XDIVSQRT, + parameter bit RegisterOffloadRsp = magia_tile_pkg::SPATZ_REGISTER_OFFLOAD_RSP, + parameter bit RegisterCoreReq = magia_tile_pkg::SPATZ_REGISTER_CORE_REQ, + parameter bit RegisterCoreRsp = magia_tile_pkg::SPATZ_REGISTER_CORE_RSP, + parameter logic [31:0] BootAddr = magia_tile_pkg::SPATZ_BOOT_ADDR, + + parameter fpnew_pkg::fpu_implementation_t FPUImplementation = magia_tile_pkg::SPATZ_FPUImplementation, + + // Derived parameters - computed dynamically from the configurable N_IPU and N_FPU + localparam int unsigned NumSpatzFUs = (NumSpatzFPUs > NumSpatzIPUs) ? NumSpatzFPUs : NumSpatzIPUs, + localparam int unsigned NumMemPortsPerSpatz = NumSpatzFUs, + localparam int unsigned TCDMPorts = RVV ? NumMemPortsPerSpatz + 1 : 1, // N_FU + 1 TCDM ports + localparam int unsigned HCIMasterPorts = RVD ? (TCDMPorts * 2) : TCDMPorts // RVD=1: 2 HCI32/TCDM, RVD=0: 1 HCI32/TCDM +)( + input logic clk_i, + input logic rst_ni, + input logic test_mode_i, + + // Core configuration + input logic [31:0] hart_id_i, + input logic [AddrWidth-1:0] tcdm_addr_base_i, // TCDM base address for this tile + + // Interrupts + input snitch_pkg::interrupts_t irq_i, + + // HCI Master Interface(s) - Connect to MAGIA HCI Interconnect for L1 SPM access + output magia_tile_pkg::core_hci_data_req_t [HCIMasterPorts-1:0] hci_master_req_o, + input magia_tile_pkg::core_hci_data_rsp_t [HCIMasterPorts-1:0] hci_master_rsp_i, + + // OBI Master Interface - Single port for Snitch core + output magia_tile_pkg::core_obi_data_req_t obi_master_req_o, + input magia_tile_pkg::core_obi_data_rsp_t obi_master_rsp_i, + + // Core -> ICache signals (from hive_req) + output logic inst_req_o, // Core wants instruction (inst_valid) + output logic [AddrWidth-1:0] inst_addr_o, // Instruction address + output logic inst_cacheable_o, // Cacheable flag + output logic flush_i_valid_o, // Flush request +
// ICache -> Core signals (to hive_rsp) + input logic [31:0] inst_data_i, // Instruction data + input logic inst_ready_i, // ICache has data ready + input logic inst_error_i, // Fetch error + input logic flush_i_ready_i, // Flush complete + + // Events and status + output snitch_pkg::core_events_t core_events_o +); + + /*******************************************************************/ + /* Internal Type Definitions */ + /*******************************************************************/ + + // Local type aliases + typedef logic [AddrWidth-1:0] addr_t; + typedef logic [DataWidth-1:0] data_t; + + // Accelerator types + typedef struct packed { + logic [31:0] addr; + logic [5:0] id; + logic [31:0] data_op; + data_t data_arga; + data_t data_argb; + addr_t data_argc; + } acc_issue_req_t; + + typedef struct packed { + logic accept; + logic writeback; + logic loadstore; + logic exception; + logic isfloat; + } acc_issue_rsp_t; + + typedef struct packed { + logic [5:0] id; + logic error; + data_t data; + } acc_rsp_t; + + // Virtual memory types - use macro from snitch_vm/typedef.svh + `SNITCH_VM_TYPEDEF(AddrWidth) + + // Hive types (instruction fetch) - matching spatz_cluster.sv + typedef struct packed { + logic flush_i_valid; + addr_t inst_addr; + logic inst_cacheable; + logic inst_valid; + acc_issue_req_t acc_req; + logic acc_qvalid; + logic acc_pready; + logic [1:0] ptw_valid; + va_t [1:0] ptw_va; + pa_t [1:0] ptw_ppn; + } hive_req_t; + + typedef struct packed { + logic flush_i_ready; + logic [31:0] inst_data; + logic inst_ready; + logic inst_error; + logic acc_qready; + acc_rsp_t acc_resp; + logic acc_pvalid; + logic [1:0] ptw_ready; + l0_pte_t [1:0] ptw_pte; + logic [1:0] ptw_is_4mega; + } hive_rsp_t; + + // DMA types - simplified for MAGIA (DMA disabled) + typedef struct packed { + logic start; + logic complete; + logic error; + } dma_events_t; + + /*******************************************************************/ + /* Internal Signal Declarations */ + 
/*******************************************************************/ + + // Spatz_cc internal signals + hive_req_t hive_req; + hive_rsp_t hive_rsp; + + // Conditional signal declarations based on RVD parameter + generate + if (RVD) begin : gen_signals + // RVD=1: 64-bit signal types + magia_tile_pkg::spatz_reqrsp64_req_t data_req; + magia_tile_pkg::spatz_reqrsp64_rsp_t data_rsp; + magia_tile_pkg::spatz_tcdm64_req_t [TCDMPorts-1:0] tcdm_req; + magia_tile_pkg::spatz_tcdm64_rsp_t [TCDMPorts-1:0] tcdm_rsp; + end else begin : gen_signals + // RVD=0: 32-bit signal types + magia_tile_pkg::spatz_reqrsp_req_t data_req; + magia_tile_pkg::spatz_reqrsp_rsp_t data_rsp; + magia_tile_pkg::spatz_tcdm_req_t [TCDMPorts-1:0] tcdm_req; + magia_tile_pkg::spatz_tcdm_rsp_t [TCDMPorts-1:0] tcdm_rsp; + end + endgenerate + + /*******************************************************************/ + /* Spatz Core Complex Instance */ + /*******************************************************************/ + + generate + if (RVD) begin : gen_spatz_cc_rvd + // RVD=1: 64-bit configuration + spatz_cc #( + .AddrWidth ( AddrWidth ), + .DataWidth ( DataWidth ), // 64-bit + .TCDMAddrWidth ( TCDMAddrWidth ), + .dreq_t ( magia_tile_pkg::spatz_reqrsp64_req_t ), + .drsp_t ( magia_tile_pkg::spatz_reqrsp64_rsp_t ), + .tcdm_req_t ( magia_tile_pkg::spatz_tcdm64_req_t ), + .tcdm_req_chan_t ( magia_tile_pkg::spatz_tcdm64_payload_t ), + .tcdm_rsp_t ( magia_tile_pkg::spatz_tcdm64_rsp_t ), + .tcdm_rsp_chan_t ( magia_tile_pkg::spatz_tcdm64_rsp_chan_t ), + .axi_req_t ( magia_tile_pkg::idma_axi_req_t ), + .axi_ar_chan_t ( magia_tile_pkg::idma_axi_ar_chan_t ), + .axi_aw_chan_t ( magia_tile_pkg::idma_axi_aw_chan_t ), + .axi_rsp_t ( magia_tile_pkg::idma_axi_rsp_t ), + .hive_req_t ( hive_req_t ), + .hive_rsp_t ( hive_rsp_t ), + .acc_issue_req_t ( acc_issue_req_t ), + .acc_issue_rsp_t ( acc_issue_rsp_t ), + .acc_rsp_t ( acc_rsp_t ), + .dma_events_t ( dma_events_t ), + .dma_perf_t ( logic ), + .SnitchPMACfg ( 
magia_tile_pkg::SPATZ_SNITCH_PMA_CFG ), + .FPUImplementation ( FPUImplementation ), + .BootAddr ( BootAddr ), + .RVE ( 1'b0 ), + .RVF ( RVF ), + .RVD ( 1'b1 ), + .XDivSqrt ( XDivSqrt ), + .XF8 ( 1'b1 ), + .XF16 ( 1'b1 ), + .XF16ALT ( 1'b1 ), + .XF8ALT ( 1'b1 ), + .Xdma ( Xdma ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .RVV ( RVV ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .IsoCrossing ( 1'b0 ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ) + ) i_spatz_cc ( + .clk_i ( clk_i ), + .clk_d2_i ( clk_i ), + .rst_ni ( rst_ni ), + .testmode_i ( test_mode_i ), + .hart_id_i ( hart_id_i ), + .irq_i ( irq_i ), + .tcdm_addr_base_i ( tcdm_addr_base_i ), + .hive_req_o ( hive_req ), + .hive_rsp_i ( hive_rsp ), + .data_req_o ( gen_signals.data_req ), + .data_rsp_i ( gen_signals.data_rsp ), + .tcdm_req_o ( gen_signals.tcdm_req ), + .tcdm_rsp_i ( gen_signals.tcdm_rsp ), + .axi_dma_req_o ( /* DMA disabled */ ), + .axi_dma_res_i ( '0 ), + .axi_dma_busy_o ( /* DMA disabled */ ), + .axi_dma_perf_o ( /* DMA disabled */ ), + .axi_dma_events_o ( /* DMA disabled */ ), + .core_events_o ( core_events_o ) + ); + end else begin : gen_spatz_cc_no_rvd + // RVD=0: 32-bit configuration + spatz_cc #( + .AddrWidth ( AddrWidth ), + .DataWidth ( DataWidth ), // 32-bit + .TCDMAddrWidth ( TCDMAddrWidth ), + .dreq_t ( magia_tile_pkg::spatz_reqrsp_req_t ), + .drsp_t ( magia_tile_pkg::spatz_reqrsp_rsp_t ), + .tcdm_req_t ( magia_tile_pkg::spatz_tcdm_req_t ), + .tcdm_req_chan_t ( magia_tile_pkg::spatz_tcdm_req_chan_t ), + .tcdm_rsp_t ( magia_tile_pkg::spatz_tcdm_rsp_t ), + .tcdm_rsp_chan_t ( magia_tile_pkg::spatz_tcdm_rsp_chan_t ), + .axi_req_t ( magia_tile_pkg::idma_axi_req_t ), + .axi_ar_chan_t ( magia_tile_pkg::idma_axi_ar_chan_t ), + .axi_aw_chan_t ( 
magia_tile_pkg::idma_axi_aw_chan_t ), + .axi_rsp_t ( magia_tile_pkg::idma_axi_rsp_t ), + .hive_req_t ( hive_req_t ), + .hive_rsp_t ( hive_rsp_t ), + .acc_issue_req_t ( acc_issue_req_t ), + .acc_issue_rsp_t ( acc_issue_rsp_t ), + .acc_rsp_t ( acc_rsp_t ), + .dma_events_t ( dma_events_t ), + .dma_perf_t ( logic ), + .SnitchPMACfg ( magia_tile_pkg::SPATZ_SNITCH_PMA_CFG ), + .FPUImplementation ( FPUImplementation ), + .BootAddr ( BootAddr ), + .RVE ( 1'b0 ), + .RVF ( RVF ), + .RVD ( 1'b0 ), + .XDivSqrt ( XDivSqrt ), + .XF8 ( 1'b1 ), + .XF16 ( 1'b1 ), + .XF16ALT ( 1'b1 ), + .XF8ALT ( 1'b1 ), + .Xdma ( Xdma ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .RVV ( RVV ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .IsoCrossing ( 1'b0 ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ) + ) i_spatz_cc ( + .clk_i ( clk_i ), + .clk_d2_i ( clk_i ), + .rst_ni ( rst_ni ), + .testmode_i ( test_mode_i ), + .hart_id_i ( hart_id_i ), + .irq_i ( irq_i ), + .tcdm_addr_base_i ( tcdm_addr_base_i ), + .hive_req_o ( hive_req ), + .hive_rsp_i ( hive_rsp ), + .data_req_o ( gen_signals.data_req ), + .data_rsp_i ( gen_signals.data_rsp ), + .tcdm_req_o ( gen_signals.tcdm_req ), + .tcdm_rsp_i ( gen_signals.tcdm_rsp ), + .axi_dma_req_o ( /* DMA disabled */ ), + .axi_dma_res_i ( '0 ), + .axi_dma_busy_o ( /* DMA disabled */ ), + .axi_dma_perf_o ( /* DMA disabled */ ), + .axi_dma_events_o ( /* DMA disabled */ ), + .core_events_o ( core_events_o ) + ); + end + endgenerate + + /*******************************************************************/ + /* Instruction Cache Interface (Hive) Connection */ + /*******************************************************************/ + + // Map hive request to external icache ports (Core → ICache) + assign inst_req_o = hive_req.inst_valid; + assign 
inst_addr_o = hive_req.inst_addr; + assign inst_cacheable_o = hive_req.inst_cacheable; + assign flush_i_valid_o = hive_req.flush_i_valid; + + // Map external icache response to hive response (ICache → Core) + assign hive_rsp = '{ + inst_data : inst_data_i, + inst_ready : inst_ready_i, + inst_error : inst_error_i, + flush_i_ready: flush_i_ready_i, + default : '0 + }; + + /*******************************************************************/ + /* Protocol Conversion: reqrsp (Snitch) → OBI32 */ + /*******************************************************************/ + + generate + if (RVD) begin : gen_reqrsp64_to_obi32 + // RVD=1: Snitch data port 64-bit reqrsp → 32-bit OBI (for L2/peripherals) + reqrsp64_to_obi32 #( + .ObiCfg ( magia_tile_pkg::obi_amo_cfg ), + .reqrsp_req_t ( magia_tile_pkg::spatz_reqrsp64_req_t ), + .reqrsp_rsp_t ( magia_tile_pkg::spatz_reqrsp64_rsp_t ), + .obi_req_t ( magia_tile_pkg::core_obi_data_req_t ), + .obi_rsp_t ( magia_tile_pkg::core_obi_data_rsp_t ) + ) i_snitch_reqrsp_to_obi32 ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .reqrsp_req_i ( gen_signals.data_req ), + .reqrsp_rsp_o ( gen_signals.data_rsp ), + .obi_req_o ( obi_master_req_o ), + .obi_rsp_i ( obi_master_rsp_i ) + ); + end else begin : gen_reqrsp32_to_obi32 + // RVD=0: Snitch data port 32-bit reqrsp → 32-bit OBI (for L2/peripherals) + reqrsp2obi #( + .ObiCfg ( magia_tile_pkg::obi_amo_cfg ), + .reqrsp_req_t ( magia_tile_pkg::spatz_reqrsp_req_t ), + .reqrsp_rsp_t ( magia_tile_pkg::spatz_reqrsp_rsp_t ), + .obi_req_t ( magia_tile_pkg::core_obi_data_req_t ), + .obi_rsp_t ( magia_tile_pkg::core_obi_data_rsp_t ) + ) i_snitch_reqrsp_to_obi32 ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .reqrsp_req_i ( gen_signals.data_req ), + .reqrsp_rsp_o ( gen_signals.data_rsp ), + .obi_req_o ( obi_master_req_o ), + .obi_rsp_i ( obi_master_rsp_i ) + ); + end + endgenerate + + /*******************************************************************/ + /* Protocol Conversion: TCDM (Spatz) → HCI32 */ + 
/*******************************************************************/ + + generate + if (RVD) begin : gen_tcdm64_to_hci + // RVD=1: Each 64-bit TCDM port splits into 2×32-bit HCI ports + + // Ports 0-3: Spatz vector memory ports (64-bit TCDM → 2×32-bit HCI) + for (genvar i = 0; i < NumMemPortsPerSpatz; i++) begin : gen_tcdm64_to_dual_hci32_spatz + tcdm64_to_dual_hci32 #( + .tcdm64_req_t ( magia_tile_pkg::spatz_tcdm64_req_t ), + .tcdm64_rsp_t ( magia_tile_pkg::spatz_tcdm64_rsp_t ), + .tcdm32_req_t ( magia_tile_pkg::spatz_tcdm32_req_t ), + .tcdm32_rsp_t ( magia_tile_pkg::spatz_tcdm32_rsp_t ), + .hci_req_t ( magia_tile_pkg::core_hci_data_req_t ), + .hci_rsp_t ( magia_tile_pkg::core_hci_data_rsp_t ) + ) i_tcdm64_to_dual_hci32_spatz ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .tcdm_req_i ( gen_signals.tcdm_req[i] ), + .tcdm_rsp_o ( gen_signals.tcdm_rsp[i] ), + .hci_req_lo_o ( hci_master_req_o[i*2] ), + .hci_rsp_lo_i ( hci_master_rsp_i[i*2] ), + .hci_req_hi_o ( hci_master_req_o[i*2+1] ), + .hci_rsp_hi_i ( hci_master_rsp_i[i*2+1] ) + ); + end + + // Port 4: Snitch RV64 TCDM (with AMO) → Dual HCI32 (ports 8-9) + tcdm64_to_dual_hci32_atomic #( + .tcdm64_req_t ( magia_tile_pkg::spatz_tcdm64_req_t ), + .tcdm64_rsp_t ( magia_tile_pkg::spatz_tcdm64_rsp_t ), + .tcdm32_req_t ( magia_tile_pkg::spatz_tcdm32_req_t ), + .tcdm32_rsp_t ( magia_tile_pkg::spatz_tcdm32_rsp_t ), + .obi32_req_t ( magia_tile_pkg::spatz_obi32_req_t ), + .obi32_rsp_t ( magia_tile_pkg::spatz_obi32_rsp_t ), + .obi32_a_chan_t ( magia_tile_pkg::spatz_obi32_a_chan_t ), + .obi32_r_chan_t ( magia_tile_pkg::spatz_obi32_r_chan_t ), + .hci_req_t ( magia_tile_pkg::core_hci_data_req_t ), + .hci_rsp_t ( magia_tile_pkg::core_hci_data_rsp_t ), + .obi32_a_optional_t ( magia_tile_pkg::spatz_obi32_a_optional_t ), + .obi32_r_optional_t ( magia_tile_pkg::spatz_obi32_r_optional_t ), + .SbrPortObiCfg ( magia_tile_pkg::obi_amo_cfg ), + .MgrPortObiCfg ( magia_tile_pkg::obi_no_amo_cfg ), + .BypassCut ( 1'b0 ) + ) 
i_tcdm64_to_dual_hci32_atomic_snitch ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .tcdm_req_i ( gen_signals.tcdm_req[NumMemPortsPerSpatz] ), + .tcdm_rsp_o ( gen_signals.tcdm_rsp[NumMemPortsPerSpatz] ), + .hci_req_lo_o ( hci_master_req_o[NumMemPortsPerSpatz*2] ), + .hci_rsp_lo_i ( hci_master_rsp_i[NumMemPortsPerSpatz*2] ), + .hci_req_hi_o ( hci_master_req_o[NumMemPortsPerSpatz*2+1] ), + .hci_rsp_hi_i ( hci_master_rsp_i[NumMemPortsPerSpatz*2+1] ) + ); + + end else begin : gen_tcdm32_to_hci + // RVD=0: Each 32-bit TCDM port maps directly to 1×32-bit HCI port + + // Ports 0-3: Spatz vector memory ports (32-bit TCDM → 32-bit HCI) + for (genvar i = 0; i < NumMemPortsPerSpatz; i++) begin : gen_tcdm32_to_hci32_spatz + tcdm2hci #( + .tcdm_req_t ( magia_tile_pkg::spatz_tcdm_req_t ), + .tcdm_rsp_t ( magia_tile_pkg::spatz_tcdm_rsp_t ), + .hci_req_t ( magia_tile_pkg::core_hci_data_req_t ), + .hci_rsp_t ( magia_tile_pkg::core_hci_data_rsp_t ) + ) i_tcdm32_to_hci32_spatz ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .tcdm_req_i ( gen_signals.tcdm_req[i] ), + .tcdm_rsp_o ( gen_signals.tcdm_rsp[i] ), + .hci_req_o ( hci_master_req_o[i] ), + .hci_rsp_i ( hci_master_rsp_i[i] ) + ); + end + + // Port 4: Snitch RV32 TCDM (with AMO) → HCI32 + tcdm2hci_atomic #( + .tcdm_req_t ( magia_tile_pkg::spatz_tcdm_req_t ), + .tcdm_rsp_t ( magia_tile_pkg::spatz_tcdm_rsp_t ), + .obi_req_t ( magia_tile_pkg::core_obi_data_req_t ), + .obi_rsp_t ( magia_tile_pkg::core_obi_data_rsp_t ), + .obi_a_chan_t ( magia_tile_pkg::core_data_obi_a_chan_t ), + .obi_r_chan_t ( magia_tile_pkg::core_data_obi_r_chan_t ), + .hci_req_t ( magia_tile_pkg::core_hci_data_req_t ), + .hci_rsp_t ( magia_tile_pkg::core_hci_data_rsp_t ), + .obi_a_optional_t ( magia_tile_pkg::core_data_obi_a_optional_t ), + .obi_r_optional_t ( magia_tile_pkg::core_data_obi_r_optional_t ), + .SbrPortObiCfg ( magia_tile_pkg::obi_amo_cfg ), + .MgrPortObiCfg ( magia_tile_pkg::obi_no_amo_cfg ), + .BypassCut ( 1'b0 ) + ) i_tcdm32_to_hci32_atomic_snitch ( 
+ .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .tcdm_req_i ( gen_signals.tcdm_req[NumMemPortsPerSpatz] ), + .tcdm_rsp_o ( gen_signals.tcdm_rsp[NumMemPortsPerSpatz] ), + .hci_req_o ( hci_master_req_o[NumMemPortsPerSpatz] ), + .hci_rsp_i ( hci_master_rsp_i[NumMemPortsPerSpatz] ) + ); + end + endgenerate + +endmodule : spatz_cc_wrapper diff --git a/scripts/bin2header.py b/scripts/bin2header.py new file mode 100644 index 0000000..4195577 --- /dev/null +++ b/scripts/bin2header.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +""" +Convert binary file to C header with embedded array. +Generates a header with the binary as uint32_t array in custom linker section. +""" + +import argparse + +def bytes_to_words(data): + """Convert bytes to uint32_t words (little-endian).""" + words = [] + for i in range(0, len(data), 4): + # Read up to 4 bytes, pad with zeros if needed + chunk = data[i:i+4] + word = sum(byte << (8 * j) for j, byte in enumerate(chunk)) + words.append(f"0x{word:08X}") + return words + +def generate_header(binary_path, output_path, array_name, section_name, address): + """Generate C header with binary array in custom section.""" + + # Read binary file + with open(binary_path, 'rb') as f: + data = f.read() + + # Convert to words + words = bytes_to_words(data) + guard = f"__{array_name.upper()}_H__" + + # Write header file + with open(output_path, 'w') as f: + # Header guard and includes + f.write(f"/* Auto-generated from {binary_path} ({len(data)} bytes) */\n") + f.write(f"#ifndef {guard}\n") + f.write(f"#define {guard}\n\n") + f.write(f"#include \n\n") + + # Binary array with section attribute + f.write(f"/* Binary array in {section_name} section (target: {address}) */\n") + f.write(f"const uint32_t {array_name}[] ") + f.write(f"__attribute__((section(\"{section_name}\"), aligned(4), used)) = {{\n") + + # Write words (8 per line for readability) + for i in range(0, len(words), 8): + line = words[i:i+8] + f.write(" " + ", ".join(line)) + f.write(",\n" if i + 8 < 
len(words) else "\n") + + f.write("};\n\n") + f.write(f"#endif /* {guard} */\n") + + print(f"Generated: {output_path} ({len(data)} bytes = {len(words)} words)") + +def main(): + parser = argparse.ArgumentParser(description='Convert binary to C header') + parser.add_argument('binary', help='Input binary file') + parser.add_argument('output', help='Output header file') + parser.add_argument('--name', default='spatz_program', help='Array name') + parser.add_argument('--section', default='.spatz_binary', help='Linker section name') + parser.add_argument('--address', default='dynamic (_spatz_binary_start)', + help='Target address (informational)') + + args = parser.parse_args() + generate_header(args.binary, args.output, args.name, args.section, args.address) + +if __name__ == '__main__': + main() diff --git a/scripts/generate_spatz_bootrom.py b/scripts/generate_spatz_bootrom.py new file mode 100644 index 0000000..4fd641d --- /dev/null +++ b/scripts/generate_spatz_bootrom.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 +# +# Generate SystemVerilog bootrom module from binary file + +import argparse +import sys + +def generate_bootrom_sv(bin_file, output_file): + """Generate SystemVerilog bootrom module from binary file.""" + + # Read binary file + with open(bin_file, 'rb') as f: + data = f.read() + + # Pad to 32-bit word boundary + if len(data) % 4 != 0: + padding = 4 - (len(data) % 4) + data = data + b'\x00' * padding + + num_bytes = len(data) + num_words = num_bytes // 4 + + # Convert bytes to 32-bit words (little-endian, in address order) + words = [] + for i in range(0, num_bytes, 4): + word = int.from_bytes(data[i:i+4], byteorder='little') + words.append(word) + + # Generate SystemVerilog module (like spatz_cluster) + sv_content = f'''// Copyright 2024 ETH Zurich and University of Bologna. 
+// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Auto-generated bootrom for Spatz core complex +// Generated from: {bin_file} +// Size: {num_bytes} bytes ({num_words} words) + +module spatz_bootrom #( + parameter int unsigned DataWidth = 32, + parameter int unsigned AddrWidth = 32 +) ( + input logic clk_i, + input logic req_i, + input logic [AddrWidth-1:0] addr_i, + output logic [DataWidth-1:0] rdata_o +); + localparam int RomSize = {num_words}; + localparam int AddrBits = RomSize > 1 ? $clog2(RomSize) : 1; + + const logic [RomSize-1:0][DataWidth-1:0] mem = {{ +''' + + # Add ROM content in REVERSE order (high index first) + # This is because packed arrays fill from high to low index + for i in range(num_words - 1, -1, -1): + sv_content += f" 32'h{words[i]:08x}" + if i > 0: + sv_content += "," + sv_content += "\n" + + sv_content += ''' }; + + // Sequential read with registered address + logic [AddrBits-1:0] addr_q; + + always_ff @(posedge clk_i) begin + if (req_i) begin + addr_q <= addr_i[AddrBits-1+2:2]; + end + end + + // Return data based on registered address + assign rdata_o = (addr_q < RomSize) ? 
mem[addr_q] : '0; +endmodule +''' + + # Write output file + with open(output_file, 'w') as f: + f.write(sv_content) + + print(f"Generated {output_file}: {num_words} words ({num_bytes} bytes)") + +def main(): + parser = argparse.ArgumentParser(description='Generate SystemVerilog bootrom from binary') + parser.add_argument('binary', help='Input binary file') + parser.add_argument('-o', '--output', required=True, help='Output SystemVerilog file') + + args = parser.parse_args() + + try: + generate_bootrom_sv(args.binary, args.output) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + +if __name__ == '__main__': + main() diff --git a/setup_env.sh b/setup_env.sh index ed0bd50..d588cee 100644 --- a/setup_env.sh +++ b/setup_env.sh @@ -15,8 +15,8 @@ export PATH=/usr/local/anaconda3-2023.07/condabin:$PATH export PATH=/home/visachi/.local/bin:$PATH export XLEN=32 if [[ "$core" == "CV32E40P" ]]; then - echo "Exporting ISA extentions: I, M, F, C, XPULP_V2" - export XTEN=imfcxpulpv2 + echo "Exporting ISA extentions: I, M, C, GAP9" + export XTEN=imcxgap9 else echo "Exporting ISA extentions: I, M, A, F, C" export XTEN=imafc diff --git a/spatz/README.md b/spatz/README.md new file mode 100644 index 0000000..d90ba1f --- /dev/null +++ b/spatz/README.md @@ -0,0 +1,360 @@ +# Spatz Vector Accelerator in MAGIA +[![License](https://img.shields.io/badge/license-Apache--2.0-green)](../LICENSE.APACHE) +[![SHL-0.51 license](https://img.shields.io/badge/license-SHL--0.51-green)](../LICENSE.SHL) + +The Spatz Core Complex (Spatz CC) is a **RISC-V Vector Extension 1.0 (zve32d/zved64d) core** integrated into the MAGIA tile architecture. It consists of **Snitch** (a compact 32-bit RISC-V scalar core) and **Spatz** (a vector coprocessor with multiple FPU lanes). The CV32 host core controls Spatz CC through memory-mapped registers and interrupt-driven task dispatching, following an accelerator-style programming model. 
+ +**Key Features:** +- RISC-V Vector Extension 1.0 (Zve32f/Zve64d) support (RVV) +- Single and double precision floating-point (RVF, RVD) +- Configurable vector length (VLEN: 128-512 bits) +- Up to 8 vector FPU lanes +- Clock-gated, interrupt-driven operation +- Transparent binary embedding in CV32 firmware + +## ⚙️ Hardware Integration + +### OBI Slave Control Interface + +Spatz CC is controlled through an **OBI slave interface** (`obi_slave_ctrl_spatz`) that exposes memory-mapped control registers. This interface allows the CV32 to: +- Enable/disable the Spatz clock +- Set task entry points +- Trigger task execution via interrupt +- Read task completion status and exit codes + +### Control Registers + +The OBI slave exposes the following registers (base address: `SPATZ_CTRL_BASE = 0x00001700`): + +| Register Name | Offset | Address | Description | +|------------------|--------|------------|----------------------------------------------------------------------------------------------------------| +| `SPATZ_CLK_EN` | 0x00 | 0x00001700 | Clock enable (write 1 to enable, 0 to disable Spatz CC clock) | +| `SPATZ_READY` | 0x04 | 0x00001704 | Ready flag: Spatz writes 1 when ready for interrupts, 0 after first task; CV32 polls before triggering | +| `SPATZ_START` | 0x08 | 0x00001708 | Trigger register: write 1 to send interrupt to Spatz CC, Snitch clears it to 0 | +| `SPATZ_TASKBIN` | 0x0C | 0x0000170C | Task binary address: CV32 writes task entry point address, Snitch reads it | +| `SPATZ_DATA` | 0x10 | 0x00001710 | Data/parameter pointer: CV32 writes pointer to task parameters, Snitch reads it | +| `SPATZ_RETURN` | 0x14 | 0x00001714 | Return value: Snitch writes task exit code (0=success, non-zero=error/exception), CV32 reads it | +| `SPATZ_DONE` | 0x18 | 0x00001718 | Done flag: Snitch writes 1 when task completes, generates 1-cycle pulse to Event Unit, auto-clears | + +**Register Behavior:** +- **CLK_EN, START, READY, TASKBIN, DATA, RETURN**: Preserved registers - 
values persist until overwritten +- **DONE**: Pulse register - auto-clears after one cycle, used to trigger Event Unit + +**Signal Flow:** +1. CV32 writes task address to `SPATZ_TASKBIN` +2. CV32 enables clock via `SPATZ_CLK_EN = 1` +3. CV32 waits for `SPATZ_READY` to become 1 (Spatz initialization complete, in WFI loop) +4. CV32 writes parameter pointer to `SPATZ_DATA` (optional, only if task needs parameters) +5. CV32 writes 1 to `SPATZ_START` → triggers external interrupt to Spatz CC (Snitch core) +6. Snitch core wakes from WFI, reads task address from `SPATZ_TASKBIN`, clears `SPATZ_START` to 0 +7. CV32 waits for `SPATZ_START` to become 0 (Snitch has saved task address) +8. Snitch executes task, which reads parameters from address in `SPATZ_DATA` +9. Task completes and writes exit code to `SPATZ_RETURN`, then writes 1 to `SPATZ_DONE` +10. `SPATZ_DONE` generates 1-cycle pulse to Event Unit, CV32 wakes from WFE/polling +11. CV32 reads exit code from `SPATZ_RETURN` (0 = success) + +## 🚀 Quick Start + +### Typical CV32 Test with Spatz Tasks +```c +#include "magia_spatz_utils.h" +#include "event_unit_utils.h" +#include "my_test_task_bin.h" // REQUIRED - Auto-generated header with SPATZ_BINARY_START and task symbols + +int main(void) { + // Initialize Event Unit and Spatz + eu_init(); + eu_enable_events(EU_SPATZ_DONE_MASK); + + //Task with parameters via SPATZ_DATA register (optional) + typedef struct { uint32_t addr; uint32_t size; } params_t; + params_t params = {.addr = DATA_BASE, .size = 1024}; + + spatz_init(SPATZ_BINARY_START); // Uses address from auto-generated header + + spatz_pass_params((uint32_t)&params); // Optional: pass params pointer via SPATZ_DATA + + //Simple task (no parameters via SPATZ_DATA) + spatz_run_task(MY_VECTOR_TASK); // MY_VECTOR_TASK defined in auto-generated header + eu_wait_spatz_wfe(EU_SPATZ_DONE_MASK); + + if (spatz_get_exit_code() != 0) { + printf("Task failed: 0x%03lx\n", spatz_get_exit_code()); + return 1; + } + + spatz_clk_dis(); // 
Power saving + return 0; +} +``` + +**Header Naming Convention:** +- Test file: `sw/tests/my_test.c` +- Generated header: `spatz/sw/headers_bin/my_test_task_bin.h` +- Include in your test: `#include "my_test_task_bin.h"` + +### Writing a Spatz Task + +Create `spatz/sw/tasks/my_vector_task.c`: + +```c +#include "magia_tile_utils.h" +#include "magia_spatz_utils.h" + +// Define parameter structure (optional, if task needs parameters) +typedef struct { + uint32_t data_addr; + uint32_t size; +} my_params_t; + +int my_vector_task(void) { + // Read parameter pointer from SPATZ_DATA register (if task uses parameters) + uint32_t params_ptr = mmio32(SPATZ_DATA); + volatile my_params_t *params = (volatile my_params_t *)params_ptr; + + uint32_t data_addr = params->data_addr; + uint32_t size = params->size; + + // Your vector code here using RVV intrinsics or assembly + asm volatile("vsetvli zero, %0, e32, m4, ta, ma" ::"r"(32)); + // ... vector operations ... + + return 0; // Exit code: 0 = success, non-zero = error +} +``` + +**Naming Convention (Required):** +- File: `*_task.c` (lowercase, ends with `_task`) +- Function: `int *_task(void)` (matches filename) +- Symbol: `*_TASK` (uppercase in generated header) + +**Reading Parameters:** +- Parameters are passed via pointer in `SPATZ_DATA` register +- Task reads `SPATZ_DATA` to get pointer to parameter structure in L1 +- CV32 must ensure parameters are written to L1 before triggering task + +--- + +## 🔧 Configuration Parameters +### Main Parameters (Makefile) + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `SPATZ_RVD` | 0 | 0: 32-bit TCDM (ELEN=32); 1: 64-bit TCDM (ELEN=64, double precision) | +| `SPATZ_VLEN` | 256 | Vector register length in bits (128, 256, 512) | +| `SPATZ_N_FPU` | 4 | Number of FPU lanes (1-8) | +| `SPATZ_N_IPU` | 1 | Number of integer processing units (1-8) | +| `SPATZ_RVF` | 1 | Enable single-precision floating-point | +| `SPATZ_RVV` | 1 | Enable RISC-V Vector extension 
| +| `SPATZ_XDIVSQRT` | 0 | Enable FP div/sqrt | + +### Advanced Parameters + +Defined in `hw/tile/magia_tile_pkg.sv`. See [Spatz documentation](https://github.com/pulp-platform/spatz) for details. + +**Memory & Transactions:** +- `SPATZ_NUM_INT_OUTSTANDING_LOADS`, `SPATZ_NUM_INT_OUTSTANDING_MEM` +- `SPATZ_NUM_SPATZ_OUTSTANDING_LOADS` + +**Pipeline:** +- `SPATZ_REGISTER_OFFLOAD_RSP`, `SPATZ_REGISTER_CORE_REQ/RSP` +- `SPATZ_FPU_PIPE_*` (FMA, DIVSQRT, CONV, DOTP stages) + +--- + +## 🕸️ Interconnects + +1. **HCI Ports (High-Performance L1/TCDM Access):** + - Spatz CC connects to the **HCI interconnect** with multiple ports (5-10 ports depending on `N_FPU` and `RVD`) + - Each port corresponds to a TCDM request channel: N_FU ports from Spatz vector unit + 1 port from Snitch scalar core + - When `RVD=1`: HCI ports are doubled (2x32-bit ports per TCDM port) to support 64-bit data transfers + - **Destination:** L1 SPM and TCDM banks (local tile memory) + +2. **OBI Master Port (General Memory Access):** + - Spatz CC (Snitch core) connects to the **OBI crossbar as a master** through a req/rsp port + - Used for accesses **outside L1/TCDM**: peripherals, L2 memory, cross-tile communication + - Handles scalar memory operations from Snitch and accesses to non-TCDM address ranges + - Routed through OBI interconnect to reach all tile resources + +3. 
**OBI Slave Interface (Control Registers):** + - Spatz CC exposes an **OBI slave interface** (`obi_slave_ctrl_spatz`) for CV32 control + - Base address: `0x00001700` + - Contains 7 control registers: `SPATZ_CLK_EN`, `SPATZ_READY`, `SPATZ_START`, `SPATZ_TASKBIN`, `SPATZ_DATA`, `SPATZ_RETURN`, `SPATZ_DONE` + - CV32 accesses these registers via OBI to command Spatz CC operation +--- + +## 🖥️ Programming API + +### C Functions (`sw/utils/magia_spatz_utils.h`) + +```c +// Clock control +void spatz_clk_en(void); // Enable Spatz clock +void spatz_clk_dis(void); // Disable Spatz clock + +// Initialization and task control +void spatz_init(uint32_t addr); // Initialize: writes addr to TASKBIN, enables clock, waits for READY +void spatz_run_task(uint32_t addr); // Write task address to TASKBIN and trigger START=1 +void spatz_pass_params(uint32_t params_ptr); // Write parameter pointer to DATA (waits for START=0) +void spatz_run_task_with_params(uint32_t addr, uint32_t params_ptr); // Combined: run task + pass params +uint32_t spatz_get_exit_code(void); // Read exit code from RETURN register +``` +--- + +## 🏗️ Boot and Initialization +The bootROM is **extremely minimal** (only 3 instructions) to minimize hardware impact and ROM area. It performs only: +```assembly +// Read dispatcher entry point from TASKBIN register (set by CV32) +li t0, TASKBIN_ADDR // 0x0000170C +lw t1, 0(t0) +jr t1 // Jump to spatz_crt0.S _start +``` + +The CV32 must write the Spatz CC binary's `_start` address to `SPATZ_TASKBIN` **before enabling the Spatz CC clock** for the first time. + +#### 2. Runtime Initialization (`spatz/sw/kernel/spatz_crt0.S`) + +The **spatz_crt0.S** runtime performs complete initialization and enters the task dispatcher loop. It is designed to be **completely transparent** to the programmer. + +**Initialization Steps:** +1. **Stack Setup:** `sp = 0x0001FFF8` (128KB - 8 bytes, stack at top of Snitch local memory) +2. 
**Register Clearing:** All integer registers `x1-x31` cleared (except `sp`) +3. **Vector Extension Enable:** Set `mstatus.VS = 0x200` (Initial state to enable Spatz vector coprocessor) +4. **BSS Clearing:** Zero-initialize `.bss` section +5. **Trap Handler Setup:** Install `_trap_handler` at `mtvec` +6. **Interrupt Enable:** Enable external interrupts (`MIE.MEIE`) and global interrupts (`mstatus.MIE`) +7. **Signal READY:** Write 1 to `SPATZ_READY` register to signal CV32 that initialization is complete + +**Dispatcher Loop:** +```assembly + wfi // First WFI - wait for first interrupt + + // Deassert READY after first wakeup (only once) + li t0, READY_ADDR + sw zero, 0(t0) + +dispatcher_loop: + wfi // Wait for subsequent interrupts from CV32 + j dispatcher_loop // Loop back after task completion +``` + +**READY Synchronization:** +- After completing initialization, Spatz writes `READY = 1` +- CV32's `spatz_init()` polls `READY` until it becomes 1 before proceeding +- After the first WFI wakeup, Spatz clears `READY = 0` (only once) +- This prevents race conditions where CV32 might trigger an interrupt before Spatz enters WFI + +**Trap Handler (`_trap_handler`):** +- **Interrupts** (mcause[31]=1): + - **Reads and saves task address** from `TASKBIN` register into a register + - Clears `START` register to 0 to deassert interrupt signal (signals CV32 it's safe to write to `DATA`) + - Jumps to saved task address via `jalr ra, 0(t1)` + - After task returns, writes exit code 0 (success) to `RETURN` register + - Sets `DONE = 1` (generates 1-cycle pulse to Event Unit) + +- **Exceptions** (mcause[31]=0): + - Writes exception code | 0x100 to `RETURN` register (bit 8 distinguishes exceptions from task errors) + - Sets `DONE = 1` (generates 1-cycle pulse to Event Unit) + - Halts in infinite WFI loop + +**Exit Code Convention:** +- `0x000`: Task completed successfully +- `0x1XX`: Exception occurred (XX = mcause[7:0]) + +### Workflow + +**Programmer writes task** → **Makefile 
auto-detects** → **Builds Spatz binary** → **Converts to header** → **Embeds in CV32 firmware** + +**Task Naming Convention:** +- File: `*_task.c` (e.g., `my_vector_task.c`) +- Function: `int *_task(void)` +- Symbol: `*_TASK` (uppercase in header) + +### Auto-Detection and Build + +Top-level Makefile automatically: +1. Scans test code for `*_TASK` symbols +2. Triggers Spatz binary build +3. Generates header with task addresses + +```makefile +# Automatic in MAGIA-SPATZ/Makefile +SPATZ_TASKS := $(shell grep -oP '\b[A-Z][A-Z0-9_]*_TASK\b' $(TEST_SRCS)) +$(OBJ): spatz-header # Header generated before CV32 compilation +``` + +### Generated Header Example + +The build system automatically generates a header file with the Spatz binary and task entry points: + +```c +// Auto-generated: test_name_task_bin.h +const uint8_t test_name_task_bin[] __attribute__((section(".spatz_binary"))) = { + 0x37, 0x01, 0x02, 0x00, ... // Spatz binary data +}; + +#define SPATZ_BINARY_START ((uint32_t)&_spatz_binary_start) +#define MY_VECTOR_TASK (SPATZ_BINARY_START + 0x00000120) // Task offset +#define ANOTHER_TASK (SPATZ_BINARY_START + 0x00000240) // Another task offset + +``` + +**You MUST include this header in your CV32 test:** +```c +#include "my_test_task_bin.h" // Required for SPATZ_BINARY_START and task symbols +``` + + +## ⚡ Complete Build and Run + +### Example Test with Spatz + +**File: `sw/tests/my_test.c`** +```c +#include "magia_tile_utils.h" +#include "magia_utils.h" +#include "magia_spatz_utils.h" +#include "event_unit_utils.h" +#include "my_test_task_bin.h" + +int main(void) { + eu_init(); + eu_enable_events(EU_SPATZ_DONE_MASK); + spatz_init(SPATZ_BINARY_START); + + spatz_run_task(MY_VECTOR_TASK); + eu_wait_spatz_wfe(EU_SPATZ_DONE_MASK); + + return spatz_get_exit_code(); +} +``` + +### Build Commands + +```bash +# 1. Setup environment +make python_venv +source setup_env.sh +make python_deps +make bender +make update-ips > update-ips.log +make floonoc-patch + +# 2. 
Build hardware (QuestaSim) +module load questasim/2024.3 +make build-hw mesh_dv=0 fast_sim=1 > build-hw.log + +# 3. Build software (auto-detects Spatz tasks) +module load pulp-gcc7/1.0.16 +make clean all test='my_test' mesh_dv=0 + +# 4. Run simulation +make run test='my_test' mesh_dv=0 gui=0 +``` + +**Note:** Build flow is **identical** whether your test uses Spatz or not — the system automatically detects and handles Spatz tasks! + +--- + +## 🔏 License +All `software` sources are licensed under the Apache License 2.0 ([`LICENSE.APACHE`](../LICENSE.APACHE)). All `hardware` sources are licensed under the Solderpad Hardware License 0.51 ([`LICENSE.SHL`](../LICENSE.SHL)). diff --git a/spatz/bootrom/Makefile b/spatz/bootrom/Makefile new file mode 100644 index 0000000..d2f5648 --- /dev/null +++ b/spatz/bootrom/Makefile @@ -0,0 +1,96 @@ +# Copyright (C) 2023-2024 ETH Zurich and University of Bologna +# +# Licensed under the Solderpad Hardware License, Version 0.51 +# (the "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Authors: Luca Balboni
# Spatz Initialization ROM Makefile for MAGIA Tile
#
# Assembles spatz_init.S into spatz_init.elf/.bin/.dump and, through the `sv`
# target, regenerates the hardware boot ROM module hw/tile/spatz_bootrom.sv.

# Using Spatz LLVM toolchain
LLVM_PATH := /sw/scratch/luca.balboni10/SPATZ/install/llvm

# Configuration from top-level Makefile.
#
# The explanatory comments are deliberately kept on their own lines: GNU make
# keeps the whitespace between a value and a trailing `#` comment as part of
# the value, so `SPATZ_RVF ?= 1   # ...` would define "1   " and every
# `ifeq ($(SPATZ_RVF),1)` test below would be false when this Makefile is
# invoked without the variables being passed in.

# 0: ELEN=32, 1: ELEN=64
SPATZ_RVD ?= 0
# Vector length in bits
SPATZ_VLEN ?= 256
# Number of Integer Processing Units (1-8)
SPATZ_N_IPU ?= 1
# Number of Floating Point Units (1-8)
SPATZ_N_FPU ?= 4
# 0: FP div/sqrt disabled, 1: enabled
SPATZ_XDIVSQRT ?= 0
# 0: DMA disabled, 1: enabled
SPATZ_XDMA ?= 0
# 0: single-precision FP disabled, 1: enabled
SPATZ_RVF ?= 1
# 0: vector extension disabled, 1: enabled
SPATZ_RVV ?= 1

ISA ?= riscv
ARCH ?= rv
XLEN ?= 32

# Build ISA string dynamically based on configuration flags
XTEN := ima
ifeq ($(SPATZ_RVD),1)
  XTEN := $(XTEN)fd
  XABI ?= 32d
else ifeq ($(SPATZ_RVF),1)
  XTEN := $(XTEN)f
  XABI ?= 32f
else
  XABI ?= 32
endif
ifeq ($(SPATZ_RVV),1)
  XTEN := $(XTEN)v
endif
XTEN := $(XTEN)_zfh
ABI ?= ilp

CC = $(LLVM_PATH)/bin/clang
OBJCOPY = $(LLVM_PATH)/bin/llvm-objcopy
OBJDUMP = $(LLVM_PATH)/bin/llvm-objdump

# Compiler flags (RVD=$(SPATZ_RVD), VLEN=$(SPATZ_VLEN), N_IPU=$(SPATZ_N_IPU), N_FPU=$(SPATZ_N_FPU), XDivSqrt=$(SPATZ_XDIVSQRT), Xdma=$(SPATZ_XDMA), RVF=$(SPATZ_RVF), RVV=$(SPATZ_RVV))
CFLAGS = --target=$(ISA)$(XLEN)-unknown-elf -march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XABI)
CFLAGS += -DSPATZ_RVD=$(SPATZ_RVD) -DSPATZ_VLEN=$(SPATZ_VLEN) -DSPATZ_N_IPU=$(SPATZ_N_IPU) -DSPATZ_N_FPU=$(SPATZ_N_FPU) -DSPATZ_XDIVSQRT=$(SPATZ_XDIVSQRT) -DSPATZ_XDMA=$(SPATZ_XDMA) -DSPATZ_RVF=$(SPATZ_RVF) -DSPATZ_RVV=$(SPATZ_RVV)
CFLAGS += -static -nostartfiles -nostdlib -O2 -g -Wall -Wextra

# Linker flags
LDFLAGS = -T spatz_init.ld -fuse-ld=lld

# Files
SRC = spatz_init.S
ELF = spatz_init.elf
BIN = spatz_init.bin
DUMP = spatz_init.dump

.PHONY: all clean bin sv

# Build binary and dump files
bin: $(BIN) $(DUMP)

# Generate hardware bootrom SystemVerilog module from binary
sv: $(BIN)
	@echo "Generating hardware bootrom: ../../hw/tile/spatz_bootrom.sv"
	python3 ../../scripts/generate_spatz_bootrom.py $(BIN) -o ../../hw/tile/spatz_bootrom.sv

# Complete build: clean, compile binary, generate hardware module
all: clean bin sv

# Link the boot stub; the linker script is a prerequisite so that a changed
# memory map forces a relink.
$(ELF): $(SRC) spatz_init.ld
	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(SRC)

# Raw binary image consumed by generate_spatz_bootrom.py
$(BIN): $(ELF)
	$(OBJCOPY) -O binary $< $@

# Full disassembly for inspection
$(DUMP): $(ELF)
	$(OBJDUMP) -D $< > $@

clean:
	rm -f $(ELF) $(BIN) $(DUMP)

.PRECIOUS: $(ELF) $(BIN)
/* Address of the TASKBIN register in the OBI slave control block.
 * The same address is used as TASKBIN_ADDR by spatz_crt0.S. */
#define TASKBIN_ADDR 0x0000170C // OBI slave ctrl exchange register

    .section .text.bootrom
    .global _start
    /* Disable compressed instructions for the boot stub. */
    .option norvc

/*
 * Boot ROM entry point.
 *
 * Loads the 32-bit word stored at TASKBIN_ADDR and jumps to it. No stack,
 * register or CSR setup happens here; the jump target is expected to do all
 * of it (see the comment on the jump below).
 * Clobbers: t0, t1.
 */
_start:
    // Read the jump target from TASKBIN_ADDR (set by CV32)
    li t0, TASKBIN_ADDR
    lw t1, 0(t0)

    // Jump to _start in spatz binary (does all init); jr does not link,
    // so control never returns to the boot ROM
    jr t1
/*
 * Spatz Boot ROM Linker Script for MAGIA Tile
 *
 * Maps bootrom to 0x10000000 (outside L2 range to avoid AXI address decode overlap).
 * The bootrom is synthesized as a hardware module (spatz_bootrom.sv) mapped via AXI slave.
 *
 * Boot sequence (as implemented by spatz_init.S):
 * 1. CV32 writes the entry point of the Spatz runtime to TASKBIN (0x170C)
 * 2. Spatz resets with PC = 0x10000000 (SPATZ_BOOT_ADDR in magia_tile.sv)
 * 3. Bootrom loads the word at TASKBIN and jumps to it
 *
 * The bootrom itself performs no register clearing, interrupt setup or WFI;
 * spatz_init.S states that all initialization is done in spatz_crt0.S.
 */

SEARCH_DIR(.)
OUTPUT_ARCH(riscv)
ENTRY(_start)

MEMORY
{
  bootrom : ORIGIN = 0x10000000, LENGTH = 0x100 /* 256B at 0x10000000 (outside L2 range to avoid AXI overlap) */
}

SECTIONS
{
  /* Boot code: .text.bootrom is listed first and kept even if unreferenced,
     so _start lands at the ROM origin. */
  .text.bootrom : {
    KEEP(*(.text.bootrom))
    *(.text)
    *(.text.*)
  } > bootrom

  /* Read-only data follows the code inside the same ROM region. */
  .rodata : {
    *(.rodata)
    *(.rodata.*)
  } > bootrom

  /* Discard all other sections */
  /DISCARD/ : {
    *(.data)
    *(.data.*)
    *(.bss)
    *(.bss.*)
    *(.comment)
    *(.note.*)
  }
}
# Authors: Luca Balboni
# Spatz Programs Makefile for MAGIA Tile
#
# Builds one Spatz binary (crt0 + the tasks named in `task`) and converts it
# into a C header that carries the binary image plus one address macro per
# task entry point.

LLVM_PATH := /sw/scratch/luca.balboni10/SPATZ/install/llvm

# Configuration from top-level Makefile.
#
# The explanatory comments are deliberately kept on their own lines: GNU make
# keeps the whitespace between a value and a trailing `#` comment as part of
# the value, so `SPATZ_RVV ?= 1   # ...` would define "1   " and the
# `ifeq ($(SPATZ_RVV),1)` test below would be false, silently dropping `v`
# from -march when this Makefile is invoked without the variables passed in.

# 0: ELEN=32, 1: ELEN=64
SPATZ_RVD ?= 0
# Vector length in bits
SPATZ_VLEN ?= 256
# Number of Integer Processing Units (1-8)
SPATZ_N_IPU ?= 1
# Number of Floating Point Units (1-8)
SPATZ_N_FPU ?= 4
# 0: FP div/sqrt disabled, 1: enabled
SPATZ_XDIVSQRT ?= 0
# 0: DMA disabled, 1: enabled
SPATZ_XDMA ?= 0
# 0: single-precision FP disabled, 1: enabled
SPATZ_RVF ?= 1
# 0: vector extension disabled, 1: enabled
SPATZ_RVV ?= 1

# Test name for generating specific header (passed from top Makefile)
TEST_NAME ?= base_spatz_test

ISA ?= riscv
ARCH ?= rv
XLEN ?= 32

# Build ISA string dynamically based on configuration flags
XTEN := ima
ifeq ($(SPATZ_RVD),1)
  XTEN := $(XTEN)fd
  XABI ?= 32d
else ifeq ($(SPATZ_RVF),1)
  XTEN := $(XTEN)f
  XABI ?= 32f
else
  XABI ?= 32
endif
ifeq ($(SPATZ_RVV),1)
  XTEN := $(XTEN)v
endif
XTEN := $(XTEN)_zfh
ABI ?= ilp

# Toolchain paths
CC = $(LLVM_PATH)/bin/clang
OBJCOPY = $(LLVM_PATH)/bin/llvm-objcopy
OBJDUMP = $(LLVM_PATH)/bin/llvm-objdump

# Architecture flags
ARCH_FLAGS = --target=riscv32 -march=$(ARCH)$(XLEN)$(XTEN) -mabi=$(ABI)$(XABI)
ARCH_FLAGS += -menable-experimental-extensions

# Compiler flags (RVD=$(SPATZ_RVD), VLEN=$(SPATZ_VLEN), N_IPU=$(SPATZ_N_IPU), N_FPU=$(SPATZ_N_FPU), XDivSqrt=$(SPATZ_XDIVSQRT), Xdma=$(SPATZ_XDMA), RVF=$(SPATZ_RVF), RVV=$(SPATZ_RVV))
CFLAGS = $(ARCH_FLAGS) -static -nostartfiles -nostdlib
CFLAGS += -O3 -g -Wall -Wextra
CFLAGS += -fno-common -ffunction-sections -fdata-sections -fPIC
CFLAGS += -fno-builtin-memset -fno-builtin-memcpy
CFLAGS += -mcmodel=medany
CFLAGS += -DSPATZ_RVD=$(SPATZ_RVD) -DSPATZ_VLEN=$(SPATZ_VLEN) -DSPATZ_N_IPU=$(SPATZ_N_IPU) -DSPATZ_N_FPU=$(SPATZ_N_FPU) -DSPATZ_XDIVSQRT=$(SPATZ_XDIVSQRT) -DSPATZ_XDMA=$(SPATZ_XDMA) -DSPATZ_RVF=$(SPATZ_RVF) -DSPATZ_RVV=$(SPATZ_RVV)
CFLAGS += -I../../sw/utils -I../../hw/include -Ispatz_utils

# Linker flags (KERNEL_DIR is defined further down; `=` expands it lazily)
LDFLAGS = -T $(KERNEL_DIR)/spatz_program.ld -fuse-ld=lld
LDFLAGS += -Wl,-z,norelro
LDFLAGS += -Wl,--allow-multiple-definition
LDFLAGS += $(ARCH_FLAGS)

# Single binary with selected tests (passed via TESTS variable)
# Binary and header names based on TEST_NAME
BINARY_NAME = $(TEST_NAME)_task_bin
HEADER_NAME = $(TEST_NAME)_task_bin

# Directories
BIN_DIR = bin
HEADER_DIR = headers_bin
TESTS_DIR = tasks
KERNEL_DIR = kernel

# Tasks to include (lowercase 'task' variable)
# Usage: make all task="general_task" or task="general_task hello_world_task"
# MUST be specified explicitly - no default
task ?=

# Source files for selected tasks
TASK_SRCS = $(addprefix $(TESTS_DIR)/,$(addsuffix .c,$(task)))

# Source and output files
CRT0 = $(KERNEL_DIR)/spatz_crt0.S
CRT0_OBJ = $(BIN_DIR)/spatz_crt0.o
ELF = $(BIN_DIR)/$(BINARY_NAME).elf
BIN = $(BIN_DIR)/$(BINARY_NAME).bin
DUMP = $(BIN_DIR)/$(BINARY_NAME).dump
HEADER = $(HEADER_DIR)/$(HEADER_NAME).h

# Python script for bin2header conversion
BIN2HEADER = ../../scripts/bin2header.py

# clean-test is listed too, so a stray file of that name cannot mask the target
.PHONY: all clean clean-test bin header dirs

# Complete build: compile binary and generate header
# Force header regeneration when TASKS changes (always delete and rebuild)
all: bin
	@rm -f $(HEADER)
	@$(MAKE) header

# Build binary (ELF + BIN + DUMP)
bin: dirs $(BIN) $(DUMP)

# Generate header from binary (always regenerated by 'all' target)
header: dirs $(HEADER)

# Create directories
dirs:
	@mkdir -p $(BIN_DIR) $(HEADER_DIR)

# Compile crt0.S
$(CRT0_OBJ): $(CRT0)
	@echo "[SPATZ] Compiling crt0..."
	$(CC) $(ARCH_FLAGS) -c -o $@ $(CRT0)

# Build ELF executable with crt0 + selected tasks
# Tasks are launched by CV32 via interrupt-driven task switching
$(ELF): $(CRT0_OBJ) $(TASK_SRCS) $(KERNEL_DIR)/spatz_program.ld
	@mkdir -p $(BIN_DIR)
	@if [ -z "$(TASK_SRCS)" ]; then \
		echo "[SPATZ] ERROR: No task sources found for task=$(task)"; \
		exit 1; \
	fi
	@echo "[SPATZ] Included tasks: $(task)"
	@echo "[SPATZ] Tasks will be launched by CV32 via spatz_set_func() and interrupt"
	@FIRST_TASK=$(word 1,$(task)); \
	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(CRT0_OBJ) $(TASK_SRCS) \
		-Wl,--defsym,__first_task=$${FIRST_TASK}

# Generate binary from ELF
$(BIN): $(ELF)
	@echo "[SPATZ] Generating binary..."
	$(OBJCOPY) -O binary $< $@

# Generate disassembly
$(DUMP): $(ELF)
	@echo "[SPATZ] Generating disassembly..."
	$(OBJDUMP) -D -S $< > $@

# Generate header with binary array + extract function symbols.
# Steps: bin2header writes the array, the closing include guard is removed,
# the address macros are appended, and the guard is written back at the end.
$(HEADER): $(BIN) $(ELF)
	@mkdir -p $(HEADER_DIR)
	@echo "[SPATZ] Generating header with binary array..."
	@python3 $(BIN2HEADER) $(BIN) $(HEADER) \
		--name $(HEADER_NAME) \
		--section .spatz_binary \
		--address "dynamic (_spatz_binary_start)"
	@echo "[SPATZ] Adding binary start + dispatcher + task symbols..."
	@GUARD_NAME=$$(echo "$(HEADER_NAME)" | tr 'a-z' 'A-Z'); \
	sed -i "/#endif \/\* __$${GUARD_NAME}_H__ \*\//d" $(HEADER)
	@echo "" >> $(HEADER)
	@echo "/* Binary start address - defined by CV32 linker */" >> $(HEADER)
	@echo "extern uint32_t _spatz_binary_start; /* Linker symbol from CV32 link.ld */" >> $(HEADER)
	@echo "#define SPATZ_BINARY_START ((uint32_t)&_spatz_binary_start)" >> $(HEADER)
	@echo "" >> $(HEADER)
	@echo "/* Dispatcher loop address - for spatz_run() */" >> $(HEADER)
	@DISP_ADDR=$$($(OBJDUMP) -t $(ELF) | grep "dispatcher_loop$$" | awk '{print $$1}'); \
	echo "#define SPATZ_DISPATCHER_LOOP (SPATZ_BINARY_START + 0x$${DISP_ADDR})" >> $(HEADER)
	@echo "" >> $(HEADER)
	@echo "/* Task function entry points - OFFSETS from SPATZ_BINARY_START */" >> $(HEADER)
	@$(OBJDUMP) -t $(ELF) | grep "_task$$" | grep -v "__first_task$$" | awk '{print $$1, $$NF}' | while read addr name; do \
		TASK_NAME=$$(echo $$name | tr 'a-z' 'A-Z'); \
		echo "#define $${TASK_NAME} (SPATZ_BINARY_START + 0x$$addr)" >> $(HEADER); \
	done
	@echo "" >> $(HEADER)
	@GUARD_NAME=$$(echo "$(HEADER_NAME)" | tr 'a-z' 'A-Z'); \
	echo "#endif /* __$${GUARD_NAME}_H__ */" >> $(HEADER)

clean:
	rm -rf $(BIN_DIR)/*.elf $(BIN_DIR)/*.bin $(BIN_DIR)/*.dump $(BIN_DIR)/*.o
	rm -rf $(HEADER_DIR)/*.h

# Clean only current test artifacts (preserves other headers)
clean-test:
	rm -f $(ELF) $(BIN) $(DUMP) $(CRT0_OBJ) $(HEADER)

.PRECIOUS: $(ELF) $(BIN)
/* Register addresses used by this runtime (all accessed with plain lw/sw). */
#define READY_ADDR 0x00001704
#define START_ADDR 0x00001708
#define TASKBIN_ADDR 0x0000170C
#define DATA_ADDR 0x00001710
#define RETURN_ADDR 0x00001714
#define DONE_ADDR 0x00001718
#define MSTATUS_VS 0x200 // Sets mstatus bit 9, i.e. VS field (mstatus[10:9] in RVV 1.0) = 01
#define MSTATUS_MIE 0x8 // Machine Interrupt Enable
#define MIE_MEIE 0x800 // External interrupt enable

.section .text.start, "ax"
.global _start

/*
 * Runtime entry point.
 *
 * Sets up sp, zeroes x1 and x3-x31, enables the vector unit, clears .bss,
 * installs _trap_handler, enables the machine external interrupt, raises
 * READY and then sleeps in WFI. It never returns.
 */
_start:
    /* Init stack pointer (hard-coded; no linker symbol is referenced) */
    li sp, 0x0001FFF8

    /* Clear ALL integer registers */
    mv x1, x0
    // x2 = sp just initialized
    mv x3, x0
    mv x4, x0
    mv x5, x0
    mv x6, x0
    mv x7, x0
    mv x8, x0
    mv x9, x0
    mv x10, x0
    mv x11, x0
    mv x12, x0
    mv x13, x0
    mv x14, x0
    mv x15, x0
    mv x16, x0
    mv x17, x0
    mv x18, x0
    mv x19, x0
    mv x20, x0
    mv x21, x0
    mv x22, x0
    mv x23, x0
    mv x24, x0
    mv x25, x0
    mv x26, x0
    mv x27, x0
    mv x28, x0
    mv x29, x0
    mv x30, x0
    mv x31, x0

    /* Enable vector extension (VS=Initial for Spatz) */
    li t0, MSTATUS_VS
    csrs mstatus, t0

    /* Clear BSS section: word-wise stores from __bss_start up to __bss_end */
    la a0, __bss_start
    la a1, __bss_end
1:  bgeu a0, a1, 2f
    sw zero, 0(a0)
    addi a0, a0, 4
    j 1b
2:
    /* Setup trap handler for interrupt */
    la t0, _trap_handler
    csrw mtvec, t0

    /* Enable external interrupts (MEIE) */
    li t0, MIE_MEIE
    csrs mie, t0

    /* Enable global interrupts (MIE bit in mstatus) */
    li t0, MSTATUS_MIE
    csrs mstatus, t0

    /* Signal ready to CV32 - now ready to receive interrupts */
    li t0, READY_ADDR
    li t1, 1
    sw t1, 0(t0)

    /* First WFI - after this we deassert READY */
    wfi

    /* Deassert READY after first wakeup (only once).
     * Interrupts are enabled, so a pending interrupt is first served by
     * _trap_handler; this store therefore executes only after that handler
     * has returned with mret. */
    li t0, READY_ADDR
    sw zero, 0(t0)

    /* Fall through to main dispatcher loop */

/* Idle loop: all task execution happens inside _trap_handler. */
dispatcher_loop:
    /* Wait for interrupt from CV32 */
    wfi

    /* After wakeup from interrupt, task has been executed */
    /* Loop back */
    j dispatcher_loop

/* Trap handler - distinguishes interrupts from exceptions.
 * No registers are saved or restored here; the only interrupted code visible
 * in this file is the WFI sequence above. */
_trap_handler:
    csrr t0, mcause
    bltz t0, _handle_interrupt // mcause[31]=1 → interrupt

    /* EXCEPTION: Write error code to RETURN_ADDR, signal DONE, halt */
_handle_exception:
    csrr t1, mcause    // Read full mcause
    andi t1, t1, 0xFF  // Extract exception code [7:0]
    ori t1, t1, 0x100  // Set bit 8 to distinguish from task errors
    li t0, RETURN_ADDR
    sw t1, 0(t0)

    li t0, DONE_ADDR
    li t1, 1
    sw t1, 0(t0)

    /* Halt: never executes mret, so the faulting code is not resumed */
1:  wfi
    j 1b

_handle_interrupt:
    /* Check if external interrupt (cause 11); anything else returns at once */
    andi t0, t0, 0x7FF
    li t1, 11
    bne t0, t1, _trap_ret

    /* Read task address from TASKBIN_ADDR and save it in t1 */
    li t0, TASKBIN_ADDR
    lw t1, 0(t0)
    /* NOTE(review): a zero TASKBIN returns without clearing START or writing
     * DONE; presumably the interrupt then stays asserted - confirm. */
    beqz t1, _trap_ret

    /* Clear START */
    li t0, START_ADDR
    sw zero, 0(t0)

    /* Jump to saved task address (ra is overwritten with the return point) */
    jalr ra, 0(t1)

    /* Task completed - write exit code 0 (success, no exceptions).
     * NOTE(review): whatever the task left in a0 is not forwarded; RETURN is
     * always written as 0 on this path. */
    li t0, RETURN_ADDR
    sw zero, 0(t0)

    /* Signal done */
    li t0, DONE_ADDR
    li t1, 1
    sw t1, 0(t0)

_trap_ret:
    mret
OUTPUT_ARCH(riscv)
ENTRY(_start)

/* Single region starting at address 0: every symbol in the linked image is
   an offset from the start of the binary. */
MEMORY
{
  prog (rwxa) : ORIGIN = 0x00000000, LENGTH = 0x10000 /* 64KB max - relative addressing */
}

/* Stack configuration - Spatz uses remaining 16KB after CV32 */
/* CV32 uses: 0x00010000-0x0001C000 (48KB, from CV32 link.ld) */
/* Spatz uses: 0x0001C000-0x00020000 (16KB) */
_stack_start = 0x0001FFF8; /* Top of Spatz stack (grows downward) */

SECTIONS
{
  /* Code: .text.start (the crt0 entry) is kept and placed first, so _start
     sits at offset 0 of the image. */
  .text : {
    . = ALIGN(4);
    KEEP(*(.text.start))
    *(.text)
    *(.text.*)
    . = ALIGN(4);
  } > prog

  .rodata : {
    . = ALIGN(4);
    *(.rodata)
    *(.rodata.*)
    . = ALIGN(4);
  } > prog

  .data : {
    . = ALIGN(4);
    *(.data)
    *(.data.*)
    *(.sdata)
    *(.sdata.*)
    . = ALIGN(4);
  } > prog

  /* __bss_start/__bss_end delimit the range that spatz_crt0.S zeroes word by
     word; both ends are 4-byte aligned for that loop. */
  .bss : {
    . = ALIGN(4);
    __bss_start = .;
    *(.bss)
    *(.bss.*)
    *(.sbss)
    *(.sbss.*)
    *(COMMON)
    . = ALIGN(4);
    __bss_end = .;
  } > prog

  /* Stack symbol, same value as _stack_start. */
  /* NOTE(review): spatz_crt0.S loads sp with the literal 0x0001FFF8 and does
     not reference this symbol, so the two values must be kept in sync by hand. */
  stack = _stack_start;

  /* Discard debug sections */
  /DISCARD/ : {
    *(.comment)
    *(.note.*)
  }
}
+ * SPDX-License-Identifier: Apache-2.0 + * + * Spatz Vector Utilities - Based on Spatz Benchmarks + * Provides optimized vector kernels for various operations + */ + + +#ifndef _SPATZ_UTILS_H_ +#define _SPATZ_UTILS_H_ + +#include + +// FP16 type alias +typedef uint16_t fp16_t; +typedef _Float16 float16_t; + +// ============================================================================= +// Dot Product Kernels (from spatzBenchmarks/dp-fdotp) +// ============================================================================= + +// 64-bit dot-product: a * b +static inline double fdotp_v64b(const double *a, const double *b, unsigned int avl) { + const unsigned int orig_avl = avl; + unsigned int vl; + double red; + + // Clean the accumulator + asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(avl)); + asm volatile("vmv.s.x v0, zero"); + + // Stripmine and accumulate a partial reduced vector + do { + // Set the vl + asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(avl)); + + // Load chunk a and b + asm volatile("vle64.v v8, (%0)" ::"r"(a)); + asm volatile("vle64.v v16, (%0)" ::"r"(b)); + + // Multiply and accumulate + if (avl == orig_avl) { + asm volatile("vfmul.vv v24, v8, v16"); + } else { + asm volatile("vfmacc.vv v24, v8, v16"); + } + + // Bump pointers + a += vl; + b += vl; + avl -= vl; + } while (avl > 0); + + // Reduce and return + asm volatile("vsetvli zero, %0, e64, m8, ta, ma" ::"r"(orig_avl)); + asm volatile("vfredusum.vs v0, v24, v0"); + asm volatile("vfmv.f.s %0, v0" : "=f"(red)); + + return red; +} + +// 32-bit dot-product: a * b +static inline float fdotp_v32b(const float *a, const float *b, unsigned int avl) { + const unsigned int orig_avl = avl; + unsigned int vl; + float red; + + asm volatile("vsetvli %0, %1, e32, m8, ta, ma" : "=r"(vl) : "r"(avl)); + asm volatile("vmv.s.x v0, zero"); + + // Stripmine and accumulate a partial reduced vector + do { + // Set the vl + asm volatile("vsetvli %0, %1, e32, m8, ta, ma" : "=r"(vl) : 
"r"(avl)); + + // Load chunk a and b + asm volatile("vle32.v v8, (%0)" ::"r"(a)); + asm volatile("vle32.v v16, (%0)" ::"r"(b)); + + // Multiply and accumulate + if (avl == orig_avl) { + asm volatile("vfmul.vv v24, v8, v16"); + } else { + asm volatile("vfmacc.vv v24, v8, v16"); + } + + // Bump pointers + a += vl; + b += vl; + avl -= vl; + } while (avl > 0); + + // Reduce and return + asm volatile("vsetvli zero, %0, e32, m8, ta, ma" ::"r"(orig_avl)); + asm volatile("vfredusum.vs v0, v24, v0"); + asm volatile("vfmv.f.s %0, v0" : "=f"(red)); + return red; +} + +// 16-bit dot-product: a * b +static inline float16_t fdotp_v16b(const float16_t *a, const float16_t *b, unsigned int avl) { + const unsigned int orig_avl = avl; + unsigned int vl; + float16_t red; + + asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl)); + asm volatile("vmv.s.x v0, zero"); + + // Stripmine and accumulate a partial reduced vector + do { + // Set the vl + asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl)); + + // Load chunk a and b + asm volatile("vle16.v v8, (%0)" ::"r"(a)); + asm volatile("vle16.v v16, (%0)" ::"r"(b)); + + // Multiply and accumulate + if (avl == orig_avl) { + asm volatile("vfmul.vv v24, v8, v16"); + } else { + asm volatile("vfmacc.vv v24, v8, v16"); + } + + // Bump pointers + a += vl; + b += vl; + avl -= vl; + } while (avl > 0); + + // Reduce and return + asm volatile("vsetvli zero, %0, e16, m8, ta, ma" ::"r"(orig_avl)); + asm volatile("vfredusum.vs v0, v24, v0"); + asm volatile("vfmv.f.s %0, v0" : "=f"(red)); + return red; +} + +// ============================================================================= +// AXPY Kernels (from spatzBenchmarks/dp-faxpy): y = a * x + y +// ============================================================================= + +// 64-bit AXPY: y = a * x + y +static inline void faxpy_v64b(const double a, const double *x, const double *y, unsigned int avl) { + unsigned int vl; + + // Stripmine and accumulate a 
partial vector + do { + // Set the vl + asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl) : "r"(avl)); + + // Load vectors + asm volatile("vle64.v v0, (%0)" ::"r"(x)); + asm volatile("vle64.v v8, (%0)" ::"r"(y)); + + // Multiply-accumulate + asm volatile("vfmacc.vf v8, %0, v0" ::"f"(a)); + + // Store results + asm volatile("vse64.v v8, (%0)" ::"r"(y)); + + // Bump pointers + x += vl; + y += vl; + avl -= vl; + } while (avl > 0); +} + +// 32-bit AXPY: y = a * x + y +static inline void faxpy_v32b(const float a, const float *x, const float *y, unsigned int avl) { + unsigned int vl; + + // Stripmine and accumulate a partial vector + do { + // Set the vl + asm volatile("vsetvli %0, %1, e32, m8, ta, ma" : "=r"(vl) : "r"(avl)); + + // Load vectors + asm volatile("vle32.v v0, (%0)" ::"r"(x)); + asm volatile("vle32.v v8, (%0)" ::"r"(y)); + + // Multiply-accumulate + asm volatile("vfmacc.vf v8, %0, v0" ::"f"(a)); + + // Store results + asm volatile("vse32.v v8, (%0)" ::"r"(y)); + + // Bump pointers + x += vl; + y += vl; + avl -= vl; + } while (avl > 0); +} + +// 16-bit AXPY: y = a * x + y +static inline void faxpy_v16b(const float16_t a, const float16_t *x, const float16_t *y, unsigned int avl) { + unsigned int vl; + + // Stripmine and accumulate a partial vector + do { + // Set the vl + asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl)); + + // Load vectors + asm volatile("vle16.v v0, (%0)" ::"r"(x)); + asm volatile("vle16.v v8, (%0)" ::"r"(y)); + + // Multiply-accumulate + asm volatile("vfmacc.vf v8, %0, v0" ::"f"(a)); + + // Store results + asm volatile("vse16.v v8, (%0)" ::"r"(y)); + + // Bump pointers + x += vl; + y += vl; + avl -= vl; + } while (avl > 0); +} + +// ============================================================================= +// Matrix Multiplication Kernels (from spatzBenchmarks/dp-fmatmul) +// ============================================================================= + +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? 
// Forward declarations
static inline void fmatmul_v64b_2xVL(double *c, const double *a, const double *b,
                                     const unsigned int m_start, const unsigned int m_end,
                                     const unsigned int N, const unsigned int P,
                                     const unsigned int p_start, const unsigned int p_end);
static inline void fmatmul_v64b_4xVL(double *c, const double *a, const double *b,
                                     const unsigned int m_start, const unsigned int m_end,
                                     const unsigned int N, const unsigned int P,
                                     const unsigned int p_start, const unsigned int p_end);
static inline void fmatmul_v64b_8xVL(double *c, const double *a, const double *b,
                                     const unsigned int m_start, const unsigned int m_end,
                                     const unsigned int N, const unsigned int P,
                                     const unsigned int p_start, const unsigned int p_end);

// Matrix multiplication wrapper - automatically selects optimal variant
// Computes C = A × B where C is MxP, A is MxN, B is NxP
//
// c:       output matrix, M rows of P doubles (row-major).
// a:       left operand, M rows of N doubles (row-major).
// b:       right operand, N rows of P doubles (row-major).
// M, N, P: matrix dimensions.
//
// Variant selection:
//   The 2xVL kernel walks the rows of C two at a time and the 4xVL kernel four
//   at a time, each touching a full group of rows per step. A kernel is
//   therefore only chosen when M is a multiple of its row-group size;
//   otherwise the next smaller one is used. For sizes that were already valid
//   the choice is unchanged: M <= 4 uses 2xVL, M == 8 uses 4xVL and larger
//   multiples of 8 use 8xVL.
//   NOTE(review): the 8xVL kernel is assumed to process 8 rows per step in
//   the same way as its siblings - confirm against its definition.
//
// Preconditions inherited from the kernels:
//   M must be even. An odd M still makes the 2xVL kernel touch one row past
//   the end of A and C.
//   The 2xVL and 4xVL inner loops consume two values of n per iteration, so
//   N is expected to be even and >= 2.
static inline void fmatmul_v64b(double *c, const double *a, const double *b,
                                const unsigned int M, const unsigned int N,
                                const unsigned int P) {
  if (M <= 4 || (M % 4) != 0) {
    // Small matrices, or a row count that only fits the 2-row kernel
    fmatmul_v64b_2xVL(c, a, b, 0, M, N, P, 0, P);
  } else if (M <= 8 || (M % 8) != 0) {
    // Multiple of 4 that is not a large multiple of 8
    fmatmul_v64b_4xVL(c, a, b, 0, M, N, P, 0, P);
  } else {
    // Multiple of 8 above 8
    fmatmul_v64b_8xVL(c, a, b, 0, M, N, P, 0, P);
  }
}
double *c__ = c_ + m * P; + + double t0, t1; + + t0 = *a__; + a__ += N; + t1 = *a__; + + unsigned int n = 0; + + while (n < N) { + a__ = a_ + ++n; + + asm volatile("vle64.v v24, (%0);" ::"r"(b__)); + b__ += P; + + if (n == 1) { + asm volatile("vfmul.vf v0, v16, %0" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmul.vf v8, v16, %0" ::"f"(t1)); + t1 = *a__; + } else { + asm volatile("vfmacc.vf v0, %0, v16" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmacc.vf v8, %0, v16" ::"f"(t1)); + t1 = *a__; + } + + a__ = a_ + ++n; + + if (n == N) + break; + + asm volatile("vle64.v v16, (%0);" ::"r"(b__)); + b__ += P; + + asm volatile("vfmacc.vf v0, %0, v24" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmacc.vf v8, %0, v24" ::"f"(t1)); + t1 = *a__; + } + + asm volatile("vfmacc.vf v0, %0, v24" ::"f"(t0)); + asm volatile("vse64.v v0, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v8, %0, v24" ::"f"(t1)); + asm volatile("vse64.v v8, (%0);" ::"r"(c__)); + } + + p += gvl; + } +} + +// 4xVL variant - optimized for medium matrices (M <= 8) +static inline void fmatmul_v64b_4xVL(double *c, const double *a, const double *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end) { + unsigned int p = p_start; + while (p < p_end) { + // Calculate the vl + size_t gvl; + asm volatile("vsetvli %[gvl], %[vl], e64, m4, ta, ma" + : [gvl] "=r"(gvl) + : [vl] "r"(p_end - p)); + + const double *b_ = b + p; + double *c_ = c + p; + + for (unsigned int m = m_start; m < m_end; m += 4) { + const double *a_ = a + m * N; + const double *a__ = a_; + + asm volatile("vle64.v v16, (%0);" ::"r"(b_)); + const double *b__ = b_ + P; + + double *c__ = c_ + m * P; + + double t0, t1, t2, t3; + + t0 = *a__; + a__ += N; + t1 = *a__; + a__ += N; + t2 = *a__; + a__ += N; + t3 = *a__; + + unsigned int n = 0; + + while (n < N) { + asm volatile("vle64.v v20, (%0);" ::"r"(b__)); + b__ += 
P; + + a__ = a_ + ++n; + + if (n == 1) { + asm volatile("vfmul.vf v0, v16, %0" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmul.vf v4, v16, %0" ::"f"(t1)); + t1 = *a__; + a__ += N; + asm volatile("vfmul.vf v8, v16, %0" ::"f"(t2)); + t2 = *a__; + a__ += N; + asm volatile("vfmul.vf v12, v16, %0" ::"f"(t3)); + t3 = *a__; + } else { + asm volatile("vfmacc.vf v0, %0, v16" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmacc.vf v4, %0, v16" ::"f"(t1)); + t1 = *a__; + a__ += N; + asm volatile("vfmacc.vf v8, %0, v16" ::"f"(t2)); + t2 = *a__; + a__ += N; + asm volatile("vfmacc.vf v12, %0, v16" ::"f"(t3)); + t3 = *a__; + } + + a__ = a_ + ++n; + + if (n == N) + break; + + asm volatile("vle64.v v16, (%0);" ::"r"(b__)); + b__ += P; + + asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t1)); + t1 = *a__; + a__ += N; + asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t2)); + t2 = *a__; + a__ += N; + asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t3)); + t3 = *a__; + } + + asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0)); + asm volatile("vse64.v v0, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t1)); + asm volatile("vse64.v v4, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t2)); + asm volatile("vse64.v v8, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t3)); + asm volatile("vse64.v v12, (%0);" ::"r"(c__)); + } + + p += gvl; + } +} + +// 8xVL variant - optimized for larger matrices (M > 8) +static inline void fmatmul_v64b_8xVL(double *c, const double *a, const double *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end) { + unsigned int p = p_start; + while (p < p_end) { + // Calculate the vl + size_t gvl; + asm volatile("vsetvli %[gvl], %[vl], e64, m2, ta, ma" + : [gvl] "=r"(gvl) + : [vl] "r"(p_end - p)); + + 
const double *b_ = b + p; + double *c_ = c + p; + + for (unsigned int m = m_start; m < m_end; m += 8) { + const double *a_ = a + m * N; + const double *a__ = a_; + + asm volatile("vle64.v v18, (%0);" ::"r"(b_)); + const double *b__ = b_ + P; + + double *c__ = c_ + m * P; + + double t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = *a__; + a__ += N; + t1 = *a__; + a__ += N; + t2 = *a__; + a__ += N; + t3 = *a__; + a__ += N; + t4 = *a__; + a__ += N; + t5 = *a__; + a__ += N; + t6 = *a__; + a__ += N; + t7 = *a__; + + unsigned int n = 0; + + while (n < N) { + a__ = a_ + ++n; + + asm volatile("vle64.v v20, (%0);" ::"r"(b__)); + b__ += P; + + if (n == 1) { + asm volatile("vfmul.vf v0, v18, %0" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmul.vf v2, v18, %0" ::"f"(t1)); + t1 = *a__; + a__ += N; + asm volatile("vfmul.vf v4, v18, %0" ::"f"(t2)); + t2 = *a__; + a__ += N; + asm volatile("vfmul.vf v6, v18, %0" ::"f"(t3)); + t3 = *a__; + a__ += N; + asm volatile("vfmul.vf v8, v18, %0" ::"f"(t4)); + t4 = *a__; + a__ += N; + asm volatile("vfmul.vf v10, v18, %0" ::"f"(t5)); + t5 = *a__; + a__ += N; + asm volatile("vfmul.vf v12, v18, %0" ::"f"(t6)); + t6 = *a__; + a__ += N; + asm volatile("vfmul.vf v14, v18, %0" ::"f"(t7)); + t7 = *a__; + } else { + asm volatile("vfmacc.vf v0, %0, v18" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmacc.vf v2, %0, v18" ::"f"(t1)); + t1 = *a__; + a__ += N; + asm volatile("vfmacc.vf v4, %0, v18" ::"f"(t2)); + t2 = *a__; + a__ += N; + asm volatile("vfmacc.vf v6, %0, v18" ::"f"(t3)); + t3 = *a__; + a__ += N; + asm volatile("vfmacc.vf v8, %0, v18" ::"f"(t4)); + t4 = *a__; + a__ += N; + asm volatile("vfmacc.vf v10, %0, v18" ::"f"(t5)); + t5 = *a__; + a__ += N; + asm volatile("vfmacc.vf v12, %0, v18" ::"f"(t6)); + t6 = *a__; + a__ += N; + asm volatile("vfmacc.vf v14, %0, v18" ::"f"(t7)); + t7 = *a__; + } + + a__ = a_ + ++n; + + if (n == N) + break; + + asm volatile("vle64.v v18, (%0);" ::"r"(b__)); + b__ += P; + + asm volatile("vfmacc.vf v0, %0, 
v20" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmacc.vf v2, %0, v20" ::"f"(t1)); + t1 = *a__; + a__ += N; + asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t2)); + t2 = *a__; + a__ += N; + asm volatile("vfmacc.vf v6, %0, v20" ::"f"(t3)); + t3 = *a__; + a__ += N; + asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t4)); + t4 = *a__; + a__ += N; + asm volatile("vfmacc.vf v10, %0, v20" ::"f"(t5)); + t5 = *a__; + a__ += N; + asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t6)); + t6 = *a__; + a__ += N; + asm volatile("vfmacc.vf v14, %0, v20" ::"f"(t7)); + t7 = *a__; + } + + asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0)); + asm volatile("vse64.v v0, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v2, %0, v20" ::"f"(t1)); + asm volatile("vse64.v v2, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t2)); + asm volatile("vse64.v v4, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v6, %0, v20" ::"f"(t3)); + asm volatile("vse64.v v6, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t4)); + asm volatile("vse64.v v8, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v10, %0, v20" ::"f"(t5)); + asm volatile("vse64.v v10, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t6)); + asm volatile("vse64.v v12, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v14, %0, v20" ::"f"(t7)); + asm volatile("vse64.v v14, (%0);" ::"r"(c__)); + } + + p += gvl; + } +} + +// ============================================================================= +// FP32 Matrix Multiplication Kernels (from spatzBenchmarks/sp-fmatmul) +// ============================================================================= + +// Forward declarations for FP32 matmul +static inline void fmatmul_v32b_2xVL(float *c, const float *a, const float *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); 
+static inline void fmatmul_v32b_4xVL(float *c, const float *a, const float *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); +static inline void fmatmul_v32b_8xVL(float *c, const float *a, const float *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); + +// FP32 matrix multiplication wrapper - automatically selects optimal variant +// Computes C = A × B where C is MxP, A is MxN, B is NxP +static inline void fmatmul_v32b(float *c, const float *a, const float *b, + const unsigned int M, const unsigned int N, + const unsigned int P) { + if (M <= 4) { + fmatmul_v32b_2xVL(c, a, b, 0, M, N, P, 0, P); + } else if (M <= 8) { + fmatmul_v32b_4xVL(c, a, b, 0, M, N, P, 0, P); + } else { + fmatmul_v32b_8xVL(c, a, b, 0, M, N, P, 0, P); + } +} + +// 2xVL variant - optimized for small matrices (M <= 4) +static inline void fmatmul_v32b_2xVL(float *c, const float *a, const float *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end) { + unsigned int p = p_start; + while (p < p_end) { + // Calculate the vl + size_t gvl; + asm volatile("vsetvli %[gvl], %[vl], e32, m8, ta, ma" + : [gvl] "=r"(gvl) + : [vl] "r"(p_end - p)); + + const float *b_ = b + p; + float *c_ = c + p; + + for (unsigned int m = m_start; m < m_end; m += 2) { + const float *a_ = a + m * N; + const float *a__ = a_; + + asm volatile("vle32.v v16, (%0);" ::"r"(b_)); + const float *b__ = b_ + P; + + float *c__ = c_ + m * P; + + float t0, t1; + + t0 = *a__; + a__ += N; + t1 = *a__; + + unsigned int n = 0; + + while (n < N) { + a__ = a_ + ++n; + + asm volatile("vle32.v v24, (%0);" ::"r"(b__)); + b__ += P; + + if (n == 1) { + asm volatile("vfmul.vf v0, v16, %0" ::"f"(t0)); + t0 = *a__; + a__ += N; + 
asm volatile("vfmul.vf v8, v16, %0" ::"f"(t1)); + t1 = *a__; + } else { + asm volatile("vfmacc.vf v0, %0, v16" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmacc.vf v8, %0, v16" ::"f"(t1)); + t1 = *a__; + } + + a__ = a_ + ++n; + + if (n == N) + break; + + asm volatile("vle32.v v16, (%0);" ::"r"(b__)); + b__ += P; + + asm volatile("vfmacc.vf v0, %0, v24" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmacc.vf v8, %0, v24" ::"f"(t1)); + t1 = *a__; + } + + asm volatile("vfmacc.vf v0, %0, v24" ::"f"(t0)); + asm volatile("vse32.v v0, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v8, %0, v24" ::"f"(t1)); + asm volatile("vse32.v v8, (%0);" ::"r"(c__)); + } + + p += gvl; + } +} + +// 4xVL variant - optimized for medium matrices (M <= 8) +static inline void fmatmul_v32b_4xVL(float *c, const float *a, const float *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end) { + unsigned int p = p_start; + while (p < p_end) { + // Calculate the vl + size_t gvl; + asm volatile("vsetvli %[gvl], %[vl], e32, m4, ta, ma" + : [gvl] "=r"(gvl) + : [vl] "r"(p_end - p)); + + const float *b_ = b + p; + float *c_ = c + p; + + for (unsigned int m = m_start; m < m_end; m += 4) { + const float *a_ = a + m * N; + const float *a__ = a_; + + asm volatile("vle32.v v16, (%0);" ::"r"(b_)); + const float *b__ = b_ + P; + + float *c__ = c_ + m * P; + + float t0, t1, t2, t3; + + t0 = *a__; + a__ += N; + t1 = *a__; + a__ += N; + t2 = *a__; + a__ += N; + t3 = *a__; + + unsigned int n = 0; + + while (n < N) { + asm volatile("vle32.v v20, (%0);" ::"r"(b__)); + b__ += P; + + a__ = a_ + ++n; + + if (n == 1) { + asm volatile("vfmul.vf v0, v16, %0" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmul.vf v4, v16, %0" ::"f"(t1)); + t1 = *a__; + a__ += N; + asm volatile("vfmul.vf v8, v16, %0" ::"f"(t2)); + t2 = *a__; + a__ += N; + asm volatile("vfmul.vf v12, v16, %0" ::"f"(t3)); 
+ t3 = *a__; + } else { + asm volatile("vfmacc.vf v0, %0, v16" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmacc.vf v4, %0, v16" ::"f"(t1)); + t1 = *a__; + a__ += N; + asm volatile("vfmacc.vf v8, %0, v16" ::"f"(t2)); + t2 = *a__; + a__ += N; + asm volatile("vfmacc.vf v12, %0, v16" ::"f"(t3)); + t3 = *a__; + } + + a__ = a_ + ++n; + + if (n == N) + break; + + asm volatile("vle32.v v16, (%0);" ::"r"(b__)); + b__ += P; + + asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t1)); + t1 = *a__; + a__ += N; + asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t2)); + t2 = *a__; + a__ += N; + asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t3)); + t3 = *a__; + } + + asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0)); + asm volatile("vse32.v v0, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t1)); + asm volatile("vse32.v v4, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t2)); + asm volatile("vse32.v v8, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t3)); + asm volatile("vse32.v v12, (%0);" ::"r"(c__)); + } + + p += gvl; + } +} + +// 8xVL variant - optimized for larger matrices (M > 8) +static inline void fmatmul_v32b_8xVL(float *c, const float *a, const float *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end) { + unsigned int p = p_start; + while (p < p_end) { + // Calculate the vl + size_t gvl; + asm volatile("vsetvli %[gvl], %[vl], e32, m2, ta, ma" + : [gvl] "=r"(gvl) + : [vl] "r"(p_end - p)); + + const float *b_ = b + p; + float *c_ = c + p; + + for (unsigned int m = m_start; m < m_end; m += 8) { + const float *a_ = a + m * N; + const float *a__ = a_; + + asm volatile("vle32.v v18, (%0);" ::"r"(b_)); + const float *b__ = b_ + P; + + float *c__ = c_ + m * P; + + float t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = 
*a__; + a__ += N; + t1 = *a__; + a__ += N; + t2 = *a__; + a__ += N; + t3 = *a__; + a__ += N; + t4 = *a__; + a__ += N; + t5 = *a__; + a__ += N; + t6 = *a__; + a__ += N; + t7 = *a__; + + unsigned int n = 0; + + while (n < N) { + a__ = a_ + ++n; + + asm volatile("vle32.v v20, (%0);" ::"r"(b__)); + b__ += P; + + if (n == 1) { + asm volatile("vfmul.vf v0, v18, %0" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmul.vf v2, v18, %0" ::"f"(t1)); + t1 = *a__; + a__ += N; + asm volatile("vfmul.vf v4, v18, %0" ::"f"(t2)); + t2 = *a__; + a__ += N; + asm volatile("vfmul.vf v6, v18, %0" ::"f"(t3)); + t3 = *a__; + a__ += N; + asm volatile("vfmul.vf v8, v18, %0" ::"f"(t4)); + t4 = *a__; + a__ += N; + asm volatile("vfmul.vf v10, v18, %0" ::"f"(t5)); + t5 = *a__; + a__ += N; + asm volatile("vfmul.vf v12, v18, %0" ::"f"(t6)); + t6 = *a__; + a__ += N; + asm volatile("vfmul.vf v14, v18, %0" ::"f"(t7)); + t7 = *a__; + } else { + asm volatile("vfmacc.vf v0, %0, v18" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmacc.vf v2, %0, v18" ::"f"(t1)); + t1 = *a__; + a__ += N; + asm volatile("vfmacc.vf v4, %0, v18" ::"f"(t2)); + t2 = *a__; + a__ += N; + asm volatile("vfmacc.vf v6, %0, v18" ::"f"(t3)); + t3 = *a__; + a__ += N; + asm volatile("vfmacc.vf v8, %0, v18" ::"f"(t4)); + t4 = *a__; + a__ += N; + asm volatile("vfmacc.vf v10, %0, v18" ::"f"(t5)); + t5 = *a__; + a__ += N; + asm volatile("vfmacc.vf v12, %0, v18" ::"f"(t6)); + t6 = *a__; + a__ += N; + asm volatile("vfmacc.vf v14, %0, v18" ::"f"(t7)); + t7 = *a__; + } + + a__ = a_ + ++n; + + if (n == N) + break; + + asm volatile("vle32.v v18, (%0);" ::"r"(b__)); + b__ += P; + + asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0)); + t0 = *a__; + a__ += N; + asm volatile("vfmacc.vf v2, %0, v20" ::"f"(t1)); + t1 = *a__; + a__ += N; + asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t2)); + t2 = *a__; + a__ += N; + asm volatile("vfmacc.vf v6, %0, v20" ::"f"(t3)); + t3 = *a__; + a__ += N; + asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t4)); + t4 
= *a__; + a__ += N; + asm volatile("vfmacc.vf v10, %0, v20" ::"f"(t5)); + t5 = *a__; + a__ += N; + asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t6)); + t6 = *a__; + a__ += N; + asm volatile("vfmacc.vf v14, %0, v20" ::"f"(t7)); + t7 = *a__; + } + + asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0)); + asm volatile("vse32.v v0, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v2, %0, v20" ::"f"(t1)); + asm volatile("vse32.v v2, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t2)); + asm volatile("vse32.v v4, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v6, %0, v20" ::"f"(t3)); + asm volatile("vse32.v v6, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t4)); + asm volatile("vse32.v v8, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v10, %0, v20" ::"f"(t5)); + asm volatile("vse32.v v10, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t6)); + asm volatile("vse32.v v12, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfmacc.vf v14, %0, v20" ::"f"(t7)); + asm volatile("vse32.v v14, (%0);" ::"r"(c__)); + } + + p += gvl; + } +} + +// ============================================================================= +// FP16 Matrix Multiplication Kernels (from spatzBenchmarks/hp-fmatmul) +// ============================================================================= + +// Forward declarations for FP16 matmul SDOTP variants +static inline void matmul_2xVL(float16_t *c, const float16_t *a, const float16_t *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); +static inline void matmul_4xVL(float16_t *c, const float16_t *a, const float16_t *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); +static inline void matmul_8xVL(float16_t *c, const 
float16_t *a, const float16_t *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); + +static inline void fmatmul_v16b_sdotp(float16_t *c, const float16_t *a, const float16_t *b, + const unsigned int M, const unsigned int N, + const unsigned int P) { + if (M <= 4) { + matmul_2xVL(c, a, b, 0, M, N, P, 0, P); + } else if (M <= 8) { + matmul_4xVL(c, a, b, 0, M, N, P, 0, P); + } else { + matmul_8xVL(c, a, b, 0, M, N, P, 0, P); + } +} + +// 2xVL variant - optimized for small matrices (M <= 4) +static inline void matmul_2xVL(float16_t *c, const float16_t *a, const float16_t *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end) { + unsigned int p = p_start; + while (p < p_end) { + // Calculate the vl - processes 2 columns at a time + size_t gvl; + asm volatile("vsetvli %[gvl], %[vl], e16, m8, ta, ma" + : [gvl] "=r"(gvl) + : [vl] "r"(2 * (p_end - p))); + + const float16_t *b_ = b + 2 * p; + float16_t *c_ = c + p; + + // Account for the used operands + p += gvl / 2; + + for (unsigned int m = m_start; m < m_end; m += 2) { + asm volatile("vsetvli zero, %0, e16, m8, ta, ma" ::"r"(gvl)); + + const float16_t *a_ = a + m * N; + const float16_t *a__ = a_; + + asm volatile("vle16.v v16, (%0);" ::"r"(b_)); + const float16_t *b__ = b_ + 2 * P; + + float16_t *c__ = c_ + m * P; + + double t0, t1; + + asm volatile("vmv.v.x v0, zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__)); + a__ += N; + asm volatile("vmv.v.x v8, zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__)); + + unsigned int n = 0; + + while (n < N) { + n += 2; + a__ = a_ + n; + + asm volatile("vle16.v v24, (%0);" ::"r"(b__)); + b__ += 2 * P; + + asm volatile("vfwdotp.vf v0, %0, v16" ::"f"(t0)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__)); + a__ 
+= N; + asm volatile("vfwdotp.vf v8, %0, v16" ::"f"(t1)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__)); + + n += 2; + a__ = a_ + n; + + if (n == N) + break; + + asm volatile("vle16.v v16, (%0);" ::"r"(b__)); + b__ += 2 * P; + + asm volatile("vfwdotp.vf v0, %0, v24" ::"f"(t0)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v8, %0, v24" ::"f"(t1)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__)); + } + + asm volatile("vfwdotp.vf v0, %0, v24" ::"f"(t0)); + asm volatile("vfwdotp.vf v8, %0, v24" ::"f"(t1)); + + asm volatile("vsetvli zero, %0, e16, m8, ta, ma" ::"r"(gvl / 2)); + + asm volatile("vfncvt.f.f.w v0, v0"); + asm volatile("vse16.v v0, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfncvt.f.f.w v8, v8"); + asm volatile("vse16.v v8, (%0);" ::"r"(c__)); + } + } +} + +//4xVL variant - optimized for medium matrices (M <= 8) +static inline void matmul_4xVL(float16_t *c, const float16_t *a, const float16_t *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end) { + unsigned int p = p_start; + while (p < p_end) { + // Calculate the vl - processes 2 columns at a time + size_t gvl; + asm volatile("vsetvli %[gvl], %[vl], e16, m4, ta, ma" + : [gvl] "=r"(gvl) + : [vl] "r"(2 * (p_end - p))); + + const float16_t *b_ = b + 2 * p; + float16_t *c_ = c + p; + + // Account for the used operands + p += gvl / 2; + + for (unsigned int m = m_start; m < m_end; m += 4) { + asm volatile("vsetvli zero, %0, e16, m4, ta, ma" ::"r"(gvl)); + + const float16_t *a_ = a + m * N; + const float16_t *a__ = a_; + + asm volatile("vle16.v v16, (%0);" ::"r"(b_)); + const float16_t *b__ = b_ + 2 * P; + + float16_t *c__ = c_ + m * P; + + double t0, t1, t2, t3; + + asm volatile("vmv.v.x v0, zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__)); + a__ += N; + asm volatile("vmv.v.x v4, 
zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__)); + a__ += N; + asm volatile("vmv.v.x v8, zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__)); + a__ += N; + asm volatile("vmv.v.x v12, zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__)); + + unsigned int n = 0; + + while (n < N) { + n += 2; + a__ = a_ + n; + + asm volatile("vle16.v v20, (%0);" ::"r"(b__)); + b__ += 2 * P; + + asm volatile("vfwdotp.vf v0, %0, v16" ::"f"(t0)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v4, %0, v16" ::"f"(t1)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v8, %0, v16" ::"f"(t2)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v12, %0, v16" ::"f"(t3)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__)); + + n += 2; + a__ = a_ + n; + + if (n == N) + break; + + asm volatile("vle16.v v16, (%0);" ::"r"(b__)); + b__ += 2 * P; + + asm volatile("vfwdotp.vf v0, %0, v20" ::"f"(t0)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v4, %0, v20" ::"f"(t1)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v8, %0, v20" ::"f"(t2)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v12, %0, v20" ::"f"(t3)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__)); + } + + asm volatile("vfwdotp.vf v0, %0, v20" ::"f"(t0)); + asm volatile("vfwdotp.vf v4, %0, v20" ::"f"(t1)); + asm volatile("vfwdotp.vf v8, %0, v20" ::"f"(t2)); + asm volatile("vfwdotp.vf v12, %0, v20" ::"f"(t3)); + + asm volatile("vsetvli zero, %0, e16, m4, ta, ma" ::"r"(gvl / 2)); + + asm volatile("vfncvt.f.f.w v0, v0"); + asm volatile("vse16.v v0, (%0);" ::"r"(c__)); + c__ += P; + asm 
volatile("vfncvt.f.f.w v4, v4"); + asm volatile("vse16.v v4, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfncvt.f.f.w v8, v8"); + asm volatile("vse16.v v8, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfncvt.f.f.w v12, v12"); + asm volatile("vse16.v v12, (%0);" ::"r"(c__)); + } + } +} + +//8xVL variant - optimized for larger matrices (M > 8) +static inline void matmul_8xVL(float16_t *c, const float16_t *a, const float16_t *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end) { + unsigned int p = p_start; + while (p < p_end) { + // Calculate the vl - processes 2 columns at a time + size_t gvl; + asm volatile("vsetvli %[gvl], %[vl], e16, m2, ta, ma" + : [gvl] "=r"(gvl) + : [vl] "r"(2 * (p_end - p))); + + const float16_t *b_ = b + 2 * p; + float16_t *c_ = c + p; + + // Account for the used operands + p += gvl / 2; + + for (unsigned int m = m_start; m < m_end; m += 8) { + asm volatile("vsetvli zero, %0, e16, m2, ta, ma" ::"r"(gvl)); + + const float16_t *a_ = a + m * N; + const float16_t *a__ = a_; + + asm volatile("vle16.v v18, (%0);" ::"r"(b_)); + const float16_t *b__ = b_ + 2 * P; + + float16_t *c__ = c_ + m * P; + + double t0, t1, t2, t3, t4, t5, t6, t7; + + asm volatile("vmv.v.x v0, zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__)); + a__ += N; + asm volatile("vmv.v.x v2, zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__)); + a__ += N; + asm volatile("vmv.v.x v4, zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__)); + a__ += N; + asm volatile("vmv.v.x v6, zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__)); + a__ += N; + asm volatile("vmv.v.x v8, zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__)); + a__ += N; + asm volatile("vmv.v.x v10, zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__)); + a__ += N; + asm 
volatile("vmv.v.x v12, zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__)); + a__ += N; + asm volatile("vmv.v.x v14, zero"); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__)); + + unsigned int n = 0; + + while (n < N) { + n += 2; + a__ = a_ + n; + + asm volatile("vle16.v v20, (%0);" ::"r"(b__)); + b__ += 2 * P; + + asm volatile("vfwdotp.vf v0, %0, v18" ::"f"(t0)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v2, %0, v18" ::"f"(t1)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v4, %0, v18" ::"f"(t2)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v6, %0, v18" ::"f"(t3)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v8, %0, v18" ::"f"(t4)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v10, %0, v18" ::"f"(t5)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v12, %0, v18" ::"f"(t6)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v14, %0, v18" ::"f"(t7)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__)); + + n += 2; + a__ = a_ + n; + + if (n == N) + break; + + asm volatile("vle16.v v18, (%0);" ::"r"(b__)); + b__ += 2 * P; + + asm volatile("vfwdotp.vf v0, %0, v20" ::"f"(t0)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v2, %0, v20" ::"f"(t1)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v4, %0, v20" ::"f"(t2)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v6, %0, v20" ::"f"(t3)); + asm volatile("flw %[t], 0(%[a])" : [t] 
"=f"(t3) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v8, %0, v20" ::"f"(t4)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v10, %0, v20" ::"f"(t5)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v12, %0, v20" ::"f"(t6)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__)); + a__ += N; + asm volatile("vfwdotp.vf v14, %0, v20" ::"f"(t7)); + asm volatile("flw %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__)); + } + + asm volatile("vfwdotp.vf v0, %0, v20" ::"f"(t0)); + asm volatile("vfwdotp.vf v2, %0, v20" ::"f"(t1)); + asm volatile("vfwdotp.vf v4, %0, v20" ::"f"(t2)); + asm volatile("vfwdotp.vf v6, %0, v20" ::"f"(t3)); + asm volatile("vfwdotp.vf v8, %0, v20" ::"f"(t4)); + asm volatile("vfwdotp.vf v10, %0, v20" ::"f"(t5)); + asm volatile("vfwdotp.vf v12, %0, v20" ::"f"(t6)); + asm volatile("vfwdotp.vf v14, %0, v20" ::"f"(t7)); + + asm volatile("vsetvli zero, %0, e16, m2, ta, ma" ::"r"(gvl / 2)); + + asm volatile("vfncvt.f.f.w v0, v0"); + asm volatile("vse16.v v0, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfncvt.f.f.w v2, v2"); + asm volatile("vse16.v v2, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfncvt.f.f.w v4, v4"); + asm volatile("vse16.v v4, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfncvt.f.f.w v6, v6"); + asm volatile("vse16.v v6, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfncvt.f.f.w v8, v8"); + asm volatile("vse16.v v8, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfncvt.f.f.w v10, v10"); + asm volatile("vse16.v v10, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfncvt.f.f.w v12, v12"); + asm volatile("vse16.v v12, (%0);" ::"r"(c__)); + c__ += P; + asm volatile("vfncvt.f.f.w v14, v14"); + asm volatile("vse16.v v14, (%0);" ::"r"(c__)); + } + } +} + +// From spatzBenchmarks/hp-fmatmul +// Forward declarations for FP16 matmul standard (direct FP16) +static inline void fmatmul_v16b_2xVL(float16_t 
*c, const float16_t *a, const float16_t *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); +static inline void fmatmul_v16b_4xVL(float16_t *c, const float16_t *a, const float16_t *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); +static inline void fmatmul_v16b_8xVL(float16_t *c, const float16_t *a, const float16_t *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end); + +static inline void fmatmul_v16b(float16_t *c, const float16_t *a, const float16_t *b, + const unsigned int M, const unsigned int N, + const unsigned int P) { + if (M <= 4) { + fmatmul_v16b_2xVL(c, a, b, 0, M, N, P, 0, P); + } else if (M <= 8) { + fmatmul_v16b_4xVL(c, a, b, 0, M, N, P, 0, P); + } else { + fmatmul_v16b_8xVL(c, a, b, 0, M, N, P, 0, P); + } +} + +// 2xVL variant - optimized for small matrices (M <= 4) +static inline void fmatmul_v16b_2xVL(float16_t *c, const float16_t *a, const float16_t *b, + const unsigned int m_start, const unsigned int m_end, + const unsigned int N, const unsigned int P, + const unsigned int p_start, const unsigned int p_end) { + + unsigned int p = p_start; + while (p < p_end) { + // Calculate the vl + size_t gvl; + asm volatile("vsetvli %[gvl], %[vl], e32, m8, ta, ma" + : [gvl] "=r"(gvl) + : [vl] "r"(p_end - p)); + + const float16_t *b_ = b + p; + float16_t *c_ = c + p; + + for (unsigned int m = m_start; m < m_end; m += 2) { + const float16_t *a_ = a + m * N; + const float16_t *a__ = a_; + + asm volatile("vle32.v v16, (%0);" ::"r"(b_)); + const float16_t *b__ = b_ + P; + + float16_t *c__ = c_ + m * P; + + float t0, t1; + + asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__)); + a__ += N; + asm volatile("flh %[t], 0(%[a])" : 
[t] "=f"(t1) : [a] "r"(a__));
+
+      unsigned int n = 0;
+
+      while (n < N) {
+        a__ = a_ + ++n;
+
+        asm volatile("vle32.v v24, (%0);" ::"r"(b__));
+        b__ += P;
+
+        if (n == 1) {
+          asm volatile("vfmul.vf v0, v16, %0" ::"f"(t0));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v8, v16, %0" ::"f"(t1));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+        } else {
+          asm volatile("vfmacc.vf v0, %0, v16" ::"f"(t0));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v8, %0, v16" ::"f"(t1));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+        }
+
+        a__ = a_ + ++n;
+
+        if (n == N)
+          break;
+
+        asm volatile("vle32.v v16, (%0);" ::"r"(b__));
+        b__ += P;
+
+        asm volatile("vfmacc.vf v0, %0, v24" ::"f"(t0));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v8, %0, v24" ::"f"(t1));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+      }
+
+      asm volatile("vfmacc.vf v0, %0, v24" ::"f"(t0));
+      asm volatile("vse32.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v8, %0, v24" ::"f"(t1));
+      asm volatile("vse32.v v8, (%0);" ::"r"(c__));
+    }
+
+    p += gvl;
+  }
+}
+
+// 4xVL variant - optimized for medium matrices (M <= 8): FP16 C = A * B, four C rows per pass; N must be even
+static inline void fmatmul_v16b_4xVL(float16_t *c, const float16_t *a, const float16_t *b, // C (written), A, B
+                                     const unsigned int m_start, const unsigned int m_end, // rows [m_start, m_end), consumed 4 at a time
+                                     const unsigned int N, const unsigned int P, // N: A row stride and k extent; P: B and C row stride
+                                     const unsigned int p_start, const unsigned int p_end) { // columns [p_start, p_end)
+
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Strip-mine the columns with SEW=16 (float16_t data, flh scalars), LMUL=4: v0/v4/v8/v12 accumulate, v16/v20 hold B rows
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e16, m4, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(p_end - p));
+
+    const float16_t *b_ = b + p; // first B row, current column strip
+    float16_t *c_ = c + p; // same column strip of C
+
+    for (unsigned int m = m_start; m < m_end; m += 4) { // always 4 rows: row count must be a multiple of 4
+      const float16_t *a_ = a + m * N; // A[m][0]
+      const float16_t *a__ = a_;
+
+      asm volatile("vle16.v v16, (%0);" ::"r"(b_)); // B row 0 -> v16
+      const float16_t *b__ = b_ + P; // next B row to fetch
+
+      float16_t *c__ = c_ + m * P; // C[m][p]
+
+      float t0, t1, t2, t3; // A[m+i][k] scalars, one per output row
+
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+
+      unsigned int n = 0;
+
+      while (n < N) { // two k steps per iteration, alternating the v16 / v20 B buffers
+        asm volatile("vle16.v v20, (%0);" ::"r"(b__)); // fetch the next B row while v16 is consumed
+        b__ += P;
+
+        a__ = a_ + ++n; // column n of A: scalars for the following step
+
+        if (n == 1) { // first step initialises the accumulators with a multiply
+          asm volatile("vfmul.vf v0, v16, %0" ::"f"(t0));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v4, v16, %0" ::"f"(t1));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v8, v16, %0" ::"f"(t2));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v12, v16, %0" ::"f"(t3));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+        } else {
+          asm volatile("vfmacc.vf v0, %0, v16" ::"f"(t0));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v4, %0, v16" ::"f"(t1));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v8, %0, v16" ::"f"(t2));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v12, %0, v16" ::"f"(t3));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+        }
+
+        a__ = a_ + ++n;
+
+        if (n == N) // only tested after the second step, hence the even-N requirement
+          break;
+
+        asm volatile("vle16.v v16, (%0);" ::"r"(b__));
+        b__ += P;
+
+        asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t1));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t2));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t3));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+      }
+
+      asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0)); // last k step, then write the four finished C rows
+      asm volatile("vse16.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t1));
+      asm volatile("vse16.v v4, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t2));
+      asm volatile("vse16.v v8, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t3));
+      asm volatile("vse16.v v12, (%0);" ::"r"(c__));
+    }
+
+    p += gvl; // advance by the number of FP16 columns just processed
+  }
+}
+
+// 8xVL variant - optimized for larger matrices (M > 8): FP16 C = A * B, eight C rows per pass; N must be even
+static inline void fmatmul_v16b_8xVL(float16_t *c, const float16_t *a, const float16_t *b, // C (written), A, B
+                                     const unsigned int m_start, const unsigned int m_end, // rows [m_start, m_end), consumed 8 at a time
+                                     const unsigned int N, const unsigned int P, // N: A row stride and k extent; P: B and C row stride
+                                     const unsigned int p_start, const unsigned int p_end) { // columns [p_start, p_end)
+  unsigned int p = p_start;
+  while (p < p_end) {
+    // Strip-mine the columns with SEW=16, LMUL=2: v0..v14 (even registers) accumulate, v18/v20 hold B rows
+    size_t gvl;
+    asm volatile("vsetvli %[gvl], %[vl], e16, m2, ta, ma"
+                 : [gvl] "=r"(gvl)
+                 : [vl] "r"(p_end - p));
+
+    const float16_t *b_ = b + p; // first B row, current column strip
+    float16_t *c_ = c + p; // same column strip of C
+
+    for (unsigned int m = m_start; m < m_end; m += 8) { // always 8 rows: row count must be a multiple of 8
+      const float16_t *a_ = a + m * N; // A[m][0]
+      const float16_t *a__ = a_;
+
+      asm volatile("vle16.v v18, (%0);" ::"r"(b_)); // B row 0 -> v18
+      const float16_t *b__ = b_ + P; // next B row to fetch
+
+      float16_t *c__ = c_ + m * P; // C[m][p]
+
+      float t0, t1, t2, t3, t4, t5, t6, t7; // A[m+i][k] scalars, one per output row
+
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__));
+      a__ += N;
+      asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__));
+
+      unsigned int n = 0;
+
+      while (n < N) { // two k steps per iteration, alternating the v18 / v20 B buffers
+        a__ = a_ + ++n; // column n of A: scalars for the following step
+
+        asm volatile("vle16.v v20, (%0);" ::"r"(b__)); // fetch the next B row while v18 is consumed
+        b__ += P;
+
+        if (n == 1) { // first step initialises the accumulators with a multiply
+          asm volatile("vfmul.vf v0, v18, %0" ::"f"(t0));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v2, v18, %0" ::"f"(t1));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v4, v18, %0" ::"f"(t2));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v6, v18, %0" ::"f"(t3));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v8, v18, %0" ::"f"(t4));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v10, v18, %0" ::"f"(t5));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v12, v18, %0" ::"f"(t6));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmul.vf v14, v18, %0" ::"f"(t7));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__));
+        } else {
+          asm volatile("vfmacc.vf v0, %0, v18" ::"f"(t0));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v2, %0, v18" ::"f"(t1));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v4, %0, v18" ::"f"(t2));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v6, %0, v18" ::"f"(t3));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v8, %0, v18" ::"f"(t4));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v10, %0, v18" ::"f"(t5));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v12, %0, v18" ::"f"(t6));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__));
+          a__ += N;
+          asm volatile("vfmacc.vf v14, %0, v18" ::"f"(t7));
+          asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__));
+        }
+
+        a__ = a_ + ++n;
+
+        if (n == N) // only tested after the second step, hence the even-N requirement
+          break;
+
+        asm volatile("vle16.v v18, (%0);" ::"r"(b__));
+        b__ += P;
+
+        asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t0) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v2, %0, v20" ::"f"(t1));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t1) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t2));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t2) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v6, %0, v20" ::"f"(t3));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t3) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t4));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t4) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v10, %0, v20" ::"f"(t5));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t5) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t6));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t6) : [a] "r"(a__));
+        a__ += N;
+        asm volatile("vfmacc.vf v14, %0, v20" ::"f"(t7));
+        asm volatile("flh %[t], 0(%[a])" : [t] "=f"(t7) : [a] "r"(a__));
+      }
+
+      asm volatile("vfmacc.vf v0, %0, v20" ::"f"(t0)); // last k step, then write the eight finished C rows
+      asm volatile("vse16.v v0, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v2, %0, v20" ::"f"(t1));
+      asm volatile("vse16.v v2, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v4, %0, v20" ::"f"(t2));
+      asm volatile("vse16.v v4, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v6, %0, v20" ::"f"(t3));
+      asm volatile("vse16.v v6, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v8, %0, v20" ::"f"(t4));
+      asm volatile("vse16.v v8, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v10, %0, v20" ::"f"(t5));
+      asm volatile("vse16.v v10, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v12, %0, v20" ::"f"(t6));
+      asm volatile("vse16.v v12, (%0);" ::"r"(c__));
+      c__ += P;
+      asm volatile("vfmacc.vf v14, %0, v20" ::"f"(t7));
+      asm volatile("vse16.v v14, (%0);" ::"r"(c__));
+    }
+
+    p += gvl; // advance by the number of FP16 columns just processed
+  }
+}
+
+#endif // _SPATZ_UTILS_H_
diff --git a/spatz/sw/tasks/atomic_remote_task.c b/spatz/sw/tasks/atomic_remote_task.c
new file mode 100644
index 0000000..dd9da10
--- /dev/null
+++ b/spatz/sw/tasks/atomic_remote_task.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni
+ *
+ * Atomic Remote Task for Spatz
+ */
+
+#include
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "spatz_workers.h"
+
+#define DATA_ADDR (SPATZ_DATA)
+#define NUM_ITER 2
+#define NUM_TARGETS 3 // Self + 2 remote tiles
+
+typedef struct {
+    uint32_t hartid;       // index of the first counter this hart increments
+    uint32_t counter_addr; // address polled until all increments have arrived
+    uint32_t sync_base;    // counter 0; counter k sits at sync_base + k * L1_TILE_OFFSET
+} atomic_params_t;
+
+int atomic_remote_task(void) {
+    // The DATA register carries the address of the atomic_params_t block
+    uint32_t args_addr = mmio32(DATA_ADDR);
+    volatile atomic_params_t *args = (volatile atomic_params_t *)args_addr;
+
+    const uint32_t self = args->hartid;
+    const uint32_t own_counter = args->counter_addr;
+    const uint32_t base = args->sync_base;
+
+    // NUM_ITER rounds over self, (self+1)%16, (self+2)%16 -- NOTE(review): 16 is hard-coded, presumably the tile count
+    for (int step = 0; step < NUM_ITER * NUM_TARGETS; step++) {
+        const uint32_t tile = (self + (uint32_t)(step % NUM_TARGETS)) % 16;
+        const uint32_t counter = base + tile * L1_TILE_OFFSET;
+
+        // amoadd.w adds 1 to the selected counter; the previous value lands in t2 and is ignored
+        asm volatile(
+            "addi t0, %0, 0 \n"
+            "li t1, 1 \n"
+            "amoadd.w t2, t1, (t0) \n"
+            :
+            : "r"(counter)
+            : "t0", "t1", "t2", "memory"
+        );
+    }
+
+    // Spin until the word at counter_addr reads NUM_ITER * NUM_TARGETS
+    while (mmio32(own_counter) != (NUM_ITER * NUM_TARGETS)) {
+    }
+    return 0;
+}
diff --git a/spatz/sw/tasks/dotp_task.c b/spatz/sw/tasks/dotp_task.c
new file mode 100644
index 0000000..23ae320
--- /dev/null
+++ b/spatz/sw/tasks/dotp_task.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni
+ *
+ * FP16 Dot Product using RISC-V Vector Extension
+ */
+
+#include
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "spatz_workers.h"
+
+// DATA register address for reading parameter pointer
+#define DATA_ADDR (SPATZ_DATA)
+
+// Parameter structure in shared L1 memory
+typedef struct {
+    uint32_t a_addr;      // Vector A address
+    uint32_t b_addr;      // Vector B address
+    uint32_t result_addr; // Result address (scalar)
+    uint32_t n_elements;  // Number of elements
+} dotp_params_t;
+
+// Task entry: FP16 dot product of the two vectors described by a dotp_params_t block
+int dotp_task(void) {
+    // The DATA register holds the address of the parameter block
+    uint32_t args_addr = mmio32(DATA_ADDR);
+    volatile dotp_params_t *args = (volatile dotp_params_t *)args_addr;
+
+    // Unpack the block in field order: operands, destination, length
+    float16_t *vec_a = (float16_t *)args->a_addr;
+    float16_t *vec_b = (float16_t *)args->b_addr;
+    float16_t *dst = (float16_t *)args->result_addr;
+    uint32_t len = args->n_elements;
+
+    // Reduce with the fdotp_v16b kernel, then store the FP16 scalar at
+    // result_addr (only sizeof(float16_t) bytes are written there)
+    float16_t dot = fdotp_v16b(vec_a, vec_b, len);
+    *dst = dot;
+
+    // Always reports success
+    return 0;
+}
diff --git a/spatz/sw/tasks/hello_task.c b/spatz/sw/tasks/hello_task.c
new file mode 100644
index 0000000..e80334e
--- /dev/null
+++ b/spatz/sw/tasks/hello_task.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni
+ *
+ * Simple Hello World Task for Spatz
+ */
+
+#include
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "spatz_workers.h"
+
+#define DATA_ADDR (SPATZ_DATA)
+
+// Parameter block: the tile id to print
+typedef struct {
+    uint32_t tile_id;
+} hello_params_t;
+
+// Task entry: prints a greeting with the tile id found in the hello_params_t block
+int hello_task(void) {
+    // The DATA register holds the address of the parameter block
+    uint32_t args_addr = mmio32(DATA_ADDR);
+    volatile hello_params_t *args = (volatile hello_params_t *)args_addr;
+
+    // Copy the id out of the volatile block before formatting it
+    uint32_t id = args->tile_id;
+
+    // NOTE(review): %d receives a uint32_t; left untouched so the printed text stays the same
+    printf("*** Hello from Spatz task on Tile %d! ***\n", id);
+
+    return 0;
+}
diff --git a/spatz/sw/tasks/hello_world_simple_task.c b/spatz/sw/tasks/hello_world_simple_task.c
new file mode 100644
index 0000000..54a38f7
--- /dev/null
+++ b/spatz/sw/tasks/hello_world_simple_task.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni
+ *
+ * Hello World Task for Spatz_cc
+ */
+#include "magia_tile_utils.h"
+
+int hello_world_simple_task(void) { // prints one greeting line and reports success
+    printf("[SNITCH] Hello World from Spatz!\n");
+    return 0;
+}
diff --git a/spatz/sw/tasks/idma_simple_task.c b/spatz/sw/tasks/idma_simple_task.c
new file mode 100644
index 0000000..c3120aa
--- /dev/null
+++ b/spatz/sw/tasks/idma_simple_task.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni
+ *
+ * iDMA Task for MAGIA Tile - Demonstrates iDMA Transfers with Event Unit controlled by Spatz_cc
+ */
+
+#include
+#include "magia_tile_utils.h"
+#include "event_unit_utils.h"
+#include "idma_mm_utils.h"
+
+#define L1_DMA_SRC ((volatile uint32_t *)(L1_BASE + 0x1000))
+#define L1_DMA_DST ((volatile uint32_t *)(L1_BASE + 0x2000))
+#define L2_DMA_BUF ((volatile uint32_t *)(L2_BASE + 0x1000))
+#define DMA_SIZE 64 // 64 bytes = 16 words
+
+int idma_simple_task(void) { // L1 -> L2 -> L1 round trip; returns the number of failures (0 on success)
+    printf("[SNITCH] \n========================================\n");
+    printf("[SNITCH] IDMA TASK: DMA Transfers + Event Unit\n");
+    printf("[SNITCH] ========================================\n");
+    int errors = 0; // counts timeouts and mismatching words
+
+    // Enable iDMA events (keep EU_SPATZ_DONE enabled for CV32)
+    printf("[SNITCH] Enabling iDMA events...\n");
+    eu_enable_events(EU_IDMA_ALL_DONE_MASK);
+
+    // 1. Test iDMA transfer L1 to L2
+    printf("[SNITCH] Testing iDMA L1 to L2 transfer...\n");
+
+    // Fill the L1 source buffer with the pattern 0xA0000000 + word index
+    for (int i = 0; i < (DMA_SIZE/4); i++) {
+        L1_DMA_SRC[i] = 0xA0000000 + i;
+    }
+
+    // Launch the L1 to L2 transfer (the returned id is not used afterwards)
+    int tid = idma_L1ToL2((uint32_t)L1_DMA_SRC, (uint32_t)L2_DMA_BUF, DMA_SIZE);
+
+    // Poll for O2A completion -- NOTE(review): any timeout lives inside the wait helper; confirm its duration
+    uint32_t events = eu_idma_wait_o2a_completion(EU_WAIT_MODE_POLLING);
+
+    if (events & EU_IDMA_O2A_DONE_MASK) {
+        printf("[SNITCH] DMA L1 to L2 complete (O2A event detected)\n");
+    } else {
+        printf("[SNITCH] DMA L1 to L2 TIMEOUT\n");
+        errors++;
+    }
+
+    // 2. Test iDMA transfer L2 to L1
+    printf("[SNITCH] Testing iDMA L2 to L1 transfer...\n");
+
+    // Clear the L1 destination so stale data cannot pass the check below
+    for (int i = 0; i < (DMA_SIZE/4); i++) {
+        L1_DMA_DST[i] = 0;
+    }
+
+    tid = idma_L2ToL1((uint32_t)L2_DMA_BUF, (uint32_t)L1_DMA_DST, DMA_SIZE);
+
+    // Poll for A2O completion -- NOTE(review): any timeout lives inside the wait helper; confirm its duration
+    events = eu_idma_wait_a2o_completion(EU_WAIT_MODE_POLLING);
+
+    if (events & EU_IDMA_A2O_DONE_MASK) {
+        printf("[SNITCH] DMA L2 to L1 complete (A2O event detected)\n");
+    } else {
+        printf("[SNITCH] DMA L2 to L1 TIMEOUT\n");
+        errors++;
+    }
+
+    // Compare every destination word (read through mmio32) with the source pattern
+    printf("[SNITCH] Verifying transferred data...\n");
+    for (int i = 0; i < (DMA_SIZE/4); i++) {
+        uint32_t expected = 0xA0000000 + i;
+        uint32_t computed = mmio32((uint32_t)L1_DMA_DST + 4*i);
+        if (computed != expected) {
+            printf("[SNITCH] [%d] FAIL: computed=0x%08x, expected=0x%08x\n",
+                   i, computed, expected);
+            errors++;
+        }
+    }
+    if (errors == 0) printf("[SNITCH] Data verification: OK\n"); // printed only when no step failed
+    return errors; // report failures to the caller instead of always signalling success
+}
diff --git a/spatz/sw/tasks/instructions_sweep_simple_task.c b/spatz/sw/tasks/instructions_sweep_simple_task.c
new file mode 100644
index 0000000..d042ca3
--- /dev/null
+++ b/spatz/sw/tasks/instructions_sweep_simple_task.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * + * Instructions Sweep Task for Spatz_cc + */ + +#include +#include "magia_tile_utils.h" + +#define VEC_SIZE 16 + +// Memory layout in L1 +#define VEC_A_BASE (L1_BASE + 0x00005000) +#define VEC_B_BASE (L1_BASE + 0x00005200) +#define VEC_C_BASE (L1_BASE + 0x00005400) +#define SCALAR_BASE (L1_BASE + 0x00005600) + +#define vec_a ((volatile float *)VEC_A_BASE) +#define vec_b ((volatile float *)VEC_B_BASE) +#define vec_c ((volatile float *)VEC_C_BASE) +#define scalar_mem ((volatile float *)SCALAR_BASE) + +int instructions_sweep_simple_task(void) { + + printf("[SNITCH] 32-bit Instructions Test - Spatz Full Coverage\n"); + printf("[SNITCH] Testing: RVF, Vector Int/FP (e8/e16/e32)\n"); + printf("[SNITCH] Using L1 memory at 0x%08x\n", L1_BASE); + printf("[SNITCH] ========================================\n"); + + float a_f, b_f, c_f, result_f; + int32_t iresult; + + // ==================== SCALAR RVF (Single Precision) ==================== + printf("[SNITCH] Testing Scalar RVF Instructions:\n"); + a_f = 3.5f; b_f = 2.0f; c_f = 1.5f; + + // Arithmetic + asm volatile("fadd.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf(" FADD.S\n"); + asm volatile("fsub.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf(" FSUB.S\n"); + asm volatile("fmul.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf(" FMUL.S\n"); + asm volatile("fmin.s %0, %1, %2" : 
"=f"(result_f) : "f"(a_f), "f"(b_f)); printf(" FMIN.S\n"); + asm volatile("fmax.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf(" FMAX.S\n"); + + // Sign injection + asm volatile("fsgnj.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf(" FSGNJ.S\n"); + asm volatile("fsgnjn.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf(" FSGNJN.S\n"); + asm volatile("fsgnjx.s %0, %1, %2" : "=f"(result_f) : "f"(a_f), "f"(b_f)); printf(" FSGNJX.S\n"); + + // Comparisons + asm volatile("feq.s %0, %1, %2" : "=r"(iresult) : "f"(a_f), "f"(b_f)); printf(" FEQ.S\n"); + asm volatile("flt.s %0, %1, %2" : "=r"(iresult) : "f"(a_f), "f"(b_f)); printf(" FLT.S\n"); + asm volatile("fle.s %0, %1, %2" : "=r"(iresult) : "f"(a_f), "f"(b_f)); printf(" FLE.S\n"); + asm volatile("fclass.s %0, %1" : "=r"(iresult) : "f"(a_f)); printf(" FCLASS.S\n"); + + // FMA + asm volatile("fmadd.s %0, %1, %2, %3" : "=f"(result_f) : "f"(a_f), "f"(b_f), "f"(c_f)); printf(" FMADD.S\n"); + asm volatile("fmsub.s %0, %1, %2, %3" : "=f"(result_f) : "f"(a_f), "f"(b_f), "f"(c_f)); printf(" FMSUB.S\n"); + asm volatile("fnmadd.s %0, %1, %2, %3" : "=f"(result_f) : "f"(a_f), "f"(b_f), "f"(c_f)); printf(" FNMADD.S\n"); + asm volatile("fnmsub.s %0, %1, %2, %3" : "=f"(result_f) : "f"(a_f), "f"(b_f), "f"(c_f)); printf(" FNMSUB.S\n"); + + // Conversions + asm volatile("fcvt.w.s %0, %1, rtz" : "=r"(iresult) : "f"(a_f)); printf(" FCVT.W.S\n"); + asm volatile("fcvt.wu.s %0, %1, rtz" : "=r"(iresult) : "f"(a_f)); printf(" FCVT.WU.S\n"); + asm volatile("fcvt.s.w %0, %1" : "=f"(result_f) : "r"(iresult)); printf(" FCVT.S.W\n"); + asm volatile("fcvt.s.wu %0, %1" : "=f"(result_f) : "r"(iresult)); printf(" FCVT.S.WU\n"); + + // Load/Store + scalar_mem[0] = 42.0f; + asm volatile("flw %0, 0(%1)" : "=f"(result_f) : "r"(scalar_mem)); printf(" FLW\n"); + asm volatile("fsw %0, 0(%1)" :: "f"(result_f), "r"(scalar_mem)); printf(" FSW\n"); + + + // ==================== SCALAR M-EXTENSION (Integer Mul/Div) 
==================== + printf("[SNITCH] Testing Scalar M-Extension Instructions:\n"); + int32_t a_i = 15, b_i = 4, result_i; + + asm volatile("mul %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf(" MUL\n"); + asm volatile("mulh %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf(" MULH\n"); + asm volatile("mulhu %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf(" MULHU\n"); + asm volatile("mulhsu %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf(" MULHSU\n"); + asm volatile("div %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf(" DIV\n"); + asm volatile("divu %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf(" DIVU\n"); + asm volatile("rem %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf(" REM\n"); + asm volatile("remu %0, %1, %2" : "=r"(result_i) : "r"(a_i), "r"(b_i)); printf(" REMU\n"); + + // ==================== VECTOR INTEGER (e32) ==================== + printf("[SNITCH] Testing Vector Integer Instructions (e32):\n"); + + volatile int32_t *vec_ia = (volatile int32_t *)VEC_A_BASE; + volatile int32_t *vec_ib = (volatile int32_t *)VEC_B_BASE; + volatile int32_t *vec_ic = (volatile int32_t *)VEC_C_BASE; + + for (int i = 0; i < VEC_SIZE; i++) { + vec_ia[i] = i + 1; + vec_ib[i] = i * 2; + } + + asm volatile("vsetvli zero, %0, e32, m4, ta, ma" ::"r"(VEC_SIZE)); printf(" VSETVLI (e32)\n"); + asm volatile("vle32.v v0, (%0)" ::"r"(vec_ia)); printf(" VLE32.V\n"); + asm volatile("vle32.v v4, (%0)" ::"r"(vec_ib)); printf(" VLE32.V\n"); + + // Arithmetic + asm volatile("vadd.vv v8, v0, v4"); printf(" VADD.VV\n"); + asm volatile("vsub.vv v8, v0, v4"); printf(" VSUB.VV\n"); + asm volatile("vmul.vv v8, v0, v4"); printf(" VMUL.VV\n"); + asm volatile("vdivu.vv v8, v0, v4"); printf(" VDIVU.VV\n"); + asm volatile("vdiv.vv v8, v0, v4"); printf(" VDIV.VV\n"); + asm volatile("vremu.vv v8, v0, v4"); printf(" VREMU.VV\n"); + asm volatile("vrem.vv v8, v0, v4"); printf(" VREM.VV\n"); + + int32_t scalar_i = 5; + asm 
volatile("vadd.vx v8, v0, %0" ::"r"(scalar_i)); printf(" VADD.VX\n"); + asm volatile("vsub.vx v8, v0, %0" ::"r"(scalar_i)); printf(" VSUB.VX\n"); + asm volatile("vrsub.vx v8, v0, %0" ::"r"(scalar_i)); printf(" VRSUB.VX\n"); + asm volatile("vmul.vx v8, v0, %0" ::"r"(scalar_i)); printf(" VMUL.VX\n"); + + // Logical + asm volatile("vand.vv v8, v0, v4"); printf(" VAND.VV\n"); + asm volatile("vor.vv v8, v0, v4"); printf(" VOR.VV\n"); + asm volatile("vxor.vv v8, v0, v4"); printf(" VXOR.VV\n"); + asm volatile("vand.vx v8, v0, %0" ::"r"(scalar_i)); printf(" VAND.VX\n"); + asm volatile("vor.vx v8, v0, %0" ::"r"(scalar_i)); printf(" VOR.VX\n"); + asm volatile("vxor.vx v8, v0, %0" ::"r"(scalar_i)); printf(" VXOR.VX\n"); + + // Shifts + asm volatile("vsll.vv v8, v0, v4"); printf(" VSLL.VV\n"); + asm volatile("vsrl.vv v8, v0, v4"); printf(" VSRL.VV\n"); + asm volatile("vsra.vv v8, v0, v4"); printf(" VSRA.VV\n"); + asm volatile("vsll.vx v8, v0, %0" ::"r"(scalar_i)); printf(" VSLL.VX\n"); + asm volatile("vsrl.vx v8, v0, %0" ::"r"(scalar_i)); printf(" VSRL.VX\n"); + asm volatile("vsra.vx v8, v0, %0" ::"r"(scalar_i)); printf(" VSRA.VX\n"); + + // Min/Max + asm volatile("vmin.vv v8, v0, v4"); printf(" VMIN.VV\n"); + asm volatile("vmax.vv v8, v0, v4"); printf(" VMAX.VV\n"); + asm volatile("vminu.vv v8, v0, v4"); printf(" VMINU.VV\n"); + asm volatile("vmaxu.vv v8, v0, v4"); printf(" VMAXU.VV\n"); + + // High multiply + asm volatile("vmulh.vv v8, v0, v4"); printf(" VMULH.VV\n"); + asm volatile("vmulhu.vv v8, v0, v4"); printf(" VMULHU.VV\n"); + asm volatile("vmulhsu.vv v8, v0, v4"); printf(" VMULHSU.VV\n"); + asm volatile("vmulh.vx v8, v0, %0" ::"r"(scalar_i)); printf(" VMULH.VX\n"); + + // MAC + asm volatile("vmacc.vv v8, v0, v4"); printf(" VMACC.VV\n"); + asm volatile("vnmsac.vv v8, v0, v4"); printf(" VNMSAC.VV\n"); + asm volatile("vmadd.vv v8, v0, v4"); printf(" VMADD.VV\n"); + asm volatile("vnmsub.vv v8, v0, v4"); printf(" VNMSUB.VV\n"); + + // Carry/Borrow + asm volatile("vadc.vvm 
v8, v0, v4, v0"); printf(" VADC.VVM\n"); + asm volatile("vmadc.vv v0, v4, v8"); printf(" VMADC.VV\n"); + asm volatile("vsbc.vvm v8, v0, v4, v0"); printf(" VSBC.VVM\n"); + asm volatile("vmsbc.vv v0, v4, v8"); printf(" VMSBC.VV\n"); + + // Comparisons (mask operations) + asm volatile("vmseq.vv v0, v4, v8"); printf(" VMSEQ.VV\n"); + asm volatile("vmsne.vv v0, v4, v8"); printf(" VMSNE.VV\n"); + asm volatile("vmslt.vv v0, v4, v8"); printf(" VMSLT.VV\n"); + asm volatile("vmsltu.vv v0, v4, v8"); printf(" VMSLTU.VV\n"); + asm volatile("vmsle.vv v0, v4, v8"); printf(" VMSLE.VV\n"); + asm volatile("vmsleu.vv v0, v4, v8"); printf(" VMSLEU.VV\n"); + asm volatile("vmsltu.vx v0, v4, %0" ::"r"(scalar_i)); printf(" VMSLTU.VX\n"); + asm volatile("vmslt.vx v0, v4, %0" ::"r"(scalar_i)); printf(" VMSLT.VX\n"); + + // Integer immediate operations + asm volatile("vadd.vi v8, v0, 5"); printf(" VADD.VI\n"); + asm volatile("vrsub.vi v8, v0, 10"); printf(" VRSUB.VI\n"); + asm volatile("vand.vi v8, v0, 15"); printf(" VAND.VI\n"); + asm volatile("vor.vi v8, v0, 7"); printf(" VOR.VI\n"); + asm volatile("vxor.vi v8, v0, 3"); printf(" VXOR.VI\n"); + asm volatile("vsll.vi v8, v0, 2"); printf(" VSLL.VI\n"); + asm volatile("vsrl.vi v8, v0, 2"); printf(" VSRL.VI\n"); + asm volatile("vsra.vi v8, v0, 2"); printf(" VSRA.VI\n"); + asm volatile("vmseq.vi v0, v4, 5"); printf(" VMSEQ.VI\n"); + asm volatile("vmsne.vi v0, v4, 5"); printf(" VMSNE.VI\n"); + asm volatile("vmsleu.vi v0, v4, 10"); printf(" VMSLEU.VI\n"); + asm volatile("vmsle.vi v0, v4, 10"); printf(" VMSLE.VI\n"); + asm volatile("vmsgtu.vi v0, v4, 3"); printf(" VMSGTU.VI\n"); + asm volatile("vmsgt.vi v0, v4, 3"); printf(" VMSGT.VI\n"); + + // Reductions + asm volatile("vredsum.vs v8, v0, v4"); printf(" VREDSUM.VS\n"); + asm volatile("vredand.vs v8, v0, v4"); printf(" VREDAND.VS\n"); + asm volatile("vredor.vs v8, v0, v4"); printf(" VREDOR.VS\n"); + asm volatile("vredxor.vs v8, v0, v4"); printf(" VREDXOR.VS\n"); + asm volatile("vredmin.vs v8, v0, 
v4"); printf(" VREDMIN.VS\n"); + asm volatile("vredminu.vs v8, v0, v4"); printf(" VREDMINU.VS\n"); + asm volatile("vredmax.vs v8, v0, v4"); printf(" VREDMAX.VS\n"); + asm volatile("vredmaxu.vs v8, v0, v4"); printf(" VREDMAXU.VS\n"); + + // Move + asm volatile("vmv.v.v v8, v0"); printf(" VMV.V.V\n"); + asm volatile("vmv.v.x v8, %0" ::"r"(scalar_i)); printf(" VMV.V.X\n"); + asm volatile("vmv.s.x v8, %0" ::"r"(scalar_i)); printf(" VMV.S.X\n"); + asm volatile("vmv.x.s %0, v0" :"=r"(result_i)); printf(" VMV.X.S\n"); + + // Slide + asm volatile("vslideup.vx v8, v0, %0" ::"r"(2)); printf(" VSLIDEUP.VX\n"); + asm volatile("vslidedown.vx v8, v0, %0" ::"r"(2)); printf(" VSLIDEDOWN.VX\n"); + asm volatile("vslide1up.vx v8, v0, %0" ::"r"(scalar_i)); printf(" VSLIDE1UP.VX\n"); + asm volatile("vslide1down.vx v8, v0, %0" ::"r"(scalar_i)); printf(" VSLIDE1DOWN.VX\n"); + + // Widening (e16->e32) + asm volatile("vsetvli zero, %0, e16, m2, ta, ma" ::"r"(VEC_SIZE*2)); + volatile int16_t *vec_i16a = (volatile int16_t *)VEC_A_BASE; + volatile int16_t *vec_i16b = (volatile int16_t *)VEC_B_BASE; + for (int i = 0; i < VEC_SIZE*2; i++) { + vec_i16a[i] = i + 1; + vec_i16b[i] = i * 2; + } + asm volatile("vle16.v v0, (%0)" ::"r"(vec_i16a)); + asm volatile("vle16.v v2, (%0)" ::"r"(vec_i16b)); + asm volatile("vwadd.vv v8, v0, v2"); printf(" VWADD.VV (e16->e32)\n"); + asm volatile("vwaddu.vv v8, v0, v2"); printf(" VWADDU.VV (e16->e32)\n"); + asm volatile("vwsub.vv v8, v0, v2"); printf(" VWSUB.VV (e16->e32)\n"); + asm volatile("vwsubu.vv v8, v0, v2"); printf(" VWSUBU.VV (e16->e32)\n"); + asm volatile("vwmul.vv v8, v0, v2"); printf(" VWMUL.VV (e16->e32)\n"); + asm volatile("vwmulu.vv v8, v0, v2"); printf(" VWMULU.VV (e16->e32)\n"); + asm volatile("vwmulsu.vv v8, v0, v2"); printf(" VWMULSU.VV (e16->e32)\n"); + + // Widening with scalar (e16->e32) + int16_t scalar_i16 = 3; + asm volatile("vwadd.vx v8, v0, %0" ::"r"(scalar_i16)); printf(" VWADD.VX (e16->e32)\n"); + asm volatile("vwaddu.vx v8, v0, %0" 
::"r"(scalar_i16)); printf(" VWADDU.VX (e16->e32)\n"); + asm volatile("vwsub.vx v8, v0, %0" ::"r"(scalar_i16)); printf(" VWSUB.VX (e16->e32)\n"); + asm volatile("vwsubu.vx v8, v0, %0" ::"r"(scalar_i16)); printf(" VWSUBU.VX (e16->e32)\n"); + asm volatile("vwmul.vx v8, v0, %0" ::"r"(scalar_i16)); printf(" VWMUL.VX (e16->e32)\n"); + asm volatile("vwmulu.vx v8, v0, %0" ::"r"(scalar_i16)); printf(" VWMULU.VX (e16->e32)\n"); + asm volatile("vwmulsu.vx v8, v0, %0" ::"r"(scalar_i16)); printf(" VWMULSU.VX (e16->e32)\n"); + + // Widening MAC (e16->e32) + asm volatile("vwmacc.vv v8, v0, v2"); printf(" VWMACC.VV (e16->e32)\n"); + asm volatile("vwmaccu.vv v8, v0, v2"); printf(" VWMACCU.VV (e16->e32)\n"); + asm volatile("vwmaccsu.vv v8, v0, v2"); printf(" VWMACCSU.VV (e16->e32)\n"); + + // Widening MAC with scalar (e16->e32) + asm volatile("vwmacc.vx v8, %0, v2" ::"r"(scalar_i16)); printf(" VWMACC.VX (e16->e32)\n"); + asm volatile("vwmaccu.vx v8, %0, v2" ::"r"(scalar_i16)); printf(" VWMACCU.VX (e16->e32)\n"); + asm volatile("vwmaccsu.vx v8, %0, v2" ::"r"(scalar_i16)); printf(" VWMACCSU.VX (e16->e32)\n"); + asm volatile("vwmaccus.vx v8, %0, v2" ::"r"(scalar_i16)); printf(" VWMACCUS.VX (e16->e32)\n"); + + // Note: Integer narrowing (vnsrl/vnsra) requires ELEN=64 (RVD extension) + // Skipped for 32-bit only configuration + + // Load/Store + asm volatile("vsetvli zero, %0, e32, m4, ta, ma" ::"r"(VEC_SIZE)); + asm volatile("vse32.v v0, (%0)" ::"r"(vec_ic)); printf(" VSE32.V\n"); + asm volatile("vle32.v v4, (%0)" ::"r"(vec_ic)); printf(" VLE32.V\n"); + asm volatile("vlse32.v v4, (%0), %1" ::"r"(vec_ic), "r"(8)); printf(" VLSE32.V (strided)\n"); + asm volatile("vsse32.v v4, (%0), %1" ::"r"(vec_ic), "r"(8)); printf(" VSSE32.V (strided)\n"); + + // Indexed load/store + for (int i = 0; i < VEC_SIZE; i++) vec_ia[i] = i * 4; + asm volatile("vle32.v v0, (%0)" ::"r"(vec_ia)); + asm volatile("vluxei32.v v4, (%0), v0" ::"r"(vec_ib)); printf(" VLUXEI32.V (indexed)\n"); + asm volatile("vsuxei32.v 
v4, (%0), v0" ::"r"(vec_ic)); printf(" VSUXEI32.V (indexed)\n"); + + // ==================== VECTOR INTEGER (e16) ==================== + printf("[SNITCH] Testing Vector Integer Instructions (e16):\n"); + asm volatile("vsetvli zero, %0, e16, m2, ta, ma" ::"r"(VEC_SIZE*2)); + for (int i = 0; i < VEC_SIZE*2; i++) { + vec_i16a[i] = i + 1; + vec_i16b[i] = i * 2; + } + asm volatile("vle16.v v0, (%0)" ::"r"(vec_i16a)); printf(" VLE16.V\n"); + asm volatile("vle16.v v2, (%0)" ::"r"(vec_i16b)); + asm volatile("vadd.vv v4, v0, v2"); printf(" VADD.VV (e16)\n"); + asm volatile("vmul.vv v4, v0, v2"); printf(" VMUL.VV (e16)\n"); + asm volatile("vse16.v v4, (%0)" ::"r"(vec_i16a)); printf(" VSE16.V\n"); + + // ==================== VECTOR INTEGER (e8) ==================== + printf("[SNITCH] Testing Vector Integer Instructions (e8):\n"); + asm volatile("vsetvli zero, %0, e8, m2, ta, ma" ::"r"(VEC_SIZE*4)); + volatile int8_t *vec_i8a = (volatile int8_t *)VEC_A_BASE; + volatile int8_t *vec_i8b = (volatile int8_t *)VEC_B_BASE; + for (int i = 0; i < VEC_SIZE*4; i++) { + vec_i8a[i] = i + 1; + vec_i8b[i] = i * 2; + } + asm volatile("vle8.v v0, (%0)" ::"r"(vec_i8a)); printf(" VLE8.V\n"); + asm volatile("vle8.v v2, (%0)" ::"r"(vec_i8b)); + asm volatile("vadd.vv v4, v0, v2"); printf(" VADD.VV (e8)\n"); + asm volatile("vmul.vv v4, v0, v2"); printf(" VMUL.VV (e8)\n"); + asm volatile("vse8.v v4, (%0)" ::"r"(vec_i8a)); printf(" VSE8.V\n"); + + // Reset to e32 for FP tests + asm volatile("vsetvli zero, %0, e32, m4, ta, ma" ::"r"(VEC_SIZE)); + + // ==================== VECTOR FP (e32 - single precision) ==================== + printf("[SNITCH] Testing Vector FP Instructions (e32):\n"); + + // Riusa gli stessi buffer L1 + for (int i = 0; i < VEC_SIZE; i++) { + vec_a[i] = (float)(i + 1); + vec_b[i] = (float)(i * 2); + } + + asm volatile("vle32.v v0, (%0)" ::"r"(vec_a)); + asm volatile("vle32.v v4, (%0)" ::"r"(vec_b)); + + // Arithmetic VV + asm volatile("vfadd.vv v8, v0, v4"); printf(" VFADD.VV 
(e32)\n"); + asm volatile("vfsub.vv v8, v0, v4"); printf(" VFSUB.VV (e32)\n"); + asm volatile("vfmul.vv v8, v0, v4"); printf(" VFMUL.VV (e32)\n"); + + // Arithmetic VF + float scalar_f = 2.5f; + asm volatile("vfadd.vf v8, v0, %0" ::"f"(scalar_f)); printf(" VFADD.VF (e32)\n"); + asm volatile("vfsub.vf v8, v0, %0" ::"f"(scalar_f)); printf(" VFSUB.VF (e32)\n"); + asm volatile("vfrsub.vf v8, v0, %0" ::"f"(scalar_f)); printf(" VFRSUB.VF (e32)\n"); + asm volatile("vfmul.vf v8, v0, %0" ::"f"(scalar_f)); printf(" VFMUL.VF (e32)\n"); + + // Min/Max + asm volatile("vfmin.vv v8, v0, v4"); printf(" VFMIN.VV (e32)\n"); + asm volatile("vfmax.vv v8, v0, v4"); printf(" VFMAX.VV (e32)\n"); + asm volatile("vfmin.vf v8, v0, %0" ::"f"(scalar_f)); printf(" VFMIN.VF (e32)\n"); + asm volatile("vfmax.vf v8, v0, %0" ::"f"(scalar_f)); printf(" VFMAX.VF (e32)\n"); + + // Sign injection + asm volatile("vfsgnj.vv v8, v0, v4"); printf(" VFSGNJ.VV (e32)\n"); + asm volatile("vfsgnjn.vv v8, v0, v4"); printf(" VFSGNJN.VV (e32)\n"); + asm volatile("vfsgnjx.vv v8, v0, v4"); printf(" VFSGNJX.VV (e32)\n"); + asm volatile("vfsgnj.vf v8, v0, %0" ::"f"(scalar_f)); printf(" VFSGNJ.VF (e32)\n"); + asm volatile("vfsgnjn.vf v8, v0, %0" ::"f"(scalar_f)); printf(" VFSGNJN.VF (e32)\n"); + asm volatile("vfsgnjx.vf v8, v0, %0" ::"f"(scalar_f)); printf(" VFSGNJX.VF (e32)\n"); + + // FMA + asm volatile("vle32.v v8, (%0)" ::"r"(vec_b)); + asm volatile("vfmadd.vv v0, v4, v8"); printf(" VFMADD.VV (e32)\n"); + asm volatile("vle32.v v0, (%0)" ::"r"(vec_a)); + asm volatile("vfmsub.vv v0, v4, v8"); printf(" VFMSUB.VV (e32)\n"); + asm volatile("vle32.v v0, (%0)" ::"r"(vec_a)); + asm volatile("vfnmadd.vv v0, v4, v8"); printf(" VFNMADD.VV (e32)\n"); + asm volatile("vle32.v v0, (%0)" ::"r"(vec_a)); + asm volatile("vfnmsub.vv v0, v4, v8"); printf(" VFNMSUB.VV (e32)\n"); + + // MACC + asm volatile("vle32.v v0, (%0)" ::"r"(vec_a)); + asm volatile("vle32.v v8, (%0)" ::"r"(vec_b)); + asm volatile("vfmacc.vv v8, v0, v4"); printf(" 
VFMACC.VV (e32)\n"); + asm volatile("vle32.v v8, (%0)" ::"r"(vec_b)); + asm volatile("vfmsac.vv v8, v0, v4"); printf(" VFMSAC.VV (e32)\n"); + asm volatile("vle32.v v8, (%0)" ::"r"(vec_b)); + asm volatile("vfnmacc.vv v8, v0, v4"); printf(" VFNMACC.VV (e32)\n"); + asm volatile("vle32.v v8, (%0)" ::"r"(vec_b)); + asm volatile("vfnmsac.vv v8, v0, v4"); printf(" VFNMSAC.VV (e32)\n"); + + // Reductions + asm volatile("vle32.v v0, (%0)" ::"r"(vec_a)); + asm volatile("vfmv.s.f v12, %0" ::"f"(0.0f)); + asm volatile("vfredosum.vs v12, v0, v12"); printf(" VFREDOSUM.VS (e32)\n"); + asm volatile("vfredusum.vs v12, v0, v12"); printf(" VFREDUSUM.VS (e32)\n"); + asm volatile("vfredmin.vs v12, v0, v12"); printf(" VFREDMIN.VS (e32)\n"); + asm volatile("vfredmax.vs v12, v0, v12"); printf(" VFREDMAX.VS (e32)\n"); + + // Conversions + asm volatile("vfcvt.xu.f.v v8, v0"); printf(" VFCVT.XU.F.V (e32)\n"); + asm volatile("vfcvt.x.f.v v8, v0"); printf(" VFCVT.X.F.V (e32)\n"); + asm volatile("vfcvt.rtz.xu.f.v v8, v0"); printf(" VFCVT.RTZ.XU.F.V (e32)\n"); + asm volatile("vfcvt.rtz.x.f.v v8, v0"); printf(" VFCVT.RTZ.X.F.V (e32)\n"); + asm volatile("vfcvt.f.xu.v v0, v8"); printf(" VFCVT.F.XU.V (e32)\n"); + asm volatile("vfcvt.f.x.v v0, v8"); printf(" VFCVT.F.X.V (e32)\n"); + + // Move/Slide + asm volatile("vfmv.v.f v8, %0" ::"f"(scalar_f)); printf(" VFMV.V.F (e32)\n"); + asm volatile("vfmv.s.f v8, %0" ::"f"(scalar_f)); printf(" VFMV.S.F (e32)\n"); + asm volatile("vfmv.f.s %0, v0" :"=f"(result_f)); printf(" VFMV.F.S (e32)\n"); + + // Reload fresh data for slide operations + asm volatile("vle32.v v0, (%0)" ::"r"(vec_a)); + asm volatile("vfslide1up.vf v8, v0, %0" ::"f"(scalar_f)); printf(" VFSLIDE1UP.VF (e32)\n"); + asm volatile("vfslide1down.vf v8, v0, %0" ::"f"(scalar_f)); printf(" VFSLIDE1DOWN.VF (e32)\n"); + + printf(" [SKIPPED] Vector FP comparisons and VFCLASS.V not supported\n"); + + // ==================== VECTOR FP (e16 - half precision) ==================== + printf("[SNITCH] Testing 
Vector FP Instructions (e16):\n"); + asm volatile("vsetvli zero, %0, e16, m2, ta, ma" ::"r"(VEC_SIZE*2)); + volatile __fp16 *vec_h = (volatile __fp16 *)VEC_A_BASE; + for (int i = 0; i < VEC_SIZE*2; i++) vec_h[i] = (__fp16)(i + 1); + asm volatile("vle16.v v0, (%0)" ::"r"(vec_h)); + asm volatile("vle16.v v2, (%0)" ::"r"(vec_h)); + asm volatile("vfadd.vv v4, v0, v2"); printf(" VFADD.VV (e16)\n"); + asm volatile("vfmul.vv v4, v0, v2"); printf(" VFMUL.VV (e16)\n"); + asm volatile("vfmacc.vv v4, v0, v2"); printf(" VFMACC.VV (e16)\n"); + + // Widening FP16->FP32 + asm volatile("vfwadd.vv v8, v0, v2"); printf(" VFWADD.VV (e16->e32)\n"); + asm volatile("vfwsub.vv v8, v0, v2"); printf(" VFWSUB.VV (e16->e32)\n"); + asm volatile("vfwmul.vv v8, v0, v2"); printf(" VFWMUL.VV (e16->e32)\n"); + asm volatile("vfwmacc.vv v8, v0, v2"); printf(" VFWMACC.VV (e16->e32)\n"); + asm volatile("vfwnmacc.vv v8, v0, v2"); printf(" VFWNMACC.VV (e16->e32)\n"); + asm volatile("vfwmsac.vv v8, v0, v2"); printf(" VFWMSAC.VV (e16->e32)\n"); + asm volatile("vfwnmsac.vv v8, v0, v2"); printf(" VFWNMSAC.VV (e16->e32)\n"); + + // Narrowing FP32->FP16 + asm volatile("vsetvli zero, %0, e16, m2, ta, ma" ::"r"(VEC_SIZE*2)); + asm volatile("vsetvli zero, %0, e32, m4, ta, ma" ::"r"(VEC_SIZE)); // Load e32 data + asm volatile("vle32.v v0, (%0)" ::"r"(vec_a)); + asm volatile("vsetvli zero, %0, e16, m2, ta, ma" ::"r"(VEC_SIZE*2)); // Set for narrowing output + asm volatile("vfncvt.f.f.w v8, v0"); printf(" VFNCVT.F.F.W (e32->e16)\n"); + asm volatile("vfncvt.xu.f.w v8, v0"); printf(" VFNCVT.XU.F.W (e32->e16)\n"); + asm volatile("vfncvt.x.f.w v8, v0"); printf(" VFNCVT.X.F.W (e32->e16)\n"); + asm volatile("vfncvt.rtz.xu.f.w v8, v0"); printf(" VFNCVT.RTZ.XU.F.W (e32->e16)\n"); + asm volatile("vfncvt.rtz.x.f.w v8, v0"); printf(" VFNCVT.RTZ.X.F.W (e32->e16)\n"); + + printf("[SNITCH] ==== Test completed ====\n"); + return 0; +} diff --git a/spatz/sw/tasks/interconnect32_simple_task.c 
b/spatz/sw/tasks/interconnect32_simple_task.c
new file mode 100644
index 0000000..b535fdc
--- /dev/null
+++ b/spatz/sw/tasks/interconnect32_simple_task.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni
+ *
+ * 32-bit Interconnect Access Test Task for Spatz_cc
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+
+// Memory in L1
+#define TEST_MEM32_BASE (L1_BASE + 0x00003000)
+#define test_mem32 ((volatile uint32_t *)TEST_MEM32_BASE)
+
+/*
+ * Exercises scalar and vector 32-bit accesses on the memory at
+ * TEST_MEM32_BASE. Every failed check is logged and counted.
+ *
+ * Returns 0 when all checks pass, otherwise the number of failed checks.
+ */
+int interconnect32_simple_task(void) {
+    int errors = 0;
+
+    printf("[SNITCH] Testing 32-bit accesses\n");
+    printf("[SNITCH] Using L1 memory at 0x%08x\n", TEST_MEM32_BASE);
+
+    // =============================================
+    // Test 1: Sequential 32-bit stores and loads
+    // =============================================
+    printf("[SNITCH] Sequential 32-bit store/load\n");
+
+    for (int i = 0; i < 16; i++) {
+        test_mem32[i] = 0x10000000 + i;
+    }
+
+    for (int i = 0; i < 16; i++) {
+        if (test_mem32[i] != (uint32_t)(0x10000000 + i)) {
+            printf("[SNITCH] ERROR: 32-bit store/load failed at [%d]\n", i);
+            errors++;
+        }
+    }
+
+    // =============================================
+    // Test 2: Strided 32-bit access pattern
+    // =============================================
+    printf("[SNITCH] Strided 32-bit access\n");
+
+    test_mem32[0] = 0xAAAAAAAA;
+    test_mem32[2] = 0xBBBBBBBB;
+    test_mem32[4] = 0xCCCCCCCC;
+    test_mem32[6] = 0xDDDDDDDD;
+
+    // Read back every strided word, not just a subset of them
+    if (test_mem32[0] != 0xAAAAAAAA || test_mem32[2] != 0xBBBBBBBB ||
+        test_mem32[4] != 0xCCCCCCCC || test_mem32[6] != 0xDDDDDDDD) {
+        printf("[SNITCH] ERROR: Strided access failed\n");
+        errors++;
+    }
+
+    // =============================================
+    // Test 3: Reverse order stores
+    // =============================================
+    printf("[SNITCH] Reverse order stores\n");
+
+    for (int i = 7; i >= 0; i--) {
+        test_mem32[i] = 0x30000000 + (7 - i);
+    }
+
+    if (test_mem32[0] != 0x30000007 || test_mem32[7] != 0x30000000) {
+        printf("[SNITCH] ERROR: Reverse order failed\n");
+        errors++;
+    }
+
+    // =============================================
+    // Test 4: Alternating read/write pattern
+    // =============================================
+    printf("[SNITCH] Alternating read/write\n");
+
+    uint32_t accumulator = 0;
+    for (int i = 0; i < 8; i++) {
+        test_mem32[i] = 0x40000000 + i;
+        accumulator += test_mem32[i];
+    }
+
+    // Expected sum is 8 * 0x40000000 + (0 + 1 + ... + 7). It is computed in
+    // unsigned arithmetic so the wrap-around modulo 2^32 is well defined.
+    if (accumulator != 0x0000001Cu + (8u * 0x40000000u)) {
+        printf("[SNITCH] ERROR: Alternating pattern failed\n");
+        errors++;
+    }
+
+    // =============================================
+    // Test 5: Vector load/store 32-bit (vle32.v/vse32.v)
+    // =============================================
+    printf("[SNITCH] Vector load/store e32\n");
+
+    // Initialize pattern for vector operations
+    for (int i = 0; i < 16; i++) {
+        test_mem32[i] = 0x70000000 + (i * 0x11);
+    }
+
+    // Configure vector: vtype = e32, m1, ta, ma (vl=8)
+    asm volatile(
+        "li t0, 8\n"
+        "vsetvli zero, t0, e32, m1, ta, ma\n"
+        ::: "t0"
+    );
+
+    // Vector load from test_mem32[0:7] into v1
+    asm volatile(
+        "vle32.v v1, (%0)\n"
+        :: "r"(&test_mem32[0])
+        : "memory"
+    );
+
+    // Vector store from v1 to test_mem32[16:23]
+    asm volatile(
+        "vse32.v v1, (%0)\n"
+        :: "r"(&test_mem32[16])
+        : "memory"
+    );
+
+    // Verify vector store
+    for (int i = 0; i < 8; i++) {
+        if (test_mem32[16 + i] != test_mem32[i]) {
+            printf("[SNITCH] ERROR: Vector store failed at [%d], got 0x%08x expected 0x%08x\n",
+                   i, test_mem32[16 + i], test_mem32[i]);
+            errors++;
+        }
+    }
+
+    return errors;
+}
diff --git
a/spatz/sw/tasks/interconnect64_simple_task.c b/spatz/sw/tasks/interconnect64_simple_task.c
new file mode 100644
index 0000000..319d4c7
--- /dev/null
+++ b/spatz/sw/tasks/interconnect64_simple_task.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni
+ *
+ * 64-bit Interconnect Access Test Task for Spatz_cc
+ * !!! This task requires Spatz with RVD extension enabled !!!
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+
+// Memory in L1
+#define TEST_MEM64_BASE (L1_BASE + 0x00004000)
+#define test_mem64 ((volatile uint64_t *)TEST_MEM64_BASE)
+#define test_mem32 ((volatile uint32_t *)TEST_MEM64_BASE)
+
+/*
+ * Exercises 64-bit, 32-bit and mixed-width scalar accesses plus e64 vector
+ * load/store on the memory at TEST_MEM64_BASE. Every failed check is logged
+ * and counted.
+ *
+ * Returns 0 when all checks pass, otherwise the number of failed checks.
+ */
+int interconnect64_simple_task(void) {
+    int errors = 0;
+
+    printf("[SNITCH] Testing 64 bit accesses\n");
+    printf("[SNITCH] Using L1 memory at 0x%08x\n", TEST_MEM64_BASE);
+
+    // =============================================
+    // Test 1: 64-bit aligned stores and loads
+    // =============================================
+    printf("[SNITCH] 64-bit aligned store/load\n");
+
+    test_mem64[0] = 0x123456789ABCDEF0ULL;
+    test_mem64[1] = 0xFEDCBA9876543210ULL;
+    test_mem64[2] = 0xAAAAAAAABBBBBBBBULL;
+    test_mem64[3] = 0xCCCCCCCCDDDDDDDDULL;
+
+    if (test_mem64[0] != 0x123456789ABCDEF0ULL) {
+        printf("[SNITCH] ERROR: 64-bit store/load failed at [0]\n");
+        errors++;
+    }
+    if (test_mem64[1] != 0xFEDCBA9876543210ULL) {
+        printf("[SNITCH] ERROR: 64-bit store/load failed at [1]\n");
+        errors++;
+    }
+    if (test_mem64[2] != 0xAAAAAAAABBBBBBBBULL) {
+        printf("[SNITCH] ERROR: 64-bit store/load failed at [2]\n");
+        errors++;
+    }
+    if (test_mem64[3] != 0xCCCCCCCCDDDDDDDDULL) {
+        printf("[SNITCH] ERROR: 64-bit store/load failed at [3]\n");
+        errors++;
+    }
+
+    // =============================================
+    // Test 2: 32-bit stores to 64-bit memory
+    // =============================================
+    printf("[SNITCH] 32-bit stores to 64-bit region\n");
+
+    test_mem32[0] = 0x11111111;
+    test_mem32[1] = 0x22222222;
+    test_mem32[2] = 0x33333333;
+    test_mem32[3] = 0x44444444;
+
+    if (test_mem64[0] != 0x2222222211111111ULL) {
+        printf("[SNITCH] ERROR: 32-bit stores failed, got 0x%08x%08x\n",
+               (uint32_t)(test_mem64[0] >> 32), (uint32_t)test_mem64[0]);
+        errors++;
+    }
+    if (test_mem64[1] != 0x4444444433333333ULL) {
+        printf("[SNITCH] ERROR: 32-bit stores failed, got 0x%08x%08x\n",
+               (uint32_t)(test_mem64[1] >> 32), (uint32_t)test_mem64[1]);
+        errors++;
+    }
+
+    // =============================================
+    // Test 3: Mixed 32/64 access patterns
+    // =============================================
+    printf("[SNITCH] Mixed 32/64-bit access pattern\n");
+
+    test_mem64[4] = 0xDEADBEEFCAFEBABEULL;
+    uint32_t low_word = test_mem32[8];
+    uint32_t high_word = test_mem32[9];
+
+    if (low_word != 0xCAFEBABE || high_word != 0xDEADBEEF) {
+        printf("[SNITCH] ERROR: Mixed access failed, low=0x%08x high=0x%08x\n",
+               low_word, high_word);
+        errors++;
+    }
+
+    // =============================================
+    // Test 4: Sequential 64-bit loads (stress reqrsp64toobi32)
+    // =============================================
+    printf("[SNITCH] Sequential 64-bit loads\n");
+
+    for (int i = 0; i < 8; i++) {
+        test_mem64[i] = (uint64_t)i * 0x0101010101010101ULL;
+    }
+
+    uint64_t sum = 0;
+    for (int i = 0; i < 8; i++) {
+        sum += test_mem64[i];
+    }
+
+    uint64_t expected_sum = 0x1C1C1C1C1C1C1C1CULL;
+    if (sum != expected_sum) {
+        printf("[SNITCH] ERROR: Sequential load sum mismatch\n");
+        errors++;
+    }
+
+    // =============================================
+    // Test 5: Alternating 32/64 stores (stress tcdm64todualtcdm32)
+    // =============================================
+    printf("[SNITCH] Alternating 32/64-bit stores\n");
+
+    test_mem64[0] = 0x0000000000000000ULL;
+    test_mem32[0] = 0xAAAAAAAA; // Writes to low word of test_mem64[0]
+    test_mem64[1] = 0x5555555555555555ULL;
+    test_mem32[2] = 0xBBBBBBBB; // Writes to low word of test_mem64[1]
+
+    if (test_mem64[0] != 0x00000000AAAAAAAAULL) {
+        printf("[SNITCH] ERROR: Alternating store failed at [0], got 0x%08x%08x\n",
+               (uint32_t)(test_mem64[0] >> 32), (uint32_t)test_mem64[0]);
+        errors++;
+    }
+    if (test_mem64[1] != 0x55555555BBBBBBBBULL) {
+        printf("[SNITCH] ERROR: Alternating store failed at [1], got 0x%08x%08x\n",
+               (uint32_t)(test_mem64[1] >> 32), (uint32_t)test_mem64[1]);
+        errors++;
+    }
+
+    // =============================================
+    // Test 6: 32-bit accesses to odd word indices
+    // (each access is word-aligned, but not 64-bit aligned)
+    // =============================================
+    printf("[SNITCH] Unaligned 32-bit accesses\n");
+
+    test_mem32[1] = 0x12345678;
+    test_mem32[3] = 0x9ABCDEF0;
+    test_mem32[5] = 0xFEDCBA98;
+
+    if (test_mem32[1] != 0x12345678 || test_mem32[3] != 0x9ABCDEF0 ||
+        test_mem32[5] != 0xFEDCBA98) {
+        printf("[SNITCH] ERROR: Unaligned access failed\n");
+        errors++;
+    }
+
+    // =============================================
+    // Test 7: Vector load/store 64-bit (vle64.v/vse64.v)
+    // =============================================
+    printf("[SNITCH] Vector load/store e64\n");
+
+    // Initialize pattern for vector operations
+    for (int i = 0; i < 8; i++) {
+        test_mem64[i] = 0x7000000000000000ULL + ((uint64_t)i * 0x1111111111111111ULL);
+    }
+
+    // Configure vector: vtype = e64, m1, ta, ma (vl=4)
+    asm volatile(
+        "li t0, 4\n"
+        "vsetvli zero, t0, e64, m1, ta, ma\n"
+        ::: "t0"
+    );
+
+    // Vector load from test_mem64[0:3] into v2
+    asm volatile(
+        "vle64.v v2, (%0)\n"
+        :: "r"(&test_mem64[0])
+        : "memory"
+    );
+
+    // Vector store from v2 to test_mem64[8:11]
+    asm volatile(
+        "vse64.v v2, (%0)\n"
+        :: "r"(&test_mem64[8])
+        : "memory"
+    );
+
+    // Verify vector store
+    for (int i = 0; i < 4; i++) {
+        if (test_mem64[8 + i] != test_mem64[i]) {
+            printf("[SNITCH] ERROR: Vector store failed at [%d], got 0x%08x%08x expected 0x%08x%08x\n",
+                   i, (uint32_t)(test_mem64[8 + i] >> 32), (uint32_t)test_mem64[8 + i],
+                   (uint32_t)(test_mem64[i] >> 32), (uint32_t)test_mem64[i]);
+            errors++;
+        }
+    }
+
+    return errors;
+}
diff --git a/spatz/sw/tasks/matmul16_task.c b/spatz/sw/tasks/matmul16_task.c
new file mode 100644
index 0000000..93c4a08
--- /dev/null
+++ b/spatz/sw/tasks/matmul16_task.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni
+ *
+ * FP16 Matrix Multiplication using RISC-V Vector Extension
+ * Computes C = A × B where C is M×K, A is M×N, B is N×K
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "spatz_workers.h"
+
+// DATA register address for reading parameter pointer
+#define DATA_ADDR (SPATZ_DATA)
+
+// Parameter structure in shared L1 memory
+typedef struct {
+    uint32_t a_addr;  // Matrix A address
+    uint32_t b_addr;  // Matrix B address
+    uint32_t c_addr;  // Matrix C address (result)
+    uint32_t m_size;  // Number of rows in A and C
+    uint32_t n_size;  // Number of columns in A, rows in B
+    uint32_t k_size;  // Number of columns in B and C
+} matmul_params_t;
+
+// Task entry point: runs fmatmul_v16b on the operands described by the parameter block; always returns 0.
+int matmul16_task(void) {
+
+    // Read parameter structure pointer from DATA register
+    uint32_t params_ptr = mmio32(DATA_ADDR);
+    volatile matmul_params_t *params = (volatile matmul_params_t *)params_ptr;
+
+    float16_t *A = (float16_t *)params->a_addr;
+    float16_t *B = (float16_t *)params->b_addr;
+    float16_t *C = (float16_t *)params->c_addr;
+    uint32_t M = params->m_size;
+    uint32_t N = params->n_size;
+    uint32_t K = params->k_size;
+
+    // Compute matrix multiplication using Spatz benchmark kernel
+    // C[M×K] = A[M×N] × B[N×K]
+    fmatmul_v16b(C, A, B, M, N, K);
+
+    return 0;
+}
diff --git a/spatz/sw/tasks/matvec16_task.c b/spatz/sw/tasks/matvec16_task.c
new file mode 100644
index 0000000..690154f
--- /dev/null
+++ b/spatz/sw/tasks/matvec16_task.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni
+ *
+ * FP16 Matrix-Vector Multiplication (GEMV)
+ * Based on PLAY library: https://github.com/FondazioneChipsIT/PLAY
+ * source/linalg_gemv/arch/linalg_gemv_spatz.c
+ *
+ * Computes: dst = α·mat·vec_x + β·vec_y
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "spatz_workers.h"
+
+#define DATA_ADDR (SPATZ_DATA)
+/* Parameter block located via the pointer read from SPATZ_DATA; alpha/beta hold raw FP16 bit patterns. */
+typedef struct __attribute__((packed)) {
+    uint32_t mat_addr;
+    uint32_t vec_x_addr;
+    uint32_t vec_y_addr;
+    uint32_t dst_addr;
+    uint16_t alpha;
+    uint16_t beta;
+    uint32_t m_size;
+    uint32_t n_size;
+} matvec_params_t;
+
+/* Helper functions adapted from PLAY library */
+/* Store val into vec[0..len), strip-mined at e16/m8. Always returns 0. */
+static inline int vector_set_all_spatz(float16_t *vec, const float16_t val, const int len)
+{
+    float16_t *v;
+    size_t avl;
+    size_t vl;
+
+    avl = len;
+    v = vec;
+
+    asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+    asm volatile ("vfmv.v.f v0, %0" :: "f"(val)); // splat once; the loop below only stores v0
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        asm volatile ("vse16.v v0, (%0)" :: "r"(v) : "memory");
+        v += vl;
+    }
+
+    return 0;
+}
+/* Copy len FP16 elements from src to dst (note the src-first argument order). Always returns 0. */
+static inline int vector_memcpy_spatz(const float16_t *src, float16_t *dst, const int len)
+{
+    size_t avl;
+    size_t vl;
+    const float16_t *p_src;
+    float16_t *p_dst;
+
+    avl = len;
+    p_src = src;
+    p_dst = dst;
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        asm volatile ("vle16.v v0, (%0)" :: "r"(p_src) : "memory");
+        asm volatile ("vse16.v v0, (%0)" :: "r"(p_dst) : "memory");
+        p_src += vl;
+        p_dst += vl;
+    }
+
+    return 0;
+}
+/* dst[i] = src[i] * val for i in [0, len), strip-mined at e16/m8. Always returns 0. */
+static inline int vector_scale_spatz(const float16_t *src, const float16_t val, float16_t *dst, const int len)
+{
+    size_t avl;
+    size_t vl;
+    const float16_t *p_src;
+    float16_t *p_dst;
+
+    avl = len;
+    p_src = src;
+    p_dst = dst;
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        asm volatile ("vle16.v v0, (%0)" :: "r"(p_src) : "memory");
+        asm volatile ("vfmul.vf v8, v0, %0" :: "f"(val));
+        asm volatile ("vse16.v v8, (%0)" :: "r"(p_dst) : "memory");
+        p_src += vl;
+        p_dst += vl;
+    }
+
+    return 0;
+}
+
+/* Main GEMV function - direct translation from PLAY linalg_gemv_spatz.c */
+/* dst[0..dim_M) = alpha * mat * vec_x + beta * vec_y, with mat stored row-major as dim_M x dim_N. Always returns 0. */
+static int linalg_gemv_spatz_serial(
+    const float16_t *mat,
+    const float16_t *vec_x,
+    const float16_t *vec_y,
+    const float16_t alpha,
+    const float16_t beta,
+    float16_t *dst,
+    const int dim_M,
+    const int dim_N)
+{
+    float16_t ZERO_f = 0.0f;
+
+    const float16_t *col_m;
+    float16_t *p_dst;
+    float16_t elem_x;
+
+    size_t stride;
+    size_t avl;
+    size_t vl;
+
+    // Compare alpha/beta by FP16 bit pattern (0x0000 = +0.0, 0x3C00 = 1.0). Per the original author's
+    // note, a direct (alpha == ZERO_f) comparison fails here. The union pun avoids a strict-aliasing cast.
+    const union { float16_t f; uint16_t u; } alpha_pun = { .f = alpha }, beta_pun = { .f = beta };
+    const uint16_t alpha_bits = alpha_pun.u, beta_bits = beta_pun.u;
+    const uint16_t zero_bits = 0x0000;
+    const uint16_t one_bits = 0x3C00;
+
+    if (alpha_bits == zero_bits) {
+        if (beta_bits == zero_bits)
+            vector_set_all_spatz(dst, ZERO_f, dim_M);
+        else if (beta_bits == one_bits)
+            vector_memcpy_spatz(vec_y, dst, dim_M);
+        else
+            vector_scale_spatz(vec_y, beta, dst, dim_M);
+        return 0;
+    }
+
+    if (beta_bits == zero_bits)
+        vector_set_all_spatz(dst, ZERO_f, dim_M);
+    else if (beta_bits == one_bits)
+        vector_memcpy_spatz(vec_y, dst, dim_M);
+    else
+        vector_scale_spatz(vec_y, beta, dst, dim_M);
+
+    avl = dim_M;
+    p_dst = dst;
+    stride = dim_N * sizeof(float16_t); // byte distance between consecutive rows of one column
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        asm volatile ("vfmv.v.f v0, %0" :: "f"(ZERO_f));
+        asm volatile ("vle16.v v24, (%0)" :: "r"(p_dst) : "memory"); // v24 = beta * vec_y for this strip
+
+        for (int n = 0; n < dim_N; n++) {
+            col_m = mat + n;
+            elem_x = vec_x[n];
+            asm volatile ("vlse16.v v8, (%0), %1" :: "r"(col_m), "r"(stride) : "memory");
+            asm volatile ("vfmacc.vf v0, %0, v8" :: "f"(elem_x));
+        }
+
+        asm volatile ("vfmul.vf v0, v0, %0" :: "f"(alpha));
+        asm volatile ("vfadd.vv v0, v24, v0");
+        asm volatile ("vse16.v v0, (%0)" :: "r"(p_dst) : "memory");
+
+        p_dst += vl;
+        mat += vl * dim_N; // the next strip covers the next vl rows of mat
+    }
+
+    return 0;
+}
+
+/* Task entry point */
+/* Reads the parameter block through the pointer in SPATZ_DATA and runs the GEMV; always returns 0. */
+int matvec16_task(void) {
+
+    uint32_t params_ptr = mmio32(DATA_ADDR);
+    volatile matvec_params_t *params = (volatile matvec_params_t *)params_ptr;
+
+    float16_t *mat = (float16_t *)params->mat_addr;
+    float16_t *vec_x = (float16_t *)params->vec_x_addr;
+    float16_t *vec_y = (float16_t *)params->vec_y_addr;
+    float16_t *dst = (float16_t *)params->dst_addr;
+    uint32_t M = params->m_size;
+    uint32_t N = params->n_size;
+
+    // Bitcast uint16_t to float16_t
+    union { uint16_t u; float16_t f; } alpha_u, beta_u;
+    alpha_u.u = params->alpha;
+    beta_u.u = params->beta;
+
+    linalg_gemv_spatz_serial(mat, vec_x, vec_y, alpha_u.f, beta_u.f, dst, M, N);
+
+    return 0;
+}
diff --git a/spatz/sw/tasks/matvectrans16_task.c b/spatz/sw/tasks/matvectrans16_task.c
new file mode 100644
index 0000000..4babd22
--- /dev/null
+++ b/spatz/sw/tasks/matvectrans16_task.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni
+ *
+ * FP16 Transposed Matrix-Vector Multiplication (GEMV Transposed)
+ * Based on PLAY library: https://github.com/FondazioneChipsIT/PLAY
+ * source/linalg_gemv_trans/arch/linalg_gemv_trans_spatz.c
+ *
+ * Computes: dst = α·mat^T·vec_x + β·vec_y
+ */
+
+#include <stdint.h>
+#include "magia_tile_utils.h"
+#include "magia_spatz_utils.h"
+#include "spatz_workers.h"
+
+#define DATA_ADDR (SPATZ_DATA)
+/* Parameter block located via the pointer read from SPATZ_DATA; alpha/beta hold raw FP16 bit patterns. */
+typedef struct __attribute__((packed)) {
+    uint32_t mat_addr;
+    uint32_t vec_x_addr;
+    uint32_t vec_y_addr;
+    uint32_t dst_addr;
+    uint16_t alpha;
+    uint16_t beta;
+    uint32_t m_size;
+    uint32_t n_size;
+} matvectrans_params_t;
+
+/* Helper functions adapted from PLAY library */
+/* Store val into vec[0..len), strip-mined at e16/m8. Always returns 0. */
+static inline int vector_set_all_spatz(float16_t *vec, const float16_t val, const int len)
+{
+    float16_t *v;
+    size_t avl;
+    size_t vl;
+
+    avl = len;
+    v = vec;
+
+    asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+    asm volatile ("vfmv.v.f v0, %0" :: "f"(val)); // splat once; the loop below only stores v0
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        asm volatile ("vse16.v v0, (%0)" :: "r"(v) : "memory");
+        v += vl;
+    }
+
+    return 0;
+}
+/* Copy len FP16 elements from src to dst (note the src-first argument order). Always returns 0. */
+static inline int vector_memcpy_spatz(const float16_t *src, float16_t *dst, const int len)
+{
+    size_t avl;
+    size_t vl;
+    const float16_t *p_src;
+    float16_t *p_dst;
+
+    avl = len;
+    p_src = src;
+    p_dst = dst;
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        asm volatile ("vle16.v v0, (%0)" :: "r"(p_src) : "memory");
+        asm volatile ("vse16.v v0, (%0)" :: "r"(p_dst) : "memory");
+        p_src += vl;
+        p_dst += vl;
+    }
+
+    return 0;
+}
+/* dst[i] = src[i] * val for i in [0, len), strip-mined at e16/m8. Always returns 0. */
+static inline int vector_scale_spatz(const float16_t *src, const float16_t val, float16_t *dst, const int len)
+{
+    size_t avl;
+    size_t vl;
+    const float16_t *p_src;
+    float16_t *p_dst;
+
+    avl = len;
+    p_src = src;
+    p_dst = dst;
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        asm volatile ("vle16.v v0, (%0)" :: "r"(p_src) : "memory");
+        asm volatile ("vfmul.vf v8, v0, %0" :: "f"(val));
+        asm volatile ("vse16.v v8, (%0)" :: "r"(p_dst) : "memory");
+        p_src += vl;
+        p_dst += vl;
+    }
+
+    return 0;
+}
+
+/* Main GEMV Transposed function - direct translation from PLAY linalg_gemv_trans_spatz.c */
+/* dst[0..dim_N) = alpha * mat^T * vec_x + beta * vec_y, with mat stored row-major as dim_M x dim_N. Always returns 0. */
+static int linalg_gemv_trans_spatz_serial(
+    const float16_t *mat,
+    const float16_t *vec_x,
+    const float16_t *vec_y,
+    const float16_t alpha,
+    const float16_t beta,
+    float16_t *dst,
+    const int dim_M,
+    const int dim_N)
+{
+    float16_t ZERO_f = 0.0f;
+
+    const float16_t *row_m;
+    float16_t *p_dst;
+    float16_t elem_x;
+
+    size_t avl;
+    size_t vl;
+
+    // Compare alpha/beta by FP16 bit pattern (0x0000 = +0.0, 0x3C00 = 1.0). Per the original author's
+    // note, a direct (alpha == ZERO_f) comparison fails here. The union pun avoids a strict-aliasing cast.
+    const union { float16_t f; uint16_t u; } alpha_pun = { .f = alpha }, beta_pun = { .f = beta };
+    const uint16_t alpha_bits = alpha_pun.u, beta_bits = beta_pun.u;
+    const uint16_t zero_bits = 0x0000;
+    const uint16_t one_bits = 0x3C00;
+
+    if (alpha_bits == zero_bits) {
+        if (beta_bits == zero_bits)
+            vector_set_all_spatz(dst, ZERO_f, dim_N);
+        else if (beta_bits == one_bits)
+            vector_memcpy_spatz(vec_y, dst, dim_N);
+        else
+            vector_scale_spatz(vec_y, beta, dst, dim_N);
+        return 0;
+    }
+
+    if (beta_bits == zero_bits)
+        vector_set_all_spatz(dst, ZERO_f, dim_N);
+    else if (beta_bits == one_bits)
+        vector_memcpy_spatz(vec_y, dst, dim_N);
+    else
+        vector_scale_spatz(vec_y, beta, dst, dim_N);
+
+    avl = dim_N;
+    p_dst = dst;
+
+    for (; avl > 0; avl -= vl) {
+        asm volatile ("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl));
+        asm volatile ("vfmv.v.f v0, %0" :: "f"(ZERO_f));
+        asm volatile ("vle16.v v24, (%0)" :: "r"(p_dst) : "memory"); // v24 = beta * vec_y for this strip
+
+        for (int m = 0; m < dim_M; m++) {
+            row_m = mat + (m * dim_N);
+            elem_x = vec_x[m];
+
+            asm volatile ("vle16.v v8, (%0)" :: "r"(row_m) : "memory");
+            asm volatile ("vfmacc.vf v0, %0, v8" :: "f"(elem_x));
+        }
+
+        asm volatile ("vfmul.vf v0, v0, %0" :: "f"(alpha));
+        asm volatile ("vfadd.vv v0, v24, v0");
+        asm volatile ("vse16.v v0, (%0)" :: "r"(p_dst) : "memory");
+
+        p_dst += vl;
+        mat += vl; // the next strip covers the next vl columns of every row
+    }
+
+    return 0;
+}
+
+/* Task entry point */
+/* Reads the parameter block through the pointer in SPATZ_DATA and runs the transposed GEMV; always returns 0. */
+int matvectrans16_task(void) {
+
+    uint32_t params_ptr = mmio32(DATA_ADDR);
+    volatile matvectrans_params_t *params = (volatile matvectrans_params_t *)params_ptr;
+
+    float16_t *mat = (float16_t *)params->mat_addr;
+    float16_t *vec_x = (float16_t *)params->vec_x_addr;
+    float16_t *vec_y = (float16_t *)params->vec_y_addr;
+    float16_t *dst = (float16_t *)params->dst_addr;
+    uint32_t M = params->m_size;
+    uint32_t N = params->n_size;
+
+    // Bitcast uint16_t to float16_t
+    union { uint16_t u; float16_t f; } alpha_u, beta_u;
+    alpha_u.u = params->alpha;
+    beta_u.u = params->beta;
+
+    linalg_gemv_trans_spatz_serial(mat, vec_x, vec_y, alpha_u.f, beta_u.f, dst, M, N);
+
+    return 0;
+}
diff --git a/spatz/sw/tasks/vecsum16_task.c b/spatz/sw/tasks/vecsum16_task.c
new file mode 100644
index 0000000..fda1201
--- /dev/null
+++ b/spatz/sw/tasks/vecsum16_task.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * + * FP16 Vector-Vector Sum using RISC-V Vector Extension + * Computes Z = X + Y where X, Y, Z are N×1 vectors + */ + +#include <stdint.h> +#include "magia_tile_utils.h" +#include "magia_spatz_utils.h" +#include "spatz_workers.h" + +// DATA register address for reading parameter pointer +#define DATA_ADDR (SPATZ_DATA) + +// Parameter structure in shared L1 memory +typedef struct { + uint32_t x_addr; // Vector X address [N×1] + uint32_t y_addr; // Vector Y address [N×1] + uint32_t z_addr; // Result Z address [N×1] + uint32_t n_size; // Vector length N +} vecsum_params_t; + +// Main entry point +int vecsum16_task(void) { + + // Read parameter structure pointer from DATA register + uint32_t params_ptr = mmio32(DATA_ADDR); + volatile vecsum_params_t *params = (volatile vecsum_params_t *)params_ptr; + + const float16_t *X = (const float16_t *)params->x_addr; + const float16_t *Y = (const float16_t *)params->y_addr; + float16_t *Z = (float16_t *)params->z_addr; + uint32_t N = params->n_size; + + // Vector addition: Z[N] = X[N] + Y[N] + // Use stripmine loop for large vectors + unsigned int avl = N; + unsigned int vl; + + do { + // Set vector length + asm volatile("vsetvli %0, %1, e16, m8, ta, ma" : "=r"(vl) : "r"(avl)); + + // Load X and Y vectors + asm volatile("vle16.v v0, (%0)" ::"r"(X)); + asm volatile("vle16.v v8, (%0)" ::"r"(Y)); + + // Vector floating-point add: v16 = v0 + v8 + asm volatile("vfadd.vv v16, v0, v8"); + + // Store result + asm volatile("vse16.v v16, (%0)" ::"r"(Z)); + + // Bump pointers + X += vl; + Y += vl; + Z += vl; + avl -= vl; + } while (avl > 0); + + return 0; +} diff --git a/sw/kernel/link.ld b/sw/kernel/link.ld index bad56d5..420dd19 100644 --- a/sw/kernel/link.ld +++ b/sw/kernel/link.ld @@ -22,7 +22,7 @@ MEMORY { instrram : ORIGIN = 0xcc000000, LENGTH = 0x8000 dataram : ORIGIN = 0xcc010000, LENGTH = 0xF00000 - stack : ORIGIN = 0x00010000, LENGTH = 0x10000 /* 64K, maybe it's a bit
too much (?) */ + stack : ORIGIN = 0x00010000, LENGTH = 0xC000 /* 48K - CV32 stack only */ } /* Stack information variables */ @@ -61,6 +61,19 @@ SECTIONS _endtext = .; } > instrram + /* Spatz embedded binary - inlined from header file */ + /* Positioned right after CV32 .text in instrram */ + + . = ALIGN(4); + _spatz_binary_start = .; + + .spatz_binary : { + KEEP(*(.spatz_binary)) + } > instrram + + _spatz_binary_end = .; + _spatz_binary_size = _spatz_binary_end - _spatz_binary_start; + /*--------------------------------------------------------------------*/ /* Global constructor/destructor segment */ /*--------------------------------------------------------------------*/ diff --git a/sw/tests/spatz_tests/atomic_mesh_spatz_test.c b/sw/tests/spatz_tests/atomic_mesh_spatz_test.c new file mode 100644 index 0000000..2036d78 --- /dev/null +++ b/sw/tests/spatz_tests/atomic_mesh_spatz_test.c @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * + * MAGIA Mesh Atomic Test with Spatz (Snitch) + * + */ + +#include "magia_tile_utils.h" +#include "magia_utils.h" +#include "magia_spatz_utils.h" +#include "event_unit_utils.h" +#include "atomic_mesh_spatz_test_task_bin.h" + +#define SYNC_BASE (L1_BASE + 0x00010000) // L1 base where counters are stored +#define PARAM_BASE (L1_BASE + 0x00010100) // Parameters area (separate from counter) + +// Parameter structure for Spatz task +typedef struct { + uint32_t hartid; + uint32_t counter_addr; + uint32_t sync_base; +} atomic_params_t; + +int main(void) { + uint32_t hartid = get_hartid(); + + printf("[Tile %d] Starting Mesh Atomic Test with Spatz\n", hartid); + + // Initialize Event Unit + eu_init(); + eu_enable_events(EU_SPATZ_DONE_MASK); + + // Initialize all counters to 0 across mesh + printf("[Tile %d] Initializing counter to 0\n", hartid); + uint32_t my_counter_addr = SYNC_BASE + hartid * L1_TILE_OFFSET; + mmio32(my_counter_addr) = 0; + + // Setup parameters in L1 - each tile uses its own L1 region + uint32_t param_base = PARAM_BASE + hartid * L1_TILE_OFFSET; + volatile atomic_params_t *params = (volatile atomic_params_t *)param_base; + params->hartid = hartid; + params->counter_addr = my_counter_addr; + params->sync_base = SYNC_BASE; + + printf("[Tile %d] Initializing Spatz with atomic task\n", hartid); + + // Initialize Spatz with atomic remote task binary + spatz_init(SPATZ_BINARY_START); + + printf("[Tile %d] Starting Spatz task\n", hartid); + + spatz_pass_params(param_base); + spatz_run_task(ATOMIC_REMOTE_TASK); + + + printf("[Tile %d] Waiting for Spatz completion...\n", hartid); + + // Wait for Spatz done event using WFE + eu_wait_spatz_wfe(EU_SPATZ_DONE_MASK); + + printf("[Tile %d] Spatz completed! 
Checking result...\n", hartid); + + // Check Spatz exit code (0 = pass, 1 = fail) + uint32_t exit_code = spatz_get_exit_code(); + + // Read final counter value + uint32_t final_count = mmio32(my_counter_addr); + uint32_t expected = 6; // NUM_ITER (2) * NUM_TARGETS (3) from task + + printf("[Tile %d] Counter: %d (expected %d), exit_code: %d\n", + hartid, final_count, expected, exit_code); + + if (exit_code == 0 && final_count == expected) { + printf("[Tile %d] [PASS] Test completed successfully!\n", hartid); + return 0; + } else { + printf("[Tile %d] [FAIL] Test failed!\n", hartid); + return 1; + } +} diff --git a/sw/tests/spatz_tests/dotp_mesh_spatz_test.c b/sw/tests/spatz_tests/dotp_mesh_spatz_test.c new file mode 100644 index 0000000..d8a97f5 --- /dev/null +++ b/sw/tests/spatz_tests/dotp_mesh_spatz_test.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * + * MAGIA Mesh Test - Spatz Dot Product + * + */ + +#include "magia_tile_utils.h" +#include "magia_utils.h" +#include "magia_spatz_utils.h" +#include "event_unit_utils.h" +#include "idma_mm_utils.h" +#include "dotp_mesh_spatz_test_task_bin.h" + +// Test configuration +#define DOTP_SIZE 1024 // Elements per tile +#define DIFF_TH (0x0011) // Threshold for FP16 comparison + +// FP16 test pattern: A[i] = 1.0, B[i] = 2.0 +// Expected result: 1024 * 1.0 * 2.0 = 2048.0 +typedef uint16_t fp16_t; +static const fp16_t dotp_a_value = 0x3C00; // 1.0 in FP16 +static const fp16_t dotp_b_value = 0x4000; // 2.0 in FP16 +static const fp16_t dotp_golden = 0x6800; // 2048.0 in FP16 + +#define A_VEC_BASE (L1_BASE + 0x00010000) +#define B_VEC_BASE (L1_BASE + 0x00012000) +#define RESULT_BASE (L1_BASE + 0x00014000) + +#define T_BASE (L2_BASE + 0x0004A000) +#define L2_RESULTS_BASE (L2_BASE + 0x00050000) +#define MHARTID_OFFSET (0x00010000) + +#define DOTP_PARAM_BASE (L1_BASE + 0x00015000) + +// Parameter structure for Spatz task +typedef struct { + uint32_t a_addr; // Vector A address + uint32_t b_addr; // Vector B address + uint32_t result_addr; // Result address (scalar) + uint32_t n_elements; // Number of elements +} dotp_params_t; + +int main(void) { + uint32_t my_hartid = get_hartid(); + uint32_t l2_temp = T_BASE + my_hartid * MHARTID_OFFSET; + uint32_t transfer_id; + + // Transfer vector A to L1 through iDMA + printf("Initializing vector A through iDMA...\n"); + for (unsigned int i = 0; i < DOTP_SIZE; i++) { + mmio16(l2_temp + 2*i) = dotp_a_value; + } + transfer_id = idma_L2ToL1(l2_temp, A_VEC_BASE + my_hartid * L1_TILE_OFFSET, DOTP_SIZE * 2); + dma_wait(transfer_id); + + // Transfer vector B to L1 through iDMA + printf("Initializing vector B through iDMA...\n"); + for (unsigned int i = 0; i < DOTP_SIZE; i++) { + mmio16(l2_temp + 2*i) = dotp_b_value; + } + transfer_id = idma_L2ToL1(l2_temp, B_VEC_BASE + 
my_hartid * L1_TILE_OFFSET, DOTP_SIZE * 2); + dma_wait(transfer_id); + + // Initialize Event Unit + printf("Initializing Event Unit and Spatz...\n"); + eu_init(); + eu_enable_events(EU_SPATZ_DONE_MASK); + + // Initialize Spatz BEFORE writing parameters + spatz_init(SPATZ_BINARY_START); + + // Write parameters to tile-specific L1 using L1_TILE_OFFSET + uint32_t param_base = DOTP_PARAM_BASE + my_hartid * L1_TILE_OFFSET; + volatile dotp_params_t *params = (volatile dotp_params_t *)param_base; + params->a_addr = A_VEC_BASE + my_hartid * L1_TILE_OFFSET; + params->b_addr = B_VEC_BASE + my_hartid * L1_TILE_OFFSET; + params->result_addr = RESULT_BASE + my_hartid * L1_TILE_OFFSET; + params->n_elements = DOTP_SIZE; + + printf("Starting Spatz dot product...\n"); + + spatz_pass_params(param_base); + spatz_run_task(DOTP_TASK); + // Wait for Spatz completion + eu_wait_spatz_wfe(EU_SPATZ_DONE_MASK); + + printf("Spatz task completed.\n"); + + // Check Spatz exit code + uint32_t exit_code = spatz_get_exit_code(); + if (exit_code != 0) { + printf("ERROR: Spatz exit code = 0x%08x\n", exit_code); + } + + // Move result from L1 to L2 using iDMA + printf("Moving result to L2 through iDMA...\n"); + uint32_t l1_result_addr = RESULT_BASE + my_hartid * L1_TILE_OFFSET; + uint32_t l2_result_addr = L2_RESULTS_BASE + my_hartid * MHARTID_OFFSET; + transfer_id = idma_L1ToL2(l1_result_addr, l2_result_addr, 4); + dma_wait(transfer_id); + + // Read result from L2 + fp16_t result = mmio16(l2_result_addr); + + uint16_t computed = result; + uint16_t expected = dotp_golden; + uint16_t diff = (computed > expected) ? 
(computed - expected) : (expected - computed); + + unsigned int num_errors = 0; + if (diff > DIFF_TH) { + num_errors++; + printf("ERROR: Tile %d result (=0x%04x) != Golden (=0x%04x), diff=0x%04x\n", + my_hartid, computed, expected, diff); + } else { + printf("OK: Tile %d result matches golden value\n", my_hartid); + } + + printf("\nFinished test with %u error(s)\n", num_errors); + + return num_errors; +} diff --git a/sw/tests/spatz_tests/hello_mesh_spatz_test.c b/sw/tests/spatz_tests/hello_mesh_spatz_test.c new file mode 100644 index 0000000..bc29189 --- /dev/null +++ b/sw/tests/spatz_tests/hello_mesh_spatz_test.c @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * + * MAGIA Hello Mesh Spatz Test + * Simple test running Hello World on Spatz across multiple tiles + * + */ + +#include "magia_tile_utils.h" +#include "magia_utils.h" +#include "magia_spatz_utils.h" +#include "event_unit_utils.h" +#include "hello_mesh_spatz_test_task_bin.h" + +// Parameter structure +typedef struct { + uint32_t tile_id; +} hello_params_t; + +int main(void) { + + eu_init(); + eu_enable_events(EU_SPATZ_DONE_MASK); + + // Write parameters in L1 - each tile uses its own L1 region with offset + uint32_t my_hartid = get_hartid(); + uint32_t param_base = L1_BASE + 0x00010000 + my_hartid * L1_TILE_OFFSET; + volatile hello_params_t *params = (volatile hello_params_t *)param_base; + params->tile_id = my_hartid; + + spatz_init(SPATZ_BINARY_START); + + printf("Starting Spatz Hello World task (tile_id=%d)...\n", my_hartid); + + spatz_pass_params(param_base); + spatz_run_task(HELLO_TASK); + + // Wait for Spatz completion using WFE + printf("Waiting for Spatz completion...\n"); + eu_wait_spatz_wfe(EU_SPATZ_DONE_MASK); + + printf("Spatz task completed successfully!\n"); + + return 0; +} + diff --git a/sw/tests/spatz_tests/matmul_compare_spatz_test.c b/sw/tests/spatz_tests/matmul_compare_spatz_test.c new file mode 100644 index 0000000..c6dc381 --- /dev/null +++ b/sw/tests/spatz_tests/matmul_compare_spatz_test.c @@ -0,0 +1,195 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * + * Matrix Multiplication Comparison Test: RedMule vs Spatz + */ + +#include <stdint.h> +#include "magia_tile_utils.h" +#include "magia_spatz_utils.h" +#include "redmule_mm_utils.h" +#include "event_unit_utils.h" +#include "matmul_compare_spatz_test_task_bin.h" + +// Golden model test data headers +#include "x_input.h" +#include "w_input.h" +#include "y_input.h" +#include "z_output.h" + +// Matrix dimensions (from golden model headers) +#define M_SIZE (96) +#define N_SIZE (64) +#define K_SIZE (64) + +// Memory layout in L1 +#define X_BASE (L1_BASE + 0x00012048) +#define W_BASE (L1_BASE + 0x00016048) +#define Y_BASE (L1_BASE + 0x0001A048) +#define Z_BASE (L2_BASE + 0x00042000) + +#define A_BASE X_BASE +#define B_BASE W_BASE +#define C_REDMULE_BASE Y_BASE +#define C_SPATZ_BASE (L1_BASE + 0x00023000) +#define C_SCALAR_BASE Z_BASE + +// Spatz parameter area in shared L1 +#define MATMUL_PARAM_BASE (L1_BASE + 0x00010000) + +// Parameter structure for Spatz task (matches matmul16_task.c) +typedef struct { + uint32_t a_addr; // Matrix A address + uint32_t b_addr; // Matrix B address + uint32_t c_addr; // Matrix C address (result) + uint32_t m_size; // Number of rows in A and C + uint32_t n_size; // Number of columns in A, rows in B + uint32_t k_size; // Number of columns in B and C +} matmul_params_t; + +#define DIFF_TH (0x0011) + +typedef uint16_t fp16_t; + +int main(void) { + // Enable cycle counter (generic for both CV32E40P and CV32E40X) + ccount_en(); + + + // Initialize input matrices from golden model headers + printf("Initializing matrices from golden model...\n"); + + // X (matrix A) + for (int i = 0; i < M_SIZE*N_SIZE; i++) + mmio16(X_BASE + 2*i) = x_inp[i]; + + // W (matrix B) + for (int i = 0; i < N_SIZE*K_SIZE; i++) + mmio16(W_BASE + 2*i) = w_inp[i]; + + // Y (RedMule output) - initialize to
ZERO (no accumulation, same as Spatz) + for (int i = 0; i < M_SIZE*K_SIZE; i++) + mmio16(Y_BASE + 2*i) = 0; + + printf("Matrices initialized.\n\n"); + + // ===================================================== + // Test 1: RedMule HWPE Accelerator + // ===================================================== + + // Initialize and configure RedMulE + hwpe_cg_enable(); + hwpe_soft_clear(); + + // Acquire RedMule job + int offload_id_tmp; + while ((offload_id_tmp = hwpe_acquire_job()) < 0); + + // Configure RedMulE with matrices + redmule_cfg((unsigned int)A_BASE, (unsigned int)B_BASE, (unsigned int)C_REDMULE_BASE, + M_SIZE, N_SIZE, K_SIZE, + (uint8_t)gemm_ops, (uint8_t)Float16, (uint8_t)Float16); + + printf("Starting RedMulE computation...\n"); + + uint32_t redmule_start = get_cycle(); + + // Trigger job + hwpe_trigger_job(); + + // Wait for HWPE completion using the polling method + hwpe_wait_for_completion(); + + uint32_t redmule_cycles = get_cycle() - redmule_start; + + printf("RedMule cycles: %u\n", redmule_cycles); + printf("RedMule completed.\n\n"); + + // ===================================================== + // Test 2: Spatz Vector Processor + // ===================================================== + + // Initialize Event Unit and Spatz + eu_init(); + eu_enable_events(EU_SPATZ_DONE_MASK); + + printf("Initializing Spatz...\n"); + spatz_init(SPATZ_BINARY_START); + + // Write parameters to shared L1 memory for Spatz task + volatile matmul_params_t *params = (volatile matmul_params_t *)MATMUL_PARAM_BASE; + params->a_addr = A_BASE; + params->b_addr = B_BASE; + params->c_addr = C_SPATZ_BASE; + params->m_size = M_SIZE; + params->n_size = N_SIZE; + params->k_size = K_SIZE; + + printf("Initializing Spatz output matrix to zero...\n"); + for (uint32_t i = 0; i < M_SIZE * K_SIZE; i++) { + mmio16(C_SPATZ_BASE + 2*i) = 0; + } + + printf("Starting Spatz computation...\n"); + + uint32_t spatz_start = get_cycle(); + spatz_pass_params(MATMUL_PARAM_BASE); + 
spatz_run_task(MATMUL16_TASK); + + eu_wait_spatz_polling(EU_SPATZ_DONE_MASK); + + uint32_t spatz_cycles = get_cycle() - spatz_start; + + printf("Spatz cycles: %u\n\n", spatz_cycles); + + // ===================================================== + // Result Verification + // ===================================================== + + unsigned int mismatch_errors = 0; + uint16_t redmule_val, spatz_val, diff; + + // Verify that RedMule and Spatz produce IDENTICAL results + printf("Comparing RedMule vs Spatz results...\n"); + for(int i = 0; i < M_SIZE*K_SIZE; i++){ + redmule_val = mmio16(C_REDMULE_BASE + 2*i); + spatz_val = mmio16(C_SPATZ_BASE + 2*i); + diff = (redmule_val > spatz_val) ? (redmule_val - spatz_val) : (spatz_val - redmule_val); + if(diff > DIFF_TH){ + mismatch_errors++; + if(mismatch_errors <= 10) // Print first 10 errors only + printf(" ERROR: RedMule[%d](=0x%04x) != Spatz[%d](=0x%04x), diff=0x%04x\n", + i, redmule_val, i, spatz_val, diff); + } + } + + printf("\n"); + if (mismatch_errors == 0) { + printf("SUCCESS: RedMule and Spatz produce IDENTICAL results!\n"); + } else { + printf("FAILURE: RedMule and Spatz differ in %d elements\n", mismatch_errors); + } + + printf("\n"); + printf("Performance Summary:\n"); + printf("RedMule cycles: %u\n", redmule_cycles); + printf("Spatz cycles: %u\n", spatz_cycles); + + return mismatch_errors; + +} diff --git a/sw/tests/spatz_tests/simple_spatz_test.c b/sw/tests/spatz_tests/simple_spatz_test.c new file mode 100644 index 0000000..bfe5c7f --- /dev/null +++ b/sw/tests/spatz_tests/simple_spatz_test.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * + * Spatz Test - CV32 launches Spatz and waits with WFE + * + */ + +#include "magia_tile_utils.h" +#include "magia_spatz_utils.h" +#include "event_unit_utils.h" +#include "simple_spatz_test_task_bin.h" + +int main(void) { + + int errors = 0; + + printf("[CV32] Spatz Test:\n"); + + // ========================================== + // Initialization of Event Unit and Spatz + // ========================================== + + // Initialize Event Unit + eu_init(); + eu_enable_events(EU_SPATZ_DONE_MASK); // Enable Spatz DONE event + + // Enable clk and initialize Spatz (bootrom jumps to _start, does full init) + printf("\n[CV32] Initializing Spatz...\n"); + spatz_init(SPATZ_BINARY_START); + + // ========================================== + // Test: Spatz Task + // ========================================== + printf("\n[CV32] Launching SPATZ Task\n"); + spatz_run_task(HELLO_WORLD_SIMPLE_TASK); + + eu_wait_spatz_polling(EU_SPATZ_DONE_MASK); + + if(spatz_get_exit_code() != 0) { + printf("[CV32] SPATZ TASK ENDED with exit code: 0x%03x\n", spatz_get_exit_code()); + errors++; + } else { + printf("[CV32] SPATZ TASK ENDED successfully\n"); + } + + // Disable Spatz clock + spatz_clk_dis(); + + // Return error status for CV32 + return errors; +} \ No newline at end of file diff --git a/sw/utils/event_unit_utils.h b/sw/utils/event_unit_utils.h index 10ee2ea..a3156fa 100644 --- a/sw/utils/event_unit_utils.h +++ b/sw/utils/event_unit_utils.h @@ -111,6 +111,13 @@ #define EU_FSYNC_ERROR_MASK (1 << 
EU_FSYNC_ERROR_BIT) #define EU_FSYNC_ALL_MASK (EU_FSYNC_DONE_MASK | EU_FSYNC_ERROR_MASK) +// Spatz events (accelerator events [8] + cluster events [23]) +#define EU_SPATZ_DONE_BIT 8 +#define EU_SPATZ_START_BIT 23 +#define EU_SPATZ_DONE_MASK (1 << EU_SPATZ_DONE_BIT) +#define EU_SPATZ_START_MASK (1 << EU_SPATZ_START_BIT) +#define EU_SPATZ_ALL_MASK (EU_SPATZ_DONE_MASK | EU_SPATZ_START_MASK) + // Wait modes typedef enum { EU_WAIT_MODE_POLLING = 0, @@ -321,6 +328,34 @@ static inline uint32_t eu_fsync_has_error(void) { return eu_check_events(EU_FSYNC_ERROR_MASK); } +//============================================================================= +// SPATZ FUNCTIONS +//============================================================================= + +static inline void eu_spatz_init(void) { + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_SPATZ_DONE_MASK); +} + +static inline uint32_t eu_spatz_is_done(void) { + return eu_check_events(EU_SPATZ_DONE_MASK); +} + + +static inline void eu_wait_spatz_wfe(uint32_t event_mask) { + while (!eu_check_events(event_mask)) { + eu_evt_wait(); + } + eu_clear_events(event_mask); +} + +static inline void eu_wait_spatz_polling(uint32_t event_mask) { + while (!eu_check_events(event_mask)) { + wait_nop(10); + } + eu_clear_events(event_mask); +} + //============================================================================= // MULTI-ACCELERATOR FUNCTIONS //============================================================================= diff --git a/sw/utils/magia_spatz_utils.h b/sw/utils/magia_spatz_utils.h new file mode 100644 index 0000000..835deb1 --- /dev/null +++ b/sw/utils/magia_spatz_utils.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * + * Spatz Utility Functions + */ +#ifndef MAGIA_SPATZ_UTILS_H +#define MAGIA_SPATZ_UTILS_H + +#include <stdint.h> +#include "magia_tile_utils.h" + +#define SPATZ_CLK_EN (SPATZ_CTRL_BASE + 0x00) +#define SPATZ_READY (SPATZ_CTRL_BASE + 0x04) +#define SPATZ_START (SPATZ_CTRL_BASE + 0x08) +#define SPATZ_TASKBIN (SPATZ_CTRL_BASE + 0x0C) +#define SPATZ_DATA (SPATZ_CTRL_BASE + 0x10) +#define SPATZ_RETURN (SPATZ_CTRL_BASE + 0x14) +#define SPATZ_DONE (SPATZ_CTRL_BASE + 0x18) + +#define mmio32(x) (*(volatile uint32_t *)(x)) + +static inline void spatz_clk_en(void) { + mmio32(SPATZ_CLK_EN) = 1; +} + +static inline void spatz_clk_dis(void) { + mmio32(SPATZ_CLK_EN) = 0; +} + +static inline void spatz_set_func(uint32_t addr) { + mmio32(SPATZ_TASKBIN) = addr; +} + +static inline void spatz_trigger_en_irq(void) { + mmio32(SPATZ_START) = 1; +} + +static inline void spatz_trigger_dis_irq(void) { + mmio32(SPATZ_START) = 0; +} + +static inline void spatz_done(void) { + mmio32(SPATZ_DONE) = 1; +} + +static inline uint32_t spatz_get_exit_code(void) { + return mmio32(SPATZ_RETURN); +} + +static inline void spatz_run_task(uint32_t spatz_task_addr) { + spatz_set_func(spatz_task_addr); + spatz_trigger_en_irq(); + while(mmio32(SPATZ_START) != 0); +} + +static inline void spatz_pass_params(uint32_t params_ptr) { + mmio32(SPATZ_DATA) = params_ptr; +} + +static inline void spatz_run_task_with_params(uint32_t spatz_task_addr, uint32_t params_ptr) { + spatz_pass_params(params_ptr); + spatz_run_task(spatz_task_addr);
+} + +static inline void spatz_init(uint32_t spatz_start_addr) { + spatz_set_func(spatz_start_addr); + spatz_clk_en(); + while (mmio32(SPATZ_READY) == 0){ + printf("Waiting for Spatz to be ready...\n"); + }; +} + +#endif diff --git a/sw/utils/magia_tile_utils.h b/sw/utils/magia_tile_utils.h index 533bd3f..88783ad 100644 --- a/sw/utils/magia_tile_utils.h +++ b/sw/utils/magia_tile_utils.h @@ -25,6 +25,7 @@ #include #include "tinyprintf.h" + #define NUM_L1_BANKS (32) #define WORDS_BANK (8192) #define BITS_WORD (32) @@ -38,7 +39,9 @@ #define FSYNC_END (0x000006FF) #define EVENT_UNIT_BASE (0x00000700) #define EVENT_UNIT_END (0x000016FF) -#define RESERVED_START (0x00001700) +#define SPATZ_CTRL_BASE (0x00001700) +#define SPATZ_CTRL_END (0x000017FF) +#define RESERVED_START (0x00001800) #define RESERVED_END (0x0000FFFF) #define STACK_START (0x00010000) #define STACK_END (0x0001FFFF) @@ -126,8 +129,8 @@ static inline void ccount_en(){ #ifdef CV32E40X asm volatile("csrrci zero, 0x320, 0x1" ::); #else - uint32_t pcmr = 1; - asm volatile("csrw 0x7e1, %0" ::"r"(pcmr)); + asm volatile("csrw 0x7E0, %0" :: "r"(0x1)); + asm volatile("csrw 0x7E1, %0" :: "r"(0x1)); #endif } @@ -135,8 +138,7 @@ static inline void ccount_dis(){ #ifdef CV32E40X asm volatile("csrrsi zero, 0x320, 0x1" ::); #else - uint32_t pcmr = 0; - asm volatile("csrw 0x7e1, %0" ::"r"(pcmr)); + asm volatile("csrw 0x7E1, %0" :: "r"(0x0)); #endif } @@ -146,8 +148,7 @@ static inline uint32_t get_cyclel(){ asm volatile("csrr %0, cycle" :"=r"(cyclel):); #else - asm volatile("csrr %0, 0x780" - :"=r"(cyclel):); + asm volatile("csrr %0, 0x780" : "=r"(cyclel)); #endif return cyclel; } @@ -192,66 +193,4 @@ uint32_t get_time(){ return timel; } -// Additional Flex-V CSR access functions based on CSR table -static inline uint32_t get_mstatus(){ - uint32_t mstatus; - asm volatile("csrr %0, 0x300" :"=r"(mstatus):); // MSTATUS (0x300) - return mstatus; -} - -static inline void set_mstatus(uint32_t value){ - asm volatile("csrw 0x300, %0" 
::"r"(value)); // MSTATUS (0x300) -} - -static inline uint32_t get_mtvec(){ - uint32_t mtvec; - asm volatile("csrr %0, 0x305" :"=r"(mtvec):); // MTVEC (0x305) - return mtvec; -} - -static inline void set_mtvec(uint32_t value){ - asm volatile("csrw 0x305, %0" ::"r"(value)); // MTVEC (0x305) -} - -static inline uint32_t get_mepc(){ - uint32_t mepc; - asm volatile("csrr %0, 0x341" :"=r"(mepc):); // MEPC (0x341) - return mepc; -} - -static inline void set_mepc(uint32_t value){ - asm volatile("csrw 0x341, %0" ::"r"(value)); // MEPC (0x341) -} - -static inline uint32_t get_mcause(){ - uint32_t mcause; - asm volatile("csrr %0, 0x342" :"=r"(mcause):); // MCAUSE (0x342) - return mcause; -} - -static inline uint32_t get_privlv(){ - uint32_t privlv; - asm volatile("csrr %0, 0xc10" :"=r"(privlv):); // PRIVLV (0xC10) - return privlv; -} - -static inline uint32_t get_uhartid(){ - uint32_t uhartid; - asm volatile("csrr %0, 0x014" :"=r"(uhartid):); // UHARTID (0x014) - return uhartid; -} - -// Flex-V performance counter control -static inline void perf_counter_enable(){ - uint32_t pcer = 3; // Enable cycles (bit 0) and instruction count (bit 1) - uint32_t pcmr = 1; // Enable global performance counter - asm volatile("csrw 0x7e0, %0" ::"r"(pcer)); // PCER_MACHINE (0x7E0) - asm volatile("csrw 0x7e1, %0" ::"r"(pcmr)); // PCMR_MACHINE (0x7E1) -} - -static inline void perf_counter_disable(){ - uint32_t pcmr = 0; // Disable global performance counter - asm volatile("csrw 0x7e1, %0" ::"r"(pcmr)); // PCMR_MACHINE (0x7E1) -} - #endif /*MAGIA_TILE_UTILS_H*/ diff --git a/sw/utils/redmule_mm_utils.h b/sw/utils/redmule_mm_utils.h index c08a728..cb06f0b 100644 --- a/sw/utils/redmule_mm_utils.h +++ b/sw/utils/redmule_mm_utils.h @@ -52,12 +52,20 @@ #define REDMULE_REG_MOPCNT 0x38 /* Operations and formats */ -#define gemm_ops 0x1 #define Float16 0x1 #define Float16Alt 0x2 #define Float8 0x3 #define Float8Alt 0x4 +#define gemm_ops 0x1 +#define matmul_ops 0x0 +#define addmax 0x2 +#define addmin 0x3 
+#define mulmax 0x4 +#define mulmin 0x5 +#define maxmin 0x6 +#define minmax 0x7 + /* HWPE Register Access Functions */ static inline void redmule_x_add_set(unsigned int value) { HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_MARITH0); diff --git a/sw/utils/tinyprintf.h b/sw/utils/tinyprintf.h index 914a675..df789e7 100644 --- a/sw/utils/tinyprintf.h +++ b/sw/utils/tinyprintf.h @@ -139,7 +139,8 @@ void putf(char *null, char c) { /* Optional external types dependencies */ #if TINYPRINTF_DEFINE_TFP_SPRINTF -# include <sys/types.h> /* size_t */ +// Define size_t for embedded environment (no sys/types.h) +typedef unsigned int size_t; #endif /* Declarations */ diff --git a/target/sim/src/tile/floo_axi_mesh_1x2_pkg.sv b/target/sim/src/tile/floo_axi_mesh_1x2_pkg.sv index dcb55bc..46b795f 100644 --- a/target/sim/src/tile/floo_axi_mesh_1x2_pkg.sv +++ b/target/sim/src/tile/floo_axi_mesh_1x2_pkg.sv @@ -69,7 +69,7 @@ localparam sam_rule_t[SamNumRules-1:0] Sam = '{ typedef logic[31:0] axi_data_mst_addr_t; typedef logic[31:0] axi_data_mst_data_t; typedef logic[3:0] axi_data_mst_strb_t; -typedef logic[1:0] axi_data_mst_id_t; +typedef logic[2:0] axi_data_mst_id_t; typedef logic[0:0] axi_data_mst_user_t; `AXI_TYPEDEF_ALL_CT(axi_data_mst, axi_data_mst_req_t, axi_data_mst_rsp_t, axi_data_mst_addr_t, axi_data_mst_id_t, axi_data_mst_data_t, axi_data_mst_strb_t, axi_data_mst_user_t) @@ -77,7 +77,7 @@ typedef logic[0:0] axi_data_mst_user_t; typedef logic[31:0] axi_data_slv_addr_t; typedef logic[31:0] axi_data_slv_data_t; typedef logic[3:0] axi_data_slv_strb_t; -typedef logic[3:0] axi_data_slv_id_t; +typedef logic[5:0] axi_data_slv_id_t; typedef logic[0:0] axi_data_slv_user_t; `AXI_TYPEDEF_ALL_CT(axi_data_slv, axi_data_slv_req_t, axi_data_slv_rsp_t, axi_data_slv_addr_t, axi_data_slv_id_t, axi_data_slv_data_t, axi_data_slv_strb_t, axi_data_slv_user_t) @@ -87,8 +87,8 @@ typedef logic[0:0] axi_data_slv_user_t; localparam axi_cfg_t AxiCfg = '{ AddrWidth: 32, DataWidth: 32, UserWidth: 1, - InIdWidth: 4, -
OutIdWidth: 2}; + InIdWidth: 6, + OutIdWidth: 3}; `FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_data_slv, AxiCfg, hdr_t) `FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp)