From ae234315d133ea03fdcc2c815a1bb78407c23a57 Mon Sep 17 00:00:00 2001 From: "Amir Kiamarzi amirhossein.kiamarz2@unibo.it" Date: Thu, 26 Feb 2026 14:24:07 +0100 Subject: [PATCH] Fix strided load and store instruction --- hw/ip/spatz/src/spatz.sv | 6 +- hw/ip/spatz/src/spatz_vrf.sv | 33 +++++- sw/riscvTests/CMakeLists.txt | 3 + sw/riscvTests/isa/rv64uv/vls.c | 190 +++++++++++++++++++++++++++++++++ sw/riscvTests/isa/rv64uv/vss.c | 146 +++++++++++++++++++++++++ 5 files changed, 374 insertions(+), 4 deletions(-) create mode 100644 sw/riscvTests/isa/rv64uv/vls.c create mode 100644 sw/riscvTests/isa/rv64uv/vss.c diff --git a/hw/ip/spatz/src/spatz.sv b/hw/ip/spatz/src/spatz.sv index c1843599..107c0369 100644 --- a/hw/ip/spatz/src/spatz.sv +++ b/hw/ip/spatz/src/spatz.sv @@ -215,6 +215,7 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( logic [NrReadPorts-1:0] vrf_re; vrf_data_t [NrReadPorts-1:0] vrf_rdata; logic [NrReadPorts-1:0] vrf_rvalid; + logic [NrWritePorts-1:0] vrf_wvalid_vlsu; spatz_vrf #( .NrReadPorts (NrReadPorts ), @@ -229,6 +230,8 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( .we_i (vrf_we ), .wbe_i (vrf_wbe_buf ), .wvalid_o (vrf_wvalid ), + .wvalid_vlsu_o (vrf_wvalid_vlsu), + `ifdef BUF_FPU .fpu_buf_usage_i (vfu_buf_usage ), `endif @@ -424,7 +427,8 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #( .vrf_wdata_o (vrf_wdata[VLSU_VD_WD] ), .vrf_we_o (sb_we[VLSU_VD_WD] ), .vrf_wbe_o (vrf_wbe[VLSU_VD_WD] ), - .vrf_wvalid_i (vrf_wvalid[VLSU_VD_WD] ), + // .vrf_wvalid_i (vrf_wvalid[VLSU_VD_WD] ), + .vrf_wvalid_i (vrf_wvalid_vlsu[VLSU_VD_WD] ),//vrf_wvalid[1] .vrf_raddr_o (vrf_raddr[VLSU_VD_RD:VLSU_VS2_RD] ), .vrf_re_o (sb_re[VLSU_VD_RD:VLSU_VS2_RD] ), .vrf_rdata_i (vrf_rdata[VLSU_VD_RD:VLSU_VS2_RD] ), diff --git a/hw/ip/spatz/src/spatz_vrf.sv b/hw/ip/spatz/src/spatz_vrf.sv index ea590195..75c74aeb 100644 --- a/hw/ip/spatz/src/spatz_vrf.sv +++ b/hw/ip/spatz/src/spatz_vrf.sv @@ 
-22,6 +22,8 @@ module spatz_vrf
   input  logic    [NrWritePorts-1:0] we_i,
   input  vrf_be_t [NrWritePorts-1:0] wbe_i,
   output logic    [NrWritePorts-1:0] wvalid_o,
+  output logic    [NrWritePorts-1:0] wvalid_vlsu_o,
+  // (review) removed 'wbe_o' output: it was never driven in this module and never connected by the spatz.sv instantiation
 `ifdef BUF_FPU
   // Signal to track if result can be buffered or not
   input  logic [$clog2(FpuBufDepth)-1:0] fpu_buf_usage_i,
@@ -90,13 +92,32 @@
     end
   end: gen_write_request
+  // Per-bank accumulation of the byte enables the VLSU has committed so far
+  vrf_be_t [NrVRFBanks-1:0] wbe_d, wbe_q;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (~rst_ni) begin
+      wbe_q <= '0;
+    end else begin
+      for (int bank = 0; bank < NrVRFBanks; bank++) begin
+        if (!write_request[bank][VLSU_VD_WD]) begin
+          wbe_q[bank] <= '0;
+        end else begin
+          wbe_q[bank] <= wbe_d[bank];
+        end
+      end
+    end
+  end
+
+
   always_comb begin : proc_write
     waddr    = '0;
     wdata    = '0;
     we       = '0;
     wbe      = '0;
     wvalid_o = '0;
-
+    wvalid_vlsu_o = '0;
+    wbe_d = wbe_q;
     // For each bank, we have a priority based access scheme. First priority always has the VFU,
     // second priority has the LSU, and third priority has the slide unit.
    for (int unsigned bank = 0; bank < NrVRFBanks; bank++) begin
@@ -123,7 +144,10 @@ module spatz_vrf
          wdata[bank] = wdata_i[VFU_VD_WD];
          we[bank]    = 1'b1;
          wbe[bank]   = wbe_i[VFU_VD_WD];
-          wvalid_o[VFU_VD_WD] = 1'b1;
+          // (review) The VFU owns this bank: ack the VFU write. The previous
+          // revision copy-pasted the VLSU byte-enable tracking into this
+          // branch, acking a stalled VLSU request and never acking the VFU.
+          wvalid_o[VFU_VD_WD] = 1'b1;
        end else if (write_request[bank][VSLDU_VD_WD]) begin
          waddr[bank] = f_vreg(waddr_i[VSLDU_VD_WD]);
          wdata[bank] = wdata_i[VSLDU_VD_WD];
@@ -144,7 +168,10 @@ module spatz_vrf
          wdata[bank] = wdata_i[VLSU_VD_WD];
          we[bank]    = 1'b1;
          wbe[bank]   = wbe_i[VLSU_VD_WD];
-          wvalid_o[VLSU_VD_WD] = 1'b1;
+          // Accumulate the byte enables of this (possibly partial) strided
+          // write; raise wvalid_o only once the whole word is written, but
+          // ack the VLSU handshake immediately via wvalid_vlsu_o.
+          wbe_d[bank] = wbe_q[bank] | wbe_i[VLSU_VD_WD];
+          wvalid_o[VLSU_VD_WD] = &(wbe_q[bank] | wbe_i[VLSU_VD_WD]);
+          wvalid_vlsu_o[VLSU_VD_WD] = 1'b1;
        end else if (write_request[bank][VSLDU_VD_WD]) begin
          waddr[bank] = f_vreg(waddr_i[VSLDU_VD_WD]);
          wdata[bank] = wdata_i[VSLDU_VD_WD];
diff --git a/sw/riscvTests/CMakeLists.txt b/sw/riscvTests/CMakeLists.txt
index d5838211..f701e2b7 100644
--- a/sw/riscvTests/CMakeLists.txt
+++ b/sw/riscvTests/CMakeLists.txt
@@ -128,3 +128,6 @@
 add_snitch_test(vfcvt isa/rv64uv/vfcvt.c)
 add_snitch_test(vfncvt isa/rv64uv/vfncvt.c)
 add_snitch_test(vfmv isa/rv64uv/vfmv.c)
+
+add_snitch_test(vls isa/rv64uv/vls.c)
+add_snitch_test(vss isa/rv64uv/vss.c)
diff --git a/sw/riscvTests/isa/rv64uv/vls.c b/sw/riscvTests/isa/rv64uv/vls.c
new file mode 100644
index 00000000..5d85d405
--- /dev/null
+++ b/sw/riscvTests/isa/rv64uv/vls.c
@@ -0,0 +1,190 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51 +// +// Author: Matteo Perotti + +#include "vector_macros.h" + +// Positive-stride tests +void TEST_CASE1(void) { + VSET(4, e8, m1); + volatile uint8_t INP1[] = {0x9f, 0xe4, 0x19, 0x20, 0x8f, 0x2e, 0x05, 0xe0, + 0xf9, 0xaa, 0x71, 0xf0, 0xc3, 0x94, 0xbb, 0xd3}; + uint64_t stride = 3; + asm volatile("vlse8.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U8(1, v1, 0x9f, 0x20, 0x05, 0xaa); +} + +void TEST_CASE2(void) { + VSET(4, e16, m1); + volatile uint16_t INP1[] = {0x9fe4, 0x1920, 0x8f2e, 0x05e0, + 0xf9aa, 0x71f0, 0xc394, 0xbbd3}; + uint64_t stride = 4; + asm volatile("vlse16.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U16(2, v1, 0x9fe4, 0x8f2e, 0xf9aa, 0xc394); +} + +void TEST_CASE3(void) { + VSET(4, e32, m1); + volatile uint32_t INP1[] = {0x9fe41920, 0x8f2e05e0, 0xf9aa71f0, 0xc394bbd3, + 0xa11a9384, 0xa7163840, 0x99991348, 0xa9f38cd1}; + uint64_t stride = 8; + asm volatile("vlse32.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U32(3, v1, 0x9fe41920, 0xf9aa71f0, 0xa11a9384, 0x99991348); +} + +void TEST_CASE4(void) { + VSET(4, e64, m1); + volatile uint64_t INP1[] = {0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, + 0xa11a9384a7163840, 0x99991348a9f38cd1}; + uint64_t stride = 8; + asm volatile("vlse64.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U64(4, v1, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1); +} + +// Zero-stride tests +// The implementation must perform all the memory accesses +void TEST_CASE5(void) { + VSET(16, e8, m1); + volatile uint8_t INP1[] = {0x9f}; + uint64_t stride = 0; + asm volatile("vlse8.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U8(5, v1, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, + 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f); +} + +// The implementation can also perform fewer accesses +void TEST_CASE6(void) { + VSET(16, e8, m1); + volatile uint8_t INP1[] = {0x9f}; + asm volatile("vlse8.v v1, (%0), x0" ::"r"(INP1)); + VCMP_U8(6, v1, 0x9f, 0x9f, 
0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, + 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f); +} + +// Different LMUL +void TEST_CASE7(void) { + VSET(8, e64, m2); + volatile uint64_t INP1[] = {0x9fa831c7a11a9384}; + asm volatile("vlse64.v v2, (%0), x0" ::"r"(INP1)); + VCMP_U64(7, v2, 0x9fa831c7a11a9384, 0x9fa831c7a11a9384, 0x9fa831c7a11a9384, + 0x9fa831c7a11a9384, 0x9fa831c7a11a9384, 0x9fa831c7a11a9384, + 0x9fa831c7a11a9384, 0x9fa831c7a11a9384); +} + +// Others +// Negative-stride test +void TEST_CASE8(void) { + VSET(4, e16, m1); + volatile uint16_t INP1[] = {0x9fe4, 0x1920, 0x8f2e, 0x05e0, + 0xf9aa, 0x71f0, 0xc394, 0xbbd3}; + uint64_t stride = -4; + asm volatile("vlse16.v v1, (%0), %1" ::"r"(&INP1[7]), "r"(stride)); + VCMP_U16(8, v1, 0xbbd3, 0x71f0, 0x05e0, 0x1920); +} + +// Stride greater than default Ara AXI width == 128-bit (4 lanes) +void TEST_CASE9(void) { + VSET(2, e64, m1); + volatile uint64_t INP1[] = {0x99991348a9f38cd1, 0x9fa831c7a11a9384, + 0x9fa831c7a11a9384, 0x9fa831c7a11a9384, + 0x9fa831c7a11a9384, 0x01015ac1309bb678}; + uint64_t stride = 40; + asm volatile("vlse64.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U64(9, v1, 0x99991348a9f38cd1, 0x01015ac1309bb678); +} + +// Fill Ara internal Load Buffer +void TEST_CASE10(void) { + VSET(8, e64, m1); + volatile uint64_t INP1[] = { + 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989}; + uint64_t stride = 16; + asm volatile("vlse64.v v1, (%0), %1" ::"r"(INP1), "r"(stride)); + VCMP_U64(10, v1, 0x9fe419208f2e05e0, 0xa11a9384a7163840, 0x9fa831c7a11a9384, + 0x1893179501093489, 0x1874754791888188, 0x9013930148815808, + 0x9031850931584902, 0x8319599991911111); +} + +// Masked stride loads +void TEST_CASE11(void) { + VSET(4, e8, m1); + volatile 
uint8_t INP1[] = {0x9f, 0xe4, 0x19, 0x20, 0x8f, 0x2e, 0x05, 0xe0, + 0xf9, 0xaa, 0x71, 0xf0, 0xc3, 0x94, 0xbb, 0xd3}; + uint64_t stride = 3; + VLOAD_8(v0, 0xAA); + VCLEAR(v1); + asm volatile("vlse8.v v1, (%0), %1, v0.t" ::"r"(INP1), "r"(stride)); + VCMP_U8(11, v1, 0x00, 0x20, 0x00, 0xaa); +} + +void TEST_CASE12(void) { + VSET(4, e16, m1); + volatile uint16_t INP1[] = {0x9fe4, 0x1920, 0x8f2e, 0x05e0, + 0xf9aa, 0x71f0, 0xc394, 0xbbd3}; + uint64_t stride = 4; + VLOAD_8(v0, 0xAA); + VCLEAR(v1); + asm volatile("vlse16.v v1, (%0), %1, v0.t" ::"r"(INP1), "r"(stride)); + VCMP_U16(12, v1, 0, 0x8f2e, 0, 0xc394); +} + +void TEST_CASE13(void) { + VSET(4, e32, m1); + volatile uint32_t INP1[] = {0x9fe41920, 0x8f2e05e0, 0xf9aa71f0, 0xc394bbd3, + 0xa11a9384, 0xa7163840, 0x99991348, 0xa9f38cd1}; + uint64_t stride = 8; + VLOAD_8(v0, 0xAA); + VCLEAR(v1); + asm volatile("vlse32.v v1, (%0), %1, v0.t" ::"r"(INP1), "r"(stride)); + VCMP_U32(13, v1, 0, 0xf9aa71f0, 0, 0x99991348); +} + +void TEST_CASE14(void) { + VSET(8, e64, m1); + volatile uint64_t INP1[] = { + 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840, + 0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548, + 0x1893179501093489, 0x81937598aa819388, 0x1874754791888188, + 0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891, + 0x9031850931584902, 0x3189759837598759, 0x8319599991911111, + 0x8913984898951989}; + uint64_t stride = 16; + VLOAD_8(v0, 0xAA); + VCLEAR(v1); + asm volatile("vlse64.v v1, (%0), %1, v0.t" ::"r"(INP1), "r"(stride)); + VCMP_U64(14, v1, 0, 0xa11a9384a7163840, 0, 0x1893179501093489, 0, + 0x9013930148815808, 0, 0x8319599991911111); +} + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + + TEST_CASE5(); + TEST_CASE6(); + TEST_CASE7(); + + TEST_CASE8(); + TEST_CASE9(); + TEST_CASE10(); + +// TEST_CASE11(); +// TEST_CASE12(); +// TEST_CASE13(); +// TEST_CASE14(); + + EXIT_CHECK(); +} \ No newline at end of file diff --git 
a/sw/riscvTests/isa/rv64uv/vss.c b/sw/riscvTests/isa/rv64uv/vss.c new file mode 100644 index 00000000..a62362c5 --- /dev/null +++ b/sw/riscvTests/isa/rv64uv/vss.c @@ -0,0 +1,146 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 +// +// Author: Matteo Perotti + +#include "vector_macros.h" + +// Positive-stride tests +void TEST_CASE1(void) { + VSET(4, e8, m1); + volatile uint8_t OUT1[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + uint64_t stride = 3; + VLOAD_8(v1, 0x9f, 0xe4, 0x19, 0x20); + asm volatile("vsse8.v v1, (%0), %1" ::"r"(OUT1), "r"(stride)); + VVCMP_U8(1, OUT1, 0x9f, 0x00, 0x00, 0xe4, 0x00, 0x00, 0x19, 0x00, 0x00, 0x20, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); +} + +void TEST_CASE2(void) { + VSET(8, e16, m1); + volatile uint16_t OUT1[] = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000}; + uint64_t stride = 4; + VLOAD_16(v1, 0x9f11, 0xe478, 0x1549, 0x3240, 0x2f11, 0xe448, 0x1546, 0x3220); + asm volatile("vsse16.v v1, (%0), %1" ::"r"(OUT1), "r"(stride)); + VVCMP_U16(2, OUT1, 0x9f11, 0x0000, 0xe478, 0x0000, 0x1549, 0x0000, 0x3240, + 0x0000, 0x2f11, 0x0000, 0xe448, 0x0000, 0x1546, 0x0000, 0x3220, + 0x0000); +} + +void TEST_CASE3(void) { + VSET(4, e32, m1); + volatile uint32_t OUT1[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + uint64_t stride = 8; + VLOAD_32(v1, 0x9f872456, 0xe1356784, 0x13241139, 0x20862497); + asm volatile("vsse32.v v1, (%0), %1" ::"r"(OUT1), "r"(stride)); + VVCMP_U32(3, OUT1, 0x9f872456, 0x00000000, 0xe1356784, 0x00000000, 0x13241139, + 0x00000000, 0x20862497, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 
0x00000000, 0x00000000, 0x00000000, + 0x00000000); +} + +void TEST_CASE4(void) { + VSET(16, e64, m8); + volatile uint64_t OUT1[] = { + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000}; + uint64_t stride = 16; + VLOAD_64(v0, 0x9f87245315434136, 0xe135578794246784, 0x1315345345241139, + 0x2086252110062497, 0x1100229933847136, 0xaaffaaffaaffaaff, + 0xaf87245315434136, 0xa135578794246784, 0x2315345345241139, + 0x1086252110062497, 0x1100229933847134, 0xaaffaaffaaffaaf4, + 0x9315345345241139, 0x9086252110062497, 0x9100229933847134, + 0x9affaaffaaffaaf4); + asm volatile("vsse64.v v0, (%0), %1" ::"r"(OUT1), "r"(stride)); + VVCMP_U64(4, OUT1, 0x9f87245315434136, 0x0000000000000000, 0xe135578794246784, + 0x0000000000000000, 0x1315345345241139, 0x0000000000000000, + 0x2086252110062497, 0x0000000000000000, 0x1100229933847136, + 0x0000000000000000, 0xaaffaaffaaffaaff, 0x0000000000000000, + 0xaf87245315434136, 0x0000000000000000, 0xa135578794246784, + 0x0000000000000000, 0x2315345345241139, 0x0000000000000000, + 0x1086252110062497, 0x0000000000000000, 0x1100229933847134, + 0x0000000000000000, 0xaaffaaffaaffaaf4, 0x0000000000000000, + 0x9315345345241139, 0x0000000000000000, 0x9086252110062497, + 0x0000000000000000, 0x9100229933847134, 0x0000000000000000, + 0x9affaaffaaffaaf4, 0x0000000000000000); +} + +// Masked strided store +void TEST_CASE5(void) { + VSET(4, e8, m1); + volatile 
uint8_t OUT1[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + uint64_t stride = 3; + VLOAD_8(v0, 0xAA); + VLOAD_8(v1, 0x9f, 0xe4, 0x19, 0x20); + asm volatile("vsse8.v v1, (%0), %1, v0.t" ::"r"(OUT1), "r"(stride)); + VVCMP_U8(5, OUT1, 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); +} + +void TEST_CASE6(void) { + VSET(16, e64, m1); + volatile uint64_t OUT1[] = { + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000}; + uint64_t stride = 16; + VLOAD_64(v1, 0x9f87245315434136, 0xe135578794246784, 0x1315345345241139, + 0x2086252110062497, 0x1100229933847136, 0xaaffaaffaaffaaff, + 0xaf87245315434136, 0xa135578794246784, 0x2315345345241139, + 0x1086252110062497, 0x1100229933847134, 0xaaffaaffaaffaaf4, + 0x9315345345241139, 0x9086252110062497, 0x9100229933847134, + 0x9affaaffaaffaaf4); + VLOAD_8(v0, 0xAA, 0xAA); + asm volatile("vsse64.v v1, (%0), %1, v0.t" ::"r"(OUT1), "r"(stride)); + VVCMP_U64(6, OUT1, 0x0000000000000000, 0x0000000000000000, 0xe135578794246784, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x2086252110062497, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0xaaffaaffaaffaaff, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0xa135578794246784, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 
0x1086252110062497, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0xaaffaaffaaffaaf4, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x9086252110062497, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x9affaaffaaffaaf4, 0x0000000000000000); +} + +int main(void) { + INIT_CHECK(); + enable_vec(); + + TEST_CASE1(); + TEST_CASE2(); + TEST_CASE3(); + TEST_CASE4(); + +// TEST_CASE5(); +// TEST_CASE6(); + + EXIT_CHECK(); +} \ No newline at end of file