diff --git a/.github/actions/bench/action.yml b/.github/actions/bench/action.yml index aea498054..bdc18266c 100644 --- a/.github/actions/bench/action.yml +++ b/.github/actions/bench/action.yml @@ -36,7 +36,7 @@ inputs: required: true nix-shell: description: Run in the specified Nix environment if exists - default: "ci-bench" + default: "bench" nix-cache: description: Determine whether to enable nix cache default: 'false' diff --git a/.github/actions/cbmc/action.yml b/.github/actions/cbmc/action.yml index e75feda2f..8778b752e 100644 --- a/.github/actions/cbmc/action.yml +++ b/.github/actions/cbmc/action.yml @@ -8,7 +8,7 @@ description: Run CBMC proofs for mldsa-native inputs: nix-shell: description: Run in the specified Nix environment if exists - default: "ci-cbmc" + default: "cbmc" nix-cache: description: Determine whether to enable nix cache default: 'true' diff --git a/.github/actions/lint/action.yml b/.github/actions/lint/action.yml index 13b4f7406..409e64a9d 100644 --- a/.github/actions/lint/action.yml +++ b/.github/actions/lint/action.yml @@ -8,7 +8,7 @@ description: Lint inputs: nix-shell: description: Run in the specified Nix environment if exists - default: "ci-linter" + default: "linter" nix-cache: description: Determine whether to enable nix cache default: "false" diff --git a/.github/workflows/baremetal.yml b/.github/workflows/baremetal.yml index 90b33b766..745684e9b 100644 --- a/.github/workflows/baremetal.yml +++ b/.github/workflows/baremetal.yml @@ -18,13 +18,13 @@ jobs: - runner: ubuntu-latest name: 'M55-AN547' makefile: test/baremetal/platform/m55-an547/platform.mk - nix-shell: arm-embedded + nix-shell: cross-arm-embedded func: true kat: true acvp: true alloc: true bench: true - opt: no_opt + opt: all runs-on: ${{ matrix.target.runner }} steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -53,6 +53,6 @@ jobs: nix-shell: ${{ matrix.target.nix-shell }} gh_token: ${{ secrets.GITHUB_TOKEN }} perf: PMU - opt: false + 
opt: true store_results: false diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index d04023541..c6c409f5d 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -21,7 +21,7 @@ jobs: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - uses: ./.github/actions/lint with: - nix-shell: ci-linter + nix-shell: linter gh_token: ${{ secrets.GITHUB_TOKEN }} cross-prefix: "aarch64-unknown-linux-gnu-" quickcheck: diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 0bf34ae03..7712daaee 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -36,7 +36,7 @@ jobs: cflags: "-flto -DMLD_FORCE_AARCH64" ldflags: "-flto" bench_extra_args: "" - nix_shell: ci-bench + nix_shell: bench only_no_opt: false - system: rpi5 name: Arm Cortex-A76 (Raspberry Pi 5) benchmarks @@ -45,7 +45,7 @@ jobs: cflags: "-flto -DMLD_FORCE_AARCH64" ldflags: "-flto" bench_extra_args: "" - nix_shell: ci-bench + nix_shell: bench only_no_opt: false - system: a55 name: Arm Cortex-A55 (Snapdragon 888) benchmarks @@ -54,7 +54,7 @@ jobs: cflags: "-flto -DMLD_FORCE_AARCH64" ldflags: "-flto -static" bench_extra_args: -w exec-on-a55 - nix_shell: ci-bench + nix_shell: bench only_no_opt: false - system: bpi name: SpacemiT K1 8 (Banana Pi F3) benchmarks @@ -64,7 +64,7 @@ jobs: ldflags: "-static" bench_extra_args: -w exec-on-bpi cross_prefix: riscv64-unknown-linux-gnu- - nix_shell: ci-cross-riscv64 + nix_shell: cross-riscv64 only_no_opt: true - system: m1-mac-mini name: Mac Mini (M1, 2020) benchmarks @@ -73,7 +73,7 @@ jobs: cflags: "-flto" ldflags: "-flto" bench_extra_args: "-r" - nix_shell: ci-bench + nix_shell: bench only_no_opt: false if: github.repository_owner == 'pq-code-package' && (github.event.label.name == 'benchmark' || github.ref == 'refs/heads/main') runs-on: self-hosted-${{ matrix.target.system }} diff --git a/.github/workflows/bench_ec2_reusable.yml b/.github/workflows/bench_ec2_reusable.yml index 
7c68c4b82..45562d6e7 100644 --- a/.github/workflows/bench_ec2_reusable.yml +++ b/.github/workflows/bench_ec2_reusable.yml @@ -176,7 +176,7 @@ jobs: - uses: ./.github/actions/bench if: ${{ inputs.opt == 'all' || inputs.opt == 'opt' }} with: - nix-shell: 'ci-bench' + nix-shell: 'bench' custom-shell: 'bash' nix-cache: false nix-verbose: ${{ inputs.verbose }} @@ -192,7 +192,7 @@ jobs: - uses: ./.github/actions/bench if: ${{ inputs.opt == 'all' || inputs.opt == 'no_opt' }} with: - nix-shell: 'ci-bench' + nix-shell: 'bench' custom-shell: 'bash' nix-cache: false nix-verbose: ${{ inputs.verbose }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4048ae518..3c7a9c29e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,22 +36,22 @@ jobs: name: 'ubuntu-latest (aarch64)' arch: x86_64 mode: cross-x86_64 - nix_shell: ci-cross-x86_64 + nix_shell: cross-x86_64 - runner: ubuntu-24.04-arm name: 'ubuntu-latest (aarch64)' arch: riscv64 mode: cross-riscv64 - nix_shell: ci-cross-riscv64 + nix_shell: cross-riscv64 - runner: ubuntu-24.04-arm name: 'ubuntu-latest (aarch64)' arch: riscv32 mode: cross-riscv32 - nix_shell: ci-cross-riscv32 + nix_shell: cross-riscv32 - runner: ubuntu-24.04-arm name: 'ubuntu-latest (ppc64le)' arch: ppc64le mode: cross-ppc64le - nix_shell: ci-cross-ppc64le + nix_shell: cross-ppc64le - runner: ubuntu-latest name: 'ubuntu-latest (x86_64)' arch: x86_64 @@ -61,12 +61,12 @@ jobs: name: 'ubuntu-latest (x86_64)' arch: aarch64 mode: cross-aarch64 - nix_shell: ci-cross-aarch64 + nix_shell: cross-aarch64 - runner: ubuntu-latest name: 'ubuntu-latest (x86_64)' arch: aarch64_be mode: cross-aarch64_be - nix_shell: ci-cross-aarch64_be + nix_shell: cross-aarch64_be exclude: - {external: true, target: { @@ -82,7 +82,7 @@ jobs: name: 'ubuntu-latest (aarch64)', arch: x86_64, mode: cross-x86_64, - nix_shell: ci-cross-x86_64 + nix_shell: cross-x86_64 }} - {external: true, target: { @@ -90,7 +90,7 @@ jobs: name: 'ubuntu-latest (aarch64)', 
arch: riscv64, mode: cross-riscv64, - nix_shell: ci-cross-riscv64 + nix_shell: cross-riscv64 }} - {external: true, target: { @@ -98,7 +98,7 @@ jobs: name: 'ubuntu-latest (aarch64)', arch: riscv32, mode: cross-riscv32, - nix_shell: ci-cross-riscv32 + nix_shell: cross-riscv32 }} - {external: true, target: { @@ -106,7 +106,7 @@ jobs: name: 'ubuntu-latest (ppc64le)', arch: ppc64le, mode: cross-ppc64le, - nix_shell: ci-cross-ppc64le + nix_shell: cross-ppc64le }} - {external: true, target: { @@ -122,7 +122,7 @@ jobs: name: 'ubuntu-latest (x86_64)', arch: aarch64, mode: cross-aarch64, - nix_shell: ci-cross-aarch64 + nix_shell: cross-aarch64 }} - {external: true, target: { @@ -130,7 +130,7 @@ jobs: name: 'ubuntu-latest (x86_64)', arch: aarch64_be, mode: cross-aarch64_be, - nix_shell: ci-cross-aarch64_be + nix_shell: cross-aarch64_be }} name: Functional tests (${{ matrix.target.arch }}${{ matrix.target.mode != 'native' && ', cross' || ''}}) runs-on: ${{ matrix.target.runner }} @@ -211,77 +211,77 @@ jobs: name: 'macos' compiler: - name: gcc-4.8 - shell: ci_gcc48 + shell: gcc48 darwin: False c17: False c23: False opt: all examples: true - name: gcc-4.9 - shell: ci_gcc49 + shell: gcc49 darwin: False c17: False c23: False opt: all examples: true - name: gcc-7 - shell: ci_gcc7 + shell: gcc7 darwin: False c17: False c23: False opt: all examples: true - name: gcc-11 - shell: ci_gcc11 + shell: gcc11 darwin: True c17: True c23: False opt: all examples: true - name: gcc-13 - shell: ci_gcc13 + shell: gcc13 darwin: True c17: True c23: False opt: all examples: true - name: gcc-14 - shell: ci_gcc14 + shell: gcc14 darwin: True c17: True c23: True opt: all examples: true - name: gcc-15 - shell: ci_gcc15 + shell: gcc15 darwin: True c17: True c23: True opt: all examples: true - name: clang-18 - shell: ci_clang18 + shell: clang18 darwin: True c17: True c23: True opt: all examples: true - name: clang-19 - shell: ci_clang19 + shell: clang19 darwin: True c17: True c23: True opt: all examples: 
true - name: clang-20 - shell: ci_clang20 + shell: clang20 darwin: True c17: True c23: True opt: all examples: true - name: clang-21 - shell: ci_clang21 + shell: clang21 darwin: True c17: True c23: True @@ -294,28 +294,28 @@ jobs: # We omit all examples since there is currently no way to run # only those examples not involving native code. - name: zig-0.12 - shell: ci_zig0_12 + shell: zig0_12 darwin: True c17: True c23: False examples: False opt: no_opt - name: zig-0.13 - shell: ci_zig0_13 + shell: zig0_13 darwin: True c17: True c23: False examples: False opt: no_opt - name: zig-0.14 - shell: ci_zig0_14 + shell: zig0_14 darwin: True c17: True c23: True examples: False opt: no_opt - name: zig-0.15 - shell: ci_zig0_15 + shell: zig0_15 darwin: True c17: True c23: True @@ -431,7 +431,7 @@ jobs: with: gh_token: ${{ secrets.GITHUB_TOKEN }} compile_mode: native - nix-shell: ci_valgrind-varlat_gcc15 + nix-shell: valgrind-varlat_gcc15 nix-cache: false opt: all cflags: "${{ matrix.cflags }}" @@ -478,11 +478,11 @@ jobs: matrix: compiler: - name: gcc-14 - shell: ci_gcc14 + shell: gcc14 - name: gcc-15 - shell: ci_gcc15 + shell: gcc15 - name: clang-19 - shell: ci_clang19 + shell: clang19 # On AArch64 -fcf-protection is not supported anyway runs-on: ubuntu-latest steps: @@ -727,7 +727,7 @@ jobs: matrix: target: - system: macos-latest - nix_shell: 'ci-cross-x86_64' + nix_shell: 'cross-autogen' nix_cache: 'true' extra_args: '--force-cross' # TODO: This does not yet work (#1304) <- MLKEM ISSUE, Port when fixed @@ -735,11 +735,11 @@ jobs: # nix_cache: 'false' # nix_shell: 'ci' - system: ubuntu-latest - nix_shell: 'ci-cross-aarch64' + nix_shell: 'cross-autogen' nix_cache: 'true' extra_args: '--force-cross' - system: ubuntu-24.04-arm - nix_shell: 'ci-cross-x86_64' + nix_shell: 'cross-autogen' nix_cache: 'true' extra_args: '--force-cross' runs-on: ${{ matrix.target.system }} diff --git a/.github/workflows/ci_ec2_reusable.yml b/.github/workflows/ci_ec2_reusable.yml index 
5fbaf0b33..2fa071ccf 100644 --- a/.github/workflows/ci_ec2_reusable.yml +++ b/.github/workflows/ci_ec2_reusable.yml @@ -160,14 +160,14 @@ jobs: if: ${{ inputs.lint }} uses: ./.github/actions/lint with: - nix-shell: ci-linter + nix-shell: linter gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} nix-verbose: ${{ inputs.verbose }} - name: Functional Tests if: ${{ inputs.test }} uses: ./.github/actions/multi-functest with: - nix-shell: ${{ (inputs.compile_mode == 'cross' || inputs.compile_mode == 'all') && 'ci-cross' || 'ci' }} + nix-shell: ${{ (inputs.compile_mode == 'cross' || inputs.compile_mode == 'all') && 'cross' || 'ci' }} nix-cache: ${{ inputs.cbmc || inputs.compile_mode == 'cross' || inputs.compile_mode == 'all' }} nix-verbose: ${{ inputs.verbose }} gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} @@ -188,7 +188,7 @@ jobs: if: ${{ inputs.cbmc && (success() || failure()) }} uses: ./.github/actions/cbmc with: - nix-shell: ci-cbmc + nix-shell: cbmc nix-verbose: ${{ inputs.verbose }} mldsa_parameter_set: ${{ inputs.cbmc_mldsa_parameter_set }} gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} @@ -198,7 +198,7 @@ jobs: with: gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} nix-cache: true - nix-shell: ci-slothy + nix-shell: slothy script: | autogen --slothy tests all --opt opt diff --git a/.github/workflows/ct-tests.yml b/.github/workflows/ct-tests.yml index 2163e4dc8..5df4194f5 100644 --- a/.github/workflows/ct-tests.yml +++ b/.github/workflows/ct-tests.yml @@ -21,22 +21,22 @@ jobs: matrix: system: [ubuntu-latest, ubuntu-24.04-arm] nix-shell: - - ci_valgrind-varlat_clang14 - - ci_valgrind-varlat_clang15 - - ci_valgrind-varlat_clang16 - - ci_valgrind-varlat_clang17 - - ci_valgrind-varlat_clang18 - - ci_valgrind-varlat_clang19 - - ci_valgrind-varlat_clang20 - - ci_valgrind-varlat_clang21 - - ci_valgrind-varlat_gcc48 - - ci_valgrind-varlat_gcc49 - - ci_valgrind-varlat_gcc7 - - ci_valgrind-varlat_gcc11 - - ci_valgrind-varlat_gcc12 - - ci_valgrind-varlat_gcc13 - - ci_valgrind-varlat_gcc14 - - 
ci_valgrind-varlat_gcc15 + - valgrind-varlat_clang14 + - valgrind-varlat_clang15 + - valgrind-varlat_clang16 + - valgrind-varlat_clang17 + - valgrind-varlat_clang18 + - valgrind-varlat_clang19 + - valgrind-varlat_clang20 + - valgrind-varlat_clang21 + - valgrind-varlat_gcc48 + - valgrind-varlat_gcc49 + - valgrind-varlat_gcc7 + - valgrind-varlat_gcc11 + - valgrind-varlat_gcc12 + - valgrind-varlat_gcc13 + - valgrind-varlat_gcc14 + - valgrind-varlat_gcc15 runs-on: ${{ matrix.system }} steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -48,7 +48,7 @@ jobs: nix-cache: true - name: Build and run test (-Oz) # -Oz got introduced in gcc12 - if: ${{ matrix.nix-shell != 'ci_valgrind-varlat_gcc48' && matrix.nix-shell != 'ci_valgrind-varlat_gcc49' && matrix.nix-shell != 'ci_valgrind-varlat_gcc7' && matrix.nix-shell != 'ci_valgrind-varlat_gcc11'}} + if: ${{ matrix.nix-shell != 'valgrind-varlat_gcc48' && matrix.nix-shell != 'valgrind-varlat_gcc49' && matrix.nix-shell != 'valgrind-varlat_gcc7' && matrix.nix-shell != 'valgrind-varlat_gcc11'}} uses: ./.github/actions/ct-test with: cflags: -Oz -DMLD_CONFIG_KEYGEN_PCT @@ -65,7 +65,7 @@ jobs: valgrind_flags: --variable-latency-errors=yes - name: Build and run test (-Ofast) # -Ofast got deprecated in clang19; -O3 -ffast-math should be used instead - if: ${{ matrix.nix-shell != 'ci_valgrind-varlat_clang19' && matrix.nix-shell != 'ci_valgrind-varlat_clang20' && matrix.nix-shell != 'ci_valgrind-varlat_clang21'}} + if: ${{ matrix.nix-shell != 'valgrind-varlat_clang19' && matrix.nix-shell != 'valgrind-varlat_clang20' && matrix.nix-shell != 'valgrind-varlat_clang21'}} uses: ./.github/actions/ct-test with: cflags: -Ofast -DMLD_CONFIG_KEYGEN_PCT diff --git a/.github/workflows/nix.yml b/.github/workflows/nix.yml index 82b53c3e3..009182a9e 100644 --- a/.github/workflows/nix.yml +++ b/.github/workflows/nix.yml @@ -72,9 +72,9 @@ jobs: # We only run cross-compilation checks for x86 on macos-latest, # so restrict 
caching to the corresponding cross shell. if [[ ${{ runner.os }} == 'macOS' ]]; then - nix develop .#ci-cross-x86_64 --profile tmp-cross + nix develop .#cross-x86_64 --profile tmp-cross else - nix develop .#ci-cross --profile tmp-cross + nix develop .#cross --profile tmp-cross # GH ubuntu-24.04 image tend to run outof space if [[ ${{ matrix.runner }} == 'ubuntu-24.04' ]]; then nix-collect-garbage diff --git a/README.md b/README.md index cbc59bab3..32b6d02c2 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,7 @@ mldsa-native currently offers the following backends: * Default portable C backend * 64-bit Arm backend (using Neon) * 64-bit Intel/AMD backend (using AVX2) +* 32-bit Armv8.1-M backend (using Helium/MVE). This is still experimental and disabled by default. If you'd like contribute new backends, please reach out! diff --git a/dev/fips202/armv81m/README.md b/dev/fips202/armv81m/README.md new file mode 100644 index 000000000..76c4c1d22 --- /dev/null +++ b/dev/fips202/armv81m/README.md @@ -0,0 +1,9 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# FIPS202 backend for Armv8.1-M + MVE: Development sources + +This directory contains the development sources for a FIPS202 backend targeting +the Armv8.1-M + MVE/Helium architecture. + +**Warning:** This backend is still in active development and has not yet undergone +the same level of review as the rest of the code. Use at your own risk! 
diff --git a/dev/fips202/armv81m/mve.h b/dev/fips202/armv81m/mve.h new file mode 100644 index 000000000..1b6c6356e --- /dev/null +++ b/dev/fips202/armv81m/mve.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_DEV_FIPS202_ARMV81M_MVE_H +#define MLD_DEV_FIPS202_ARMV81M_MVE_H + +#define MLD_FIPS202_NATIVE_ARMV81M + +/* Part of backend API */ +#define MLD_USE_FIPS202_X4_NATIVE +/* Guard for assembly file */ +#define MLD_FIPS202_ARMV81M_NEED_X4 + +#if !defined(__ASSEMBLER__) +#include <stdint.h> +#include "../api.h" + +#define mld_keccak_f1600_x4_native_impl \ + MLD_NAMESPACE(keccak_f1600_x4_native_impl) +int mld_keccak_f1600_x4_native_impl(uint64_t *state); + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state) +{ + return mld_keccak_f1600_x4_native_impl(state); +} + +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLD_DEV_FIPS202_ARMV81M_MVE_H */ diff --git a/dev/fips202/armv81m/src/fips202_native_armv81m.h b/dev/fips202/armv81m/src/fips202_native_armv81m.h new file mode 100644 index 000000000..ca3b9945d --- /dev/null +++ b/dev/fips202/armv81m/src/fips202_native_armv81m.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#ifndef MLD_DEV_FIPS202_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H +#define MLD_DEV_FIPS202_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H + +#include <stdint.h> +#include "../../../../common.h" + +/* Keccak round constants in bit-interleaved form */ +#define mld_keccakf1600_round_constants \ + MLD_NAMESPACE(keccakf1600_round_constants) +extern const uint32_t mld_keccakf1600_round_constants[48]; + +#define mld_keccak_f1600_x4_mve_asm MLD_NAMESPACE(keccak_f1600_x4_mve_asm) +void mld_keccak_f1600_x4_mve_asm(uint64_t state[100], uint64_t tmpstate[100], + const uint32_t rc[48]); 
+ +#endif /* !MLD_DEV_FIPS202_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H */ diff --git a/dev/fips202/armv81m/src/keccak_f1600_x4_mve.S b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.S new file mode 100644 index 000000000..50c06595f --- /dev/null +++ b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.S @@ -0,0 +1,1608 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2025 Arm Limited + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/*yaml + Name: keccak_f1600_x4_mve_asm + Description: Armv8.1-M MVE implementation of 4-way parallel Keccak-f[1600] permutation using bit-interleaved state + Signature: void mld_keccak_f1600_x4_mve_asm(void *state, void *tmpstate, const uint32_t *rc) + ABI: + r0: + type: buffer + size_bytes: 800 + permissions: read/write + c_parameter: void *state + description: Four bit-interleaved Keccak states (low halves followed by high halves) + r1: + type: buffer + size_bytes: 800 + permissions: read/write + c_parameter: void *tmpstate + description: Temporary storage for intermediate state + r2: + type: buffer + size_bytes: 192 + permissions: read + c_parameter: const uint32_t *rc + description: Keccak round constants in bit-interleaved form (24 pairs of 32-bit words) + Stack: + bytes: 236 + description: register preservation (44) + SIMD registers (64) + temporary storage (128) +*/ + +#include "../../../../common.h" +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +.thumb +.syntax unified +.text +.equ QSTACK0, 0 +.equ A__00, 0 +.equ A__01, 80 +.equ A__02, 160 +.equ A__03, 240 +.equ A__04, 320 +.equ A__10, 16 +.equ A__11, 96 +.equ A__12, 176 +.equ A__13, 256 +.equ A__14, 336 +.equ A__20, 32 +.equ A__21, 112 +.equ A__22, 192 +.equ A__23, 272 +.equ A__24, 352 +.equ A__30, 48 +.equ A__31, 128 +.equ A__32, 208 +.equ A__33, 288 +.equ A__34, 368 +.equ A__40, 64 +.equ A__41, 144 +.equ A__42, 224 +.equ A__43, 
304 +.equ A__44, 384 +.equ B__00, 0 +.equ B__01, 256 +.equ B__02, 112 +.equ B__03, 368 +.equ B__04, 224 +.equ B__10, 160 +.equ B__11, 16 +.equ B__12, 272 +.equ B__13, 128 +.equ B__14, 384 +.equ B__20, 320 +.equ B__21, 176 +.equ B__22, 32 +.equ B__23, 288 +.equ B__24, 144 +.equ B__30, 80 +.equ B__31, 336 +.equ B__32, 192 +.equ B__33, 48 +.equ B__34, 304 +.equ B__40, 240 +.equ B__41, 96 +.equ B__42, 352 +.equ B__43, 208 +.equ B__44, 64 +.equ RCxy_00, 0 +.equ RCxy_01, 36 +.equ RCxy_02, 3 +.equ RCxy_03, 41 +.equ RCxy_04, 18 +.equ RCxy_10, 1 +.equ RCxy_11, 44 +.equ RCxy_12, 10 +.equ RCxy_13, 45 +.equ RCxy_14, 2 +.equ RCxy_20, 62 +.equ RCxy_21, 6 +.equ RCxy_22, 43 +.equ RCxy_23, 15 +.equ RCxy_24, 61 +.equ RCxy_30, 28 +.equ RCxy_31, 55 +.equ RCxy_32, 25 +.equ RCxy_33, 21 +.equ RCxy_34, 56 +.equ RCxy_40, 27 +.equ RCxy_41, 20 +.equ RCxy_42, 39 +.equ RCxy_43, 8 +.equ RCxy_44, 14 + +qA00_h .req q0 +qA00_l .req q1 +qA20_l .req q2 + +.macro ld_xor5 state, round, x, C, A + vldrw.u32 q<\C>, [\state, #A__\x\()0] // @slothy:reads=A\state\()__\x\()0 + vldrw.u32 q<\A>, [\state, #A__\x\()1] // @slothy:reads=A\state\()__\x\()1 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()2] // @slothy:reads=A\state\()__\x\()2 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()3] // @slothy:reads=A\state\()__\x\()3 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()4] // @slothy:reads=A\state\()__\x\()4 + veor q<\C>, q<\C>, q<\A> + .endm + +.macro ld_xor5_0 state, round, x, C, A, A0 + vldrw.u32 q<\C>, [\state, #A__\x\()1] // @slothy:reads=A\state\()__\x\()1 + veor q<\C>, q<\C>, q<\A0> + vldrw.u32 q<\A>, [\state, #A__\x\()2] // @slothy:reads=A\state\()__\x\()2 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()3] // @slothy:reads=A\state\()__\x\()3 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()4] // @slothy:reads=A\state\()__\x\()4 + veor q<\C>, q<\C>, q<\A> + .endm + + +.macro rot1_xor_l D1_l, C0_l, C2_h + vshr.u32 q<\D1_l>, 
q<\C2_h>, #31 + vsli.32 q<\D1_l>, q<\C2_h>, #1 + veor q<\D1_l>, q<\D1_l>, q<\C0_l> + .endm + +.macro rot1_xor_h D1_h, C0_h, C2_l + veor q<\D1_h>, q<\C2_l>, q<\C0_h> + .endm + +.macro rot_str_e s_l, s_h, A_l, A_h, RC, x, y + vshr.u32 q, q, #32-(\RC/2) + vsli.u32 q, q, #\RC/2 + vstrw.32 q, [\s_l, #B__\x\()\y] + vshr.u32 q, q, #32-(\RC/2) + vsli.u32 q, q, #\RC/2 + vstrw.32 q, [\s_h, #B__\x\()\y] +.endm + +.macro rot_str_o s_l, s_h, A_l, A_h, RC, x, y + .if (\RC-1)/2 == 0 + vstrw.32 q, [\s_h, #B__\x\()\y] + .else + vshr.u32 q, q, #32-((\RC-1)/2) + vsli.u32 q, q, #(\RC-1)/2 + vstrw.32 q, [\s_h, #B__\x\()\y] + .endif + + .if (\RC+1)/2 == 0 + // should never happen + vstrw.32 q, [\s_l, #B__\x\()\y] + .else + vshr.u32 q, q, #32-((\RC+1)/2) + vsli.u32 q, q, #(\RC+1)/2 + vstrw.32 q, [\s_l, #B__\x\()\y] + .endif +.endm + +.macro ld_xorD_rot_str_e state_l, state_h, state_nl, state_nh, x, y, Dx_l, Dx_h + vldrw.u32 q, [\state_l, #A__\x\()\y] // @slothy:reads=A\state_l\()__\x\()\y + vldrw.u32 q, [\state_h, #A__\x\()\y] // @slothy:reads=A\state_h\()__\x\()\y + veor q, q, q<\Dx_l> + veor q, q, q<\Dx_h> + rot_str_e \state_nl, \state_nh, A_l, A_h, RCxy_\x\()\y, \x, \y +.endm + +.macro rot_str_e_0 s_l, s_h, A_l, A_h, RC, x, y, regl, regh + vshr.u32 q<\regl>, q, #32-(\RC/2) + vsli.u32 q<\regl>, q, #\RC/2 + // vstrw.32 q, [\s_l, #B__\x\()\y] + vshr.u32 q<\regh>, q, #32-(\RC/2) + vsli.u32 q<\regh>, q, #\RC/2 + // vstrw.32 q, [\s_h, #B__\x\()\y] +.endm + +.macro ld_xorD_rot_str_e_0 state_l, state_h, state_nl, state_nh, x, y, Dx_l, Dx_h, regl, regh + vldrw.u32 q, [\state_l, #A__\x\()\y] // @slothy:reads=A\state_l\()__\x\()\y + vldrw.u32 q, [\state_h, #A__\x\()\y] // @slothy:reads=A\state_h\()__\x\()\y + veor q, q, q<\Dx_l> + veor q, q, q<\Dx_h> + rot_str_e_0 \state_nl, \state_nh, A_l, A_h, RCxy_\x\()\y, \x, \y, \regl, \regh +.endm + +.macro ld_xorD_rot_str_o state_l, state_h, state_nl, state_nh, x, y, Dx_l, Dx_h + vldrw.u32 q, [\state_l, #A__\x\()\y] // @slothy:reads=A\state_l\()__\x\()\y 
+ vldrw.u32 q, [\state_h, #A__\x\()\y] // @slothy:reads=A\state_h\()__\x\()\y + veor q, q, q<\Dx_l> + veor q, q, q<\Dx_h> + rot_str_o \state_nl, \state_nh, A_l, A_h, RCxy_\x\()\y, \x, \y +.endm + +.macro ld_bic_str state, state_n, round, y + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__0\y] // @slothy:writes=A\state\()__0\y + vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y +.endm + +.macro ld_bic_str_0 state, state_n round, y, A0 + vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y + vbic q, q, q + veor q<\A0>, q, q + // A0 is stored later after the round-constant is added +.endm + +.macro ld_bic_str_1 state, 
state_n, round, y, A0, A2 + vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q + veor q<\A2>, q, q + vstrw.32 q<\A2>, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y + vbic q, q, q + veor q<\A0>, q, q + // A0 is stored later after the round-constant is added +.endm + +.macro ld_1_bic_str state, state_n, round, y, B1 + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + // vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vbic q, q, q<\B1> + veor q, q, q + vstrw.32 q, [\state, #A__0\y] // @slothy:writes=A\state\()__0\y + vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q, q + veor q, q<\B1>, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q<\B1>, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y +.endm + +.macro ld_3_bic_str state, state_n, round, y, B3 + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, 
#A__2\y] // @slothy:reads=A\state_n\()__2\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__0\y] // @slothy:writes=A\state\()__0\y + // vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q<\B3>, q + veor q, q, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q<\B3> + veor q, q, q + vstrw.32 q, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vbic q, q, q + veor q, q<\B3>, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y +.endm + + + +.macro keccak_4fold_round_theta_rho_pi state_l, state_h, state_nl, state_nh, rc + ld_xor5_0 \state_h, 0, 0, C0_h, A0_h, qA00_h + ld_xor5_0 \state_l, 0, 2, C2_l, A2_l, qA20_l + rot1_xor_h D1_h, C0_h, C2_l + vstrw.32 q, [r13, #QSTACK0] // @slothy:writes=stack0 + + ld_xor5_0 \state_l, 0, 0, C0_l, A0_l, qA00_l + ld_xor5 \state_h, 0, 2, C2_h, A2_h + rot1_xor_l D1_l, C0_l, C2_h + + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 1, 0, D1_l, D1_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 1, 1, D1_l, D1_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 1, 2, D1_l, D1_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 1, 3, D1_l, D1_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 1, 4, D1_l, D1_h + + ld_xor5 \state_h, 0, 4, C4_h, A4_h + rot1_xor_l D3_l, C2_l, C4_h + + ld_xor5 \state_l, 0, 4, C4_l, A4_l + rot1_xor_h D3_h, C2_h, C4_l + + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 3, 0, D3_l, D3_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 3, 1, D3_l, D3_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 3, 2, D3_l, D3_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 3, 3, D3_l, D3_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 3, 4, D3_l, 
D3_h + + ld_xor5 \state_h, 0, 1, C1_h, A1_h + rot1_xor_l D0_l, C4_l, C1_h + ld_xor5 \state_l, 0, 1, C1_l, A1_l + rot1_xor_h D0_h, C4_h, C1_l + + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 0, 0, D0_l, D0_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 0, 1, D0_l, D0_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 0, 2, D0_l, D0_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 0, 3, D0_l, D0_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 0, 4, D0_l, D0_h + + ld_xor5 \state_l, 0, 3, C3_l, A3_l + rot1_xor_h D2_h, C1_h, C3_l + ld_xor5 \state_h, 0, 3, C3_h, A3_h + rot1_xor_l D2_l, C1_l, C3_h + + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 2, 0, D2_l, D2_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 2, 1, D2_l, D2_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 2, 2, D2_l, D2_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 2, 3, D2_l, D2_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 2, 4, D2_l, D2_h + + rot1_xor_h D4_h, C3_h, C0_l + vldrw.32 q, [r13, #QSTACK0] // @slothy:reads=stack0 + rot1_xor_l D4_l, C3_l, C0_h + + + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 4, 0, D4_l, D4_h // B40 = A03 + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 4, 2, D4_l, D4_h // B42 = A24 + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 4, 4, D4_l, D4_h // B44 = A40 + // A11_l, A11_h, A32_l are held in registers from the next step + ld_xorD_rot_str_e_0 \state_l, \state_h, \state_nl, \state_nh, 4, 3, D4_l, D4_h, A32_l, A32_h // B43 = A32 + vstrw.32 q, [\state_nh, #B__43] + ld_xorD_rot_str_e_0 \state_l, \state_h, \state_nl, \state_nh, 4, 1, D4_l, D4_h, A11_l, A11_h // B41 = A11 +.endm + +.macro keccak_4fold_round_chi_iota state_l, state_h, state_nl, state_nh, rc // now BIC + // A11_l, A11_h, A32_l are held in registers from the previous step + ld_1_bic_str \state_l, \state_nl, 0, 1, A11_l + 
ld_1_bic_str \state_h, \state_nh, 0, 1, A11_h + + ld_3_bic_str \state_l, \state_nl, 0, 2, A32_l + ld_bic_str \state_h, \state_nh, 0, 2 + + ld_bic_str \state_l, \state_nl, 0, 3 + ld_bic_str \state_h, \state_nh, 0, 3 + + ld_bic_str \state_l, \state_nl, 0, 4 + ld_bic_str \state_h, \state_nh, 0, 4 + + ld_bic_str_1 \state_l, \state_nl, 0, 0, A00_l, qA20_l + ld_bic_str_0 \state_h, \state_nh, 0, 0, A00_h + + + ldrd r, r, [\rc] + vdup.32 q, r + veor qA00_l, q, q + vstrw.32 qA00_l, [\state_l, #A__00] // @slothy:writes=A\state_l\()__00 + vdup.32 q, r + veor qA00_h, q, q + vstrw.32 qA00_h, [\state_h, #A__00] // @slothy:writes=A\state_h\()__00 +.endm + +.text +.balign 8 +.type MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm), %function +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm) +MLD_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm) + + push {r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + vpush {d8-d15} + sub sp, #8*16 + + mov r6, r2 // r6 = rc table pointer (from r2 parameter) + + // r0: state 0 + // r1: state 1 + // r2: this state low (reused from rc parameter) + // r3: this state high + // r4: next state low + // r5: next state high + // r6: rc table + + + mov lr, #24 + + mov r2, r0 + mov r4, r1 + + // pre-fetch so we can keep in registers between rounds + add r3, r2, #400 + vldrw.u32 qA00_h, [r3, #A__00] + vldrw.u32 qA00_l, [r2, #A__00] + vldrw.u32 qA20_l, [r2, #A__20] + + wls lr, lr, keccak_f1600_x4_mve_asm_roundend + keccak_f1600_x4_mve_asm_roundstart: + // Instructions: 559 + // Expected cycles: 559 + // Expected IPC: 1.00 + // + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ cycle (expected) 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 325 350 375 400 425 450 475 500 525 550 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------- + vldrw.U32 q6, [r2, #112] // *.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
// @slothy:reads=Ar2__21 + veor q7, q6, q2 // .*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vldrw.U32 q2, [r2, #80] // ..*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ // @slothy:reads=Ar2__01 + veor q1, q2, q1 // ...*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ add r5, r2, #400 // ....*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q5, [r5, #80] // .....*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__01 + veor q4, q5, q0 // ......*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ vldrw.U32 q0, [r2, #192] // .......*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__22 + veor q3, q7, q0 // ........*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q0, [r2, #160] // .........*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar2__02 + veor q1, q1, q0 // ..........*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q0, [r5, #160] // ...........*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__02 + veor q0, q4, q0 // ............*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ vldrw.U32 q6, [r2, #272] // .............*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. // @slothy:reads=Ar2__23 + veor q2, q3, q6 // ..............*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + vldrw.U32 q7, [r2, #240] // ...............*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar2__03 + veor q5, q1, q7 // ................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vldrw.U32 q4, [r5, #240] // .................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. // @slothy:reads=Ar3__03 + veor q4, q0, q4 // ..................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ vldrw.U32 q6, [r2, #352] // ...................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__24 + veor q3, q2, q6 // ....................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q0, [r2, #320] // .....................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar2__04 + veor q2, q5, q0 // ......................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + vldrw.U32 q1, [r5, #320] // .......................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__04 + veor q5, q4, q1 // ........................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vldrw.U32 q4, [r5, #32] // .........................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__20 + veor q0, q3, q5 // ..........................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q1, [r5, #16] // ...........................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar3__10 + veor q6, q1, q0 // ............................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vstrw.32 q5, [r13, #0] // .............................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. // @slothy:writes=stack0 + vshr.U32 q7, q6, #32-((1+1)/2) // ..............................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ add r10, r4, #400 // ...............................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vsli.U32 q7, q6, #(1+1)/2 // ................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vldrw.U32 q6, [r5, #112] // .................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
// @slothy:reads=Ar3__21 + veor q6, q4, q6 // ..................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + vldrw.U32 q4, [r5, #192] // ...................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__22 + veor q4, q6, q4 // ....................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vldrw.U32 q6, [r5, #272] // .....................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__23 + veor q4, q4, q6 // ......................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + vldrw.U32 q6, [r5, #352] // .......................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar3__24 + veor q5, q4, q6 // ........................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vstrw.32 q7, [r4, #160] // .........................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vshr.U32 q4, q5, #31 // ..........................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vsli.32 q4, q5, #1 // ...........................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q6, [r2, #16] // ............................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. // @slothy:reads=Ar2__10 + veor q7, q4, q2 // .............................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ veor q1, q6, q7 // ..............................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + vldrw.U32 q6, [r5, #96] // ...............................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__11 + veor q6, q6, q0 // ................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ vstrw.32 q1, [r10, #160] // .................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vshr.U32 q1, q6, #32-(44/2) // ..................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + vsli.U32 q1, q6, #44/2 // ...................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vldrw.U32 q6, [r2, #96] // ....................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__11 + veor q4, q6, q7 // .....................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vstrw.32 q1, [r10, #16] // ......................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ vshr.U32 q6, q4, #32-(44/2) // .......................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vsli.U32 q6, q4, #44/2 // ........................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q1, [r5, #336] // .........................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar3__14 + veor q4, q1, q0 // ..........................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q1, [r2, #176] // ...........................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__12 + veor q1, q1, q7 // ............................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ vstrw.32 q6, [r4, #16] // .............................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vshr.U32 q6, q1, #32-(10/2) // ..............................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + vsli.U32 q6, q1, #10/2 // ...............................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vldrw.U32 q1, [r2, #256] // ................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. // @slothy:reads=Ar2__13 + veor q1, q1, q7 // .................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vstrw.32 q6, [r4, #272] // ..................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ vshr.U32 q6, q1, #32-((45-1)/2) // ...................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vsli.U32 q6, q1, #(45-1)/2 // ....................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q1, [r2, #336] // .....................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar2__14 + veor q1, q1, q7 // ......................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + vstrw.32 q6, [r10, #128] // .......................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vshr.U32 q6, q1, #32-(2/2) // ........................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vsli.U32 q6, q1, #2/2 // .........................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q7, [r5, #176] // ..........................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__12 + veor q7, q7, q0 // ...........................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vstrw.32 q6, [r4, #384] // ............................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vshr.U32 q1, q7, #32-(10/2) // .............................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vsli.U32 q1, q7, #10/2 // ..............................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ vldrw.U32 q6, [r5, #256] // ...............................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__13 + veor q0, q6, q0 // ................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vstrw.32 q1, [r10, #272] // .................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ vshr.U32 q1, q4, #32-(2/2) // ..................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + vldrw.U32 q7, [r5, #64] // ...................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__40 + vsli.U32 q1, q4, #2/2 // ....................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vldrw.U32 q4, [r5, #144] // .....................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__41 + vshr.U32 q6, q0, #32-((45+1)/2) // ......................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + vstrw.32 q1, [r10, #384] // .......................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vsli.U32 q6, q0, #(45+1)/2 // ........................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + veor q7, q7, q4 // .........................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q1, [r5, #224] // ..........................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar3__42 + veor q4, q7, q1 // ...........................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q7, [r5, #304] // ............................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. // @slothy:reads=Ar3__43 + veor q1, q4, q7 // .............................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ vldrw.U32 q0, [r5, #384] // ..............................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ // @slothy:reads=Ar3__44 + veor q7, q1, q0 // ...............................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vstrw.32 q6, [r4, #128] // ................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ vshr.U32 q1, q7, #31 // .................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vsli.32 q1, q7, #1 // ..................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + vldrw.U32 q6, [r2, #144] // ...................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar2__41 + veor q0, q1, q3 // ....................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q3, [r2, #64] // .....................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__40 + veor q1, q3, q6 // ......................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ vldrw.U32 q6, [r2, #224] // .......................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__42 + veor q1, q1, q6 // ........................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q3, [r2, #304] // .........................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar2__43 + veor q6, q1, q3 // ..........................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q4, [r2, #384] // ...........................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__44 + veor q3, q6, q4 // ............................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ vldrw.U32 q4, [r2, #48] // .............................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................. // @slothy:reads=Ar2__30 + veor q5, q3, q5 // ..............................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + vldrw.U32 q1, [r5, #48] // ...............................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar3__30 + veor q1, q1, q5 // ................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vshr.U32 q6, q1, #32-(28/2) // .................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vsli.U32 q6, q1, #28/2 // ..................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ vldrw.U32 q1, [r2, #128] // ...................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__31 + veor q1, q1, q0 // ....................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vstrw.32 q6, [r10, #80] // .....................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vshr.U32 q6, q1, #32-((55-1)/2) // ......................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................ + vsli.U32 q6, q1, #(55-1)/2 // .......................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q1, [r5, #128] // ........................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar3__31 + veor q1, q1, q5 // .........................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vstrw.32 q6, [r10, #336] // ..........................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................... + vshr.U32 q6, q1, #32-((55+1)/2) // ...........................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vsli.U32 q6, q1, #(55+1)/2 // ............................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................. + veor q1, q4, q0 // .............................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................. + vstrw.32 q6, [r4, #336] // ..............................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ vshr.U32 q4, q1, #32-(28/2) // ...............................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................... + vsli.U32 q4, q1, #28/2 // ................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................. + vldrw.U32 q6, [r2, #208] // .................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................. 
// @slothy:reads=Ar2__32 + veor q6, q6, q0 // ..................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................ + vstrw.32 q4, [r4, #80] // ...................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................... + vshr.U32 q1, q6, #32-((25-1)/2) // ....................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vsli.U32 q1, q6, #(25-1)/2 // .....................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q4, [r2, #288] // ......................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................ // @slothy:reads=Ar2__33 + veor q4, q4, q0 // .......................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vldrw.U32 q6, [r2, #368] // ........................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__34 + veor q0, q6, q0 // .........................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................... + vshr.U32 q6, q0, #32-(56/2) // ..........................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vstrw.32 q1, [r10, #192] // ...........................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................... + vsli.U32 q6, q0, #56/2 // ............................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................. + vshr.U32 q0, q4, #32-((21-1)/2) // .............................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ vldrw.U32 q1, [r5, #368] // ..............................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................ // @slothy:reads=Ar3__34 + vsli.U32 q0, q4, #(21-1)/2 // ...............................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................... + vstrw.32 q6, [r4, #304] // ................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ veor q4, q1, q5 // .................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................. + vstrw.32 q0, [r10, #48] // ..................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................ + vshr.U32 q1, q4, #32-(56/2) // ...................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vsli.U32 q1, q4, #56/2 // ....................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q6, [r5, #208] // .....................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__32 + veor q6, q6, q5 // ......................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ vldrw.U32 q0, [r5, #288] // .......................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__33 + veor q5, q0, q5 // ........................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................... + vstrw.32 q1, [r10, #304] // .........................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vshr.U32 q0, q6, #32-((25+1)/2) // ..........................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................... + vsli.U32 q0, q6, #(25+1)/2 // ...........................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q1, [r5, #96] // ............................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................. 
// @slothy:reads=Ar3__11 + vshr.U32 q6, q5, #32-((21+1)/2) // .............................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................. + vldrw.U32 q4, [r5, #16] // ..............................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................ // @slothy:reads=Ar3__10 + vsli.U32 q6, q5, #(21+1)/2 // ...............................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vldrw.U32 q5, [r5, #176] // ................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................. // @slothy:reads=Ar3__12 + veor q1, q4, q1 // .................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................. + vldrw.U32 q4, [r5, #256] // ..................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................ 
// @slothy:reads=Ar3__13 + veor q5, q1, q5 // ...................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q1, [r5, #336] // ....................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__14 + veor q5, q5, q4 // .....................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vstrw.32 q0, [r4, #192] // ......................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................ + veor q0, q5, q1 // .......................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................... + vstrw.32 q6, [r4, #48] // ........................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................... 
+ vshr.U32 q5, q0, #31 // .........................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................... + vsli.32 q5, q0, #1 // ..........................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q4, [r2, #16] // ...........................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar2__10 + veor q3, q5, q3 // ............................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................. + vldrw.U32 q6, [r2, #96] // .............................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................. // @slothy:reads=Ar2__11 + veor q4, q4, q6 // ..............................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................ 
+ vldrw.U32 q1, [r2, #176] // ...............................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__12 + veor q5, q4, q1 // ................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................. + vldrw.U32 q6, [r2, #256] // .................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................. 
// @slothy:reads=Ar2__13 + veor q6, q5, q6 // ..................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................ + vldrw.U32 q4, [r2, #336] // ...................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__14 + veor q5, q6, q4 // ....................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................... 
+ vldrw.U32 q1, [r5, #0] // .....................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__00 + veor q7, q5, q7 // ......................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................ + vldrw.U32 q4, [r2, #0] // .......................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar2__00 + veor q1, q1, q7 // ........................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................... + veor q4, q4, q3 // .........................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................... + vshr.U32 q6, q1, #32-(0/2) // ..........................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................... 
+ vsli.U32 q6, q1, #0/2 // ...........................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q1, [r2, #80] // ............................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................. // @slothy:reads=Ar2__01 + veor q1, q1, q3 // .............................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................. 
+ vstrw.32 q6, [r10, #0] // ..............................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................ + vshr.U32 q6, q4, #32-(0/2) // ...............................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................... + vsli.U32 q6, q4, #0/2 // ................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................. 
+ vldrw.U32 q4, [r5, #80] // .................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................. // @slothy:reads=Ar3__01 + veor q4, q4, q7 // ..................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................ + vstrw.32 q6, [r4, #0] // ...................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................... 
+ vshr.U32 q6, q1, #32-(36/2) // ....................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................... + vsli.U32 q6, q1, #36/2 // .....................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q1, [r2, #160] // ......................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................ 
// @slothy:reads=Ar2__02 + veor q1, q1, q3 // .......................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................... + vstrw.32 q6, [r4, #256] // ........................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................... + vshr.U32 q6, q4, #32-(36/2) // .........................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................... 
+ vsli.U32 q6, q4, #36/2 // ..........................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q4, [r2, #240] // ...........................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__03 + veor q4, q4, q3 // ............................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................. 
+ vstrw.32 q6, [r10, #256] // .............................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................. + vshr.U32 q6, q1, #32-((3-1)/2) // ..............................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................ + vsli.U32 q6, q1, #(3-1)/2 // ...............................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................... 
+ vldrw.U32 q1, [r2, #320] // ................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................. // @slothy:reads=Ar2__04 + veor q1, q1, q3 // .................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................. + vstrw.32 q6, [r10, #112] // ..................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................ 
+ vshr.U32 q6, q4, #32-((41-1)/2) // ...................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................... + vsli.U32 q6, q4, #(41-1)/2 // ....................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q3, [r5, #240] // .....................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar3__03 + veor q3, q3, q7 // ......................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................ + vstrw.32 q6, [r10, #368] // .......................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................... + vshr.U32 q4, q3, #32-((41+1)/2) // ........................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................... 
+ vsli.U32 q4, q3, #(41+1)/2 // .........................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q3, [r5, #160] // ..........................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__02 + veor q6, q3, q7 // ...........................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................... 
+ vstrw.32 q4, [r4, #368] // ............................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................. + vshr.U32 q3, q6, #32-((3+1)/2) // .............................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................. + vsli.U32 q3, q6, #(3+1)/2 // ..............................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................ 
+ vldrw.U32 q6, [r5, #320] // ...............................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__04 + veor q7, q6, q7 // ................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................. + vldrw.U32 q4, [r2, #368] // .................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................. 
// @slothy:reads=Ar2__34 + vshr.U32 q6, q1, #32-(18/2) // ..................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................ + vstrw.32 q3, [r4, #112] // ...................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................... + vsli.U32 q6, q1, #18/2 // ....................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................... 
+ vshr.U32 q1, q7, #32-(18/2) // .....................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q3, [r2, #48] // ......................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................ // @slothy:reads=Ar2__30 + vsli.U32 q1, q7, #18/2 // .......................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................... 
+ vldrw.U32 q7, [r2, #128] // ........................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__31 + veor q3, q3, q7 // .........................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q7, [r2, #208] // ..........................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar2__32 + veor q7, q3, q7 // ...........................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q3, [r2, #288] // ............................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................. // @slothy:reads=Ar2__33 + veor q3, q7, q3 // .............................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................. 
+ vldrw.U32 q7, [r5, #128] // ..............................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................ // @slothy:reads=Ar3__31 + veor q3, q3, q4 // ...............................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q4, [r5, #48] // ................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................. 
// @slothy:reads=Ar3__30 + veor q0, q3, q0 // .................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................. + veor q4, q4, q7 // ..................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................ + vldrw.U32 q7, [r5, #208] // ...................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar3__32 + veor q4, q4, q7 // ....................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................... + vldrw.U32 q7, [r5, #288] // .....................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__33 + veor q4, q4, q7 // ......................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................ 
+ vldrw.U32 q7, [r5, #368] // .......................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__34 + veor q7, q4, q7 // ........................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................... + vstrw.32 q6, [r4, #224] // .........................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................... 
+ vshr.U32 q4, q7, #31 // ..........................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................... + vstrw.32 q1, [r10, #224] // ...........................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................... + vsli.32 q4, q7, #1 // ............................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................. 
+ veor q5, q4, q5 // .............................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................. + vldrw.U32 q6, [r2, #192] // ..............................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................ // @slothy:reads=Ar2__22 + veor q1, q6, q5 // ...............................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................... 
+ vldrw.U32 q4, [r5, #112] // ................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................. // @slothy:reads=Ar3__21 + veor q7, q2, q7 // .................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................. + vldrw.U32 q6, [r5, #32] // ..................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................ 
// @slothy:reads=Ar3__20 + vshr.U32 q2, q1, #32-((43-1)/2) // ...................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................... + vsli.U32 q2, q1, #(43-1)/2 // ....................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................... + veor q1, q6, q0 // .....................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................... 
+ vstrw.32 q2, [r10, #32] // ......................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................ + vshr.U32 q6, q1, #32-(62/2) // .......................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................... + vsli.U32 q6, q1, #62/2 // ........................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................... 
+ vldrw.U32 q2, [r2, #112] // .........................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__21 + veor q2, q2, q5 // ..........................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................... + vstrw.32 q6, [r10, #320] // ...........................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................... 
+ vshr.U32 q1, q2, #32-(6/2) // ............................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................. + vsli.U32 q1, q2, #6/2 // .............................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................. + vldrw.U32 q6, [r2, #32] // ..............................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................ 
// @slothy:reads=Ar2__20 + veor q4, q4, q0 // ...............................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................... + vstrw.32 q1, [r4, #176] // ................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................. + veor q2, q6, q5 // .................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................. 
+ vshr.U32 q6, q2, #32-(62/2) // ..................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................ + vldrw.U32 q1, [r5, #352] // ...................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__24 + vsli.U32 q6, q2, #62/2 // ....................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................... 
+ veor q1, q1, q0 // .....................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................... + vstrw.32 q6, [r4, #320] // ......................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................ + vshr.U32 q6, q1, #32-((61+1)/2) // .......................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................... 
+ vsli.U32 q6, q1, #(61+1)/2 // ........................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................... + vldrw.U32 q2, [r5, #192] // .........................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__22 + vshr.U32 q1, q4, #32-(6/2) // ..........................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................... 
+ vstrw.32 q6, [r4, #144] // ...........................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................... + vsli.U32 q1, q4, #6/2 // ............................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................. + veor q2, q2, q0 // .............................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................. 
+ vldrw.U32 q6, [r5, #272] // ..............................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................ // @slothy:reads=Ar3__23 + veor q0, q6, q0 // ...............................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................... + vldrw.U32 q4, [r2, #352] // ................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................. 
// @slothy:reads=Ar2__24 + veor q6, q4, q5 // .................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................. + vldrw.U32 q4, [r2, #272] // ..................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................ // @slothy:reads=Ar2__23 + veor q4, q4, q5 // ...................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................... 
+ vstrw.32 q1, [r10, #176] // ....................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................... + vshr.U32 q1, q2, #32-((43+1)/2) // .....................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................... + vsli.U32 q1, q2, #(43+1)/2 // ......................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................ 
+ vldrw.32 q5, [r13, #0] // .......................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................... // @slothy:reads=stack0 + vshr.U32 q2, q0, #32-((15+1)/2) // ........................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................... + vstrw.32 q1, [r4, #32] // .........................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................... 
+ vsli.U32 q2, q0, #(15+1)/2 // ..........................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................... + vshr.U32 q1, q6, #32-((61-1)/2) // ...........................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................... + vstrw.32 q2, [r4, #288] // ............................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................. 
+ vsli.U32 q1, q6, #(61-1)/2 // .............................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................. + vshr.U32 q6, q4, #32-((15-1)/2) // ..............................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................ + vstrw.32 q1, [r10, #144] // ...............................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................... 
+ vsli.U32 q6, q4, #(15-1)/2 // ................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................. + vshr.U32 q0, q5, #31 // .................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................. + vstrw.32 q6, [r10, #288] // ..................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................ 
+ vsli.32 q0, q5, #1 // ...................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................... + veor q5, q0, q3 // ....................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................... + vldrw.U32 q6, [r2, #64] // .....................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar2__40 + veor q3, q6, q5 // ......................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................ + vldrw.U32 q1, [r5, #64] // .......................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................... // @slothy:reads=Ar3__40 + vshr.U32 q4, q3, #32-((27-1)/2) // ........................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................... 
+ vldrw.U32 q2, [r2, #384] // .........................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................... // @slothy:reads=Ar2__44 + vsli.U32 q4, q3, #(27-1)/2 // ..........................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................... + vldrw.U32 q0, [r5, #224] // ...........................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................... 
// @slothy:reads=Ar3__42 + veor q6, q1, q7 // ............................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................. + vstrw.32 q4, [r10, #240] // .............................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................. + veor q2, q2, q5 // ..............................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................ 
+ veor q3, q0, q7 // ...............................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................... + vldrw.U32 q0, [r2, #224] // ................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................. // @slothy:reads=Ar2__42 + vshr.U32 q4, q6, #32-((27+1)/2) // .................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................. 
+ vldrw.U32 q1, [r5, #384] // ..................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................ // @slothy:reads=Ar3__44 + vsli.U32 q4, q6, #(27+1)/2 // ...................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................... + vshr.U32 q6, q2, #32-(14/2) // ....................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................... 
+ vstrw.32 q4, [r4, #240] // .....................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................... + vsli.U32 q6, q2, #14/2 // ......................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................ + vshr.U32 q2, q3, #32-((39+1)/2) // .......................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................... 
+ vstrw.32 q6, [r4, #64] // ........................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................... + vsli.U32 q2, q3, #(39+1)/2 // .........................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................... + veor q0, q0, q5 // ..........................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................... 
+ vldrw.U32 q6, [r2, #144] // ...........................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................... // @slothy:reads=Ar2__41 + veor q4, q1, q7 // ............................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................. + veor q6, q6, q5 // .............................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................. 
+ vstrw.32 q2, [r4, #352] // ..............................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................ + vshr.U32 q2, q4, #32-(14/2) // ...............................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................... + vsli.U32 q2, q4, #14/2 // ................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................. 
+ vldrw.U32 q1, [r2, #304] // .................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................. // @slothy:reads=Ar2__43 + veor q5, q1, q5 // ..................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................ + vldrw.U32 q1, [r5, #144] // ...................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................... 
// @slothy:reads=Ar3__41 + veor q4, q1, q7 // ....................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................... + vldrw.U32 q3, [r5, #304] // .....................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................... // @slothy:reads=Ar3__43 + veor q1, q3, q7 // ......................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................ 
+ vstrw.32 q2, [r10, #64] // .......................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................... + vshr.U32 q3, q0, #32-((39-1)/2) // ........................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................... + vsli.U32 q3, q0, #(39-1)/2 // .........................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................... 
+ vldrw.U32 q7, [r4, #80] // ..........................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................... // @slothy:reads=Ar4__01 + vshr.U32 q0, q6, #32-(20/2) // ...........................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................... + vstrw.32 q3, [r10, #352] // ............................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................. 
+ vsli.U32 q0, q6, #20/2 // .............................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................. + vshr.U32 q2, q5, #32-(8/2) // ..............................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................ + vsli.U32 q2, q5, #8/2 // ...............................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................... 
+ vldrw.U32 q5, [r4, #112] // ................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................. // @slothy:reads=Ar4__21 + vshr.U32 q3, q1, #32-(8/2) // .................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................. + vsli.U32 q3, q1, #8/2 // ..................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................ 
+ vldrw.U32 q1, [r4, #128] // ...................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................... // @slothy:reads=Ar4__31 + vbic q6, q5, q0 // ....................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................... + vstrw.32 q3, [r10, #208] // .....................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................... 
+ vbic q3, q1, q5 // ......................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................ + veor q3, q0, q3 // .......................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................... + vstrw.32 q3, [r2, #96] // ........................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................... 
// @slothy:writes=Ar2__11 + vbic q3, q0, q7 // .........................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................... + veor q0, q7, q6 // ..........................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................... + vldrw.U32 q6, [r4, #144] // ...........................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................... 
// @slothy:reads=Ar4__41 + vbic q7, q7, q6 // ............................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................. + vstrw.32 q0, [r2, #80] // .............................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................. // @slothy:writes=Ar2__01 + veor q3, q6, q3 // ..............................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................ 
+ vstrw.32 q3, [r2, #144] // ...............................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................... // @slothy:writes=Ar2__41 + veor q0, q1, q7 // ................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................. + vstrw.32 q0, [r2, #128] // .................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................. 
// @slothy:writes=Ar2__31 + vbic q1, q6, q1 // ..................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................ + vshr.U32 q6, q4, #32-(20/2) // ...................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................... + vldrw.U32 q3, [r10, #112] // ....................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................... 
// @slothy:reads=Ar5__21 + vsli.U32 q6, q4, #20/2 // .....................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................... + vldrw.U32 q4, [r10, #80] // ......................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................ // @slothy:reads=Ar5__01 + veor q1, q5, q1 // .......................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................... 
+ vldrw.U32 q0, [r10, #144] // ........................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................... // @slothy:reads=Ar5__41 + vbic q7, q4, q0 // .........................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................... + vldrw.U32 q5, [r10, #128] // ..........................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................... 
// @slothy:reads=Ar5__31 + veor q7, q5, q7 // ...........................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................... + vstrw.32 q1, [r2, #112] // ............................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................. // @slothy:writes=Ar2__21 + vbic q1, q0, q5 // .............................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................. 
+ vstrw.32 q7, [r5, #128] // ..............................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................ // @slothy:writes=Ar3__31 + veor q7, q3, q1 // ...............................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................... + vstrw.32 q7, [r5, #112] // ................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................. 
// @slothy:writes=Ar3__21 + vbic q7, q5, q3 // .................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................. + vbic q1, q3, q6 // ..................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................ + vldrw.U32 q3, [r4, #176] // ...................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................... 
// @slothy:reads=Ar4__12 + veor q5, q4, q1 // ....................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................... + vbic q4, q6, q4 // .....................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................... + vldrw.U32 q1, [r4, #160] // ......................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................ 
// @slothy:reads=Ar4__02 + veor q0, q0, q4 // .......................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................... + vldrw.U32 q4, [r4, #224] // ........................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................... // @slothy:reads=Ar4__42 + veor q7, q6, q7 // .........................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................... 
+ vstrw.32 q0, [r5, #144] // ..........................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................... // @slothy:writes=Ar3__41 + vbic q0, q1, q4 // ...........................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................... + vstrw.32 q7, [r5, #96] // ............................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................. 
// @slothy:writes=Ar3__11 + veor q0, q2, q0 // .............................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................. + vstrw.32 q0, [r2, #208] // ..............................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................ // @slothy:writes=Ar2__32 + vbic q6, q3, q1 // ...............................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................... 
+ vstrw.32 q5, [r5, #80] // ................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................. // @slothy:writes=Ar3__01 + vbic q7, q4, q2 // .................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................. + vldrw.U32 q0, [r10, #160] // ..................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................ 
// @slothy:reads=Ar5__02 + veor q6, q4, q6 // ...................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................... + vldrw.U32 q5, [r4, #192] // ....................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................... // @slothy:reads=Ar4__22 + vbic q4, q2, q5 // .....................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................... 
+ vldrw.U32 q2, [r10, #224] // ......................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................ // @slothy:reads=Ar5__42 + veor q4, q3, q4 // .......................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................... + vstrw.32 q4, [r2, #176] // ........................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................... 
// @slothy:writes=Ar2__12 + vbic q4, q5, q3 // .........................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................... + vstrw.32 q6, [r2, #224] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................... // @slothy:writes=Ar2__42 + veor q4, q1, q4 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................... 
+ vldrw.U32 q1, [r10, #208] // ............................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................. // @slothy:reads=Ar5__32 + veor q3, q5, q7 // .............................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................. + vldrw.U32 q5, [r10, #192] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................ 
// @slothy:reads=Ar5__22 + vbic q6, q1, q5 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................... + vldrw.U32 q7, [r10, #176] // ................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................. // @slothy:reads=Ar5__12 + veor q6, q7, q6 // .................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................. 
+ vstrw.32 q3, [r2, #192] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................ // @slothy:writes=Ar2__22 + vbic q3, q0, q2 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................... + vstrw.32 q6, [r5, #176] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................... 
// @slothy:writes=Ar3__12 + veor q3, q1, q3 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................... + vstrw.32 q3, [r5, #208] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................ // @slothy:writes=Ar3__32 + vbic q3, q5, q7 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................... 
+ vstrw.32 q4, [r2, #160] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................... // @slothy:writes=Ar2__02 + veor q3, q0, q3 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................... + vstrw.32 q3, [r5, #160] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................... 
// @slothy:writes=Ar3__02 + vbic q6, q2, q1 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................... + vldrw.U32 q1, [r4, #288] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................. // @slothy:reads=Ar4__33 + vbic q7, q7, q0 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................. 
+ vldrw.U32 q3, [r4, #272] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................ // @slothy:reads=Ar4__23 + veor q0, q5, q6 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................... + vldrw.U32 q4, [r4, #304] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................. 
// @slothy:reads=Ar4__43 + veor q6, q2, q7 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................. + vldrw.U32 q7, [r4, #256] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................ // @slothy:reads=Ar4__13 + vbic q5, q4, q1 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................... 
+ vstrw.32 q0, [r5, #192] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................... // @slothy:writes=Ar3__22 + veor q5, q3, q5 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................... + vstrw.32 q6, [r5, #224] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................ 
// @slothy:writes=Ar3__42 + vbic q0, q3, q7 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................... + vstrw.32 q5, [r2, #272] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................... // @slothy:writes=Ar2__23 + vbic q6, q1, q3 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................... 
+ veor q5, q7, q6 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................... + vldrw.U32 q3, [r4, #240] // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................... // @slothy:reads=Ar4__03 + veor q6, q3, q0 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................. 
+ vldrw.U32 q2, [r10, #288] // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................. // @slothy:reads=Ar5__33 + vbic q0, q3, q4 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................ + vstrw.32 q6, [r2, #240] // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................... 
// @slothy:writes=Ar2__03 + vbic q7, q7, q3 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................. + vstrw.32 q5, [r2, #256] // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................. // @slothy:writes=Ar2__13 + veor q7, q4, q7 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................ 
+ vstrw.32 q7, [r2, #304] // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................... // @slothy:writes=Ar2__43 + veor q7, q1, q0 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................... + vstrw.32 q7, [r2, #288] // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................... 
// @slothy:writes=Ar2__33 + vldrw.U32 q5, [r10, #304] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................ // @slothy:reads=Ar5__43 + vbic q7, q5, q2 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................... + vldrw.U32 q3, [r10, #272] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................... 
// @slothy:reads=Ar5__23 + veor q1, q3, q7 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................... + vldrw.U32 q7, [r4, #336] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................... // @slothy:reads=Ar4__14 + vbic q4, q2, q3 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................... 
+ vldrw.U32 q6, [r10, #256] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................. // @slothy:reads=Ar5__13 + vbic q3, q3, q6 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................. + vldrw.U32 q0, [r10, #240] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................ 
// @slothy:reads=Ar5__03 + veor q3, q0, q3 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................... + vstrw.32 q1, [r5, #272] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................. // @slothy:writes=Ar3__23 + vbic q1, q0, q5 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................. 
+ vstrw.32 q3, [r5, #240] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................ // @slothy:writes=Ar3__03 + veor q1, q2, q1 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................... + vldrw.U32 q3, [r4, #384] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................... 
// @slothy:reads=Ar4__44 + vbic q2, q6, q0 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................... + vldrw.U32 q0, [r4, #320] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................ // @slothy:reads=Ar4__04 + veor q2, q5, q2 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................... 
+ vldrw.U32 q5, [r4, #352] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................... // @slothy:reads=Ar4__24 + veor q4, q6, q4 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................... + vstrw.32 q2, [r5, #304] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................... 
// @slothy:writes=Ar3__43 + vbic q2, q7, q0 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................... + vstrw.32 q1, [r5, #288] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................. // @slothy:writes=Ar3__33 + veor q1, q3, q2 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................. 
+ vstrw.32 q1, [r2, #384] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................ // @slothy:writes=Ar2__44 + vbic q2, q5, q7 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................... + vstrw.32 q4, [r5, #256] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................. 
// @slothy:writes=Ar3__13 + veor q4, q0, q2 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................. + vstrw.32 q4, [r2, #320] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................ // @slothy:writes=Ar2__04 + vbic q2, q0, q3 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................... 
+ vldrw.U32 q4, [r4, #368] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................... // @slothy:reads=Ar4__34 + vbic q3, q3, q4 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................... + vldrw.U32 q0, [r10, #320] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................ 
// @slothy:reads=Ar5__04 + veor q1, q5, q3 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................... + vldrw.U32 q6, [r10, #336] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................... // @slothy:reads=Ar5__14 + vbic q5, q4, q5 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................... 
+ vstrw.32 q1, [r2, #352] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................... // @slothy:writes=Ar2__24 + veor q5, q7, q5 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................... + vstrw.32 q5, [r2, #336] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................. 
// @slothy:writes=Ar2__14 + veor q3, q4, q2 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................. + vstrw.32 q3, [r2, #368] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................ // @slothy:writes=Ar2__34 + vbic q7, q6, q0 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................... 
+ vldrw.U32 q5, [r10, #352] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................. // @slothy:reads=Ar5__24 + vbic q3, q5, q6 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................. + vldrw.U32 q1, [r10, #368] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................ 
// @slothy:reads=Ar5__34 + vbic q4, q1, q5 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................... + vldrw.U32 q2, [r4, #16] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................... // @slothy:reads=Ar4__10 + veor q6, q6, q4 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................... 
+ vldrw.U32 q4, [r10, #384] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................ // @slothy:reads=Ar5__44 + veor q3, q0, q3 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................... + vstrw.32 q3, [r5, #320] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................... 
// @slothy:writes=Ar3__04 + veor q3, q4, q7 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................... + vstrw.32 q3, [r5, #384] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................... // @slothy:writes=Ar3__44 + vbic q0, q0, q4 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................... 
+ vstrw.32 q6, [r5, #336] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................. // @slothy:writes=Ar3__14 + veor q3, q1, q0 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................. + vstrw.32 q3, [r5, #368] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................ 
// @slothy:writes=Ar3__34 + vbic q7, q4, q1 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................... + veor q5, q5, q7 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................. + vldrw.U32 q6, [r4, #32] // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................. 
// @slothy:reads=Ar4__20 + vbic q3, q6, q2 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................ + vldrw.U32 q4, [r4, #48] // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................... // @slothy:reads=Ar4__30 + vbic q0, q4, q6 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................... 
+ vldrw.U32 q1, [r4, #0] // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................... // @slothy:reads=Ar4__00 + veor q0, q2, q0 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................ + vldrw.U32 q7, [r4, #64] // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................... 
// @slothy:reads=Ar4__40 + veor q3, q1, q3 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................... + vstrw.32 q5, [r5, #352] // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................... // @slothy:writes=Ar3__24 + vbic q5, q1, q7 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................... 
+ vstrw.32 q0, [r2, #16] // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................... // @slothy:writes=Ar2__10 + veor q0, q4, q5 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................. + vstrw.32 q0, [r2, #48] // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................. 
// @slothy:writes=Ar2__30 + vbic q5, q2, q1 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................ + veor q2, q7, q5 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................... + vldrw.U32 q0, [r10, #16] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................. 
// @slothy:reads=Ar5__10 + vbic q5, q7, q4 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................. + vldrw.U32 q4, [r10, #0] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................ // @slothy:reads=Ar5__00 + vbic q1, q0, q4 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................... 
+ vldrw.U32 q7, [r10, #64] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................... // @slothy:reads=Ar5__40 + veor q1, q7, q1 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................... + vstrw.32 q2, [r2, #64] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................ 
// @slothy:writes=Ar2__40 + veor q2, q6, q5 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................... + vbic q6, q4, q7 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................... + vldrw.U32 q5, [r10, #48] // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................... 
// @slothy:reads=Ar5__30 + veor q6, q5, q6 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................... + ldrd r7, r8, [r6] // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................... + vbic q7, q7, q5 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................. 
+ vstrw.32 q1, [r5, #64] // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................. // @slothy:writes=Ar3__40 + vdup.32 q1, r7 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................ + veor q1, q3, q1 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............... 
+ vldrw.U32 q3, [r10, #32] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............. // @slothy:reads=Ar5__20 + veor q7, q3, q7 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............. + add r6, r6, #8 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............ 
+ vbic q5, q5, q3 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........... + vstrw.32 q6, [r5, #48] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......... // @slothy:writes=Ar3__30 + vbic q6, q3, q0 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......... 
+ vstrw.32 q1, [r2, #0] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........ // @slothy:writes=Ar2__00 + veor q5, q0, q5 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....... + vstrw.32 q7, [r5, #32] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...... 
// @slothy:writes=Ar3__20 + veor q4, q4, q6 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..... + vstrw.32 q5, [r5, #16] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.... // @slothy:writes=Ar3__10 + vdup.32 q6, r8 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*... 
+ vstrw.32 q2, [r2, #32] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.. // @slothy:writes=Ar2__20 + veor q0, q4, q6 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*. 
+ vstrw.32 q0, [r5, #0] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................* // @slothy:writes=Ar3__00 + + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ cycle (expected) ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 250 275 300 325 350 375 400 425 450 475 500 525 550 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|-------- + // add r5, r2, #400 // 
....*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // add r10, r4, #400 // ...............................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #80] // .....*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q5, q6, q0 // ......*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q4, [r5, #160] // ...........*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #240] // .................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // veor q4, q5, q4 // ............*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // veor q4, q4, q6 // ..................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q6, [r5, #320] // .......................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q5, q4, q6 // ........................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r2, #112] // *.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // veor q4, q6, q2 // .*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // vldrw.U32 q6, [r2, #192] // .......*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q4, q4, q6 // ........*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r2, #272] // .............*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // veor q4, q4, q6 // ..............*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q6, [r2, #352] // ...................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q3, q4, q6 // ....................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q0, q3, q5 // ..........................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vstrw.32 q5, [r13, #0] // .............................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q6, [r2, #80] // ..*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // veor q4, q6, q1 // ...*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r2, #160] // .........*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q4, q4, q6 // ..........*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q6, [r2, #240] // ...............*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q4, q4, q6 // ................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q6, [r2, #320] // .....................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q2, q4, q6 // ......................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q4, [r5, #32] // .........................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #112] // .................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // veor q4, q4, q6 // ..................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q6, [r5, #192] // ...................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q4, q4, q6 // ....................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q6, [r5, #272] // .....................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q4, q4, q6 // ......................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q6, [r5, #352] // .......................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q5, q4, q6 // ........................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vshr.U32 q1, q5, #31 // ..........................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vsli.32 q1, q5, #1 // ...........................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q7, q1, q2 // .............................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q4, [r2, #16] // ............................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q6, [r5, #16] // ...........................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q4, q4, q7 // ..............................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // veor q6, q6, q0 // ............................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vstrw.32 q4, [r10, #160] // .................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // vshr.U32 q1, q6, #32-((1+1)/2) // ..............................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vsli.U32 q1, q6, #(1+1)/2 // ................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vstrw.32 q1, [r4, #160] // .........................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q4, [r2, #96] // ....................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #96] // ...............................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q4, q4, q7 // .....................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q6, q6, q0 // ................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vshr.U32 q1, q4, #32-(44/2) // .......................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q1, q4, #44/2 // ........................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vstrw.32 q1, [r4, #16] // .............................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vshr.U32 q4, q6, #32-(44/2) // ..................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vsli.U32 q4, q6, #44/2 // ...................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vstrw.32 q4, [r10, #16] // ......................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q1, [r2, #176] // ...........................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #176] // ..........................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q4, q6, q0 // ...........................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q1, q1, q7 // ............................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vshr.U32 q6, q1, #32-(10/2) // ..............................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // vsli.U32 q6, q1, #10/2 // ...............................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r4, #272] // ..................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vshr.U32 q6, q4, #32-(10/2) // .............................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // vsli.U32 q6, q4, #10/2 // ..............................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vstrw.32 q6, [r10, #272] // .................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q4, [r2, #256] // ................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // vldrw.U32 q6, [r5, #256] // ...............................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q1, q4, q7 // .................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // veor q4, q6, q0 // ................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // vshr.U32 q6, q1, #32-((45-1)/2) // ...................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q1, #(45-1)/2 // ....................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r10, #128] // .......................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vshr.U32 q6, q4, #32-((45+1)/2) // ......................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vsli.U32 q6, q4, #(45+1)/2 // ........................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r4, #128] // ................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // vldrw.U32 q4, [r2, #336] // .....................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #336] // .........................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q1, q4, q7 // ......................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // veor q4, q6, q0 // ..........................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vshr.U32 q6, q1, #32-(2/2) // ........................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q1, #2/2 // .........................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vstrw.32 q6, [r4, #384] // ............................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vshr.U32 q6, q4, #32-(2/2) // ..................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vsli.U32 q6, q4, #2/2 // ....................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vstrw.32 q6, [r10, #384] // .......................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q4, [r5, #64] // ...................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #144] // .....................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q4, q4, q6 // .........................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #224] // ..........................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q4, q4, q6 // ...........................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q6, [r5, #304] // ............................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // veor q4, q4, q6 // .............................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q6, [r5, #384] // ..............................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // veor q7, q4, q6 // ...............................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vshr.U32 q6, q7, #31 // .................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vsli.32 q6, q7, #1 // ..................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // veor q0, q6, q3 // ....................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q4, [r2, #64] // .....................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r2, #144] // ...................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q4, q4, q6 // ......................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q6, [r2, #224] // .......................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q4, q4, q6 // ........................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q6, [r2, #304] // .........................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q4, q4, q6 // ..........................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r2, #384] // ...........................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q3, q4, q6 // ............................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // veor q5, q3, q5 // ..............................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q4, [r2, #48] // .............................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // vldrw.U32 q6, [r5, #48] // ...............................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q1, q4, q0 // .............................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // veor q4, q6, q5 // ................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // vshr.U32 q6, q1, #32-(28/2) // ...............................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q1, #28/2 // ................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vstrw.32 q6, [r4, #80] // ...................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vshr.U32 q6, q4, #32-(28/2) // .................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vsli.U32 q6, q4, #28/2 // ..................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vstrw.32 q6, [r10, #80] // .....................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q6, [r2, #128] // ...................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q1, q6, q0 // ....................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #128] // ........................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q4, q6, q5 // .........................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vshr.U32 q6, q1, #32-((55-1)/2) // ......................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vsli.U32 q6, q1, #(55-1)/2 // .......................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vstrw.32 q6, [r10, #336] // ..........................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vshr.U32 q6, q4, #32-((55+1)/2) // ...........................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q4, #(55+1)/2 // ............................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // vstrw.32 q6, [r4, #336] // ..............................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q4, [r2, #208] // .................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q6, [r5, #208] // .....................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q1, q4, q0 // ..................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................................ + // veor q4, q6, q5 // ......................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................ + // vshr.U32 q6, q1, #32-((25-1)/2) // ....................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vsli.U32 q6, q1, #(25-1)/2 // .....................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r10, #192] // ...........................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vshr.U32 q6, q4, #32-((25+1)/2) // ..........................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vsli.U32 q6, q4, #(25+1)/2 // ...........................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r4, #192] // ......................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q4, [r2, #288] // ......................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // vldrw.U32 q6, [r5, #288] // .......................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q1, q4, q0 // .......................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................................................... + // veor q4, q6, q5 // ........................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vshr.U32 q6, q1, #32-((21-1)/2) // .............................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................. + // vsli.U32 q6, q1, #(21-1)/2 // ...............................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r10, #48] // ..................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................ 
+ // vshr.U32 q6, q4, #32-((21+1)/2) // .............................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................. + // vsli.U32 q6, q4, #(21+1)/2 // ...............................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r4, #48] // ........................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q4, [r2, #368] // ........................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #368] // ..............................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................................ + // veor q1, q4, q0 // .........................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q4, q6, q5 // .................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................................. + // vshr.U32 q6, q1, #32-(56/2) // ..........................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q1, #56/2 // ............................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // vstrw.32 q6, [r4, #304] // ................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................................. + // vshr.U32 q6, q4, #32-(56/2) // ...................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q4, #56/2 // ....................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vstrw.32 q6, [r10, #304] // .........................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q4, [r5, #16] // ..............................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q6, [r5, #96] // ............................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................................. 
+ // veor q4, q4, q6 // .................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q6, [r5, #176] // ................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................................. + // veor q4, q4, q6 // ...................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q6, [r5, #256] // ..................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................................ + // veor q4, q4, q6 // .....................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #336] // ....................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q0, q4, q6 // .......................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................................... + // vshr.U32 q6, q0, #31 // .........................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................................... + // vsli.32 q6, q0, #1 // ..........................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q3, q6, q3 // ............................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q6, [r2, #96] // .............................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q4, [r2, #16] // ...........................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q4, q4, q6 // ..............................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q6, [r2, #176] // ...............................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................... + // veor q4, q4, q6 // ................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................................. 
+ // vldrw.U32 q6, [r2, #256] // .................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................. + // veor q4, q4, q6 // ..................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q6, [r2, #336] // ...................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q5, q4, q6 // ....................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................................... + // veor q7, q5, q7 // ......................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q4, [r2, #0] // .......................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q6, [r5, #0] // .....................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................................... + // veor q1, q4, q3 // .........................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................................... + // veor q4, q6, q7 // ........................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................................... 
+ // vshr.U32 q6, q1, #32-(0/2) // ...............................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q1, #0/2 // ................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................................. + // vstrw.32 q6, [r4, #0] // ...................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................... 
+ // vshr.U32 q6, q4, #32-(0/2) // ..........................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q4, #0/2 // ...........................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r10, #0] // ..............................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................ 
+ // vldrw.U32 q4, [r2, #80] // ............................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q6, [r5, #80] // .................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................. + // veor q1, q4, q3 // .............................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................................. 
+ // veor q4, q6, q7 // ..................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................................ + // vshr.U32 q6, q1, #32-(36/2) // ....................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q1, #36/2 // .....................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................................... 
+ // vstrw.32 q6, [r4, #256] // ........................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................................... + // vshr.U32 q6, q4, #32-(36/2) // .........................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q4, #36/2 // ..........................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................... 
+ // vstrw.32 q6, [r10, #256] // .............................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q4, [r2, #160] // ......................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q6, [r5, #160] // ..........................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................... 
+ // veor q1, q4, q3 // .......................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................................... + // veor q4, q6, q7 // ...........................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................... + // vshr.U32 q6, q1, #32-((3-1)/2) // ..............................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................ 
+ // vsli.U32 q6, q1, #(3-1)/2 // ...............................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r10, #112] // ..................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................ + // vshr.U32 q6, q4, #32-((3+1)/2) // .............................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................. 
+ // vsli.U32 q6, q4, #(3+1)/2 // ..............................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................ + // vstrw.32 q6, [r4, #112] // ...................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q4, [r2, #240] // ...........................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q6, [r5, #240] // .....................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................................... + // veor q1, q4, q3 // ............................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................................. + // veor q4, q6, q7 // ......................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................ 
+ // vshr.U32 q6, q1, #32-((41-1)/2) // ...................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q1, #(41-1)/2 // ....................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r10, #368] // .......................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................................... 
+ // vshr.U32 q6, q4, #32-((41+1)/2) // ........................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q4, #(41+1)/2 // .........................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r4, #368] // ............................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................................. 
+ // vldrw.U32 q4, [r2, #320] // ................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q6, [r5, #320] // ...............................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................... + // veor q1, q4, q3 // .................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................................. 
+ // veor q4, q6, q7 // ................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................................. + // vshr.U32 q6, q1, #32-(18/2) // ..................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................ + // vsli.U32 q6, q1, #18/2 // ....................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................................... 
+ // vstrw.32 q6, [r4, #224] // .........................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................... + // vshr.U32 q6, q4, #32-(18/2) // .....................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q4, #18/2 // .......................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................................... 
+ // vstrw.32 q6, [r10, #224] // ...........................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q4, [r2, #48] // ......................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q6, [r2, #128] // ........................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................................... 
+ // veor q4, q4, q6 // .........................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r2, #208] // ..........................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................... + // veor q4, q4, q6 // ...........................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q6, [r2, #288] // ............................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................................. + // veor q4, q4, q6 // .............................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q6, [r2, #368] // .................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................................. 
+ // veor q3, q4, q6 // ...............................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................... + // veor q0, q3, q0 // .................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q4, [r5, #48] // ................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................................. 
+ // vldrw.U32 q6, [r5, #128] // ..............................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................................ + // veor q4, q4, q6 // ..................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q6, [r5, #208] // ...................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................... 
+ // veor q4, q4, q6 // ....................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #288] // .....................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................................... + // veor q4, q4, q6 // ......................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................................ 
+ // vldrw.U32 q6, [r5, #368] // .......................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................................... + // veor q7, q4, q6 // ........................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................................... + // vshr.U32 q6, q7, #31 // ..........................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................... 
+ // vsli.32 q6, q7, #1 // ............................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................................. + // veor q5, q6, q5 // .............................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................. + // vldrw.U32 q4, [r2, #32] // ..............................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................ 
+ // vldrw.U32 q6, [r5, #32] // ..................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................ + // veor q1, q4, q5 // .................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................. + // veor q4, q6, q0 // .....................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................................... 
+ // vshr.U32 q6, q1, #32-(62/2) // ..................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................ + // vsli.U32 q6, q1, #62/2 // ....................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r4, #320] // ......................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................ 
+ // vshr.U32 q6, q4, #32-(62/2) // .......................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q4, #62/2 // ........................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r10, #320] // ...........................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q4, [r2, #112] // .........................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #112] // ................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................................. + // veor q1, q4, q5 // ..........................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................... 
+ // veor q4, q6, q0 // ...............................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................... + // vshr.U32 q6, q1, #32-(6/2) // ............................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................................. + // vsli.U32 q6, q1, #6/2 // .............................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................. 
+ // vstrw.32 q6, [r4, #176] // ................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................................. + // vshr.U32 q6, q4, #32-(6/2) // ..........................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q4, #6/2 // ............................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................................. 
+ // vstrw.32 q6, [r10, #176] // ....................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................... + // vldrw.U32 q4, [r2, #192] // ..............................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................................................ + // vldrw.U32 q6, [r5, #192] // .........................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................................... 
+ // veor q1, q4, q5 // ...............................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................... + // veor q4, q6, q0 // .............................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................. + // vshr.U32 q6, q1, #32-((43-1)/2) // ...................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................... 
+ // vsli.U32 q6, q1, #(43-1)/2 // ....................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r10, #32] // ......................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................................ + // vshr.U32 q6, q4, #32-((43+1)/2) // .....................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................... 
+ // vsli.U32 q6, q4, #(43+1)/2 // ......................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................ + // vstrw.32 q6, [r4, #32] // .........................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................................... + // vldrw.U32 q4, [r2, #272] // ..................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................ 
+ // vldrw.U32 q6, [r5, #272] // ..............................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................ + // veor q1, q4, q5 // ...................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................... + // veor q4, q6, q0 // ...............................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................... 
+ // vshr.U32 q6, q1, #32-((15-1)/2) // ..............................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................ + // vsli.U32 q6, q1, #(15-1)/2 // ................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................. + // vstrw.32 q6, [r10, #288] // ..................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................ 
+ // vshr.U32 q6, q4, #32-((15+1)/2) // ........................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q4, #(15+1)/2 // ..........................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................... + // vstrw.32 q6, [r4, #288] // ............................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................................. 
+ // vldrw.U32 q4, [r2, #352] // ................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................................................. + // vldrw.U32 q6, [r5, #352] // ...................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................................................... + // veor q1, q4, q5 // .................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................. 
+ // veor q4, q6, q0 // .....................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................................................... + // vshr.U32 q6, q1, #32-((61-1)/2) // ...........................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q1, #(61-1)/2 // .............................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................. 
+ // vstrw.32 q6, [r10, #144] // ...............................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................... + // vshr.U32 q6, q4, #32-((61+1)/2) // .......................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................................... + // vsli.U32 q6, q4, #(61+1)/2 // ........................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................................................... 
+ // vstrw.32 q6, [r4, #144] // ...........................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................................................... + // veor q7, q2, q7 // .................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................................................................. + // vldrw.32 q4, [r13, #0] // .......................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................................... 
+ // vshr.U32 q6, q4, #31 // .................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................................. + // vsli.32 q6, q4, #1 // ...................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................... + // veor q5, q6, q3 // ....................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q4, [r2, #64] // .....................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #64] // .......................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................................... + // veor q1, q4, q5 // ......................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................................ 
+ // veor q4, q6, q7 // ............................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................. + // vshr.U32 q6, q1, #32-((27-1)/2) // ........................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................................... + // vsli.U32 q6, q1, #(27-1)/2 // ..........................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................................... 
+ // vstrw.32 q6, [r10, #240] // .............................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................. + // vshr.U32 q6, q4, #32-((27+1)/2) // .................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................. + // vsli.U32 q6, q4, #(27+1)/2 // ...................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................... 
+ // vstrw.32 q6, [r4, #240] // .....................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................................... + // vldrw.U32 q4, [r2, #224] // ................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................................. + // vldrw.U32 q6, [r5, #224] // ...........................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................... 
+ // veor q1, q4, q5 // ..........................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................... + // veor q4, q6, q7 // ...............................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................... + // vshr.U32 q6, q1, #32-((39-1)/2) // ........................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................... 
+ // vsli.U32 q6, q1, #(39-1)/2 // .........................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................... + // vstrw.32 q6, [r10, #352] // ............................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................. + // vshr.U32 q6, q4, #32-((39+1)/2) // .......................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................................... 
+ // vsli.U32 q6, q4, #(39+1)/2 // .........................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................... + // vstrw.32 q6, [r4, #352] // ..............................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................ + // vldrw.U32 q4, [r2, #384] // .........................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................................................................... 
+ // vldrw.U32 q6, [r5, #384] // ..................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................................ + // veor q1, q4, q5 // ..............................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................................ + // veor q4, q6, q7 // ............................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................................. 
+ // vshr.U32 q6, q1, #32-(14/2) // ....................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................................... + // vsli.U32 q6, q1, #14/2 // ......................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................................ + // vstrw.32 q6, [r4, #64] // ........................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................................................... 
+ // vshr.U32 q6, q4, #32-(14/2) // ...............................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................... + // vsli.U32 q6, q4, #14/2 // ................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................................. + // vstrw.32 q6, [r10, #64] // .......................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................................... 
+ // vldrw.U32 q6, [r2, #304] // .................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................. + // vldrw.U32 q4, [r5, #304] // .....................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................................... + // veor q6, q6, q5 // ..................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................................ 
+ // veor q4, q4, q7 // ......................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................ + // vshr.U32 q2, q6, #32-(8/2) // ..............................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................ + // vsli.U32 q2, q6, #8/2 // ...............................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................... 
+ // vshr.U32 q6, q4, #32-(8/2) // .................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................. + // vsli.U32 q6, q4, #8/2 // ..................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................................ + // vstrw.32 q6, [r10, #208] // .....................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................................... 
+ // vldrw.U32 q4, [r2, #144] // ...........................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................... + // vldrw.U32 q6, [r5, #144] // ...................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................................... + // veor q4, q4, q5 // .............................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................................. 
+ // veor q6, q6, q7 // ....................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................................... + // vshr.U32 q3, q4, #32-(20/2) // ...........................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................... + // vsli.U32 q3, q4, #20/2 // .............................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................................. 
+ // vshr.U32 q0, q6, #32-(20/2) // ...................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................... + // vsli.U32 q0, q6, #20/2 // .....................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................................... + // vldrw.U32 q7, [r4, #80] // ..........................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................................... 
+ // vldrw.U32 q5, [r4, #112] // ................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................................. + // vbic q6, q5, q3 // ....................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................................... + // veor q6, q7, q6 // ..........................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................... 
+ // vstrw.32 q6, [r2, #80] // .............................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................. + // vldrw.U32 q1, [r4, #128] // ...................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................... + // vbic q6, q1, q5 // ......................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................................ 
+ // veor q6, q3, q6 // .......................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................................... + // vstrw.32 q6, [r2, #96] // ........................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................................... + // vldrw.U32 q4, [r4, #144] // ...........................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................... 
+ // vbic q6, q4, q1 // ..................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................ + // veor q6, q5, q6 // .......................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................................... + // vstrw.32 q6, [r2, #112] // ............................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................. 
+ // vbic q6, q7, q4 // ............................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................................. + // veor q6, q1, q6 // ................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................................. + // vstrw.32 q6, [r2, #128] // .................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................. 
+ // vbic q6, q3, q7 // .........................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................................... + // veor q6, q4, q6 // ..............................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................................ + // vstrw.32 q6, [r2, #144] // ...............................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................................... 
+ // vldrw.U32 q7, [r10, #80] // ......................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................................ + // vldrw.U32 q5, [r10, #112] // ....................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................................... + // vbic q6, q5, q0 // ..................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................ 
+ // veor q6, q7, q6 // ....................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................................... + // vstrw.32 q6, [r5, #80] // ................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................. + // vldrw.U32 q1, [r10, #128] // ..........................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................................... 
+ // vbic q6, q1, q5 // .................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................. + // veor q6, q0, q6 // .........................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................... + // vstrw.32 q6, [r5, #96] // ............................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................. 
+ // vldrw.U32 q4, [r10, #144] // ........................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................................... + // vbic q6, q4, q1 // .............................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................. + // veor q6, q5, q6 // ...............................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................................... 
+ // vstrw.32 q6, [r5, #112] // ................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................................................. + // vbic q6, q7, q4 // .........................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................................................... + // veor q6, q1, q6 // ...........................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................... 
+ // vstrw.32 q6, [r5, #128] // ..............................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................................ + // vbic q6, q0, q7 // .....................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................................... + // veor q6, q4, q6 // .......................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................................... 
+ // vstrw.32 q6, [r5, #144] // ..........................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................................... + // vldrw.U32 q7, [r4, #160] // ......................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................ + // vldrw.U32 q5, [r4, #176] // ...................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................................... 
+ // vldrw.U32 q1, [r4, #192] // ....................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................................... + // vbic q6, q1, q5 // .........................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................................... + // veor q6, q7, q6 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................... 
+ // vstrw.32 q6, [r2, #160] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................... + // vbic q6, q2, q1 // .....................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................................... + // veor q6, q5, q6 // .......................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................................... 
+ // vstrw.32 q6, [r2, #176] // ........................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................... + // vldrw.U32 q4, [r4, #224] // ........................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................................................................... + // vbic q6, q4, q2 // .................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................. 
+ // veor q6, q1, q6 // .............................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................. + // vstrw.32 q6, [r2, #192] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................ + // vbic q6, q7, q4 // ...........................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................... 
+ // veor q6, q2, q6 // .............................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................. + // vstrw.32 q6, [r2, #208] // ..............................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................................ + // vbic q6, q5, q7 // ...............................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................... 
+ // veor q6, q4, q6 // ...................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................... + // vstrw.32 q6, [r2, #224] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................... + // vldrw.U32 q0, [r10, #160] // ..................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................................ 
+ // vldrw.U32 q7, [r10, #176] // ................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................................. + // vldrw.U32 q5, [r10, #192] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................................ + // vbic q6, q5, q7 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................................... 
+ // veor q6, q0, q6 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................................... + // vstrw.32 q6, [r5, #160] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................... + // vldrw.U32 q1, [r10, #208] // ............................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................................. 
+ // vbic q6, q1, q5 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................... + // veor q6, q7, q6 // .................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................................. + // vstrw.32 q6, [r5, #176] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................................... 
+ // vldrw.U32 q4, [r10, #224] // ......................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................................ + // vbic q6, q4, q1 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................... + // veor q6, q5, q6 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................... 
+ // vstrw.32 q6, [r5, #192] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................................... + // vbic q6, q0, q4 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................... + // veor q6, q1, q6 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................................... 
+ // vstrw.32 q6, [r5, #208] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................................ + // vbic q6, q7, q0 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................. + // veor q6, q4, q6 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................. 
+ // vstrw.32 q6, [r5, #224] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................ + // vldrw.U32 q0, [r4, #240] // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................... + // vldrw.U32 q7, [r4, #256] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................................ 
+ // vldrw.U32 q5, [r4, #272] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................................ + // vbic q6, q5, q7 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................................... + // veor q6, q0, q6 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................. 
+ // vstrw.32 q6, [r2, #240] // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................... + // vldrw.U32 q1, [r4, #288] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................................. + // vbic q6, q1, q5 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................................... 
+ // veor q6, q7, q6 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................................... + // vstrw.32 q6, [r2, #256] // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................. + // vldrw.U32 q4, [r4, #304] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................................. 
+ // vbic q6, q4, q1 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................................... + // veor q6, q5, q6 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................................... + // vstrw.32 q6, [r2, #272] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................................... 
+ // vbic q6, q0, q4 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................ + // veor q6, q1, q6 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................................... + // vstrw.32 q6, [r2, #288] // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................................... 
+ // vbic q6, q7, q0 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................................. + // veor q6, q4, q6 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................................ + // vstrw.32 q6, [r2, #304] // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................... 
+ // vldrw.U32 q0, [r10, #240] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................ + // vldrw.U32 q7, [r10, #256] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................. + // vldrw.U32 q5, [r10, #272] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................................... 
+ // vbic q6, q5, q7 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................. + // veor q6, q0, q6 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................... + // vstrw.32 q6, [r5, #240] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................ 
+ // vldrw.U32 q1, [r10, #288] // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................................. + // vbic q6, q1, q5 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................................... + // veor q6, q7, q6 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................... 
+ // vstrw.32 q6, [r5, #256] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................. + // vldrw.U32 q4, [r10, #304] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................................ + // vbic q6, q4, q1 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................................... 
+ // veor q6, q5, q6 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................................................... + // vstrw.32 q6, [r5, #272] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................................................. + // vbic q6, q0, q4 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................................. 
+ // veor q6, q1, q6 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................... + // vstrw.32 q6, [r5, #288] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................. + // vbic q6, q7, q0 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................................... 
+ // veor q6, q4, q6 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................................... + // vstrw.32 q6, [r5, #304] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................... + // vldrw.U32 q0, [r4, #320] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................................ 
+ // vldrw.U32 q7, [r4, #336] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................................................... + // vldrw.U32 q5, [r4, #352] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................................... + // vbic q6, q5, q7 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................... 
+ // veor q6, q0, q6 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................. + // vstrw.32 q6, [r2, #320] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................................ + // vldrw.U32 q1, [r4, #368] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................... 
+ // vbic q6, q1, q5 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................................... + // veor q6, q7, q6 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................... + // vstrw.32 q6, [r2, #336] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................. 
+ // vldrw.U32 q4, [r4, #384] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................................................... + // vbic q6, q4, q1 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................................... + // veor q6, q5, q6 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................................... 
+ // vstrw.32 q6, [r2, #352] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................................... + // vbic q6, q0, q4 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................... + // veor q6, q1, q6 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................. 
+ // vstrw.32 q6, [r2, #368] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................ + // vbic q6, q7, q0 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................... + // veor q6, q4, q6 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................. 
+ // vstrw.32 q6, [r2, #384] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................................................ + // vldrw.U32 q0, [r10, #320] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................................ + // vldrw.U32 q7, [r10, #336] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................................... 
+ // vldrw.U32 q5, [r10, #352] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................................. + // vbic q6, q5, q7 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................. + // veor q6, q0, q6 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................................... 
+ // vstrw.32 q6, [r5, #320] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................................... + // vldrw.U32 q1, [r10, #368] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................ + // vbic q6, q1, q5 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................... 
+ // veor q6, q7, q6 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................................... + // vstrw.32 q6, [r5, #336] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................. + // vldrw.U32 q4, [r10, #384] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................................ 
+ // vbic q6, q4, q1 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................... + // veor q6, q5, q6 // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................................. + // vstrw.32 q6, [r5, #352] // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................... 
+ // vbic q6, q0, q4 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................... + // veor q6, q1, q6 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................. + // vstrw.32 q6, [r5, #368] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................................ 
+ // vbic q6, q7, q0 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................................... + // veor q6, q4, q6 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................................................... + // vstrw.32 q6, [r5, #384] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................................... 
+ // vldrw.U32 q0, [r4, #16] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................................... + // vldrw.U32 q7, [r4, #32] // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................. + // vldrw.U32 q5, [r4, #48] // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................... 
+ // vbic q6, q5, q7 // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................................... + // veor q6, q0, q6 // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................................ + // vstrw.32 q6, [r2, #16] // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................... 
+ // vldrw.U32 q1, [r4, #64] // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................................... + // vbic q6, q1, q5 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................. + // veor q2, q7, q6 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....................... 
+ // vstrw.32 q2, [r2, #32] // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.. + // vldrw.U32 q4, [r4, #0] // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................................... + // vbic q6, q4, q1 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................... 
+ // veor q6, q5, q6 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................................. + // vstrw.32 q6, [r2, #48] // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................. + // vbic q6, q0, q4 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................................ 
+ // veor q6, q1, q6 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................... + // vstrw.32 q6, [r2, #64] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................ + // vbic q6, q7, q0 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................................ 
+ // veor q3, q4, q6 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................................... + // vldrw.U32 q0, [r10, #16] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............................. + // vldrw.U32 q7, [r10, #32] // ................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.............. 
+ // vldrw.U32 q5, [r10, #48] // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..................... + // vbic q6, q5, q7 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........... + // veor q6, q0, q6 // .......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*....... 
+ // vstrw.32 q6, [r5, #16] // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.... + // vldrw.U32 q1, [r10, #64] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......................... + // vbic q6, q1, q5 // ............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................. 
+ // veor q6, q7, q6 // .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............. + // vstrw.32 q6, [r5, #32] // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...... + // vldrw.U32 q4, [r10, #0] // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............................ 
+ // vbic q6, q4, q1 // ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*...................... + // veor q6, q5, q6 // ..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.................... + // vstrw.32 q6, [r5, #48] // ....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*.......... 
+ // vbic q6, q0, q4 // ...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........................... + // veor q6, q1, q6 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......................... + // vstrw.32 q6, [r5, #64] // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................. 
+ // vbic q6, q7, q0 // .....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*......... + // veor q4, q4, q6 // .........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*..... + // ldrd r10, r8, [r6] // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................... 
+ // vdup.32 q6, r10 // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*................ + // veor q1, q3, q6 // ...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............... + // vstrw.32 q1, [r2, #0] // ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*........ 
+ // vdup.32 q6, r8 // ...........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*... + // veor q0, q4, q6 // .............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*. 
+ // vstrw.32 q0, [r5, #0] // ..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................* + // add r6, r6, #8 // ..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................*............ 
+ + keccak_f1600_x4_mve_asm_roundend_pre: + + + le lr, keccak_f1600_x4_mve_asm_roundstart +keccak_f1600_x4_mve_asm_roundend: + add sp, #8*16 + + vpop {d8-d15} + ldmia.w sp!, {r3,r4,r5,r6,r7,r8,r9,r10,r11,r12, pc} + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq qA00_h + .unreq qA00_l + .unreq qA20_l + +/* simpasm: footer-start */ +#endif /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/fips202/armv81m/src/keccak_f1600_x4_mve.c b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.c new file mode 100644 index 000000000..8fd1edb76 --- /dev/null +++ b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../../common.h" +#include "../../../../ct.h" + +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include <stdint.h> +#include "fips202_native_armv81m.h" + +/* + * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations.
+ * TODO: Replace with optimized MVE assembly implementations + * (as a part of XORBytes and ExtractBytes) + */ + +/* Extract even-indexed bits from 64-bit value into lower 32 bits */ +static uint32_t bitinterleave_even(uint64_t x) +{ + uint64_t t; + t = x & 0x5555555555555555ULL; + t = (t | (t >> 1)) & 0x3333333333333333ULL; + t = (t | (t >> 2)) & 0x0f0f0f0f0f0f0f0fULL; + t = (t | (t >> 4)) & 0x00ff00ff00ff00ffULL; + t = (t | (t >> 8)) & 0x0000ffff0000ffffULL; + t = (t | (t >> 16)) & 0x00000000ffffffffULL; + return (uint32_t)t; +} + +/* Extract odd-indexed bits from 64-bit value into lower 32 bits */ +static uint32_t bitinterleave_odd(uint64_t x) +{ + return bitinterleave_even(x >> 1); +} + +/* Spread 32-bit value across even bit positions of 64-bit result */ +static uint64_t spread_even(uint32_t x) +{ + uint64_t t = x; + t = (t | (t << 16)) & 0x0000ffff0000ffffULL; + t = (t | (t << 8)) & 0x00ff00ff00ff00ffULL; + t = (t | (t << 4)) & 0x0f0f0f0f0f0f0f0fULL; + t = (t | (t << 2)) & 0x3333333333333333ULL; + t = (t | (t << 1)) & 0x5555555555555555ULL; + return t; +} + +/* Combine even and odd 32-bit halves into interleaved 64-bit value */ +static uint64_t bitdeinterleave(uint32_t even, uint32_t odd) +{ + return spread_even(even) | (spread_even(odd) << 1); +} + +/* + * TEMPORARY: Naive C interleaving functions. + * These will be replaced with optimized MVE assembly implementations. 
+ */ +static void interleave_4fold(uint64_t *state_4x, const uint64_t *state0, + const uint64_t *state1, const uint64_t *state2, + const uint64_t *state3) +{ + uint32_t *state_4xl = (uint32_t *)state_4x; + uint32_t *state_4xh = (uint32_t *)state_4x + 100; + + for (size_t i = 0; i < 25; i++) + { + state_4xl[i * 4 + 0] = bitinterleave_even(state0[i]); + state_4xl[i * 4 + 1] = bitinterleave_even(state1[i]); + state_4xl[i * 4 + 2] = bitinterleave_even(state2[i]); + state_4xl[i * 4 + 3] = bitinterleave_even(state3[i]); + + state_4xh[i * 4 + 0] = bitinterleave_odd(state0[i]); + state_4xh[i * 4 + 1] = bitinterleave_odd(state1[i]); + state_4xh[i * 4 + 2] = bitinterleave_odd(state2[i]); + state_4xh[i * 4 + 3] = bitinterleave_odd(state3[i]); + } +} + +static void deinterleave_4fold(uint64_t *state_4x, uint64_t *state0, + uint64_t *state1, uint64_t *state2, + uint64_t *state3) +{ + uint32_t *state_4xl = (uint32_t *)state_4x; + uint32_t *state_4xh = (uint32_t *)state_4x + 100; + + for (size_t i = 0; i < 25; i++) + { + state0[i] = bitdeinterleave(state_4xl[i * 4 + 0], state_4xh[i * 4 + 0]); + state1[i] = bitdeinterleave(state_4xl[i * 4 + 1], state_4xh[i * 4 + 1]); + state2[i] = bitdeinterleave(state_4xl[i * 4 + 2], state_4xh[i * 4 + 2]); + state3[i] = bitdeinterleave(state_4xl[i * 4 + 3], state_4xh[i * 4 + 3]); + } +} + +#define mld_keccak_f1600_x4_native_impl \ + MLD_NAMESPACE(keccak_f1600_x4_native_impl) +int mld_keccak_f1600_x4_native_impl(uint64_t *state) +{ + /* + * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations. 
+ * TODO: Replace with optimized MVE assembly implementations + * (as a part of XORBytes and ExtractBytes) + */ + MLD_ALIGN uint64_t state_4x[100]; + MLD_ALIGN uint64_t state_4x_tmp[100]; + + /* Interleave the 4 states into bit-interleaved format */ + interleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]); + + /* Run the permutation */ + mld_keccak_f1600_x4_mve_asm(state_4x, state_4x_tmp, + mld_keccakf1600_round_constants); + + /* Deinterleave back to 4 separate states */ + deinterleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]); + + mld_zeroize(state_4x, sizeof(state_4x)); + mld_zeroize(state_4x_tmp, sizeof(state_4x_tmp)); + return MLD_NATIVE_FUNC_SUCCESS; +} + +#else /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +MLD_EMPTY_CU(keccak_f1600_x4_mve) + +#endif /* !(MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED) \ + */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +/* Some macros are kept because they are also defined in a header. */ +/* Keep: mld_keccak_f1600_x4_native_impl (mve.h) */ diff --git a/dev/fips202/armv81m/src/keccakf1600_round_constants.c b/dev/fips202/armv81m/src/keccakf1600_round_constants.c new file mode 100644 index 000000000..7075ece41 --- /dev/null +++ b/dev/fips202/armv81m/src/keccakf1600_round_constants.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../../common.h" + +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include <stdint.h> + +#include "fips202_native_armv81m.h" + +/* + * Keccak round constants in bit-interleaved form.
+ * Each 64-bit constant is split into two 32-bit words: + * - low word contains even-indexed bits + * - high word contains odd-indexed bits + */ +MLD_ALIGN const uint32_t mld_keccakf1600_round_constants[48] = { + 0x00000001, 0x00000000, /* RC0 */ + 0x00000000, 0x00000089, /* RC1 */ + 0x00000000, 0x8000008b, /* RC2 */ + 0x00000000, 0x80008080, /* RC3 */ + 0x00000001, 0x0000008b, /* RC4 */ + 0x00000001, 0x00008000, /* RC5 */ + 0x00000001, 0x80008088, /* RC6 */ + 0x00000001, 0x80000082, /* RC7 */ + 0x00000000, 0x0000000b, /* RC8 */ + 0x00000000, 0x0000000a, /* RC9 */ + 0x00000001, 0x00008082, /* RC10 */ + 0x00000000, 0x00008003, /* RC11 */ + 0x00000001, 0x0000808b, /* RC12 */ + 0x00000001, 0x8000000b, /* RC13 */ + 0x00000001, 0x8000008a, /* RC14 */ + 0x00000001, 0x80000081, /* RC15 */ + 0x00000000, 0x80000081, /* RC16 */ + 0x00000000, 0x80000008, /* RC17 */ + 0x00000000, 0x00000083, /* RC18 */ + 0x00000000, 0x80008003, /* RC19 */ + 0x00000001, 0x80008088, /* RC20 */ + 0x00000000, 0x80000088, /* RC21 */ + 0x00000001, 0x00008000, /* RC22 */ + 0x00000000, 0x80008082, /* RC23 */ +}; + +#else /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +MLD_EMPTY_CU(fips202_armv81m_round_constants) + +#endif /* !(MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED) \ + */ diff --git a/flake.nix b/flake.nix index d4874acef..871301020 100644 --- a/flake.nix +++ b/flake.nix @@ -77,6 +77,7 @@ packages.toolchain_riscv32 = util.toolchain_riscv32; packages.toolchain_ppc64le = util.toolchain_ppc64le; packages.toolchain_aarch64_be = util.toolchain_aarch64_be; + packages.gcc-arm-embedded = pkgs.gcc-arm-embedded; devShells.default = util.mkShell { packages = builtins.attrValues @@ -89,108 +90,115 @@ } ++ pkgs.lib.optionals (!pkgs.stdenv.isDarwin) [ config.packages.valgrind_varlat ]; }; - # arm-none-eabi-gcc + platform files from pqmx - packages.m55-an547 = util.m55-an547; - #packages.avr-toolchain = util.avr-toolchain; # TODO The AVR shell is currently 
unavaliable for mldsa-native - devShells.arm-embedded = util.mkShell { - packages = builtins.attrValues - { - inherit (config.packages) m55-an547; - inherit (pkgs) gcc-arm-embedded qemu coreutils python3 git; - }; - }; - - devShells.avr = util.mkShell (import ./nix/avr { inherit pkgs; }); packages.hol_server = util.hol_server.hol_server_start; devShells.hol_light = (util.mkShell { packages = builtins.attrValues { inherit (config.packages) linters hol_light s2n_bignum hol_server; }; }).overrideAttrs (old: { shellHook = holLightShellHook; }); devShells.hol_light-cross = (util.mkShell { - packages = builtins.attrValues { inherit (config.packages) linters toolchains hol_light s2n_bignum hol_server; }; + packages = builtins.attrValues { inherit (config.packages) linters toolchains hol_light s2n_bignum gcc-arm-embedded hol_server; }; }).overrideAttrs (old: { shellHook = holLightShellHook; }); devShells.hol_light-cross-aarch64 = (util.mkShell { - packages = builtins.attrValues { inherit (config.packages) linters toolchain_aarch64 hol_light s2n_bignum hol_server; }; + packages = builtins.attrValues { inherit (config.packages) linters toolchain_aarch64 hol_light s2n_bignum gcc-arm-embedded hol_server; }; }).overrideAttrs (old: { shellHook = holLightShellHook; }); devShells.hol_light-cross-x86_64 = (util.mkShell { - packages = builtins.attrValues { inherit (config.packages) linters toolchain_x86_64 hol_light s2n_bignum hol_server; }; + packages = builtins.attrValues { inherit (config.packages) linters toolchain_x86_64 hol_light s2n_bignum gcc-arm-embedded hol_server; }; }).overrideAttrs (old: { shellHook = holLightShellHook; }); devShells.ci = util.mkShell { packages = builtins.attrValues { inherit (config.packages) linters toolchains_native; }; }; - devShells.ci-bench = util.mkShell { + devShells.bench = util.mkShell { packages = builtins.attrValues { inherit (config.packages) toolchains_native; }; }; - devShells.ci-cbmc = util.mkShell { + devShells.cbmc = util.mkShell { 
packages = builtins.attrValues { inherit (config.packages) cbmc toolchains_native; } ++ [ pkgs.gh ]; }; - devShells.ci-slothy = util.mkShell { + devShells.slothy = util.mkShell { packages = builtins.attrValues { inherit (config.packages) slothy linters toolchains_native; }; }; - devShells.ci-cross = util.mkShell { + devShells.cross = util.mkShell { packages = builtins.attrValues { inherit (config.packages) linters toolchains; }; }; - devShells.ci-cross-x86_64 = util.mkShell { + devShells.cross-x86_64 = util.mkShell { packages = builtins.attrValues { inherit (config.packages) linters toolchain_x86_64; }; }; - devShells.ci-cross-aarch64 = util.mkShell { + devShells.cross-aarch64 = util.mkShell { packages = builtins.attrValues { inherit (config.packages) linters toolchain_aarch64; }; }; - devShells.ci-cross-riscv64 = util.mkShell { + devShells.cross-riscv64 = util.mkShell { packages = builtins.attrValues { inherit (config.packages) linters toolchain_riscv64; }; }; - devShells.ci-cross-riscv32 = util.mkShell { + devShells.cross-riscv32 = util.mkShell { packages = builtins.attrValues { inherit (config.packages) linters toolchain_riscv32; }; }; - devShells.ci-cross-ppc64le = util.mkShell { + devShells.cross-ppc64le = util.mkShell { packages = builtins.attrValues { inherit (config.packages) linters toolchain_ppc64le; }; }; - devShells.ci-cross-aarch64_be = util.mkShell { + devShells.cross-aarch64_be = util.mkShell { packages = builtins.attrValues { inherit (config.packages) linters toolchain_aarch64_be; }; }; - devShells.ci-linter = util.mkShellNoCC { - packages = builtins.attrValues { inherit (config.packages) linters; }; + + # autogen shell with cross compiler for the "other" architecture + devShells.cross-autogen = util.mkShell { + packages = builtins.attrValues { inherit (config.packages) linters; inherit (pkgs) gcc-arm-embedded; } + ++ pkgs.lib.optionals pkgs.stdenv.hostPlatform.isx86_64 [ config.packages.toolchain_aarch64 ] + ++ pkgs.lib.optionals 
pkgs.stdenv.hostPlatform.isAarch64 [ config.packages.toolchain_x86_64 ]; }; - devShells.ci_clang14 = util.mkShellWithCC' pkgs.clang_14; - devShells.ci_clang15 = util.mkShellWithCC' pkgs.clang_15; - devShells.ci_clang16 = util.mkShellWithCC' pkgs.clang_16; - devShells.ci_clang17 = util.mkShellWithCC' pkgs.clang_17; - devShells.ci_clang18 = util.mkShellWithCC' pkgs.clang_18; - devShells.ci_clang19 = util.mkShellWithCC' pkgs.clang_19; - devShells.ci_clang20 = util.mkShellWithCC' pkgs.clang_20; - devShells.ci_clang21 = util.mkShellWithCC' pkgs.clang_21; - devShells.ci_zig0_12 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_12); - devShells.ci_zig0_13 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_13); - devShells.ci_zig0_14 = util.mkShellWithCC' (zigWrapCC pkgs.zig); - devShells.ci_zig0_15 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_15); + # arm-none-eabi-gcc + platform files from pqmx + devShells.cross-arm-embedded = util.mkShell { + packages = builtins.attrValues + { + inherit (util) m55-an547; + inherit (config.packages) linters; + inherit (pkgs) gcc-arm-embedded qemu coreutils python3 git; + }; + }; + # TODO: AVR shell not yet available for mldsa-native + # devShells.cross-avr = util.mkShell (import ./nix/avr { inherit pkgs; }); + + devShells.linter = util.mkShellNoCC { + packages = builtins.attrValues { inherit (config.packages) linters; }; + }; + devShells.clang14 = util.mkShellWithCC' pkgs.clang_14; + devShells.clang15 = util.mkShellWithCC' pkgs.clang_15; + devShells.clang16 = util.mkShellWithCC' pkgs.clang_16; + devShells.clang17 = util.mkShellWithCC' pkgs.clang_17; + devShells.clang18 = util.mkShellWithCC' pkgs.clang_18; + devShells.clang19 = util.mkShellWithCC' pkgs.clang_19; + devShells.clang20 = util.mkShellWithCC' pkgs.clang_20; + devShells.clang21 = util.mkShellWithCC' pkgs.clang_21; + devShells.zig0_12 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_12); + devShells.zig0_13 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_13); + devShells.zig0_14 = util.mkShellWithCC' 
(zigWrapCC pkgs.zig); + devShells.zig0_15 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_15); - devShells.ci_gcc48 = util.mkShellWithCC' pkgs.gcc48; - devShells.ci_gcc49 = util.mkShellWithCC' pkgs.gcc49; - devShells.ci_gcc7 = util.mkShellWithCC' pkgs.gcc7; - devShells.ci_gcc11 = util.mkShellWithCC' pkgs.gcc11; - devShells.ci_gcc12 = util.mkShellWithCC' pkgs.gcc12; - devShells.ci_gcc13 = util.mkShellWithCC' pkgs.gcc13; - devShells.ci_gcc14 = util.mkShellWithCC' pkgs.gcc14; - devShells.ci_gcc15 = util.mkShellWithCC' pkgs.gcc15; + devShells.gcc48 = util.mkShellWithCC' pkgs.gcc48; + devShells.gcc49 = util.mkShellWithCC' pkgs.gcc49; + devShells.gcc7 = util.mkShellWithCC' pkgs.gcc7; + devShells.gcc11 = util.mkShellWithCC' pkgs.gcc11; + devShells.gcc12 = util.mkShellWithCC' pkgs.gcc12; + devShells.gcc13 = util.mkShellWithCC' pkgs.gcc13; + devShells.gcc14 = util.mkShellWithCC' pkgs.gcc14; + devShells.gcc15 = util.mkShellWithCC' pkgs.gcc15; # valgrind with a patch for detecting variable-latency instructions - devShells.ci_valgrind-varlat_clang14 = util.mkShellWithCC_valgrind' pkgs.clang_14; - devShells.ci_valgrind-varlat_clang15 = util.mkShellWithCC_valgrind' pkgs.clang_15; - devShells.ci_valgrind-varlat_clang16 = util.mkShellWithCC_valgrind' pkgs.clang_16; - devShells.ci_valgrind-varlat_clang17 = util.mkShellWithCC_valgrind' pkgs.clang_17; - devShells.ci_valgrind-varlat_clang18 = util.mkShellWithCC_valgrind' pkgs.clang_18; - devShells.ci_valgrind-varlat_clang19 = util.mkShellWithCC_valgrind' pkgs.clang_19; - devShells.ci_valgrind-varlat_clang20 = util.mkShellWithCC_valgrind' pkgs.clang_20; - devShells.ci_valgrind-varlat_clang21 = util.mkShellWithCC_valgrind' pkgs.clang_21; - devShells.ci_valgrind-varlat_gcc48 = util.mkShellWithCC_valgrind' pkgs.gcc48; - devShells.ci_valgrind-varlat_gcc49 = util.mkShellWithCC_valgrind' pkgs.gcc49; - devShells.ci_valgrind-varlat_gcc7 = util.mkShellWithCC_valgrind' pkgs.gcc7; - devShells.ci_valgrind-varlat_gcc11 = util.mkShellWithCC_valgrind' 
pkgs.gcc11; - devShells.ci_valgrind-varlat_gcc12 = util.mkShellWithCC_valgrind' pkgs.gcc12; - devShells.ci_valgrind-varlat_gcc13 = util.mkShellWithCC_valgrind' pkgs.gcc13; - devShells.ci_valgrind-varlat_gcc14 = util.mkShellWithCC_valgrind' pkgs.gcc14; - devShells.ci_valgrind-varlat_gcc15 = util.mkShellWithCC_valgrind' pkgs.gcc15; + devShells.valgrind-varlat_clang14 = util.mkShellWithCC_valgrind' pkgs.clang_14; + devShells.valgrind-varlat_clang15 = util.mkShellWithCC_valgrind' pkgs.clang_15; + devShells.valgrind-varlat_clang16 = util.mkShellWithCC_valgrind' pkgs.clang_16; + devShells.valgrind-varlat_clang17 = util.mkShellWithCC_valgrind' pkgs.clang_17; + devShells.valgrind-varlat_clang18 = util.mkShellWithCC_valgrind' pkgs.clang_18; + devShells.valgrind-varlat_clang19 = util.mkShellWithCC_valgrind' pkgs.clang_19; + devShells.valgrind-varlat_clang20 = util.mkShellWithCC_valgrind' pkgs.clang_20; + devShells.valgrind-varlat_clang21 = util.mkShellWithCC_valgrind' pkgs.clang_21; + devShells.valgrind-varlat_gcc48 = util.mkShellWithCC_valgrind' pkgs.gcc48; + devShells.valgrind-varlat_gcc49 = util.mkShellWithCC_valgrind' pkgs.gcc49; + devShells.valgrind-varlat_gcc7 = util.mkShellWithCC_valgrind' pkgs.gcc7; + devShells.valgrind-varlat_gcc11 = util.mkShellWithCC_valgrind' pkgs.gcc11; + devShells.valgrind-varlat_gcc12 = util.mkShellWithCC_valgrind' pkgs.gcc12; + devShells.valgrind-varlat_gcc13 = util.mkShellWithCC_valgrind' pkgs.gcc13; + devShells.valgrind-varlat_gcc14 = util.mkShellWithCC_valgrind' pkgs.gcc14; + devShells.valgrind-varlat_gcc15 = util.mkShellWithCC_valgrind' pkgs.gcc15; }; flake = { devShell.x86_64-linux = diff --git a/mldsa/mldsa_native.S b/mldsa/mldsa_native.S index 2eb417795..d437c4d71 100644 --- a/mldsa/mldsa_native.S +++ b/mldsa/mldsa_native.S @@ -99,6 +99,9 @@ #endif /* MLD_SYS_AARCH64 */ #if defined(MLD_SYS_X86_64) #endif +#if defined(MLD_SYS_ARMV81M_MVE) +#include "src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S" +#endif #endif /* 
MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */ @@ -460,6 +463,7 @@ #undef MLD_SYS_AARCH64 #undef MLD_SYS_AARCH64_EB #undef MLD_SYS_APPLE +#undef MLD_SYS_ARMV81M_MVE #undef MLD_SYS_BIG_ENDIAN #undef MLD_SYS_H #undef MLD_SYS_LINUX @@ -575,6 +579,21 @@ #undef MLD_FIPS202_X86_64_XKCP #undef MLD_USE_FIPS202_X4_NATIVE #endif /* MLD_SYS_X86_64 */ +#if defined(MLD_SYS_ARMV81M_MVE) +/* + * Undefine macros from native code (FIPS202, Armv8.1-M) + */ +/* mldsa/src/fips202/native/armv81m/mve.h */ +#undef MLD_FIPS202_ARMV81M_NEED_X4 +#undef MLD_FIPS202_NATIVE_ARMV81M +#undef MLD_FIPS202_NATIVE_ARMV81M_MVE_H +#undef MLD_USE_FIPS202_X4_NATIVE +#undef mld_keccak_f1600_x4_native_impl +/* mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h */ +#undef MLD_FIPS202_NATIVE_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H +#undef mld_keccak_f1600_x4_mve_asm +#undef mld_keccakf1600_round_constants +#endif /* MLD_SYS_ARMV81M_MVE */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */ #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH) /* mldsa/src/native/api.h */ diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c index 9668cec16..04fad5643 100644 --- a/mldsa/mldsa_native.c +++ b/mldsa/mldsa_native.c @@ -104,6 +104,10 @@ #if defined(MLD_SYS_X86_64) #include "src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c" #endif +#if defined(MLD_SYS_ARMV81M_MVE) +#include "src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c" +#include "src/fips202/native/armv81m/src/keccakf1600_round_constants.c" +#endif #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */ /* Macro #undef's @@ -456,6 +460,7 @@ #undef MLD_SYS_AARCH64 #undef MLD_SYS_AARCH64_EB #undef MLD_SYS_APPLE +#undef MLD_SYS_ARMV81M_MVE #undef MLD_SYS_BIG_ENDIAN #undef MLD_SYS_H #undef MLD_SYS_LINUX @@ -571,6 +576,21 @@ #undef MLD_FIPS202_X86_64_XKCP #undef MLD_USE_FIPS202_X4_NATIVE #endif /* MLD_SYS_X86_64 */ +#if defined(MLD_SYS_ARMV81M_MVE) +/* + * Undefine macros from native code (FIPS202, Armv8.1-M) + */ +/* 
mldsa/src/fips202/native/armv81m/mve.h */ +#undef MLD_FIPS202_ARMV81M_NEED_X4 +#undef MLD_FIPS202_NATIVE_ARMV81M +#undef MLD_FIPS202_NATIVE_ARMV81M_MVE_H +#undef MLD_USE_FIPS202_X4_NATIVE +#undef mld_keccak_f1600_x4_native_impl +/* mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h */ +#undef MLD_FIPS202_NATIVE_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H +#undef mld_keccak_f1600_x4_mve_asm +#undef mld_keccakf1600_round_constants +#endif /* MLD_SYS_ARMV81M_MVE */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */ #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH) /* mldsa/src/native/api.h */ diff --git a/mldsa/src/fips202/native/armv81m/README.md b/mldsa/src/fips202/native/armv81m/README.md new file mode 100644 index 000000000..9c2c0d26e --- /dev/null +++ b/mldsa/src/fips202/native/armv81m/README.md @@ -0,0 +1,10 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# FIPS202 backend for Armv8.1-M + MVE + +This directory contains the source code for a FIPS202 backend targeting +the Armv8.1-M + MVE/Helium architecture. It is automatically derived from +the respective development source in [dev/fips202/armv81m](../../../../../dev/fips202/armv81m). + +**Warning:** This backend is still in active development and has not yet undergone +the same level of review as the rest of the code. Use at your own risk! 
diff --git a/mldsa/src/fips202/native/armv81m/mve.h b/mldsa/src/fips202/native/armv81m/mve.h new file mode 100644 index 000000000..6653007d5 --- /dev/null +++ b/mldsa/src/fips202/native/armv81m/mve.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_FIPS202_NATIVE_ARMV81M_MVE_H +#define MLD_FIPS202_NATIVE_ARMV81M_MVE_H + +#define MLD_FIPS202_NATIVE_ARMV81M + +/* Part of backend API */ +#define MLD_USE_FIPS202_X4_NATIVE +/* Guard for assembly file */ +#define MLD_FIPS202_ARMV81M_NEED_X4 + +#if !defined(__ASSEMBLER__) +#include <stdint.h> +#include "../api.h" + +#define mld_keccak_f1600_x4_native_impl \ + MLD_NAMESPACE(keccak_f1600_x4_native_impl) +int mld_keccak_f1600_x4_native_impl(uint64_t *state); + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state) +{ + return mld_keccak_f1600_x4_native_impl(state); +} + +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLD_FIPS202_NATIVE_ARMV81M_MVE_H */ diff --git a/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h b/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h new file mode 100644 index 000000000..14a9d5a81 --- /dev/null +++ b/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#ifndef MLD_FIPS202_NATIVE_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H +#define MLD_FIPS202_NATIVE_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H + +#include <stdint.h> +#include "../../../../common.h" + +/* Keccak round constants in bit-interleaved form */ +#define mld_keccakf1600_round_constants \ + MLD_NAMESPACE(keccakf1600_round_constants) +extern const uint32_t mld_keccakf1600_round_constants[48]; + +#define mld_keccak_f1600_x4_mve_asm MLD_NAMESPACE(keccak_f1600_x4_mve_asm) +void 
mld_keccak_f1600_x4_mve_asm(uint64_t state[100], uint64_t tmpstate[100], + const uint32_t rc[48]); + +#endif /* !MLD_FIPS202_NATIVE_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H */ diff --git a/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S new file mode 100644 index 000000000..0166b9f7e --- /dev/null +++ b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S @@ -0,0 +1,636 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2025 Arm Limited + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/*yaml + Name: keccak_f1600_x4_mve_asm + Description: Armv8.1-M MVE implementation of 4-way parallel Keccak-f[1600] permutation using bit-interleaved state + Signature: void mld_keccak_f1600_x4_mve_asm(void *state, void *tmpstate, const uint32_t *rc) + ABI: + r0: + type: buffer + size_bytes: 800 + permissions: read/write + c_parameter: void *state + description: Four bit-interleaved Keccak states (low halves followed by high halves) + r1: + type: buffer + size_bytes: 800 + permissions: read/write + c_parameter: void *tmpstate + description: Temporary storage for intermediate state + r2: + type: buffer + size_bytes: 192 + permissions: read + c_parameter: const uint32_t *rc + description: Keccak round constants in bit-interleaved form (24 pairs of 32-bit words) + Stack: + bytes: 236 + description: register preservation (44) + SIMD registers (64) + temporary storage (128) +*/ + +#include "../../../../common.h" +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/fips202/armv81m/src/keccak_f1600_x4_mve.S using scripts/simpasm. Do not modify it directly. 
+ */ + +.thumb +.syntax unified + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm) +MLD_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm) + + push.w {r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + vpush {d8, d9, d10, d11, d12, d13, d14, d15} + sub sp, #0x80 + mov r6, r2 + mov.w lr, #0x18 + mov r2, r0 + mov r4, r1 + add.w r3, r2, #0x190 + vldrw.u32 q0, [r3] + vldrw.u32 q1, [r2] + vldrw.u32 q2, [r2, #32] + wls lr, lr, keccak_f1600_x4_mve_asm_roundend @ imm = #0x8c0 + +keccak_f1600_x4_mve_asm_roundstart: + vldrw.u32 q6, [r2, #112] + veor q7, q6, q2 + vldrw.u32 q2, [r2, #80] + veor q1, q2, q1 + add.w r5, r2, #0x190 + vldrw.u32 q5, [r5, #80] + veor q4, q5, q0 + vldrw.u32 q0, [r2, #192] + veor q3, q7, q0 + vldrw.u32 q0, [r2, #160] + veor q1, q1, q0 + vldrw.u32 q0, [r5, #160] + veor q0, q4, q0 + vldrw.u32 q6, [r2, #272] + veor q2, q3, q6 + vldrw.u32 q7, [r2, #240] + veor q5, q1, q7 + vldrw.u32 q4, [r5, #240] + veor q4, q0, q4 + vldrw.u32 q6, [r2, #352] + veor q3, q2, q6 + vldrw.u32 q0, [r2, #320] + veor q2, q5, q0 + vldrw.u32 q1, [r5, #320] + veor q5, q4, q1 + vldrw.u32 q4, [r5, #32] + veor q0, q3, q5 + vldrw.u32 q1, [r5, #16] + veor q6, q1, q0 + vstrw.32 q5, [sp] + vshr.u32 q7, q6, #0x1f + add.w r10, r4, #0x190 + vsli.32 q7, q6, #0x1 + vldrw.u32 q6, [r5, #112] + veor q6, q4, q6 + vldrw.u32 q4, [r5, #192] + veor q4, q6, q4 + vldrw.u32 q6, [r5, #272] + veor q4, q4, q6 + vldrw.u32 q6, [r5, #352] + veor q5, q4, q6 + vstrw.32 q7, [r4, #160] + vshr.u32 q4, q5, #0x1f + vsli.32 q4, q5, #0x1 + vldrw.u32 q6, [r2, #16] + veor q7, q4, q2 + veor q1, q6, q7 + vldrw.u32 q6, [r5, #96] + veor q6, q6, q0 + vstrw.32 q1, [r10, #160] + vshr.u32 q1, q6, #0xa + vsli.32 q1, q6, #0x16 + vldrw.u32 q6, [r2, #96] + veor q4, q6, q7 + vstrw.32 q1, [r10, #16] + vshr.u32 q6, q4, #0xa + vsli.32 q6, q4, #0x16 + vldrw.u32 q1, [r5, #336] + veor q4, q1, q0 + vldrw.u32 q1, [r2, #176] + veor q1, q1, q7 + vstrw.32 q6, [r4, #16] + vshr.u32 q6, q1, #0x1b + vsli.32 q6, q1, #0x5 + vldrw.u32 q1, [r2, 
#256] + veor q1, q1, q7 + vstrw.32 q6, [r4, #272] + vshr.u32 q6, q1, #0xa + vsli.32 q6, q1, #0x16 + vldrw.u32 q1, [r2, #336] + veor q1, q1, q7 + vstrw.32 q6, [r10, #128] + vshr.u32 q6, q1, #0x1f + vsli.32 q6, q1, #0x1 + vldrw.u32 q7, [r5, #176] + veor q7, q7, q0 + vstrw.32 q6, [r4, #384] + vshr.u32 q1, q7, #0x1b + vsli.32 q1, q7, #0x5 + vldrw.u32 q6, [r5, #256] + veor q0, q6, q0 + vstrw.32 q1, [r10, #272] + vshr.u32 q1, q4, #0x1f + vldrw.u32 q7, [r5, #64] + vsli.32 q1, q4, #0x1 + vldrw.u32 q4, [r5, #144] + vshr.u32 q6, q0, #0x9 + vstrw.32 q1, [r10, #384] + vsli.32 q6, q0, #0x17 + veor q7, q7, q4 + vldrw.u32 q1, [r5, #224] + veor q4, q7, q1 + vldrw.u32 q7, [r5, #304] + veor q1, q4, q7 + vldrw.u32 q0, [r5, #384] + veor q7, q1, q0 + vstrw.32 q6, [r4, #128] + vshr.u32 q1, q7, #0x1f + vsli.32 q1, q7, #0x1 + vldrw.u32 q6, [r2, #144] + veor q0, q1, q3 + vldrw.u32 q3, [r2, #64] + veor q1, q3, q6 + vldrw.u32 q6, [r2, #224] + veor q1, q1, q6 + vldrw.u32 q3, [r2, #304] + veor q6, q1, q3 + vldrw.u32 q4, [r2, #384] + veor q3, q6, q4 + vldrw.u32 q4, [r2, #48] + veor q5, q3, q5 + vldrw.u32 q1, [r5, #48] + veor q1, q1, q5 + vshr.u32 q6, q1, #0x12 + vsli.32 q6, q1, #0xe + vldrw.u32 q1, [r2, #128] + veor q1, q1, q0 + vstrw.32 q6, [r10, #80] + vshr.u32 q6, q1, #0x5 + vsli.32 q6, q1, #0x1b + vldrw.u32 q1, [r5, #128] + veor q1, q1, q5 + vstrw.32 q6, [r10, #336] + vshr.u32 q6, q1, #0x4 + vsli.32 q6, q1, #0x1c + veor q1, q4, q0 + vstrw.32 q6, [r4, #336] + vshr.u32 q4, q1, #0x12 + vsli.32 q4, q1, #0xe + vldrw.u32 q6, [r2, #208] + veor q6, q6, q0 + vstrw.32 q4, [r4, #80] + vshr.u32 q1, q6, #0x14 + vsli.32 q1, q6, #0xc + vldrw.u32 q4, [r2, #288] + veor q4, q4, q0 + vldrw.u32 q6, [r2, #368] + veor q0, q6, q0 + vshr.u32 q6, q0, #0x4 + vstrw.32 q1, [r10, #192] + vsli.32 q6, q0, #0x1c + vshr.u32 q0, q4, #0x16 + vldrw.u32 q1, [r5, #368] + vsli.32 q0, q4, #0xa + vstrw.32 q6, [r4, #304] + veor q4, q1, q5 + vstrw.32 q0, [r10, #48] + vshr.u32 q1, q4, #0x4 + vsli.32 q1, q4, #0x1c + vldrw.u32 q6, [r5, 
#208] + veor q6, q6, q5 + vldrw.u32 q0, [r5, #288] + veor q5, q0, q5 + vstrw.32 q1, [r10, #304] + vshr.u32 q0, q6, #0x13 + vsli.32 q0, q6, #0xd + vldrw.u32 q1, [r5, #96] + vshr.u32 q6, q5, #0x15 + vldrw.u32 q4, [r5, #16] + vsli.32 q6, q5, #0xb + vldrw.u32 q5, [r5, #176] + veor q1, q4, q1 + vldrw.u32 q4, [r5, #256] + veor q5, q1, q5 + vldrw.u32 q1, [r5, #336] + veor q5, q5, q4 + vstrw.32 q0, [r4, #192] + veor q0, q5, q1 + vstrw.32 q6, [r4, #48] + vshr.u32 q5, q0, #0x1f + vsli.32 q5, q0, #0x1 + vldrw.u32 q4, [r2, #16] + veor q3, q5, q3 + vldrw.u32 q6, [r2, #96] + veor q4, q4, q6 + vldrw.u32 q1, [r2, #176] + veor q5, q4, q1 + vldrw.u32 q6, [r2, #256] + veor q6, q5, q6 + vldrw.u32 q4, [r2, #336] + veor q5, q6, q4 + vldrw.u32 q1, [r5] + veor q7, q5, q7 + vldrw.u32 q4, [r2] + veor q1, q1, q7 + veor q4, q4, q3 + vshr.u32 q6, q1, #0x20 + vsli.32 q6, q1, #0x0 + vldrw.u32 q1, [r2, #80] + veor q1, q1, q3 + vstrw.32 q6, [r10] + vshr.u32 q6, q4, #0x20 + vsli.32 q6, q4, #0x0 + vldrw.u32 q4, [r5, #80] + veor q4, q4, q7 + vstrw.32 q6, [r4] + vshr.u32 q6, q1, #0xe + vsli.32 q6, q1, #0x12 + vldrw.u32 q1, [r2, #160] + veor q1, q1, q3 + vstrw.32 q6, [r4, #256] + vshr.u32 q6, q4, #0xe + vsli.32 q6, q4, #0x12 + vldrw.u32 q4, [r2, #240] + veor q4, q4, q3 + vstrw.32 q6, [r10, #256] + vshr.u32 q6, q1, #0x1f + vsli.32 q6, q1, #0x1 + vldrw.u32 q1, [r2, #320] + veor q1, q1, q3 + vstrw.32 q6, [r10, #112] + vshr.u32 q6, q4, #0xc + vsli.32 q6, q4, #0x14 + vldrw.u32 q3, [r5, #240] + veor q3, q3, q7 + vstrw.32 q6, [r10, #368] + vshr.u32 q4, q3, #0xb + vsli.32 q4, q3, #0x15 + vldrw.u32 q3, [r5, #160] + veor q6, q3, q7 + vstrw.32 q4, [r4, #368] + vshr.u32 q3, q6, #0x1e + vsli.32 q3, q6, #0x2 + vldrw.u32 q6, [r5, #320] + veor q7, q6, q7 + vldrw.u32 q4, [r2, #368] + vshr.u32 q6, q1, #0x17 + vstrw.32 q3, [r4, #112] + vsli.32 q6, q1, #0x9 + vshr.u32 q1, q7, #0x17 + vldrw.u32 q3, [r2, #48] + vsli.32 q1, q7, #0x9 + vldrw.u32 q7, [r2, #128] + veor q3, q3, q7 + vldrw.u32 q7, [r2, #208] + veor q7, q3, q7 + 
vldrw.u32 q3, [r2, #288] + veor q3, q7, q3 + vldrw.u32 q7, [r5, #128] + veor q3, q3, q4 + vldrw.u32 q4, [r5, #48] + veor q0, q3, q0 + veor q4, q4, q7 + vldrw.u32 q7, [r5, #208] + veor q4, q4, q7 + vldrw.u32 q7, [r5, #288] + veor q4, q4, q7 + vldrw.u32 q7, [r5, #368] + veor q7, q4, q7 + vstrw.32 q6, [r4, #224] + vshr.u32 q4, q7, #0x1f + vstrw.32 q1, [r10, #224] + vsli.32 q4, q7, #0x1 + veor q5, q4, q5 + vldrw.u32 q6, [r2, #192] + veor q1, q6, q5 + vldrw.u32 q4, [r5, #112] + veor q7, q2, q7 + vldrw.u32 q6, [r5, #32] + vshr.u32 q2, q1, #0xb + vsli.32 q2, q1, #0x15 + veor q1, q6, q0 + vstrw.32 q2, [r10, #32] + vshr.u32 q6, q1, #0x1 + vsli.32 q6, q1, #0x1f + vldrw.u32 q2, [r2, #112] + veor q2, q2, q5 + vstrw.32 q6, [r10, #320] + vshr.u32 q1, q2, #0x1d + vsli.32 q1, q2, #0x3 + vldrw.u32 q6, [r2, #32] + veor q4, q4, q0 + vstrw.32 q1, [r4, #176] + veor q2, q6, q5 + vshr.u32 q6, q2, #0x1 + vldrw.u32 q1, [r5, #352] + vsli.32 q6, q2, #0x1f + veor q1, q1, q0 + vstrw.32 q6, [r4, #320] + vshr.u32 q6, q1, #0x1 + vsli.32 q6, q1, #0x1f + vldrw.u32 q2, [r5, #192] + vshr.u32 q1, q4, #0x1d + vstrw.32 q6, [r4, #144] + vsli.32 q1, q4, #0x3 + veor q2, q2, q0 + vldrw.u32 q6, [r5, #272] + veor q0, q6, q0 + vldrw.u32 q4, [r2, #352] + veor q6, q4, q5 + vldrw.u32 q4, [r2, #272] + veor q4, q4, q5 + vstrw.32 q1, [r10, #176] + vshr.u32 q1, q2, #0xa + vsli.32 q1, q2, #0x16 + vldrw.u32 q5, [sp] + vshr.u32 q2, q0, #0x18 + vstrw.32 q1, [r4, #32] + vsli.32 q2, q0, #0x8 + vshr.u32 q1, q6, #0x2 + vstrw.32 q2, [r4, #288] + vsli.32 q1, q6, #0x1e + vshr.u32 q6, q4, #0x19 + vstrw.32 q1, [r10, #144] + vsli.32 q6, q4, #0x7 + vshr.u32 q0, q5, #0x1f + vstrw.32 q6, [r10, #288] + vsli.32 q0, q5, #0x1 + veor q5, q0, q3 + vldrw.u32 q6, [r2, #64] + veor q3, q6, q5 + vldrw.u32 q1, [r5, #64] + vshr.u32 q4, q3, #0x13 + vldrw.u32 q2, [r2, #384] + vsli.32 q4, q3, #0xd + vldrw.u32 q0, [r5, #224] + veor q6, q1, q7 + vstrw.32 q4, [r10, #240] + veor q2, q2, q5 + veor q3, q0, q7 + vldrw.u32 q0, [r2, #224] + vshr.u32 q4, q6, 
#0x12 + vldrw.u32 q1, [r5, #384] + vsli.32 q4, q6, #0xe + vshr.u32 q6, q2, #0x19 + vstrw.32 q4, [r4, #240] + vsli.32 q6, q2, #0x7 + vshr.u32 q2, q3, #0xc + vstrw.32 q6, [r4, #64] + vsli.32 q2, q3, #0x14 + veor q0, q0, q5 + vldrw.u32 q6, [r2, #144] + veor q4, q1, q7 + veor q6, q6, q5 + vstrw.32 q2, [r4, #352] + vshr.u32 q2, q4, #0x19 + vsli.32 q2, q4, #0x7 + vldrw.u32 q1, [r2, #304] + veor q5, q1, q5 + vldrw.u32 q1, [r5, #144] + veor q4, q1, q7 + vldrw.u32 q3, [r5, #304] + veor q1, q3, q7 + vstrw.32 q2, [r10, #64] + vshr.u32 q3, q0, #0xd + vsli.32 q3, q0, #0x13 + vldrw.u32 q7, [r4, #80] + vshr.u32 q0, q6, #0x16 + vstrw.32 q3, [r10, #352] + vsli.32 q0, q6, #0xa + vshr.u32 q2, q5, #0x1c + vsli.32 q2, q5, #0x4 + vldrw.u32 q5, [r4, #112] + vshr.u32 q3, q1, #0x1c + vsli.32 q3, q1, #0x4 + vldrw.u32 q1, [r4, #128] + vbic q6, q5, q0 + vstrw.32 q3, [r10, #208] + vbic q3, q1, q5 + veor q3, q0, q3 + vstrw.32 q3, [r2, #96] + vbic q3, q0, q7 + veor q0, q7, q6 + vldrw.u32 q6, [r4, #144] + vbic q7, q7, q6 + vstrw.32 q0, [r2, #80] + veor q3, q6, q3 + vstrw.32 q3, [r2, #144] + veor q0, q1, q7 + vstrw.32 q0, [r2, #128] + vbic q1, q6, q1 + vshr.u32 q6, q4, #0x16 + vldrw.u32 q3, [r10, #112] + vsli.32 q6, q4, #0xa + vldrw.u32 q4, [r10, #80] + veor q1, q5, q1 + vldrw.u32 q0, [r10, #144] + vbic q7, q4, q0 + vldrw.u32 q5, [r10, #128] + veor q7, q5, q7 + vstrw.32 q1, [r2, #112] + vbic q1, q0, q5 + vstrw.32 q7, [r5, #128] + veor q7, q3, q1 + vstrw.32 q7, [r5, #112] + vbic q7, q5, q3 + vbic q1, q3, q6 + vldrw.u32 q3, [r4, #176] + veor q5, q4, q1 + vbic q4, q6, q4 + vldrw.u32 q1, [r4, #160] + veor q0, q0, q4 + vldrw.u32 q4, [r4, #224] + veor q7, q6, q7 + vstrw.32 q0, [r5, #144] + vbic q0, q1, q4 + vstrw.32 q7, [r5, #96] + veor q0, q2, q0 + vstrw.32 q0, [r2, #208] + vbic q6, q3, q1 + vstrw.32 q5, [r5, #80] + vbic q7, q4, q2 + vldrw.u32 q0, [r10, #160] + veor q6, q4, q6 + vldrw.u32 q5, [r4, #192] + vbic q4, q2, q5 + vldrw.u32 q2, [r10, #224] + veor q4, q3, q4 + vstrw.32 q4, [r2, #176] + vbic q4, 
q5, q3 + vstrw.32 q6, [r2, #224] + veor q4, q1, q4 + vldrw.u32 q1, [r10, #208] + veor q3, q5, q7 + vldrw.u32 q5, [r10, #192] + vbic q6, q1, q5 + vldrw.u32 q7, [r10, #176] + veor q6, q7, q6 + vstrw.32 q3, [r2, #192] + vbic q3, q0, q2 + vstrw.32 q6, [r5, #176] + veor q3, q1, q3 + vstrw.32 q3, [r5, #208] + vbic q3, q5, q7 + vstrw.32 q4, [r2, #160] + veor q3, q0, q3 + vstrw.32 q3, [r5, #160] + vbic q6, q2, q1 + vldrw.u32 q1, [r4, #288] + vbic q7, q7, q0 + vldrw.u32 q3, [r4, #272] + veor q0, q5, q6 + vldrw.u32 q4, [r4, #304] + veor q6, q2, q7 + vldrw.u32 q7, [r4, #256] + vbic q5, q4, q1 + vstrw.32 q0, [r5, #192] + veor q5, q3, q5 + vstrw.32 q6, [r5, #224] + vbic q0, q3, q7 + vstrw.32 q5, [r2, #272] + vbic q6, q1, q3 + veor q5, q7, q6 + vldrw.u32 q3, [r4, #240] + veor q6, q3, q0 + vldrw.u32 q2, [r10, #288] + vbic q0, q3, q4 + vstrw.32 q6, [r2, #240] + vbic q7, q7, q3 + vstrw.32 q5, [r2, #256] + veor q7, q4, q7 + vstrw.32 q7, [r2, #304] + veor q7, q1, q0 + vstrw.32 q7, [r2, #288] + vldrw.u32 q5, [r10, #304] + vbic q7, q5, q2 + vldrw.u32 q3, [r10, #272] + veor q1, q3, q7 + vldrw.u32 q7, [r4, #336] + vbic q4, q2, q3 + vldrw.u32 q6, [r10, #256] + vbic q3, q3, q6 + vldrw.u32 q0, [r10, #240] + veor q3, q0, q3 + vstrw.32 q1, [r5, #272] + vbic q1, q0, q5 + vstrw.32 q3, [r5, #240] + veor q1, q2, q1 + vldrw.u32 q3, [r4, #384] + vbic q2, q6, q0 + vldrw.u32 q0, [r4, #320] + veor q2, q5, q2 + vldrw.u32 q5, [r4, #352] + veor q4, q6, q4 + vstrw.32 q2, [r5, #304] + vbic q2, q7, q0 + vstrw.32 q1, [r5, #288] + veor q1, q3, q2 + vstrw.32 q1, [r2, #384] + vbic q2, q5, q7 + vstrw.32 q4, [r5, #256] + veor q4, q0, q2 + vstrw.32 q4, [r2, #320] + vbic q2, q0, q3 + vldrw.u32 q4, [r4, #368] + vbic q3, q3, q4 + vldrw.u32 q0, [r10, #320] + veor q1, q5, q3 + vldrw.u32 q6, [r10, #336] + vbic q5, q4, q5 + vstrw.32 q1, [r2, #352] + veor q5, q7, q5 + vstrw.32 q5, [r2, #336] + veor q3, q4, q2 + vstrw.32 q3, [r2, #368] + vbic q7, q6, q0 + vldrw.u32 q5, [r10, #352] + vbic q3, q5, q6 + vldrw.u32 q1, [r10, 
#368] + vbic q4, q1, q5 + vldrw.u32 q2, [r4, #16] + veor q6, q6, q4 + vldrw.u32 q4, [r10, #384] + veor q3, q0, q3 + vstrw.32 q3, [r5, #320] + veor q3, q4, q7 + vstrw.32 q3, [r5, #384] + vbic q0, q0, q4 + vstrw.32 q6, [r5, #336] + veor q3, q1, q0 + vstrw.32 q3, [r5, #368] + vbic q7, q4, q1 + veor q5, q5, q7 + vldrw.u32 q6, [r4, #32] + vbic q3, q6, q2 + vldrw.u32 q4, [r4, #48] + vbic q0, q4, q6 + vldrw.u32 q1, [r4] + veor q0, q2, q0 + vldrw.u32 q7, [r4, #64] + veor q3, q1, q3 + vstrw.32 q5, [r5, #352] + vbic q5, q1, q7 + vstrw.32 q0, [r2, #16] + veor q0, q4, q5 + vstrw.32 q0, [r2, #48] + vbic q5, q2, q1 + veor q2, q7, q5 + vldrw.u32 q0, [r10, #16] + vbic q5, q7, q4 + vldrw.u32 q4, [r10] + vbic q1, q0, q4 + vldrw.u32 q7, [r10, #64] + veor q1, q7, q1 + vstrw.32 q2, [r2, #64] + veor q2, q6, q5 + vbic q6, q4, q7 + vldrw.u32 q5, [r10, #48] + veor q6, q5, q6 + ldrd r7, r8, [r6] + vbic q7, q7, q5 + vstrw.32 q1, [r5, #64] + vdup.32 q1, r7 + veor q1, q3, q1 + vldrw.u32 q3, [r10, #32] + veor q7, q3, q7 + add.w r6, r6, #0x8 + vbic q5, q5, q3 + vstrw.32 q6, [r5, #48] + vbic q6, q3, q0 + vstrw.32 q1, [r2] + veor q5, q0, q5 + vstrw.32 q7, [r5, #32] + veor q4, q4, q6 + vstrw.32 q5, [r5, #16] + vdup.32 q6, r8 + vstrw.32 q2, [r2, #32] + veor q0, q4, q6 + vstrw.32 q0, [r5] + +keccak_f1600_x4_mve_asm_roundend_pre: + le lr, keccak_f1600_x4_mve_asm_roundstart @ imm = #-0x8c0 + +keccak_f1600_x4_mve_asm_roundend: + add sp, #0x80 + vpop {d8, d9, d10, d11, d12, d13, d14, d15} + pop.w {r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} + nop + +#endif /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c new file mode 100644 index 000000000..8fd1edb76 --- /dev/null +++ b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors 
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../../common.h" +#include "../../../../ct.h" + +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include <stdint.h> +#include "fips202_native_armv81m.h" + +/* + * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations. + * TODO: Replace with optimized MVE assembly implementations + * (as a part of XORBytes and ExtractBytes) + */ + +/* Extract even-indexed bits from 64-bit value into lower 32 bits */ +static uint32_t bitinterleave_even(uint64_t x) +{ + uint64_t t; + t = x & 0x5555555555555555ULL; + t = (t | (t >> 1)) & 0x3333333333333333ULL; + t = (t | (t >> 2)) & 0x0f0f0f0f0f0f0f0fULL; + t = (t | (t >> 4)) & 0x00ff00ff00ff00ffULL; + t = (t | (t >> 8)) & 0x0000ffff0000ffffULL; + t = (t | (t >> 16)) & 0x00000000ffffffffULL; + return (uint32_t)t; +} + +/* Extract odd-indexed bits from 64-bit value into lower 32 bits */ +static uint32_t bitinterleave_odd(uint64_t x) +{ + return bitinterleave_even(x >> 1); +} + +/* Spread 32-bit value across even bit positions of 64-bit result */ +static uint64_t spread_even(uint32_t x) +{ + uint64_t t = x; + t = (t | (t << 16)) & 0x0000ffff0000ffffULL; + t = (t | (t << 8)) & 0x00ff00ff00ff00ffULL; + t = (t | (t << 4)) & 0x0f0f0f0f0f0f0f0fULL; + t = (t | (t << 2)) & 0x3333333333333333ULL; + t = (t | (t << 1)) & 0x5555555555555555ULL; + return t; +} + +/* Combine even and odd 32-bit halves into interleaved 64-bit value */ +static uint64_t bitdeinterleave(uint32_t even, uint32_t odd) +{ + return spread_even(even) | (spread_even(odd) << 1); +} + +/* + * TEMPORARY: Naive C interleaving functions. + * These will be replaced with optimized MVE assembly implementations. 
+ */ +static void interleave_4fold(uint64_t *state_4x, const uint64_t *state0, + const uint64_t *state1, const uint64_t *state2, + const uint64_t *state3) +{ + uint32_t *state_4xl = (uint32_t *)state_4x; + uint32_t *state_4xh = (uint32_t *)state_4x + 100; + + for (size_t i = 0; i < 25; i++) + { + state_4xl[i * 4 + 0] = bitinterleave_even(state0[i]); + state_4xl[i * 4 + 1] = bitinterleave_even(state1[i]); + state_4xl[i * 4 + 2] = bitinterleave_even(state2[i]); + state_4xl[i * 4 + 3] = bitinterleave_even(state3[i]); + + state_4xh[i * 4 + 0] = bitinterleave_odd(state0[i]); + state_4xh[i * 4 + 1] = bitinterleave_odd(state1[i]); + state_4xh[i * 4 + 2] = bitinterleave_odd(state2[i]); + state_4xh[i * 4 + 3] = bitinterleave_odd(state3[i]); + } +} + +static void deinterleave_4fold(uint64_t *state_4x, uint64_t *state0, + uint64_t *state1, uint64_t *state2, + uint64_t *state3) +{ + uint32_t *state_4xl = (uint32_t *)state_4x; + uint32_t *state_4xh = (uint32_t *)state_4x + 100; + + for (size_t i = 0; i < 25; i++) + { + state0[i] = bitdeinterleave(state_4xl[i * 4 + 0], state_4xh[i * 4 + 0]); + state1[i] = bitdeinterleave(state_4xl[i * 4 + 1], state_4xh[i * 4 + 1]); + state2[i] = bitdeinterleave(state_4xl[i * 4 + 2], state_4xh[i * 4 + 2]); + state3[i] = bitdeinterleave(state_4xl[i * 4 + 3], state_4xh[i * 4 + 3]); + } +} + +#define mld_keccak_f1600_x4_native_impl \ + MLD_NAMESPACE(keccak_f1600_x4_native_impl) +int mld_keccak_f1600_x4_native_impl(uint64_t *state) +{ + /* + * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations. 
+ * TODO: Replace with optimized MVE assembly implementations + * (as a part of XORBytes and ExtractBytes) + */ + MLD_ALIGN uint64_t state_4x[100]; + MLD_ALIGN uint64_t state_4x_tmp[100]; + + /* Interleave the 4 states into bit-interleaved format */ + interleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]); + + /* Run the permutation */ + mld_keccak_f1600_x4_mve_asm(state_4x, state_4x_tmp, + mld_keccakf1600_round_constants); + + /* Deinterleave back to 4 separate states */ + deinterleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]); + + mld_zeroize(state_4x, sizeof(state_4x)); + mld_zeroize(state_4x_tmp, sizeof(state_4x_tmp)); + return MLD_NATIVE_FUNC_SUCCESS; +} + +#else /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +MLD_EMPTY_CU(keccak_f1600_x4_mve) + +#endif /* !(MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED) \ + */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +/* Some macros are kept because they are also defined in a header. */ +/* Keep: mld_keccak_f1600_x4_native_impl (mve.h) */ diff --git a/mldsa/src/fips202/native/armv81m/src/keccakf1600_round_constants.c b/mldsa/src/fips202/native/armv81m/src/keccakf1600_round_constants.c new file mode 100644 index 000000000..7075ece41 --- /dev/null +++ b/mldsa/src/fips202/native/armv81m/src/keccakf1600_round_constants.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../../common.h" + +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include <stdint.h> + +#include "fips202_native_armv81m.h" + +/* + * Keccak round constants in bit-interleaved form. 
+ * Each 64-bit constant is split into two 32-bit words: + * - low word contains even-indexed bits + * - high word contains odd-indexed bits + */ +MLD_ALIGN const uint32_t mld_keccakf1600_round_constants[48] = { + 0x00000001, 0x00000000, /* RC0 */ + 0x00000000, 0x00000089, /* RC1 */ + 0x00000000, 0x8000008b, /* RC2 */ + 0x00000000, 0x80008080, /* RC3 */ + 0x00000001, 0x0000008b, /* RC4 */ + 0x00000001, 0x00008000, /* RC5 */ + 0x00000001, 0x80008088, /* RC6 */ + 0x00000001, 0x80000082, /* RC7 */ + 0x00000000, 0x0000000b, /* RC8 */ + 0x00000000, 0x0000000a, /* RC9 */ + 0x00000001, 0x00008082, /* RC10 */ + 0x00000000, 0x00008003, /* RC11 */ + 0x00000001, 0x0000808b, /* RC12 */ + 0x00000001, 0x8000000b, /* RC13 */ + 0x00000001, 0x8000008a, /* RC14 */ + 0x00000001, 0x80000081, /* RC15 */ + 0x00000000, 0x80000081, /* RC16 */ + 0x00000000, 0x80000008, /* RC17 */ + 0x00000000, 0x00000083, /* RC18 */ + 0x00000000, 0x80008003, /* RC19 */ + 0x00000001, 0x80008088, /* RC20 */ + 0x00000000, 0x80000088, /* RC21 */ + 0x00000001, 0x00008000, /* RC22 */ + 0x00000000, 0x80008082, /* RC23 */ +}; + +#else /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ + +MLD_EMPTY_CU(fips202_armv81m_round_constants) + +#endif /* !(MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED) \ + */ diff --git a/mldsa/src/fips202/native/auto.h b/mldsa/src/fips202/native/auto.h index 0a09c733c..94fbe703f 100644 --- a/mldsa/src/fips202/native/auto.h +++ b/mldsa/src/fips202/native/auto.h @@ -20,4 +20,10 @@ #include "x86_64/xkcp.h" #endif +/* We do not yet include the FIPS202 backend for Armv8.1-M+MVE by default + * as it is still experimental and undergoing review. 
*/ +/* #if defined(MLD_SYS_ARMV81M_MVE) */ +/* #include "armv81m/mve.h" */ +/* #endif */ + #endif /* !MLD_FIPS202_NATIVE_AUTO_H */ diff --git a/mldsa/src/sys.h b/mldsa/src/sys.h index 122652642..d34686b5a 100644 --- a/mldsa/src/sys.h +++ b/mldsa/src/sys.h @@ -44,6 +44,11 @@ #define MLD_SYS_AARCH64_EB #endif +/* Check if we're running on an Armv8.1-M system with MVE */ +#if defined(__ARM_ARCH_8_1M_MAIN__) && defined(__ARM_FEATURE_MVE) +#define MLD_SYS_ARMV81M_MVE +#endif + #if defined(__x86_64__) #define MLD_SYS_X86_64 #if defined(__AVX2__) diff --git a/nix/util.nix b/nix/util.nix index 4236c4f70..cfe9f872a 100644 --- a/nix/util.nix +++ b/nix/util.nix @@ -40,7 +40,7 @@ rec { # and won't just work for now # - equip all toolchains if cross is explicitly set to true # - On some machines, `native-gcc` needed to be evaluated lastly (placed as the last element of the toolchain list), or else would result in environment variables (CC, AR, ...) overriding issue. - pkgs.lib.optionals cross [ pkgs.qemu x86_64-gcc aarch64-gcc riscv64-gcc riscv32-gcc ppc64le-gcc ] + pkgs.lib.optionals cross [ pkgs.qemu x86_64-gcc aarch64-gcc riscv64-gcc riscv32-gcc ppc64le-gcc pkgs.gcc-arm-embedded ] ++ pkgs.lib.optionals (cross && pkgs.stdenv.isLinux && pkgs.stdenv.isx86_64) [ aarch64_be-gcc ] ++ pkgs.lib.optionals cross [ native-gcc ] # git is not available in the nix shell on Darwin. As a workaround we add git as a dependency here. 
diff --git a/scripts/autogen b/scripts/autogen index dc9df25e9..f1a5643ca 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -1461,6 +1461,10 @@ def riscv64(c): return "/riscv64/" in c +def armv81m(c): + return "/armv81m/" in c + + def native_fips202(c): return native(c) and fips202(c) @@ -1477,11 +1481,16 @@ def native_fips202_x86_64(c): return native_fips202(c) and x86_64(c) +def native_fips202_armv81m(c): + return native_fips202(c) and armv81m(c) + + def native_fips202_core(c): return ( native_fips202(c) and not native_fips202_x86_64(c) and not native_fips202_aarch64(c) + and not native_fips202_armv81m(c) ) @@ -1590,6 +1599,11 @@ def gen_macro_undefs(extra_notes=None): filt=native_fips202_x86_64, desc="native code (FIPS202, x86_64)" ) yield "#endif" + yield "#if defined(MLD_SYS_ARMV81M_MVE)" + yield from gen_monolithic_undef_all_core( + filt=native_fips202_armv81m, desc="native code (FIPS202, Armv8.1-M)" + ) + yield "#endif" yield "#endif" yield "#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH)" yield from gen_monolithic_undef_all_core(filt=native_arith_core, desc="") @@ -1692,6 +1706,10 @@ def gen_monolithic_source_file(): for c in filter(native_fips202_x86_64, c_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLD_SYS_ARMV81M_MVE)" + for c in filter(native_fips202_armv81m, c_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield from gen_macro_undefs() @@ -1772,6 +1790,10 @@ def gen_monolithic_asm_file(): for c in filter(native_fips202_x86_64, asm_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLD_SYS_ARMV81M_MVE)" + for c in filter(native_fips202_armv81m, asm_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" # We generate #undef's for all headers, even though most are not @@ -2095,6 +2117,8 @@ def update_via_simpasm( source_arch = "aarch64" elif "x86_64" in infile_full: source_arch = "x86_64" + elif "armv81m" in infile_full: + source_arch = "armv81m" else: 
raise Exception(f"Could not detect architecture of source file {infile_full}.") # Check native architecture @@ -2103,7 +2127,15 @@ def update_via_simpasm( else: native_arch = "x86_64" - if native_arch != source_arch: + # Armv8.1-M is always cross-compiled + if source_arch == "armv81m": + cross_prefix = "arm-none-eabi-" + cross_gcc = cross_prefix + "gcc" + if shutil.which(cross_gcc) is None: + if force_cross is False: + return + raise Exception(f"Could not find cross toolchain {cross_prefix}") + elif native_arch != source_arch: cross_prefix = f"{source_arch}-unknown-linux-gnu-" cross_gcc = cross_prefix + "gcc" # Check if cross-compiler is present @@ -2117,18 +2149,25 @@ def update_via_simpasm( with tempfile.NamedTemporaryFile(suffix=".S") as tmp: try: # Determine architecture from filename - arch = "aarch64" if "aarch64" in infile_full else "x86_64" + if "aarch64" in infile_full: + arch = "aarch64" + elif "armv81m" in infile_full: + arch = "armv81m" + else: + arch = "x86_64" cmd = [ "./scripts/simpasm", "--objdump=llvm-objdump", - "--cfify", "--arch=" + arch, "-i", infile_full, "-o", tmp.name, ] + # TODO: Support CFI for Armv8.1-M + if arch != "armv81m": + cmd += ["--cfify"] if cross_prefix is not None: # Stick with llvm-objdump for disassembly cmd += ["--cc", cross_prefix + "gcc"] @@ -2210,12 +2249,13 @@ def update_via_remove(filename): update_file(filename, None) -def synchronize_file(f, in_dir, out_dir, delete=False, no_simplify=False, **kwargs): +# Only synchronize sources, but not README.md, Makefile and so on +SYNCHRONIZED_EXTENSIONS = (".c", ".h", ".i", ".inc", ".S") + - # Only synchronize sources, but not README.md, Makefile and so on - extensions = (".c", ".h", ".i", ".inc", ".S") +def synchronize_file(f, in_dir, out_dir, delete=False, no_simplify=False, **kwargs): - if not f.endswith(extensions): + if not f.endswith(SYNCHRONIZED_EXTENSIONS): return None basename = os.path.basename(f) @@ -2266,6 +2306,8 @@ def synchronize_backend(in_dir, out_dir, 
delete=False, no_simplify=False, **kwar for f in get_files(os.path.join(out_dir, "*")): if os.path.basename(f) in copied: continue + if not f.endswith(SYNCHRONIZED_EXTENSIONS): + continue # Otherwise, remove it update_via_remove(f) @@ -2341,6 +2383,22 @@ def synchronize_backends( # comparing the object code before and after simplification. cflags="-Imldsa/src/native/x86_64/src/ -mavx2 -mbmi2 -msse4 -fcf-protection=none", ) + synchronize_backend( + "dev/fips202/armv81m/src", + "mldsa/src/fips202/native/armv81m/src", + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + cflags="-Idev/fips202/armv81m/src -Imldsa/src/fips202/native/armv81m/src -march=armv8.1-m.main+mve -mthumb", + ) + synchronize_backend( + "dev/fips202/armv81m", + "mldsa/src/fips202/native/armv81m", + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + cflags="-Idev/fips202/armv81m -Imldsa/src/fips202/native/armv81m -march=armv8.1-m.main+mve -mthumb", + ) def adjust_header_guard_for_filename(content, header_file): diff --git a/scripts/simpasm b/scripts/simpasm index 289eac4c1..4b4c98499 100755 --- a/scripts/simpasm +++ b/scripts/simpasm @@ -249,6 +249,9 @@ def simplify(logger, args, asm_input, asm_output=None): cmd = [args.objdump, "--disassemble", tmp_objfile0] if platform.system() == "Darwin" and args.arch == "aarch64": cmd += ["--triple=aarch64"] + # Armv8.1-M requires explicit triple for Thumb disassembly + if args.arch == "armv81m": + cmd += ["--triple=thumbv8.1m.main-none-eabi"] # Add syntax option if specified if args.syntax and args.syntax.lower() != "att": @@ -267,14 +270,23 @@ def simplify(logger, args, asm_input, asm_output=None): f" * {asm_input} using scripts/simpasm. 
Do not modify it directly.", " */", "", - "#if defined(__ELF__)", - '.section .note.GNU-stack,"",@progbits', - "#endif", - "", - ".text", - ".balign 4", ] + # Add ELF section for non-baremetal targets + if args.arch != "armv81m": + autogen_header += [ + "#if defined(__ELF__)", + '.section .note.GNU-stack,"",@progbits', + "#endif", + "", + ] + + # Add Thumb directives for Armv8.1-M + if args.arch == "armv81m": + autogen_header += [".thumb", ".syntax unified", ""] + + autogen_header += [".text", ".balign 4"] + # Add syntax specifier for Intel syntax if args.syntax and args.syntax.lower() == "intel": autogen_header.append(".intel_syntax noprefix") @@ -344,8 +356,8 @@ def simplify(logger, args, asm_input, asm_output=None): logger.debug("Checking that byte-code is unchanged ...") - # When CFI is enabled, compare only the __text section content - if args.cfify: + # When CFI is enabled or for Armv8.1-M, compare only the __text section content + if args.cfify or args.arch == "armv81m": logger.debug("Comparing __text section content for CFI comparison...") # Extract __text section from both files diff --git a/test/baremetal/platform/m55-an547/platform.mk b/test/baremetal/platform/m55-an547/platform.mk index 6bae38540..6e9918123 100644 --- a/test/baremetal/platform/m55-an547/platform.mk +++ b/test/baremetal/platform/m55-an547/platform.mk @@ -13,6 +13,10 @@ CYCLES ?= PMU # Reduce iterations for benchmarking CFLAGS += -DMLD_BENCHMARK_NTESTS=3 -DMLD_BENCHMARK_NITERATIONS=2 -DMLD_BENCHMARK_NWARMUP=3 +# Explicitly include experimental Armv8.1-M + MVE backend +# Remove this once backend is finalized and enabled by default. 
+CFLAGS += "-DMLD_CONFIG_FIPS202_BACKEND_FILE=\"fips202/native/armv81m/mve.h\"" + CFLAGS += \ -O3 \ -Wall -Wextra -Wshadow \ diff --git a/test/mk/components.mk b/test/mk/components.mk index 7ac997ece..88931db96 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -4,7 +4,7 @@ FIPS202_SRCS = $(wildcard mldsa/src/fips202/*.c) ifeq ($(OPT),1) - FIPS202_SRCS += $(wildcard mldsa/src/fips202/native/aarch64/src/*.S) $(wildcard mldsa/src/fips202/native/aarch64/src/*.c) $(wildcard mldsa/src/fips202/native/x86_64/src/*.c) + FIPS202_SRCS += $(wildcard mldsa/src/fips202/native/aarch64/src/*.S) $(wildcard mldsa/src/fips202/native/aarch64/src/*.c) $(wildcard mldsa/src/fips202/native/x86_64/src/*.c) $(wildcard mldsa/src/fips202/native/armv81m/src/*.[csS]) endif diff --git a/test/src/test_unit.c b/test/src/test_unit.c index 0699f496d..6acb1a81b 100644 --- a/test/src/test_unit.c +++ b/test/src/test_unit.c @@ -113,7 +113,19 @@ static int compare_u64_arrays(const uint64_t *a, const uint64_t *b, defined(MLD_USE_NATIVE_POLYZ_UNPACK_19) || \ defined(MLD_USE_FIPS202_X1_NATIVE) || defined(MLD_USE_FIPS202_X4_NATIVE) -/* Backend unit test helper functions */ +/* Backend unit test helper functions for arithmetic native backends */ +#if defined(MLD_USE_NATIVE_NTT) || defined(MLD_USE_NATIVE_INTT) || \ + defined(MLD_USE_NATIVE_POLY_DECOMPOSE_32) || \ + defined(MLD_USE_NATIVE_POLY_DECOMPOSE_88) || \ + defined(MLD_USE_NATIVE_POLY_CADDQ) || \ + defined(MLD_USE_NATIVE_POLY_USE_HINT_88) || \ + defined(MLD_USE_NATIVE_POLY_USE_HINT_32) || \ + defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY) || \ + defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4) || \ + defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5) || \ + defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7) || \ + defined(MLD_USE_NATIVE_POLYZ_UNPACK_17) || \ + defined(MLD_USE_NATIVE_POLYZ_UNPACK_19) static void print_i32_array(const char *label, const int32_t *array, size_t len) { size_t i; @@ -151,7 +163,29 
@@ static void generate_i32_array_single(int32_t *data, size_t len, size_t pos, memset(data, 0, len * sizeof(int32_t)); data[pos] = value; } +#endif /* MLD_USE_NATIVE_NTT || MLD_USE_NATIVE_INTT || \ + MLD_USE_NATIVE_POLY_DECOMPOSE_32 || MLD_USE_NATIVE_POLY_DECOMPOSE_88 \ + || MLD_USE_NATIVE_POLY_CADDQ || MLD_USE_NATIVE_POLY_USE_HINT_88 || \ + MLD_USE_NATIVE_POLY_USE_HINT_32 || \ + MLD_USE_NATIVE_POINTWISE_MONTGOMERY || \ + MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 || \ + MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 || \ + MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 || \ + MLD_USE_NATIVE_POLYZ_UNPACK_17 || MLD_USE_NATIVE_POLYZ_UNPACK_19 */ +#if defined(MLD_USE_NATIVE_NTT) || defined(MLD_USE_NATIVE_INTT) || \ + defined(MLD_USE_NATIVE_POLY_DECOMPOSE_32) || \ + defined(MLD_USE_NATIVE_POLY_DECOMPOSE_88) || \ + defined(MLD_USE_NATIVE_POLY_CADDQ) || \ + defined(MLD_USE_NATIVE_POLY_USE_HINT_88) || \ + defined(MLD_USE_NATIVE_POLY_USE_HINT_32) || \ + defined(MLD_USE_NATIVE_POLY_CHKNORM) || \ + defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY) || \ + defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4) || \ + defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5) || \ + defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7) || \ + defined(MLD_USE_NATIVE_POLYZ_UNPACK_17) || \ + defined(MLD_USE_NATIVE_POLYZ_UNPACK_19) /* This does not generate a uniformly random distribution, but it's * good enough for our test. 
* @@ -169,7 +203,28 @@ static void generate_i32_array_ranged(int32_t *data, size_t len, int min_incl, ((unsigned)data[i] % (unsigned)(max_excl - min_incl))); } } +#endif /* MLD_USE_NATIVE_NTT || MLD_USE_NATIVE_INTT || \ + MLD_USE_NATIVE_POLY_DECOMPOSE_32 || MLD_USE_NATIVE_POLY_DECOMPOSE_88 \ + || MLD_USE_NATIVE_POLY_CADDQ || MLD_USE_NATIVE_POLY_USE_HINT_88 || \ + MLD_USE_NATIVE_POLY_USE_HINT_32 || MLD_USE_NATIVE_POLY_CHKNORM || \ + MLD_USE_NATIVE_POINTWISE_MONTGOMERY || \ + MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 || \ + MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 || \ + MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 || \ + MLD_USE_NATIVE_POLYZ_UNPACK_17 || MLD_USE_NATIVE_POLYZ_UNPACK_19 */ +#if defined(MLD_USE_NATIVE_NTT) || defined(MLD_USE_NATIVE_INTT) || \ + defined(MLD_USE_NATIVE_POLY_DECOMPOSE_32) || \ + defined(MLD_USE_NATIVE_POLY_DECOMPOSE_88) || \ + defined(MLD_USE_NATIVE_POLY_CADDQ) || \ + defined(MLD_USE_NATIVE_POLY_USE_HINT_88) || \ + defined(MLD_USE_NATIVE_POLY_USE_HINT_32) || \ + defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY) || \ + defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4) || \ + defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5) || \ + defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7) || \ + defined(MLD_USE_NATIVE_POLYZ_UNPACK_17) || \ + defined(MLD_USE_NATIVE_POLYZ_UNPACK_19) static int compare_i32_arrays(const int32_t *a, const int32_t *b, unsigned len, const char *test_name, const int32_t *input) { @@ -193,6 +248,15 @@ static int compare_i32_arrays(const int32_t *a, const int32_t *b, unsigned len, } return 1; } +#endif /* MLD_USE_NATIVE_NTT || MLD_USE_NATIVE_INTT || \ + MLD_USE_NATIVE_POLY_DECOMPOSE_32 || MLD_USE_NATIVE_POLY_DECOMPOSE_88 \ + || MLD_USE_NATIVE_POLY_CADDQ || MLD_USE_NATIVE_POLY_USE_HINT_88 || \ + MLD_USE_NATIVE_POLY_USE_HINT_32 || \ + MLD_USE_NATIVE_POINTWISE_MONTGOMERY || \ + MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 || \ + 
MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 || \ + MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 || \ + MLD_USE_NATIVE_POLYZ_UNPACK_17 || MLD_USE_NATIVE_POLYZ_UNPACK_19 */ #ifdef MLD_USE_NATIVE_NTT static int test_ntt_core(const int32_t *input, const char *test_name)