From f6003c720584c66f395d26c8f6163a20a4e4178b Mon Sep 17 00:00:00 2001 From: Brian Degenhardt Date: Thu, 26 Feb 2026 13:59:15 -0800 Subject: [PATCH 01/10] AArch64 JIT backend Enable HashLink VM on AArch64 (Apple Silicon, ARM Linux servers, etc.) by adding a new JIT backend alongside the existing x86/x64 one. - Rename jit.c to jit_x86.c, extract shared code into jit_common.h/jit_shared.c - Add jit_aarch64.c/jit_aarch64_emit.c for ARM64 instruction selection and encoding - Add jit_elf.c for GDB JIT debug interface - Architecture-aware JIT selection in Makefile and CMakeLists.txt - Add aarch64 support in hl.h, profile.c, hlmodule.h, module.c --- .github/workflows/build.yml | 42 +- CMakeLists.txt | 23 +- Makefile | 24 +- src/hl.h | 8 +- src/hlmodule.h | 3 + src/jit_aarch64.c | 6935 +++++++++++++++++++++++++++++++++++ src/jit_aarch64_emit.c | 855 +++++ src/jit_aarch64_emit.h | 182 + src/jit_common.h | 175 + src/jit_elf.c | 394 ++ src/jit_elf.h | 152 + src/jit_shared.c | 65 + src/{jit.c => jit_x86.c} | 140 +- src/module.c | 7 + src/opcodes.h | 28 + src/profile.c | 6 +- src/std/ucs2.c | 12 + 17 files changed, 8885 insertions(+), 166 deletions(-) create mode 100644 src/jit_aarch64.c create mode 100644 src/jit_aarch64_emit.c create mode 100644 src/jit_aarch64_emit.h create mode 100644 src/jit_common.h create mode 100644 src/jit_elf.c create mode 100644 src/jit_elf.h create mode 100644 src/jit_shared.c rename src/{jit.c => jit_x86.c} (94%) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0ff56c5cd..d95974115 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -326,23 +326,21 @@ jobs: ${{ env.WINDOWS_BUILD_FOLDER }}/hl.exe --version ;; make*) - if [[ ${{ matrix.architecture }} != arm64 ]]; then - ./hl --version - case ${{ matrix.target }} in - linux) ldd -v ./hl ;; - darwin) otool -L ./hl ;; - esac - - haxe -hl hello.hl -cp other/tests -main HelloWorld -D interp \ - ${{ matrix.architecture == 32 && '-D hl-legacy32' || '' 
}} - ./hl hello.hl - - # ensure the executable still works when installed globally - cp hello.hl /tmp - pushd /tmp - hl hello.hl - popd - fi + ./hl --version + case ${{ matrix.target }} in + linux) ldd -v ./hl ;; + darwin) otool -L ./hl ;; + esac + + haxe -hl hello.hl -cp other/tests -main HelloWorld -D interp \ + ${{ matrix.architecture == 32 && '-D hl-legacy32' || '' }} + ./hl hello.hl + + # ensure the executable still works when installed globally + cp hello.hl /tmp + pushd /tmp + hl hello.hl + popd haxe -hl src/_main.c -cp other/tests -main HelloWorld make hlc @@ -431,9 +429,6 @@ jobs: os: [darwin, linux, windows] architecture: [x86_32, x86_64, arm64] include: - - architecture: arm64 - test-flags: --skip-hl-jit # not yet supported - - architecture: x86_32 test-flags: --skip-hl-jit # not stable @@ -492,12 +487,7 @@ jobs: tar -xvzf hashlink-*.tar.gz cd $(echo hashlink-*/) sudo mkdir -p /usr/local/{bin,lib} - if [[ "${{ matrix.architecture }}" == "arm64" ]]; then - echo "echo Jit is not supported on arm64" > /usr/local/bin/hl - chmod +x /usr/local/bin/hl - else - sudo cp hl /usr/local/bin/ - fi + sudo cp hl /usr/local/bin/ sudo cp libhl.* *.hdll /usr/local/lib/ - name: Install binary (Windows) diff --git a/CMakeLists.txt b/CMakeLists.txt index 34c8755fd..f582b5fe4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,9 +20,7 @@ include(FindPkgConfig) include(CTest) set(WITH_VM_DEFAULT ON) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64" AND (NOT CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")) - set(WITH_VM_DEFAULT OFF) -endif() +# VM now supports x86, x86-64, and AArch64 architectures option(WITH_VM "Whether to build the Hashlink virtual machine" ${WITH_VM_DEFAULT}) option(BUILD_SHARED_LIBS "Build using shared libraries" ON) @@ -224,9 +222,24 @@ else() endif() if (WITH_VM) + # Select JIT backend based on architecture + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + set(JIT_SOURCES + src/jit_aarch64.c + src/jit_aarch64_emit.c + src/jit_elf.c + src/jit_shared.c + ) + 
else() + set(JIT_SOURCES + src/jit_x86.c + src/jit_shared.c + ) + endif() + add_executable(hl src/code.c - src/jit.c + ${JIT_SOURCES} src/main.c src/module.c src/debugger.c @@ -246,6 +259,8 @@ if (WITH_VM) if (WIN32) target_link_libraries(hl user32) + else() + target_link_libraries(hl dl) # For dlopen/dlsym in jit_elf.c endif() endif() diff --git a/Makefile b/Makefile index 52c80ab12..83ec5a990 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,16 @@ STD = src/std/array.o src/std/buffer.o src/std/bytes.o src/std/cast.o src/std/da src/std/socket.o src/std/string.o src/std/sys.o src/std/types.o src/std/ucs2.o src/std/thread.o src/std/process.o \ src/std/track.o -HL_OBJ = src/code.o src/jit.o src/main.o src/module.o src/debugger.o src/profile.o +# Conditional JIT backend selection based on architecture +ifeq ($(ARCH),aarch64) + HL_JIT = src/jit_aarch64.o src/jit_aarch64_emit.o src/jit_elf.o src/jit_shared.o +else ifeq ($(ARCH),arm64) + HL_JIT = src/jit_aarch64.o src/jit_aarch64_emit.o src/jit_elf.o src/jit_shared.o +else + HL_JIT = src/jit_x86.o src/jit_elf.o src/jit_shared.o +endif + +HL_OBJ = src/code.o $(HL_JIT) src/main.o src/module.o src/debugger.o src/profile.o FMT_CPPFLAGS = -I include/mikktspace -I include/minimp3 @@ -239,19 +248,12 @@ LIBHL = libhl.$(LIBEXT) HL = hl$(EXE_SUFFIX) HLC = hlc$(EXE_SUFFIX) -all: $(LIBHL) libs -ifeq ($(ARCH),arm64) - $(warning HashLink vm is not supported on arm64, skipping) -else -all: $(HL) -endif +all: $(LIBHL) libs $(HL) install: $(UNAME)==Darwin && ${MAKE} uninstall -ifneq ($(ARCH),arm64) mkdir -p $(INSTALL_BIN_DIR) cp $(HL) $(INSTALL_BIN_DIR) -endif mkdir -p $(INSTALL_LIB_DIR) cp *.hdll $(INSTALL_LIB_DIR) cp $(LIBHL) $(INSTALL_LIB_DIR) @@ -362,11 +364,7 @@ release_win: rm -rf $(PACKAGE_NAME) release_linux release_osx: -ifeq ($(ARCH),arm64) - cp $(LIBHL) *.hdll $(PACKAGE_NAME) -else cp $(HL) $(LIBHL) *.hdll $(PACKAGE_NAME) -endif tar -cvzf $(PACKAGE_NAME).tar.gz $(PACKAGE_NAME) rm -rf $(PACKAGE_NAME) diff --git a/src/hl.h 
b/src/hl.h index 2da71da81..85e56d28a 100644 --- a/src/hl.h +++ b/src/hl.h @@ -93,10 +93,14 @@ # define HL_BSD #endif -#if defined(_64BITS) || defined(__x86_64__) || defined(_M_X64) || defined(__LP64__) || defined(__wasm64__) +#if defined(_64BITS) || defined(__x86_64__) || defined(_M_X64) || defined(__LP64__) || defined(__wasm64__) || defined(__aarch64__) # define HL_64 #endif +#if defined(__aarch64__) || defined(_M_ARM64) +# define HL_ARM64 +#endif + #if defined(__GNUC__) # define HL_GCC #endif @@ -880,6 +884,8 @@ typedef struct { int length; } vstring; +HL_API int hl_str_cmp( vstring *a, vstring *b ); + #define DEFINE_PRIM(t,name,args) DEFINE_PRIM_WITH_NAME(t,name,args,name) #define _DEFINE_PRIM_WITH_NAME(t,name,args,realName) C_FUNCTION_BEGIN EXPORT void *hlp_##realName( const char **sign ) { *sign = _FUN(t,args); return (void*)(&HL_NAME(name)); } C_FUNCTION_END diff --git a/src/hlmodule.h b/src/hlmodule.h index d8ea8c912..1f75263f7 100644 --- a/src/hlmodule.h +++ b/src/hlmodule.h @@ -120,6 +120,8 @@ typedef struct { #define WIN64_UNWIND_TABLES #endif +struct jit_code_entry; /* Forward declaration for GDB JIT interface */ + typedef struct { hl_code *code; int codesize; @@ -132,6 +134,7 @@ typedef struct { hl_code_hash *hash; hl_debug_infos *jit_debug; jit_ctx *jit_ctx; + struct jit_code_entry *gdb_jit_entry; /* GDB JIT interface registration (or NULL) */ hl_module_context ctx; #ifdef WIN64_UNWIND_TABLES PRUNTIME_FUNCTION unwind_table; diff --git a/src/jit_aarch64.c b/src/jit_aarch64.c new file mode 100644 index 000000000..90eebde7f --- /dev/null +++ b/src/jit_aarch64.c @@ -0,0 +1,6935 @@ +/* + * Copyright (C)2015-2016 Haxe Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell 
copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/* + * AArch64 JIT Backend + * + * This file contains the AArch64-specific implementation of the HashLink JIT compiler. + * It translates HashLink bytecode into native AArch64 machine code. + * + * Key differences from x86: + * - Fixed 32-bit instruction encoding (vs variable-length on x86) + * - Load/store architecture (no direct memory operands in ALU ops) + * - AAPCS64 calling convention (8 register args vs 4/6 on x86-64) + * - Condition codes with branch instructions (no conditional moves in base ISA) + * - PC-relative addressing with ADRP/ADD for globals + */ + +#if !defined(__aarch64__) && !defined(_M_ARM64) +# error "This file is for AArch64 architecture only. Use jit_x86.c for x86/x64." 
+#endif + +#include +#include +#include +#include +#include +#include "jit_common.h" +#include "jit_aarch64_emit.h" +#include "jit_elf.h" +#include "hlsystem.h" + +// Helper for LDR/STR scaled offset from struct field +#define FIELD_OFFSET_SCALED(type, field) (offsetof(type, field) / 8) + +// ============================================================================ +// AArch64 Register Configuration (AAPCS64) +// ============================================================================ + +/* + * AAPCS64 (ARM Architecture Procedure Call Standard for ARM64) + * + * Register Usage: + * - X0-X7: Argument/result registers (caller-saved) + * - X8: Indirect result location register (caller-saved) + * - X9-X15: Temporary registers (caller-saved) + * - X16-X17: Intra-procedure-call temporary registers (caller-saved) + * - X18: Platform register (avoid use - may be reserved by OS) + * - X19-X28: Callee-saved registers + * - X29: Frame pointer (FP) + * - X30: Link register (LR) + * - SP: Stack pointer (must be 16-byte aligned) + * + * FP/SIMD Registers: + * - V0-V7: Argument/result registers (caller-saved) + * - V8-V15: Callee-saved (only lower 64 bits, D8-D15) + * - V16-V31: Temporary registers (caller-saved) + */ + +#define RCPU_COUNT 31 // X0-X30 (SP is not a general register) +#define RFPU_COUNT 32 // V0-V31 + +// Calling convention: first 8 args in X0-X7 +#define CALL_NREGS 8 +static const Arm64Reg CALL_REGS[] = { X0, X1, X2, X3, X4, X5, X6, X7 }; +static const Arm64FpReg FP_CALL_REGS[] = { V0, V1, V2, V3, V4, V5, V6, V7 }; + +// Caller-saved (scratch) registers: X0-X17 (avoid X18) +// Note: We use X0-X17 as scratch, but X0-X7 are also argument registers +#define RCPU_SCRATCH_COUNT 18 + +// vdynamic structure: type (8 bytes) + value (8 bytes) +#define HDYN_VALUE 8 +static const Arm64Reg RCPU_SCRATCH_REGS[] = { + X0, X1, X2, X3, X4, X5, X6, X7, + X8, X9, X10, X11, X12, X13, X14, X15, + X16, X17 +}; + +// All FP registers are caller-saved in AAPCS64 +#define 
RFPU_SCRATCH_COUNT 32 + +// Callee-saved registers: X19-X28 +// X29 (FP) and X30 (LR) are also callee-saved but handled specially +#define RCPU_CALLEE_SAVED_COUNT 10 +static const Arm64Reg RCPU_CALLEE_SAVED[] = { + X19, X20, X21, X22, X23, X24, X25, X26, X27, X28 +}; + +// Callee-saved registers available for allocation +// These survive function calls, so we don't need to spill them before BLR +#define RCPU_CALLEE_ALLOC_COUNT 10 +static const Arm64Reg RCPU_CALLEE_ALLOC[] = { + X19, X20, X21, X22, X23, X24, X25, X26, X27, X28 +}; + +// FP callee-saved: V8-V15 (only lower 64 bits per AAPCS64) +#define RFPU_CALLEE_SAVED_COUNT 8 +static const Arm64FpReg RFPU_CALLEE_SAVED[] = { + V8, V9, V10, V11, V12, V13, V14, V15 +}; + +// Frame size for callee-saved registers + FP/LR +// CPU callee-saved: RCPU_CALLEE_SAVED_COUNT * 8 bytes = 80 bytes +// FPU callee-saved: RFPU_CALLEE_SAVED_COUNT * 8 bytes = 64 bytes (D8-D15, lower 64 bits only) +// FP/LR: 16 bytes +// Total: 160 bytes (must be 16-byte aligned) +#define CALLEE_SAVED_FRAME_SIZE (RCPU_CALLEE_SAVED_COUNT * 8 + RFPU_CALLEE_SAVED_COUNT * 8 + 16) + +// Helper macros for accessing registers +#define REG_COUNT (RCPU_COUNT + RFPU_COUNT) +#define VFPR(i) ((i) + RCPU_COUNT) // FP register index +#define PVFPR(i) REG_AT(VFPR(i)) // Pointer to FP register + +// Reserved registers for JIT internal use (caller-saved, no need to preserve) +#define RTMP X16 // Temporary register for multi-instruction sequences (IP0) +#define RTMP2 X17 // Second temporary register (IP1) + +// Special purpose registers +#define RFP X29 // Frame pointer +#define RLR X30 // Link register + +// Stack alignment requirement +#define STACK_ALIGN 16 + +// EMIT32 is defined in jit_common.h - use EMIT32(ctx,ctx, val) + +// ============================================================================ +// Error Handling +// ============================================================================ + +void _jit_error(jit_ctx *ctx, const char *msg, int line) { + 
printf("JIT ERROR: %s (jit_aarch64.c:%d)\n", msg, line); + if (ctx && ctx->f) { + // hl_function doesn't have a 'name' field directly + // The function object info would be in the module + int func_index = (int)(ctx->f - ctx->m->code->functions); + printf("In function at index %d\n", func_index); + } + jit_exit(); +} + +void on_jit_error(const char *msg, int_val line) { + printf("JIT Runtime Error: %s (line %d)\n", msg, (int)line); + jit_exit(); +} + +#define JIT_ASSERT(cond) do { if (!(cond)) { \ + printf("JIT ASSERTION FAILED: %s (jit_aarch64.c:%d)\n", #cond, __LINE__); \ + jit_exit(); \ +} } while(0) + +// ============================================================================ +// Register Allocation Helpers +// ============================================================================ + +/** + * Check if a CPU register is a call (argument) register + */ +static bool is_call_reg(Arm64Reg r) { + for (int i = 0; i < CALL_NREGS; i++) { + if (CALL_REGS[i] == r) + return true; + } + return false; +} + +/** + * Get the index of a register in the call register array + * Returns -1 if not a call register + */ +static int call_reg_index(Arm64Reg r) { + for (int i = 0; i < CALL_NREGS; i++) { + if (CALL_REGS[i] == r) + return i; + } + return -1; +} + +/** + * Check if a register is callee-saved (must be preserved across calls) + */ +static bool is_callee_saved_cpu(Arm64Reg r) { + for (int i = 0; i < RCPU_CALLEE_SAVED_COUNT; i++) { + if (RCPU_CALLEE_SAVED[i] == r) + return true; + } + return r == RFP || r == RLR; +} + +static bool is_callee_saved_fpu(Arm64FpReg r) { + for (int i = 0; i < RFPU_CALLEE_SAVED_COUNT; i++) { + if (RFPU_CALLEE_SAVED[i] == r) + return true; + } + return false; +} + +/** + * Check if type is String (HOBJ with bytes:HBYTES + length:HI32) + * Used for value-based string comparison per Haxe spec. 
+ */ +static bool is_string_type(hl_type *t) { + if (t->kind != HOBJ || !t->obj) return false; + if (t->obj->nfields != 2) return false; + return t->obj->fields[0].t->kind == HBYTES && + t->obj->fields[1].t->kind == HI32; +} + +// ============================================================================ +// Register Allocation +// ============================================================================ + +// Forward declarations +static void free_reg(jit_ctx *ctx, preg *p); +static void patch_jump(jit_ctx *ctx, int pos, int target_pos); + +/** + * Find a free CPU register, evicting if necessary + * @param k Register kind (RCPU, RCPU_CALL, etc.) + * @return Pointer to allocated physical register + */ +static preg *alloc_cpu(jit_ctx *ctx, preg_kind k) { + preg *p; + int i; + int start_idx = 0; + int count = RCPU_SCRATCH_COUNT; + const Arm64Reg *regs = RCPU_SCRATCH_REGS; + + // For RCPU_CALL, only use non-argument registers + if (k == RCPU_CALL) { + // Use registers that are NOT in CALL_REGS + // For now, use X8-X17 (scratch registers that aren't args) + start_idx = 8; // Start from X8 + } + + // First pass: find a free scratch register (not holding anything and not locked) + // Lock check: p->lock >= ctx->currentPos means locked at current operation + for (i = start_idx; i < count; i++) { + p = REG_AT(regs[i]); + if (p->holds == NULL && p->lock < ctx->currentPos) + return p; + } + + // Second pass: try callee-saved registers (X19-X26) before evicting scratch + // These survive function calls, so values don't need to be spilled before BLR + for (i = 0; i < RCPU_CALLEE_ALLOC_COUNT; i++) { + p = REG_AT(RCPU_CALLEE_ALLOC[i]); + if (p->holds == NULL && p->lock < ctx->currentPos) { + ctx->callee_saved_used |= (1 << i); // Mark register as used for Phase 2 NOP patching + return p; + } + } + + // Third pass: evict a callee-saved register if one is unlocked + for (i = 0; i < RCPU_CALLEE_ALLOC_COUNT; i++) { + p = REG_AT(RCPU_CALLEE_ALLOC[i]); + if (p->lock < 
ctx->currentPos) { + ctx->callee_saved_used |= (1 << i); // Mark register as used for Phase 2 NOP patching + free_reg(ctx, p); // Spill to stack before reusing + return p; + } + } + + // Fourth pass: evict a scratch register + for (i = start_idx; i < count; i++) { + p = REG_AT(regs[i]); + if (p->lock < ctx->currentPos) { + free_reg(ctx, p); // Spill to stack before reusing + return p; + } + } + + // All registers are locked - this is an error + JIT_ASSERT(0); + return NULL; +} + +/** + * Allocate a floating-point register + */ +static preg *alloc_fpu(jit_ctx *ctx) { + preg *p; + int i; + + // First pass: find a free register + // Prefer caller-saved registers (V0-V7, V16-V31) + // Lock check: p->lock >= ctx->currentPos means locked at current operation + for (i = 0; i < RFPU_COUNT; i++) { + if (i >= 8 && i < 16) + continue; // Skip callee-saved V8-V15 for now + p = PVFPR(i); + if (p->holds == NULL && p->lock < ctx->currentPos) + return p; + } + + // Second pass: try callee-saved if needed + for (i = 8; i < 16; i++) { + p = PVFPR(i); + if (p->holds == NULL && p->lock < ctx->currentPos) { + ctx->fpu_callee_saved_used |= (1 << (i - 8)); // Track V8-V15 usage + return p; + } + } + + // Third pass: evict an unlocked register + for (i = 0; i < RFPU_COUNT; i++) { + p = PVFPR(i); + if (p->lock < ctx->currentPos) { + if (i >= 8 && i < 16) { + ctx->fpu_callee_saved_used |= (1 << (i - 8)); // Track V8-V15 usage + } + free_reg(ctx, p); // Spill to stack before reusing + return p; + } + } + + JIT_ASSERT(0); + return NULL; +} + +/** + * Allocate a register of the appropriate type based on the virtual register's type + */ +static preg *alloc_reg(jit_ctx *ctx, vreg *r, preg_kind k) { + if (IS_FLOAT(r)) + return alloc_fpu(ctx); + else + return alloc_cpu(ctx, k); +} + +// ============================================================================ +// Register State Management +// ============================================================================ + +/** + * Store a virtual 
register to its stack location + */ +static void store(jit_ctx *ctx, vreg *r, preg *p); // Forward declaration +static void mov_reg_reg(jit_ctx *ctx, Arm64Reg dst, Arm64Reg src, bool is_64bit); // Forward declaration +static void ldr_stack(jit_ctx *ctx, Arm64Reg dst, int stack_offset, int size); // Forward declaration +static void emit_call_findex(jit_ctx *ctx, int findex, int stack_space); // Forward declaration + +/** + * Free a physical register by storing its content to stack if needed + */ +static void free_reg(jit_ctx *ctx, preg *p) { + vreg *r = p->holds; + if (r != NULL) { + store(ctx, r, p); + r->current = NULL; + p->holds = NULL; + } + // Unlock the register so it can be reused + RUNLOCK(p); +} + +/** + * Discard the content of a physical register, storing if dirty. + * Used when we're done using a register but the vreg might still be live. + * If the vreg is dirty (modified but not yet on stack), we store it first. + */ +static void discard(jit_ctx *ctx, preg *p) { + vreg *r = p->holds; + if (r != NULL) { + // If dirty, store to stack before clearing the binding + if (r->dirty) { + store(ctx, r, p); + } + r->current = NULL; + p->holds = NULL; + } + // Unlock the register so it can be reused + RUNLOCK(p); +} + +/** + * Spill all caller-saved registers to stack before a function call. + * In AAPCS64: X0-X17 and V0-V7 are caller-saved and may be clobbered. + * + * This function: + * 1. Stores each bound register's value to its vreg's stack slot + * 2. Clears the register↔vreg bindings + * + * IMPORTANT: Must be called BEFORE the BLR instruction, not after! + * At that point register values are still valid and can be spilled to stack. + * After the call, caller-saved registers contain garbage from the callee. + * + * ARCHITECTURAL NOTE - Why AArch64 differs from x86: + * + * The x86 JIT's discard_regs() just clears register bindings without spilling. 
+ * This works because x86 (CISC) can use memory operands directly in ALU + * instructions: + * + * x86: ADD [rbp-8], rax ; Operate directly on stack slot + * + * So x86 treats stack slots as the "source of truth" - values are written + * to stack as part of normal operations, and registers are just caches. + * Clearing bindings is safe because the value is already on the stack. + * + * AArch64 (RISC load/store architecture) cannot do this: + * + * AArch64: LDR x1, [fp, #-8] ; Must load to register first + * ADD x0, x0, x1 ; Operate on registers only + * STR x0, [fp, #-8] ; Separate store instruction + * + * Adding a store after every operation would cost ~1 extra instruction per op. + * Instead, we keep values in registers (registers are "source of truth") and + * only spill when necessary - specifically, before function calls that will + * clobber caller-saved registers. + * + * This is not a workaround but the natural design for load/store architectures. + */ +static void spill_regs(jit_ctx *ctx) { + int i; + // Spill and discard CPU scratch registers (X0-X17) - these get clobbered by calls + for (i = 0; i < 18; i++) { + preg *r = &ctx->pregs[i]; + if (r->holds) { + if (r->holds->dirty) { + free_reg(ctx, r); // Dirty: store to stack, then clear binding + } else { + discard(ctx, r); // Clean: just clear binding (value already on stack) + } + } + } + // NOTE: Do NOT spill callee-saved registers (X19-X26)! + // They survive function calls, so their values remain valid after BLR. + // This is the key optimization - values in callee-saved don't need spilling. 
+ + // Spill and discard FPU scratch registers (V0-V7, V16-V31) - these get clobbered by calls + // V8-V15 are callee-saved and survive calls + for (i = 0; i < 8; i++) { + preg *r = &ctx->pregs[RCPU_COUNT + i]; + if (r->holds) { + if (r->holds->dirty) { + free_reg(ctx, r); // Dirty: store to stack, then clear binding + } else { + discard(ctx, r); // Clean: just clear binding (value already on stack) + } + } + } + // Also spill V16-V31 (caller-saved temporary FPU registers) + for (i = 16; i < 32; i++) { + preg *r = &ctx->pregs[RCPU_COUNT + i]; + if (r->holds) { + if (r->holds->dirty) { + free_reg(ctx, r); // Dirty: store to stack, then clear binding + } else { + discard(ctx, r); // Clean: just clear binding (value already on stack) + } + } + } +} + +/** + * Spill callee-saved registers to stack. + * Called before jumps to labels - callee-saved must be on stack at merge points. + * NOTE: This is NOT called before function calls (callee-saved survive calls). + */ +static void spill_callee_saved(jit_ctx *ctx) { + int i; + // Spill callee-saved CPU registers (X19-X26) that are in use + for (i = 0; i < RCPU_CALLEE_ALLOC_COUNT; i++) { + preg *r = REG_AT(RCPU_CALLEE_ALLOC[i]); + if (r->holds) { + if (r->holds->dirty) { + free_reg(ctx, r); // Dirty: store to stack, then clear binding + } else { + discard(ctx, r); // Clean: just clear binding + } + } + } +} + +/** + * Ensure a virtual register is in a physical register + * Loads from stack if necessary + */ +static preg *fetch(jit_ctx *ctx, vreg *r); // Forward declaration + +/** + * Bind a vreg to a physical register (bidirectional association) + * This is essential for proper spilling when the register is evicted + */ +static void reg_bind(jit_ctx *ctx, vreg *r, preg *p) { + // If vreg was dirty in another register, store to stack first + // This prevents losing values when rebinding (e.g., dst = dst op src) + if (r->current && r->current != p) { + if (r->dirty) { + store(ctx, r, r->current); + } + r->current->holds = 
NULL; + } + // Set new binding + r->current = p; + p->holds = r; +} + +/** + * Allocate a destination register for a vreg + * Helper function used by many operations + * Binds the vreg to the allocated register for proper spilling + */ +static preg *alloc_dst(jit_ctx *ctx, vreg *r) { + preg *p; + if (IS_FLOAT(r)) { + p = alloc_fpu(ctx); + } else { + p = alloc_cpu(ctx, RCPU); + } + // Bind the vreg to this register so we can spill it later if needed + reg_bind(ctx, r, p); + // Mark dirty: a new value is about to be written to this register, + // and it's not on the stack yet. This ensures spill_regs() will + // store it before the next call/jump. + r->dirty = 1; + return p; +} + +// ============================================================================ +// Basic Data Movement - Encoding Helpers +// ============================================================================ + +/** + * Generate MOV instruction (register to register) + * For integer: MOV Xd, Xn (using ORR Xd, XZR, Xn) + * For float: FMOV Vd, Vn + */ +static void mov_reg_reg(jit_ctx *ctx, Arm64Reg dst, Arm64Reg src, bool is_64bit) { + // SP (register 31) can't be used with ORR - must use ADD instead + if (src == SP_REG || dst == SP_REG) { + // MOV Xd, SP or MOV SP, Xn => ADD Xd, Xn, #0 + encode_add_sub_imm(ctx, is_64bit ? 1 : 0, 0, 0, 0, 0, src, dst); + } else if (is_64bit) { + // MOV Xd, Xn => ORR Xd, XZR, Xn + encode_logical_reg(ctx, 1, 0x01, 0, 0, src, 0, XZR, dst); + } else { + // MOV Wd, Wn => ORR Wd, WZR, Wn + encode_logical_reg(ctx, 0, 0x01, 0, 0, src, 0, XZR, dst); + } +} + +static void fmov_reg_reg(jit_ctx *ctx, Arm64FpReg dst, Arm64FpReg src, bool is_double) { + // FMOV Vd, Vn (using FP 1-source with opcode 0) + int type = is_double ? 
0x01 : 0x00; // 01=double, 00=single + encode_fp_1src(ctx, 0, 0, type, 0, src, dst); +} + +/** + * Load from stack to register + * Format: LDR/LDUR Xt, [FP, #offset] + * + * Uses LDUR for signed offsets in range [-256, +255] (single instruction) + * Uses LDR with scaled unsigned offset for aligned positive offsets + * Falls back to computing address in register for large offsets + */ +static void ldr_stack(jit_ctx *ctx, Arm64Reg dst, int stack_offset, int size) { + int size_enc = (size == 8) ? 3 : ((size == 4) ? 2 : ((size == 2) ? 1 : 0)); + + // Priority 1: Use LDUR for small signed offsets (-256 to +255) + // This handles most negative stack offsets in a single instruction + if (stack_offset >= -256 && stack_offset <= 255) { + encode_ldur_stur(ctx, size_enc, 0, 0x01, stack_offset, RFP, dst); + return; + } + + // Priority 2: Use LDR with scaled unsigned offset for larger positive aligned offsets + if (stack_offset >= 0 && (stack_offset % size == 0) && stack_offset < 4096 * size) { + int scaled_offset = stack_offset / size; + encode_ldr_str_imm(ctx, size_enc, 0, 0x01, scaled_offset, RFP, dst); + return; + } + + // Fallback: Compute address in register for large/unaligned offsets + load_immediate(ctx, stack_offset, RTMP, true); + encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP); + encode_ldr_str_imm(ctx, size_enc, 0, 0x01, 0, RTMP, dst); +} + +/** + * Load from stack to FP register + * Format: LDR/LDUR Dt/St, [FP, #offset] + */ +static void ldr_stack_fp(jit_ctx *ctx, Arm64FpReg dst, int stack_offset, int size) { + int size_enc = (size == 8) ? 3 : ((size == 4) ? 
2 : 1); + + // Priority 1: Use LDUR for small signed offsets (-256 to +255) + if (stack_offset >= -256 && stack_offset <= 255) { + encode_ldur_stur(ctx, size_enc, 1, 0x01, stack_offset, RFP, dst); + return; + } + + // Priority 2: Use LDR with scaled unsigned offset for larger positive aligned offsets + if (stack_offset >= 0 && (stack_offset % size == 0) && stack_offset < 4096 * size) { + int scaled_offset = stack_offset / size; + encode_ldr_str_imm(ctx, size_enc, 1, 0x01, scaled_offset, RFP, dst); + return; + } + + // Fallback: Compute address in register + load_immediate(ctx, stack_offset, RTMP, true); + encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP); + encode_ldr_str_imm(ctx, size_enc, 1, 0x01, 0, RTMP, dst); +} + +/** + * Store register to stack + * Format: STR/STUR Xt, [FP, #offset] + * + * Uses STUR for signed offsets in range [-256, +255] (single instruction) + * Uses STR with scaled unsigned offset for aligned positive offsets + * Falls back to computing address in register for large offsets + */ +static void str_stack(jit_ctx *ctx, Arm64Reg src, int stack_offset, int size) { + int size_enc = (size == 8) ? 3 : ((size == 4) ? 2 : ((size == 2) ? 
1 : 0)); + + // Priority 1: Use STUR for small signed offsets (-256 to +255) + if (stack_offset >= -256 && stack_offset <= 255) { + encode_ldur_stur(ctx, size_enc, 0, 0x00, stack_offset, RFP, src); + return; + } + + // Priority 2: Use STR with scaled unsigned offset for larger positive aligned offsets + if (stack_offset >= 0 && (stack_offset % size == 0) && stack_offset < 4096 * size) { + int scaled_offset = stack_offset / size; + encode_ldr_str_imm(ctx, size_enc, 0, 0x00, scaled_offset, RFP, src); + return; + } + + // Fallback: Compute address in register + load_immediate(ctx, stack_offset, RTMP, true); + encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP); + encode_ldr_str_imm(ctx, size_enc, 0, 0x00, 0, RTMP, src); +} + +/** + * Store FP register to stack + * Format: STR/STUR Dt/St, [FP, #offset] + */ +static void str_stack_fp(jit_ctx *ctx, Arm64FpReg src, int stack_offset, int size) { + int size_enc = (size == 8) ? 3 : ((size == 4) ? 2 : 1); + + // Priority 1: Use STUR for small signed offsets (-256 to +255) + if (stack_offset >= -256 && stack_offset <= 255) { + encode_ldur_stur(ctx, size_enc, 1, 0x00, stack_offset, RFP, src); + return; + } + + // Priority 2: Use STR with scaled unsigned offset for larger positive aligned offsets + if (stack_offset >= 0 && (stack_offset % size == 0) && stack_offset < 4096 * size) { + int scaled_offset = stack_offset / size; + encode_ldr_str_imm(ctx, size_enc, 1, 0x00, scaled_offset, RFP, src); + return; + } + + // Fallback: Compute address in register + load_immediate(ctx, stack_offset, RTMP, true); + encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP); + encode_ldr_str_imm(ctx, size_enc, 1, 0x00, 0, RTMP, src); +} + +/** + * STP with signed offset (no writeback) - for NOPpable callee-saved saves. + * Format: STP Xt1, Xt2, [Xn, #imm] + * This allows individual STPs to be patched to NOPs without affecting SP. 
+ */ +static void stp_offset(jit_ctx *ctx, Arm64Reg rt, Arm64Reg rt2, Arm64Reg rn, int offset) { + // STP imm7 is signed, scaled by 8: range -512 to +504 + if (offset < -512 || offset > 504) hl_fatal("stp_offset: offset out of range"); + int imm7 = offset / 8; + // opc=10 (64-bit), 101, addr_mode=10 (signed offset), L=0 (store), imm7, Rt2, Rn, Rt + unsigned int insn = (2u << 30) | (5u << 27) | (2u << 23) | (0u << 22) | + ((imm7 & 0x7F) << 15) | (rt2 << 10) | (rn << 5) | rt; + EMIT32(ctx, insn); +} + +/** + * LDP with signed offset (no writeback) - for NOPpable callee-saved restores. + * Format: LDP Xt1, Xt2, [Xn, #imm] + */ +static void ldp_offset(jit_ctx *ctx, Arm64Reg rt, Arm64Reg rt2, Arm64Reg rn, int offset) { + int imm7 = offset / 8; + // opc=10 (64-bit), 101, addr_mode=10 (signed offset), L=1 (load), imm7, Rt2, Rn, Rt + unsigned int insn = (2u << 30) | (5u << 27) | (2u << 23) | (1u << 22) | + ((imm7 & 0x7F) << 15) | (rt2 << 10) | (rn << 5) | rt; + EMIT32(ctx, insn); +} + +/** + * STP for 64-bit FP registers (D regs) with signed offset. + * Format: STP Dt1, Dt2, [Xn, #imm] + * Encoding: opc=01 (64-bit FP), V=1, addr_mode=10 + * Verified: arm-enc "stp d8, d9, [sp, #96]" -> 0x6d0627e8 + */ +static void stp_offset_fp(jit_ctx *ctx, Arm64FpReg rt, Arm64FpReg rt2, Arm64Reg rn, int offset) { + if (offset < -512 || offset > 504) hl_fatal("stp_offset_fp: offset out of range"); + int imm7 = offset / 8; + // opc=01 (64-bit FP), 101, V=1, addr_mode=10, L=0 + unsigned int insn = (1u << 30) | (5u << 27) | (1u << 26) | (2u << 23) | (0u << 22) | + ((imm7 & 0x7F) << 15) | (rt2 << 10) | (rn << 5) | rt; + EMIT32(ctx, insn); +} + +/** + * LDP for 64-bit FP registers (D regs) with signed offset. 
+ * Verified: arm-enc "ldp d14, d15, [sp, #144]" -> 0x6d493fee + */ +static void ldp_offset_fp(jit_ctx *ctx, Arm64FpReg rt, Arm64FpReg rt2, Arm64Reg rn, int offset) { + int imm7 = offset / 8; + // opc=01 (64-bit FP), 101, V=1, addr_mode=10, L=1 + unsigned int insn = (1u << 30) | (5u << 27) | (1u << 26) | (2u << 23) | (1u << 22) | + ((imm7 & 0x7F) << 15) | (rt2 << 10) | (rn << 5) | rt; + EMIT32(ctx, insn); +} + +// ============================================================================ +// Data Movement Operations +// ============================================================================ + +/** + * Store a virtual register to its stack location + */ +static void store(jit_ctx *ctx, vreg *r, preg *p) { + if (r == NULL || p == NULL || r->size == 0) + return; + + int size = r->size; + int offset = r->stackPos; + + if (p->kind == RCPU) { + str_stack(ctx, p->id, offset, size); + } else if (p->kind == RFPU) { + str_stack_fp(ctx, p->id, offset, size); + } + + r->dirty = 0; // Stack is now up-to-date +} + +/** + * Mark a virtual register as dirty (register value differs from stack). + * The value will be spilled at the next basic block boundary (jump, call, label). + * This defers stores to reduce instruction count within basic blocks. + */ +static void mark_dirty(jit_ctx *ctx, vreg *r) { + (void)ctx; // unused, kept for consistency with other functions + if (r != NULL && r->current != NULL && r->size > 0) { + r->dirty = 1; + } +} + +/** + * Store to a vreg's stack slot and clear any stale register binding. + * Use this when storing directly (e.g., from X0 after a call) without + * going through the normal register allocation path. + * + * This prevents spill_regs from later overwriting the correct stack + * value with a stale register value. 
+ */ +static void store_result(jit_ctx *ctx, vreg *dst) { + // Clear any stale binding - the correct value is now on stack + if (dst->current != NULL) { + dst->current->holds = NULL; + dst->current = NULL; + } +} + +/** + * Load a virtual register from stack to a physical register + */ +static preg *fetch(jit_ctx *ctx, vreg *r) { + preg *p; + + // HVOID registers have size 0 and no value to load + if (r->size == 0) + return UNUSED; + + // Check if already in a register + if (r->current != NULL && r->current->kind != RSTACK) { + // Lock the register to prevent eviction during subsequent allocations + RLOCK(r->current); + return r->current; + } + + // Allocate a register + p = alloc_reg(ctx, r, RCPU); + + // If the register we got already holds something, evict it + if (p->holds != NULL) + free_reg(ctx, p); + + // Load from stack + int size = r->size; + int offset = r->stackPos; + + if (IS_FLOAT(r)) { + ldr_stack_fp(ctx, p->id, offset, size); + } else { + ldr_stack(ctx, p->id, offset, size); + } + + // Bind vreg to register and lock it to prevent eviction by subsequent allocs + reg_bind(ctx, r, p); + RLOCK(p); + + return p; +} + +/** + * Copy data between locations (register, stack, immediate) + * This is the main data movement workhorse function + */ +static void copy(jit_ctx *ctx, vreg *dst, preg *dst_p, vreg *src, preg *src_p) { + if (src_p->kind == RCONST) { + // Load immediate into destination + int64_t val = src_p->id; + + if (IS_FLOAT(dst)) { + // Load float constant: load bits as integer, then move to FP register + preg *d = (dst_p && dst_p->kind == RFPU) ? 
dst_p : alloc_fpu(ctx); + + if (val == 0) { + // FMOV Dd, XZR - zero the FP register + EMIT32(ctx, (1 << 31) | (0 << 29) | (0x1E << 24) | (1 << 22) | (1 << 21) | (7 << 16) | (31 << 5) | d->id); + } else { + // Load bits to GPR, then FMOV to FPR + load_immediate(ctx, val, RTMP, true); + // FMOV Dd, Xn: sf=1, S=0, type=01, rmode=00, opcode=00111, Rn, Rd + EMIT32(ctx, (0x9E670000) | (RTMP << 5) | d->id); + } + + if (dst_p == NULL || dst_p != d) { + reg_bind(ctx, dst, d); + } + } else { + // Load integer immediate + preg *d = (dst_p && dst_p->kind == RCPU) ? dst_p : fetch(ctx, dst); + load_immediate(ctx, val, d->id, dst->size == 8); + if (dst_p == NULL || dst_p != d) { + reg_bind(ctx, dst, d); + } + } + } else if (src_p->kind == RCPU && dst_p && dst_p->kind == RCPU) { + // Register to register + mov_reg_reg(ctx, dst_p->id, src_p->id, dst->size == 8); + } else if (src_p->kind == RFPU && dst_p && dst_p->kind == RFPU) { + // FP register to FP register + fmov_reg_reg(ctx, dst_p->id, src_p->id, dst->size == 8); + } else { + // Generic case: fetch src, store to dst + preg *s = (src_p && (src_p->kind == RCPU || src_p->kind == RFPU)) ? src_p : fetch(ctx, src); + preg *d = (dst_p && (dst_p->kind == RCPU || dst_p->kind == RFPU)) ? 
dst_p : fetch(ctx, dst); + + if (IS_FLOAT(dst)) { + fmov_reg_reg(ctx, d->id, s->id, dst->size == 8); + } else { + mov_reg_reg(ctx, d->id, s->id, dst->size == 8); + } + + reg_bind(ctx, dst, d); + } +} + +// ============================================================================ +// Opcode Handlers +// ============================================================================ + +/** + * OMov - Move/copy a value from one register to another + */ +static void op_mov(jit_ctx *ctx, vreg *dst, vreg *src) { + preg *r = fetch(ctx, src); + + // Handle special case for HF32 (32-bit float) + // Ensure it's in an FP register + if (src->t->kind == HF32 && r->kind != RFPU) { + r = alloc_fpu(ctx); + // Load from stack to FP register + ldr_stack_fp(ctx, r->id, src->stackPos, src->size); + reg_bind(ctx, src, r); + } + + // Store to destination stack slot + store(ctx, dst, r); + + // Clear dst's old register binding to prevent stale value from being spilled + // The correct value is now on the stack from store() above + if (dst->current != NULL) { + dst->current->holds = NULL; + dst->current = NULL; + } +} + +/** + * Store a constant value to a virtual register + */ +static void store_const(jit_ctx *ctx, vreg *dst, int64_t val) { + preg *p; + + if (IS_FLOAT(dst)) { + // Allocate FPU register for float constants + p = alloc_fpu(ctx); + if (p->holds != NULL) + free_reg(ctx, p); + + if (val == 0) { + // FMOV Dd, XZR - zero the FP register + EMIT32(ctx, (1 << 31) | (0 << 29) | (0x1E << 24) | (1 << 22) | (1 << 21) | (7 << 16) | (31 << 5) | p->id); + } else { + // Load bits to GPR, then FMOV to FPR + load_immediate(ctx, val, RTMP, true); + // FMOV Dd, Xn: sf=1, S=0, type=01, rmode=00, opcode=00111, Rn, Rd + EMIT32(ctx, (0x9E670000) | (RTMP << 5) | p->id); + } + } else { + p = alloc_reg(ctx, dst, RCPU); + if (p->holds != NULL) + free_reg(ctx, p); + load_immediate(ctx, val, p->id, dst->size == 8); + } + + reg_bind(ctx, dst, p); + store(ctx, dst, p); // Constants must be stored 
immediately for correct loop initialization +} + +// ============================================================================ +// Arithmetic Operations +// ============================================================================ + +// Forward declaration for op_call_native (used by floating-point modulo) +static void op_call_native(jit_ctx *ctx, vreg *dst, hl_type *ftype, void *func_ptr, vreg **args, int nargs); +// Forward declaration for prepare_call_args (used by op_jump for dynamic comparisons) +static int prepare_call_args(jit_ctx *ctx, hl_type **arg_types, vreg **args, int nargs, bool is_native); + +/** + * Binary arithmetic/logic operations handler + * Handles: OAdd, OSub, OMul, OSDiv, OUDiv, OSMod, OUMod, OAnd, OOr, OXor, shifts + */ +static void op_binop(jit_ctx *ctx, vreg *dst, vreg *a, vreg *b, hl_op op) { + bool is_64bit = dst->size == 8; + int sf = is_64bit ? 1 : 0; + + // Handle floating-point operations + if (IS_FLOAT(dst)) { + preg *pa = fetch(ctx, a); + preg *pb = fetch(ctx, b); + preg *pd; + + // If dst == a, reuse pa as destination to avoid clobbering issues + // when reg_bind tries to store the old (now stale) value + if (dst == a) { + pd = pa; + } else { + pd = alloc_fpu(ctx); + if (pd->holds != NULL) + free_reg(ctx, pd); + } + + int type = (dst->t->kind == HF64) ? 
0x01 : 0x00; // 01=double, 00=single + + switch (op) { + case OAdd: + // FADD Vd, Vn, Vm + encode_fp_arith(ctx, 0, 0, type, pb->id, 0x02, pa->id, pd->id); + break; + case OSub: + // FSUB Vd, Vn, Vm + encode_fp_arith(ctx, 0, 0, type, pb->id, 0x03, pa->id, pd->id); + break; + case OMul: + // FMUL Vd, Vn, Vm + encode_fp_arith(ctx, 0, 0, type, pb->id, 0x00, pa->id, pd->id); + break; + case OSDiv: + case OUDiv: // Same as OSDiv for floats + // FDIV Vd, Vn, Vm + encode_fp_arith(ctx, 0, 0, type, pb->id, 0x01, pa->id, pd->id); + break; + case OSMod: + case OUMod: { + // Floating-point modulo: call fmod/fmodf from C library + // Need to discard pa/pb since op_call_native will spill + discard(ctx, pa); + discard(ctx, pb); + void *mod_func = (dst->t->kind == HF64) ? (void*)fmod : (void*)fmodf; + vreg *args[2] = { a, b }; + op_call_native(ctx, dst, NULL, mod_func, args, 2); + return; // op_call_native handles result storage + } + default: + JIT_ASSERT(0); // Invalid FP operation + } + + reg_bind(ctx, dst, pd); + mark_dirty(ctx, dst); + return; + } + + // Integer operations + preg *pa = fetch(ctx, a); + preg *pb = fetch(ctx, b); + preg *pd; + + // If dst == a, reuse pa as destination to avoid clobbering issues + // when reg_bind tries to store the old (now stale) value + if (dst == a) { + pd = pa; + } else { + pd = alloc_cpu(ctx, RCPU); + if (pd->holds != NULL) + free_reg(ctx, pd); + } + + switch (op) { + case OAdd: + // ADD Xd, Xn, Xm + encode_add_sub_reg(ctx, sf, 0, 0, 0, pb->id, 0, pa->id, pd->id); + break; + + case OSub: + // SUB Xd, Xn, Xm + encode_add_sub_reg(ctx, sf, 1, 0, 0, pb->id, 0, pa->id, pd->id); + break; + + case OMul: + // MUL Xd, Xn, Xm (using MADD with XZR as addend) + encode_madd_msub(ctx, sf, 0, pb->id, XZR, pa->id, pd->id); + break; + + case OSDiv: + // SDIV Xd, Xn, Xm (signed division) + // Note: encode_div U=1 means SDIV, U=0 means UDIV (per ARM ISA) + encode_div(ctx, sf, 1, pb->id, pa->id, pd->id); + break; + + case OUDiv: + // UDIV Xd, Xn, Xm (unsigned 
division) + encode_div(ctx, sf, 0, pb->id, pa->id, pd->id); + break; + + case OSMod: { + // Signed modulo with special case handling: + // - divisor == 0: return 0 (avoid returning dividend) + // - divisor == -1: return 0 (avoid MIN % -1 overflow) + // CBZ divisor, zero_case + int jz = BUF_POS(); + encode_cbz_cbnz(ctx, sf, 0, 0, pb->id); // CBZ + + // CMP divisor, #-1; B.EQ zero_case + // CMN is ADD setting flags, so CMN Xn, #1 checks if Xn == -1 + encode_add_sub_imm(ctx, sf, 1, 1, 0, 1, pb->id, XZR); // CMN divisor, #1 + int jneg1 = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); // B.EQ + + // Normal path: remainder = dividend - (quotient * divisor) + encode_div(ctx, sf, 1, pb->id, pa->id, RTMP); // RTMP = a / b (signed) + encode_madd_msub(ctx, sf, 1, pb->id, pa->id, RTMP, pd->id); // pd = a - (RTMP * b) + int jend = BUF_POS(); + encode_branch_uncond(ctx, 0); // B end + + // Zero case: return 0 + int zero_pos = BUF_POS(); + // MOV pd, #0 (using ORR with XZR) + encode_logical_reg(ctx, sf, 0x01, 0, 0, XZR, 0, XZR, pd->id); // ORR pd, XZR, XZR + + patch_jump(ctx, jz, zero_pos); + patch_jump(ctx, jneg1, zero_pos); + patch_jump(ctx, jend, BUF_POS()); + break; + } + + case OUMod: { + // Unsigned modulo with special case: + // - divisor == 0: return 0 + // CBZ divisor, zero_case + int jz = BUF_POS(); + encode_cbz_cbnz(ctx, sf, 0, 0, pb->id); // CBZ + + // Normal path + encode_div(ctx, sf, 0, pb->id, pa->id, RTMP); // RTMP = a / b (unsigned) + encode_madd_msub(ctx, sf, 1, pb->id, pa->id, RTMP, pd->id); // pd = a - (RTMP * b) + int jend = BUF_POS(); + encode_branch_uncond(ctx, 0); // B end + + // Zero case: return 0 + int zero_pos = BUF_POS(); + encode_logical_reg(ctx, sf, 0x01, 0, 0, XZR, 0, XZR, pd->id); // ORR pd, XZR, XZR + + patch_jump(ctx, jz, zero_pos); + patch_jump(ctx, jend, BUF_POS()); + break; + } + + case OAnd: + // AND Xd, Xn, Xm + encode_logical_reg(ctx, sf, 0x00, 0, 0, pb->id, 0, pa->id, pd->id); + break; + + case OOr: + // ORR Xd, Xn, Xm + 
encode_logical_reg(ctx, sf, 0x01, 0, 0, pb->id, 0, pa->id, pd->id); + break; + + case OXor: + // EOR Xd, Xn, Xm + encode_logical_reg(ctx, sf, 0x02, 0, 0, pb->id, 0, pa->id, pd->id); + break; + + case OShl: + // LSL Xd, Xn, Xm (logical shift left) + encode_shift_reg(ctx, sf, 0x00, pb->id, pa->id, pd->id); + break; + + case OUShr: + // LSR Xd, Xn, Xm (logical shift right - unsigned) + encode_shift_reg(ctx, sf, 0x01, pb->id, pa->id, pd->id); + break; + + case OSShr: + // ASR Xd, Xn, Xm (arithmetic shift right - signed) + encode_shift_reg(ctx, sf, 0x02, pb->id, pa->id, pd->id); + break; + + default: + JIT_ASSERT(0); // Unknown operation + } + + // Mask result for sub-32-bit integer types (UI8, UI16) + // AArch64 doesn't have 8/16-bit registers like x86, so we need explicit masking + if (dst->t->kind == HUI8 || dst->t->kind == HBOOL) { + // AND Wd, Wd, #0xFF (sf=0, opc=0, N=0, immr=0, imms=7) + encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, pd->id, pd->id); + } else if (dst->t->kind == HUI16) { + // AND Wd, Wd, #0xFFFF (sf=0, opc=0, N=0, immr=0, imms=15) + encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, pd->id, pd->id); + } + + reg_bind(ctx, dst, pd); + mark_dirty(ctx, dst); +} + +/** + * Unary negation (ONeg) + */ +static void op_neg(jit_ctx *ctx, vreg *dst, vreg *a) { + if (IS_FLOAT(a)) { + // FNEG Vd, Vn + preg *pa = fetch(ctx, a); + preg *pd = alloc_fpu(ctx); + + if (pd->holds != NULL) + free_reg(ctx, pd); + + int type = (dst->t->kind == HF64) ? 0x01 : 0x00; + encode_fp_1src(ctx, 0, 0, type, 0x02, pa->id, pd->id); + + reg_bind(ctx, dst, pd); + mark_dirty(ctx, dst); + } else { + // NEG Xd, Xn (implemented as SUB Xd, XZR, Xn) + preg *pa = fetch(ctx, a); + preg *pd = alloc_cpu(ctx, RCPU); + + if (pd->holds != NULL) + free_reg(ctx, pd); + + int sf = (dst->size == 8) ? 
1 : 0; + encode_add_sub_reg(ctx, sf, 1, 0, 0, pa->id, 0, XZR, pd->id); + + // Mask result for sub-32-bit integer types + if (dst->t->kind == HUI8 || dst->t->kind == HBOOL) { + encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, pd->id, pd->id); + } else if (dst->t->kind == HUI16) { + encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, pd->id, pd->id); + } + + reg_bind(ctx, dst, pd); + mark_dirty(ctx, dst); + } +} + +/** + * Logical NOT (ONot) - boolean negation + */ +static void op_not(jit_ctx *ctx, vreg *dst, vreg *a) { + // XOR with 1 (boolean NOT) + preg *pa = fetch(ctx, a); + preg *pd = alloc_cpu(ctx, RCPU); + + if (pd->holds != NULL) + free_reg(ctx, pd); + + // Load immediate 1 + load_immediate(ctx, 1, RTMP, false); + + // EOR Wd, Wn, Wtmp (32-bit XOR with 1) + encode_logical_reg(ctx, 0, 0x02, 0, 0, RTMP, 0, pa->id, pd->id); + + reg_bind(ctx, dst, pd); + mark_dirty(ctx, dst); +} + +/** + * Increment (OIncr) + */ +static void op_incr(jit_ctx *ctx, vreg *dst) { + // ADD Xd, Xd, #1 with memory writeback + preg *pd = fetch(ctx, dst); + int sf = (dst->size == 8) ? 1 : 0; + + // ADD Xd, Xn, #1 + encode_add_sub_imm(ctx, sf, 0, 0, 0, 1, pd->id, pd->id); + + // Mask result for sub-32-bit integer types + if (dst->t->kind == HUI8 || dst->t->kind == HBOOL) { + encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, pd->id, pd->id); + } else if (dst->t->kind == HUI16) { + encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, pd->id, pd->id); + } + + mark_dirty(ctx, dst); +} + +/** + * Decrement (ODecr) + */ +static void op_decr(jit_ctx *ctx, vreg *dst) { + // SUB Xd, Xd, #1 with memory writeback + preg *pd = fetch(ctx, dst); + int sf = (dst->size == 8) ? 
1 : 0; + + // SUB Xd, Xn, #1 + encode_add_sub_imm(ctx, sf, 1, 0, 0, 1, pd->id, pd->id); + + // Mask result for sub-32-bit integer types + if (dst->t->kind == HUI8 || dst->t->kind == HBOOL) { + encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, pd->id, pd->id); + } else if (dst->t->kind == HUI16) { + encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, pd->id, pd->id); + } + + mark_dirty(ctx, dst); +} + +// ============================================================================ +// Type Conversion Operations +// ============================================================================ + +/** + * Convert to integer (OToInt) + * Handles: float->int, i32->i64 sign extension, and int->int copy + */ +static void op_toint(jit_ctx *ctx, vreg *dst, vreg *src) { + // Same register optimization + if (dst == src) return; + + // Case 1: Float to integer conversion + if (IS_FLOAT(src)) { + preg *ps = fetch(ctx, src); + preg *pd = alloc_cpu(ctx, RCPU); + + if (pd->holds != NULL) + free_reg(ctx, pd); + + int sf = (dst->size == 8) ? 1 : 0; + int type = (src->t->kind == HF64) ? 0x01 : 0x00; + + // FCVTZS Xd, Vn (float to signed int, round toward zero) + encode_fcvt_int(ctx, sf, 0, type, 0x03, 0x00, ps->id, pd->id); + + reg_bind(ctx, dst, pd); + mark_dirty(ctx, dst); + return; + } + + // Case 2: i32 to i64 sign extension + if (dst->size == 8 && src->size == 4) { + preg *ps = fetch(ctx, src); + preg *pd = alloc_cpu(ctx, RCPU); + + if (pd->holds != NULL) + free_reg(ctx, pd); + + Arm64Reg src_r = (ps->kind == RCPU) ? 
(Arm64Reg)ps->id : RTMP; + if (ps->kind == RCONST) { + load_immediate(ctx, ps->id, src_r, false); + } else if (ps->kind != RCPU) { + ldr_stack(ctx, src_r, src->stackPos, src->size); + } + + // SXTW Xd, Wn (sign extend word to doubleword) + // Encoding: 0x93407c00 | (Rn << 5) | Rd + EMIT32(ctx, 0x93407c00 | (src_r << 5) | pd->id); + + reg_bind(ctx, dst, pd); + mark_dirty(ctx, dst); + return; + } + + // Case 3: Integer to integer copy (same size or truncation) + preg *ps = fetch(ctx, src); + preg *pd = alloc_cpu(ctx, RCPU); + + if (pd->holds != NULL) + free_reg(ctx, pd); + + Arm64Reg src_r = (ps->kind == RCPU) ? (Arm64Reg)ps->id : RTMP; + if (ps->kind == RCONST) { + load_immediate(ctx, ps->id, src_r, src->size == 8); + } else if (ps->kind != RCPU) { + ldr_stack(ctx, src_r, src->stackPos, src->size); + } + + // MOV Xd, Xn (or MOV Wd, Wn for 32-bit) + int sf = (dst->size == 8) ? 1 : 0; + mov_reg_reg(ctx, pd->id, src_r, sf); + + reg_bind(ctx, dst, pd); + mark_dirty(ctx, dst); +} + +/** + * Convert signed integer to float, or convert between float precisions (OToSFloat) + * Handles: integer -> float (SCVTF), F64 -> F32, F32 -> F64 (FCVT) + */ +static void op_tosfloat(jit_ctx *ctx, vreg *dst, vreg *src) { + // Handle float-to-float precision conversions + if (src->t->kind == HF64 && dst->t->kind == HF32) { + // F64 -> F32: FCVT Sd, Dn + preg *ps = fetch(ctx, src); + preg *pd = alloc_fpu(ctx); + if (pd->holds != NULL) + free_reg(ctx, pd); + + Arm64FpReg src_r = (ps->kind == RFPU) ? 
(Arm64FpReg)ps->id : V16; + if (ps->kind != RFPU) { + ldr_stack_fp(ctx, src_r, src->stackPos, src->size); + } + + // FCVT Sd, Dn: type=1 (double source), opcode=4 (convert to single) + encode_fp_1src(ctx, 0, 0, 1, 4, src_r, pd->id); + + reg_bind(ctx, dst, pd); + mark_dirty(ctx, dst); + return; + } + + if (src->t->kind == HF32 && dst->t->kind == HF64) { + // F32 -> F64: FCVT Dd, Sn + preg *ps = fetch(ctx, src); + preg *pd = alloc_fpu(ctx); + if (pd->holds != NULL) + free_reg(ctx, pd); + + Arm64FpReg src_r = (ps->kind == RFPU) ? (Arm64FpReg)ps->id : V16; + if (ps->kind != RFPU) { + ldr_stack_fp(ctx, src_r, src->stackPos, src->size); + } + + // FCVT Dd, Sn: type=0 (single source), opcode=5 (convert to double) + encode_fp_1src(ctx, 0, 0, 0, 5, src_r, pd->id); + + reg_bind(ctx, dst, pd); + mark_dirty(ctx, dst); + return; + } + + // Integer to float conversion (original behavior) + preg *ps = fetch(ctx, src); + preg *pd = alloc_fpu(ctx); + + if (pd->holds != NULL) + free_reg(ctx, pd); + + int sf = (src->size == 8) ? 1 : 0; + int type = (dst->t->kind == HF64) ? 0x01 : 0x00; + + // SCVTF Vd, Xn (signed int to float) + encode_int_fcvt(ctx, sf, 0, type, 0x00, 0x02, ps->id, pd->id); + + reg_bind(ctx, dst, pd); + mark_dirty(ctx, dst); +} + +/** + * Convert unsigned integer to float (OToUFloat) + */ +static void op_toufloat(jit_ctx *ctx, vreg *dst, vreg *src) { + preg *ps = fetch(ctx, src); + preg *pd = alloc_fpu(ctx); + + if (pd->holds != NULL) + free_reg(ctx, pd); + + int sf = (src->size == 8) ? 1 : 0; + int type = (dst->t->kind == HF64) ? 
0x01 : 0x00; + + // UCVTF Vd, Xn (unsigned int to float) + encode_int_fcvt(ctx, sf, 0, type, 0x00, 0x03, ps->id, pd->id); + + reg_bind(ctx, dst, pd); + mark_dirty(ctx, dst); +} + +// ============================================================================ +// Jump Patching +// ============================================================================ + +/** + * Add a jump to the patch list + * Also mark the target opcode so we know to discard registers when we reach it + */ +static void register_jump(jit_ctx *ctx, int pos, int target) { + jlist *j = (jlist*)malloc(sizeof(jlist)); + j->pos = pos; + j->target = target; + j->next = ctx->jumps; + ctx->jumps = j; + + // Mark target as a jump destination (like x86 does) + // This tells us to discard register bindings when we reach this opcode + if (target > 0 && target < ctx->maxOps && ctx->opsPos[target] == 0) + ctx->opsPos[target] = -1; +} + +/** + * Patch a jump instruction with the correct offset + * AArch64 branches use instruction offsets (divide byte offset by 4) + */ +static void patch_jump(jit_ctx *ctx, int pos, int target_pos) { + unsigned int *code = (unsigned int*)(ctx->startBuf + pos); + int offset = target_pos - pos; // Byte offset + int insn_offset = offset / 4; // Instruction offset + + // Check if this is a conditional branch (B.cond) or unconditional (B) + unsigned int insn = *code; + unsigned int opcode = (insn >> 24) & 0xFF; + + if (opcode == 0x54) { + // B.cond - 19-bit signed offset + // Range: ±1MB (±0x40000 instructions, ±0x100000 bytes) + if (insn_offset < -0x40000 || insn_offset >= 0x40000) { + printf("JIT Error: Conditional branch offset too large: %d\n", insn_offset); + JIT_ASSERT(0); + } + // Clear old offset, set new offset (bits 5-23) + *code = (insn & 0xFF00001F) | ((insn_offset & 0x7FFFF) << 5); + } else if ((opcode & 0xFC) == 0x14) { + // B or BL - 26-bit signed offset + // Range: ±128MB (±0x2000000 instructions, ±0x8000000 bytes) + if (insn_offset < -0x2000000 || insn_offset >= 
0x2000000) { + printf("JIT Error: Branch offset too large: %d\n", insn_offset); + JIT_ASSERT(0); + } + // Clear old offset, set new offset (bits 0-25) + *code = (insn & 0xFC000000) | (insn_offset & 0x3FFFFFF); + } else if ((opcode & 0x7E) == 0x34) { + // CBZ/CBNZ - 19-bit signed offset + if (insn_offset < -0x40000 || insn_offset >= 0x40000) { + printf("JIT Error: CBZ/CBNZ offset too large: %d\n", insn_offset); + JIT_ASSERT(0); + } + *code = (insn & 0xFF00001F) | ((insn_offset & 0x7FFFF) << 5); + } else { + printf("JIT Error: Unknown branch instruction at %d: 0x%08X\n", pos, insn); + JIT_ASSERT(0); + } +} + +// ============================================================================ +// Control Flow & Comparisons +// ============================================================================ + +/** + * Map HashLink condition to AArch64 condition code + */ +static ArmCondition hl_cond_to_arm(hl_op op, bool is_float) { + switch (op) { + case OJEq: return COND_EQ; // Equal + case OJNotEq: return COND_NE; // Not equal + case OJSLt: return is_float ? COND_MI : COND_LT; // Signed less than + case OJSGte: return is_float ? 
COND_PL : COND_GE; // Signed greater or equal + case OJSGt: return COND_GT; // Signed greater than + case OJSLte: return COND_LE; // Signed less or equal + case OJULt: return COND_LO; // Unsigned less than (carry clear) + case OJUGte: return COND_HS; // Unsigned greater or equal (carry set) + // Float NaN-aware comparisons (includes unordered case) + case OJNotLt: return COND_HS; // Not less than (C=1: >=, or unordered) + case OJNotGte: return COND_LT; // Not greater/equal (N!=V: <, or unordered) + default: + JIT_ASSERT(0); + return COND_AL; + } +} + +/** + * Conditional and comparison jumps + * + * Handles special cases for dynamic types: + * - HDYN/HFUN: Call hl_dyn_compare() to compare dynamic values + * - HTYPE: Call hl_same_type() to compare type objects + * - HNULL: Compare boxed values (Null) + * - HVIRTUAL: Compare virtual objects with underlying values + */ +static void op_jump(jit_ctx *ctx, vreg *a, vreg *b, hl_op op, int target_opcode) { + // Spill all registers to stack BEFORE the branch. + // Target label will use discard_regs() and expect values on stack. 
+ spill_regs(ctx); + spill_callee_saved(ctx); // Callee-saved must also be spilled at control flow merge + + // Handle dynamic and function type comparisons + if (a->t->kind == HDYN || b->t->kind == HDYN || a->t->kind == HFUN || b->t->kind == HFUN) { + // Call hl_dyn_compare(a, b) which returns: + // 0 if equal + // negative if a < b + // positive if a > b + // hl_invalid_comparison (0xAABBCCDD) for incomparable types + vreg *args[2] = { a, b }; + int stack_space = prepare_call_args(ctx, NULL, args, 2, true); + + // Load function pointer and call + load_immediate(ctx, (int64_t)hl_dyn_compare, RTMP, true); + EMIT32(ctx, 0xD63F0000 | (RTMP << 5)); // BLR RTMP + + // Clean up stack + if (stack_space > 0) { + encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG); + } + + // Handle ordered comparisons (OJSLt/OJSGt/OJSLte/OJSGte) - need to check for hl_invalid_comparison + if (op == OJSLt || op == OJSGt || op == OJSLte || op == OJSGte) { + // Compare result with hl_invalid_comparison (0xAABBCCDD) + // If equal, don't take the branch (skip the jump) + load_immediate(ctx, hl_invalid_comparison, RTMP, false); + encode_add_sub_reg(ctx, 0, 1, 1, 0, RTMP, 0, X0, XZR); // CMP W0, WTMP + int skip_pos = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); // B.EQ skip (if invalid comparison) + + // Valid comparison - compare result with 0 for sign flags + encode_add_sub_imm(ctx, 0, 1, 1, 0, 0, X0, XZR); // CMP W0, #0 + ArmCondition cond = hl_cond_to_arm(op, false); + int jump_pos = BUF_POS(); + encode_branch_cond(ctx, 0, cond); + register_jump(ctx, jump_pos, target_opcode); + + // Patch the skip branch to here + int skip_offset = (BUF_POS() - skip_pos) / 4; + *(int*)(ctx->startBuf + skip_pos) = (*(int*)(ctx->startBuf + skip_pos) & 0xFF00001F) | ((skip_offset & 0x7FFFF) << 5); + return; + } + + // For OJEq/OJNotEq: result == 0 means equal + // TST W0, W0 (sets flags based on W0 & W0) + encode_logical_reg(ctx, 0, 0x3, 0, 0, X0, 0, X0, XZR); // ANDS WZR, W0, W0 + + // Branch 
based on zero flag (only equality ops should reach here) + ArmCondition cond = (op == OJEq) ? COND_EQ : COND_NE; + int jump_pos = BUF_POS(); + encode_branch_cond(ctx, 0, cond); + register_jump(ctx, jump_pos, target_opcode); + return; + } + + // Handle type comparisons + if (a->t->kind == HTYPE) { + // Call hl_same_type(a, b) which returns bool + vreg *args[2] = { a, b }; + int stack_space = prepare_call_args(ctx, NULL, args, 2, true); + + load_immediate(ctx, (int64_t)hl_same_type, RTMP, true); + EMIT32(ctx, 0xD63F0000 | (RTMP << 5)); // BLR RTMP + + if (stack_space > 0) { + encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG); + } + + // Compare result with 1 (true): CMP W0, #1 = SUBS WZR, W0, #1 + // Note: S=1 is required both to set flags AND to make Rd=31 mean XZR (not SP) + encode_add_sub_imm(ctx, 0, 1, 1, 0, 1, X0, XZR); // CMP W0, #1 + + ArmCondition cond = (op == OJEq) ? COND_EQ : COND_NE; + int jump_pos = BUF_POS(); + encode_branch_cond(ctx, 0, cond); + register_jump(ctx, jump_pos, target_opcode); + return; + } + + // Handle HNULL (Null) comparisons + // HNULL values have their inner value at offset HDYN_VALUE (8) + if (a->t->kind == HNULL) { + preg *pa = fetch(ctx, a); + preg *pb = fetch(ctx, b); + Arm64Reg ra = (pa->kind == RCPU) ? (Arm64Reg)pa->id : RTMP; + Arm64Reg rb = (pb->kind == RCPU) ? 
(Arm64Reg)pb->id : RTMP2; + if (pa->kind != RCPU) ldr_stack(ctx, ra, a->stackPos, 8); + if (pb->kind != RCPU) ldr_stack(ctx, rb, b->stackPos, 8); + + if (op == OJEq) { + // if (a == b || (a && b && a->v == b->v)) goto target + // First: CMP a, b - if equal, jump to target + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); + int jump_pos1 = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); + register_jump(ctx, jump_pos1, target_opcode); + + // If a == NULL, skip (don't jump) + int skip_a = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, ra); // CBZ ra, skip + + // If b == NULL, skip (don't jump) + int skip_b = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, rb); // CBZ rb, skip + + // Load inner values: a->v and b->v (at offset HDYN_VALUE) + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HDYN_VALUE / 8, ra, ra); // LDR ra, [ra, #8] + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HDYN_VALUE / 8, rb, rb); // LDR rb, [rb, #8] + + // Compare inner values + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); + int jump_pos2 = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); + register_jump(ctx, jump_pos2, target_opcode); + + // Patch skip branches to here + int here = BUF_POS(); + int off_a = (here - skip_a) / 4; + int off_b = (here - skip_b) / 4; + *(int*)(ctx->startBuf + skip_a) = (*(int*)(ctx->startBuf + skip_a) & 0xFF00001F) | ((off_a & 0x7FFFF) << 5); + *(int*)(ctx->startBuf + skip_b) = (*(int*)(ctx->startBuf + skip_b) & 0xFF00001F) | ((off_b & 0x7FFFF) << 5); + } else if (op == OJNotEq) { + // if (a != b && (!a || !b || a->v != b->v)) goto target + // First: CMP a, b - if equal, skip entirely + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); + int skip_eq = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); // B.EQ skip (a == b means not-not-equal) + + // If a == NULL, goto target (NULL != non-NULL) + int jump_a = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, ra); // CBZ ra, target + register_jump(ctx, jump_a, target_opcode); + + // If b == NULL, goto target (non-NULL != NULL) + int 
jump_b = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, rb); // CBZ rb, target + register_jump(ctx, jump_b, target_opcode); + + // Load inner values + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HDYN_VALUE / 8, ra, ra); + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HDYN_VALUE / 8, rb, rb); + + // Compare inner values - if not equal, goto target + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); + int skip_cmp = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); // B.EQ skip (values equal, don't jump) + + // Values not equal - jump to target + int jump_ne = BUF_POS(); + encode_branch_uncond(ctx, 0); + register_jump(ctx, jump_ne, target_opcode); + + // Patch skip branches + int here = BUF_POS(); + int off_eq = (here - skip_eq) / 4; + int off_cmp = (here - skip_cmp) / 4; + *(int*)(ctx->startBuf + skip_eq) = (*(int*)(ctx->startBuf + skip_eq) & 0xFF00001F) | ((off_eq & 0x7FFFF) << 5); + *(int*)(ctx->startBuf + skip_cmp) = (*(int*)(ctx->startBuf + skip_cmp) & 0xFF00001F) | ((off_cmp & 0x7FFFF) << 5); + } else { + jit_error("Unsupported comparison op for HNULL"); + } + // Clear register bindings - the registers now contain dereferenced inner values, + // not the original Null pointers. Without this, subsequent ops would think + // pa/pb still hold the original vregs, causing use of stale/wrong values. + discard(ctx, pa); + discard(ctx, pb); + return; + } + + // Handle HVIRTUAL comparisons + // Virtual objects have a 'value' pointer at offset HL_WSIZE (8) + if (a->t->kind == HVIRTUAL) { + preg *pa = fetch(ctx, a); + preg *pb = fetch(ctx, b); + Arm64Reg ra = (pa->kind == RCPU) ? (Arm64Reg)pa->id : RTMP; + Arm64Reg rb = (pb->kind == RCPU) ? (Arm64Reg)pb->id : RTMP2; + if (pa->kind != RCPU) ldr_stack(ctx, ra, a->stackPos, 8); + if (pb->kind != RCPU) ldr_stack(ctx, rb, b->stackPos, 8); + + if (b->t->kind == HOBJ) { + // Comparing virtual to object: compare a->value with b + if (op == OJEq) { + // if (a ? 
(b && a->value == b) : (b == NULL)) goto target + int ja = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, ra); // CBZ ra, check_b_null + + // a != NULL: check if b != NULL and a->value == b + int jb = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, rb); // CBZ rb, skip (a!=NULL, b==NULL: not equal) + + // Load a->value and compare with b + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HL_WSIZE / 8, ra, ra); // LDR ra, [ra, #8] + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); + int jvalue = BUF_POS(); + encode_branch_uncond(ctx, 0); // B to_cmp + + // a == NULL: check if b == NULL + int here_ja = BUF_POS(); + int off_ja = (here_ja - ja) / 4; + *(int*)(ctx->startBuf + ja) = (*(int*)(ctx->startBuf + ja) & 0xFF00001F) | ((off_ja & 0x7FFFF) << 5); + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, XZR, XZR); // CMP rb, #0 (TST rb) + + // Patch jvalue to here (to_cmp) + int here_jv = BUF_POS(); + int off_jv = (here_jv - jvalue) / 4; + *(int*)(ctx->startBuf + jvalue) = 0x14000000 | (off_jv & 0x3FFFFFF); + + // Now flags are set - branch if equal + int jump_pos = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); + register_jump(ctx, jump_pos, target_opcode); + + // Patch jb to skip + int here_jb = BUF_POS(); + int off_jb = (here_jb - jb) / 4; + *(int*)(ctx->startBuf + jb) = (*(int*)(ctx->startBuf + jb) & 0xFF00001F) | ((off_jb & 0x7FFFF) << 5); + } else if (op == OJNotEq) { + // if (a ? 
(b == NULL || a->value != b) : (b != NULL)) goto target + int ja = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, ra); // CBZ ra, check_b_notnull + + // a != NULL: jump if b == NULL + int jump_b_null = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, rb); // CBZ rb, target + register_jump(ctx, jump_b_null, target_opcode); + + // Load a->value and compare with b + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HL_WSIZE / 8, ra, ra); + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); + int jvalue = BUF_POS(); + encode_branch_uncond(ctx, 0); // B to_cmp + + // a == NULL: check if b != NULL + int here_ja = BUF_POS(); + int off_ja = (here_ja - ja) / 4; + *(int*)(ctx->startBuf + ja) = (*(int*)(ctx->startBuf + ja) & 0xFF00001F) | ((off_ja & 0x7FFFF) << 5); + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, XZR, XZR); // CMP rb, #0 + + // Patch jvalue + int here_jv = BUF_POS(); + int off_jv = (here_jv - jvalue) / 4; + *(int*)(ctx->startBuf + jvalue) = 0x14000000 | (off_jv & 0x3FFFFFF); + + // Branch if not equal + int jump_pos = BUF_POS(); + encode_branch_cond(ctx, 0, COND_NE); + register_jump(ctx, jump_pos, target_opcode); + } else { + jit_error("Unsupported comparison op for HVIRTUAL vs HOBJ"); + } + // Clear register bindings - ra now contains a->value, not a + discard(ctx, pa); + return; + } + + // Both are HVIRTUAL - compare underlying values + if (op == OJEq) { + // if (a == b || (a && b && a->value && b->value && a->value == b->value)) goto + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); + int jump_eq = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); + register_jump(ctx, jump_eq, target_opcode); + + // Check a != NULL + int skip_a = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, ra); + // Check b != NULL + int skip_b = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, rb); + + // Load a->value + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HL_WSIZE / 8, ra, ra); + int skip_av = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, ra); // CBZ if a->value == NULL + + // Load b->value + 
encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HL_WSIZE / 8, rb, rb); + int skip_bv = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, rb); // CBZ if b->value == NULL + + // Compare values + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); + int jump_val = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); + register_jump(ctx, jump_val, target_opcode); + + // Patch all skips to here + int here = BUF_POS(); + int patches[] = { skip_a, skip_b, skip_av, skip_bv }; + for (int i = 0; i < 4; i++) { + int off = (here - patches[i]) / 4; + *(int*)(ctx->startBuf + patches[i]) = (*(int*)(ctx->startBuf + patches[i]) & 0xFF00001F) | ((off & 0x7FFFF) << 5); + } + } else if (op == OJNotEq) { + // if (a != b && (!a || !b || !a->value || !b->value || a->value != b->value)) goto + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); + int skip_eq = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); // Skip if a == b + + // If a == NULL, jump + int jump_a = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, ra); + register_jump(ctx, jump_a, target_opcode); + + // If b == NULL, jump + int jump_b = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, rb); + register_jump(ctx, jump_b, target_opcode); + + // Load a->value + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HL_WSIZE / 8, ra, ra); + int jump_av = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, ra); + register_jump(ctx, jump_av, target_opcode); + + // Load b->value + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HL_WSIZE / 8, rb, rb); + int jump_bv = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, rb); + register_jump(ctx, jump_bv, target_opcode); + + // Compare - if not equal, jump + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); + int skip_val = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); + + // Not equal - jump to target + int jump_ne = BUF_POS(); + encode_branch_uncond(ctx, 0); + register_jump(ctx, jump_ne, target_opcode); + + // Patch skips + int here = BUF_POS(); + int off_eq = (here - skip_eq) / 4; + int off_val = (here - skip_val) / 4; + *(int*)(ctx->startBuf + 
skip_eq) = (*(int*)(ctx->startBuf + skip_eq) & 0xFF00001F) | ((off_eq & 0x7FFFF) << 5); + *(int*)(ctx->startBuf + skip_val) = (*(int*)(ctx->startBuf + skip_val) & 0xFF00001F) | ((off_val & 0x7FFFF) << 5); + } else { + jit_error("Unsupported comparison op for HVIRTUAL"); + } + // Clear register bindings - ra/rb now contain ->value, not the original vregs + discard(ctx, pa); + discard(ctx, pb); + return; + } + + // Handle HOBJ/HSTRUCT vs HVIRTUAL (swap operands) + if ((a->t->kind == HOBJ || a->t->kind == HSTRUCT) && b->t->kind == HVIRTUAL) { + // Swap and recurse - the HVIRTUAL case handles HOBJ on the right + op_jump(ctx, b, a, op, target_opcode); + return; + } + + // Handle String EQUALITY comparison (value-based per Haxe spec) + // hl_str_cmp only returns 0 (equal) or 1 (not equal), so it can only be used + // for OJEq/OJNotEq. For ordered comparisons, fall through to compareFun path. + if ((op == OJEq || op == OJNotEq) && is_string_type(a->t) && is_string_type(b->t)) { + // Spill before call + spill_regs(ctx); + spill_callee_saved(ctx); + + // Call hl_str_cmp(a, b) - returns 0 if equal, non-zero if not equal + vreg *args[2] = { a, b }; + int stack_space = prepare_call_args(ctx, NULL, args, 2, true); + load_immediate(ctx, (int64_t)hl_str_cmp, RTMP, true); + EMIT32(ctx, 0xD63F0000 | (RTMP << 5)); // BLR RTMP + if (stack_space > 0) { + encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG); + } + + // Result in X0: 0 = equal, non-zero = not equal + // TST X0, X0 sets Z flag (Z=1 if X0==0) + encode_logical_reg(ctx, 1, 0x3, 0, 0, X0, 0, X0, XZR); // TST X0, X0 + + // Branch based on op (only EQ or NE) + ArmCondition cond = (op == OJEq) ? 
COND_EQ : COND_NE; + int jump_pos = BUF_POS(); + encode_branch_cond(ctx, 0, cond); + register_jump(ctx, jump_pos, target_opcode); + return; + } + + // Handle HOBJ/HSTRUCT with compareFun (e.g., String) + // Use hl_get_obj_rt() to ensure runtime object is initialized (like x86 does) + // NOTE: compareFun is a FUNCTION INDEX, not a function pointer! + if ((a->t->kind == HOBJ || a->t->kind == HSTRUCT) && hl_get_obj_rt(a->t)->compareFun) { + int compareFunIndex = (int)(int_val)hl_get_obj_rt(a->t)->compareFun; + preg *pa = fetch(ctx, a); + preg *pb = fetch(ctx, b); + Arm64Reg ra = (pa->kind == RCPU) ? (Arm64Reg)pa->id : RTMP; + Arm64Reg rb = (pb->kind == RCPU) ? (Arm64Reg)pb->id : RTMP2; + if (pa->kind != RCPU) ldr_stack(ctx, ra, a->stackPos, 8); + if (pb->kind != RCPU) ldr_stack(ctx, rb, b->stackPos, 8); + + if (op == OJEq) { + // if (a == b || (a && b && cmp(a,b) == 0)) goto target + // First check pointer equality + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); // CMP ra, rb + int jump_eq = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); + register_jump(ctx, jump_eq, target_opcode); + + // If a == NULL, skip + int skip_a = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, ra); + + // If b == NULL, skip + int skip_b = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, rb); + + // Call compareFun(a, b) - compareFunIndex is a function index, not a pointer! 
+ vreg *args[2] = { a, b }; + int stack_space = prepare_call_args(ctx, NULL, args, 2, true); + emit_call_findex(ctx, compareFunIndex, stack_space); + + // If result == 0, goto target + encode_logical_reg(ctx, 0, 0x3, 0, 0, X0, 0, X0, XZR); // TST W0, W0 + int skip_cmp = BUF_POS(); + encode_branch_cond(ctx, 0, COND_NE); // Skip if result != 0 + + // Jump to target + int jump_target = BUF_POS(); + encode_branch_uncond(ctx, 0); + register_jump(ctx, jump_target, target_opcode); + + // Patch all skips to here + int here = BUF_POS(); + int patches[] = { skip_a, skip_b, skip_cmp }; + for (int i = 0; i < 3; i++) { + int off = (here - patches[i]) / 4; + *(int*)(ctx->startBuf + patches[i]) = (*(int*)(ctx->startBuf + patches[i]) & 0xFF00001F) | ((off & 0x7FFFF) << 5); + } + } else if (op == OJNotEq) { + // if (a != b && (!a || !b || cmp(a,b) != 0)) goto target + // First check pointer equality - if equal, skip entirely + encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); // CMP ra, rb + int skip_eq = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); + + // If a == NULL, goto target + int jump_a = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, ra); + register_jump(ctx, jump_a, target_opcode); + + // If b == NULL, goto target + int jump_b = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, rb); + register_jump(ctx, jump_b, target_opcode); + + // Call compareFun(a, b) - compareFunIndex is a function index, not a pointer! 
+ vreg *args[2] = { a, b }; + int stack_space = prepare_call_args(ctx, NULL, args, 2, true); + emit_call_findex(ctx, compareFunIndex, stack_space); + + // If result != 0, goto target + encode_logical_reg(ctx, 0, 0x3, 0, 0, X0, 0, X0, XZR); // TST W0, W0 + int skip_cmp = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); // Skip if result == 0 + + // Jump to target + int jump_target = BUF_POS(); + encode_branch_uncond(ctx, 0); + register_jump(ctx, jump_target, target_opcode); + + // Patch skips to here + int here = BUF_POS(); + int off_eq = (here - skip_eq) / 4; + int off_cmp = (here - skip_cmp) / 4; + *(int*)(ctx->startBuf + skip_eq) = (*(int*)(ctx->startBuf + skip_eq) & 0xFF00001F) | ((off_eq & 0x7FFFF) << 5); + *(int*)(ctx->startBuf + skip_cmp) = (*(int*)(ctx->startBuf + skip_cmp) & 0xFF00001F) | ((off_cmp & 0x7FFFF) << 5); + } else { + // For OJSGt, OJSGte, OJSLt, OJSLte: if (a && b && cmp(a,b) ?? 0) goto + int skip_a = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, ra); + + int skip_b = BUF_POS(); + encode_cbz_cbnz(ctx, 1, 0, 0, rb); + + // Call compareFun(a, b) - compareFunIndex is a function index, not a pointer! 
			// Ordered comparison path (OJSGt/OJSGte/OJSLt/OJSLte): both operands are non-null here
			vreg *args[2] = { a, b };
			int stack_space = prepare_call_args(ctx, NULL, args, 2, true);
			emit_call_findex(ctx, compareFunIndex, stack_space);

			// Compare result with 0: CMP W0, #0
			encode_add_sub_imm(ctx, 0, 1, 1, 0, 0, X0, XZR); // CMP W0, #0

			// Branch based on condition
			ArmCondition cond = hl_cond_to_arm(op, false);
			int jump_pos = BUF_POS();
			encode_branch_cond(ctx, 0, cond);
			register_jump(ctx, jump_pos, target_opcode);

			// Patch skips to here
			int here = BUF_POS();
			int off_a = (here - skip_a) / 4;
			int off_b = (here - skip_b) / 4;
			*(int*)(ctx->startBuf + skip_a) = (*(int*)(ctx->startBuf + skip_a) & 0xFF00001F) | ((off_a & 0x7FFFF) << 5);
			*(int*)(ctx->startBuf + skip_b) = (*(int*)(ctx->startBuf + skip_b) & 0xFF00001F) | ((off_b & 0x7FFFF) << 5);
		}
		return;
	}

	// Standard comparison for other types
	bool is_float = IS_FLOAT(a);
	preg *pa = fetch(ctx, a);
	preg *pb = fetch(ctx, b);

	if (is_float) {
		// Floating-point comparison: FCMP Vn, Vm
		int type = (a->t->kind == HF64) ? 0x01 : 0x00;
		encode_fp_compare(ctx, 0, 0, type, pb->id, 0, pa->id);
	} else {
		// Integer comparison: CMP Xn, Xm (implemented as SUBS XZR, Xn, Xm)
		// NOTE(review): pa->id/pb->id are used directly as register numbers here,
		// while the paths above guard with (kind == RCPU) ? id : RTMP + ldr_stack —
		// confirm fetch() always yields RCPU/RFPU operands on this path.
		int sf = (a->size == 8) ? 1 : 0;
		encode_add_sub_reg(ctx, sf, 1, 1, 0, pb->id, 0, pa->id, XZR);
	}

	// Emit conditional branch
	ArmCondition cond = hl_cond_to_arm(op, is_float);
	int jump_pos = BUF_POS();
	encode_branch_cond(ctx, 0, cond); // Offset will be patched later

	// Register for patching
	register_jump(ctx, jump_pos, target_opcode);
}

/**
 * Simple conditional jumps (OJTrue, OJFalse, OJNull, OJNotNull)
 * a: the vreg tested against zero (bool or pointer);
 * target_opcode: opcode index of the branch target, resolved later by register_jump.
 */
static void op_jcond(jit_ctx *ctx, vreg *a, hl_op op, int target_opcode) {
	// Spill all registers to stack BEFORE the branch.
	// Target label will use discard_regs() and expect values on stack.
	spill_regs(ctx);
	spill_callee_saved(ctx); // Callee-saved must also be spilled at control flow merge

	preg *pa = fetch(ctx, a);
	int jump_pos = BUF_POS();

	// Determine which condition to test
	bool test_zero = (op == OJFalse || op == OJNull);

	// Use CBZ (compare and branch if zero) or CBNZ (compare and branch if non-zero)
	int sf = (a->size == 8) ? 1 : 0;
	int op_bit = test_zero ? 0 : 1; // 0=CBZ, 1=CBNZ

	// NOTE(review): pa->id is used unguarded; if fetch() can return a stack or
	// RCONST binding here, this would encode a wrong register — TODO confirm.
	encode_cbz_cbnz(ctx, sf, op_bit, 0, pa->id); // Offset will be patched

	// Register for patching
	register_jump(ctx, jump_pos, target_opcode);
}

/**
 * Unconditional jump (OJAlways)
 */
static void op_jalways(jit_ctx *ctx, int target_opcode) {
	// Spill all registers to stack BEFORE the branch.
	// Target label will use discard_regs() and expect values on stack.
	spill_regs(ctx);
	spill_callee_saved(ctx); // Callee-saved must also be spilled at control flow merge

	int jump_pos = BUF_POS();
	encode_branch_uncond(ctx, 0); // Offset will be patched

	// Register for patching
	register_jump(ctx, jump_pos, target_opcode);
}

/**
 * Discard all register bindings at merge points (labels).
 *
 * Used at labels where control flow can come from multiple paths.
 * Clears register↔vreg bindings so subsequent operations load from stack.
 *
 * With dirty tracking: If reached via fallthrough (not a jump), registers
 * might still be dirty and need to be spilled first. Registers reached via
 * jump are already clean because spill_regs() is called before all jumps.
 */
static void discard_regs(jit_ctx *ctx) {
	int i;
	// Handle CPU scratch registers (X0-X17)
	// NOTE: This function must NOT emit any code!
	// At labels, spill_regs() + spill_callee_saved() is called BEFORE this (for fallthrough).
	// We just clear bindings here - values are already on stack.
	// 18 = X0..X17 inclusive (caller-clobbered scratch range)
	for (i = 0; i < 18; i++) {
		preg *r = &ctx->pregs[i];
		if (r->holds) {
			r->holds->dirty = 0;
			r->holds->current = NULL;
			r->holds = NULL;
		}
	}
	// Handle callee-saved CPU registers (X19-X26)
	// At merge points, callee-saved must also be discarded for consistent state
	for (i = 0; i < RCPU_CALLEE_ALLOC_COUNT; i++) {
		preg *r = REG_AT(RCPU_CALLEE_ALLOC[i]);
		if (r->holds) {
			r->holds->dirty = 0;
			r->holds->current = NULL;
			r->holds = NULL;
		}
	}
	// Handle ALL FPU registers (V0-V31) at merge points
	// At labels, control flow may come from different paths with different allocations
	for (i = 0; i < RFPU_COUNT; i++) {
		preg *r = &ctx->pregs[RCPU_COUNT + i];
		if (r->holds) {
			r->holds->dirty = 0;
			r->holds->current = NULL;
			r->holds = NULL;
		}
	}
}

/**
 * Label marker (OLabel) - just records position for jump targets
 * At a label, control flow could come from multiple places,
 * so we must invalidate all register associations.
 *
 * IMPORTANT: No code is emitted here! The main loop calls spill_regs()
 * BEFORE this function for the fallthrough path. Jump paths have already
 * spilled before jumping. We just clear bindings so subsequent ops
 * load from stack.
 */
static void op_label(jit_ctx *ctx) {
	// Just clear bindings - spill_regs() was already called in main loop
	discard_regs(ctx);
}

// ============================================================================
// Memory Operations
// ============================================================================

/*
 * Load byte/halfword/word from memory
 * OGetI8/OGetI16/OGetI32: dst = *(type*)(base + offset)
 * size is the access width in bytes (1/2/4/8); dst/base may live on stack.
 */
static void op_get_mem(jit_ctx *ctx, vreg *dst, vreg *base, int offset, int size) {
	preg *base_reg = fetch(ctx, base);
	preg *dst_reg = alloc_dst(ctx, dst);

	Arm64Reg base_r = (base_reg->kind == RCPU) ?
	(Arm64Reg)base_reg->id : RTMP;
	if (base_reg->kind != RCPU) {
		ldr_stack(ctx, base_r, base->stackPos, base->size);
	}

	// Handle float and integer cases separately
	if (IS_FLOAT(dst)) {
		// Float: load into FPU register
		Arm64FpReg dst_r = (dst_reg->kind == RFPU) ? (Arm64FpReg)dst_reg->id : V16;
		int size_bits = (size == 8) ? 0x03 : 0x02; // D or S

		if (offset >= 0 && offset < (1 << 12) * size) {
			int imm12 = offset / size;
			encode_ldr_str_imm(ctx, size_bits, 1, 0x01, imm12, base_r, dst_r); // V=1 for FP
		} else {
			load_immediate(ctx, offset, RTMP2, false);
			encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, base_r, RTMP);
			encode_ldr_str_imm(ctx, size_bits, 1, 0x01, 0, RTMP, dst_r); // V=1 for FP
		}

		str_stack_fp(ctx, dst_r, dst->stackPos, dst->size);
	} else {
		// Integer/pointer: load into CPU register
		// Use RTMP2 as temp (not RTMP) because str_stack's fallback uses RTMP internally.
		// If we loaded into RTMP, str_stack would clobber the value.
		Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP2;

		// Load with offset
		// LDR Xd, [Xn, #offset] or LDRB/LDRH for smaller sizes
		if (offset >= 0 && offset < (1 << 12) * size) {
			// Fits in immediate offset
			int imm12 = offset / size;
			// size: 1=LDRB, 2=LDRH, 4=LDR(W), 8=LDR(X)
			int size_bits = (size == 1) ? 0x00 : (size == 2) ? 0x01 : (size == 4) ? 0x02 : 0x03;
			encode_ldr_str_imm(ctx, size_bits, 0, 0x01, imm12, base_r, dst_r);
		} else {
			// Offset too large - compute effective address in RTMP, then load into dst_r
			// NOTE(review): this fallback only encodes W/X widths (0x02/0x03); a 1- or
			// 2-byte access with an offset this large would load 4 bytes — presumably
			// unreachable for byte/halfword fields, TODO confirm.
			load_immediate(ctx, offset, RTMP, false); // Use RTMP for address computation
			encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, base_r, RTMP);
			// LDR dst_r, [RTMP]
			encode_ldr_str_imm(ctx, (size == 8) ?
	0x03 : 0x02, 0, 0x01, 0, RTMP, dst_r);
		}

		// Always store to stack - it's the source of truth for later loads
		// (registers may be clobbered by subsequent calls)
		str_stack(ctx, dst_r, dst->stackPos, dst->size);
	}

	// Release the base register - discard() will store if dirty
	discard(ctx, base_reg);
}

/*
 * Store byte/halfword/word to memory
 * OSetI8/OSetI16/OSetI32: *(type*)(base + offset) = value
 * size is the access width in bytes (1/2/4/8).
 */
static void op_set_mem(jit_ctx *ctx, vreg *base, int offset, vreg *value, int size) {
	preg *base_reg = fetch(ctx, base);
	preg *value_reg = fetch(ctx, value);

	/*
	 * IMPORTANT: Load value FIRST, then base.
	 * ldr_stack's fallback path uses RTMP internally, so if we load base into RTMP
	 * first, then load value from stack, RTMP would get clobbered.
	 * By loading value first (into RTMP2 or FPU reg), any RTMP usage is harmless.
	 * Then we load base into RTMP, which is safe since value is already loaded.
	 */

	// Handle float and integer cases separately
	if (IS_FLOAT(value)) {
		// Float: load value first into FPU register
		Arm64FpReg value_r = (value_reg->kind == RFPU) ? (Arm64FpReg)value_reg->id : V16;
		if (value_reg->kind != RFPU) {
			ldr_stack_fp(ctx, value_r, value->stackPos, value->size);
		}

		// Now load base (safe - value is already in FPU reg)
		Arm64Reg base_r = (base_reg->kind == RCPU) ? (Arm64Reg)base_reg->id : RTMP;
		if (base_reg->kind != RCPU) {
			ldr_stack(ctx, base_r, base->stackPos, base->size);
		}

		int size_bits = (size == 8) ?
	0x03 : 0x02; // D or S

		if (offset >= 0 && offset < (1 << 12) * size) {
			int imm12 = offset / size;
			encode_ldr_str_imm(ctx, size_bits, 1, 0x00, imm12, base_r, value_r); // V=1 for FP
		} else {
			load_immediate(ctx, offset, RTMP2, false);
			encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, base_r, RTMP);
			encode_ldr_str_imm(ctx, size_bits, 1, 0x00, 0, RTMP, value_r); // V=1 for FP
		}
	} else {
		// Integer/pointer: load value first into CPU register
		Arm64Reg value_r = (value_reg->kind == RCPU) ? (Arm64Reg)value_reg->id : RTMP2;
		if (value_reg->kind == RCONST) {
			load_immediate(ctx, value_reg->id, value_r, value->size == 8);
		} else if (value_reg->kind != RCPU) {
			ldr_stack(ctx, value_r, value->stackPos, value->size);
		}

		// Now load base (safe - value is already in RTMP2 or CPU reg)
		Arm64Reg base_r = (base_reg->kind == RCPU) ? (Arm64Reg)base_reg->id : RTMP;
		if (base_reg->kind != RCPU) {
			ldr_stack(ctx, base_r, base->stackPos, base->size);
		}

		// Store with offset
		// STR Xd, [Xn, #offset] or STRB/STRH for smaller sizes
		if (offset >= 0 && offset < (1 << 12) * size) {
			// Fits in immediate offset
			int imm12 = offset / size;
			int size_bits = (size == 1) ? 0x00 : (size == 2) ? 0x01 : (size == 4) ? 0x02 : 0x03;
			encode_ldr_str_imm(ctx, size_bits, 0, 0x00, imm12, base_r, value_r);
		} else {
			// Offset too large - load offset to temp register
			if (value_r == RTMP2) {
				// Value is already in RTMP2, use a different temp
				load_immediate(ctx, offset, X9, false);
				encode_add_sub_reg(ctx, 1, 0, 0, 0, X9, 0, base_r, RTMP);
			} else {
				load_immediate(ctx, offset, RTMP2, false);
				encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, base_r, RTMP);
			}
			// STR value_r, [RTMP]
			// NOTE(review): like op_get_mem's fallback, only W/X widths are encoded
			// here; a 1- or 2-byte store with a large offset would write 4 bytes —
			// TODO confirm unreachable.
			encode_ldr_str_imm(ctx, (size == 8) ?
	0x03 : 0x02, 0, 0x00, 0, RTMP, value_r);
		}
	}

	discard(ctx, base_reg);
	discard(ctx, value_reg);
}

/*
 * Load byte/halfword/word from memory with register offset
 * OGetI8/OGetI16/OGetMem: dst = *(type*)(base + offset_reg)
 * Unlike op_get_mem which takes an immediate offset, this takes an offset vreg
 */
/*
 * IMPORTANT: We must load offset BEFORE base when base uses RTMP,
 * because ldr_stack's fallback path uses RTMP as a temporary.
 * Order: offset -> base -> compute address -> load
 */
static void op_get_mem_reg(jit_ctx *ctx, vreg *dst, vreg *base, vreg *offset, int size) {
	preg *base_reg = fetch(ctx, base);
	preg *offset_reg = fetch(ctx, offset);
	preg *dst_reg = alloc_dst(ctx, dst);

	// Step 1: Load offset FIRST (may clobber RTMP in fallback, but we haven't used it yet)
	Arm64Reg offset_r = (offset_reg->kind == RCPU) ? (Arm64Reg)offset_reg->id : RTMP2;
	if (offset_reg->kind == RCONST) {
		load_immediate(ctx, offset_reg->id, offset_r, false);
	} else if (offset_reg->kind != RCPU) {
		ldr_stack(ctx, offset_r, offset->stackPos, offset->size);
	}

	// Step 2: Load base (if it needs RTMP, the value will stay in RTMP)
	Arm64Reg base_r = (base_reg->kind == RCPU) ? (Arm64Reg)base_reg->id : RTMP;
	if (base_reg->kind != RCPU) {
		ldr_stack(ctx, base_r, base->stackPos, base->size);
	}

	// Step 3: Compute effective address: RTMP = base + offset
	encode_add_sub_reg(ctx, 1, 0, 0, 0, offset_r, 0, base_r, RTMP);

	// Load from [RTMP] - handle float vs integer types
	int size_bits = (size == 1) ? 0x00 : (size == 2) ? 0x01 : (size == 4) ? 0x02 : 0x03;

	if (IS_FLOAT(dst)) {
		// Float load: use FPU register and V=1
		Arm64FpReg dst_fp = (dst_reg->kind == RFPU) ? (Arm64FpReg)dst_reg->id : V16;
		encode_ldr_str_imm(ctx, size_bits, 1, 0x01, 0, RTMP, dst_fp); // V=1 for FP
		str_stack_fp(ctx, dst_fp, dst->stackPos, dst->size);
	} else {
		// Integer load
		Arm64Reg dst_r = (dst_reg->kind == RCPU) ?
	(Arm64Reg)dst_reg->id : X9;
		// NOTE(review): X9 is used as a fallback without evicting a possibly-held
		// vreg (op_set_field explicitly frees pregs[X9] before using it) — confirm
		// X9 cannot hold a live binding on this path.
		encode_ldr_str_imm(ctx, size_bits, 0, 0x01, 0, RTMP, dst_r);
		// For byte/halfword loads, the result is zero-extended automatically by LDRB/LDRH
		str_stack(ctx, dst_r, dst->stackPos, dst->size);
	}

	discard(ctx, base_reg);
	discard(ctx, offset_reg);
}

/*
 * Store byte/halfword/word to memory with register offset
 * OSetI8/OSetI16/OSetMem: *(type*)(base + offset_reg) = value
 * Unlike op_set_mem which takes an immediate offset, this takes an offset vreg
 *
 * IMPORTANT: We must load the value BEFORE computing the address in RTMP,
 * because ldr_stack's fallback path for large/unaligned offsets uses RTMP
 * as a temporary register.
 */
static void op_set_mem_reg(jit_ctx *ctx, vreg *base, vreg *offset, vreg *value, int size) {
	preg *base_reg = fetch(ctx, base);
	preg *offset_reg = fetch(ctx, offset);
	preg *value_reg = fetch(ctx, value);

	int size_bits = (size == 1) ? 0x00 : (size == 2) ? 0x01 : (size == 4) ? 0x02 : 0x03;

	// Step 1: Load value FIRST (before using RTMP for address computation)
	// ldr_stack's fallback path uses RTMP, so we must do this before RTMP holds the address
	Arm64FpReg value_fp = V16;
	Arm64Reg value_r = X9;

	if (IS_FLOAT(value)) {
		value_fp = (value_reg->kind == RFPU) ? (Arm64FpReg)value_reg->id : V16;
		if (value_reg->kind != RFPU) {
			ldr_stack_fp(ctx, value_fp, value->stackPos, value->size);
		}
	} else {
		// NOTE(review): same X9-without-eviction pattern as op_get_mem_reg — TODO confirm safe.
		value_r = (value_reg->kind == RCPU) ? (Arm64Reg)value_reg->id : X9;
		if (value_reg->kind == RCONST) {
			load_immediate(ctx, value_reg->id, value_r, value->size == 8);
		} else if (value_reg->kind != RCPU) {
			ldr_stack(ctx, value_r, value->stackPos, value->size);
		}
	}

	// Step 2: Load base and offset (these may also use RTMP in fallback, but that's ok
	// since we compute the final address in RTMP at the end)
	Arm64Reg base_r = (base_reg->kind == RCPU) ?
	(Arm64Reg)base_reg->id : RTMP;
	if (base_reg->kind != RCPU) {
		ldr_stack(ctx, base_r, base->stackPos, base->size);
	}

	Arm64Reg offset_r = (offset_reg->kind == RCPU) ? (Arm64Reg)offset_reg->id : RTMP2;
	if (offset_reg->kind == RCONST) {
		load_immediate(ctx, offset_reg->id, offset_r, false);
	} else if (offset_reg->kind != RCPU) {
		ldr_stack(ctx, offset_r, offset->stackPos, offset->size);
	}

	// Step 3: Compute effective address: RTMP = base + offset
	encode_add_sub_reg(ctx, 1, 0, 0, 0, offset_r, 0, base_r, RTMP);

	// Step 4: Store to [RTMP]
	if (IS_FLOAT(value)) {
		encode_ldr_str_imm(ctx, size_bits, 1, 0x00, 0, RTMP, value_fp); // V=1 for FP
	} else {
		encode_ldr_str_imm(ctx, size_bits, 0, 0x00, 0, RTMP, value_r);
	}

	discard(ctx, base_reg);
	discard(ctx, offset_reg);
	discard(ctx, value_reg);
}

/*
 * Field access: dst = obj->field
 * OField: dst = *(obj + field_offset)
 *
 * Special handling for HPACKED -> HSTRUCT: return address of inline storage
 * instead of loading a value (LEA semantics).
 */
static void op_field(jit_ctx *ctx, vreg *dst, vreg *obj, int field_index) {
	hl_runtime_obj *rt = hl_get_obj_rt(obj->t);
	int offset = rt->fields_indexes[field_index];

	// Check for packed field -> struct destination (LEA semantics)
	if (dst->t->kind == HSTRUCT) {
		hl_type *ft = hl_obj_field_fetch(obj->t, field_index)->t;
		if (ft->kind == HPACKED) {
			// Return address of inline storage: dst = &obj->field
			preg *p_obj = fetch(ctx, obj);
			preg *p_dst = alloc_dst(ctx, dst); // Allocates register, binds to dst, marks dirty

			Arm64Reg obj_r = (p_obj->kind == RCPU) ?
	(Arm64Reg)p_obj->id : RTMP;
			if (p_obj->kind != RCPU) {
				ldr_stack(ctx, obj_r, obj->stackPos, obj->size);
			}

			Arm64Reg dst_r = (Arm64Reg)p_dst->id; // alloc_dst always returns RCPU for non-float

			// ADD dst, obj, #offset (equivalent to LEA)
			if (offset >= 0 && offset < 4096) {
				encode_add_sub_imm(ctx, 1, 0, 0, 0, offset, obj_r, dst_r);
			} else {
				load_immediate(ctx, offset, RTMP2, false);
				encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, obj_r, dst_r);
			}

			// Don't call store_result - alloc_dst already set up the binding
			// The value will be spilled when needed
			discard(ctx, p_obj);
			return;
		}
	}

	op_get_mem(ctx, dst, obj, offset, dst->size);
}

/*
 * Field assignment: obj->field = value
 * OSetField: *(obj + field_offset) = value
 *
 * Special handling for HSTRUCT -> HPACKED: must copy struct byte-by-byte
 * because HPACKED means the struct is stored inline, not as a pointer.
 */
static void op_set_field(jit_ctx *ctx, vreg *obj, int field_index, vreg *value) {
	hl_runtime_obj *rt = hl_get_obj_rt(obj->t);
	int field_offset = rt->fields_indexes[field_index];

	// Check for struct-to-packed-field assignment
	if (value->t->kind == HSTRUCT) {
		hl_type *ft = hl_obj_field_fetch(obj->t, field_index)->t;
		if (ft->kind == HPACKED) {
			// Copy struct byte-by-byte
			// frt->size is the byte size of the inline packed struct being copied
			hl_runtime_obj *frt = hl_get_obj_rt(ft->tparam);

			// Load obj pointer into RTMP and value pointer into RTMP2.
			// This is simpler than trying to manage register allocation for the copy.
			preg *p_obj = fetch(ctx, obj);
			preg *p_val = fetch(ctx, value);

			// Always load to scratch registers to avoid conflicts with copy temp
			Arm64Reg obj_r = RTMP;
			Arm64Reg val_r = RTMP2;

			if (p_obj->kind == RCPU) {
				// Move from allocated register to RTMP: ORR RTMP, XZR, Rm
				encode_logical_reg(ctx, 1, 0x01, 0, 0, (Arm64Reg)p_obj->id, 0, XZR, obj_r);
			} else {
				ldr_stack(ctx, obj_r, obj->stackPos, obj->size);
			}

			if (p_val->kind == RCPU) {
				// Move from allocated register to RTMP2: ORR RTMP2, XZR, Rm
				encode_logical_reg(ctx, 1, 0x01, 0, 0, (Arm64Reg)p_val->id, 0, XZR, val_r);
			} else {
				ldr_stack(ctx, val_r, value->stackPos, value->size);
			}

			// Use X9 for data copy, X10 for large offset computation
			// Evict both if they're holding values
			preg *p_x9 = &ctx->pregs[X9];
			preg *p_x10 = &ctx->pregs[X10];
			if (p_x9->holds != NULL) {
				free_reg(ctx, p_x9);
			}
			if (p_x10->holds != NULL) {
				free_reg(ctx, p_x10);
			}

			Arm64Reg tmp = X9;
			int offset = 0;
			while (offset < frt->size) {
				int remain = frt->size - offset;
				// Widest aligned chunk that still fits: 8 -> 4 -> 2 -> 1 bytes
				int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 2 : 1));
				int size_bits = (copy_size == 8) ? 0x03 : (copy_size == 4) ? 0x02 : (copy_size == 2) ?
	0x01 : 0x00;

				// Load from source: LDR tmp, [val_r, #offset]
				// Source offset starts at 0 and increments by copy_size, so always aligned
				encode_ldr_str_imm(ctx, size_bits, 0, 0x01, offset / copy_size, val_r, tmp);

				// Store to dest: STR tmp, [obj_r + field_offset + offset]
				// Dest offset may not be aligned to copy_size, so compute address explicitly
				int dest_offset = field_offset + offset;
				if ((dest_offset % copy_size) == 0 && dest_offset >= 0 && dest_offset < (1 << 12) * copy_size) {
					// Aligned and fits in immediate - use scaled offset
					encode_ldr_str_imm(ctx, size_bits, 0, 0x00, dest_offset / copy_size, obj_r, tmp);
				} else {
					// Misaligned or large offset - compute address in X10
					load_immediate(ctx, dest_offset, X10, false);
					encode_add_sub_reg(ctx, 1, 0, 0, 0, X10, 0, obj_r, X10);
					encode_ldr_str_imm(ctx, size_bits, 0, 0x00, 0, X10, tmp);
				}

				offset += copy_size;
			}

			discard(ctx, p_obj);
			discard(ctx, p_val);
			return;
		}
	}

	op_set_mem(ctx, obj, field_offset, value, value->size);
}

/*
 * Array element access: dst = array[index]
 * OGetArray: dst = hl_aptr(array)[index]
 *
 * varray layout: { hl_type *t, hl_type *at, int size, int __pad } = 24 bytes
 * Data is INLINE immediately after the header (not via a pointer!)
 * hl_aptr(a,t) = (t*)(((varray*)(a))+1) = array + sizeof(varray)
 *
 * CArray (HABSTRACT) layout: raw memory, no header
 * For HOBJ/HSTRUCT: return address of element (LEA)
 * For other types: load value (LDR)
 */
/*
 * IMPORTANT: We must load index BEFORE array when array uses RTMP,
 * because ldr_stack's fallback path uses RTMP as a temporary.
 * Order: index -> array -> compute address -> load
 */
static void op_get_array(jit_ctx *ctx, vreg *dst, vreg *array, vreg *index) {
	preg *array_reg = fetch(ctx, array);
	preg *index_reg = fetch(ctx, index);
	preg *dst_reg = alloc_dst(ctx, dst);

	// CArrays (HABSTRACT) have different layout - no header, and for HOBJ/HSTRUCT
	// we return the address (LEA) rather than loading the value
	bool is_carray = (array->t->kind == HABSTRACT);
	bool is_lea = is_carray && (dst->t->kind == HOBJ || dst->t->kind == HSTRUCT);

	int elem_size;
	if (is_carray) {
		if (is_lea) {
			// For HOBJ/HSTRUCT in CArray, element size is the runtime object size
			hl_runtime_obj *rt = hl_get_obj_rt(dst->t);
			elem_size = rt->size;
		} else {
			// For other types in CArray, element size is pointer size
			elem_size = sizeof(void*);
		}
	} else {
		elem_size = hl_type_size(dst->t);
	}

	// Step 1: Load index FIRST (may clobber RTMP in fallback, but we haven't used it yet)
	Arm64Reg index_r = (index_reg->kind == RCPU) ? (Arm64Reg)index_reg->id : RTMP2;
	if (index_reg->kind == RCONST) {
		load_immediate(ctx, index_reg->id, index_r, false);
	} else if (index_reg->kind != RCPU) {
		ldr_stack(ctx, index_r, index->stackPos, index->size);
	}

	// Step 2: Load array (if it needs RTMP, the value will stay in RTMP)
	Arm64Reg array_r = (array_reg->kind == RCPU) ? (Arm64Reg)array_reg->id : RTMP;
	if (array_reg->kind != RCPU) {
		ldr_stack(ctx, array_r, array->stackPos, array->size);
	}

	Arm64Reg dst_r = (dst_reg->kind == RCPU) ?
	(Arm64Reg)dst_reg->id : X9;

	// Step 3: Calculate element address
	// For varray: array + sizeof(varray) + index * elem_size
	// For CArray: array + index * elem_size (no header)

	if (is_carray) {
		// CArray: no header offset, start from array_r directly
		// Scale index by elem_size
		if (elem_size == 1) {
			encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, 0, array_r, RTMP);
		} else if (elem_size == 2 || elem_size == 4 || elem_size == 8) {
			int shift = (elem_size == 2) ? 1 : (elem_size == 4) ? 2 : 3;
			encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, shift, array_r, RTMP);
		} else {
			// Non-power-of-2: compute index * elem_size in RTMP2, then add
			load_immediate(ctx, elem_size, RTMP2, false);
			encode_madd_msub(ctx, 1, 0, RTMP2, XZR, index_r, RTMP2);
			encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, array_r, RTMP);
		}
	} else {
		// varray: add sizeof(varray) header offset first
		encode_add_sub_imm(ctx, 1, 0, 0, 0, sizeof(varray), array_r, RTMP);

		// Add scaled index offset: RTMP = RTMP + (index_r << shift)
		if (elem_size == 1) {
			encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, 0, RTMP, RTMP);
		} else if (elem_size == 2 || elem_size == 4 || elem_size == 8) {
			int shift = (elem_size == 2) ? 1 : (elem_size == 4) ? 2 : 3;
			encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, shift, RTMP, RTMP);
		} else {
			// Non-power-of-2: scale index into RTMP2, then add
			load_immediate(ctx, elem_size, RTMP2, false);
			encode_madd_msub(ctx, 1, 0, RTMP2, XZR, index_r, RTMP2);
			encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, RTMP, RTMP);
		}
	}

	if (is_lea) {
		// LEA: just move the computed address to dst
		mov_reg_reg(ctx, dst_r, RTMP, true);
		str_stack(ctx, dst_r, dst->stackPos, dst->size);
	} else if (IS_FLOAT(dst)) {
		// Float load: use FP register with V=1
		// NOTE(review): the float path ignores dst_reg/dst_r and always routes the
		// result through V0 then the stack (clearing dst's binding below) — confirm
		// this is intentional rather than using the allocated FPU register.
		preg *pv0 = PVFPR(0);
		if (pv0->holds != NULL && pv0->holds != dst) {
			free_reg(ctx, pv0);
		}
		int size_bits = (dst->size == 8) ?
	0x03 : 0x02; // F64 or F32
		encode_ldr_str_imm(ctx, size_bits, 1, 0x01, 0, RTMP, V0); // V=1 for FP
		str_stack_fp(ctx, V0, dst->stackPos, dst->size);
		// Clear dst's old binding - value is now on stack, not in a register
		if (dst->current != NULL) {
			dst->current->holds = NULL;
			dst->current = NULL;
		}
	} else {
		// Integer load
		int size_bits = (elem_size == 1) ? 0x00 : (elem_size == 2) ? 0x01 : (elem_size == 4) ? 0x02 : 0x03;
		encode_ldr_str_imm(ctx, size_bits, 0, 0x01, 0, RTMP, dst_r);
		str_stack(ctx, dst_r, dst->stackPos, dst->size);
	}

	discard(ctx, array_reg);
	discard(ctx, index_reg);
}

/*
 * Array element assignment: array[index] = value
 * OSetArray: hl_aptr(array)[index] = value
 *
 * varray layout: { hl_type *t, hl_type *at, int size, int __pad } = 24 bytes
 * Data is INLINE immediately after the header (not via a pointer!)
 *
 * CArray (HABSTRACT) layout: raw memory, no header
 * For HOBJ/HSTRUCT: copy entire struct from value (which is address from LEA)
 * For other types: store value directly
 */
/*
 * IMPORTANT: We must load value and index BEFORE array when array uses RTMP,
 * because ldr_stack's fallback path uses RTMP as a temporary.
+ * Order: value -> index -> array -> compute address -> store + */ +static void op_set_array(jit_ctx *ctx, vreg *array, vreg *index, vreg *value) { + preg *array_reg = fetch(ctx, array); + preg *index_reg = fetch(ctx, index); + preg *value_reg = fetch(ctx, value); + + // CArrays (HABSTRACT) have different semantics + bool is_carray = (array->t->kind == HABSTRACT); + bool is_struct_copy = is_carray && (value->t->kind == HOBJ || value->t->kind == HSTRUCT); + + int elem_size; + if (is_carray) { + if (is_struct_copy) { + // For HOBJ/HSTRUCT in CArray, element size is the runtime object size + hl_runtime_obj *rt = hl_get_obj_rt(value->t); + elem_size = rt->size; + } else { + // For other types in CArray, element size is pointer size + elem_size = sizeof(void*); + } + } else { + elem_size = hl_type_size(value->t); + } + + // Step 1: Load value FIRST (before using RTMP for address computation) + // For struct copy, value is a pointer to the source struct + // For floats, use FP register; for integers, use CPU register + Arm64Reg value_r = X9; + Arm64FpReg value_fp = V0; + bool is_float_value = IS_FLOAT(value); + + if (is_float_value) { + if (value_reg->kind == RFPU) { + value_fp = (Arm64FpReg)value_reg->id; + } else { + ldr_stack_fp(ctx, value_fp, value->stackPos, value->size); + } + } else { + value_r = (value_reg->kind == RCPU) ? (Arm64Reg)value_reg->id : X9; + if (value_reg->kind == RCONST) { + load_immediate(ctx, value_reg->id, value_r, value->size == 8); + } else if (value_reg->kind != RCPU) { + ldr_stack(ctx, value_r, value->stackPos, value->size); + } + } + + // Step 2: Load index (may clobber RTMP in fallback, but we haven't used it yet) + Arm64Reg index_r = (index_reg->kind == RCPU) ? 
(Arm64Reg)index_reg->id : RTMP2; + if (index_reg->kind == RCONST) { + load_immediate(ctx, index_reg->id, index_r, false); + } else if (index_reg->kind != RCPU) { + ldr_stack(ctx, index_r, index->stackPos, index->size); + } + + // Step 3: Load array (if it needs RTMP, the value will stay in RTMP) + Arm64Reg array_r = (array_reg->kind == RCPU) ? (Arm64Reg)array_reg->id : RTMP; + if (array_reg->kind != RCPU) { + ldr_stack(ctx, array_r, array->stackPos, array->size); + } + + // Step 4: Calculate element address + // For varray: array + sizeof(varray) + index * elem_size + // For CArray: array + index * elem_size (no header) + + if (is_carray) { + // CArray: no header offset + if (elem_size == 1) { + encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, 0, array_r, RTMP); + } else if (elem_size == 2 || elem_size == 4 || elem_size == 8) { + int shift = (elem_size == 2) ? 1 : (elem_size == 4) ? 2 : 3; + encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, shift, array_r, RTMP); + } else { + // Non-power-of-2: compute index * elem_size + load_immediate(ctx, elem_size, RTMP2, false); + encode_madd_msub(ctx, 1, 0, RTMP2, XZR, index_r, RTMP2); + encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, array_r, RTMP); + } + } else { + // varray: add sizeof(varray) header offset first + encode_add_sub_imm(ctx, 1, 0, 0, 0, sizeof(varray), array_r, RTMP); + + // Add scaled index offset + if (elem_size == 1) { + encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, 0, RTMP, RTMP); + } else if (elem_size == 2 || elem_size == 4 || elem_size == 8) { + int shift = (elem_size == 2) ? 1 : (elem_size == 4) ? 
2 : 3; + encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, shift, RTMP, RTMP); + } else { + load_immediate(ctx, elem_size, RTMP2, false); + encode_madd_msub(ctx, 1, 0, RTMP2, XZR, index_r, RTMP2); + encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, RTMP, RTMP); + } + } + + if (is_struct_copy) { + // Copy struct from value (pointer) to RTMP (destination) + // value_r points to source struct, RTMP points to destination + // Use X10 as temporary for copy (not value_r which we need as source base) + int offset = 0; + while (offset < elem_size) { + int remain = elem_size - offset; + int copy_size, size_bits; + if (remain >= 8) { + copy_size = 8; + size_bits = 0x03; + } else if (remain >= 4) { + copy_size = 4; + size_bits = 0x02; + } else if (remain >= 2) { + copy_size = 2; + size_bits = 0x01; + } else { + copy_size = 1; + size_bits = 0x00; + } + // Load from source: X10 = [value_r + offset] + encode_ldur_stur(ctx, size_bits, 0, 0x01, offset, value_r, X10); + // Store to dest: [RTMP + offset] = X10 + encode_ldur_stur(ctx, size_bits, 0, 0x00, offset, RTMP, X10); + offset += copy_size; + } + } else if (is_float_value) { + // Float store: STR Vn, [RTMP] with V=1 + int size_bits = (value->size == 8) ? 0x03 : 0x02; // F64 or F32 + encode_ldr_str_imm(ctx, size_bits, 1, 0x00, 0, RTMP, value_fp); // V=1 for FP + } else { + // Integer store: STR Xn, [RTMP] + int size_bits = (elem_size == 1) ? 0x00 : (elem_size == 2) ? 0x01 : (elem_size == 4) ? 
0x02 : 0x03; + encode_ldr_str_imm(ctx, size_bits, 0, 0x00, 0, RTMP, value_r); + } + + discard(ctx, array_reg); + discard(ctx, index_reg); + discard(ctx, value_reg); +} + +/* + * Global variable access: dst = globals[index] + * OGetGlobal: Use PC-relative addressing with ADRP + LDR + */ +static void op_get_global(jit_ctx *ctx, vreg *dst, int global_index) { + preg *dst_reg = alloc_dst(ctx, dst); + + // Get global address from module + void **globals = (void**)ctx->m->globals_data; + void *global_addr = &globals[global_index]; + + // Load global address to RTMP2 + load_immediate(ctx, (int64_t)global_addr, RTMP2, true); + + if (IS_FLOAT(dst)) { + // Float: load into FPU register + Arm64FpReg dst_r = (dst_reg->kind == RFPU) ? (Arm64FpReg)dst_reg->id : V16; + // LDR Vn, [RTMP2] - floating point load + // size: 0x02=32-bit (S), 0x03=64-bit (D) + encode_ldr_str_imm(ctx, dst->size == 8 ? 0x03 : 0x02, 1, 0x01, 0, RTMP2, dst_r); + // Store to stack + str_stack_fp(ctx, dst_r, dst->stackPos, dst->size); + } else { + // Integer/pointer: load into CPU register + Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP; + // LDR Xn, [RTMP2] + encode_ldr_str_imm(ctx, dst->size == 8 ? 0x03 : 0x02, 0, 0x01, 0, RTMP2, dst_r); + // Store to stack + str_stack(ctx, dst_r, dst->stackPos, dst->size); + } +} + +/* + * Global variable assignment: globals[index] = value + * OSetGlobal + */ +static void op_set_global(jit_ctx *ctx, int global_index, vreg *value) { + preg *value_reg = fetch(ctx, value); + + // Get global address from module + void **globals = (void**)ctx->m->globals_data; + void *global_addr = &globals[global_index]; + + // Load global address to RTMP2 + load_immediate(ctx, (int64_t)global_addr, RTMP2, true); + + if (IS_FLOAT(value)) { + // Float: store from FPU register + Arm64FpReg value_r = (value_reg->kind == RFPU) ? 
(Arm64FpReg)value_reg->id : V16; + if (value_reg->kind != RFPU) { + // Load from stack into temp FPU register + ldr_stack_fp(ctx, value_r, value->stackPos, value->size); + } + // STR Vn, [RTMP2] - floating point store + encode_ldr_str_imm(ctx, value->size == 8 ? 0x03 : 0x02, 1, 0x00, 0, RTMP2, value_r); + } else { + // Integer/pointer: store from CPU register + Arm64Reg value_r = (value_reg->kind == RCPU) ? (Arm64Reg)value_reg->id : RTMP; + if (value_reg->kind == RCONST) { + load_immediate(ctx, value_reg->id, value_r, value->size == 8); + } else if (value_reg->kind != RCPU) { + ldr_stack(ctx, value_r, value->stackPos, value->size); + } + // STR Xn, [RTMP2] + encode_ldr_str_imm(ctx, value->size == 8 ? 0x03 : 0x02, 0, 0x00, 0, RTMP2, value_r); + } + + discard(ctx, value_reg); +} + +// ============================================================================ +// Reference Operations +// ============================================================================ + +/* + * Create reference: dst = &src + * ORef: dst = address of vreg + * + * IMPORTANT: After taking a reference to a vreg, that vreg may be modified + * through the reference (via OSetref). We must: + * 1. Ensure src is spilled to stack (in case it's only in a register) + * 2. Invalidate src's register binding so future reads go to stack + */ +static void op_ref(jit_ctx *ctx, vreg *dst, vreg *src) { + // First, ensure src is on stack and invalidate its register binding + // (like x86's scratch(ra->current)) + if (src->current != NULL) { + // Spill to stack if in a register + store(ctx, src, src->current); + // Invalidate the binding so future reads go to stack + src->current->holds = NULL; + src->current = NULL; + } + + preg *dst_reg = alloc_dst(ctx, dst); + Arm64Reg dst_r = (dst_reg->kind == RCPU) ? 
(Arm64Reg)dst_reg->id : RTMP; + + // Calculate stack address: FP + src->stackPos + if (src->stackPos >= 0) { + // ADD dst_r, FP, #stackPos + encode_add_sub_imm(ctx, 1, 0, 0, 0, src->stackPos, FP, dst_r); + } else { + // SUB dst_r, FP, #(-stackPos) + encode_add_sub_imm(ctx, 1, 1, 0, 0, -src->stackPos, FP, dst_r); + } + + // Always store to stack - source of truth for later loads + str_stack(ctx, dst_r, dst->stackPos, dst->size); +} + +/* + * Dereference: dst = *src + * OUnref: Load value from pointer + */ +static void op_unref(jit_ctx *ctx, vreg *dst, vreg *src) { + preg *src_reg = fetch(ctx, src); + + // Load the pointer (always integer register since it's an address) + Arm64Reg src_r = (src_reg->kind == RCPU) ? (Arm64Reg)src_reg->id : RTMP; + if (src_reg->kind != RCPU) { + ldr_stack(ctx, src_r, src->stackPos, src->size); + } + + int size_bits = (dst->size == 1) ? 0x00 : (dst->size == 2) ? 0x01 : (dst->size == 4) ? 0x02 : 0x03; + + if (IS_FLOAT(dst)) { + // Float dereference: LDR Vd, [src_r] + preg *dst_reg = alloc_dst(ctx, dst); + Arm64FpReg dst_r = (dst_reg->kind == RFPU) ? (Arm64FpReg)dst_reg->id : V16; + encode_ldr_str_imm(ctx, size_bits, 1, 0x01, 0, src_r, dst_r); + str_stack_fp(ctx, dst_r, dst->stackPos, dst->size); + } else { + // Integer dereference: LDR Xd, [src_r] + preg *dst_reg = alloc_dst(ctx, dst); + Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP2; + encode_ldr_str_imm(ctx, size_bits, 0, 0x01, 0, src_r, dst_r); + str_stack(ctx, dst_r, dst->stackPos, dst->size); + } + + discard(ctx, src_reg); +} + +/* + * Set reference: *dst = src + * OSetref: Store value to pointer + */ +static void op_setref(jit_ctx *ctx, vreg *dst, vreg *src) { + preg *dst_reg = fetch(ctx, dst); + preg *src_reg = fetch(ctx, src); + + Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP; + if (dst_reg->kind != RCPU) { + ldr_stack(ctx, dst_r, dst->stackPos, dst->size); + } + + Arm64Reg src_r = (src_reg->kind == RCPU) ? 
(Arm64Reg)src_reg->id : RTMP2; + if (src_reg->kind == RCONST) { + load_immediate(ctx, src_reg->id, src_r, src->size == 8); + } else if (src_reg->kind != RCPU) { + ldr_stack(ctx, src_r, src->stackPos, src->size); + } + + // Store to pointer: STR src_r, [dst_r] + int size_bits = (src->size == 1) ? 0x00 : (src->size == 2) ? 0x01 : (src->size == 4) ? 0x02 : 0x03; + encode_ldr_str_imm(ctx, size_bits, 0, 0x00, 0, dst_r, src_r); + + discard(ctx, dst_reg); + discard(ctx, src_reg); +} + +// ============================================================================ +// Comparison Operations (result stored, not branching) +// ============================================================================ + +/* + * Equality comparison: dst = (a == b) + * OEq/ONeq/OLt/OGte/etc: Store comparison result as boolean + */ +static void op_compare(jit_ctx *ctx, vreg *dst, vreg *a, vreg *b, hl_op op) { + preg *a_reg = fetch(ctx, a); + preg *b_reg = fetch(ctx, b); + preg *dst_reg = alloc_dst(ctx, dst); + + Arm64Reg a_r = (a_reg->kind == RCPU) ? (Arm64Reg)a_reg->id : RTMP; + if (a_reg->kind == RCONST) { + load_immediate(ctx, a_reg->id, a_r, a->size == 8); + } else if (a_reg->kind != RCPU) { + ldr_stack(ctx, a_r, a->stackPos, a->size); + } + + Arm64Reg b_r = (b_reg->kind == RCPU) ? (Arm64Reg)b_reg->id : RTMP2; + if (b_reg->kind == RCONST) { + load_immediate(ctx, b_reg->id, b_r, b->size == 8); + } else if (b_reg->kind != RCPU) { + ldr_stack(ctx, b_r, b->stackPos, b->size); + } + + Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : X9; + + bool is_float = IS_FLOAT(a); + + if (is_float) { + // Floating-point comparison + Arm64FpReg fa_r = (a_reg->kind == RFPU) ? (Arm64FpReg)a_reg->id : V16; + Arm64FpReg fb_r = (b_reg->kind == RFPU) ? 
	(Arm64FpReg)b_reg->id : V17;

		if (a_reg->kind != RFPU) {
			// Load from stack to FP register
			ldr_stack_fp(ctx, fa_r, a->stackPos, a->size);
		}
		if (b_reg->kind != RFPU) {
			ldr_stack_fp(ctx, fb_r, b->stackPos, b->size);
		}

		// FCMP fa_r, fb_r
		int is_double = a->size == 8 ? 1 : 0;
		encode_fp_compare(ctx, 0, is_double, is_double, fb_r, 0, fa_r);
	} else {
		// Integer comparison: CMP a_r, b_r (SUBS XZR, a_r, b_r)
		encode_add_sub_reg(ctx, a->size == 8 ? 1 : 0, 1, 1, 0, b_r, 0, a_r, XZR);
	}

	// Get condition code for this operation
	ArmCondition cond = hl_cond_to_arm(op, is_float);

	// CSET dst_r, cond (Set register to 1 if condition true, 0 otherwise)
	// Encoding: CSINC dst, XZR, XZR, !cond
	// This sets dst = (cond) ? 1 : 0
	// NOTE(review): cond^1 inverts every A64 condition except AL/NV (0b1110/0b1111);
	// assumes hl_cond_to_arm never returns those — confirm
	int inv_cond = cond ^ 1; // Invert condition
	// CSINC: sf=0, op=0, S=0, Rm=XZR, cond=inv_cond, o2=1, Rn=XZR, Rd=dst_r
	EMIT32(ctx,(0 << 31) | (0 << 30) | (0xD4 << 21) | (XZR << 16) | (inv_cond << 12) | (1 << 10) | (XZR << 5) | dst_r);

	// Always store to stack - source of truth for later loads
	str_stack(ctx, dst_r, dst->stackPos, dst->size);

	discard(ctx, a_reg);
	discard(ctx, b_reg);
}

// ============================================================================
// Type and Object Operations
// ============================================================================

/*
 * Get object type: dst = obj->type
 * OType: Load type pointer from object header (hl_type* is the first field)
 */
static void op_type(jit_ctx *ctx, vreg *dst, vreg *obj) {
	preg *obj_reg = fetch(ctx, obj);
	preg *dst_reg = alloc_dst(ctx, dst);

	Arm64Reg obj_r = (obj_reg->kind == RCPU) ? (Arm64Reg)obj_reg->id : RTMP;
	if (obj_reg->kind != RCPU) {
		ldr_stack(ctx, obj_r, obj->stackPos, obj->size);
	}

	Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP2;

	// Load type pointer from object header (first field at offset 0)
	// LDR dst_r, [obj_r]
	encode_ldr_str_imm(ctx, 0x03, 0, 0x01, 0, obj_r, dst_r);

	// Always store to stack - source of truth for later loads
	str_stack(ctx, dst_r, dst->stackPos, dst->size);

	discard(ctx, obj_reg);
}

/*
 * OGetThis: Load a field from the "this" object (R(0))
 * Equivalent to OField but implicitly uses R(0) as the object
 */
static void op_get_this(jit_ctx *ctx, vreg *dst, int field_idx) {
	vreg *this_vreg = R(0);
	op_field(ctx, dst, this_vreg, field_idx);
}

/*
 * Get the dynamic cast function for a given type.
 * Mirrors get_dynset/get_dynget: one runtime entry point per value class
 * (f32, f64, 64-bit int, 32-bit-or-smaller int, pointer).
 */
static void *get_dyncast(hl_type *t) {
	switch (t->kind) {
	case HF32:
		return hl_dyn_castf;
	case HF64:
		return hl_dyn_castd;
	case HI64:
	case HGUID:
		return hl_dyn_casti64;
	case HI32:
	case HUI16:
	case HUI8:
	case HBOOL:
		return hl_dyn_casti;
	default:
		return hl_dyn_castp;
	}
}

/*
 * Cast operation (safe cast with runtime check)
 * OSafeCast: dst = (target_type)obj or NULL if cast fails
 * NOTE(review): target_type is not used below (dst->t is used instead);
 * presumably they are always identical — confirm against the caller
 */
static void op_safe_cast(jit_ctx *ctx, vreg *dst, vreg *obj, hl_type *target_type) {
	// Special case: Null to T - unbox with null check
	if (obj->t->kind == HNULL && obj->t->tparam->kind == dst->t->kind) {
		int jnull, jend;

		switch (dst->t->kind) {
		case HUI8:
		case HUI16:
		case HI32:
		case HBOOL:
		case HI64:
		case HGUID:
		{
			preg *tmp = fetch(ctx, obj);
			Arm64Reg r = (tmp->kind == RCPU) ? tmp->id : RTMP;
			if (tmp->kind != RCPU) {
				ldr_stack(ctx, r, obj->stackPos, obj->size);
			}
			// Test for null
			encode_add_sub_imm(ctx, 1, 1, 1, 0, 0, r, XZR); // CMP r, #0
			jnull = BUF_POS();
			encode_branch_cond(ctx, 0, COND_EQ); // B.EQ null_path

			// Non-null: load value from offset 8 with correct size
			// The LDR immediate is scaled by the access size
			// (0x00=1, 0x01=2, 0x02=4, 0x03=8), so scaled_offset * size == 8
			int size_code;
			int scaled_offset;
			switch (dst->size) {
			case 1: size_code = 0x00; scaled_offset = 8; break; // LDRB [r, #8]
			case 2: size_code = 0x01; scaled_offset = 4; break; // LDRH [r, #8]
			case 4: size_code = 0x02; scaled_offset = 2; break; // LDR W [r, #8]
			default: size_code = 0x03; scaled_offset = 1; break; // LDR X [r, #8]
			}
			// The LDR below clobbers r. If obj is dirty in r, save it to stack first.
			// This preserves obj's value (the dynamic pointer) for later use.
			if (obj->dirty && obj->current == tmp) {
				str_stack(ctx, r, obj->stackPos, obj->size);
				obj->dirty = 0;
			}
			encode_ldr_str_imm(ctx, size_code, 0, 0x01, scaled_offset, r, r);
			jend = BUF_POS();
			encode_branch_uncond(ctx, 0); // B end

			// Null path: set to zero
			patch_jump(ctx, jnull, BUF_POS());
			load_immediate(ctx, 0, r, dst->size == 8);

			// End
			patch_jump(ctx, jend, BUF_POS());
			str_stack(ctx, r, dst->stackPos, dst->size);
			// Clear binding - register no longer holds obj's original value
			discard(ctx, tmp);
			// Invalidate dst's old binding since we wrote directly to stack
			if (dst->current) {
				dst->current->holds = NULL;
				dst->current = NULL;
			}
		}
		return;

		case HF32:
		case HF64:
		{
			preg *tmp = fetch(ctx, obj);
			Arm64Reg r = (tmp->kind == RCPU) ? tmp->id : RTMP;
			if (tmp->kind != RCPU) {
				ldr_stack(ctx, r, obj->stackPos, obj->size);
			}
			// Evict any vreg currently bound to V0 before using it
			preg *pv0 = PVFPR(0);
			if (pv0->holds != NULL && pv0->holds != dst) {
				free_reg(ctx, pv0);
			}
			// Test for null
			encode_add_sub_imm(ctx, 1, 1, 1, 0, 0, r, XZR); // CMP r, #0
			jnull = BUF_POS();
			encode_branch_cond(ctx, 0, COND_EQ); // B.EQ null_path

			// Non-null: load float from offset 8 (8/dst->size = scaled immediate)
			encode_ldr_str_imm(ctx, (dst->size == 8) ? 0x03 : 0x02, 1, 0x01, 8 / dst->size, r, V0);
			jend = BUF_POS();
			encode_branch_uncond(ctx, 0); // B end

			// Null path: set to zero
			patch_jump(ctx, jnull, BUF_POS());
			// FMOV Vd, XZR
			EMIT32(ctx, (1 << 31) | (0 << 29) | (0x1E << 24) | (1 << 22) | (1 << 21) | (7 << 16) | (31 << 5) | V0);

			// End
			patch_jump(ctx, jend, BUF_POS());
			str_stack_fp(ctx, V0, dst->stackPos, dst->size);
			// Clear binding - register no longer holds obj's original value
			discard(ctx, tmp);
			// Invalidate dst's old binding since we wrote directly to stack
			if (dst->current) {
				dst->current->holds = NULL;
				dst->current = NULL;
			}
		}
		return;

		default:
			break;
		}
	}

	// General case: call runtime cast function
	spill_regs(ctx);

	// Get stack address of obj
	// LEA X0, [FP, #obj->stackPos] or similar
	if (obj->stackPos >= 0) {
		encode_add_sub_imm(ctx, 1, 0, 0, 0, obj->stackPos, FP, X0);
	} else {
		encode_add_sub_imm(ctx, 1, 1, 0, 0, -obj->stackPos, FP, X0);
	}

	// Set up arguments based on destination type
	void *cast_func = get_dyncast(dst->t);
	switch (dst->t->kind) {
	case HF32:
	case HF64:
	case HI64:
		// 2 args: ptr, src_type
		// NOTE(review): HGUID also maps to hl_dyn_casti64 in get_dyncast but
		// falls in the 3-arg branch below; the extra X2 is harmless under
		// AAPCS64 but looks unintended — confirm
		load_immediate(ctx, (int64_t)obj->t, X1, true);
		break;
	default:
		// 3 args: ptr, src_type, dst_type
		load_immediate(ctx, (int64_t)obj->t, X1, true);
		load_immediate(ctx, (int64_t)dst->t, X2, true);
		break;
	}

	// Call cast function
	load_immediate(ctx, (int64_t)cast_func, RTMP, true);
	EMIT32(ctx, 0xD63F0000 | (RTMP
<< 5)); // BLR RTMP + + // Store result and clear stale binding + if (IS_FLOAT(dst)) { + str_stack_fp(ctx, V0, dst->stackPos, dst->size); + } else { + str_stack(ctx, X0, dst->stackPos, dst->size); + } + store_result(ctx, dst); +} + +/* + * Null coalescing: dst = (a != null) ? a : b + * OCoalesce/ONullCheck + */ +static void op_null_check(jit_ctx *ctx, vreg *dst) { + // Check if dst is null and call hl_null_access if so + preg *dst_reg = fetch(ctx, dst); + + Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP; + if (dst_reg->kind != RCPU) { + ldr_stack(ctx, dst_r, dst->stackPos, dst->size); + } + + // Compare with zero: CMP dst_r, #0 (actually SUBS XZR, dst_r, #0) + encode_add_sub_imm(ctx, 1, 1, 1, 0, 0, dst_r, XZR); + + // If not zero (not null), skip error handling: B.NE skip + int bne_pos = BUF_POS(); + encode_branch_cond(ctx, 0, COND_NE); // B.NE (will patch offset) + + // Null path: call hl_null_access(field, hashed_name) + // For simplicity, pass 0 for both (basic null access error) + // NOTE: Do NOT call spill_regs() here! hl_null_access never returns (it throws), + // and spill_regs() would corrupt compile-time register bindings for the non-null path. 
+ load_immediate(ctx, 0, X0, true); // field = NULL + load_immediate(ctx, 0, X1, true); // hashed_name = 0 + load_immediate(ctx, (int64_t)hl_null_access, RTMP, true); + EMIT32(ctx, 0xD63F0000 | (RTMP << 5)); // BLR RTMP + // hl_null_access doesn't return (it throws), but we don't emit anything after + + // Patch the B.NE to skip here + int skip_pos = BUF_POS(); + int bne_offset = (skip_pos - bne_pos) / 4; + ctx->buf.b = ctx->startBuf + bne_pos; + encode_branch_cond(ctx, bne_offset, COND_NE); + ctx->buf.b = ctx->startBuf + skip_pos; + + discard(ctx, dst_reg); +} + +/* + * Object/memory allocation operations + * These typically call into the runtime allocator + */ +static void op_new(jit_ctx *ctx, vreg *dst, hl_type *type) { + // Call runtime allocator based on type kind + // Different type kinds require different allocation functions: + // - HOBJ/HSTRUCT: hl_alloc_obj(type) + // - HDYNOBJ: hl_alloc_dynobj() - no arguments! + // - HVIRTUAL: hl_alloc_virtual(type) + + // Spill all caller-saved registers BEFORE the call + spill_regs(ctx); + + void *alloc_func; + int has_type_arg = 1; + + switch (type->kind) { + case HOBJ: + case HSTRUCT: + alloc_func = (void*)hl_alloc_obj; + break; + case HDYNOBJ: + alloc_func = (void*)hl_alloc_dynobj; + has_type_arg = 0; // hl_alloc_dynobj takes no arguments + break; + case HVIRTUAL: + alloc_func = (void*)hl_alloc_virtual; + break; + default: + // Unsupported type for ONew + printf("op_new: unsupported type kind %d\n", type->kind); + return; + } + + // Load type address to X0 (first argument) if needed + if (has_type_arg) { + load_immediate(ctx, (int64_t)type, X0, true); + } + + // Load function pointer and call + load_immediate(ctx, (int64_t)alloc_func, RTMP, true); + + // Call allocator: BLR RTMP + EMIT32(ctx, (0xD63F0000) | (RTMP << 5)); + + // Result is in X0 - always store to stack first (source of truth for later loads) + str_stack(ctx, X0, dst->stackPos, dst->size); + + // Also keep in a register if allocated + preg *dst_reg = 
alloc_dst(ctx, dst); + if (dst_reg->kind == RCPU && (Arm64Reg)dst_reg->id != X0) { + mov_reg_reg(ctx, (Arm64Reg)dst_reg->id, X0, 8); + } +} + +/* + * String/bytes operations + */ +static void op_string(jit_ctx *ctx, vreg *dst, int string_index) { + // Load UTF-16 string from module string table + preg *dst_reg = alloc_dst(ctx, dst); + Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP; + + // Get UTF-16 string pointer (hl_get_ustring converts from UTF-8 and caches) + const uchar *string_ptr = hl_get_ustring(ctx->m->code, string_index); + + // Load string address + load_immediate(ctx, (int64_t)string_ptr, dst_r, true); + + // Always store to stack - source of truth for later loads + str_stack(ctx, dst_r, dst->stackPos, dst->size); +} + +static void op_bytes(jit_ctx *ctx, vreg *dst, int bytes_index) { + // Load bytes from module bytes table + preg *dst_reg = alloc_dst(ctx, dst); + Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP; + + // Get bytes pointer from module - use bytes_pos lookup for version >= 5 + char *bytes_ptr; + if (ctx->m->code->version >= 5) + bytes_ptr = ctx->m->code->bytes + ctx->m->code->bytes_pos[bytes_index]; + else + bytes_ptr = ctx->m->code->strings[bytes_index]; + + // Load bytes address + load_immediate(ctx, (int64_t)bytes_ptr, dst_r, true); + + // Always store to stack - source of truth for later loads + str_stack(ctx, dst_r, dst->stackPos, dst->size); +} + +// Forward declaration for prepare_call_args (defined later) +static int prepare_call_args(jit_ctx *ctx, hl_type **arg_types, vreg **args, int nargs, bool is_native); + +/* + * Virtual/method calls + * OCallMethod/OCallThis/OCallClosure + */ +// ============================================================================ +// Dynamic Object Helpers +// ============================================================================ + +/** + * Get the appropriate dynamic set function for a type + */ +static void *get_dynset(hl_type *t) { + switch 
(t->kind) { + case HF32: + return hl_dyn_setf; + case HF64: + return hl_dyn_setd; + case HI64: + case HGUID: + return hl_dyn_seti64; + case HI32: + case HUI16: + case HUI8: + case HBOOL: + return hl_dyn_seti; + default: + return hl_dyn_setp; + } +} + +/** + * Get the appropriate dynamic get function for a type + */ +static void *get_dynget(hl_type *t) { + switch (t->kind) { + case HF32: + return hl_dyn_getf; + case HF64: + return hl_dyn_getd; + case HI64: + case HGUID: + return hl_dyn_geti64; + case HI32: + case HUI16: + case HUI8: + case HBOOL: + return hl_dyn_geti; + default: + return hl_dyn_getp; + } +} + +// ============================================================================ +// Method and Function Calls +// ============================================================================ + +static void op_call_method_obj(jit_ctx *ctx, vreg *dst, vreg *obj, int method_index, vreg **args, int nargs) { + // HOBJ method call: obj->type->vobj_proto[method_index](obj, args...) + + // Spill all caller-saved registers BEFORE the call + spill_regs(ctx); + + // Now fetch obj (will load from stack since we just spilled) + preg *obj_reg = fetch(ctx, obj); + + Arm64Reg obj_r = (obj_reg->kind == RCPU) ? 
(Arm64Reg)obj_reg->id : RTMP; + if (obj_reg->kind != RCPU) { + ldr_stack(ctx, obj_r, obj->stackPos, obj->size); + } + + // Load type from obj[0] + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, 0, obj_r, RTMP); // RTMP = obj->type + // Load vobj_proto from type[16] (HL_WSIZE*2 = offset index 2) + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, 2, RTMP, RTMP); // RTMP = type->vobj_proto + // Load method pointer from proto[method_index] into RTMP2 + // NOTE: We use RTMP2 here because prepare_call_args uses RTMP for stack calculations + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, method_index, RTMP, RTMP2); + + discard(ctx, obj_reg); + + // Prepare call with obj as first argument + vreg **full_args = (vreg**)malloc(sizeof(vreg*) * (nargs + 1)); + full_args[0] = obj; + for (int i = 0; i < nargs; i++) { + full_args[i + 1] = args[i]; + } + + // Prepare arguments (this uses RTMP, but method pointer is safe in RTMP2) + int stack_space = prepare_call_args(ctx, NULL, full_args, nargs + 1, false); + free(full_args); + + // Call method: BLR RTMP2 + EMIT32(ctx,(0xD63F0000) | (RTMP2 << 5)); + + // Clean up stack + if (stack_space > 0) { + encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG); + } + + // Store return value + if (dst && dst->t->kind != HVOID) { + preg *p = alloc_dst(ctx, dst); + if (IS_FLOAT(dst)) { + if (p->kind == RFPU && (Arm64FpReg)p->id != V0) { + fmov_reg_reg(ctx, (Arm64FpReg)p->id, V0, dst->size); + } else if (p->kind == RSTACK) { + str_stack_fp(ctx, V0, dst->stackPos, dst->size); + } + } else { + if (p->kind == RCPU && (Arm64Reg)p->id != X0) { + mov_reg_reg(ctx, (Arm64Reg)p->id, X0, dst->size); + } else if (p->kind == RSTACK) { + str_stack(ctx, X0, dst->stackPos, dst->size); + } + } + } +} + +// ============================================================================ +// Function Calls +// ============================================================================ + +/* + * Prepare arguments for a function call according to AAPCS64: + * - First 8 integer/pointer 
args in X0-X7
 * - First 8 floating-point args in V0-V7
 * - Additional args on stack (16-byte aligned)
 * - Returns the total stack space needed for overflow args
 *
 * NOTE(review): arg_types and is_native are currently unused (all callers
 * pass NULL for arg_types); presumably reserved for ABI-specific handling
 * of native calls — confirm before removing
 */
static int prepare_call_args(jit_ctx *ctx, hl_type **arg_types, vreg **args, int nargs, bool is_native) {
	int int_reg_count = 0;
	int fp_reg_count = 0;
	int stack_offset = 0;

	// First pass: count args and calculate stack space needed
	for (int i = 0; i < nargs; i++) {
		bool is_fp = IS_FLOAT(args[i]);
		int *reg_count = is_fp ? &fp_reg_count : &int_reg_count;

		if (*reg_count >= CALL_NREGS) {
			// Arg goes on stack
			stack_offset += 8; // Each stack arg takes 8 bytes (aligned)
		}
		(*reg_count)++;
	}

	// Align stack to 16 bytes (AAPCS64 requires 16-byte SP alignment)
	if (stack_offset & 15)
		stack_offset = (stack_offset + 15) & ~15;

	// Allocate stack space for overflow args if needed
	if (stack_offset > 0) {
		// SUB SP, SP, #stack_offset
		encode_add_sub_imm(ctx, 1, 1, 0, 0, stack_offset, SP_REG, SP_REG);
	}

	// Second pass: move arguments to their locations
	int_reg_count = 0;
	fp_reg_count = 0;
	int current_stack_offset = 0;

	// After spill_regs(), all values are on stack.
	// Load arguments directly to their destination registers to avoid
	// the register allocation problem where fetch() reuses registers.
	for (int i = 0; i < nargs; i++) {
		vreg *arg = args[i];
		bool is_fp = IS_FLOAT(arg);

		if (is_fp) {
			if (fp_reg_count < CALL_NREGS) {
				// Load directly to FP argument register
				Arm64FpReg dest_reg = FP_CALL_REGS[fp_reg_count];
				ldr_stack_fp(ctx, dest_reg, arg->stackPos, arg->size);
				fp_reg_count++;
			} else {
				// Overflow: load to temp, then store to stack
				// (STR immediate is scaled by access size, hence the division)
				ldr_stack_fp(ctx, V16, arg->stackPos, arg->size);
				encode_ldr_str_imm(ctx, arg->size == 4 ? 0x02 : 0x03, 1, 0x00,
					current_stack_offset / (arg->size == 4 ? 4 : 8),
					SP_REG, V16);
				current_stack_offset += 8;
			}
		} else {
			// Integer/pointer argument
			if (int_reg_count < CALL_NREGS) {
				// Load directly to integer argument register
				Arm64Reg dest_reg = CALL_REGS[int_reg_count];
				ldr_stack(ctx, dest_reg, arg->stackPos, arg->size);
				int_reg_count++;
			} else {
				// Overflow: load to temp, then store to stack
				ldr_stack(ctx, RTMP, arg->stackPos, arg->size);
				encode_ldr_str_imm(ctx, arg->size == 8 ? 0x03 : 0x02, 0, 0x00,
					current_stack_offset / (arg->size == 8 ? 8 : 4),
					SP_REG, RTMP);
				current_stack_offset += 8;
			}
		}
	}

	return stack_offset;
}

/*
 * Call a native C function: spill, marshal args, load target, BLR, pop args,
 * then store the X0/V0 return value to dst's stack slot.
 */
static void op_call_native(jit_ctx *ctx, vreg *dst, hl_type *ftype, void *func_ptr, vreg **args, int nargs) {
	// Spill all caller-saved registers BEFORE the call
	spill_regs(ctx);

	// Prepare arguments (arg_types not actually used by prepare_call_args)
	int stack_space = prepare_call_args(ctx, NULL, args, nargs, true);

	// Load function pointer to RTMP
	load_immediate(ctx, (int64_t)func_ptr, RTMP, true);

	// BLR RTMP (Branch with Link to Register)
	// Encoding: 1101 0110 0011 1111 0000 00rr rrr0 0000
	// where rrrrr = RTMP register number
	EMIT32(ctx,(0xD63F0000) | (RTMP << 5));

	// Clean up stack if we allocated space for args
	if (stack_space > 0) {
		// ADD SP, SP, #stack_space
		encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG);
	}

	// Store return value if needed
	if (dst && dst->t->kind != HVOID) {
		// Always store to stack first (source of truth for later loads)
		if (IS_FLOAT(dst)) {
			str_stack_fp(ctx, V0, dst->stackPos, dst->size);
		} else {
			str_stack(ctx, X0, dst->stackPos, dst->size);
		}

		// Also keep in a register if allocated to a different one
		preg *p = alloc_dst(ctx, dst);
		if (IS_FLOAT(dst)) {
			if (p->kind == RFPU && (Arm64FpReg)p->id != V0) {
				fmov_reg_reg(ctx, (Arm64FpReg)p->id, V0, dst->size);
			}
		} else {
			if (p->kind == RCPU && (Arm64Reg)p->id != X0) {
				mov_reg_reg(ctx, (Arm64Reg)p->id, X0, dst->size);
			}
		}
	}
}

/*
 * Call a native function with a known absolute address
 * The address is embedded directly in the instruction stream (no patching needed)
 */
static void call_native(jit_ctx *ctx, void *nativeFun, int stack_space) {
	// Emit indirect call sequence with the address embedded inline:
	//   LDR X17, #12   ; load target address from PC+12
	//   BLR X17        ; call
	//   B #12          ; skip over the literal
	//   .quad addr     ; 8-byte absolute address (embedded now, not patched later)

	EMIT32(ctx, 0x58000071); // LDR X17, #12
	EMIT32(ctx, 0xD63F0220); // BLR X17
	EMIT32(ctx, 0x14000003); // B #12 (skip 3 instructions = 12 bytes)

	// Embed the native function address directly (little-endian, low word first)
	uint64_t addr = (uint64_t)nativeFun;
	EMIT32(ctx, (uint32_t)(addr & 0xFFFFFFFF)); // Low 32 bits
	EMIT32(ctx, (uint32_t)((addr >> 32) & 0xFFFFFFFF)); // High 32 bits

	// Clean up stack if we allocated space for args
	if (stack_space > 0) {
		encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG);
	}
}

/*
 * Emit a call to a function by its index (without spill/prepare - for use when those are already done)
 * Used by compareFun and other places that set up args manually
 */
static void emit_call_findex(jit_ctx *ctx, int findex, int stack_space) {
	int fid = findex < 0 ? -1 : ctx->m->functions_indexes[findex];
	bool isNative = fid >= ctx->m->code->nfunctions;

	if (fid < 0) {
		jit_error("Invalid function index");
	} else if (isNative) {
		// Native function - address is already resolved
		call_native(ctx, ctx->m->functions_ptrs[findex], stack_space);
	} else {
		// JIT function - use indirect call via literal pool (patched later)
		EMIT32(ctx, 0x58000071); // LDR X17, #12
		EMIT32(ctx, 0xD63F0220); // BLR X17

		// Register literal position for patching
		jlist *j = (jlist*)hl_malloc(&ctx->galloc, sizeof(jlist));
		j->pos = BUF_POS() + 4; // Position of the 8-byte literal (after B instruction)
		j->target = findex;
		j->next = ctx->calls;
		ctx->calls = j;

		EMIT32(ctx, 0x14000003); // B #12 (skip 3 instructions = 12 bytes)
		EMIT32(ctx, 0); // Low 32 bits placeholder
		EMIT32(ctx, 0); // High 32 bits placeholder

		// Clean up stack if we allocated space for args
		if (stack_space > 0) {
			encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG);
		}
	}
}

/*
 * Call a HashLink function (native or JIT-compiled)
 * For OCall0-OCall4, OCallN
 */
static void op_call_hl(jit_ctx *ctx, vreg *dst, int findex, vreg **args, int nargs) {
	// Spill all caller-saved registers BEFORE the call
	// This must happen before prepare_call_args to save values that might be clobbered
	spill_regs(ctx);

	// Prepare arguments
	int stack_space = prepare_call_args(ctx, NULL, args, nargs, false);

	// Check if this is a native function or JIT function
	int fid = findex < 0 ?
-1 : ctx->m->functions_indexes[findex]; + bool isNative = fid >= ctx->m->code->nfunctions; + + if (fid < 0) { + // Invalid function index + jit_error("Invalid function index"); + } else if (isNative) { + // Native function - address is already resolved, call directly + call_native(ctx, ctx->m->functions_ptrs[findex], stack_space); + } else { + // JIT function - use indirect call via literal pool (patched later) + // During JIT compilation, functions_ptrs contains CODE OFFSETS. + // The conversion to absolute addresses happens in hl_jit_code. + // + // Sequence: + // LDR X17, #12 ; load target address from PC+12 + // BLR X17 ; call + // B #12 ; skip over the literal + // .quad addr ; 8-byte address placeholder (patched later) + + EMIT32(ctx, 0x58000071); // LDR X17, #12 + EMIT32(ctx, 0xD63F0220); // BLR X17 + + // Register literal position for patching + jlist *j = (jlist*)hl_malloc(&ctx->galloc, sizeof(jlist)); + j->pos = BUF_POS() + 4; // Position of the 8-byte literal (after B instruction) + j->target = findex; + j->next = ctx->calls; + ctx->calls = j; + + EMIT32(ctx, 0x14000003); // B #12 (skip 3 instructions = 12 bytes) + EMIT32(ctx, 0); // Low 32 bits placeholder + EMIT32(ctx, 0); // High 32 bits placeholder + + // Clean up stack if we allocated space for args + if (stack_space > 0) { + encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG); + } + } + + // Note: spill_regs was already called before prepare_call_args + + // Store return value if needed + if (dst && dst->t->kind != HVOID) { + // Always store to stack first (source of truth for later loads) + if (IS_FLOAT(dst)) { + str_stack_fp(ctx, V0, dst->stackPos, dst->size); + } else { + str_stack(ctx, X0, dst->stackPos, dst->size); + } + + // Also keep in a register if allocated to a different one + preg *p = alloc_dst(ctx, dst); + if (IS_FLOAT(dst)) { + if (p->kind == RFPU && (Arm64FpReg)p->id != V0) { + fmov_reg_reg(ctx, (Arm64FpReg)p->id, V0, dst->size); + } + } else { + if (p->kind == RCPU && 
(Arm64Reg)p->id != X0) { + mov_reg_reg(ctx, (Arm64Reg)p->id, X0, dst->size); + } + } + } +} + +// ============================================================================ +// C↔HL Trampolines +// ============================================================================ + +static void *call_jit_c2hl = NULL; +static void *call_jit_hl2c = NULL; + +// Maximum args for dynamic calls +#define MAX_ARGS 64 + +/** + * Wrapper function for HL->C calls - unpacks arguments and calls the wrapped function. + * Called from jit_hl2c trampoline. + */ +static vdynamic *jit_wrapper_call(vclosure_wrapper *c, char *stack_args, void **regs) { + vdynamic *args[MAX_ARGS]; + int i; + int nargs = c->cl.t->fun->nargs; + int nextCpu = 1; // Skip X0 which holds the closure pointer + int nextFpu = 0; + + if (nargs > MAX_ARGS) + hl_error("Too many arguments for wrapped call"); + + for (i = 0; i < nargs; i++) { + hl_type *t = c->cl.t->fun->args[i]; + + if (t->kind == HF32 || t->kind == HF64) { + // Float argument + if (nextFpu < CALL_NREGS) { + // In FP register - regs[CALL_NREGS + fpu_index] + args[i] = hl_make_dyn(regs + CALL_NREGS + nextFpu, &hlt_f64); + nextFpu++; + } else { + // On stack + args[i] = hl_make_dyn(stack_args, &hlt_f64); + stack_args += 8; + } + } else { + // Integer/pointer argument + if (nextCpu < CALL_NREGS) { + // In CPU register + if (hl_is_dynamic(t)) { + args[i] = *(vdynamic**)(regs + nextCpu); + } else { + args[i] = hl_make_dyn(regs + nextCpu, t); + } + nextCpu++; + } else { + // On stack + if (hl_is_dynamic(t)) { + args[i] = *(vdynamic**)stack_args; + } else { + args[i] = hl_make_dyn(stack_args, t); + } + stack_args += 8; + } + } + } + return hl_dyn_call(c->wrappedFun, args, nargs); +} + +/** + * Wrapper for pointer-returning HL->C calls + */ +static void *jit_wrapper_ptr(vclosure_wrapper *c, char *stack_args, void **regs) { + vdynamic *ret = jit_wrapper_call(c, stack_args, regs); + hl_type *tret = c->cl.t->fun->ret; + switch (tret->kind) { + case HVOID: + return 
NULL; + case HUI8: + case HUI16: + case HI32: + case HBOOL: + return (void*)(int_val)hl_dyn_casti(&ret, &hlt_dyn, tret); + case HI64: + case HGUID: + return (void*)(int_val)hl_dyn_casti64(&ret, &hlt_dyn); + default: + return hl_dyn_castp(&ret, &hlt_dyn, tret); + } +} + +/** + * Wrapper for float-returning HL->C calls + */ +static double jit_wrapper_d(vclosure_wrapper *c, char *stack_args, void **regs) { + vdynamic *ret = jit_wrapper_call(c, stack_args, regs); + return hl_dyn_castd(&ret, &hlt_dyn); +} + +/** + * Select which register to use for an argument based on type and position. + * Returns register ID or -1 if should go on stack. + */ +static int select_call_reg_c2hl(int *nextCpu, int *nextFpu, hl_type *t) { + if (t->kind == HF32 || t->kind == HF64) { + if (*nextFpu < CALL_NREGS) + return RCPU_COUNT + (*nextFpu)++; // FPU register + return -1; // Stack + } else { + if (*nextCpu < CALL_NREGS) + return (*nextCpu)++; // CPU register + return -1; // Stack + } +} + +/** + * Get the stack size for a type + */ +static int stack_size_c2hl(hl_type *t) { + switch (t->kind) { + case HUI8: + case HBOOL: + return 1; + case HUI16: + return 2; + case HI32: + case HF32: + return 4; + default: + return 8; + } +} + +/** + * Callback function that prepares arguments and calls the JIT trampoline. + * Called from C code to invoke JIT-compiled functions. 
+ */ +static void *callback_c2hl(void *_f, hl_type *t, void **args, vdynamic *ret) { + void **f = (void**)_f; + // Stack layout: + // [0..size) = stack args (pushed in reverse) + // [size..size+CALL_NREGS*8) = integer register args (X0-X7) + // [size+CALL_NREGS*8..size+CALL_NREGS*16) = FP register args (V0-V7) + unsigned char stack[MAX_ARGS * 16]; + int nextCpu = 0, nextFpu = 0; + int mappedRegs[MAX_ARGS]; + + // Zero-initialize the stack to avoid passing garbage to unused registers + // The jit_c2hl trampoline loads ALL 8 int + 8 FP registers unconditionally + memset(stack, 0, sizeof(stack)); + + if (t->fun->nargs > MAX_ARGS) + hl_error("Too many arguments for dynamic call"); + + // First pass: determine register assignments and stack size + int i, size = 0; + for (i = 0; i < t->fun->nargs; i++) { + hl_type *at = t->fun->args[i]; + int creg = select_call_reg_c2hl(&nextCpu, &nextFpu, at); + mappedRegs[i] = creg; + if (creg < 0) { + int tsize = stack_size_c2hl(at); + if (tsize < 8) tsize = 8; // Align to 8 bytes on stack + size += tsize; + } + } + + // Align stack size to 16 bytes + int pad = (-size) & 15; + size += pad; + + // Second pass: copy arguments to appropriate locations + int pos = 0; + for (i = 0; i < t->fun->nargs; i++) { + hl_type *at = t->fun->args[i]; + void *v = args[i]; + int creg = mappedRegs[i]; + void *store; + + if (creg >= 0) { + if (creg >= RCPU_COUNT) { + // FP register - stored after integer registers + store = stack + size + CALL_NREGS * 8 + (creg - RCPU_COUNT) * 8; + } else { + // Integer register + store = stack + size + creg * 8; + } + switch (at->kind) { + case HBOOL: + case HUI8: + *(int64*)store = *(unsigned char*)v; + break; + case HUI16: + *(int64*)store = *(unsigned short*)v; + break; + case HI32: + *(int64*)store = *(int*)v; + break; + case HF32: + *(double*)store = *(float*)v; + break; + case HF64: + *(double*)store = *(double*)v; + break; + case HI64: + case HGUID: + *(int64*)store = *(int64*)v; + break; + default: + 
*(void**)store = v; + break; + } + } else { + // Stack argument + store = stack + pos; + int tsize = 8; + switch (at->kind) { + case HBOOL: + case HUI8: + *(int64*)store = *(unsigned char*)v; + break; + case HUI16: + *(int64*)store = *(unsigned short*)v; + break; + case HI32: + case HF32: + *(int64*)store = *(int*)v; + break; + case HF64: + *(double*)store = *(double*)v; + break; + case HI64: + case HGUID: + *(int64*)store = *(int64*)v; + break; + default: + *(void**)store = v; + break; + } + pos += tsize; + } + } + + pos += pad; + pos >>= 3; // Convert to 64-bit units + + // Call the trampoline with: function pointer, reg args pointer, stack args end + switch (t->fun->ret->kind) { + case HUI8: + case HUI16: + case HI32: + case HBOOL: + ret->v.i = ((int (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)stack + pos, stack); + return &ret->v.i; + case HI64: + case HGUID: + ret->v.i64 = ((int64 (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)stack + pos, stack); + return &ret->v.i64; + case HF32: + ret->v.f = ((float (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)stack + pos, stack); + return &ret->v.f; + case HF64: + ret->v.d = ((double (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)stack + pos, stack); + return &ret->v.d; + default: + return ((void *(*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)stack + pos, stack); + } +} + +/** + * Generate the HL-to-C trampoline. + * Called from C code with a vclosure_wrapper* in X0 and native args in X1-X7, V0-V7. + * Saves registers and calls jit_wrapper_ptr or jit_wrapper_d based on return type. + */ +static void jit_hl2c(jit_ctx *ctx) { + hl_type_fun *ft = NULL; + + // Function prologue - save frame + // STP X29, X30, [SP, #-16]! 
+ encode_ldp_stp(ctx, 0x02, 0, 0x03, -2, LR, SP_REG, FP); + // MOV X29, SP + mov_reg_reg(ctx, FP, SP_REG, true); + + // Allocate space for saved registers: 8 CPU regs + 8 FP regs = 16 * 8 = 128 bytes + // SUB SP, SP, #128 + encode_add_sub_imm(ctx, 1, 1, 0, 0, 128, SP_REG, SP_REG); + + // Trampoline marker: MOV W17, #0xE001 (HL2C trampoline) + EMIT32(ctx, 0x52800011 | (0xE001 << 5)); + + // Save integer argument registers X0-X7 at [SP, #0..63] + for (int i = 0; i < CALL_NREGS; i++) { + encode_ldr_str_imm(ctx, 0x03, 0, 0x00, i, SP_REG, i); // STR Xi, [SP, #i*8] + } + + // Save FP argument registers V0-V7 at [SP, #64..127] + for (int i = 0; i < CALL_NREGS; i++) { + encode_ldr_str_imm(ctx, 0x03, 1, 0x00, 8 + i, SP_REG, i); // STR Di, [SP, #(8+i)*8] + } + + // X0 = closure pointer (vclosure_wrapper*) + // Check return type: closure->t->fun->ret->kind + // X9 = X0->t + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, 0, X0, X9); + // X9 = X9->fun (hl_type->fun is at offset 8 on 64-bit) + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, 1, X9, X9); + // X9 = X9->ret (hl_type_fun->ret offset) + int ret_offset = (int)(int_val)&ft->ret; + if (ret_offset < 4096 && (ret_offset % 8) == 0) { + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, ret_offset / 8, X9, X9); + } else { + load_immediate(ctx, ret_offset, RTMP, true); + encode_ldr_str_reg(ctx, 0x03, 0, 0x01, RTMP, 0x03, 0, X9, X9); + } + // W9 = X9->kind (hl_type->kind is at offset 0, 32-bit) + encode_ldr_str_imm(ctx, 0x02, 0, 0x01, 0, X9, X9); + + // Compare with HF64 and HF32 + // CMP W9, #HF64 + encode_add_sub_imm(ctx, 0, 1, 1, 0, HF64, X9, XZR); + int jfloat1 = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); // B.EQ float_path + + // CMP W9, #HF32 + encode_add_sub_imm(ctx, 0, 1, 1, 0, HF32, X9, XZR); + int jfloat2 = BUF_POS(); + encode_branch_cond(ctx, 0, COND_EQ); // B.EQ float_path + + // Integer/pointer path: call jit_wrapper_ptr(closure, stack_args, regs) + // X0 = closure (reload from saved regs) + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, 0, 
SP_REG, X0); + // X1 = stack_args (FP + 16 is return address area, args start after saved frame) + encode_add_sub_imm(ctx, 1, 0, 0, 0, 16, FP, X1); + // X2 = regs pointer (SP) + mov_reg_reg(ctx, X2, SP_REG, true); + + load_immediate(ctx, (int64_t)jit_wrapper_ptr, RTMP, true); + EMIT32(ctx, 0xD63F0000 | (RTMP << 5)); // BLR RTMP + + // Result in X0, jump to exit + int jexit = BUF_POS(); + encode_branch_uncond(ctx, 0); // B exit + + // Float path + int float_pos = BUF_POS(); + patch_jump(ctx, jfloat1, float_pos); + patch_jump(ctx, jfloat2, float_pos); + + // X0 = closure (reload) + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, 0, SP_REG, X0); + // X1 = stack_args + encode_add_sub_imm(ctx, 1, 0, 0, 0, 16, FP, X1); + // X2 = regs pointer + mov_reg_reg(ctx, X2, SP_REG, true); + + load_immediate(ctx, (int64_t)jit_wrapper_d, RTMP, true); + EMIT32(ctx, 0xD63F0000 | (RTMP << 5)); // BLR RTMP + // Result in V0 + + // Exit path + int exit_pos = BUF_POS(); + patch_jump(ctx, jexit, exit_pos); + + // Restore frame and return + // MOV SP, X29 + mov_reg_reg(ctx, SP_REG, FP, true); + // LDP X29, X30, [SP], #16 + encode_ldp_stp(ctx, 0x02, 0, 0x01, 2, LR, SP_REG, FP); + // RET + encode_branch_reg(ctx, 0x02, LR); +} + +/** + * Generate the C-to-HL trampoline. + * Input: X0 = function pointer, X1 = reg args pointer, X2 = stack args end + * The trampoline loads arguments from the prepared stack and calls the function. + */ +static void jit_c2hl(jit_ctx *ctx) { + // Save callee-saved registers and set up frame + // STP X29, X30, [SP, #-16]! 
+ encode_ldp_stp(ctx, 0x02, 0, 0x03, -2, LR, SP_REG, FP); + // MOV X29, SP + mov_reg_reg(ctx, FP, SP_REG, true); + + // Trampoline marker: MOV W17, #0xE002 (C2HL trampoline) + EMIT32(ctx, 0x52800011 | (0xE002 << 5)); + + // Save function pointer to X9 (caller-saved, will survive loads) + // MOV X9, X0 + mov_reg_reg(ctx, X9, X0, true); + + // Save stack args pointers to X10, X11 + // MOV X10, X1 (reg args pointer) + // MOV X11, X2 (stack args end) + mov_reg_reg(ctx, X10, X1, true); + mov_reg_reg(ctx, X11, X2, true); + + // Load integer register arguments X0-X7 from [X10] + for (int i = 0; i < CALL_NREGS; i++) { + // LDR Xi, [X10, #i*8] + encode_ldr_str_imm(ctx, 0x03, 0, 0x01, i, X10, CALL_REGS[i]); + } + + // Load FP register arguments V0-V7 from [X10 + CALL_NREGS*8] + for (int i = 0; i < CALL_NREGS; i++) { + // LDR Di, [X10, #(CALL_NREGS + i)*8] + // Using 64-bit FP load: size=11, opc=01 + EMIT32(ctx,0xFD400000 | (((CALL_NREGS + i) & 0x1FF) << 10) | (X10 << 5) | FP_CALL_REGS[i]); + } + + // Push stack args from buffer to actual stack + // X10 = end of stack args in buffer (= start of reg args area) + // X11 = start of stack args in buffer + // Stack args size = X10 - X11 (already 16-byte aligned by callback_c2hl) + + // Calculate stack args size: X12 = X10 - X11 + // SUB X12, X10, X11 + encode_add_sub_reg(ctx, 1, 1, 0, 0, X11, 0, X10, X12); + + // Skip if no stack args: CBZ X12, done + int cbz_pos = BUF_POS(); + EMIT32(ctx, 0xB4000000 | (X12 & 0x1F)); // CBZ X12, + + // Allocate stack space: SUB SP, SP, X12 + encode_add_sub_reg(ctx, 1, 1, 0, 0, X12, 0, SP_REG, SP_REG); + + // Copy args contiguously at 8-byte intervals + // X13 = destination pointer (SP), X14 = source pointer (X11) + mov_reg_reg(ctx, X13, SP_REG, true); // MOV X13, SP + mov_reg_reg(ctx, X14, X11, true); // MOV X14, X11 (source = buffer start) + + // Loop: copy 8 bytes at a time until X14 reaches X10 + int loop_start = BUF_POS(); + // CMP X14, X10 + encode_add_sub_reg(ctx, 1, 1, 1, 0, X10, 0, X14, 
XZR); + // B.GE done (if X14 >= X10, we've copied all args) + int bge_pos = BUF_POS(); + EMIT32(ctx, 0x54000000 | (COND_GE & 0xF)); // B.GE (will patch) + + // LDR X15, [X14], #8 (load and post-increment source) + EMIT32(ctx, 0xF8408000 | (8 << 12) | (X14 << 5) | X15); // LDR X15, [X14], #8 + + // STR X15, [X13], #8 (store and post-increment destination) + EMIT32(ctx, 0xF8008000 | (8 << 12) | (X13 << 5) | X15); // STR X15, [X13], #8 + + // B loop_start + int b_offset = (loop_start - BUF_POS()) / 4; + EMIT32(ctx, 0x14000000 | (b_offset & 0x3FFFFFF)); + + // Patch the CBZ and B.GE to jump here + int done_pos = BUF_POS(); + int cbz_offset = (done_pos - cbz_pos) / 4; + ctx->buf.w = (unsigned int*)(ctx->startBuf + cbz_pos); + EMIT32(ctx, 0xB4000000 | ((cbz_offset & 0x7FFFF) << 5) | (X12 & 0x1F)); + int bge_offset = (done_pos - bge_pos) / 4; + ctx->buf.w = (unsigned int*)(ctx->startBuf + bge_pos); + EMIT32(ctx, 0x54000000 | ((bge_offset & 0x7FFFF) << 5) | (COND_GE & 0xF)); + ctx->buf.w = (unsigned int*)(ctx->startBuf + done_pos); + + // Call the function: BLR X9 + EMIT32(ctx,0xD63F0000 | (X9 << 5)); + + // Restore frame and return + // MOV SP, X29 + mov_reg_reg(ctx, SP_REG, FP, true); + // LDP X29, X30, [SP], #16 + encode_ldp_stp(ctx, 0x02, 0, 0x01, 2, LR, SP_REG, FP); + // RET + encode_branch_reg(ctx, 0x02, LR); +} + +/** + * Get wrapper function for HL-to-C calls. + * This is used for callbacks from C code back into HashLink. + * Returns the jit_hl2c trampoline address. 
+ */ +static void *get_wrapper(hl_type *t) { + return call_jit_hl2c; +} + +// ============================================================================ +// JIT API Implementation +// ============================================================================ + +// Forward declaration +static void hl_jit_init_module(jit_ctx *ctx, hl_module *m); + +jit_ctx *hl_jit_alloc() { + jit_ctx *ctx = (jit_ctx*)malloc(sizeof(jit_ctx)); + if (ctx == NULL) + return NULL; + memset(ctx, 0, sizeof(jit_ctx)); + return ctx; +} + +void hl_jit_free(jit_ctx *ctx, h_bool can_reset) { + if (ctx == NULL) + return; + + if (ctx->startBuf) + free(ctx->startBuf); + if (ctx->vregs) + free(ctx->vregs); + if (ctx->opsPos) + free(ctx->opsPos); + + free(ctx); +} + +void hl_jit_reset(jit_ctx *ctx, hl_module *m) { + ctx->debug = NULL; + hl_jit_init_module(ctx, m); +} + +/** + * Build a JIT helper function, ensuring buffer is allocated. + * Returns the position in the buffer where the function starts. + */ +static int jit_build(jit_ctx *ctx, void (*fbuild)(jit_ctx *)) { + int pos; + jit_buf(ctx); // Ensure buffer is allocated + pos = BUF_POS(); + fbuild(ctx); + return pos; +} + +/** + * Initialize module-specific data in JIT context. 
+ */ +static void hl_jit_init_module(jit_ctx *ctx, hl_module *m) { + int i; + ctx->m = m; + ctx->closure_list = NULL; + + // Allocate debug info array if bytecode has debug info + if (m->code->hasdebug && m->code->nfunctions > 0) { + ctx->debug = (hl_debug_infos*)malloc(sizeof(hl_debug_infos) * m->code->nfunctions); + if (ctx->debug) + memset(ctx->debug, 0, sizeof(hl_debug_infos) * m->code->nfunctions); + } + + // Store float constants in the code buffer (like x86 does) + for (i = 0; i < m->code->nfloats; i++) { + jit_buf(ctx); + *ctx->buf.d++ = m->code->floats[i]; + } +} + +void hl_jit_init(jit_ctx *ctx, hl_module *m) { + hl_jit_init_module(ctx, m); + + // Generate C↔HL trampolines + ctx->c2hl = jit_build(ctx, jit_c2hl); + ctx->hl2c = jit_build(ctx, jit_hl2c); +} + +/** + * Allocate a static closure object. + * For native functions, the function pointer is set immediately. + * For JIT functions, the function pointer is stored temporarily as the findex + * and the closure is added to closure_list for later patching. 
 */
static vclosure *alloc_static_closure(jit_ctx *ctx, int fid) {
	hl_module *m = ctx->m;
	vclosure *c = hl_malloc(&m->ctx.alloc, sizeof(vclosure));
	int fidx = m->functions_indexes[fid];
	c->hasValue = 0;
	if (fidx >= m->code->nfunctions) {
		// Native function - pointer is already resolved
		c->t = m->code->natives[fidx - m->code->nfunctions].t;
		c->fun = m->functions_ptrs[fid];
		c->value = NULL;
	} else {
		// JIT function - store fid temporarily, add to closure_list for patching
		c->t = m->code->functions[fidx].type;
		c->fun = (void*)(int_val)fid;
		// value doubles as the closure_list link until patching resolves fun
		c->value = ctx->closure_list;
		ctx->closure_list = c;
	}
	return c;
}

// Compile one HashLink function into the code buffer.
// Returns 0 on success, -1 on allocation failure.
int hl_jit_function(jit_ctx *ctx, hl_module *m, hl_function *f) {
	int i, size = 0, opCount;
	int codePos = BUF_POS();
	int nargs = f->type->fun->nargs;
	unsigned short *debug16 = NULL;
	int *debug32 = NULL;

	ctx->f = f;
	ctx->m = m;
	ctx->allocOffset = 0;

	// Allocate virtual register array if needed
	// Always reallocate to ensure clean state (no stale data from previous functions)
	free(ctx->vregs);
	ctx->vregs = (vreg*)calloc(f->nregs + 1, sizeof(vreg));
	if (ctx->vregs == NULL) {
		ctx->maxRegs = 0;
		return -1;
	}
	ctx->maxRegs = f->nregs;

	// Allocate opcode position array if needed
	if (f->nops > ctx->maxOps) {
		free(ctx->opsPos);
		ctx->opsPos = (int*)malloc(sizeof(int) * (f->nops + 1));
		if (ctx->opsPos == NULL) {
			ctx->maxOps = 0;
			return -1;
		}
		ctx->maxOps = f->nops;
	}

	memset(ctx->opsPos, 0, (f->nops + 1) * sizeof(int));

	// Clear/initialize physical registers
	for (i = 0; i < RCPU_COUNT; i++) {
		preg *p = &ctx->pregs[i];
		p->kind = RCPU;
		p->id = i;
		p->holds = NULL;
		p->lock = 0;
	}
	for (i = 0; i < RFPU_COUNT; i++) {
		preg *p = &ctx->pregs[RCPU_COUNT + i];
		p->kind = RFPU;
		p->id = i;
		p->holds = NULL;
		p->lock = 0;
	}

	// Initialize virtual registers
	for (i = 0; i < f->nregs; i++) {
		vreg *r = R(i);
		r->t = f->regs[i];
		r->size = hl_type_size(r->t);
r->stackPos = 0; + r->current = NULL; + r->stack.holds = NULL; + r->stack.id = i; + r->stack.kind = RSTACK; + r->stack.lock = 0; + } + + // Calculate stack layout + // Arguments: first 8 integer args in X0-X7, first 8 FP args in V0-V7 + // Additional args on stack + size = 0; + int argsSize = 0; + int int_arg_count = 0; + int fp_arg_count = 0; + + for (i = 0; i < nargs; i++) { + vreg *r = R(i); + bool is_fp = IS_FLOAT(r); + int *arg_count = is_fp ? &fp_arg_count : &int_arg_count; + + if (*arg_count < CALL_NREGS) { + // Argument is in register - allocate stack space for it + size += r->size; + size += hl_pad_size(size, r->t); + r->stackPos = -size; + (*arg_count)++; + } else { + // Argument is on stack (caller's frame) + // Offset by CALLEE_SAVED_FRAME_SIZE to skip saved registers + // Each stack arg occupies 8 bytes (matching caller's prepare_call_args) + r->stackPos = argsSize + CALLEE_SAVED_FRAME_SIZE; + argsSize += 8; + } + } + + // Local variables + for (i = nargs; i < f->nregs; i++) { + vreg *r = R(i); + size += r->size; + size += hl_pad_size(size, r->t); + r->stackPos = -size; + } + + // Align stack to 16 bytes + size += (-size) & 15; + ctx->totalRegsSize = size; + + jit_buf(ctx); + ctx->functionPos = BUF_POS(); + ctx->currentPos = 1; + + // Initialize Phase 2 callee-saved tracking + ctx->callee_saved_used = 0; + ctx->fpu_callee_saved_used = 0; + memset(ctx->stp_positions, 0, sizeof(ctx->stp_positions)); + memset(ctx->ldp_positions, 0, sizeof(ctx->ldp_positions)); + memset(ctx->stp_fpu_positions, 0, sizeof(ctx->stp_fpu_positions)); + memset(ctx->ldp_fpu_positions, 0, sizeof(ctx->ldp_fpu_positions)); + + // Function prologue - offset-based for selective NOP patching (Phase 2) + // Reserve space for callee-saved registers + FP/LR + encode_add_sub_imm(ctx, 1, 1, 0, 0, CALLEE_SAVED_FRAME_SIZE, SP_REG, SP_REG); // SUB SP, SP, #CALLEE_SAVED_FRAME_SIZE + + // Save callee-saved at fixed offsets (NOPpable) - positions recorded for backpatching + ctx->stp_positions[0] 
= BUF_POS(); + stp_offset(ctx, X27, X28, SP_REG, 80); // STP X27, X28, [SP, #80] + + ctx->stp_positions[1] = BUF_POS(); + stp_offset(ctx, X25, X26, SP_REG, 64); // STP X25, X26, [SP, #64] + + ctx->stp_positions[2] = BUF_POS(); + stp_offset(ctx, X23, X24, SP_REG, 48); // STP X23, X24, [SP, #48] + + ctx->stp_positions[3] = BUF_POS(); + stp_offset(ctx, X21, X22, SP_REG, 32); // STP X21, X22, [SP, #32] + + ctx->stp_positions[4] = BUF_POS(); + stp_offset(ctx, X19, X20, SP_REG, 16); // STP X19, X20, [SP, #16] + + // Save FPU callee-saved at fixed offsets (NOPpable) + ctx->stp_fpu_positions[0] = BUF_POS(); + stp_offset_fp(ctx, V8, V9, SP_REG, 96); // STP D8, D9, [SP, #96] + + ctx->stp_fpu_positions[1] = BUF_POS(); + stp_offset_fp(ctx, V10, V11, SP_REG, 112); // STP D10, D11, [SP, #112] + + ctx->stp_fpu_positions[2] = BUF_POS(); + stp_offset_fp(ctx, V12, V13, SP_REG, 128); // STP D12, D13, [SP, #128] + + ctx->stp_fpu_positions[3] = BUF_POS(); + stp_offset_fp(ctx, V14, V15, SP_REG, 144); // STP D14, D15, [SP, #144] + + // Save FP/LR at bottom (NOT NOPpable - always needed) + stp_offset(ctx, FP, LR, SP_REG, 0); // STP X29, X30, [SP, #0] + + // MOV X29, SP ; Set frame pointer (points to saved FP/LR) + mov_reg_reg(ctx, FP, SP_REG, true); + + // SUB SP, SP, #size ; Allocate stack space + if (size > 0) { + if (size < 4096) { + encode_add_sub_imm(ctx, 1, 1, 0, 0, size, SP_REG, SP_REG); + } else { + // Large stack frame - use multiple instructions + // Must use extended register form (UXTX) for SP, not shifted register + load_immediate(ctx, size, RTMP, true); + encode_add_sub_ext(ctx, 1, 1, 0, RTMP, 3, 0, SP_REG, SP_REG); // SUB SP, SP, RTMP, UXTX + } + } + + // Function marker: MOV W17, #(0xF000 | (findex & 0xFFF)) ; MOVK W17, #(findex >> 12), LSL #16 + // This encodes as 0xFnnnnnnn where nnnnnnn is the function index + // Distinguishes from opcode markers which are smaller numbers + { + int findex = f->findex; + int low12 = 0xF000 | (findex & 0xFFF); + int high = (findex >> 12) 
& 0xFFFF; + // MOV W17, #low12 + EMIT32(ctx, 0x52800011 | (low12 << 5)); + if (high != 0) { + // MOVK W17, #high, LSL #16 + EMIT32(ctx, 0x72A00011 | (high << 5)); + } + } + + // Store register arguments to their stack locations FIRST + // (before we clobber the argument registers with zero-init) + int_arg_count = 0; + fp_arg_count = 0; + for (i = 0; i < nargs && i < f->nregs; i++) { + vreg *r = R(i); + bool is_fp = IS_FLOAT(r); + int *arg_count = is_fp ? &fp_arg_count : &int_arg_count; + + if (*arg_count < CALL_NREGS) { + // This arg was in a register - store it to stack + // Skip void arguments (size 0) - they don't need storage + // but still consume a call register slot + if (r->size > 0) { + if (is_fp) { + str_stack_fp(ctx, FP_CALL_REGS[fp_arg_count], r->stackPos, r->size); + } else { + str_stack(ctx, CALL_REGS[int_arg_count], r->stackPos, r->size); + } + } + (*arg_count)++; + } + } + + // Zero-initialize local variables on stack (not arguments) + // This ensures reading unassigned locals returns null/0 + if (f->nregs > nargs) { + // Store zeros to each local variable slot using XZR + for (i = nargs; i < f->nregs; i++) { + vreg *r = R(i); + if (r->size > 0 && r->stackPos < 0) { + // Use str_stack with XZR as source + if (r->size == 8) { + load_immediate(ctx, r->stackPos, RTMP, true); + encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP); + encode_ldr_str_imm(ctx, 0x03, 0, 0x00, 0, RTMP, XZR); + } else if (r->size == 4) { + load_immediate(ctx, r->stackPos, RTMP, true); + encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP); + encode_ldr_str_imm(ctx, 0x02, 0, 0x00, 0, RTMP, XZR); + } else if (r->size == 2) { + load_immediate(ctx, r->stackPos, RTMP, true); + encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP); + encode_ldr_str_imm(ctx, 0x01, 0, 0x00, 0, RTMP, XZR); + } else if (r->size == 1) { + load_immediate(ctx, r->stackPos, RTMP, true); + encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP); + encode_ldr_str_imm(ctx, 0x00, 0, 0x00, 0, RTMP, XZR); 
+ } + } + } + } + + ctx->opsPos[0] = BUF_POS(); + + // Initialize debug offset tracking + if (ctx->m->code->hasdebug) { + debug16 = (unsigned short*)malloc(sizeof(unsigned short) * (f->nops + 1)); + debug16[0] = (unsigned short)(BUF_POS() - codePos); + } + + // Main opcode translation loop + for (opCount = 0; opCount < f->nops; opCount++) { + hl_opcode *o = f->ops + opCount; + vreg *dst = R(o->p1); + vreg *ra = R(o->p2); + vreg *rb = R(o->p3); + + ctx->currentPos = opCount + 1; + jit_buf(ctx); + + // Emit opcode marker for debugging: MOV W17, #(opcode | (opCount << 8)) + // W17 is IP1, a scratch register. This encodes both the opcode type and index. + { + int marker = (o->op & 0xFF) | ((opCount & 0xFF) << 8); + EMIT32(ctx, 0x52800011 | ((marker & 0xFFFF) << 5)); // MOV W17, #marker + } + + // Before a label (merge point), spill dirty registers for fallthrough path. + // After spilling, update the label position so jumps bypass the spill code. + // discard_regs() in op_label just clears bindings (no code). 
+ if (o->op == OLabel) { + spill_regs(ctx); + // Update label position AFTER spill - jumps should target here, + // not before the spill (which is only for fallthrough path) + ctx->opsPos[opCount] = BUF_POS(); + } + + // Emit code based on opcode + switch (o->op) { + case OMov: + case OUnsafeCast: + op_mov(ctx, dst, ra); + break; + + case OInt: + store_const(ctx, dst, m->code->ints[o->p2]); + break; + + case OBool: + store_const(ctx, dst, o->p2); + break; + + case ONull: + // Set register to NULL (0) + store_const(ctx, dst, 0); + break; + + case OFloat: { + // Load float constant from module + // Float constants are stored at the start of the code buffer (offset o->p2 * 8) + double float_val = m->code->floats[o->p2]; + preg *dst_reg = alloc_fpu(ctx); + + if (float_val == 0.0) { + // Zero out FP register: FMOV Dd, XZR + // FMOV Dd, XZR: sf=1, S=0, type=01, rmode=00, opcode=000111, Rn=31, Rd + EMIT32(ctx, (1 << 31) | (0 << 29) | (0x1E << 24) | (1 << 22) | (1 << 21) | (7 << 16) | (31 << 5) | dst_reg->id); + } else { + // Float constants are at the start of the code buffer + // Calculate PC-relative offset from current position to the float data + int float_offset = o->p2 * 8; // Offset from start of code buffer + int cur_pos = BUF_POS(); // Current position in code buffer + int pc_offset = float_offset - cur_pos; // PC-relative offset + + // LDR Dt,