diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0ff56c5cd..d95974115 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -326,23 +326,21 @@ jobs:
${{ env.WINDOWS_BUILD_FOLDER }}/hl.exe --version
;;
make*)
- if [[ ${{ matrix.architecture }} != arm64 ]]; then
- ./hl --version
- case ${{ matrix.target }} in
- linux) ldd -v ./hl ;;
- darwin) otool -L ./hl ;;
- esac
-
- haxe -hl hello.hl -cp other/tests -main HelloWorld -D interp \
- ${{ matrix.architecture == 32 && '-D hl-legacy32' || '' }}
- ./hl hello.hl
-
- # ensure the executable still works when installed globally
- cp hello.hl /tmp
- pushd /tmp
- hl hello.hl
- popd
- fi
+ ./hl --version
+ case ${{ matrix.target }} in
+ linux) ldd -v ./hl ;;
+ darwin) otool -L ./hl ;;
+ esac
+
+ haxe -hl hello.hl -cp other/tests -main HelloWorld -D interp \
+ ${{ matrix.architecture == 32 && '-D hl-legacy32' || '' }}
+ ./hl hello.hl
+
+ # ensure the executable still works when installed globally
+ cp hello.hl /tmp
+ pushd /tmp
+ hl hello.hl
+ popd
haxe -hl src/_main.c -cp other/tests -main HelloWorld
make hlc
@@ -431,9 +429,6 @@ jobs:
os: [darwin, linux, windows]
architecture: [x86_32, x86_64, arm64]
include:
- - architecture: arm64
- test-flags: --skip-hl-jit # not yet supported
-
- architecture: x86_32
test-flags: --skip-hl-jit # not stable
@@ -492,12 +487,7 @@ jobs:
tar -xvzf hashlink-*.tar.gz
cd $(echo hashlink-*/)
sudo mkdir -p /usr/local/{bin,lib}
- if [[ "${{ matrix.architecture }}" == "arm64" ]]; then
- echo "echo Jit is not supported on arm64" > /usr/local/bin/hl
- chmod +x /usr/local/bin/hl
- else
- sudo cp hl /usr/local/bin/
- fi
+ sudo cp hl /usr/local/bin/
sudo cp libhl.* *.hdll /usr/local/lib/
- name: Install binary (Windows)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 34c8755fd..d77416270 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,12 +19,7 @@ include(GNUInstallDirs)
include(FindPkgConfig)
include(CTest)
-set(WITH_VM_DEFAULT ON)
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64" AND (NOT CMAKE_OSX_ARCHITECTURES MATCHES "x86_64"))
- set(WITH_VM_DEFAULT OFF)
-endif()
-
-option(WITH_VM "Whether to build the Hashlink virtual machine" ${WITH_VM_DEFAULT})
+option(WITH_VM "Whether to build the Hashlink virtual machine" ON)
option(BUILD_SHARED_LIBS "Build using shared libraries" ON)
if(BUILD_SHARED_LIBS)
# ensure third-party static libs are built with PIC
@@ -59,6 +54,10 @@ if (MINGW)
add_link_options(-municode)
endif()
+if(NOT MSVC)
+ add_compile_options(-fno-omit-frame-pointer)
+endif()
+
list(APPEND CMAKE_MODULE_PATH
${CMAKE_CURRENT_SOURCE_DIR}/other/cmake
)
@@ -224,9 +223,24 @@ else()
endif()
if (WITH_VM)
+ # Select JIT backend based on target architecture
+ # On macOS, CMAKE_OSX_ARCHITECTURES overrides CMAKE_SYSTEM_PROCESSOR for cross-compilation
+ if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64" AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
+ set(JIT_SOURCES
+ src/jit_aarch64.c
+ src/jit_aarch64_emit.c
+ src/jit_common.c
+ )
+ else()
+ set(JIT_SOURCES
+ src/jit_x86.c
+ src/jit_common.c
+ )
+ endif()
+
add_executable(hl
src/code.c
- src/jit.c
+ ${JIT_SOURCES}
src/main.c
src/module.c
src/debugger.c
@@ -246,6 +260,8 @@ if (WITH_VM)
if (WIN32)
target_link_libraries(hl user32)
+ else()
+ target_link_libraries(hl dl)
endif()
endif()
diff --git a/Makefile b/Makefile
index 52c80ab12..3b079ec6e 100644
--- a/Makefile
+++ b/Makefile
@@ -41,7 +41,13 @@ STD = src/std/array.o src/std/buffer.o src/std/bytes.o src/std/cast.o src/std/da
src/std/socket.o src/std/string.o src/std/sys.o src/std/types.o src/std/ucs2.o src/std/thread.o src/std/process.o \
src/std/track.o
-HL_OBJ = src/code.o src/jit.o src/main.o src/module.o src/debugger.o src/profile.o
+ifneq (,$(filter aarch64 arm64,$(ARCH)))
+ HL_JIT = src/jit_aarch64.o src/jit_aarch64_emit.o src/jit_common.o
+else
+ HL_JIT = src/jit_x86.o src/jit_common.o
+endif
+
+HL_OBJ = src/code.o $(HL_JIT) src/main.o src/module.o src/debugger.o src/profile.o
FMT_CPPFLAGS = -I include/mikktspace -I include/minimp3
@@ -239,19 +245,12 @@ LIBHL = libhl.$(LIBEXT)
HL = hl$(EXE_SUFFIX)
HLC = hlc$(EXE_SUFFIX)
-all: $(LIBHL) libs
-ifeq ($(ARCH),arm64)
- $(warning HashLink vm is not supported on arm64, skipping)
-else
-all: $(HL)
-endif
+all: $(LIBHL) libs $(HL)
install:
$(UNAME)==Darwin && ${MAKE} uninstall
-ifneq ($(ARCH),arm64)
mkdir -p $(INSTALL_BIN_DIR)
cp $(HL) $(INSTALL_BIN_DIR)
-endif
mkdir -p $(INSTALL_LIB_DIR)
cp *.hdll $(INSTALL_LIB_DIR)
cp $(LIBHL) $(INSTALL_LIB_DIR)
@@ -362,11 +361,7 @@ release_win:
rm -rf $(PACKAGE_NAME)
release_linux release_osx:
-ifeq ($(ARCH),arm64)
- cp $(LIBHL) *.hdll $(PACKAGE_NAME)
-else
cp $(HL) $(LIBHL) *.hdll $(PACKAGE_NAME)
-endif
tar -cvzf $(PACKAGE_NAME).tar.gz $(PACKAGE_NAME)
rm -rf $(PACKAGE_NAME)
diff --git a/hl.vcxproj b/hl.vcxproj
index 88e95b28b..389a83b38 100644
--- a/hl.vcxproj
+++ b/hl.vcxproj
@@ -360,7 +360,8 @@
-
+
+
@@ -369,6 +370,7 @@
+
diff --git a/hl.vcxproj.filters b/hl.vcxproj.filters
index f86723996..7adf1423c 100644
--- a/hl.vcxproj.filters
+++ b/hl.vcxproj.filters
@@ -4,7 +4,8 @@
-
+
+
@@ -13,5 +14,6 @@
+
\ No newline at end of file
diff --git a/other/osx/entitlements.xml b/other/osx/entitlements.xml
index c834321b2..d61d415fb 100644
--- a/other/osx/entitlements.xml
+++ b/other/osx/entitlements.xml
@@ -4,5 +4,7 @@
com.apple.security.get-task-allow
+	<key>com.apple.security.cs.allow-jit</key>
+	<true/>
\ No newline at end of file
diff --git a/src/gc.c b/src/gc.c
index d4da96fd8..6c81fe318 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -1149,6 +1149,10 @@ HL_PRIM void *hl_alloc_executable_memory( int size ) {
return malloc(size);
#elif defined(HL_CONSOLE)
return NULL;
+#elif defined(__APPLE__) && defined(__aarch64__)
+ void *p;
+ p = mmap(NULL,size,PROT_READ|PROT_WRITE|PROT_EXEC,(MAP_PRIVATE|MAP_ANONYMOUS|MAP_JIT),-1,0);
+ return p;
#else
void *p;
p = mmap(NULL,size,PROT_READ|PROT_WRITE|PROT_EXEC,(MAP_PRIVATE|MAP_ANONYMOUS),-1,0);
diff --git a/src/hl.h b/src/hl.h
index 2da71da81..19c8c64e5 100644
--- a/src/hl.h
+++ b/src/hl.h
@@ -93,7 +93,7 @@
# define HL_BSD
#endif
-#if defined(_64BITS) || defined(__x86_64__) || defined(_M_X64) || defined(__LP64__) || defined(__wasm64__)
+#if defined(_64BITS) || defined(__x86_64__) || defined(_M_X64) || defined(__LP64__) || defined(__wasm64__) || defined(__aarch64__)
# define HL_64
#endif
diff --git a/src/jit_aarch64.c b/src/jit_aarch64.c
new file mode 100644
index 000000000..1e0d732b7
--- /dev/null
+++ b/src/jit_aarch64.c
@@ -0,0 +1,6836 @@
+/*
+ * Copyright (C)2015-2016 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * AArch64 JIT Backend
+ *
+ * This file contains the AArch64-specific implementation of the HashLink JIT compiler.
+ * It translates HashLink bytecode into native AArch64 machine code.
+ *
+ * Key differences from x86:
+ * - Fixed 32-bit instruction encoding (vs variable-length on x86)
+ * - Load/store architecture (no direct memory operands in ALU ops)
+ * - AAPCS64 calling convention (8 register args vs 4/6 on x86-64)
+ * - Condition codes with branch instructions (no conditional moves in base ISA)
+ * - PC-relative addressing with ADRP/ADD for globals
+ */
+
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+# error "This file is for AArch64 architecture only. Use jit_x86.c for x86/x64."
+#endif
+
+#include <stddef.h>	/* offsetof (FIELD_OFFSET_SCALED) - header names lost in extraction, reconstructed */
+#include <stdio.h>	/* printf */
+#include <stdlib.h>
+#include <string.h>	/* memcmp */
+#include <stdbool.h>	/* bool */
+#include "jit_common.h"
+#include "jit_aarch64_emit.h"
+#include "hlsystem.h"
+
+// macOS ARM64 W^X support: MAP_JIT memory requires toggling write protection
+#if defined(__APPLE__) && defined(__aarch64__)
+#include <pthread.h>	/* pthread_jit_write_protect_np */
+#define JIT_ENABLE_WRITE() pthread_jit_write_protect_np(false)
+#define JIT_ENABLE_EXEC() pthread_jit_write_protect_np(true)
+#else
+#define JIT_ENABLE_WRITE()
+#define JIT_ENABLE_EXEC()
+#endif
+
+// Helper for LDR/STR scaled offset from struct field
+#define FIELD_OFFSET_SCALED(type, field) (offsetof(type, field) / 8)
+
+// ============================================================================
+// AArch64 Register Configuration (AAPCS64)
+// ============================================================================
+
+/*
+ * AAPCS64 (ARM Architecture Procedure Call Standard for ARM64)
+ *
+ * Register Usage:
+ * - X0-X7: Argument/result registers (caller-saved)
+ * - X8: Indirect result location register (caller-saved)
+ * - X9-X15: Temporary registers (caller-saved)
+ * - X16-X17: Intra-procedure-call temporary registers (caller-saved)
+ * - X18: Platform register (avoid use - may be reserved by OS)
+ * - X19-X28: Callee-saved registers
+ * - X29: Frame pointer (FP)
+ * - X30: Link register (LR)
+ * - SP: Stack pointer (must be 16-byte aligned)
+ *
+ * FP/SIMD Registers:
+ * - V0-V7: Argument/result registers (caller-saved)
+ * - V8-V15: Callee-saved (only lower 64 bits, D8-D15)
+ * - V16-V31: Temporary registers (caller-saved)
+ */
+
+#define RCPU_COUNT 31 // X0-X30 (SP is not a general register)
+#define RFPU_COUNT 32 // V0-V31
+
+// Calling convention: first 8 args in X0-X7
+#define CALL_NREGS 8
+static const Arm64Reg CALL_REGS[] = { X0, X1, X2, X3, X4, X5, X6, X7 };
+static const Arm64FpReg FP_CALL_REGS[] = { V0, V1, V2, V3, V4, V5, V6, V7 };
+
+// Caller-saved (scratch) registers available for vreg allocation: X0-X15
+// X16 (IP0) and X17 (IP1) are reserved as RTMP/RTMP2 for multi-instruction
+// sequences and opcode debug markers — they must NOT be in this pool.
+// X18 is the platform register (reserved on some OSes).
+#define RCPU_SCRATCH_COUNT 16
+
+// vdynamic structure: type (8 bytes) + value (8 bytes)
+#define HDYN_VALUE 8
+static const Arm64Reg RCPU_SCRATCH_REGS[] = {
+ X0, X1, X2, X3, X4, X5, X6, X7,
+ X8, X9, X10, X11, X12, X13, X14, X15
+};
+
+// All FP registers are caller-saved in AAPCS64
+#define RFPU_SCRATCH_COUNT 32
+
+// Callee-saved registers: X19-X28
+// X29 (FP) and X30 (LR) are also callee-saved but handled specially
+#define RCPU_CALLEE_SAVED_COUNT 10
+
+// Callee-saved registers available for allocation
+// These survive function calls, so we don't need to spill them before BLR
+#define RCPU_CALLEE_ALLOC_COUNT 10
+static const Arm64Reg RCPU_CALLEE_ALLOC[] = {
+ X19, X20, X21, X22, X23, X24, X25, X26, X27, X28
+};
+
+// FP callee-saved: V8-V15 (only lower 64 bits per AAPCS64)
+#define RFPU_CALLEE_SAVED_COUNT 8
+
+// Frame size for callee-saved registers + FP/LR
+// CPU callee-saved: RCPU_CALLEE_SAVED_COUNT * 8 bytes = 80 bytes
+// FPU callee-saved: RFPU_CALLEE_SAVED_COUNT * 8 bytes = 64 bytes (D8-D15, lower 64 bits only)
+// FP/LR: 16 bytes
+// Total: 160 bytes (must be 16-byte aligned)
+#define CALLEE_SAVED_FRAME_SIZE (RCPU_CALLEE_SAVED_COUNT * 8 + RFPU_CALLEE_SAVED_COUNT * 8 + 16)
+
+// Helper macros for accessing registers
+#define REG_COUNT (RCPU_COUNT + RFPU_COUNT)
+#define VFPR(i) ((i) + RCPU_COUNT) // FP register index
+#define PVFPR(i) REG_AT(VFPR(i)) // Pointer to FP register
+
+// Reserved registers for JIT internal use (caller-saved, no need to preserve)
+#define RTMP X16 // Temporary register for multi-instruction sequences (IP0)
+#define RTMP2 X17 // Second temporary register (IP1)
+
+// Special purpose registers
+#define RFP X29 // Frame pointer
+#define RLR X30 // Link register
+
+// Stack alignment requirement
+#define STACK_ALIGN 16
+
+// EMIT32 is defined in jit_common.h - use EMIT32(ctx,ctx, val)
+
+// ============================================================================
+// Error Handling
+// ============================================================================
+
+/**
+ * Compile-time JIT error: print a diagnostic and abort code generation.
+ * @param ctx  compilation context (may be NULL)
+ * @param msg  human-readable error message
+ * @param line source line in jit_aarch64.c where the error was raised
+ * Does not return control to code generation: hands off to jit_exit().
+ */
+void _jit_error(jit_ctx *ctx, const char *msg, int line) {
+	printf("JIT ERROR: %s (jit_aarch64.c:%d)\n", msg, line);
+	// hl_function doesn't have a 'name' field directly; the function
+	// object info would be in the module, so report the function index.
+	// Guard ctx->m/ctx->m->code too: the original only checked ctx->f
+	// before dereferencing ctx->m->code->functions.
+	if (ctx && ctx->f && ctx->m && ctx->m->code) {
+		int func_index = (int)(ctx->f - ctx->m->code->functions);
+		printf("In function at index %d\n", func_index);
+	}
+	jit_exit();
+}
+
+/**
+ * Runtime error handler invoked from JIT-compiled code.
+ * @param msg  error message
+ * @param line source line; passed as int_val (register width) because it
+ *             comes from generated code, narrowed to int for printing
+ * Hands control to jit_exit() after printing.
+ */
+void on_jit_error(const char *msg, int_val line) {
+	printf("JIT Runtime Error: %s (line %d)\n", msg, (int)line);
+	jit_exit();
+}
+
+#define JIT_ASSERT(cond) do { if (!(cond)) { \
+ printf("JIT ASSERTION FAILED: %s (jit_aarch64.c:%d)\n", #cond, __LINE__); \
+ jit_exit(); \
+} } while(0)
+
+// ============================================================================
+// Register Allocation Helpers
+// ============================================================================
+
+/**
+ * Recognize the Haxe String layout: an HOBJ whose exactly two fields are
+ * bytes:HBYTES followed by length:HI32. Such values get value-based
+ * comparison (per the Haxe spec) instead of pointer comparison.
+ */
+static bool is_string_type(hl_type *t) {
+	if (t->kind != HOBJ || t->obj == NULL)
+		return false;
+	if (t->obj->nfields != 2)
+		return false;
+	if (t->obj->fields[0].t->kind != HBYTES)
+		return false;
+	return t->obj->fields[1].t->kind == HI32;
+}
+
+/**
+ * Value equality check for two strings, callable from JIT-compiled code.
+ * Returns 0 when equal, any non-zero value when different (this is an
+ * equality test, not an ordering).
+ */
+static int jit_str_cmp( vstring *a, vstring *b ) {
+	if( a == b )
+		return 0;            // same object (covers both NULL)
+	if( a == NULL || b == NULL )
+		return 1;            // exactly one side is NULL
+	if( a->length != b->length )
+		return 1;            // cheap length check before comparing bytes
+	return memcmp(a->bytes, b->bytes, a->length * sizeof(uchar));
+}
+
+// ============================================================================
+// Register Allocation
+// ============================================================================
+
+// Forward declarations
+static void free_reg(jit_ctx *ctx, preg *p);
+static void patch_jump(jit_ctx *ctx, int pos, int target_pos);
+
+/**
+ * Find a free CPU register, evicting if necessary.
+ * Preference order: free scratch, free callee-saved, evict callee-saved,
+ * evict scratch. Evicted values are spilled to their vreg's stack slot.
+ * @param k Register kind (RCPU, RCPU_CALL, etc.)
+ * @return Pointer to allocated physical register
+ */
+static preg *alloc_cpu(jit_ctx *ctx, preg_kind k) {
+	preg *p;
+	int i;
+	int start_idx = 0;
+	int count = RCPU_SCRATCH_COUNT;
+	const Arm64Reg *regs = RCPU_SCRATCH_REGS;
+
+	// For RCPU_CALL, only use non-argument registers
+	if (k == RCPU_CALL) {
+		// Use registers that are NOT in CALL_REGS,
+		// i.e. X8-X15 of the scratch pool (X16/X17 are RTMP/RTMP2 and
+		// are never in this pool - see RCPU_SCRATCH_REGS)
+		start_idx = 8; // Start from X8
+	}
+
+	// First pass: find a free scratch register (not holding anything and not locked)
+	// Lock check: p->lock >= ctx->currentPos means locked at current operation
+	for (i = start_idx; i < count; i++) {
+		p = REG_AT(regs[i]);
+		if (p->holds == NULL && p->lock < ctx->currentPos)
+			return p;
+	}
+
+	// Second pass: try callee-saved registers (X19-X28) before evicting scratch
+	// These survive function calls, so values don't need to be spilled before BLR
+	for (i = 0; i < RCPU_CALLEE_ALLOC_COUNT; i++) {
+		p = REG_AT(RCPU_CALLEE_ALLOC[i]);
+		if (p->holds == NULL && p->lock < ctx->currentPos) {
+			ctx->callee_saved_used |= (1 << i);
+			return p;
+		}
+	}
+
+	// Third pass: evict a callee-saved register if one is unlocked
+	for (i = 0; i < RCPU_CALLEE_ALLOC_COUNT; i++) {
+		p = REG_AT(RCPU_CALLEE_ALLOC[i]);
+		if (p->lock < ctx->currentPos) {
+			ctx->callee_saved_used |= (1 << i);
+			free_reg(ctx, p); // Spill to stack before reusing
+			return p;
+		}
+	}
+
+	// Fourth pass: evict a scratch register
+	for (i = start_idx; i < count; i++) {
+		p = REG_AT(regs[i]);
+		if (p->lock < ctx->currentPos) {
+			free_reg(ctx, p); // Spill to stack before reusing
+			return p;
+		}
+	}
+
+	// All registers are locked - this is an error
+	JIT_ASSERT(0);
+	return NULL;
+}
+
+/**
+ * Allocate a floating-point register (V0-V31).
+ * Preference order: free caller-saved (V0-V7, V16-V31), then free
+ * callee-saved (V8-V15, recorded in fpu_callee_saved_used so the
+ * prologue/epilogue save and restore them), then evict any unlocked
+ * register, spilling its value to the stack first.
+ */
+static preg *alloc_fpu(jit_ctx *ctx) {
+	preg *p;
+	int i;
+
+	// First pass: find a free register
+	// Prefer caller-saved registers (V0-V7, V16-V31)
+	// Lock check: p->lock >= ctx->currentPos means locked at current operation
+	for (i = 0; i < RFPU_COUNT; i++) {
+		if (i >= 8 && i < 16)
+			continue; // Skip callee-saved V8-V15 for now
+		p = PVFPR(i);
+		if (p->holds == NULL && p->lock < ctx->currentPos)
+			return p;
+	}
+
+	// Second pass: try callee-saved if needed
+	for (i = 8; i < 16; i++) {
+		p = PVFPR(i);
+		if (p->holds == NULL && p->lock < ctx->currentPos) {
+			ctx->fpu_callee_saved_used |= (1 << (i - 8)); // Track V8-V15 usage
+			return p;
+		}
+	}
+
+	// Third pass: evict an unlocked register
+	for (i = 0; i < RFPU_COUNT; i++) {
+		p = PVFPR(i);
+		if (p->lock < ctx->currentPos) {
+			if (i >= 8 && i < 16) {
+				ctx->fpu_callee_saved_used |= (1 << (i - 8)); // Track V8-V15 usage
+			}
+			free_reg(ctx, p); // Spill to stack before reusing
+			return p;
+		}
+	}
+
+	JIT_ASSERT(0);
+	return NULL;
+}
+
+/**
+ * Allocate a physical register matching the vreg's type class:
+ * an FPU register for float types, otherwise a CPU register of kind k.
+ */
+static preg *alloc_reg(jit_ctx *ctx, vreg *r, preg_kind k) {
+	return IS_FLOAT(r) ? alloc_fpu(ctx) : alloc_cpu(ctx, k);
+}
+
+// ============================================================================
+// Register State Management
+// ============================================================================
+
+/**
+ * Store a virtual register to its stack location
+ */
+static void store(jit_ctx *ctx, vreg *r, preg *p); // Forward declaration
+static void mov_reg_reg(jit_ctx *ctx, Arm64Reg dst, Arm64Reg src, bool is_64bit); // Forward declaration
+static void ldr_stack(jit_ctx *ctx, Arm64Reg dst, int stack_offset, int size); // Forward declaration
+static void emit_call_findex(jit_ctx *ctx, int findex, int stack_space); // Forward declaration
+
+/**
+ * Free a physical register by storing its content to stack if needed.
+ * NOTE: unlike discard(), this stores unconditionally - even when the
+ * vreg is not dirty (a redundant but harmless store in that case).
+ * The register is also unlocked so the allocator can reuse it.
+ */
+static void free_reg(jit_ctx *ctx, preg *p) {
+	vreg *r = p->holds;
+	if (r != NULL) {
+		store(ctx, r, p);
+		r->current = NULL;
+		p->holds = NULL;
+	}
+	// Unlock the register so it can be reused
+	RUNLOCK(p);
+}
+
+/**
+ * Drop a register<->vreg binding. If the held value was modified but not
+ * yet written back (dirty), flush it to the vreg's stack slot first so
+ * nothing is lost. The register is unlocked afterwards so the allocator
+ * may hand it out again.
+ */
+static void discard(jit_ctx *ctx, preg *p) {
+	vreg *held = p->holds;
+	if (held != NULL) {
+		if (held->dirty)
+			store(ctx, held, p); // stack copy is stale: write back first
+		held->current = NULL;
+		p->holds = NULL;
+	}
+	RUNLOCK(p);
+}
+
+/**
+ * Spill all caller-saved registers to stack before a function call.
+ * In AAPCS64: X0-X17 and V0-V7 are caller-saved and may be clobbered.
+ *
+ * This function:
+ * 1. Stores each bound register's value to its vreg's stack slot
+ * 2. Clears the register↔vreg bindings
+ *
+ * IMPORTANT: Must be called BEFORE the BLR instruction, not after!
+ * At that point register values are still valid and can be spilled to stack.
+ * After the call, caller-saved registers contain garbage from the callee.
+ *
+ * ARCHITECTURAL NOTE - Why AArch64 differs from x86:
+ *
+ * The x86 JIT's discard_regs() just clears register bindings without spilling.
+ * This works because x86 (CISC) can use memory operands directly in ALU
+ * instructions:
+ *
+ * x86: ADD [rbp-8], rax ; Operate directly on stack slot
+ *
+ * So x86 treats stack slots as the "source of truth" - values are written
+ * to stack as part of normal operations, and registers are just caches.
+ * Clearing bindings is safe because the value is already on the stack.
+ *
+ * AArch64 (RISC load/store architecture) cannot do this:
+ *
+ * AArch64: LDR x1, [fp, #-8] ; Must load to register first
+ * ADD x0, x0, x1 ; Operate on registers only
+ * STR x0, [fp, #-8] ; Separate store instruction
+ *
+ * Adding a store after every operation would cost ~1 extra instruction per op.
+ * Instead, we keep values in registers (registers are "source of truth") and
+ * only spill when necessary - specifically, before function calls that will
+ * clobber caller-saved registers.
+ *
+ * This is not a workaround but the natural design for load/store architectures.
+ */
+static void spill_regs(jit_ctx *ctx) {
+	int i;
+	// Caller-saved CPU registers X0-X17: the callee may clobber them, so
+	// flush each bound value to its stack slot and drop the binding.
+	// discard() already does exactly what is needed: it stores first
+	// when the value is dirty, otherwise it only clears the binding
+	// (the stack copy is already current).
+	for (i = 0; i < 18; i++) {
+		preg *r = &ctx->pregs[i];
+		if (r->holds)
+			discard(ctx, r);
+	}
+	// Callee-saved CPU registers (X19-X28) are deliberately left alone:
+	// they survive the call, which is the point of allocating them.
+
+	// Caller-saved FPU registers V0-V7...
+	for (i = 0; i < 8; i++) {
+		preg *r = &ctx->pregs[RCPU_COUNT + i];
+		if (r->holds)
+			discard(ctx, r);
+	}
+	// ...and V16-V31. V8-V15 are callee-saved (lower 64 bits) and
+	// survive the call, so they are skipped.
+	for (i = 16; i < 32; i++) {
+		preg *r = &ctx->pregs[RCPU_COUNT + i];
+		if (r->holds)
+			discard(ctx, r);
+	}
+}
+
+/**
+ * Spill callee-saved registers to stack.
+ * Called before jumps to labels - callee-saved must be on stack at merge points.
+ * NOTE: This is NOT called before function calls (callee-saved survive calls).
+ */
+static void spill_callee_saved(jit_ctx *ctx) {
+	int i;
+	// Flush X19-X28 to stack and drop their bindings so every live value
+	// is in its stack slot at the merge point. discard() stores first
+	// when the value is dirty, otherwise just clears the binding.
+	for (i = 0; i < RCPU_CALLEE_ALLOC_COUNT; i++) {
+		preg *r = REG_AT(RCPU_CALLEE_ALLOC[i]);
+		if (r->holds)
+			discard(ctx, r);
+	}
+}
+
+/**
+ * Ensure a virtual register is in a physical register
+ * Loads from stack if necessary
+ */
+static preg *fetch(jit_ctx *ctx, vreg *r); // Forward declaration
+
+/**
+ * Bind a vreg to a physical register (bidirectional association).
+ * This is essential for proper spilling when the register is evicted.
+ * If the vreg was previously bound to a *different* register, any dirty
+ * value held there is written back to the stack before the old binding
+ * is severed, so rebinding (e.g. dst = dst op src) cannot lose data.
+ */
+static void reg_bind(jit_ctx *ctx, vreg *r, preg *p) {
+	// If vreg was dirty in another register, store to stack first
+	// This prevents losing values when rebinding (e.g., dst = dst op src)
+	if (r->current && r->current != p) {
+		if (r->dirty) {
+			store(ctx, r, r->current);
+		}
+		r->current->holds = NULL;
+	}
+	// Set new binding
+	r->current = p;
+	p->holds = r;
+}
+
+/**
+ * Pick a destination register for a vreg that is about to be written.
+ * The register is bound to the vreg (so eviction spills it correctly)
+ * and the vreg is flagged dirty, which guarantees that spill points
+ * (calls, jumps) write the fresh value back to the stack.
+ */
+static preg *alloc_dst(jit_ctx *ctx, vreg *r) {
+	preg *p = IS_FLOAT(r) ? alloc_fpu(ctx) : alloc_cpu(ctx, RCPU);
+	reg_bind(ctx, r, p);
+	// The new value will exist only in the register until spilled.
+	r->dirty = 1;
+	return p;
+}
+
+// ============================================================================
+// Basic Data Movement - Encoding Helpers
+// ============================================================================
+
+/**
+ * Generate an integer register-to-register MOV.
+ * MOV Xd, Xn is encoded as ORR Xd, XZR, Xn; when SP is involved the
+ * ADD Xd, Xn, #0 form is used instead, because register number 31
+ * means XZR in ORR but SP in ADD-immediate.
+ * (FP moves are handled separately by fmov_reg_reg.)
+ */
+static void mov_reg_reg(jit_ctx *ctx, Arm64Reg dst, Arm64Reg src, bool is_64bit) {
+	// SP (register 31) can't be used with ORR - must use ADD instead
+	if (src == SP_REG || dst == SP_REG) {
+		// MOV Xd, SP or MOV SP, Xn => ADD Xd, Xn, #0
+		encode_add_sub_imm(ctx, is_64bit ? 1 : 0, 0, 0, 0, 0, src, dst);
+	} else if (is_64bit) {
+		// MOV Xd, Xn => ORR Xd, XZR, Xn
+		encode_logical_reg(ctx, 1, 0x01, 0, 0, src, 0, XZR, dst);
+	} else {
+		// MOV Wd, Wn => ORR Wd, WZR, Wn
+		encode_logical_reg(ctx, 0, 0x01, 0, 0, src, 0, XZR, dst);
+	}
+}
+
+/** FMOV Vd, Vn - FP register-to-register move (FP 1-source, opcode 0). */
+static void fmov_reg_reg(jit_ctx *ctx, Arm64FpReg dst, Arm64FpReg src, bool is_double) {
+	// ftype field selects precision: 01 = double, 00 = single
+	encode_fp_1src(ctx, 0, 0, is_double ? 0x01 : 0x00, 0, src, dst);
+}
+
+/**
+ * Load from stack to register
+ * Format: LDR/LDUR Xt, [FP, #offset]
+ *
+ * Uses LDUR for signed offsets in range [-256, +255] (single instruction)
+ * Uses LDR with scaled unsigned offset for aligned positive offsets
+ * Falls back to computing address in register for large offsets;
+ * that fallback clobbers RTMP, so callers must not expect RTMP to
+ * survive this helper.
+ */
+static void ldr_stack(jit_ctx *ctx, Arm64Reg dst, int stack_offset, int size) {
+	// size encoding for LDR/LDUR: 3=8B, 2=4B, 1=2B, 0=1B
+	int size_enc = (size == 8) ? 3 : ((size == 4) ? 2 : ((size == 2) ? 1 : 0));
+
+	// Priority 1: Use LDUR for small signed offsets (-256 to +255)
+	// This handles most negative stack offsets in a single instruction
+	if (stack_offset >= -256 && stack_offset <= 255) {
+		encode_ldur_stur(ctx, size_enc, 0, 0x01, stack_offset, RFP, dst);
+		return;
+	}
+
+	// Priority 2: Use LDR with scaled unsigned offset for larger positive aligned offsets
+	if (stack_offset >= 0 && (stack_offset % size == 0) && stack_offset < 4096 * size) {
+		int scaled_offset = stack_offset / size;
+		encode_ldr_str_imm(ctx, size_enc, 0, 0x01, scaled_offset, RFP, dst);
+		return;
+	}
+
+	// Fallback: Compute address in register for large/unaligned offsets (clobbers RTMP)
+	load_immediate(ctx, stack_offset, RTMP, true);
+	encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP);
+	encode_ldr_str_imm(ctx, size_enc, 0, 0x01, 0, RTMP, dst);
+}
+
+/**
+ * Load from stack to FP register
+ * Format: LDR/LDUR Dt/St, [FP, #offset]
+ * NOTE(review): any size other than 8 or 4 maps to enc 1 (16-bit) -
+ * assumes FP values are only ever 4 or 8 bytes; confirm callers.
+ * The large-offset fallback clobbers RTMP.
+ */
+static void ldr_stack_fp(jit_ctx *ctx, Arm64FpReg dst, int stack_offset, int size) {
+	int size_enc = (size == 8) ? 3 : ((size == 4) ? 2 : 1);
+
+	// Priority 1: Use LDUR for small signed offsets (-256 to +255)
+	if (stack_offset >= -256 && stack_offset <= 255) {
+		encode_ldur_stur(ctx, size_enc, 1, 0x01, stack_offset, RFP, (Arm64Reg)dst);
+		return;
+	}
+
+	// Priority 2: Use LDR with scaled unsigned offset for larger positive aligned offsets
+	if (stack_offset >= 0 && (stack_offset % size == 0) && stack_offset < 4096 * size) {
+		int scaled_offset = stack_offset / size;
+		encode_ldr_str_imm(ctx, size_enc, 1, 0x01, scaled_offset, RFP, (Arm64Reg)dst);
+		return;
+	}
+
+	// Fallback: Compute address in register (clobbers RTMP)
+	load_immediate(ctx, stack_offset, RTMP, true);
+	encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP);
+	encode_ldr_str_imm(ctx, size_enc, 1, 0x01, 0, RTMP, (Arm64Reg)dst);
+}
+
+/**
+ * Store register to stack
+ * Format: STR/STUR Xt, [FP, #offset]
+ *
+ * Uses STUR for signed offsets in range [-256, +255] (single instruction)
+ * Uses STR with scaled unsigned offset for aligned positive offsets
+ * Falls back to computing address in register for large offsets;
+ * that fallback clobbers RTMP.
+ */
+static void str_stack(jit_ctx *ctx, Arm64Reg src, int stack_offset, int size) {
+	// size encoding for STR/STUR: 3=8B, 2=4B, 1=2B, 0=1B
+	int size_enc = (size == 8) ? 3 : ((size == 4) ? 2 : ((size == 2) ? 1 : 0));
+
+	// Priority 1: Use STUR for small signed offsets (-256 to +255)
+	if (stack_offset >= -256 && stack_offset <= 255) {
+		encode_ldur_stur(ctx, size_enc, 0, 0x00, stack_offset, RFP, src);
+		return;
+	}
+
+	// Priority 2: Use STR with scaled unsigned offset for larger positive aligned offsets
+	if (stack_offset >= 0 && (stack_offset % size == 0) && stack_offset < 4096 * size) {
+		int scaled_offset = stack_offset / size;
+		encode_ldr_str_imm(ctx, size_enc, 0, 0x00, scaled_offset, RFP, src);
+		return;
+	}
+
+	// Fallback: Compute address in register (clobbers RTMP)
+	load_immediate(ctx, stack_offset, RTMP, true);
+	encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP);
+	encode_ldr_str_imm(ctx, size_enc, 0, 0x00, 0, RTMP, src);
+}
+
+/**
+ * Store FP register to stack
+ * Format: STR/STUR Dt/St, [FP, #offset]
+ * NOTE(review): any size other than 8 or 4 maps to enc 1 (16-bit) -
+ * assumes FP values are only ever 4 or 8 bytes; confirm callers.
+ * The large-offset fallback clobbers RTMP.
+ */
+static void str_stack_fp(jit_ctx *ctx, Arm64FpReg src, int stack_offset, int size) {
+	int size_enc = (size == 8) ? 3 : ((size == 4) ? 2 : 1);
+
+	// Priority 1: Use STUR for small signed offsets (-256 to +255)
+	if (stack_offset >= -256 && stack_offset <= 255) {
+		encode_ldur_stur(ctx, size_enc, 1, 0x00, stack_offset, RFP, (Arm64Reg)src);
+		return;
+	}
+
+	// Priority 2: Use STR with scaled unsigned offset for larger positive aligned offsets
+	if (stack_offset >= 0 && (stack_offset % size == 0) && stack_offset < 4096 * size) {
+		int scaled_offset = stack_offset / size;
+		encode_ldr_str_imm(ctx, size_enc, 1, 0x00, scaled_offset, RFP, (Arm64Reg)src);
+		return;
+	}
+
+	// Fallback: Compute address in register (clobbers RTMP)
+	load_immediate(ctx, stack_offset, RTMP, true);
+	encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP);
+	encode_ldr_str_imm(ctx, size_enc, 1, 0x00, 0, RTMP, (Arm64Reg)src);
+}
+
+/**
+ * STP with signed offset (no writeback) - for NOPpable callee-saved saves.
+ * Format: STP Xt1, Xt2, [Xn, #imm]
+ * This allows individual STPs to be patched to NOPs without affecting SP.
+ * NOTE(review): imm7 is scaled by 8, so offset is assumed to be a
+ * multiple of 8; a misaligned offset would truncate silently - confirm
+ * callers always pass 8-byte-aligned offsets.
+ */
+static void stp_offset(jit_ctx *ctx, Arm64Reg rt, Arm64Reg rt2, Arm64Reg rn, int offset) {
+	// STP imm7 is signed, scaled by 8: range -512 to +504
+	if (offset < -512 || offset > 504) hl_fatal("stp_offset: offset out of range");
+	int imm7 = offset / 8;
+	// opc=10 (64-bit), 101, addr_mode=10 (signed offset), L=0 (store), imm7, Rt2, Rn, Rt
+	unsigned int insn = (2u << 30) | (5u << 27) | (2u << 23) | (0u << 22) |
+		((imm7 & 0x7F) << 15) | (rt2 << 10) | (rn << 5) | rt;
+	EMIT32(ctx, insn);
+}
+
+/**
+ * LDP with signed offset (no writeback) - for NOPpable callee-saved restores.
+ * Format: LDP Xt1, Xt2, [Xn, #imm]
+ * imm7 is signed and scaled by 8, giving a byte range of -512..+504.
+ */
+static void ldp_offset(jit_ctx *ctx, Arm64Reg rt, Arm64Reg rt2, Arm64Reg rn, int offset) {
+	// Validate the range exactly like stp_offset does: an out-of-range
+	// offset would silently truncate imm7 and load from the wrong address.
+	if (offset < -512 || offset > 504) hl_fatal("ldp_offset: offset out of range");
+	int imm7 = offset / 8;
+	// opc=10 (64-bit), 101, addr_mode=10 (signed offset), L=1 (load), imm7, Rt2, Rn, Rt
+	unsigned int insn = (2u << 30) | (5u << 27) | (2u << 23) | (1u << 22) |
+		((imm7 & 0x7F) << 15) | (rt2 << 10) | (rn << 5) | rt;
+	EMIT32(ctx, insn);
+}
+
+/**
+ * STP for 64-bit FP registers (D regs) with signed offset.
+ * Format: STP Dt1, Dt2, [Xn, #imm]
+ * Encoding: opc=01 (64-bit FP), V=1, addr_mode=10
+ * Verified: arm-enc "stp d8, d9, [sp, #96]" -> 0x6d0627e8
+ * NOTE(review): offset is assumed to be a multiple of 8 (imm7 scaled).
+ */
+static void stp_offset_fp(jit_ctx *ctx, Arm64FpReg rt, Arm64FpReg rt2, Arm64Reg rn, int offset) {
+	if (offset < -512 || offset > 504) hl_fatal("stp_offset_fp: offset out of range");
+	int imm7 = offset / 8;
+	// opc=01 (64-bit FP), 101, V=1, addr_mode=10, L=0
+	unsigned int insn = (1u << 30) | (5u << 27) | (1u << 26) | (2u << 23) | (0u << 22) |
+		((imm7 & 0x7F) << 15) | (rt2 << 10) | (rn << 5) | rt;
+	EMIT32(ctx, insn);
+}
+
+/**
+ * LDP for 64-bit FP registers (D regs) with signed offset.
+ * Verified: arm-enc "ldp d14, d15, [sp, #144]" -> 0x6d493fee
+ * imm7 is signed and scaled by 8, giving a byte range of -512..+504.
+ */
+static void ldp_offset_fp(jit_ctx *ctx, Arm64FpReg rt, Arm64FpReg rt2, Arm64Reg rn, int offset) {
+	// Validate the range exactly like stp_offset_fp does: an out-of-range
+	// offset would silently truncate imm7 and load from the wrong address.
+	if (offset < -512 || offset > 504) hl_fatal("ldp_offset_fp: offset out of range");
+	int imm7 = offset / 8;
+	// opc=01 (64-bit FP), 101, V=1, addr_mode=10, L=1
+	unsigned int insn = (1u << 30) | (5u << 27) | (1u << 26) | (2u << 23) | (1u << 22) |
+		((imm7 & 0x7F) << 15) | (rt2 << 10) | (rn << 5) | rt;
+	EMIT32(ctx, insn);
+}
+
+// ============================================================================
+// Data Movement Operations
+// ============================================================================
+
+/**
+ * Write a vreg's current register value back to its stack slot and clear
+ * the dirty flag. No-op for HVOID (size 0) or missing operands.
+ */
+static void store(jit_ctx *ctx, vreg *r, preg *p) {
+	if (r == NULL || p == NULL || r->size == 0)
+		return;
+
+	int size = r->size;
+	int offset = r->stackPos;
+
+	if (p->kind == RCPU) {
+		str_stack(ctx, p->id, offset, size);
+	} else if (p->kind == RFPU) {
+		str_stack_fp(ctx, p->id, offset, size);
+	} else {
+		// Unexpected register kind: nothing was written, so keep the
+		// dirty flag set instead of falsely claiming the stack slot is
+		// current (the original cleared it unconditionally).
+		return;
+	}
+
+	r->dirty = 0; // Stack is now up-to-date
+}
+
+/**
+ * Mark a virtual register as dirty (register value differs from stack).
+ * The value will be spilled at the next basic block boundary (jump, call, label).
+ * Deferring the store this way reduces instruction count within basic blocks.
+ */
+static void mark_dirty(jit_ctx *ctx, vreg *r) {
+	(void)ctx; // unused, kept so the signature matches the other helpers
+	if (r == NULL || r->current == NULL || r->size <= 0)
+		return;
+	r->dirty = 1;
+}
+
+/**
+ * Detach a vreg from any stale physical-register binding after its value
+ * has been written directly to its stack slot (e.g. copied from X0 after
+ * a call) without going through the normal register allocation path.
+ *
+ * NOTE: despite the name, this emits no store itself - the caller has
+ * already placed the correct value on the stack. Clearing the binding
+ * prevents spill_regs from later overwriting that correct stack value
+ * with a stale register value.
+ */
+static void store_result(jit_ctx *ctx, vreg *dst) {
+	// Clear any stale binding - the correct value is now on stack
+	if (dst->current != NULL) {
+		dst->current->holds = NULL;
+		dst->current = NULL;
+	}
+}
+
+/**
+ * Load a virtual register from stack into a physical register.
+ * Returns the register holding r (RLOCKed so that subsequent allocations
+ * in the same opcode cannot evict it), or UNUSED for HVOID vregs.
+ */
+static preg *fetch(jit_ctx *ctx, vreg *r) {
+	preg *p;
+
+	// HVOID registers have size 0 and no value to load
+	if (r->size == 0)
+		return UNUSED;
+
+	// Fast path: value already lives in a register (not just on the stack)
+	if (r->current != NULL && r->current->kind != RSTACK) {
+		// Lock the register to prevent eviction during subsequent allocations
+		RLOCK(r->current);
+		return r->current;
+	}
+
+	// Allocate a register
+	p = alloc_reg(ctx, r, RCPU);
+
+	// If the register we got already holds something, evict it (spills the
+	// previous owner back to its stack slot)
+	if (p->holds != NULL)
+		free_reg(ctx, p);
+
+	// Load from stack
+	int size = r->size;
+	int offset = r->stackPos;
+
+	if (IS_FLOAT(r)) {
+		ldr_stack_fp(ctx, p->id, offset, size);
+	} else {
+		ldr_stack(ctx, p->id, offset, size);
+	}
+
+	// Bind vreg to register and lock it to prevent eviction by subsequent allocs
+	reg_bind(ctx, r, p);
+	RLOCK(p);
+
+	return p;
+}
+
+// ============================================================================
+// Opcode Handlers
+// ============================================================================
+
+/**
+ * OMov - copy a value from one vreg to another.
+ *
+ * Fetches src into a physical register, writes it to dst's stack slot,
+ * then detaches dst from any stale register binding so a later
+ * spill_regs cannot overwrite the freshly stored value.
+ */
+static void op_mov(jit_ctx *ctx, vreg *dst, vreg *src) {
+	preg *r = fetch(ctx, src);
+
+	// Handle special case for HF32 (32-bit float): ensure the value is in
+	// an FP register before storing, since fetch() may have produced a
+	// non-FP binding.
+	if (src->t->kind == HF32 && r->kind != RFPU) {
+		r = alloc_fpu(ctx);
+		// Load from stack to FP register
+		ldr_stack_fp(ctx, r->id, src->stackPos, src->size);
+		reg_bind(ctx, src, r);
+	}
+
+	// Store to destination stack slot
+	store(ctx, dst, r);
+
+	// Clear dst's old register binding to prevent a stale value from being
+	// spilled later; the correct value is on the stack from store() above.
+	// (Same invariant as elsewhere - delegate to the shared helper instead
+	// of duplicating its body inline.)
+	store_result(ctx, dst);
+}
+
+/**
+ * Store a constant value into a virtual register.
+ * Materializes the constant in a physical register (GPR or FPR depending
+ * on dst's type) and immediately flushes it to dst's stack slot.
+ */
+static void store_const(jit_ctx *ctx, vreg *dst, int64_t val) {
+	preg *p;
+
+	if (IS_FLOAT(dst)) {
+		// Allocate FPU register for float constants
+		p = alloc_fpu(ctx);
+		if (p->holds != NULL)
+			free_reg(ctx, p);
+
+		if (val == 0) {
+			// FMOV Dd, XZR - zero the FP register without a GPR round-trip
+			EMIT32(ctx, (1 << 31) | (0 << 29) | (0x1E << 24) | (1 << 22) | (1 << 21) | (7 << 16) | (31 << 5) | p->id);
+		} else {
+			// Load raw bit pattern to a GPR, then FMOV it into the FPR
+			load_immediate(ctx, val, RTMP, true);
+			// FMOV Dd, Xn: sf=1, S=0, type=01, rmode=00, opcode=00111, Rn, Rd
+			EMIT32(ctx, (0x9E670000) | (RTMP << 5) | p->id);
+		}
+	} else {
+		p = alloc_reg(ctx, dst, RCPU);
+		if (p->holds != NULL)
+			free_reg(ctx, p);
+		load_immediate(ctx, val, p->id, dst->size == 8);
+	}
+
+	reg_bind(ctx, dst, p);
+	store(ctx, dst, p); // Constants must be stored immediately for correct loop initialization
+}
+
+// ============================================================================
+// Arithmetic Operations
+// ============================================================================
+
+// Forward declaration for op_call_native (used by floating-point modulo)
+static void op_call_native(jit_ctx *ctx, vreg *dst, hl_type *ftype, void *func_ptr, vreg **args, int nargs);
+// Forward declaration for prepare_call_args (used by op_jump for dynamic comparisons)
+static int prepare_call_args(jit_ctx *ctx, hl_type **arg_types, vreg **args, int nargs, bool is_native);
+
+/**
+ * Binary arithmetic/logic operations handler.
+ * Handles: OAdd, OSub, OMul, OSDiv, OUDiv, OSMod, OUMod, OAnd, OOr, OXor,
+ * OShl, OUShr, OSShr - for both integer and floating-point operands.
+ *
+ * Integer modulo is lowered as a - (a / b) * b with explicit guards for
+ * the divisor == 0 (and, for signed, divisor == -1) cases, since AArch64
+ * has no remainder instruction and SDIV does not trap.
+ * Float modulo is delegated to the C library fmod/fmodf.
+ * Results are left dirty in a register; stores are deferred to the next
+ * basic-block boundary via mark_dirty().
+ */
+static void op_binop(jit_ctx *ctx, vreg *dst, vreg *a, vreg *b, hl_op op) {
+	bool is_64bit = dst->size == 8;
+	int sf = is_64bit ? 1 : 0;
+
+	// Handle floating-point operations
+	if (IS_FLOAT(dst)) {
+		preg *pa = fetch(ctx, a);
+		preg *pb = fetch(ctx, b);
+		preg *pd;
+
+		// If dst == a, reuse pa as destination to avoid clobbering issues
+		// when reg_bind tries to store the old (now stale) value
+		if (dst == a) {
+			pd = pa;
+		} else {
+			pd = alloc_fpu(ctx);
+			if (pd->holds != NULL)
+				free_reg(ctx, pd);
+		}
+
+		int type = (dst->t->kind == HF64) ? 0x01 : 0x00; // 01=double, 00=single
+
+		switch (op) {
+		case OAdd:
+			// FADD Vd, Vn, Vm
+			encode_fp_arith(ctx, 0, 0, type, pb->id, 0x02, pa->id, pd->id);
+			break;
+		case OSub:
+			// FSUB Vd, Vn, Vm
+			encode_fp_arith(ctx, 0, 0, type, pb->id, 0x03, pa->id, pd->id);
+			break;
+		case OMul:
+			// FMUL Vd, Vn, Vm
+			encode_fp_arith(ctx, 0, 0, type, pb->id, 0x00, pa->id, pd->id);
+			break;
+		case OSDiv:
+		case OUDiv: // Same as OSDiv for floats
+			// FDIV Vd, Vn, Vm
+			encode_fp_arith(ctx, 0, 0, type, pb->id, 0x01, pa->id, pd->id);
+			break;
+		case OSMod:
+		case OUMod: {
+			// Floating-point modulo: call fmod/fmodf from C library
+			// Need to discard pa/pb since op_call_native will spill
+			discard(ctx, pa);
+			discard(ctx, pb);
+			void *mod_func = (dst->t->kind == HF64) ? (void*)fmod : (void*)fmodf;
+			vreg *args[2] = { a, b };
+			op_call_native(ctx, dst, NULL, mod_func, args, 2);
+			return; // op_call_native handles result storage
+		}
+		default:
+			JIT_ASSERT(0); // Invalid FP operation
+		}
+
+		reg_bind(ctx, dst, pd);
+		mark_dirty(ctx, dst);
+		return;
+	}
+
+	// Integer operations
+	preg *pa = fetch(ctx, a);
+	preg *pb = fetch(ctx, b);
+	preg *pd;
+
+	// If dst == a, reuse pa as destination to avoid clobbering issues
+	// when reg_bind tries to store the old (now stale) value
+	if (dst == a) {
+		pd = pa;
+	} else {
+		pd = alloc_cpu(ctx, RCPU);
+		if (pd->holds != NULL)
+			free_reg(ctx, pd);
+	}
+
+	switch (op) {
+	case OAdd:
+		// ADD Xd, Xn, Xm
+		encode_add_sub_reg(ctx, sf, 0, 0, 0, pb->id, 0, pa->id, pd->id);
+		break;
+
+	case OSub:
+		// SUB Xd, Xn, Xm
+		encode_add_sub_reg(ctx, sf, 1, 0, 0, pb->id, 0, pa->id, pd->id);
+		break;
+
+	case OMul:
+		// MUL Xd, Xn, Xm (using MADD with XZR as addend)
+		encode_madd_msub(ctx, sf, 0, pb->id, XZR, pa->id, pd->id);
+		break;
+
+	case OSDiv:
+		// SDIV Xd, Xn, Xm (signed division)
+		// Note: encode_div U=1 means SDIV, U=0 means UDIV (per ARM ISA)
+		encode_div(ctx, sf, 1, pb->id, pa->id, pd->id);
+		break;
+
+	case OUDiv:
+		// UDIV Xd, Xn, Xm (unsigned division)
+		encode_div(ctx, sf, 0, pb->id, pa->id, pd->id);
+		break;
+
+	case OSMod: {
+		// Signed modulo with special case handling:
+		// - divisor == 0: return 0 (avoid returning dividend)
+		// - divisor == -1: return 0 (avoid MIN % -1 overflow)
+		// CBZ divisor, zero_case
+		int jz = BUF_POS();
+		encode_cbz_cbnz(ctx, sf, 0, 0, pb->id); // CBZ
+
+		// CMP divisor, #-1; B.EQ zero_case
+		// CMN is ADD setting flags, so CMN Xn, #1 checks if Xn == -1
+		encode_add_sub_imm(ctx, sf, 1, 1, 0, 1, pb->id, XZR); // CMN divisor, #1
+		int jneg1 = BUF_POS();
+		encode_branch_cond(ctx, 0, COND_EQ); // B.EQ
+
+		// Normal path: remainder = dividend - (quotient * divisor)
+		encode_div(ctx, sf, 1, pb->id, pa->id, RTMP); // RTMP = a / b (signed)
+		encode_madd_msub(ctx, sf, 1, pb->id, pa->id, RTMP, pd->id); // pd = a - (RTMP * b)
+		int jend = BUF_POS();
+		encode_branch_uncond(ctx, 0); // B end
+
+		// Zero case: return 0
+		int zero_pos = BUF_POS();
+		// MOV pd, #0 (using ORR with XZR)
+		encode_logical_reg(ctx, sf, 0x01, 0, 0, XZR, 0, XZR, pd->id); // ORR pd, XZR, XZR
+
+		patch_jump(ctx, jz, zero_pos);
+		patch_jump(ctx, jneg1, zero_pos);
+		patch_jump(ctx, jend, BUF_POS());
+		break;
+	}
+
+	case OUMod: {
+		// Unsigned modulo with special case:
+		// - divisor == 0: return 0
+		// CBZ divisor, zero_case
+		int jz = BUF_POS();
+		encode_cbz_cbnz(ctx, sf, 0, 0, pb->id); // CBZ
+
+		// Normal path
+		encode_div(ctx, sf, 0, pb->id, pa->id, RTMP); // RTMP = a / b (unsigned)
+		encode_madd_msub(ctx, sf, 1, pb->id, pa->id, RTMP, pd->id); // pd = a - (RTMP * b)
+		int jend = BUF_POS();
+		encode_branch_uncond(ctx, 0); // B end
+
+		// Zero case: return 0
+		int zero_pos = BUF_POS();
+		encode_logical_reg(ctx, sf, 0x01, 0, 0, XZR, 0, XZR, pd->id); // ORR pd, XZR, XZR
+
+		patch_jump(ctx, jz, zero_pos);
+		patch_jump(ctx, jend, BUF_POS());
+		break;
+	}
+
+	case OAnd:
+		// AND Xd, Xn, Xm
+		encode_logical_reg(ctx, sf, 0x00, 0, 0, pb->id, 0, pa->id, pd->id);
+		break;
+
+	case OOr:
+		// ORR Xd, Xn, Xm
+		encode_logical_reg(ctx, sf, 0x01, 0, 0, pb->id, 0, pa->id, pd->id);
+		break;
+
+	case OXor:
+		// EOR Xd, Xn, Xm
+		encode_logical_reg(ctx, sf, 0x02, 0, 0, pb->id, 0, pa->id, pd->id);
+		break;
+
+	case OShl:
+		// LSL Xd, Xn, Xm (logical shift left)
+		encode_shift_reg(ctx, sf, 0x00, pb->id, pa->id, pd->id);
+		break;
+
+	case OUShr:
+		// LSR Xd, Xn, Xm (logical shift right - unsigned)
+		encode_shift_reg(ctx, sf, 0x01, pb->id, pa->id, pd->id);
+		break;
+
+	case OSShr:
+		// ASR Xd, Xn, Xm (arithmetic shift right - signed)
+		encode_shift_reg(ctx, sf, 0x02, pb->id, pa->id, pd->id);
+		break;
+
+	default:
+		JIT_ASSERT(0); // Unknown operation
+	}
+
+	// Mask result for sub-32-bit integer types (UI8, UI16)
+	// AArch64 doesn't have 8/16-bit registers like x86, so we need explicit masking
+	if (dst->t->kind == HUI8 || dst->t->kind == HBOOL) {
+		// AND Wd, Wd, #0xFF (sf=0, opc=0, N=0, immr=0, imms=7)
+		encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, pd->id, pd->id);
+	} else if (dst->t->kind == HUI16) {
+		// AND Wd, Wd, #0xFFFF (sf=0, opc=0, N=0, immr=0, imms=15)
+		encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, pd->id, pd->id);
+	}
+
+	reg_bind(ctx, dst, pd);
+	mark_dirty(ctx, dst);
+}
+
+/**
+ * Unary negation (ONeg).
+ * Floats use FNEG; integers use SUB from the zero register.
+ * The result is left dirty in a register (store deferred via mark_dirty).
+ */
+static void op_neg(jit_ctx *ctx, vreg *dst, vreg *a) {
+	if (IS_FLOAT(a)) {
+		// FNEG Vd, Vn
+		preg *pa = fetch(ctx, a);
+		preg *pd = alloc_fpu(ctx);
+
+		if (pd->holds != NULL)
+			free_reg(ctx, pd);
+
+		int type = (dst->t->kind == HF64) ? 0x01 : 0x00;
+		encode_fp_1src(ctx, 0, 0, type, 0x02, pa->id, pd->id);
+
+		reg_bind(ctx, dst, pd);
+		mark_dirty(ctx, dst);
+	} else {
+		// NEG Xd, Xn (implemented as SUB Xd, XZR, Xn)
+		preg *pa = fetch(ctx, a);
+		preg *pd = alloc_cpu(ctx, RCPU);
+
+		if (pd->holds != NULL)
+			free_reg(ctx, pd);
+
+		int sf = (dst->size == 8) ? 1 : 0;
+		encode_add_sub_reg(ctx, sf, 1, 0, 0, pa->id, 0, XZR, pd->id);
+
+		// Mask result for sub-32-bit integer types (no 8/16-bit regs on AArch64)
+		if (dst->t->kind == HUI8 || dst->t->kind == HBOOL) {
+			encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, pd->id, pd->id);
+		} else if (dst->t->kind == HUI16) {
+			encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, pd->id, pd->id);
+		}
+
+		reg_bind(ctx, dst, pd);
+		mark_dirty(ctx, dst);
+	}
+}
+
+/**
+ * Logical NOT (ONot) - boolean negation via XOR with 1.
+ * NOTE(review): this flips only bit 0, so it assumes the operand is a
+ * normalized boolean (0 or 1) - confirm ONot is only emitted for HBOOL.
+ */
+static void op_not(jit_ctx *ctx, vreg *dst, vreg *a) {
+	// XOR with 1 (boolean NOT)
+	preg *pa = fetch(ctx, a);
+	preg *pd = alloc_cpu(ctx, RCPU);
+
+	if (pd->holds != NULL)
+		free_reg(ctx, pd);
+
+	// Load immediate 1
+	load_immediate(ctx, 1, RTMP, false);
+
+	// EOR Wd, Wn, Wtmp (32-bit XOR with 1)
+	encode_logical_reg(ctx, 0, 0x02, 0, 0, RTMP, 0, pa->id, pd->id);
+
+	reg_bind(ctx, dst, pd);
+	mark_dirty(ctx, dst);
+}
+
+/**
+ * Increment (OIncr): dst = dst + 1, operating in place on dst's register.
+ * The write-back to the stack slot is deferred via mark_dirty().
+ */
+static void op_incr(jit_ctx *ctx, vreg *dst) {
+	// Fetch dst into a register and add in place
+	preg *pd = fetch(ctx, dst);
+	int sf = (dst->size == 8) ? 1 : 0;
+
+	// ADD Xd, Xn, #1
+	encode_add_sub_imm(ctx, sf, 0, 0, 0, 1, pd->id, pd->id);
+
+	// Mask result for sub-32-bit integer types (no 8/16-bit regs on AArch64)
+	if (dst->t->kind == HUI8 || dst->t->kind == HBOOL) {
+		encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, pd->id, pd->id);
+	} else if (dst->t->kind == HUI16) {
+		encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, pd->id, pd->id);
+	}
+
+	mark_dirty(ctx, dst);
+}
+
+/**
+ * Decrement (ODecr): dst = dst - 1, operating in place on dst's register.
+ * The write-back to the stack slot is deferred via mark_dirty().
+ */
+static void op_decr(jit_ctx *ctx, vreg *dst) {
+	// Fetch dst into a register and subtract in place
+	preg *pd = fetch(ctx, dst);
+	int sf = (dst->size == 8) ? 1 : 0;
+
+	// SUB Xd, Xn, #1
+	encode_add_sub_imm(ctx, sf, 1, 0, 0, 1, pd->id, pd->id);
+
+	// Mask result for sub-32-bit integer types (no 8/16-bit regs on AArch64)
+	if (dst->t->kind == HUI8 || dst->t->kind == HBOOL) {
+		encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, pd->id, pd->id);
+	} else if (dst->t->kind == HUI16) {
+		encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, pd->id, pd->id);
+	}
+
+	mark_dirty(ctx, dst);
+}
+
+// ============================================================================
+// Type Conversion Operations
+// ============================================================================
+
+/**
+ * Convert to integer (OToInt).
+ * Handles three cases: float->int truncation (FCVTZS), i32->i64 sign
+ * extension (SXTW), and plain int->int copy/truncation (MOV).
+ */
+static void op_toint(jit_ctx *ctx, vreg *dst, vreg *src) {
+	// Same register optimization
+	if (dst == src) return;
+
+	// Case 1: Float to integer conversion
+	if (IS_FLOAT(src)) {
+		preg *ps = fetch(ctx, src);
+		preg *pd = alloc_cpu(ctx, RCPU);
+
+		if (pd->holds != NULL)
+			free_reg(ctx, pd);
+
+		int sf = (dst->size == 8) ? 1 : 0;
+		int type = (src->t->kind == HF64) ? 0x01 : 0x00;
+
+		// FCVTZS Xd, Vn (float to signed int, round toward zero)
+		encode_fcvt_int(ctx, sf, 0, type, 0x03, 0x00, ps->id, pd->id);
+
+		reg_bind(ctx, dst, pd);
+		mark_dirty(ctx, dst);
+		return;
+	}
+
+	// Case 2: i32 to i64 sign extension
+	if (dst->size == 8 && src->size == 4) {
+		preg *ps = fetch(ctx, src);
+		preg *pd = alloc_cpu(ctx, RCPU);
+
+		if (pd->holds != NULL)
+			free_reg(ctx, pd);
+
+		Arm64Reg src_r = (ps->kind == RCPU) ? (Arm64Reg)ps->id : RTMP;
+		if (ps->kind == RCONST) {
+			// NOTE(review): for RCONST, ps->id appears to carry the constant
+			// value itself rather than a register id - confirm against preg def
+			load_immediate(ctx, ps->id, src_r, false);
+		} else if (ps->kind != RCPU) {
+			ldr_stack(ctx, src_r, src->stackPos, src->size);
+		}
+
+		// SXTW Xd, Wn (sign extend word to doubleword)
+		// Encoding: 0x93407c00 | (Rn << 5) | Rd
+		EMIT32(ctx, 0x93407c00 | (src_r << 5) | pd->id);
+
+		reg_bind(ctx, dst, pd);
+		mark_dirty(ctx, dst);
+		return;
+	}
+
+	// Case 3: Integer to integer copy (same size or truncation)
+	preg *ps = fetch(ctx, src);
+	preg *pd = alloc_cpu(ctx, RCPU);
+
+	if (pd->holds != NULL)
+		free_reg(ctx, pd);
+
+	Arm64Reg src_r = (ps->kind == RCPU) ? (Arm64Reg)ps->id : RTMP;
+	if (ps->kind == RCONST) {
+		load_immediate(ctx, ps->id, src_r, src->size == 8);
+	} else if (ps->kind != RCPU) {
+		ldr_stack(ctx, src_r, src->stackPos, src->size);
+	}
+
+	// MOV Xd, Xn (or MOV Wd, Wn for 32-bit; 32-bit write zeroes upper half)
+	int sf = (dst->size == 8) ? 1 : 0;
+	mov_reg_reg(ctx, pd->id, src_r, sf);
+
+	reg_bind(ctx, dst, pd);
+	mark_dirty(ctx, dst);
+}
+
+/**
+ * Convert signed integer to float, or convert between float precisions (OToSFloat).
+ * Handles: integer -> float (SCVTF), F64 -> F32 and F32 -> F64 (FCVT).
+ */
+static void op_tosfloat(jit_ctx *ctx, vreg *dst, vreg *src) {
+	// Handle float-to-float precision conversions first
+	if (src->t->kind == HF64 && dst->t->kind == HF32) {
+		// F64 -> F32: FCVT Sd, Dn
+		preg *ps = fetch(ctx, src);
+		preg *pd = alloc_fpu(ctx);
+		if (pd->holds != NULL)
+			free_reg(ctx, pd);
+
+		// V16 serves as an FP scratch when src isn't already in an FP reg
+		Arm64FpReg src_r = (ps->kind == RFPU) ? (Arm64FpReg)ps->id : V16;
+		if (ps->kind != RFPU) {
+			ldr_stack_fp(ctx, src_r, src->stackPos, src->size);
+		}
+
+		// FCVT Sd, Dn: type=1 (double source), opcode=4 (convert to single)
+		encode_fp_1src(ctx, 0, 0, 1, 4, src_r, pd->id);
+
+		reg_bind(ctx, dst, pd);
+		mark_dirty(ctx, dst);
+		return;
+	}
+
+	if (src->t->kind == HF32 && dst->t->kind == HF64) {
+		// F32 -> F64: FCVT Dd, Sn
+		preg *ps = fetch(ctx, src);
+		preg *pd = alloc_fpu(ctx);
+		if (pd->holds != NULL)
+			free_reg(ctx, pd);
+
+		Arm64FpReg src_r = (ps->kind == RFPU) ? (Arm64FpReg)ps->id : V16;
+		if (ps->kind != RFPU) {
+			ldr_stack_fp(ctx, src_r, src->stackPos, src->size);
+		}
+
+		// FCVT Dd, Sn: type=0 (single source), opcode=5 (convert to double)
+		encode_fp_1src(ctx, 0, 0, 0, 5, src_r, pd->id);
+
+		reg_bind(ctx, dst, pd);
+		mark_dirty(ctx, dst);
+		return;
+	}
+
+	// Integer to float conversion (original behavior)
+	preg *ps = fetch(ctx, src);
+	preg *pd = alloc_fpu(ctx);
+
+	if (pd->holds != NULL)
+		free_reg(ctx, pd);
+
+	int sf = (src->size == 8) ? 1 : 0;
+	int type = (dst->t->kind == HF64) ? 0x01 : 0x00;
+
+	// SCVTF Vd, Xn (signed int to float)
+	encode_int_fcvt(ctx, sf, 0, type, 0x00, 0x02, ps->id, pd->id);
+
+	reg_bind(ctx, dst, pd);
+	mark_dirty(ctx, dst);
+}
+
+/**
+ * Convert unsigned integer to float (OToUFloat) via UCVTF.
+ * The result is left dirty in an FP register; the store is deferred.
+ */
+static void op_toufloat(jit_ctx *ctx, vreg *dst, vreg *src) {
+	preg *src_reg = fetch(ctx, src);
+	preg *dst_reg = alloc_fpu(ctx);
+	if (dst_reg->holds != NULL)
+		free_reg(ctx, dst_reg);
+
+	// sf selects 32/64-bit integer source; type selects single/double result
+	int src_is_64 = (src->size == 8) ? 1 : 0;
+	int fp_type = (dst->t->kind == HF64) ? 0x01 : 0x00;
+
+	// UCVTF Vd, Xn (unsigned int to float)
+	encode_int_fcvt(ctx, src_is_64, 0, fp_type, 0x00, 0x03, src_reg->id, dst_reg->id);
+
+	reg_bind(ctx, dst, dst_reg);
+	mark_dirty(ctx, dst);
+}
+
+// ============================================================================
+// Jump Patching
+// ============================================================================
+
+/**
+ * Add a pending jump to the patch list and mark its target opcode.
+ *
+ * pos    - byte position of the branch instruction in the output buffer
+ * target - opcode index the branch should eventually reach
+ *
+ * Marking the target (opsPos[target] = -1) tells the generator to
+ * discard register bindings when it reaches that opcode (like x86 does),
+ * since values may arrive there from multiple control-flow paths.
+ */
+static void register_jump(jit_ctx *ctx, int pos, int target) {
+	jlist *j = (jlist*)malloc(sizeof(jlist));
+	if (j == NULL) hl_fatal("register_jump: out of memory"); // unchecked malloc would crash later on a NULL deref
+	j->pos = pos;
+	j->target = target;
+	j->next = ctx->jumps;
+	ctx->jumps = j;
+
+	// Mark target as a jump destination (only if not already positioned)
+	if (target > 0 && target < ctx->maxOps && ctx->opsPos[target] == 0)
+		ctx->opsPos[target] = -1;
+}
+
+/**
+ * Patch a branch instruction at byte position pos so it targets target_pos.
+ * AArch64 branches encode instruction-count offsets (byte offset / 4).
+ * The branch kind is recognized from the already-emitted opcode byte:
+ * B.cond (0x54), B/BL (0x14/0x94 family), CBZ/CBNZ (0x34/0xB4 family).
+ */
+static void patch_jump(jit_ctx *ctx, int pos, int target_pos) {
+	unsigned int *code = (unsigned int*)(ctx->startBuf + pos);
+	int offset = target_pos - pos; // Byte offset
+	int insn_offset = offset / 4; // Instruction offset
+
+	// Check if this is a conditional branch (B.cond) or unconditional (B)
+	unsigned int insn = *code;
+	unsigned int opcode = (insn >> 24) & 0xFF;
+
+	if (opcode == 0x54) {
+		// B.cond - 19-bit signed offset
+		// Range: +/-1MB (+/-0x40000 instructions, +/-0x100000 bytes)
+		if (insn_offset < -0x40000 || insn_offset >= 0x40000) {
+			printf("JIT Error: Conditional branch offset too large: %d\n", insn_offset);
+			JIT_ASSERT(0);
+		}
+		// Clear old offset, set new offset (bits 5-23)
+		*code = (insn & 0xFF00001F) | ((insn_offset & 0x7FFFF) << 5);
+	} else if ((opcode & 0xFC) == 0x14) {
+		// B or BL - 26-bit signed offset
+		// Range: +/-128MB (+/-0x2000000 instructions, +/-0x8000000 bytes)
+		if (insn_offset < -0x2000000 || insn_offset >= 0x2000000) {
+			printf("JIT Error: Branch offset too large: %d\n", insn_offset);
+			JIT_ASSERT(0);
+		}
+		// Clear old offset, set new offset (bits 0-25)
+		*code = (insn & 0xFC000000) | (insn_offset & 0x3FFFFFF);
+	} else if ((opcode & 0x7E) == 0x34) {
+		// CBZ/CBNZ - 19-bit signed offset (same field placement as B.cond)
+		if (insn_offset < -0x40000 || insn_offset >= 0x40000) {
+			printf("JIT Error: CBZ/CBNZ offset too large: %d\n", insn_offset);
+			JIT_ASSERT(0);
+		}
+		*code = (insn & 0xFF00001F) | ((insn_offset & 0x7FFFF) << 5);
+	} else {
+		printf("JIT Error: Unknown branch instruction at %d: 0x%08X\n", pos, insn);
+		JIT_ASSERT(0);
+	}
+}
+
+// ============================================================================
+// Control Flow & Comparisons
+// ============================================================================
+
+/**
+ * Map a HashLink conditional-jump opcode to an AArch64 condition code.
+ * For float compares, the signed less-than family uses the NaN-aware
+ * MI/PL conditions so unordered results behave correctly after FCMP.
+ */
+static ArmCondition hl_cond_to_arm(hl_op op, bool is_float) {
+	switch (op) {
+	case OJEq:     return COND_EQ;                      // equal
+	case OJNotEq:  return COND_NE;                      // not equal
+	case OJSLt:    return is_float ? COND_MI : COND_LT; // signed less than
+	case OJSGte:   return is_float ? COND_PL : COND_GE; // signed greater or equal
+	case OJSGt:    return COND_GT;                      // signed greater than
+	case OJSLte:   return COND_LE;                      // signed less or equal
+	case OJULt:    return COND_LO;                      // unsigned less than (carry clear)
+	case OJUGte:   return COND_HS;                      // unsigned greater or equal (carry set)
+	case OJNotLt:  return COND_HS;                      // not less than (C=1: >=, or unordered)
+	case OJNotGte: return COND_LT;                      // not greater/equal (N!=V: <, or unordered)
+	default:
+		JIT_ASSERT(0);                                  // unreachable: unhandled jump opcode
+		return COND_AL;
+	}
+}
+
+/**
+ * Conditional and comparison jumps
+ *
+ * Handles special cases for dynamic types:
+ * - HDYN/HFUN: Call hl_dyn_compare() to compare dynamic values
+ * - HTYPE: Call hl_same_type() to compare type objects
+ * - HNULL: Compare boxed values (Null)
+ * - HVIRTUAL: Compare virtual objects with underlying values
+ */
+static void op_jump(jit_ctx *ctx, vreg *a, vreg *b, hl_op op, int target_opcode) {
+ // Spill all registers to stack BEFORE the branch.
+ // Target label will use discard_regs() and expect values on stack.
+ spill_regs(ctx);
+ spill_callee_saved(ctx); // Callee-saved must also be spilled at control flow merge
+
+ // Handle dynamic and function type comparisons
+ if (a->t->kind == HDYN || b->t->kind == HDYN || a->t->kind == HFUN || b->t->kind == HFUN) {
+ // Call hl_dyn_compare(a, b) which returns:
+ // 0 if equal
+ // negative if a < b
+ // positive if a > b
+ // hl_invalid_comparison (0xAABBCCDD) for incomparable types
+ vreg *args[2] = { a, b };
+ int stack_space = prepare_call_args(ctx, NULL, args, 2, true);
+
+ // Load function pointer and call
+ load_immediate(ctx, (int64_t)hl_dyn_compare, RTMP, true);
+ EMIT32(ctx, 0xD63F0000 | (RTMP << 5)); // BLR RTMP
+
+ // Clean up stack
+ if (stack_space > 0) {
+ encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG);
+ }
+
+ // Handle ordered comparisons (OJSLt/OJSGt/OJSLte/OJSGte) - need to check for hl_invalid_comparison
+ if (op == OJSLt || op == OJSGt || op == OJSLte || op == OJSGte) {
+ // Compare result with hl_invalid_comparison (0xAABBCCDD)
+ // If equal, don't take the branch (skip the jump)
+ load_immediate(ctx, hl_invalid_comparison, RTMP, false);
+ encode_add_sub_reg(ctx, 0, 1, 1, 0, RTMP, 0, X0, XZR); // CMP W0, WTMP
+ int skip_pos = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_EQ); // B.EQ skip (if invalid comparison)
+
+ // Valid comparison - compare result with 0 for sign flags
+ encode_add_sub_imm(ctx, 0, 1, 1, 0, 0, X0, XZR); // CMP W0, #0
+ ArmCondition cond = hl_cond_to_arm(op, false);
+ int jump_pos = BUF_POS();
+ encode_branch_cond(ctx, 0, cond);
+ register_jump(ctx, jump_pos, target_opcode);
+
+ // Patch the skip branch to here
+ int skip_offset = (BUF_POS() - skip_pos) / 4;
+ *(int*)(ctx->startBuf + skip_pos) = (*(int*)(ctx->startBuf + skip_pos) & 0xFF00001F) | ((skip_offset & 0x7FFFF) << 5);
+ return;
+ }
+
+ // For OJEq/OJNotEq: result == 0 means equal
+ // TST W0, W0 (sets flags based on W0 & W0)
+ encode_logical_reg(ctx, 0, 0x3, 0, 0, X0, 0, X0, XZR); // ANDS WZR, W0, W0
+
+ // Branch based on zero flag (only equality ops should reach here)
+ ArmCondition cond = (op == OJEq) ? COND_EQ : COND_NE;
+ int jump_pos = BUF_POS();
+ encode_branch_cond(ctx, 0, cond);
+ register_jump(ctx, jump_pos, target_opcode);
+ return;
+ }
+
+ // Handle type comparisons
+ if (a->t->kind == HTYPE) {
+ // Call hl_same_type(a, b) which returns bool
+ vreg *args[2] = { a, b };
+ int stack_space = prepare_call_args(ctx, NULL, args, 2, true);
+
+ load_immediate(ctx, (int64_t)hl_same_type, RTMP, true);
+ EMIT32(ctx, 0xD63F0000 | (RTMP << 5)); // BLR RTMP
+
+ if (stack_space > 0) {
+ encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG);
+ }
+
+ // Compare result with 1 (true): CMP W0, #1 = SUBS WZR, W0, #1
+ // Note: S=1 is required both to set flags AND to make Rd=31 mean XZR (not SP)
+ encode_add_sub_imm(ctx, 0, 1, 1, 0, 1, X0, XZR); // CMP W0, #1
+
+ ArmCondition cond = (op == OJEq) ? COND_EQ : COND_NE;
+ int jump_pos = BUF_POS();
+ encode_branch_cond(ctx, 0, cond);
+ register_jump(ctx, jump_pos, target_opcode);
+ return;
+ }
+
+ // Handle HNULL (Null) comparisons
+ // HNULL values have their inner value at offset HDYN_VALUE (8)
+ if (a->t->kind == HNULL) {
+ preg *pa = fetch(ctx, a);
+ preg *pb = fetch(ctx, b);
+ Arm64Reg ra = (pa->kind == RCPU) ? (Arm64Reg)pa->id : RTMP;
+ Arm64Reg rb = (pb->kind == RCPU) ? (Arm64Reg)pb->id : RTMP2;
+ if (pa->kind != RCPU) ldr_stack(ctx, ra, a->stackPos, 8);
+ if (pb->kind != RCPU) ldr_stack(ctx, rb, b->stackPos, 8);
+
+ if (op == OJEq) {
+ // if (a == b || (a && b && a->v == b->v)) goto target
+ // First: CMP a, b - if equal, jump to target
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR);
+ int jump_pos1 = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_EQ);
+ register_jump(ctx, jump_pos1, target_opcode);
+
+ // If a == NULL, skip (don't jump)
+ int skip_a = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, ra); // CBZ ra, skip
+
+ // If b == NULL, skip (don't jump)
+ int skip_b = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, rb); // CBZ rb, skip
+
+ // Load inner values: a->v and b->v (at offset HDYN_VALUE)
+ encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HDYN_VALUE / 8, ra, ra); // LDR ra, [ra, #8]
+ encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HDYN_VALUE / 8, rb, rb); // LDR rb, [rb, #8]
+
+ // Compare inner values
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR);
+ int jump_pos2 = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_EQ);
+ register_jump(ctx, jump_pos2, target_opcode);
+
+ // Patch skip branches to here
+ int here = BUF_POS();
+ int off_a = (here - skip_a) / 4;
+ int off_b = (here - skip_b) / 4;
+ *(int*)(ctx->startBuf + skip_a) = (*(int*)(ctx->startBuf + skip_a) & 0xFF00001F) | ((off_a & 0x7FFFF) << 5);
+ *(int*)(ctx->startBuf + skip_b) = (*(int*)(ctx->startBuf + skip_b) & 0xFF00001F) | ((off_b & 0x7FFFF) << 5);
+ } else if (op == OJNotEq) {
+ // if (a != b && (!a || !b || a->v != b->v)) goto target
+ // First: CMP a, b - if equal, skip entirely
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR);
+ int skip_eq = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_EQ); // B.EQ skip (a == b means not-not-equal)
+
+ // If a == NULL, goto target (NULL != non-NULL)
+ int jump_a = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, ra); // CBZ ra, target
+ register_jump(ctx, jump_a, target_opcode);
+
+ // If b == NULL, goto target (non-NULL != NULL)
+ int jump_b = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, rb); // CBZ rb, target
+ register_jump(ctx, jump_b, target_opcode);
+
+ // Load inner values
+ encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HDYN_VALUE / 8, ra, ra);
+ encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HDYN_VALUE / 8, rb, rb);
+
+ // Compare inner values - if not equal, goto target
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR);
+ int skip_cmp = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_EQ); // B.EQ skip (values equal, don't jump)
+
+ // Values not equal - jump to target
+ int jump_ne = BUF_POS();
+ encode_branch_uncond(ctx, 0);
+ register_jump(ctx, jump_ne, target_opcode);
+
+ // Patch skip branches
+ int here = BUF_POS();
+ int off_eq = (here - skip_eq) / 4;
+ int off_cmp = (here - skip_cmp) / 4;
+ *(int*)(ctx->startBuf + skip_eq) = (*(int*)(ctx->startBuf + skip_eq) & 0xFF00001F) | ((off_eq & 0x7FFFF) << 5);
+ *(int*)(ctx->startBuf + skip_cmp) = (*(int*)(ctx->startBuf + skip_cmp) & 0xFF00001F) | ((off_cmp & 0x7FFFF) << 5);
+ } else {
+ jit_error("Unsupported comparison op for HNULL");
+ }
+ // Clear register bindings - the registers now contain dereferenced inner values,
+ // not the original Null pointers. Without this, subsequent ops would think
+ // pa/pb still hold the original vregs, causing use of stale/wrong values.
+ discard(ctx, pa);
+ discard(ctx, pb);
+ return;
+ }
+
+ // Handle HVIRTUAL comparisons
+ // Virtual objects have a 'value' pointer at offset HL_WSIZE (8)
+ if (a->t->kind == HVIRTUAL) {
+ preg *pa = fetch(ctx, a);
+ preg *pb = fetch(ctx, b);
+ Arm64Reg ra = (pa->kind == RCPU) ? (Arm64Reg)pa->id : RTMP;
+ Arm64Reg rb = (pb->kind == RCPU) ? (Arm64Reg)pb->id : RTMP2;
+ if (pa->kind != RCPU) ldr_stack(ctx, ra, a->stackPos, 8);
+ if (pb->kind != RCPU) ldr_stack(ctx, rb, b->stackPos, 8);
+
+ if (b->t->kind == HOBJ) {
+ // Comparing virtual to object: compare a->value with b
+ if (op == OJEq) {
+ // if (a ? (b && a->value == b) : (b == NULL)) goto target
+ int ja = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, ra); // CBZ ra, check_b_null
+
+ // a != NULL: check if b != NULL and a->value == b
+ int jb = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, rb); // CBZ rb, skip (a!=NULL, b==NULL: not equal)
+
+ // Load a->value and compare with b
+ encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HL_WSIZE / 8, ra, ra); // LDR ra, [ra, #8]
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR);
+ int jvalue = BUF_POS();
+ encode_branch_uncond(ctx, 0); // B to_cmp
+
+ // a == NULL: check if b == NULL
+ int here_ja = BUF_POS();
+ int off_ja = (here_ja - ja) / 4;
+ *(int*)(ctx->startBuf + ja) = (*(int*)(ctx->startBuf + ja) & 0xFF00001F) | ((off_ja & 0x7FFFF) << 5);
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, XZR, XZR); // CMP rb, #0 (TST rb)
+
+ // Patch jvalue to here (to_cmp)
+ int here_jv = BUF_POS();
+ int off_jv = (here_jv - jvalue) / 4;
+ *(int*)(ctx->startBuf + jvalue) = 0x14000000 | (off_jv & 0x3FFFFFF);
+
+ // Now flags are set - branch if equal
+ int jump_pos = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_EQ);
+ register_jump(ctx, jump_pos, target_opcode);
+
+ // Patch jb to skip
+ int here_jb = BUF_POS();
+ int off_jb = (here_jb - jb) / 4;
+ *(int*)(ctx->startBuf + jb) = (*(int*)(ctx->startBuf + jb) & 0xFF00001F) | ((off_jb & 0x7FFFF) << 5);
+ } else if (op == OJNotEq) {
+ // if (a ? (b == NULL || a->value != b) : (b != NULL)) goto target
+ int ja = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, ra); // CBZ ra, check_b_notnull
+
+ // a != NULL: jump if b == NULL
+ int jump_b_null = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, rb); // CBZ rb, target
+ register_jump(ctx, jump_b_null, target_opcode);
+
+ // Load a->value and compare with b
+ encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HL_WSIZE / 8, ra, ra);
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR);
+ int jvalue = BUF_POS();
+ encode_branch_uncond(ctx, 0); // B to_cmp
+
+ // a == NULL: check if b != NULL
+ int here_ja = BUF_POS();
+ int off_ja = (here_ja - ja) / 4;
+ *(int*)(ctx->startBuf + ja) = (*(int*)(ctx->startBuf + ja) & 0xFF00001F) | ((off_ja & 0x7FFFF) << 5);
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, XZR, XZR); // CMP rb, #0
+
+ // Patch jvalue
+ int here_jv = BUF_POS();
+ int off_jv = (here_jv - jvalue) / 4;
+ *(int*)(ctx->startBuf + jvalue) = 0x14000000 | (off_jv & 0x3FFFFFF);
+
+ // Branch if not equal
+ int jump_pos = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_NE);
+ register_jump(ctx, jump_pos, target_opcode);
+ } else {
+ jit_error("Unsupported comparison op for HVIRTUAL vs HOBJ");
+ }
+ // Clear register bindings - ra now contains a->value, not a
+ discard(ctx, pa);
+ return;
+ }
+
+ // Both are HVIRTUAL - compare underlying values
+ if (op == OJEq) {
+ // if (a == b || (a && b && a->value && b->value && a->value == b->value)) goto
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR);
+ int jump_eq = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_EQ);
+ register_jump(ctx, jump_eq, target_opcode);
+
+ // Check a != NULL
+ int skip_a = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, ra);
+ // Check b != NULL
+ int skip_b = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, rb);
+
+ // Load a->value
+ encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HL_WSIZE / 8, ra, ra);
+ int skip_av = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, ra); // CBZ if a->value == NULL
+
+ // Load b->value
+ encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HL_WSIZE / 8, rb, rb);
+ int skip_bv = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, rb); // CBZ if b->value == NULL
+
+ // Compare values
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR);
+ int jump_val = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_EQ);
+ register_jump(ctx, jump_val, target_opcode);
+
+ // Patch all skips to here
+ int here = BUF_POS();
+ int patches[] = { skip_a, skip_b, skip_av, skip_bv };
+ for (int i = 0; i < 4; i++) {
+ int off = (here - patches[i]) / 4;
+ *(int*)(ctx->startBuf + patches[i]) = (*(int*)(ctx->startBuf + patches[i]) & 0xFF00001F) | ((off & 0x7FFFF) << 5);
+ }
+ } else if (op == OJNotEq) {
+ // if (a != b && (!a || !b || !a->value || !b->value || a->value != b->value)) goto
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR);
+ int skip_eq = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_EQ); // Skip if a == b
+
+ // If a == NULL, jump
+ int jump_a = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, ra);
+ register_jump(ctx, jump_a, target_opcode);
+
+ // If b == NULL, jump
+ int jump_b = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, rb);
+ register_jump(ctx, jump_b, target_opcode);
+
+ // Load a->value
+ encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HL_WSIZE / 8, ra, ra);
+ int jump_av = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, ra);
+ register_jump(ctx, jump_av, target_opcode);
+
+ // Load b->value
+ encode_ldr_str_imm(ctx, 0x03, 0, 0x01, HL_WSIZE / 8, rb, rb);
+ int jump_bv = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, rb);
+ register_jump(ctx, jump_bv, target_opcode);
+
+ // Compare - if not equal, jump
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR);
+ int skip_val = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_EQ);
+
+ // Not equal - jump to target
+ int jump_ne = BUF_POS();
+ encode_branch_uncond(ctx, 0);
+ register_jump(ctx, jump_ne, target_opcode);
+
+ // Patch skips
+ int here = BUF_POS();
+ int off_eq = (here - skip_eq) / 4;
+ int off_val = (here - skip_val) / 4;
+ *(int*)(ctx->startBuf + skip_eq) = (*(int*)(ctx->startBuf + skip_eq) & 0xFF00001F) | ((off_eq & 0x7FFFF) << 5);
+ *(int*)(ctx->startBuf + skip_val) = (*(int*)(ctx->startBuf + skip_val) & 0xFF00001F) | ((off_val & 0x7FFFF) << 5);
+ } else {
+ jit_error("Unsupported comparison op for HVIRTUAL");
+ }
+ // Clear register bindings - ra/rb now contain ->value, not the original vregs
+ discard(ctx, pa);
+ discard(ctx, pb);
+ return;
+ }
+
+ // Handle HOBJ/HSTRUCT vs HVIRTUAL (swap operands)
+ if ((a->t->kind == HOBJ || a->t->kind == HSTRUCT) && b->t->kind == HVIRTUAL) {
+ // Swap and recurse - the HVIRTUAL case handles HOBJ on the right
+ op_jump(ctx, b, a, op, target_opcode);
+ return;
+ }
+
+ // Handle String EQUALITY comparison (value-based per Haxe spec)
+ // jit_str_cmp only returns 0 (equal) or 1 (not equal), so it can only be used
+ // for OJEq/OJNotEq. For ordered comparisons, fall through to compareFun path.
+ if ((op == OJEq || op == OJNotEq) && is_string_type(a->t) && is_string_type(b->t)) {
+ // Spill before call
+ spill_regs(ctx);
+ spill_callee_saved(ctx);
+
+ // Call jit_str_cmp(a, b) - returns 0 if equal, non-zero if not equal
+ vreg *args[2] = { a, b };
+ int stack_space = prepare_call_args(ctx, NULL, args, 2, true);
+ load_immediate(ctx, (int64_t)jit_str_cmp, RTMP, true);
+ EMIT32(ctx, 0xD63F0000 | (RTMP << 5)); // BLR RTMP
+ if (stack_space > 0) {
+ encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG);
+ }
+
+ // Result in X0: 0 = equal, non-zero = not equal
+ // TST X0, X0 sets Z flag (Z=1 if X0==0)
+ encode_logical_reg(ctx, 1, 0x3, 0, 0, X0, 0, X0, XZR); // TST X0, X0
+
+ // Branch based on op (only EQ or NE)
+ ArmCondition cond = (op == OJEq) ? COND_EQ : COND_NE;
+ int jump_pos = BUF_POS();
+ encode_branch_cond(ctx, 0, cond);
+ register_jump(ctx, jump_pos, target_opcode);
+ return;
+ }
+
+ // Handle HOBJ/HSTRUCT with compareFun (e.g., String)
+ // Use hl_get_obj_rt() to ensure runtime object is initialized (like x86 does)
+ // NOTE: compareFun is a FUNCTION INDEX, not a function pointer!
+ if ((a->t->kind == HOBJ || a->t->kind == HSTRUCT) && hl_get_obj_rt(a->t)->compareFun) {
+ int compareFunIndex = (int)(int_val)hl_get_obj_rt(a->t)->compareFun;
+ preg *pa = fetch(ctx, a);
+ preg *pb = fetch(ctx, b);
+ Arm64Reg ra = (pa->kind == RCPU) ? (Arm64Reg)pa->id : RTMP;
+ Arm64Reg rb = (pb->kind == RCPU) ? (Arm64Reg)pb->id : RTMP2;
+ if (pa->kind != RCPU) ldr_stack(ctx, ra, a->stackPos, 8);
+ if (pb->kind != RCPU) ldr_stack(ctx, rb, b->stackPos, 8);
+
+ if (op == OJEq) {
+ // if (a == b || (a && b && cmp(a,b) == 0)) goto target
+ // First check pointer equality
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); // CMP ra, rb
+ int jump_eq = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_EQ);
+ register_jump(ctx, jump_eq, target_opcode);
+
+ // If a == NULL, skip
+ int skip_a = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, ra);
+
+ // If b == NULL, skip
+ int skip_b = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, rb);
+
+ // Call compareFun(a, b) - compareFunIndex is a function index, not a pointer!
+ vreg *args[2] = { a, b };
+ int stack_space = prepare_call_args(ctx, NULL, args, 2, true);
+ emit_call_findex(ctx, compareFunIndex, stack_space);
+
+ // If result == 0, goto target
+ encode_logical_reg(ctx, 0, 0x3, 0, 0, X0, 0, X0, XZR); // TST W0, W0
+ int skip_cmp = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_NE); // Skip if result != 0
+
+ // Jump to target
+ int jump_target = BUF_POS();
+ encode_branch_uncond(ctx, 0);
+ register_jump(ctx, jump_target, target_opcode);
+
+ // Patch all skips to here
+ int here = BUF_POS();
+ int patches[] = { skip_a, skip_b, skip_cmp };
+ for (int i = 0; i < 3; i++) {
+ int off = (here - patches[i]) / 4;
+ *(int*)(ctx->startBuf + patches[i]) = (*(int*)(ctx->startBuf + patches[i]) & 0xFF00001F) | ((off & 0x7FFFF) << 5);
+ }
+ } else if (op == OJNotEq) {
+ // if (a != b && (!a || !b || cmp(a,b) != 0)) goto target
+ // First check pointer equality - if equal, skip entirely
+ encode_add_sub_reg(ctx, 1, 1, 1, 0, rb, 0, ra, XZR); // CMP ra, rb
+ int skip_eq = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_EQ);
+
+ // If a == NULL, goto target
+ int jump_a = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, ra);
+ register_jump(ctx, jump_a, target_opcode);
+
+ // If b == NULL, goto target
+ int jump_b = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, rb);
+ register_jump(ctx, jump_b, target_opcode);
+
+ // Call compareFun(a, b) - compareFunIndex is a function index, not a pointer!
+ vreg *args[2] = { a, b };
+ int stack_space = prepare_call_args(ctx, NULL, args, 2, true);
+ emit_call_findex(ctx, compareFunIndex, stack_space);
+
+ // If result != 0, goto target
+ encode_logical_reg(ctx, 0, 0x3, 0, 0, X0, 0, X0, XZR); // TST W0, W0
+ int skip_cmp = BUF_POS();
+ encode_branch_cond(ctx, 0, COND_EQ); // Skip if result == 0
+
+ // Jump to target
+ int jump_target = BUF_POS();
+ encode_branch_uncond(ctx, 0);
+ register_jump(ctx, jump_target, target_opcode);
+
+ // Patch skips to here
+ int here = BUF_POS();
+ int off_eq = (here - skip_eq) / 4;
+ int off_cmp = (here - skip_cmp) / 4;
+ *(int*)(ctx->startBuf + skip_eq) = (*(int*)(ctx->startBuf + skip_eq) & 0xFF00001F) | ((off_eq & 0x7FFFF) << 5);
+ *(int*)(ctx->startBuf + skip_cmp) = (*(int*)(ctx->startBuf + skip_cmp) & 0xFF00001F) | ((off_cmp & 0x7FFFF) << 5);
+ } else {
+ // For OJSGt, OJSGte, OJSLt, OJSLte: if (a && b && cmp(a,b) ?? 0) goto
+ int skip_a = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, ra);
+
+ int skip_b = BUF_POS();
+ encode_cbz_cbnz(ctx, 1, 0, 0, rb);
+
+ // Call compareFun(a, b) - compareFunIndex is a function index, not a pointer!
+ vreg *args[2] = { a, b };
+ int stack_space = prepare_call_args(ctx, NULL, args, 2, true);
+ emit_call_findex(ctx, compareFunIndex, stack_space);
+
+ // Compare result with 0: CMP W0, #0
+ encode_add_sub_imm(ctx, 0, 1, 1, 0, 0, X0, XZR); // CMP W0, #0
+
+ // Branch based on condition
+ ArmCondition cond = hl_cond_to_arm(op, false);
+ int jump_pos = BUF_POS();
+ encode_branch_cond(ctx, 0, cond);
+ register_jump(ctx, jump_pos, target_opcode);
+
+ // Patch skips to here
+ int here = BUF_POS();
+ int off_a = (here - skip_a) / 4;
+ int off_b = (here - skip_b) / 4;
+ *(int*)(ctx->startBuf + skip_a) = (*(int*)(ctx->startBuf + skip_a) & 0xFF00001F) | ((off_a & 0x7FFFF) << 5);
+ *(int*)(ctx->startBuf + skip_b) = (*(int*)(ctx->startBuf + skip_b) & 0xFF00001F) | ((off_b & 0x7FFFF) << 5);
+ }
+ return;
+ }
+
+ // Standard comparison for other types
+ bool is_float = IS_FLOAT(a);
+ preg *pa = fetch(ctx, a);
+ preg *pb = fetch(ctx, b);
+
+ if (is_float) {
+ // Floating-point comparison: FCMP Vn, Vm
+ int type = (a->t->kind == HF64) ? 0x01 : 0x00;
+ encode_fp_compare(ctx, 0, 0, type, pb->id, 0, pa->id);
+ } else {
+ // Integer comparison: CMP Xn, Xm (implemented as SUBS XZR, Xn, Xm)
+ int sf = (a->size == 8) ? 1 : 0;
+ encode_add_sub_reg(ctx, sf, 1, 1, 0, pb->id, 0, pa->id, XZR);
+ }
+
+ // Emit conditional branch
+ ArmCondition cond = hl_cond_to_arm(op, is_float);
+ int jump_pos = BUF_POS();
+ encode_branch_cond(ctx, 0, cond); // Offset will be patched later
+
+ // Register for patching
+ register_jump(ctx, jump_pos, target_opcode);
+}
+
+/**
+ * Emit a single-operand conditional jump (OJTrue, OJFalse, OJNull, OJNotNull).
+ *
+ * The operand is tested directly against zero with CBZ/CBNZ, so no flags
+ * register setup is needed. Everything is flushed to the stack first because
+ * the jump target is a merge point that expects all vregs in their slots.
+ */
+static void op_jcond(jit_ctx *ctx, vreg *a, hl_op op, int target_opcode) {
+	// Flush before branching: the label handler clears all register
+	// bindings and subsequent code reloads operands from the stack.
+	spill_regs(ctx);
+	spill_callee_saved(ctx);
+
+	preg *src = fetch(ctx, a);
+	int branch_at = BUF_POS();
+
+	// OJFalse/OJNull take the branch on zero (CBZ);
+	// OJTrue/OJNotNull take it on non-zero (CBNZ).
+	int use_cbnz = (op == OJFalse || op == OJNull) ? 0 : 1;
+	int width64 = (a->size == 8) ? 1 : 0;
+
+	encode_cbz_cbnz(ctx, width64, use_cbnz, 0, src->id); // offset patched later
+
+	register_jump(ctx, branch_at, target_opcode);
+}
+
+/**
+ * Emit an unconditional branch (OJAlways).
+ *
+ * Registers are flushed first so the target label can safely assume every
+ * vreg lives in its stack slot.
+ */
+static void op_jalways(jit_ctx *ctx, int target_opcode) {
+	spill_regs(ctx);
+	spill_callee_saved(ctx); // merge points need callee-saved on stack too
+
+	int branch_at = BUF_POS();
+	encode_branch_uncond(ctx, 0); // placeholder offset, patched later
+	register_jump(ctx, branch_at, target_opcode);
+}
+
+/**
+ * Drop every register<->vreg association at a control-flow merge point.
+ *
+ * Labels can be reached from several predecessors, each with a different
+ * register allocation, so the only consistent state is "everything lives on
+ * the stack". This routine emits NO machine code: spill_regs() and
+ * spill_callee_saved() have already run (before each jump, and before the
+ * label for the fallthrough path), so every value is already on the stack
+ * and we merely forget the bindings here.
+ */
+static void discard_regs(jit_ctx *ctx) {
+	int i;
+	// CPU scratch registers X0-X17.
+	for (i = 0; i < 18; i++) {
+		preg *r = &ctx->pregs[i];
+		if (r->holds == NULL) continue;
+		r->holds->dirty = 0;
+		r->holds->current = NULL;
+		r->holds = NULL;
+	}
+	// Allocatable callee-saved CPU registers (X19-X26).
+	for (i = 0; i < RCPU_CALLEE_ALLOC_COUNT; i++) {
+		preg *r = REG_AT(RCPU_CALLEE_ALLOC[i]);
+		if (r->holds == NULL) continue;
+		r->holds->dirty = 0;
+		r->holds->current = NULL;
+		r->holds = NULL;
+	}
+	// Every FPU register V0-V31.
+	for (i = 0; i < RFPU_COUNT; i++) {
+		preg *r = &ctx->pregs[RCPU_COUNT + i];
+		if (r->holds == NULL) continue;
+		r->holds->dirty = 0;
+		r->holds->current = NULL;
+		r->holds = NULL;
+	}
+}
+
+/**
+ * OLabel: a jump target. Emits no machine code.
+ *
+ * Control may arrive here from several paths; all of them (plus the
+ * fallthrough path, spilled by the main loop) have already written every
+ * live value to the stack. The only remaining work is to forget register
+ * bindings so that following opcodes reload their operands from the stack.
+ */
+static void op_label(jit_ctx *ctx) {
+	discard_regs(ctx);
+}
+
+// ============================================================================
+// Memory Operations
+// ============================================================================
+
+/*
+ * Load a value from memory: dst = *(type*)(base + offset)
+ * OGetI8/OGetI16/OGetI32, also used by OField via op_field.
+ *
+ * The loaded value is always written back to dst's stack slot, which is the
+ * source of truth (registers may be clobbered by later calls).
+ */
+static void op_get_mem(jit_ctx *ctx, vreg *dst, vreg *base, int offset, int size) {
+	preg *base_reg = fetch(ctx, base);
+	preg *dst_reg = alloc_dst(ctx, dst);
+
+	Arm64Reg base_r = (base_reg->kind == RCPU) ? (Arm64Reg)base_reg->id : RTMP;
+	if (base_reg->kind != RCPU) {
+		ldr_stack(ctx, base_r, base->stackPos, base->size);
+	}
+
+	// The scaled unsigned-immediate form (LDR Rt,[Rn,#imm12*size]) requires
+	// a non-negative offset that is a multiple of the access size; otherwise
+	// fall back to computing the address explicitly.
+	bool imm_ok = offset >= 0 && (offset % size) == 0 && offset < (1 << 12) * size;
+
+	if (IS_FLOAT(dst)) {
+		// Float: load into an FPU register (V=1), then spill to the stack slot.
+		Arm64FpReg dst_r = (dst_reg->kind == RFPU) ? (Arm64FpReg)dst_reg->id : V16;
+		int size_bits = (size == 8) ? 0x03 : 0x02; // D or S
+
+		if (imm_ok) {
+			encode_ldr_str_imm(ctx, size_bits, 1, 0x01, offset / size, base_r, (Arm64Reg)dst_r); // V=1 for FP
+		} else {
+			// The offset goes through RTMP2 because a spilled base is
+			// itself living in RTMP.
+			load_immediate(ctx, offset, RTMP2, false);
+			encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, base_r, RTMP);
+			encode_ldr_str_imm(ctx, size_bits, 1, 0x01, 0, RTMP, (Arm64Reg)dst_r); // V=1 for FP
+		}
+
+		str_stack_fp(ctx, dst_r, dst->stackPos, dst->size);
+	} else {
+		// Integer/pointer: load into a CPU register.
+		// Use RTMP2 (not RTMP) as the fallback destination: str_stack's own
+		// fallback path uses RTMP internally and would clobber the value.
+		Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP2;
+		// size: 1=LDRB, 2=LDRH, 4=LDR(W), 8=LDR(X)
+		int size_bits = (size == 1) ? 0x00 : (size == 2) ? 0x01 : (size == 4) ? 0x02 : 0x03;
+
+		if (imm_ok) {
+			encode_ldr_str_imm(ctx, size_bits, 0, 0x01, offset / size, base_r, dst_r);
+		} else {
+			// Offset too large/unaligned: compute the effective address.
+			// BUGFIX: materialize the offset in RTMP2, not RTMP - when the
+			// base was spilled, base_r == RTMP and loading the offset into
+			// RTMP overwrote the base pointer (the float path above already
+			// used RTMP2 correctly).
+			load_immediate(ctx, offset, RTMP2, false);
+			encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, base_r, RTMP);
+			// LDR dst_r, [RTMP] - safe even if dst_r == RTMP2: the offset
+			// temp is no longer needed by the time dst_r is written.
+			encode_ldr_str_imm(ctx, size_bits, 0, 0x01, 0, RTMP, dst_r);
+		}
+
+		// Always store to stack - it's the source of truth for later loads
+		// (registers may be clobbered by subsequent calls).
+		str_stack(ctx, dst_r, dst->stackPos, dst->size);
+	}
+
+	// Release the base register - discard() will store if dirty.
+	discard(ctx, base_reg);
+}
+
+/*
+ * Store a value to memory: *(type*)(base + offset) = value
+ * OSetI8/OSetI16/OSetI32, also used by OSetField via op_set_field.
+ *
+ * Ordering is critical: the value is loaded BEFORE the base pointer.
+ * ldr_stack's fallback path uses RTMP internally, so loading the base into
+ * RTMP first and then the value from the stack would clobber the base.
+ * Loading the value first (into RTMP2 or an FPU reg) makes any transient
+ * RTMP usage harmless.
+ */
+static void op_set_mem(jit_ctx *ctx, vreg *base, int offset, vreg *value, int size) {
+	preg *base_reg = fetch(ctx, base);
+	preg *value_reg = fetch(ctx, value);
+
+	// Scaled unsigned-immediate STR requires a non-negative offset that is a
+	// multiple of the access size; otherwise compute the address explicitly.
+	bool imm_ok = offset >= 0 && (offset % size) == 0 && offset < (1 << 12) * size;
+
+	if (IS_FLOAT(value)) {
+		// Float: stage the value in an FPU register first (cannot collide
+		// with the integer temporaries used below).
+		Arm64FpReg value_r = (value_reg->kind == RFPU) ? (Arm64FpReg)value_reg->id : V16;
+		if (value_reg->kind != RFPU) {
+			ldr_stack_fp(ctx, value_r, value->stackPos, value->size);
+		}
+
+		// Now it is safe to load the base into RTMP.
+		Arm64Reg base_r = (base_reg->kind == RCPU) ? (Arm64Reg)base_reg->id : RTMP;
+		if (base_reg->kind != RCPU) {
+			ldr_stack(ctx, base_r, base->stackPos, base->size);
+		}
+
+		int size_bits = (size == 8) ? 0x03 : 0x02; // D or S
+		if (imm_ok) {
+			encode_ldr_str_imm(ctx, size_bits, 1, 0x00, offset / size, base_r, (Arm64Reg)value_r); // V=1 for FP
+		} else {
+			load_immediate(ctx, offset, RTMP2, false);
+			encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, base_r, RTMP);
+			encode_ldr_str_imm(ctx, size_bits, 1, 0x00, 0, RTMP, (Arm64Reg)value_r); // V=1 for FP
+		}
+	} else {
+		// Integer/pointer: stage the value first (RTMP2 when spilled/const).
+		Arm64Reg value_r = (value_reg->kind == RCPU) ? (Arm64Reg)value_reg->id : RTMP2;
+		if (value_reg->kind == RCONST) {
+			load_immediate(ctx, value_reg->id, value_r, value->size == 8);
+		} else if (value_reg->kind != RCPU) {
+			ldr_stack(ctx, value_r, value->stackPos, value->size);
+		}
+
+		// Base can now safely use RTMP (value is already staged).
+		Arm64Reg base_r = (base_reg->kind == RCPU) ? (Arm64Reg)base_reg->id : RTMP;
+		if (base_reg->kind != RCPU) {
+			ldr_stack(ctx, base_r, base->stackPos, base->size);
+		}
+
+		// size: 1=STRB, 2=STRH, 4=STR(W), 8=STR(X)
+		int size_bits = (size == 1) ? 0x00 : (size == 2) ? 0x01 : (size == 4) ? 0x02 : 0x03;
+		if (imm_ok) {
+			encode_ldr_str_imm(ctx, size_bits, 0, 0x00, offset / size, base_r, value_r);
+		} else {
+			// Offset too large/unaligned: pick an offset temp that collides
+			// with neither the staged value nor the base.
+			// NOTE(review): X9 is assumed free here; op_set_field explicitly
+			// evicts X9 before using it - confirm the allocator never hands
+			// X9 out across a set_mem emission.
+			if (value_r == RTMP2) {
+				load_immediate(ctx, offset, X9, false);
+				encode_add_sub_reg(ctx, 1, 0, 0, 0, X9, 0, base_r, RTMP);
+			} else {
+				load_immediate(ctx, offset, RTMP2, false);
+				encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, base_r, RTMP);
+			}
+			// STR value_r, [RTMP]
+			encode_ldr_str_imm(ctx, size_bits, 0, 0x00, 0, RTMP, value_r);
+		}
+	}
+
+	discard(ctx, base_reg);
+	discard(ctx, value_reg);
+}
+
+/*
+ * Register-offset load: dst = *(type*)(base + offset_reg)
+ * Counterpart of op_get_mem (OGetI8/OGetI16/OGetMem) for the case where the
+ * offset is a vreg rather than an immediate.
+ *
+ * Load ordering matters: the offset is materialized before the base because
+ * ldr_stack's fallback path uses RTMP as a temporary, and a spilled base is
+ * itself loaded into RTMP. Sequence: offset -> base -> address -> load.
+ */
+static void op_get_mem_reg(jit_ctx *ctx, vreg *dst, vreg *base, vreg *offset, int size) {
+	preg *pbase = fetch(ctx, base);
+	preg *poff = fetch(ctx, offset);
+	preg *pdst = alloc_dst(ctx, dst);
+
+	// 1) Offset first - may scribble on RTMP internally, which is still free.
+	Arm64Reg off_r = (poff->kind == RCPU) ? (Arm64Reg)poff->id : RTMP2;
+	if (poff->kind == RCONST)
+		load_immediate(ctx, poff->id, off_r, false);
+	else if (poff->kind != RCPU)
+		ldr_stack(ctx, off_r, offset->stackPos, offset->size);
+
+	// 2) Base next - a spilled base lands in RTMP and stays there.
+	Arm64Reg base_r = (pbase->kind == RCPU) ? (Arm64Reg)pbase->id : RTMP;
+	if (pbase->kind != RCPU)
+		ldr_stack(ctx, base_r, base->stackPos, base->size);
+
+	// 3) Effective address: RTMP = base + offset
+	encode_add_sub_reg(ctx, 1, 0, 0, 0, off_r, 0, base_r, RTMP);
+
+	// 4) Load from [RTMP], float and integer handled separately.
+	int size_bits = (size == 1) ? 0x00 : (size == 2) ? 0x01 : (size == 4) ? 0x02 : 0x03;
+
+	if (IS_FLOAT(dst)) {
+		// FP load (V=1), then write through to the stack slot.
+		Arm64FpReg fdst = (pdst->kind == RFPU) ? (Arm64FpReg)pdst->id : V16;
+		encode_ldr_str_imm(ctx, size_bits, 1, 0x01, 0, RTMP, (Arm64Reg)fdst); // V=1 for FP
+		str_stack_fp(ctx, fdst, dst->stackPos, dst->size);
+	} else {
+		// Integer load; LDRB/LDRH zero-extend the result automatically.
+		Arm64Reg idst = (pdst->kind == RCPU) ? (Arm64Reg)pdst->id : X9;
+		encode_ldr_str_imm(ctx, size_bits, 0, 0x01, 0, RTMP, idst);
+		str_stack(ctx, idst, dst->stackPos, dst->size);
+	}
+
+	discard(ctx, pbase);
+	discard(ctx, poff);
+}
+
+/*
+ * Register-offset store: *(type*)(base + offset_reg) = value
+ * Counterpart of op_set_mem (OSetI8/OSetI16/OSetMem) for the case where the
+ * offset is a vreg rather than an immediate.
+ *
+ * Load ordering is critical because ldr_stack's fallback path uses RTMP as
+ * a temporary:
+ *   1. value  (FPU reg or X9 - RTMP is still scratch at this point)
+ *   2. offset (RTMP2 or its own register)
+ *   3. base   (a spilled base lands in RTMP; nothing may touch RTMP after)
+ * BUGFIX: previously the base was loaded before the offset, so a spilled
+ * base living in RTMP could be clobbered by the offset's ldr_stack fallback.
+ * The order now matches op_get_mem_reg (offset first, base last).
+ */
+static void op_set_mem_reg(jit_ctx *ctx, vreg *base, vreg *offset, vreg *value, int size) {
+	preg *base_reg = fetch(ctx, base);
+	preg *offset_reg = fetch(ctx, offset);
+	preg *value_reg = fetch(ctx, value);
+
+	int size_bits = (size == 1) ? 0x00 : (size == 2) ? 0x01 : (size == 4) ? 0x02 : 0x03;
+
+	// Step 1: stage the value first.
+	Arm64FpReg value_fp = V16;
+	Arm64Reg value_r = X9;
+	if (IS_FLOAT(value)) {
+		value_fp = (value_reg->kind == RFPU) ? (Arm64FpReg)value_reg->id : V16;
+		if (value_reg->kind != RFPU) {
+			ldr_stack_fp(ctx, value_fp, value->stackPos, value->size);
+		}
+	} else {
+		value_r = (value_reg->kind == RCPU) ? (Arm64Reg)value_reg->id : X9;
+		if (value_reg->kind == RCONST) {
+			load_immediate(ctx, value_reg->id, value_r, value->size == 8);
+		} else if (value_reg->kind != RCPU) {
+			ldr_stack(ctx, value_r, value->stackPos, value->size);
+		}
+	}
+
+	// Step 2: load the offset (RTMP2 when spilled/const). Any transient
+	// RTMP usage inside ldr_stack is still harmless here.
+	Arm64Reg offset_r = (offset_reg->kind == RCPU) ? (Arm64Reg)offset_reg->id : RTMP2;
+	if (offset_reg->kind == RCONST) {
+		load_immediate(ctx, offset_reg->id, offset_r, false);
+	} else if (offset_reg->kind != RCPU) {
+		ldr_stack(ctx, offset_r, offset->stackPos, offset->size);
+	}
+
+	// Step 3: load the base LAST so nothing can clobber RTMP afterwards.
+	Arm64Reg base_r = (base_reg->kind == RCPU) ? (Arm64Reg)base_reg->id : RTMP;
+	if (base_reg->kind != RCPU) {
+		ldr_stack(ctx, base_r, base->stackPos, base->size);
+	}
+
+	// Step 4: effective address: RTMP = base + offset
+	encode_add_sub_reg(ctx, 1, 0, 0, 0, offset_r, 0, base_r, RTMP);
+
+	// Step 5: store to [RTMP].
+	if (IS_FLOAT(value)) {
+		encode_ldr_str_imm(ctx, size_bits, 1, 0x00, 0, RTMP, (Arm64Reg)value_fp); // V=1 for FP
+	} else {
+		encode_ldr_str_imm(ctx, size_bits, 0, 0x00, 0, RTMP, value_r);
+	}
+
+	discard(ctx, base_reg);
+	discard(ctx, offset_reg);
+	discard(ctx, value_reg);
+}
+
+/*
+ * Field access: dst = obj->field
+ * OField: dst = *(obj + field_offset)
+ *
+ * Special handling for HPACKED -> HSTRUCT: return address of inline storage
+ * instead of loading a value (LEA semantics).
+ */
+static void op_field(jit_ctx *ctx, vreg *dst, vreg *obj, int field_index) {
+	// Resolve the field's byte offset from the runtime object layout.
+	hl_runtime_obj *rt = hl_get_obj_rt(obj->t);
+	int offset = rt->fields_indexes[field_index];
+
+	// Check for packed field -> struct destination (LEA semantics)
+	if (dst->t->kind == HSTRUCT) {
+		hl_type *ft = hl_obj_field_fetch(obj->t, field_index)->t;
+		if (ft->kind == HPACKED) {
+			// Return address of inline storage: dst = &obj->field
+			preg *p_obj = fetch(ctx, obj);
+			preg *p_dst = alloc_dst(ctx, dst); // Allocates register, binds to dst, marks dirty
+
+			// A spilled obj pointer is reloaded into RTMP.
+			Arm64Reg obj_r = (p_obj->kind == RCPU) ? (Arm64Reg)p_obj->id : RTMP;
+			if (p_obj->kind != RCPU) {
+				ldr_stack(ctx, obj_r, obj->stackPos, obj->size);
+			}
+
+			// NOTE(review): relies on alloc_dst returning an RCPU preg for
+			// non-float vregs - confirm against alloc_dst's implementation.
+			Arm64Reg dst_r = (Arm64Reg)p_dst->id; // alloc_dst always returns RCPU for non-float
+
+			// ADD dst, obj, #offset (equivalent to LEA)
+			if (offset >= 0 && offset < 4096) {
+				encode_add_sub_imm(ctx, 1, 0, 0, 0, offset, obj_r, dst_r);
+			} else {
+				// Offset exceeds the 12-bit immediate: go through RTMP2.
+				load_immediate(ctx, offset, RTMP2, false);
+				encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, obj_r, dst_r);
+			}
+
+			// Don't call store_result - alloc_dst already set up the binding
+			// The value will be spilled when needed
+			discard(ctx, p_obj);
+			return;
+		}
+	}
+
+	// Plain field: fall through to an immediate-offset memory load.
+	op_get_mem(ctx, dst, obj, offset, dst->size);
+}
+
+/*
+ * Field assignment: obj->field = value
+ * OSetField: *(obj + field_offset) = value
+ *
+ * Special handling for HSTRUCT -> HPACKED: must copy struct byte-by-byte
+ * because HPACKED means the struct is stored inline, not as a pointer.
+ */
+static void op_set_field(jit_ctx *ctx, vreg *obj, int field_index, vreg *value) {
+	// Resolve the field's byte offset from the runtime object layout.
+	hl_runtime_obj *rt = hl_get_obj_rt(obj->t);
+	int field_offset = rt->fields_indexes[field_index];
+
+	// Check for struct-to-packed-field assignment
+	if (value->t->kind == HSTRUCT) {
+		hl_type *ft = hl_obj_field_fetch(obj->t, field_index)->t;
+		if (ft->kind == HPACKED) {
+			// Copy struct byte-by-byte
+			// frt->size is the number of bytes of inline storage to copy.
+			hl_runtime_obj *frt = hl_get_obj_rt(ft->tparam);
+
+			// Load obj pointer into RTMP and value pointer into RTMP2.
+			// This is simpler than trying to manage register allocation for the copy.
+			preg *p_obj = fetch(ctx, obj);
+			preg *p_val = fetch(ctx, value);
+
+			// Always load to scratch registers to avoid conflicts with copy temp
+			Arm64Reg obj_r = RTMP;
+			Arm64Reg val_r = RTMP2;
+
+			if (p_obj->kind == RCPU) {
+				// Move from allocated register to RTMP: ORR RTMP, XZR, Rm
+				encode_logical_reg(ctx, 1, 0x01, 0, 0, (Arm64Reg)p_obj->id, 0, XZR, obj_r);
+			} else {
+				ldr_stack(ctx, obj_r, obj->stackPos, obj->size);
+			}
+
+			if (p_val->kind == RCPU) {
+				// Move from allocated register to RTMP2: ORR RTMP2, XZR, Rm
+				encode_logical_reg(ctx, 1, 0x01, 0, 0, (Arm64Reg)p_val->id, 0, XZR, val_r);
+			} else {
+				ldr_stack(ctx, val_r, value->stackPos, value->size);
+			}
+
+			// Use X9 for data copy, X10 for large offset computation
+			// Evict both if they're holding values
+			// (free_reg is expected to spill/unbind so the copy below may
+			// clobber them freely).
+			preg *p_x9 = &ctx->pregs[X9];
+			preg *p_x10 = &ctx->pregs[X10];
+			if (p_x9->holds != NULL) {
+				free_reg(ctx, p_x9);
+			}
+			if (p_x10->holds != NULL) {
+				free_reg(ctx, p_x10);
+			}
+
+			// Copy in the widest chunks the remaining byte count allows:
+			// 8 -> 4 -> 2 -> 1 bytes per iteration.
+			Arm64Reg tmp = X9;
+			int offset = 0;
+			while (offset < frt->size) {
+				int remain = frt->size - offset;
+				int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 2 : 1));
+				int size_bits = (copy_size == 8) ? 0x03 : (copy_size == 4) ? 0x02 : (copy_size == 2) ? 0x01 : 0x00;
+
+				// Load from source: LDR tmp, [val_r, #offset]
+				// Source offset starts at 0 and increments by copy_size, so always aligned
+				encode_ldr_str_imm(ctx, size_bits, 0, 0x01, offset / copy_size, val_r, tmp);
+
+				// Store to dest: STR tmp, [obj_r + field_offset + offset]
+				// Dest offset may not be aligned to copy_size, so compute address explicitly
+				int dest_offset = field_offset + offset;
+				if ((dest_offset % copy_size) == 0 && dest_offset >= 0 && dest_offset < (1 << 12) * copy_size) {
+					// Aligned and fits in immediate - use scaled offset
+					encode_ldr_str_imm(ctx, size_bits, 0, 0x00, dest_offset / copy_size, obj_r, tmp);
+				} else {
+					// Misaligned or large offset - compute address in X10
+					load_immediate(ctx, dest_offset, X10, false);
+					encode_add_sub_reg(ctx, 1, 0, 0, 0, X10, 0, obj_r, X10);
+					encode_ldr_str_imm(ctx, size_bits, 0, 0x00, 0, X10, tmp);
+				}
+
+				offset += copy_size;
+			}
+
+			discard(ctx, p_obj);
+			discard(ctx, p_val);
+			return;
+		}
+	}
+
+	// Plain field: fall through to an immediate-offset memory store.
+	op_set_mem(ctx, obj, field_offset, value, value->size);
+}
+
+/*
+ * Array element access: dst = array[index]
+ * OGetArray: dst = hl_aptr(array)[index]
+ *
+ * varray layout: { hl_type *t, hl_type *at, int size, int __pad } = 24 bytes
+ * Data is INLINE immediately after the header (not via a pointer!)
+ * hl_aptr(a,t) = (t*)(((varray*)(a))+1) = array + sizeof(varray)
+ *
+ * CArray (HABSTRACT) layout: raw memory, no header
+ * For HOBJ/HSTRUCT: return address of element (LEA)
+ * For other types: load value (LDR)
+ */
+/*
+ * IMPORTANT: We must load index BEFORE array when array uses RTMP,
+ * because ldr_stack's fallback path uses RTMP as a temporary.
+ * Order: index -> array -> compute address -> load
+ */
+static void op_get_array(jit_ctx *ctx, vreg *dst, vreg *array, vreg *index) {
+	preg *array_reg = fetch(ctx, array);
+	preg *index_reg = fetch(ctx, index);
+	preg *dst_reg = alloc_dst(ctx, dst);
+
+	// CArrays (HABSTRACT) have different layout - no header, and for HOBJ/HSTRUCT
+	// we return the address (LEA) rather than loading the value
+	bool is_carray = (array->t->kind == HABSTRACT);
+	bool is_lea = is_carray && (dst->t->kind == HOBJ || dst->t->kind == HSTRUCT);
+
+	// Element stride depends on the container kind and destination type.
+	int elem_size;
+	if (is_carray) {
+		if (is_lea) {
+			// For HOBJ/HSTRUCT in CArray, element size is the runtime object size
+			hl_runtime_obj *rt = hl_get_obj_rt(dst->t);
+			elem_size = rt->size;
+		} else {
+			// For other types in CArray, element size is pointer size
+			elem_size = sizeof(void*);
+		}
+	} else {
+		elem_size = hl_type_size(dst->t);
+	}
+
+	// Step 1: Load index FIRST (may clobber RTMP in fallback, but we haven't used it yet)
+	Arm64Reg index_r = (index_reg->kind == RCPU) ? (Arm64Reg)index_reg->id : RTMP2;
+	if (index_reg->kind == RCONST) {
+		load_immediate(ctx, index_reg->id, index_r, false);
+	} else if (index_reg->kind != RCPU) {
+		ldr_stack(ctx, index_r, index->stackPos, index->size);
+	}
+
+	// Step 2: Load array (if it needs RTMP, the value will stay in RTMP)
+	Arm64Reg array_r = (array_reg->kind == RCPU) ? (Arm64Reg)array_reg->id : RTMP;
+	if (array_reg->kind != RCPU) {
+		ldr_stack(ctx, array_r, array->stackPos, array->size);
+	}
+
+	Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : X9;
+
+	// Step 3: Calculate element address
+	// For varray: array + sizeof(varray) + index * elem_size
+	// For CArray: array + index * elem_size (no header)
+
+	if (is_carray) {
+		// CArray: no header offset, start from array_r directly
+		// Scale index by elem_size (LSL shift for power-of-2 strides)
+		if (elem_size == 1) {
+			encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, 0, array_r, RTMP);
+		} else if (elem_size == 2 || elem_size == 4 || elem_size == 8) {
+			int shift = (elem_size == 2) ? 1 : (elem_size == 4) ? 2 : 3;
+			encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, shift, array_r, RTMP);
+		} else {
+			// Non-power-of-2: compute index * elem_size in RTMP2, then add
+			load_immediate(ctx, elem_size, RTMP2, false);
+			encode_madd_msub(ctx, 1, 0, RTMP2, XZR, index_r, RTMP2);
+			encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, array_r, RTMP);
+		}
+	} else {
+		// varray: add sizeof(varray) header offset first
+		encode_add_sub_imm(ctx, 1, 0, 0, 0, sizeof(varray), array_r, RTMP);
+
+		// Add scaled index offset: RTMP = RTMP + (index_r << shift)
+		if (elem_size == 1) {
+			encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, 0, RTMP, RTMP);
+		} else if (elem_size == 2 || elem_size == 4 || elem_size == 8) {
+			int shift = (elem_size == 2) ? 1 : (elem_size == 4) ? 2 : 3;
+			encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, shift, RTMP, RTMP);
+		} else {
+			// Non-power-of-2: scale index into RTMP2, then add
+			load_immediate(ctx, elem_size, RTMP2, false);
+			encode_madd_msub(ctx, 1, 0, RTMP2, XZR, index_r, RTMP2);
+			encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, RTMP, RTMP);
+		}
+	}
+
+	if (is_lea) {
+		// LEA: just move the computed address to dst
+		mov_reg_reg(ctx, dst_r, RTMP, true);
+		str_stack(ctx, dst_r, dst->stackPos, dst->size);
+	} else if (IS_FLOAT(dst)) {
+		// Float load: use FP register with V=1
+		// NOTE(review): the allocated dst_reg is bypassed here - the value
+		// always goes through V0 and the binding is cleared below so the
+		// stack slot stays the source of truth. Confirm this interplay with
+		// alloc_dst is intentional.
+		preg *pv0 = PVFPR(0);
+		if (pv0->holds != NULL && pv0->holds != dst) {
+			free_reg(ctx, pv0);
+		}
+		int size_bits = (dst->size == 8) ? 0x03 : 0x02; // F64 or F32
+		encode_ldr_str_imm(ctx, size_bits, 1, 0x01, 0, RTMP, (Arm64Reg)V0); // V=1 for FP
+		str_stack_fp(ctx, V0, dst->stackPos, dst->size);
+		// Clear dst's old binding - value is now on stack, not in a register
+		if (dst->current != NULL) {
+			dst->current->holds = NULL;
+			dst->current = NULL;
+		}
+	} else {
+		// Integer load
+		int size_bits = (elem_size == 1) ? 0x00 : (elem_size == 2) ? 0x01 : (elem_size == 4) ? 0x02 : 0x03;
+		encode_ldr_str_imm(ctx, size_bits, 0, 0x01, 0, RTMP, dst_r);
+		str_stack(ctx, dst_r, dst->stackPos, dst->size);
+	}
+
+	discard(ctx, array_reg);
+	discard(ctx, index_reg);
+}
+
+/*
+ * Array element assignment: array[index] = value
+ * OSetArray: hl_aptr(array)[index] = value
+ *
+ * varray layout: { hl_type *t, hl_type *at, int size, int __pad } = 24 bytes
+ * Data is INLINE immediately after the header (not via a pointer!)
+ *
+ * CArray (HABSTRACT) layout: raw memory, no header
+ * For HOBJ/HSTRUCT: copy entire struct from value (which is address from LEA)
+ * For other types: store value directly
+ */
+/*
+ * IMPORTANT: We must load value and index BEFORE array when array uses RTMP,
+ * because ldr_stack's fallback path uses RTMP as a temporary.
+ * Order: value -> index -> array -> compute address -> store
+ */
+static void op_set_array(jit_ctx *ctx, vreg *array, vreg *index, vreg *value) {
+	// Emits code for OSetArray: stores `value` into element `index` of `array`.
+	// See the layout comments above: varray data is inline after a 24-byte
+	// header; CArray (HABSTRACT) is raw memory with no header.
+	preg *array_reg = fetch(ctx, array);
+	preg *index_reg = fetch(ctx, index);
+	preg *value_reg = fetch(ctx, value);
+
+	// CArrays (HABSTRACT) have different semantics
+	bool is_carray = (array->t->kind == HABSTRACT);
+	bool is_struct_copy = is_carray && (value->t->kind == HOBJ || value->t->kind == HSTRUCT);
+
+	int elem_size;
+	if (is_carray) {
+		if (is_struct_copy) {
+			// For HOBJ/HSTRUCT in CArray, element size is the runtime object size
+			hl_runtime_obj *rt = hl_get_obj_rt(value->t);
+			elem_size = rt->size;
+		} else {
+			// For other types in CArray, element size is pointer size
+			elem_size = sizeof(void*);
+		}
+	} else {
+		elem_size = hl_type_size(value->t);
+	}
+
+	// Step 1: Load value FIRST (before using RTMP for address computation)
+	// For struct copy, value is a pointer to the source struct
+	// For floats, use FP register; for integers, use CPU register
+	// X9 is used as a scratch holder for the value when it is not already in a
+	// CPU register; it must survive until the final store below.
+	Arm64Reg value_r = X9;
+	Arm64FpReg value_fp = V0;
+	bool is_float_value = IS_FLOAT(value);
+
+	if (is_float_value) {
+		if (value_reg->kind == RFPU) {
+			value_fp = (Arm64FpReg)value_reg->id;
+		} else {
+			ldr_stack_fp(ctx, value_fp, value->stackPos, value->size);
+		}
+	} else {
+		value_r = (value_reg->kind == RCPU) ? (Arm64Reg)value_reg->id : X9;
+		if (value_reg->kind == RCONST) {
+			// For constants, preg->id carries the constant value itself
+			load_immediate(ctx, value_reg->id, value_r, value->size == 8);
+		} else if (value_reg->kind != RCPU) {
+			ldr_stack(ctx, value_r, value->stackPos, value->size);
+		}
+	}
+
+	// Step 2: Load index (may clobber RTMP in fallback, but we haven't used it yet)
+	Arm64Reg index_r = (index_reg->kind == RCPU) ? (Arm64Reg)index_reg->id : RTMP2;
+	if (index_reg->kind == RCONST) {
+		load_immediate(ctx, index_reg->id, index_r, false);
+	} else if (index_reg->kind != RCPU) {
+		ldr_stack(ctx, index_r, index->stackPos, index->size);
+	}
+
+	// Step 3: Load array (if it needs RTMP, the value will stay in RTMP)
+	Arm64Reg array_r = (array_reg->kind == RCPU) ? (Arm64Reg)array_reg->id : RTMP;
+	if (array_reg->kind != RCPU) {
+		ldr_stack(ctx, array_r, array->stackPos, array->size);
+	}
+
+	// Step 4: Calculate element address
+	// For varray: array + sizeof(varray) + index * elem_size
+	// For CArray: array + index * elem_size (no header)
+	// In every branch the final element address ends up in RTMP.
+
+	if (is_carray) {
+		// CArray: no header offset
+		if (elem_size == 1) {
+			encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, 0, array_r, RTMP);
+		} else if (elem_size == 2 || elem_size == 4 || elem_size == 8) {
+			int shift = (elem_size == 2) ? 1 : (elem_size == 4) ? 2 : 3;
+			encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, shift, array_r, RTMP);
+		} else {
+			// Non-power-of-2: compute index * elem_size
+			load_immediate(ctx, elem_size, RTMP2, false);
+			encode_madd_msub(ctx, 1, 0, RTMP2, XZR, index_r, RTMP2);
+			encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, array_r, RTMP);
+		}
+	} else {
+		// varray: add sizeof(varray) header offset first
+		encode_add_sub_imm(ctx, 1, 0, 0, 0, sizeof(varray), array_r, RTMP);
+
+		// Add scaled index offset
+		if (elem_size == 1) {
+			encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, 0, RTMP, RTMP);
+		} else if (elem_size == 2 || elem_size == 4 || elem_size == 8) {
+			int shift = (elem_size == 2) ? 1 : (elem_size == 4) ? 2 : 3;
+			encode_add_sub_reg(ctx, 1, 0, 0, SHIFT_LSL, index_r, shift, RTMP, RTMP);
+		} else {
+			load_immediate(ctx, elem_size, RTMP2, false);
+			encode_madd_msub(ctx, 1, 0, RTMP2, XZR, index_r, RTMP2);
+			encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP2, 0, RTMP, RTMP);
+		}
+	}
+
+	if (is_struct_copy) {
+		// Copy struct from value (pointer) to RTMP (destination)
+		// value_r points to source struct, RTMP points to destination
+		// Use X10 as temporary for copy (not value_r which we need as source base)
+		// Greedy word-size copy: largest power-of-2 chunk that still fits.
+		int offset = 0;
+		while (offset < elem_size) {
+			int remain = elem_size - offset;
+			int copy_size, size_bits;
+			if (remain >= 8) {
+				copy_size = 8;
+				size_bits = 0x03;
+			} else if (remain >= 4) {
+				copy_size = 4;
+				size_bits = 0x02;
+			} else if (remain >= 2) {
+				copy_size = 2;
+				size_bits = 0x01;
+			} else {
+				copy_size = 1;
+				size_bits = 0x00;
+			}
+			// Load from source: X10 = [value_r + offset]
+			// (third numeric arg: 0x01 = load, 0x00 = store, matching the
+			// encode_ldr_str_imm usage elsewhere in this function)
+			encode_ldur_stur(ctx, size_bits, 0, 0x01, offset, value_r, X10);
+			// Store to dest: [RTMP + offset] = X10
+			encode_ldur_stur(ctx, size_bits, 0, 0x00, offset, RTMP, X10);
+			offset += copy_size;
+		}
+	} else if (is_float_value) {
+		// Float store: STR Vn, [RTMP] with V=1
+		int size_bits = (value->size == 8) ? 0x03 : 0x02; // F64 or F32
+		encode_ldr_str_imm(ctx, size_bits, 1, 0x00, 0, RTMP, (Arm64Reg)value_fp); // V=1 for FP
+	} else {
+		// Integer store: STR Xn, [RTMP]
+		int size_bits = (elem_size == 1) ? 0x00 : (elem_size == 2) ? 0x01 : (elem_size == 4) ? 0x02 : 0x03;
+		encode_ldr_str_imm(ctx, size_bits, 0, 0x00, 0, RTMP, value_r);
+	}
+
+	discard(ctx, array_reg);
+	discard(ctx, index_reg);
+	discard(ctx, value_reg);
+}
+
+/*
+ * Global variable access: dst = globals[index]
+ * OGetGlobal: Use PC-relative addressing with ADRP + LDR
+ */
+static void op_get_global(jit_ctx *ctx, vreg *dst, int global_index) {
+	// OGetGlobal: dst = globals[global_index].
+	// The global's absolute address is known at JIT time, so it is embedded
+	// as an immediate rather than using PC-relative ADRP (despite the header
+	// comment above mentioning ADRP).
+	preg *dst_reg = alloc_dst(ctx, dst);
+
+	// Get global address from module
+	// NOTE(review): this indexes globals_data as an array of pointer-sized
+	// slots (&globals[global_index]) — confirm that module globals are laid
+	// out this way rather than via a per-global byte-offset table.
+	void **globals = (void**)ctx->m->globals_data;
+	void *global_addr = &globals[global_index];
+
+	// Load global address to RTMP2
+	load_immediate(ctx, (int64_t)global_addr, RTMP2, true);
+
+	if (IS_FLOAT(dst)) {
+		// Float: load into FPU register
+		Arm64FpReg dst_r = (dst_reg->kind == RFPU) ? (Arm64FpReg)dst_reg->id : V16;
+		// LDR Vn, [RTMP2] - floating point load
+		// size: 0x02=32-bit (S), 0x03=64-bit (D)
+		encode_ldr_str_imm(ctx, dst->size == 8 ? 0x03 : 0x02, 1, 0x01, 0, RTMP2, (Arm64Reg)dst_r);
+		// Store to stack
+		str_stack_fp(ctx, dst_r, dst->stackPos, dst->size);
+	} else {
+		// Integer/pointer: load into CPU register
+		Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP;
+		// LDR Xn, [RTMP2]
+		encode_ldr_str_imm(ctx, dst->size == 8 ? 0x03 : 0x02, 0, 0x01, 0, RTMP2, dst_r);
+		// Store to stack
+		str_stack(ctx, dst_r, dst->stackPos, dst->size);
+	}
+}
+
+/*
+ * Global variable assignment: globals[index] = value
+ * OSetGlobal
+ */
+static void op_set_global(jit_ctx *ctx, int global_index, vreg *value) {
+	// OSetGlobal: globals[global_index] = value.
+	// Mirrors op_get_global: the global's address is embedded as an immediate
+	// in RTMP2, then a plain STR writes through it.
+	preg *value_reg = fetch(ctx, value);
+
+	// Get global address from module
+	// NOTE(review): assumes pointer-sized global slots — same caveat as
+	// op_get_global; verify against the module's globals layout.
+	void **globals = (void**)ctx->m->globals_data;
+	void *global_addr = &globals[global_index];
+
+	// Load global address to RTMP2
+	load_immediate(ctx, (int64_t)global_addr, RTMP2, true);
+
+	if (IS_FLOAT(value)) {
+		// Float: store from FPU register
+		Arm64FpReg value_r = (value_reg->kind == RFPU) ? (Arm64FpReg)value_reg->id : V16;
+		if (value_reg->kind != RFPU) {
+			// Load from stack into temp FPU register
+			ldr_stack_fp(ctx, value_r, value->stackPos, value->size);
+		}
+		// STR Vn, [RTMP2] - floating point store
+		encode_ldr_str_imm(ctx, value->size == 8 ? 0x03 : 0x02, 1, 0x00, 0, RTMP2, (Arm64Reg)value_r);
+	} else {
+		// Integer/pointer: store from CPU register
+		Arm64Reg value_r = (value_reg->kind == RCPU) ? (Arm64Reg)value_reg->id : RTMP;
+		if (value_reg->kind == RCONST) {
+			// For constants, preg->id carries the constant value itself
+			load_immediate(ctx, value_reg->id, value_r, value->size == 8);
+		} else if (value_reg->kind != RCPU) {
+			ldr_stack(ctx, value_r, value->stackPos, value->size);
+		}
+		// STR Xn, [RTMP2]
+		encode_ldr_str_imm(ctx, value->size == 8 ? 0x03 : 0x02, 0, 0x00, 0, RTMP2, value_r);
+	}
+
+	discard(ctx, value_reg);
+}
+
+// ============================================================================
+// Reference Operations
+// ============================================================================
+
+/*
+ * Create reference: dst = &src
+ * ORef: dst = address of vreg
+ *
+ * IMPORTANT: After taking a reference to a vreg, that vreg may be modified
+ * through the reference (via OSetref). We must:
+ * 1. Ensure src is spilled to stack (in case it's only in a register)
+ * 2. Invalidate src's register binding so future reads go to stack
+ */
+static void op_ref(jit_ctx *ctx, vreg *dst, vreg *src) {
+	// ORef: dst = &src. The reference points at src's FP-relative stack slot,
+	// so the slot must become src's single source of truth from here on.
+	// First, ensure src is on stack and invalidate its register binding
+	// (like x86's scratch(ra->current))
+	if (src->current != NULL) {
+		// Spill to stack if in a register
+		store(ctx, src, src->current);
+		// Invalidate the binding so future reads go to stack
+		src->current->holds = NULL;
+		src->current = NULL;
+	}
+
+	preg *dst_reg = alloc_dst(ctx, dst);
+	Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP;
+
+	// Calculate stack address: FP + src->stackPos
+	// (ADD/SUB immediates must be non-negative, hence the sign split)
+	if (src->stackPos >= 0) {
+		// ADD dst_r, FP, #stackPos
+		encode_add_sub_imm(ctx, 1, 0, 0, 0, src->stackPos, FP, dst_r);
+	} else {
+		// SUB dst_r, FP, #(-stackPos)
+		encode_add_sub_imm(ctx, 1, 1, 0, 0, -src->stackPos, FP, dst_r);
+	}
+
+	// Always store to stack - source of truth for later loads
+	str_stack(ctx, dst_r, dst->stackPos, dst->size);
+}
+
+/*
+ * Dereference: dst = *src
+ * OUnref: Load value from pointer
+ */
+static void op_unref(jit_ctx *ctx, vreg *dst, vreg *src) {
+	// OUnref: dst = *src, where src holds a pointer produced by ORef.
+	preg *src_reg = fetch(ctx, src);
+
+	// Load the pointer (always integer register since it's an address)
+	Arm64Reg src_r = (src_reg->kind == RCPU) ? (Arm64Reg)src_reg->id : RTMP;
+	if (src_reg->kind != RCPU) {
+		ldr_stack(ctx, src_r, src->stackPos, src->size);
+	}
+
+	// Access size follows the destination type (1/2/4/8 bytes)
+	int size_bits = (dst->size == 1) ? 0x00 : (dst->size == 2) ? 0x01 : (dst->size == 4) ? 0x02 : 0x03;
+
+	if (IS_FLOAT(dst)) {
+		// Float dereference: LDR Vd, [src_r]
+		preg *dst_reg = alloc_dst(ctx, dst);
+		Arm64FpReg dst_r = (dst_reg->kind == RFPU) ? (Arm64FpReg)dst_reg->id : V16;
+		encode_ldr_str_imm(ctx, size_bits, 1, 0x01, 0, src_r, (Arm64Reg)dst_r);
+		str_stack_fp(ctx, dst_r, dst->stackPos, dst->size);
+	} else {
+		// Integer dereference: LDR Xd, [src_r]
+		// RTMP2 as fallback because src may still live in RTMP
+		preg *dst_reg = alloc_dst(ctx, dst);
+		Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP2;
+		encode_ldr_str_imm(ctx, size_bits, 0, 0x01, 0, src_r, dst_r);
+		str_stack(ctx, dst_r, dst->stackPos, dst->size);
+	}
+
+	discard(ctx, src_reg);
+}
+
+/*
+ * Set reference: *dst = src
+ * OSetref: Store value to pointer
+ */
+static void op_setref(jit_ctx *ctx, vreg *dst, vreg *src) {
+	// OSetref: *dst = src, where dst holds a pointer produced by ORef.
+	// NOTE(review): src is loaded via the integer path even for float vregs —
+	// confirm float OSetref is handled elsewhere or cannot occur here.
+	preg *dst_reg = fetch(ctx, dst);
+	preg *src_reg = fetch(ctx, src);
+
+	Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP;
+	if (dst_reg->kind != RCPU) {
+		ldr_stack(ctx, dst_r, dst->stackPos, dst->size);
+	}
+
+	Arm64Reg src_r = (src_reg->kind == RCPU) ? (Arm64Reg)src_reg->id : RTMP2;
+	if (src_reg->kind == RCONST) {
+		// For constants, preg->id carries the constant value itself
+		load_immediate(ctx, src_reg->id, src_r, src->size == 8);
+	} else if (src_reg->kind != RCPU) {
+		ldr_stack(ctx, src_r, src->stackPos, src->size);
+	}
+
+	// Store to pointer: STR src_r, [dst_r]
+	// Access size follows the source value type (1/2/4/8 bytes)
+	int size_bits = (src->size == 1) ? 0x00 : (src->size == 2) ? 0x01 : (src->size == 4) ? 0x02 : 0x03;
+	encode_ldr_str_imm(ctx, size_bits, 0, 0x00, 0, dst_r, src_r);
+
+	discard(ctx, dst_reg);
+	discard(ctx, src_reg);
+}
+
+// ============================================================================
+// Object Access, Casts and Null Checks
+// ============================================================================
+
+/*
+ * OGetThis: Load a field from the "this" object (R(0))
+ * Equivalent to OField but implicitly uses R(0) as the object
+ */
+static void op_get_this(jit_ctx *ctx, vreg *dst, int field_idx) {
+	// OGetThis is simply OField with the receiver hard-wired to vreg 0,
+	// which by convention holds the "this" object.
+	op_field(ctx, dst, R(0), field_idx);
+}
+
+/*
+ * Get the dynamic cast function for a given type
+ */
+static void *get_dyncast(hl_type *t) {
+	// Pick the runtime coercion helper matching the target representation:
+	// dedicated entry points for f32/f64/i64, one shared helper for all
+	// int-like kinds, and the pointer cast for everything else.
+	if (t->kind == HF32)
+		return hl_dyn_castf;
+	if (t->kind == HF64)
+		return hl_dyn_castd;
+	if (t->kind == HI64 || t->kind == HGUID)
+		return hl_dyn_casti64;
+	if (t->kind == HI32 || t->kind == HUI16 || t->kind == HUI8 || t->kind == HBOOL)
+		return hl_dyn_casti;
+	return hl_dyn_castp;
+}
+
+/*
+ * Cast operation (safe cast with runtime check)
+ * OSafeCast: dst = (target_type)obj or NULL if cast fails
+ */
+static void op_safe_cast(jit_ctx *ctx, vreg *dst, vreg *obj, hl_type *target_type) {
+	// OSafeCast: dst = runtime-checked cast of obj.
+	// Fast path: Null<T> -> T with matching kind is unboxed inline (null
+	// becomes zero). Everything else goes through the hl_dyn_cast* helpers.
+	// Special case: Null to T - unbox with null check
+	if (obj->t->kind == HNULL && obj->t->tparam->kind == dst->t->kind) {
+		int jnull, jend;
+
+		switch (dst->t->kind) {
+		case HUI8:
+		case HUI16:
+		case HI32:
+		case HBOOL:
+		case HI64:
+		case HGUID:
+			{
+				preg *tmp = fetch(ctx, obj);
+				Arm64Reg r = (tmp->kind == RCPU) ? tmp->id : RTMP;
+				if (tmp->kind != RCPU) {
+					ldr_stack(ctx, r, obj->stackPos, obj->size);
+				}
+				// Test for null
+				encode_add_sub_imm(ctx, 1, 1, 1, 0, 0, r, XZR); // CMP r, #0
+				jnull = BUF_POS();
+				encode_branch_cond(ctx, 0, COND_EQ); // B.EQ null_path
+
+				// Non-null: load value from offset 8 with correct size
+				// Size determines scale: 0x00=1, 0x01=2, 0x02=4, 0x03=8
+				// So offset = 8 / scale to get byte offset 8
+				int size_code;
+				int scaled_offset;
+				switch (dst->size) {
+				case 1: size_code = 0x00; scaled_offset = 8; break; // LDRB [r, #8]
+				case 2: size_code = 0x01; scaled_offset = 4; break; // LDRH [r, #8]
+				case 4: size_code = 0x02; scaled_offset = 2; break; // LDR W [r, #8]
+				default: size_code = 0x03; scaled_offset = 1; break; // LDR X [r, #8]
+				}
+				// The LDR below clobbers r. If obj is dirty in r, save it to stack first.
+				// This preserves obj's value (the dynamic pointer) for later use.
+				if (obj->dirty && obj->current == tmp) {
+					str_stack(ctx, r, obj->stackPos, obj->size);
+					obj->dirty = 0;
+				}
+				encode_ldr_str_imm(ctx, size_code, 0, 0x01, scaled_offset, r, r);
+				jend = BUF_POS();
+				encode_branch_uncond(ctx, 0); // B end
+
+				// Null path: set to zero
+				patch_jump(ctx, jnull, BUF_POS());
+				load_immediate(ctx, 0, r, dst->size == 8);
+
+				// End
+				patch_jump(ctx, jend, BUF_POS());
+				str_stack(ctx, r, dst->stackPos, dst->size);
+				// Clear binding - register no longer holds obj's original value
+				discard(ctx, tmp);
+				// Invalidate dst's old binding since we wrote directly to stack
+				if (dst->current) {
+					dst->current->holds = NULL;
+					dst->current = NULL;
+				}
+			}
+			return;
+
+		case HF32:
+		case HF64:
+			{
+				preg *tmp = fetch(ctx, obj);
+				Arm64Reg r = (tmp->kind == RCPU) ? tmp->id : RTMP;
+				if (tmp->kind != RCPU) {
+					ldr_stack(ctx, r, obj->stackPos, obj->size);
+				}
+				// Evict any vreg currently bound to V0 before using it
+				preg *pv0 = PVFPR(0);
+				if (pv0->holds != NULL && pv0->holds != dst) {
+					free_reg(ctx, pv0);
+				}
+				// Test for null
+				encode_add_sub_imm(ctx, 1, 1, 1, 0, 0, r, XZR); // CMP r, #0
+				jnull = BUF_POS();
+				encode_branch_cond(ctx, 0, COND_EQ); // B.EQ null_path
+
+				// Non-null: load float from offset 8
+				// (immediate is scaled by the access size, hence 8 / dst->size)
+				encode_ldr_str_imm(ctx, (dst->size == 8) ? 0x03 : 0x02, 1, 0x01, 8 / dst->size, r, (Arm64Reg)V0);
+				jend = BUF_POS();
+				encode_branch_uncond(ctx, 0); // B end
+
+				// Null path: set to zero
+				patch_jump(ctx, jnull, BUF_POS());
+				// FMOV Vd, XZR (hand-encoded: integer-to-FP move of the zero register)
+				EMIT32(ctx, (1 << 31) | (0 << 29) | (0x1E << 24) | (1 << 22) | (1 << 21) | (7 << 16) | (31 << 5) | V0);
+
+				// End
+				patch_jump(ctx, jend, BUF_POS());
+				str_stack_fp(ctx, V0, dst->stackPos, dst->size);
+				// Clear binding - register no longer holds obj's original value
+				discard(ctx, tmp);
+				// Invalidate dst's old binding since we wrote directly to stack
+				if (dst->current) {
+					dst->current->holds = NULL;
+					dst->current = NULL;
+				}
+			}
+			return;
+
+		default:
+			break;
+		}
+	}
+
+	// General case: call runtime cast function
+	spill_regs(ctx);
+
+	// Get stack address of obj
+	// (hl_dyn_cast* take a pointer to the value, not the value itself)
+	// LEA X0, [FP, #obj->stackPos] or similar
+	if (obj->stackPos >= 0) {
+		encode_add_sub_imm(ctx, 1, 0, 0, 0, obj->stackPos, FP, X0);
+	} else {
+		encode_add_sub_imm(ctx, 1, 1, 0, 0, -obj->stackPos, FP, X0);
+	}
+
+	// Set up arguments based on destination type
+	void *cast_func = get_dyncast(dst->t);
+	switch (dst->t->kind) {
+	case HF32:
+	case HF64:
+	case HI64:
+		// 2 args: ptr, src_type
+		load_immediate(ctx, (int64_t)obj->t, X1, true);
+		break;
+	default:
+		// 3 args: ptr, src_type, dst_type
+		load_immediate(ctx, (int64_t)obj->t, X1, true);
+		load_immediate(ctx, (int64_t)dst->t, X2, true);
+		break;
+	}
+
+	// Call cast function
+	load_immediate(ctx, (int64_t)cast_func, RTMP, true);
+	EMIT32(ctx, 0xD63F0000 | (RTMP << 5)); // BLR RTMP
+
+	// Store result and clear stale binding
+	if (IS_FLOAT(dst)) {
+		str_stack_fp(ctx, V0, dst->stackPos, dst->size);
+	} else {
+		str_stack(ctx, X0, dst->stackPos, dst->size);
+	}
+	store_result(ctx, dst);
+}
+
+/*
+ * Null check: raise a runtime error when dst is null
+ * ONullCheck: calls hl_null_access (which throws) if dst == NULL,
+ * otherwise execution falls through to the next opcode
+ */
+static void op_null_check(jit_ctx *ctx, vreg *dst) {
+	// ONullCheck: if dst == NULL, call hl_null_access (which throws);
+	// otherwise fall straight through to the next opcode.
+	// Check if dst is null and call hl_null_access if so
+	preg *dst_reg = fetch(ctx, dst);
+
+	Arm64Reg dst_r = (dst_reg->kind == RCPU) ? (Arm64Reg)dst_reg->id : RTMP;
+	if (dst_reg->kind != RCPU) {
+		ldr_stack(ctx, dst_r, dst->stackPos, dst->size);
+	}
+
+	// Compare with zero: CMP dst_r, #0 (actually SUBS XZR, dst_r, #0)
+	encode_add_sub_imm(ctx, 1, 1, 1, 0, 0, dst_r, XZR);
+
+	// If not zero (not null), skip error handling: B.NE skip
+	int bne_pos = BUF_POS();
+	encode_branch_cond(ctx, 0, COND_NE); // B.NE (will patch offset)
+
+	// Null path: call hl_null_access(field, hashed_name)
+	// For simplicity, pass 0 for both (basic null access error)
+	// NOTE: Do NOT call spill_regs() here! hl_null_access never returns (it throws),
+	// and spill_regs() would corrupt compile-time register bindings for the non-null path.
+	load_immediate(ctx, 0, X0, true); // field = NULL
+	load_immediate(ctx, 0, X1, true); // hashed_name = 0
+	load_immediate(ctx, (int64_t)hl_null_access, RTMP, true);
+	EMIT32(ctx, 0xD63F0000 | (RTMP << 5)); // BLR RTMP
+	// hl_null_access doesn't return (it throws), but we don't emit anything after
+
+	// Patch the B.NE to skip here
+	// (re-emit the branch in place with the now-known offset, then restore
+	// the buffer cursor)
+	int skip_pos = BUF_POS();
+	int bne_offset = (skip_pos - bne_pos) / 4;
+	ctx->buf.b = ctx->startBuf + bne_pos;
+	encode_branch_cond(ctx, bne_offset, COND_NE);
+	ctx->buf.b = ctx->startBuf + skip_pos;
+
+	discard(ctx, dst_reg);
+}
+
+/*
+ * Object/memory allocation operations
+ * These typically call into the runtime allocator
+ */
+static void op_new(jit_ctx *ctx, vreg *dst, hl_type *type) {
+	// ONew: allocate an instance of `type` and store the pointer in dst.
+	// Dispatches to the runtime allocator matching the type kind:
+	// - HOBJ/HSTRUCT: hl_alloc_obj(type)
+	// - HDYNOBJ: hl_alloc_dynobj() - no arguments!
+	// - HVIRTUAL: hl_alloc_virtual(type)
+
+	// Spill all caller-saved registers BEFORE the call
+	spill_regs(ctx);
+
+	void *alloc_func;
+	int has_type_arg = 1;
+
+	switch (type->kind) {
+	case HOBJ:
+	case HSTRUCT:
+		alloc_func = (void*)hl_alloc_obj;
+		break;
+	case HDYNOBJ:
+		alloc_func = (void*)hl_alloc_dynobj;
+		has_type_arg = 0; // hl_alloc_dynobj takes no arguments
+		break;
+	case HVIRTUAL:
+		alloc_func = (void*)hl_alloc_virtual;
+		break;
+	default:
+		// Unsupported type for ONew - report through the JIT error path
+		// (consistent with emit_call_findex / op_call_hl) instead of
+		// silently printing to stdout and leaving dst undefined
+		jit_error("op_new: unsupported type kind");
+		return;
+	}
+
+	// Load type address to X0 (first argument) if needed
+	if (has_type_arg) {
+		load_immediate(ctx, (int64_t)type, X0, true);
+	}
+
+	// Load function pointer and call
+	load_immediate(ctx, (int64_t)alloc_func, RTMP, true);
+
+	// Call allocator: BLR RTMP
+	EMIT32(ctx, (0xD63F0000) | (RTMP << 5));
+
+	// Result is in X0 - always store to stack first (source of truth for later loads)
+	str_stack(ctx, X0, dst->stackPos, dst->size);
+
+	// Also keep in a register if allocated
+	preg *dst_reg = alloc_dst(ctx, dst);
+	if (dst_reg->kind == RCPU && (Arm64Reg)dst_reg->id != X0) {
+		mov_reg_reg(ctx, (Arm64Reg)dst_reg->id, X0, 8);
+	}
+}
+
+/*
+ * String/bytes operations
+ */
+static void op_string(jit_ctx *ctx, vreg *dst, int string_index) {
+	// OString: materialize the UTF-16 string pointer for this table index.
+	// hl_get_ustring converts from UTF-8 on first use and caches the result,
+	// so the pointer is stable and can be baked into the code stream.
+	const uchar *ustr = hl_get_ustring(ctx->m->code, string_index);
+
+	preg *p = alloc_dst(ctx, dst);
+	Arm64Reg r = (p->kind == RCPU) ? (Arm64Reg)p->id : RTMP;
+
+	// Embed the absolute pointer, then spill to dst's stack slot,
+	// which is the source of truth for later loads.
+	load_immediate(ctx, (int64_t)ustr, r, true);
+	str_stack(ctx, r, dst->stackPos, dst->size);
+}
+
+static void op_bytes(jit_ctx *ctx, vreg *dst, int bytes_index) {
+	// OBytes: resolve the raw bytes pointer from the module tables.
+	// Bytecode version 5+ keeps bytes in a dedicated blob indexed by an
+	// offset table; older versions reuse the string table directly.
+	char *data = (ctx->m->code->version >= 5)
+		? ctx->m->code->bytes + ctx->m->code->bytes_pos[bytes_index]
+		: ctx->m->code->strings[bytes_index];
+
+	preg *p = alloc_dst(ctx, dst);
+	Arm64Reg r = (p->kind == RCPU) ? (Arm64Reg)p->id : RTMP;
+
+	// Embed the absolute pointer, then spill to dst's stack slot,
+	// which is the source of truth for later loads.
+	load_immediate(ctx, (int64_t)data, r, true);
+	str_stack(ctx, r, dst->stackPos, dst->size);
+}
+
+// Forward declaration for prepare_call_args (defined later)
+static int prepare_call_args(jit_ctx *ctx, hl_type **arg_types, vreg **args, int nargs, bool is_native);
+
+/*
+ * Virtual/method call support (OCallMethod/OCallThis/OCallClosure)
+ * is implemented further below; the next section provides shared helpers.
+ */
+// ============================================================================
+// Dynamic Object Helpers
+// ============================================================================
+
+/**
+ * Get the appropriate dynamic set function for a type
+ */
+static void *get_dynset(hl_type *t) {
+	// Pick the dynamic-field setter matching the value representation:
+	// dedicated helpers for f32/f64/i64, one shared helper for all int-like
+	// kinds, and the pointer setter for everything else.
+	if (t->kind == HF32)
+		return hl_dyn_setf;
+	if (t->kind == HF64)
+		return hl_dyn_setd;
+	if (t->kind == HI64 || t->kind == HGUID)
+		return hl_dyn_seti64;
+	if (t->kind == HI32 || t->kind == HUI16 || t->kind == HUI8 || t->kind == HBOOL)
+		return hl_dyn_seti;
+	return hl_dyn_setp;
+}
+
+/**
+ * Get the appropriate dynamic get function for a type
+ */
+static void *get_dynget(hl_type *t) {
+	// Pick the dynamic-field getter matching the value representation;
+	// mirrors get_dynset above.
+	if (t->kind == HF32)
+		return hl_dyn_getf;
+	if (t->kind == HF64)
+		return hl_dyn_getd;
+	if (t->kind == HI64 || t->kind == HGUID)
+		return hl_dyn_geti64;
+	if (t->kind == HI32 || t->kind == HUI16 || t->kind == HUI8 || t->kind == HBOOL)
+		return hl_dyn_geti;
+	return hl_dyn_getp;
+}
+
+// ============================================================================
+// Method and Function Calls
+// ============================================================================
+
+static void op_call_method_obj(jit_ctx *ctx, vreg *dst, vreg *obj, int method_index, vreg **args, int nargs) {
+	// HOBJ method call: obj->type->vobj_proto[method_index](obj, args...)
+	//
+	// dst          - destination vreg for the return value (may be NULL/HVOID)
+	// obj          - receiver; becomes the implicit first call argument
+	// method_index - index into the object's virtual prototype table
+	// args/nargs   - explicit arguments following the receiver
+
+	// Spill all caller-saved registers BEFORE the call
+	spill_regs(ctx);
+
+	// Now fetch obj (will load from stack since we just spilled)
+	preg *obj_reg = fetch(ctx, obj);
+
+	Arm64Reg obj_r = (obj_reg->kind == RCPU) ? (Arm64Reg)obj_reg->id : RTMP;
+	if (obj_reg->kind != RCPU) {
+		ldr_stack(ctx, obj_r, obj->stackPos, obj->size);
+	}
+
+	// Load type from obj[0]
+	encode_ldr_str_imm(ctx, 0x03, 0, 0x01, 0, obj_r, RTMP); // RTMP = obj->type
+	// Load vobj_proto from type[16] (HL_WSIZE*2 = offset index 2)
+	encode_ldr_str_imm(ctx, 0x03, 0, 0x01, 2, RTMP, RTMP); // RTMP = type->vobj_proto
+	// Load method pointer from proto[method_index] into RTMP2
+	// NOTE: We use RTMP2 here because prepare_call_args uses RTMP for stack calculations
+	encode_ldr_str_imm(ctx, 0x03, 0, 0x01, method_index, RTMP, RTMP2);
+
+	discard(ctx, obj_reg);
+
+	// Prepare call with obj as first argument
+	vreg **full_args = (vreg**)malloc(sizeof(vreg*) * (nargs + 1));
+	if (full_args == NULL) {
+		// Allocation failure: report through the JIT error path instead of
+		// dereferencing NULL below (consistent with emit_call_findex)
+		jit_error("Out of memory");
+		return;
+	}
+	full_args[0] = obj;
+	for (int i = 0; i < nargs; i++) {
+		full_args[i + 1] = args[i];
+	}
+
+	// Prepare arguments (this uses RTMP, but method pointer is safe in RTMP2)
+	int stack_space = prepare_call_args(ctx, NULL, full_args, nargs + 1, false);
+	free(full_args);
+
+	// Call method: BLR RTMP2
+	EMIT32(ctx,(0xD63F0000) | (RTMP2 << 5));
+
+	// Clean up stack
+	if (stack_space > 0) {
+		encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG);
+	}
+
+	// Store return value (X0 for integers/pointers, V0 for floats)
+	if (dst && dst->t->kind != HVOID) {
+		preg *p = alloc_dst(ctx, dst);
+		if (IS_FLOAT(dst)) {
+			if (p->kind == RFPU && (Arm64FpReg)p->id != V0) {
+				fmov_reg_reg(ctx, (Arm64FpReg)p->id, V0, dst->size);
+			} else if (p->kind == RSTACK) {
+				str_stack_fp(ctx, V0, dst->stackPos, dst->size);
+			}
+		} else {
+			if (p->kind == RCPU && (Arm64Reg)p->id != X0) {
+				mov_reg_reg(ctx, (Arm64Reg)p->id, X0, dst->size);
+			} else if (p->kind == RSTACK) {
+				str_stack(ctx, X0, dst->stackPos, dst->size);
+			}
+		}
+	}
+}
+
+// ============================================================================
+// Function Calls
+// ============================================================================
+
+static inline void align_upwards(int *value, int alignment) {
+	// Round *value up to the next multiple of alignment (a power of two).
+	// The mask form is idempotent on already-aligned values, so no guard
+	// condition is needed.
+	*value = (*value + alignment - 1) & ~(alignment - 1);
+}
+
+/*
+ * Prepare arguments for a function call according to AAPCS64:
+ * - First 8 integer/pointer args in X0-X7
+ * - First 8 floating-point args in V0-V7
+ * - Additional args on stack (16-byte aligned)
+ * - on Apple platforms arguments only consume their size + any necessary padding
+ * - on platforms that follow the standard AAPCS arguments consume one or more 8-byte slots
+ * - Returns the total stack space needed for overflow args
+ */
+static int prepare_call_args(jit_ctx *ctx, hl_type **arg_types, vreg **args, int nargs, bool is_native) {
+	// Lay out call arguments per AAPCS64 (see header comment above):
+	// X0-X7 for integers/pointers, V0-V7 for floats, overflow on the stack.
+	// Returns the stack space reserved (caller must release it after the call).
+	// NOTE: arg_types and is_native are currently unused; kept for interface
+	// stability with callers.
+	int int_reg_count = 0;
+	int fp_reg_count = 0;
+	int stack_offset = 0;
+
+	// First pass: count args and calculate stack space needed
+	for (int i = 0; i < nargs; i++) {
+		bool is_fp = IS_FLOAT(args[i]);
+		int *reg_count = is_fp ? &fp_reg_count : &int_reg_count;
+
+		if (*reg_count >= CALL_NREGS) {
+			// Arg goes on stack
+			#ifdef __APPLE__
+			align_upwards(&stack_offset, args[i]->size);
+			stack_offset += args[i]->size;
+			#else
+			stack_offset += 8; // Each stack arg takes 8 bytes (aligned)
+			#endif
+		}
+		(*reg_count)++;
+	}
+
+	// Align stack to 16 bytes (AAPCS64 requires 16-byte SP alignment)
+	if (stack_offset & 15)
+		stack_offset = (stack_offset + 15) & ~15;
+
+	// Allocate stack space for overflow args if needed
+	if (stack_offset > 0) {
+		// SUB SP, SP, #stack_offset
+		encode_add_sub_imm(ctx, 1, 1, 0, 0, stack_offset, SP_REG, SP_REG);
+	}
+
+	// Second pass: move arguments to their locations
+	int_reg_count = 0;
+	fp_reg_count = 0;
+	int current_stack_offset = 0;
+
+	// After spill_regs(), all values are on stack.
+	// Load arguments directly to their destination registers to avoid
+	// the register allocation problem where fetch() reuses registers.
+	for (int i = 0; i < nargs; i++) {
+		vreg *arg = args[i];
+		bool is_fp = IS_FLOAT(arg);
+
+		if (is_fp) {
+			if (fp_reg_count < CALL_NREGS) {
+				// Load directly to FP argument register
+				Arm64FpReg dest_reg = FP_CALL_REGS[fp_reg_count];
+				ldr_stack_fp(ctx, dest_reg, arg->stackPos, arg->size);
+				fp_reg_count++;
+			} else {
+#ifdef __APPLE__
+				// BUGFIX: was mojibake "¤t_stack_offset" (eaten "&current...")
+				align_upwards(&current_stack_offset, arg->size);
+#endif
+				// Overflow: load to temp, then store to stack
+				ldr_stack_fp(ctx, V16, arg->stackPos, arg->size);
+				encode_ldr_str_imm(ctx, arg->size == 4 ? 0x02 : 0x03, 1, 0x00,
+				                   current_stack_offset / (arg->size == 4 ? 4 : 8),
+				                   SP_REG, (Arm64Reg)V16);
+#ifdef __APPLE__
+				current_stack_offset += arg->size;
+#else
+				current_stack_offset += 8; // Each stack arg takes 8 bytes (aligned)
+#endif
+			}
+		} else {
+			// Integer/pointer argument
+			if (int_reg_count < CALL_NREGS) {
+				// Load directly to integer argument register
+				Arm64Reg dest_reg = CALL_REGS[int_reg_count];
+				ldr_stack(ctx, dest_reg, arg->stackPos, arg->size);
+				int_reg_count++;
+			} else {
+#ifdef __APPLE__
+				// BUGFIX: was mojibake "¤t_stack_offset" (eaten "&current...")
+				align_upwards(&current_stack_offset, arg->size);
+#endif
+				// Overflow: load to temp, then store to stack
+				ldr_stack(ctx, RTMP, arg->stackPos, arg->size);
+				encode_ldr_str_imm(ctx, arg->size == 8 ? 0x03 : 0x02, 0, 0x00,
+				                   current_stack_offset / (arg->size == 8 ? 8 : 4),
+				                   SP_REG, RTMP);
+#ifdef __APPLE__
+				current_stack_offset += arg->size;
+#else
+				current_stack_offset += 8; // Each stack arg takes 8 bytes (aligned)
+#endif
+			}
+		}
+	}
+
+	return stack_offset;
+}
+
+/*
+ * Call a native C function
+ */
+static void op_call_native(jit_ctx *ctx, vreg *dst, hl_type *ftype, void *func_ptr, vreg **args, int nargs) {
+	// Call a native C function at a known address.
+	// ftype is currently unused; kept for interface stability with callers.
+	// Spill all caller-saved registers BEFORE the call
+	spill_regs(ctx);
+
+	// Prepare arguments (arg_types not actually used by prepare_call_args)
+	int stack_space = prepare_call_args(ctx, NULL, args, nargs, true);
+
+	// Load function pointer to RTMP
+	load_immediate(ctx, (int64_t)func_ptr, RTMP, true);
+
+	// BLR RTMP (Branch with Link to Register)
+	// Encoding: 1101 0110 0011 1111 0000 00rr rrr0 0000
+	// where rrrrr = RTMP register number
+	EMIT32(ctx,(0xD63F0000) | (RTMP << 5));
+
+	// Clean up stack if we allocated space for args
+	if (stack_space > 0) {
+		// ADD SP, SP, #stack_space
+		encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG);
+	}
+
+	// Store return value if needed (X0 for integers/pointers, V0 for floats)
+	if (dst && dst->t->kind != HVOID) {
+		// Always store to stack first (source of truth for later loads)
+		if (IS_FLOAT(dst)) {
+			str_stack_fp(ctx, V0, dst->stackPos, dst->size);
+		} else {
+			str_stack(ctx, X0, dst->stackPos, dst->size);
+		}
+
+		// Also keep in a register if allocated to a different one
+		preg *p = alloc_dst(ctx, dst);
+		if (IS_FLOAT(dst)) {
+			if (p->kind == RFPU && (Arm64FpReg)p->id != V0) {
+				fmov_reg_reg(ctx, (Arm64FpReg)p->id, V0, dst->size);
+			}
+		} else {
+			if (p->kind == RCPU && (Arm64Reg)p->id != X0) {
+				mov_reg_reg(ctx, (Arm64Reg)p->id, X0, dst->size);
+			}
+		}
+	}
+}
+
+/*
+ * Call a native function with a known absolute address
+ * The address is embedded directly in the instruction stream (no patching needed)
+ */
+static void call_native(jit_ctx *ctx, void *nativeFun, int stack_space) {
+	// Emit indirect call sequence with the address embedded inline:
+	// LDR X17, #12     ; load target address from PC+12
+	// BLR X17          ; call
+	// B #12            ; skip over the literal
+	// .quad addr       ; 8-byte absolute address (embedded now, not patched later)
+	//
+	// stack_space: argument stack area reserved by prepare_call_args,
+	// released here after the call returns.
+
+	EMIT32(ctx, 0x58000071); // LDR X17, #12
+	EMIT32(ctx, 0xD63F0220); // BLR X17
+	EMIT32(ctx, 0x14000003); // B #12 (skip 3 instructions = 12 bytes)
+
+	// Embed the native function address directly
+	// (two 32-bit words, low first - matches AArch64's little-endian layout)
+	uint64_t addr = (uint64_t)nativeFun;
+	EMIT32(ctx, (uint32_t)(addr & 0xFFFFFFFF)); // Low 32 bits
+	EMIT32(ctx, (uint32_t)((addr >> 32) & 0xFFFFFFFF)); // High 32 bits
+
+	// Clean up stack if we allocated space for args
+	if (stack_space > 0) {
+		encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG);
+	}
+}
+
+/*
+ * Emit a call to a function by its index (without spill/prepare - for use when those are already done)
+ * Used by compareFun and other places that set up args manually
+ */
+static void emit_call_findex(jit_ctx *ctx, int findex, int stack_space) {
+	// Emit a call to function `findex` (spill/arg setup must already be done).
+	// Native targets have resolved addresses; JIT targets get a literal-pool
+	// slot recorded in ctx->calls for patching in hl_jit_code.
+	int fid = findex < 0 ? -1 : ctx->m->functions_indexes[findex];
+	bool isNative = fid >= ctx->m->code->nfunctions;
+
+	if (fid < 0) {
+		jit_error("Invalid function index");
+	} else if (isNative) {
+		// Native function - address is already resolved
+		call_native(ctx, ctx->m->functions_ptrs[findex], stack_space);
+	} else {
+		// JIT function - use indirect call via literal pool (patched later)
+		EMIT32(ctx, 0x58000071); // LDR X17, #12
+		EMIT32(ctx, 0xD63F0220); // BLR X17
+
+		// Register literal position for patching
+		jlist *j = (jlist*)hl_malloc(&ctx->galloc, sizeof(jlist));
+		j->pos = BUF_POS() + 4; // Position of the 8-byte literal (after B instruction)
+		j->target = findex;
+		j->next = ctx->calls;
+		ctx->calls = j;
+
+		EMIT32(ctx, 0x14000003); // B #12 (skip 3 instructions = 12 bytes)
+		EMIT32(ctx, 0); // Low 32 bits placeholder
+		EMIT32(ctx, 0); // High 32 bits placeholder
+
+		// Clean up stack if we allocated space for args
+		if (stack_space > 0) {
+			encode_add_sub_imm(ctx, 1, 0, 0, 0, stack_space, SP_REG, SP_REG);
+		}
+	}
+}
+
+/*
+ * Call a HashLink function (native or JIT-compiled).
+ * For OCall0-OCall4, OCallN.
+ *
+ * dst    - destination vreg for the return value (NULL or HVOID = no result)
+ * findex - global function index
+ * args   - argument vregs
+ * nargs  - number of arguments
+ *
+ * The native/JIT dispatch, literal-pool registration and stack cleanup are
+ * delegated to emit_call_findex(), which previously existed as a verbatim
+ * copy of the code that was inlined here — keeping a single emission path
+ * avoids the two copies drifting apart.
+ */
+static void op_call_hl(jit_ctx *ctx, vreg *dst, int findex, vreg **args, int nargs) {
+	// Spill all caller-saved registers BEFORE the call.
+	// This must happen before prepare_call_args to save values that might be clobbered.
+	spill_regs(ctx);
+
+	// Move arguments into ABI registers / stack slots; returns bytes pushed
+	int stack_space = prepare_call_args(ctx, NULL, args, nargs, false);
+
+	// Emit the call (handles invalid findex, native vs JIT target, and
+	// popping stack_space after the call)
+	emit_call_findex(ctx, findex, stack_space);
+
+	// Store return value if needed (integer/pointer results in X0, floats in V0)
+	if (dst && dst->t->kind != HVOID) {
+		// Always store to stack first (source of truth for later loads)
+		if (IS_FLOAT(dst)) {
+			str_stack_fp(ctx, V0, dst->stackPos, dst->size);
+		} else {
+			str_stack(ctx, X0, dst->stackPos, dst->size);
+		}
+
+		// Also keep in a register if allocated to a different one
+		preg *p = alloc_dst(ctx, dst);
+		if (IS_FLOAT(dst)) {
+			if (p->kind == RFPU && (Arm64FpReg)p->id != V0) {
+				fmov_reg_reg(ctx, (Arm64FpReg)p->id, V0, dst->size);
+			}
+		} else {
+			if (p->kind == RCPU && (Arm64Reg)p->id != X0) {
+				mov_reg_reg(ctx, (Arm64Reg)p->id, X0, dst->size);
+			}
+		}
+	}
+}
+
+// ============================================================================
+// C↔HL Trampolines
+// ============================================================================
+
+// Entry points of the generated trampolines. Filled in after the code buffer
+// is finalized — presumably from ctx->c2hl / ctx->hl2c in hl_jit_code; the
+// assignment is not visible in this part of the file (TODO confirm).
+static void *call_jit_c2hl = NULL;
+static void *call_jit_hl2c = NULL;
+
+// Maximum args for dynamic calls (bounds the fixed-size arg buffers below)
+#define MAX_ARGS 64
+
+/**
+ * Wrapper function for HL->C calls - unpacks arguments and calls the wrapped function.
+ * Called from jit_hl2c trampoline.
+ *
+ * c          - the closure wrapper holding the wrapped function and its type
+ * stack_args - pointer to the caller's stack-passed arguments (8 bytes each)
+ * regs       - pointer to the register save area written by jit_hl2c:
+ *              regs[0..CALL_NREGS-1]  = integer arg registers X0..X7
+ *              regs[CALL_NREGS..]     = FP arg registers D0..D7
+ *
+ * Returns the vdynamic result of hl_dyn_call on the wrapped function.
+ */
+static vdynamic *jit_wrapper_call(vclosure_wrapper *c, char *stack_args, void **regs) {
+	vdynamic *args[MAX_ARGS];
+	int i;
+	int nargs = c->cl.t->fun->nargs;
+	int nextCpu = 1; // Skip X0 which holds the closure pointer
+	int nextFpu = 0;
+
+	if (nargs > MAX_ARGS)
+		hl_error("Too many arguments for wrapped call");
+
+	for (i = 0; i < nargs; i++) {
+		hl_type *t = c->cl.t->fun->args[i];
+
+		if (t->kind == HF32 || t->kind == HF64) {
+			// Float argument
+			if (nextFpu < CALL_NREGS) {
+				// In FP register - regs[CALL_NREGS + fpu_index]
+				// NOTE(review): an HF32 arg is read here as f64; this assumes
+				// the caller widened it to double before the call — confirm
+				// against the HL calling convention for single-precision args.
+				args[i] = hl_make_dyn(regs + CALL_NREGS + nextFpu, &hlt_f64);
+				nextFpu++;
+			} else {
+				// On stack (each slot is 8 bytes)
+				args[i] = hl_make_dyn(stack_args, &hlt_f64);
+				stack_args += 8;
+			}
+		} else {
+			// Integer/pointer argument
+			if (nextCpu < CALL_NREGS) {
+				// In CPU register
+				if (hl_is_dynamic(t)) {
+					// Already a vdynamic* - use it directly, no boxing
+					args[i] = *(vdynamic**)(regs + nextCpu);
+				} else {
+					args[i] = hl_make_dyn(regs + nextCpu, t);
+				}
+				nextCpu++;
+			} else {
+				// On stack
+				if (hl_is_dynamic(t)) {
+					args[i] = *(vdynamic**)stack_args;
+				} else {
+					args[i] = hl_make_dyn(stack_args, t);
+				}
+				stack_args += 8;
+			}
+		}
+	}
+	return hl_dyn_call(c->wrappedFun, args, nargs);
+}
+
+/**
+ * Wrapper for HL->C calls whose result comes back in an integer/pointer
+ * register: invokes the wrapped function via jit_wrapper_call and converts
+ * the dynamic result to the raw representation the native caller expects.
+ */
+static void *jit_wrapper_ptr(vclosure_wrapper *c, char *stack_args, void **regs) {
+	vdynamic *result = jit_wrapper_call(c, stack_args, regs);
+	hl_type *rt = c->cl.t->fun->ret;
+	// Dispatch on the declared return kind of the closure's type
+	if (rt->kind == HVOID)
+		return NULL;
+	if (rt->kind == HUI8 || rt->kind == HUI16 || rt->kind == HI32 || rt->kind == HBOOL)
+		return (void*)(int_val)hl_dyn_casti(&result, &hlt_dyn, rt);
+	if (rt->kind == HI64 || rt->kind == HGUID)
+		return (void*)(int_val)hl_dyn_casti64(&result, &hlt_dyn);
+	// Pointers, objects, and any other reference-like return type
+	return hl_dyn_castp(&result, &hlt_dyn, rt);
+}
+
+/**
+ * Wrapper for HL->C calls returning a floating-point value.
+ * Invokes the wrapped function and converts the dynamic result to double
+ * (the native caller picks it up from V0).
+ */
+static double jit_wrapper_d(vclosure_wrapper *c, char *stack_args, void **regs) {
+	vdynamic *result = jit_wrapper_call(c, stack_args, regs);
+	return hl_dyn_castd(&result, &hlt_dyn);
+}
+
+/**
+ * Select which register to use for an argument based on type and position.
+ *
+ * nextCpu / nextFpu - running counts of integer / FP registers used so far
+ * t                 - the argument's HL type
+ *
+ * Returns the register slot index (FP registers are offset by RCPU_COUNT),
+ * or -1 when all registers of the relevant class are taken and the argument
+ * must be passed on the stack.
+ */
+static int select_call_reg_c2hl(int *nextCpu, int *nextFpu, hl_type *t) {
+	int isFloat = t->kind == HF32 || t->kind == HF64;
+	int *used = isFloat ? nextFpu : nextCpu;
+	if (*used >= CALL_NREGS)
+		return -1; // register class exhausted: goes on the stack
+	int slot = (*used)++;
+	return isFloat ? RCPU_COUNT + slot : slot;
+}
+
+/**
+ * Get the natural storage size (in bytes) of a value of type t, as used when
+ * marshalling arguments for a dynamic call. Anything not listed (pointers,
+ * objects, I64, F64, ...) occupies 8 bytes.
+ */
+static int stack_size_c2hl(hl_type *t) {
+	if (t->kind == HUI8 || t->kind == HBOOL)
+		return 1;
+	if (t->kind == HUI16)
+		return 2;
+	if (t->kind == HI32 || t->kind == HF32)
+		return 4;
+	return 8;
+}
+
+/**
+ * Callback function that prepares arguments and calls the JIT trampoline.
+ * Called from C code to invoke JIT-compiled functions.
+ *
+ * _f   - pointer to the function pointer to invoke (dereferenced once)
+ * t    - the function's HL type (argument and return types)
+ * args - array of pointers to the raw argument values
+ * ret  - out-parameter receiving the result for value-typed returns
+ *
+ * Returns a pointer to the result: &ret->v.* for value types, or the raw
+ * pointer result for reference types.
+ */
+static void *callback_c2hl(void *_f, hl_type *t, void **args, vdynamic *ret) {
+	void **f = (void**)_f;
+	// Stack layout:
+	// [0..size) = stack args (pushed in reverse)
+	// [size..size+CALL_NREGS*8) = integer register args (X0-X7)
+	// [size+CALL_NREGS*8..size+CALL_NREGS*16) = FP register args (V0-V7)
+	unsigned char stack[MAX_ARGS * 16];
+	int nextCpu = 0, nextFpu = 0;
+	int mappedRegs[MAX_ARGS]; // per-arg register slot from pass 1, or -1 = stack
+
+	// Zero-initialize the stack to avoid passing garbage to unused registers
+	// The jit_c2hl trampoline loads ALL 8 int + 8 FP registers unconditionally
+	memset(stack, 0, sizeof(stack));
+
+	if (t->fun->nargs > MAX_ARGS)
+		hl_error("Too many arguments for dynamic call");
+
+	// First pass: determine register assignments and stack size
+	int i, size = 0;
+	for (i = 0; i < t->fun->nargs; i++) {
+		hl_type *at = t->fun->args[i];
+		int creg = select_call_reg_c2hl(&nextCpu, &nextFpu, at);
+		mappedRegs[i] = creg;
+		if (creg < 0) {
+			int tsize = stack_size_c2hl(at);
+			if (tsize < 8) tsize = 8; // Align to 8 bytes on stack
+			size += tsize;
+		}
+	}
+
+	// Align stack size to 16 bytes (AArch64 SP alignment requirement)
+	int pad = (-size) & 15;
+	size += pad;
+
+	// Second pass: copy arguments to appropriate locations
+	int pos = 0;
+	for (i = 0; i < t->fun->nargs; i++) {
+		hl_type *at = t->fun->args[i];
+		void *v = args[i];
+		int creg = mappedRegs[i];
+		void *store;
+
+		if (creg >= 0) {
+			if (creg >= RCPU_COUNT) {
+				// FP register - stored after integer registers
+				store = stack + size + CALL_NREGS * 8 + (creg - RCPU_COUNT) * 8;
+			} else {
+				// Integer register
+				store = stack + size + creg * 8;
+			}
+			// Widen small integers to 64 bits; floats widen to double
+			switch (at->kind) {
+			case HBOOL:
+			case HUI8:
+				*(int64*)store = *(unsigned char*)v;
+				break;
+			case HUI16:
+				*(int64*)store = *(unsigned short*)v;
+				break;
+			case HI32:
+				*(int64*)store = *(int*)v;
+				break;
+			case HF32:
+				*(double*)store = *(float*)v;
+				break;
+			case HF64:
+				*(double*)store = *(double*)v;
+				break;
+			case HI64:
+			case HGUID:
+				*(int64*)store = *(int64*)v;
+				break;
+			default:
+				*(void**)store = v;
+				break;
+			}
+		} else {
+			// Stack argument - each occupies one 8-byte slot
+			store = stack + pos;
+			int tsize = 8;
+			switch (at->kind) {
+			case HBOOL:
+			case HUI8:
+				*(int64*)store = *(unsigned char*)v;
+				break;
+			case HUI16:
+				*(int64*)store = *(unsigned short*)v;
+				break;
+			case HI32:
+			case HF32:
+				// HF32 copied bit-exactly via its 32-bit representation
+				*(int64*)store = *(int*)v;
+				break;
+			case HF64:
+				*(double*)store = *(double*)v;
+				break;
+			case HI64:
+			case HGUID:
+				*(int64*)store = *(int64*)v;
+				break;
+			default:
+				*(void**)store = v;
+				break;
+			}
+			pos += tsize;
+		}
+	}
+
+	pos += pad;
+	pos >>= 3; // Convert to 64-bit units; (void**)stack + pos == stack + size
+
+	// Call the trampoline with: function pointer, reg args pointer, stack args start
+	// Cast the trampoline to the proper return type so the right register is read
+	switch (t->fun->ret->kind) {
+	case HUI8:
+	case HUI16:
+	case HI32:
+	case HBOOL:
+		ret->v.i = ((int (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)stack + pos, stack);
+		return &ret->v.i;
+	case HI64:
+	case HGUID:
+		ret->v.i64 = ((int64 (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)stack + pos, stack);
+		return &ret->v.i64;
+	case HF32:
+		ret->v.f = ((float (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)stack + pos, stack);
+		return &ret->v.f;
+	case HF64:
+		ret->v.d = ((double (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)stack + pos, stack);
+		return &ret->v.d;
+	default:
+		return ((void *(*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)stack + pos, stack);
+	}
+}
+
+/**
+ * Generate the HL-to-C trampoline.
+ * Called from C code with a vclosure_wrapper* in X0 and native args in X1-X7, V0-V7.
+ * Saves registers and calls jit_wrapper_ptr or jit_wrapper_d based on return type.
+ *
+ * The register save layout ([SP+0..63] = X0..X7, [SP+64..127] = D0..D7)
+ * must match what jit_wrapper_call expects in its `regs` parameter.
+ */
+static void jit_hl2c(jit_ctx *ctx) {
+	// Only used to compute the byte offset of the `ret` field (offsetof idiom
+	// via a NULL pointer). NOTE(review): formally UB in C — consider
+	// offsetof(hl_type_fun, ret) instead.
+	hl_type_fun *ft = NULL;
+
+	// Function prologue - save frame
+	// STP X29, X30, [SP, #-16]!
+	encode_ldp_stp(ctx, 0x02, 0, 0x03, -2, LR, SP_REG, FP);
+	// MOV X29, SP
+	mov_reg_reg(ctx, FP, SP_REG, true);
+
+	// Allocate space for saved registers: 8 CPU regs + 8 FP regs = 16 * 8 = 128 bytes
+	// SUB SP, SP, #128
+	encode_add_sub_imm(ctx, 1, 1, 0, 0, 128, SP_REG, SP_REG);
+
+	// Trampoline marker: MOV W17, #0xE001 (HL2C trampoline) - debugging aid only
+	EMIT32(ctx, 0x52800011 | (0xE001 << 5));
+
+	// Save integer argument registers X0-X7 at [SP, #0..63]
+	for (int i = 0; i < CALL_NREGS; i++) {
+		encode_ldr_str_imm(ctx, 0x03, 0, 0x00, i, SP_REG, i); // STR Xi, [SP, #i*8]
+	}
+
+	// Save FP argument registers V0-V7 at [SP, #64..127]
+	for (int i = 0; i < CALL_NREGS; i++) {
+		encode_ldr_str_imm(ctx, 0x03, 1, 0x00, 8 + i, SP_REG, i); // STR Di, [SP, #(8+i)*8]
+	}
+
+	// X0 = closure pointer (vclosure_wrapper*)
+	// Check return type: closure->t->fun->ret->kind
+	// X9 = X0->t (hl_type* at offset 0 of vclosure)
+	encode_ldr_str_imm(ctx, 0x03, 0, 0x01, 0, X0, X9);
+	// X9 = X9->fun (hl_type->fun is at offset 8 on 64-bit)
+	encode_ldr_str_imm(ctx, 0x03, 0, 0x01, 1, X9, X9);
+	// X9 = X9->ret (hl_type_fun->ret offset)
+	int ret_offset = (int)(int_val)&ft->ret;
+	// Immediate-offset LDR needs the offset 8-byte scaled and in range;
+	// otherwise fall back to a register-offset load
+	if (ret_offset < 4096 && (ret_offset % 8) == 0) {
+		encode_ldr_str_imm(ctx, 0x03, 0, 0x01, ret_offset / 8, X9, X9);
+	} else {
+		load_immediate(ctx, ret_offset, RTMP, true);
+		encode_ldr_str_reg(ctx, 0x03, 0, 0x01, RTMP, 0x03, 0, X9, X9);
+	}
+	// W9 = X9->kind (hl_type->kind is at offset 0, 32-bit)
+	encode_ldr_str_imm(ctx, 0x02, 0, 0x01, 0, X9, X9);
+
+	// Compare with HF64 and HF32 - float returns go through jit_wrapper_d
+	// CMP W9, #HF64
+	encode_add_sub_imm(ctx, 0, 1, 1, 0, HF64, X9, XZR);
+	int jfloat1 = BUF_POS();
+	encode_branch_cond(ctx, 0, COND_EQ); // B.EQ float_path
+
+	// CMP W9, #HF32
+	encode_add_sub_imm(ctx, 0, 1, 1, 0, HF32, X9, XZR);
+	int jfloat2 = BUF_POS();
+	encode_branch_cond(ctx, 0, COND_EQ); // B.EQ float_path
+
+	// Integer/pointer path: call jit_wrapper_ptr(closure, stack_args, regs)
+	// X0 = closure (reload from saved regs; original X0 was clobbered above)
+	encode_ldr_str_imm(ctx, 0x03, 0, 0x01, 0, SP_REG, X0);
+	// X1 = stack_args (FP + 16 skips the saved FP/LR pair, landing on the
+	// caller's stack-passed arguments)
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 16, FP, X1);
+	// X2 = regs pointer (SP = base of the register save area)
+	mov_reg_reg(ctx, X2, SP_REG, true);
+
+	load_immediate(ctx, (int64_t)jit_wrapper_ptr, RTMP, true);
+	EMIT32(ctx, 0xD63F0000 | (RTMP << 5)); // BLR RTMP
+
+	// Result in X0, jump to exit
+	int jexit = BUF_POS();
+	encode_branch_uncond(ctx, 0); // B exit
+
+	// Float path
+	int float_pos = BUF_POS();
+	patch_jump(ctx, jfloat1, float_pos);
+	patch_jump(ctx, jfloat2, float_pos);
+
+	// X0 = closure (reload)
+	encode_ldr_str_imm(ctx, 0x03, 0, 0x01, 0, SP_REG, X0);
+	// X1 = stack_args
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 16, FP, X1);
+	// X2 = regs pointer
+	mov_reg_reg(ctx, X2, SP_REG, true);
+
+	load_immediate(ctx, (int64_t)jit_wrapper_d, RTMP, true);
+	EMIT32(ctx, 0xD63F0000 | (RTMP << 5)); // BLR RTMP
+	// Result in V0
+
+	// Exit path
+	int exit_pos = BUF_POS();
+	patch_jump(ctx, jexit, exit_pos);
+
+	// Restore frame and return
+	// MOV SP, X29
+	mov_reg_reg(ctx, SP_REG, FP, true);
+	// LDP X29, X30, [SP], #16
+	encode_ldp_stp(ctx, 0x02, 0, 0x01, 2, LR, SP_REG, FP);
+	// RET
+	encode_branch_reg(ctx, 0x02, LR);
+}
+
+/**
+ * Generate the C-to-HL trampoline.
+ * Input: X0 = function pointer,
+ *        X1 = reg args pointer (which is also the END of the stack-args
+ *             region in the caller's buffer),
+ *        X2 = START of the stack-args region
+ * (see callback_c2hl, which passes (void**)stack + pos and stack).
+ * The trampoline loads arguments from the prepared buffer and calls the function.
+ */
+static void jit_c2hl(jit_ctx *ctx) {
+	// Save callee-saved registers and set up frame
+	// STP X29, X30, [SP, #-16]!
+	encode_ldp_stp(ctx, 0x02, 0, 0x03, -2, LR, SP_REG, FP);
+	// MOV X29, SP
+	mov_reg_reg(ctx, FP, SP_REG, true);
+
+	// Trampoline marker: MOV W17, #0xE002 (C2HL trampoline) - debugging aid only
+	EMIT32(ctx, 0x52800011 | (0xE002 << 5));
+
+	// Save function pointer to X9 (caller-saved, will survive loads)
+	// MOV X9, X0
+	mov_reg_reg(ctx, X9, X0, true);
+
+	// Save stack args pointers to X10, X11
+	// MOV X10, X1 (reg args pointer)
+	// MOV X11, X2 (stack args start)
+	mov_reg_reg(ctx, X10, X1, true);
+	mov_reg_reg(ctx, X11, X2, true);
+
+	// Load integer register arguments X0-X7 from [X10]
+	for (int i = 0; i < CALL_NREGS; i++) {
+		// LDR Xi, [X10, #i*8]
+		encode_ldr_str_imm(ctx, 0x03, 0, 0x01, i, X10, CALL_REGS[i]);
+	}
+
+	// Load FP register arguments V0-V7 from [X10 + CALL_NREGS*8]
+	for (int i = 0; i < CALL_NREGS; i++) {
+		// LDR Di, [X10, #(CALL_NREGS + i)*8]
+		// Using 64-bit FP load: size=11, opc=01
+		EMIT32(ctx,0xFD400000 | (((CALL_NREGS + i) & 0x1FF) << 10) | (X10 << 5) | FP_CALL_REGS[i]);
+	}
+
+	// Push stack args from buffer to actual stack
+	// X10 = end of stack args in buffer (= start of reg args area)
+	// X11 = start of stack args in buffer
+	// Stack args size = X10 - X11 (already 16-byte aligned by callback_c2hl)
+
+	// Calculate stack args size: X12 = X10 - X11
+	// SUB X12, X10, X11
+	encode_add_sub_reg(ctx, 1, 1, 0, 0, X11, 0, X10, X12);
+
+	// Skip if no stack args: CBZ X12, done (offset patched below)
+	int cbz_pos = BUF_POS();
+	EMIT32(ctx, 0xB4000000 | (X12 & 0x1F)); // CBZ X12,
+
+	// Allocate stack space: SUB SP, SP, X12
+	encode_add_sub_reg(ctx, 1, 1, 0, 0, X12, 0, SP_REG, SP_REG);
+
+	// Copy args contiguously at 8-byte intervals
+	// X13 = destination pointer (SP), X14 = source pointer (X11)
+	mov_reg_reg(ctx, X13, SP_REG, true); // MOV X13, SP
+	mov_reg_reg(ctx, X14, X11, true); // MOV X14, X11 (source = buffer start)
+
+	// Loop: copy 8 bytes at a time until X14 reaches X10
+	int loop_start = BUF_POS();
+	// CMP X14, X10
+	encode_add_sub_reg(ctx, 1, 1, 1, 0, X10, 0, X14, XZR);
+	// B.GE done (if X14 >= X10, we've copied all args)
+	int bge_pos = BUF_POS();
+	EMIT32(ctx, 0x54000000 | (COND_GE & 0xF)); // B.GE (will patch)
+
+	// LDR X15, [X14], #8 (load and post-increment source)
+	EMIT32(ctx, 0xF8408000 | (8 << 12) | (X14 << 5) | X15); // LDR X15, [X14], #8
+
+	// STR X15, [X13], #8 (store and post-increment destination)
+	EMIT32(ctx, 0xF8008000 | (8 << 12) | (X13 << 5) | X15); // STR X15, [X13], #8
+
+	// B loop_start (backwards branch, offset in instructions)
+	int b_offset = (loop_start - BUF_POS()) / 4;
+	EMIT32(ctx, 0x14000000 | (b_offset & 0x3FFFFFF));
+
+	// Patch the CBZ and B.GE to jump here.
+	// Patching is done by rewinding buf.w to the instruction's position,
+	// re-emitting it with the now-known offset, then restoring buf.w.
+	int done_pos = BUF_POS();
+	int cbz_offset = (done_pos - cbz_pos) / 4;
+	ctx->buf.w = (unsigned int*)(ctx->startBuf + cbz_pos);
+	EMIT32(ctx, 0xB4000000 | ((cbz_offset & 0x7FFFF) << 5) | (X12 & 0x1F));
+	int bge_offset = (done_pos - bge_pos) / 4;
+	ctx->buf.w = (unsigned int*)(ctx->startBuf + bge_pos);
+	EMIT32(ctx, 0x54000000 | ((bge_offset & 0x7FFFF) << 5) | (COND_GE & 0xF));
+	ctx->buf.w = (unsigned int*)(ctx->startBuf + done_pos);
+
+	// Call the function: BLR X9
+	EMIT32(ctx,0xD63F0000 | (X9 << 5));
+
+	// Restore frame and return (also releases the SUB SP, SP, X12 space)
+	// MOV SP, X29
+	mov_reg_reg(ctx, SP_REG, FP, true);
+	// LDP X29, X30, [SP], #16
+	encode_ldp_stp(ctx, 0x02, 0, 0x01, 2, LR, SP_REG, FP);
+	// RET
+	encode_branch_reg(ctx, 0x02, LR);
+}
+
+/**
+ * Get wrapper function for HL-to-C calls.
+ * This is used for callbacks from C code back into HashLink.
+ * Returns the jit_hl2c trampoline address.
+ *
+ * The type parameter is currently unused: a single shared trampoline serves
+ * all signatures, dispatching on the closure's return type at run time
+ * (see jit_hl2c).
+ */
+static void *get_wrapper(hl_type *t) {
+	return call_jit_hl2c;
+}
+
+// ============================================================================
+// JIT API Implementation
+// ============================================================================
+
+// Forward declaration
+static void hl_jit_init_module(jit_ctx *ctx, hl_module *m);
+
+/**
+ * Allocate a fresh, zero-initialized JIT context.
+ * Returns NULL on allocation failure.
+ */
+jit_ctx *hl_jit_alloc() {
+	// calloc combines the allocation and the zero-fill in one call
+	return (jit_ctx*)calloc(1, sizeof(jit_ctx));
+}
+
+/**
+ * Release the resources held by a JIT context.
+ * If can_reset is true, the context structure itself is kept (cleared) so it
+ * can be reused via hl_jit_reset; otherwise the context is freed too.
+ *
+ * NOTE(review): ctx->debug is not freed here — presumably its ownership is
+ * transferred to the module on successful compilation; confirm the failure
+ * path does not leak it.
+ * NOTE(review): startBuf is released with free(), which assumes the code
+ * buffer is malloc-allocated (not mmap'ed) — confirm against jit_buf().
+ */
+void hl_jit_free(jit_ctx *ctx, h_bool can_reset) {
+	if (ctx == NULL)
+		return;
+
+	free(ctx->vregs);
+	free(ctx->opsPos);
+	free(ctx->startBuf);
+	ctx->maxRegs = 0;
+	ctx->vregs = NULL;
+	ctx->maxOps = 0;
+	ctx->opsPos = NULL;
+	ctx->startBuf = NULL;
+	ctx->bufSize = 0;
+	ctx->buf.b = NULL;
+	// These lists were allocated from the arenas freed below, so clearing
+	// the head pointers is sufficient
+	ctx->calls = NULL;
+	ctx->switchs = NULL;
+	ctx->closure_list = NULL;
+	hl_free(&ctx->falloc);
+	hl_free(&ctx->galloc);
+	if (!can_reset) free(ctx);
+}
+
+/**
+ * Re-initialize a previously freed-with-can_reset context for module m.
+ * NOTE(review): ctx->debug is cleared without being freed — presumably it was
+ * handed off to the module earlier; confirm to rule out a leak on reset.
+ */
+void hl_jit_reset(jit_ctx *ctx, hl_module *m) {
+	ctx->debug = NULL;
+	hl_jit_init_module(ctx, m);
+}
+
+/**
+ * Build a JIT helper function, ensuring buffer is allocated.
+ * Returns the position in the buffer where the function starts.
+ *
+ * fbuild - generator that appends the helper's code to ctx->buf
+ */
+static int jit_build(jit_ctx *ctx, void (*fbuild)(jit_ctx *)) {
+	jit_buf(ctx); // make sure the output buffer exists before emitting
+	int start = BUF_POS();
+	fbuild(ctx);
+	return start;
+}
+
+/**
+ * Initialize module-specific data in JIT context.
+ * Binds the context to module m, allocates the per-function debug-info array
+ * (when the bytecode carries debug info), and emits the module's float
+ * constants at the start of the code buffer.
+ */
+static void hl_jit_init_module(jit_ctx *ctx, hl_module *m) {
+	int i;
+	ctx->m = m;
+	ctx->closure_list = NULL;
+
+	// Allocate debug info array if bytecode has debug info
+	// Note: on malloc failure ctx->debug stays NULL and debug info is
+	// silently disabled rather than treated as an error
+	if (m->code->hasdebug && m->code->nfunctions > 0) {
+		ctx->debug = (hl_debug_infos*)malloc(sizeof(hl_debug_infos) * m->code->nfunctions);
+		if (ctx->debug)
+			memset(ctx->debug, 0, sizeof(hl_debug_infos) * m->code->nfunctions);
+	}
+
+	// Store float constants in the code buffer (like x86 does)
+	// OFloat later loads them PC-relative: constant i lives at offset i*8
+	for (i = 0; i < m->code->nfloats; i++) {
+		jit_buf(ctx);
+		*ctx->buf.d++ = m->code->floats[i];
+	}
+}
+
+/**
+ * Full JIT initialization for module m: binds the module to the context and
+ * generates the two C↔HL trampolines, recording their buffer offsets in
+ * ctx->c2hl / ctx->hl2c.
+ */
+void hl_jit_init(jit_ctx *ctx, hl_module *m) {
+	hl_jit_init_module(ctx, m);
+
+	// Generate C↔HL trampolines
+	ctx->c2hl = jit_build(ctx, jit_c2hl);
+	ctx->hl2c = jit_build(ctx, jit_hl2c);
+}
+
+/**
+ * Allocate a static closure object for function index fid.
+ * For native functions, the function pointer is set immediately.
+ * For JIT functions, the function pointer is stored temporarily as the findex
+ * and the closure is added to closure_list for later patching (presumably
+ * once final code addresses are known in hl_jit_code — not visible here).
+ */
+static vclosure *alloc_static_closure(jit_ctx *ctx, int fid) {
+	hl_module *m = ctx->m;
+	vclosure *c = hl_malloc(&m->ctx.alloc, sizeof(vclosure));
+	int fidx = m->functions_indexes[fid];
+	c->hasValue = 0;
+	if (fidx >= m->code->nfunctions) {
+		// Native function - pointer is already resolved
+		c->t = m->code->natives[fidx - m->code->nfunctions].t;
+		c->fun = m->functions_ptrs[fid];
+		c->value = NULL;
+	} else {
+		// JIT function - store fid temporarily, add to closure_list for patching
+		// (value doubles as the intrusive next-pointer of closure_list)
+		c->t = m->code->functions[fidx].type;
+		c->fun = (void*)(int_val)fid;
+		c->value = ctx->closure_list;
+		ctx->closure_list = c;
+	}
+	return c;
+}
+
+int hl_jit_function(jit_ctx *ctx, hl_module *m, hl_function *f) {
+ int i, size = 0, opCount;
+ int codePos = BUF_POS();
+ int nargs = f->type->fun->nargs;
+ unsigned short *debug16 = NULL;
+ int *debug32 = NULL;
+
+ ctx->f = f;
+ ctx->m = m;
+ ctx->allocOffset = 0;
+
+ // Allocate virtual register array if needed
+ // Always reallocate to ensure clean state (no stale data from previous functions)
+ free(ctx->vregs);
+ ctx->vregs = (vreg*)calloc(f->nregs + 1, sizeof(vreg));
+ if (ctx->vregs == NULL) {
+ ctx->maxRegs = 0;
+ return -1;
+ }
+ ctx->maxRegs = f->nregs;
+
+ // Allocate opcode position array if needed
+ if (f->nops > ctx->maxOps) {
+ free(ctx->opsPos);
+ ctx->opsPos = (int*)malloc(sizeof(int) * (f->nops + 1));
+ if (ctx->opsPos == NULL) {
+ ctx->maxOps = 0;
+ return -1;
+ }
+ ctx->maxOps = f->nops;
+ }
+
+ memset(ctx->opsPos, 0, (f->nops + 1) * sizeof(int));
+
+ // Clear/initialize physical registers
+ for (i = 0; i < RCPU_COUNT; i++) {
+ preg *p = &ctx->pregs[i];
+ p->kind = RCPU;
+ p->id = i;
+ p->holds = NULL;
+ p->lock = 0;
+ }
+ for (i = 0; i < RFPU_COUNT; i++) {
+ preg *p = &ctx->pregs[RCPU_COUNT + i];
+ p->kind = RFPU;
+ p->id = i;
+ p->holds = NULL;
+ p->lock = 0;
+ }
+
+ // Initialize virtual registers
+ for (i = 0; i < f->nregs; i++) {
+ vreg *r = R(i);
+ r->t = f->regs[i];
+ r->size = hl_type_size(r->t);
+ r->stackPos = 0;
+ r->current = NULL;
+ r->stack.holds = NULL;
+ r->stack.id = i;
+ r->stack.kind = RSTACK;
+ r->stack.lock = 0;
+ }
+
+ // Calculate stack layout
+ // Arguments: first 8 integer args in X0-X7, first 8 FP args in V0-V7
+ // Additional args on stack
+ size = 0;
+ int argsSize = 0;
+ int int_arg_count = 0;
+ int fp_arg_count = 0;
+
+ for (i = 0; i < nargs; i++) {
+ vreg *r = R(i);
+ bool is_fp = IS_FLOAT(r);
+ int *arg_count = is_fp ? &fp_arg_count : &int_arg_count;
+
+ if (*arg_count < CALL_NREGS) {
+ // Argument is in register - allocate stack space for it
+ size += r->size;
+ size += hl_pad_size(size, r->t);
+ r->stackPos = -size;
+ (*arg_count)++;
+ } else {
+ // Argument is on stack (caller's frame)
+ // Offset by CALLEE_SAVED_FRAME_SIZE to skip saved registers
+ // Each stack arg occupies 8 bytes (matching caller's prepare_call_args)
+ r->stackPos = argsSize + CALLEE_SAVED_FRAME_SIZE;
+ argsSize += 8;
+ }
+ }
+
+ // Local variables
+ for (i = nargs; i < f->nregs; i++) {
+ vreg *r = R(i);
+ size += r->size;
+ size += hl_pad_size(size, r->t);
+ r->stackPos = -size;
+ }
+
+ // Align stack to 16 bytes
+ size += (-size) & 15;
+ ctx->totalRegsSize = size;
+
+ jit_buf(ctx);
+ ctx->functionPos = BUF_POS();
+ ctx->currentPos = 1;
+
+ // Initialize callee-saved register tracking for backpatching
+ ctx->callee_saved_used = 0;
+ ctx->fpu_callee_saved_used = 0;
+ memset(ctx->stp_positions, 0, sizeof(ctx->stp_positions));
+ memset(ctx->ldp_positions, 0, sizeof(ctx->ldp_positions));
+ memset(ctx->stp_fpu_positions, 0, sizeof(ctx->stp_fpu_positions));
+ memset(ctx->ldp_fpu_positions, 0, sizeof(ctx->ldp_fpu_positions));
+
+ // Function prologue - callee-saved saves are backpatched to NOP if unused
+ // Reserve space for callee-saved registers + FP/LR
+ encode_add_sub_imm(ctx, 1, 1, 0, 0, CALLEE_SAVED_FRAME_SIZE, SP_REG, SP_REG); // SUB SP, SP, #CALLEE_SAVED_FRAME_SIZE
+
+ // Save callee-saved at fixed offsets (NOPpable) - positions recorded for backpatching
+ ctx->stp_positions[0] = BUF_POS();
+ stp_offset(ctx, X27, X28, SP_REG, 80); // STP X27, X28, [SP, #80]
+
+ ctx->stp_positions[1] = BUF_POS();
+ stp_offset(ctx, X25, X26, SP_REG, 64); // STP X25, X26, [SP, #64]
+
+ ctx->stp_positions[2] = BUF_POS();
+ stp_offset(ctx, X23, X24, SP_REG, 48); // STP X23, X24, [SP, #48]
+
+ ctx->stp_positions[3] = BUF_POS();
+ stp_offset(ctx, X21, X22, SP_REG, 32); // STP X21, X22, [SP, #32]
+
+ ctx->stp_positions[4] = BUF_POS();
+ stp_offset(ctx, X19, X20, SP_REG, 16); // STP X19, X20, [SP, #16]
+
+ // Save FPU callee-saved at fixed offsets (NOPpable)
+ ctx->stp_fpu_positions[0] = BUF_POS();
+ stp_offset_fp(ctx, V8, V9, SP_REG, 96); // STP D8, D9, [SP, #96]
+
+ ctx->stp_fpu_positions[1] = BUF_POS();
+ stp_offset_fp(ctx, V10, V11, SP_REG, 112); // STP D10, D11, [SP, #112]
+
+ ctx->stp_fpu_positions[2] = BUF_POS();
+ stp_offset_fp(ctx, V12, V13, SP_REG, 128); // STP D12, D13, [SP, #128]
+
+ ctx->stp_fpu_positions[3] = BUF_POS();
+ stp_offset_fp(ctx, V14, V15, SP_REG, 144); // STP D14, D15, [SP, #144]
+
+ // Save FP/LR at bottom (NOT NOPpable - always needed)
+ stp_offset(ctx, FP, LR, SP_REG, 0); // STP X29, X30, [SP, #0]
+
+ // MOV X29, SP ; Set frame pointer (points to saved FP/LR)
+ mov_reg_reg(ctx, FP, SP_REG, true);
+
+ // SUB SP, SP, #size ; Allocate stack space
+ if (size > 0) {
+ if (size < 4096) {
+ encode_add_sub_imm(ctx, 1, 1, 0, 0, size, SP_REG, SP_REG);
+ } else {
+ // Large stack frame - use multiple instructions
+ // Must use extended register form (UXTX) for SP, not shifted register
+ load_immediate(ctx, size, RTMP, true);
+ encode_add_sub_ext(ctx, 1, 1, 0, RTMP, 3, 0, SP_REG, SP_REG); // SUB SP, SP, RTMP, UXTX
+ }
+ }
+
+ // Function marker: MOV W17, #(0xF000 | (findex & 0xFFF)) ; MOVK W17, #(findex >> 12), LSL #16
+ // This encodes as 0xFnnnnnnn where nnnnnnn is the function index
+ // Distinguishes from opcode markers which are smaller numbers
+ {
+ int findex = f->findex;
+ int low12 = 0xF000 | (findex & 0xFFF);
+ int high = (findex >> 12) & 0xFFFF;
+ // MOV W17, #low12
+ EMIT32(ctx, 0x52800011 | (low12 << 5));
+ if (high != 0) {
+ // MOVK W17, #high, LSL #16
+ EMIT32(ctx, 0x72A00011 | (high << 5));
+ }
+ }
+
+ // Store register arguments to their stack locations FIRST
+ // (before we clobber the argument registers with zero-init)
+ int_arg_count = 0;
+ fp_arg_count = 0;
+ for (i = 0; i < nargs && i < f->nregs; i++) {
+ vreg *r = R(i);
+ bool is_fp = IS_FLOAT(r);
+ int *arg_count = is_fp ? &fp_arg_count : &int_arg_count;
+
+ if (*arg_count < CALL_NREGS) {
+ // This arg was in a register - store it to stack
+ // Skip void arguments (size 0) - they don't need storage
+ // but still consume a call register slot
+ if (r->size > 0) {
+ if (is_fp) {
+ str_stack_fp(ctx, FP_CALL_REGS[fp_arg_count], r->stackPos, r->size);
+ } else {
+ str_stack(ctx, CALL_REGS[int_arg_count], r->stackPos, r->size);
+ }
+ }
+ (*arg_count)++;
+ }
+ }
+
+ // Zero-initialize local variables on stack (not arguments)
+ // This ensures reading unassigned locals returns null/0
+ if (f->nregs > nargs) {
+ // Store zeros to each local variable slot using XZR
+ for (i = nargs; i < f->nregs; i++) {
+ vreg *r = R(i);
+ if (r->size > 0 && r->stackPos < 0) {
+ // Use str_stack with XZR as source
+ if (r->size == 8) {
+ load_immediate(ctx, r->stackPos, RTMP, true);
+ encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP);
+ encode_ldr_str_imm(ctx, 0x03, 0, 0x00, 0, RTMP, XZR);
+ } else if (r->size == 4) {
+ load_immediate(ctx, r->stackPos, RTMP, true);
+ encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP);
+ encode_ldr_str_imm(ctx, 0x02, 0, 0x00, 0, RTMP, XZR);
+ } else if (r->size == 2) {
+ load_immediate(ctx, r->stackPos, RTMP, true);
+ encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP);
+ encode_ldr_str_imm(ctx, 0x01, 0, 0x00, 0, RTMP, XZR);
+ } else if (r->size == 1) {
+ load_immediate(ctx, r->stackPos, RTMP, true);
+ encode_add_sub_reg(ctx, 1, 0, 0, 0, RTMP, 0, RFP, RTMP);
+ encode_ldr_str_imm(ctx, 0x00, 0, 0x00, 0, RTMP, XZR);
+ }
+ }
+ }
+ }
+
+ ctx->opsPos[0] = BUF_POS();
+
+ // Initialize debug offset tracking
+ if (ctx->m->code->hasdebug) {
+ debug16 = (unsigned short*)malloc(sizeof(unsigned short) * (f->nops + 1));
+ debug16[0] = (unsigned short)(BUF_POS() - codePos);
+ }
+
+ // Main opcode translation loop
+ for (opCount = 0; opCount < f->nops; opCount++) {
+ hl_opcode *o = f->ops + opCount;
+ vreg *dst = R(o->p1);
+ vreg *ra = R(o->p2);
+ vreg *rb = R(o->p3);
+
+ ctx->currentPos = opCount + 1;
+ jit_buf(ctx);
+
+ // Emit opcode marker for debugging: MOV W17, #(opcode | (opCount << 8))
+ // W17 is IP1, a scratch register. This encodes both the opcode type and index.
+ {
+ int marker = (o->op & 0xFF) | ((opCount & 0xFF) << 8);
+ EMIT32(ctx, 0x52800011 | ((marker & 0xFFFF) << 5)); // MOV W17, #marker
+ }
+
+ // Before a label (merge point), spill dirty registers for fallthrough path.
+ // After spilling, update the label position so jumps bypass the spill code.
+ // discard_regs() in op_label just clears bindings (no code).
+ if (o->op == OLabel) {
+ spill_regs(ctx);
+ // Update label position AFTER spill - jumps should target here,
+ // not before the spill (which is only for fallthrough path)
+ ctx->opsPos[opCount] = BUF_POS();
+ }
+
+ // Emit code based on opcode
+ switch (o->op) {
+ case OMov:
+ case OUnsafeCast:
+ op_mov(ctx, dst, ra);
+ break;
+
+ case OInt:
+ store_const(ctx, dst, m->code->ints[o->p2]);
+ break;
+
+ case OBool:
+ store_const(ctx, dst, o->p2);
+ break;
+
+ case ONull:
+ // Set register to NULL (0)
+ store_const(ctx, dst, 0);
+ break;
+
+ case OFloat: {
+ // Load float constant from module
+ // Float constants are stored at the start of the code buffer (offset o->p2 * 8)
+ double float_val = m->code->floats[o->p2];
+ preg *dst_reg = alloc_fpu(ctx);
+
+ if (float_val == 0.0) {
+ // Zero out FP register: FMOV Dd, XZR
+ // FMOV Dd, XZR: sf=1, S=0, type=01, rmode=00, opcode=000111, Rn=31, Rd
+ EMIT32(ctx, (1 << 31) | (0 << 29) | (0x1E << 24) | (1 << 22) | (1 << 21) | (7 << 16) | (31 << 5) | dst_reg->id);
+ } else {
+ // Float constants are at the start of the code buffer
+ // Calculate PC-relative offset from current position to the float data
+ int float_offset = o->p2 * 8; // Offset from start of code buffer
+ int cur_pos = BUF_POS(); // Current position in code buffer
+ int pc_offset = float_offset - cur_pos; // PC-relative offset
+
+ // LDR Dt,