From 8b46e3949da641b4f630650c13890acf1f5dc927 Mon Sep 17 00:00:00 2001 From: Brian Campbell Date: Sat, 21 Sep 2024 20:27:58 +0100 Subject: [PATCH 1/5] AArch64 port using fixed stacks Architecture specific parts are either generated by the python code, in a per-architecture assembly file, or selected by preprocessor macros in C. Note that the red zone test is now disabled on everything except segmented stacks. --- Makefile | 4 +- asm/dwarf.S | 2 + asm/seff-aarch64.S | 257 ++++++++++++++++++++++++++++++++++ asm/{seff.S => seff-x86-64.S} | 0 decls/seff_types.py | 39 +++++- src/seff.c | 16 +++ src/seff.h | 9 ++ tests/stack_aligned.c | 4 + tests/stack_red_zone.c | 6 + 9 files changed, 328 insertions(+), 9 deletions(-) create mode 100644 asm/seff-aarch64.S rename asm/{seff.S => seff-x86-64.S} (100%) diff --git a/Makefile b/Makefile index 1470da1..a0966d6 100644 --- a/Makefile +++ b/Makefile @@ -64,8 +64,8 @@ asm/seff_types.S: decls/seff_types.py decls/generate.py output/seff.o: src/seff.c src/seff.h src/seff_types.h | output $(CC) $(CFLAGS) -I./src -o output/seff.o -c src/seff.c -output/seff_asm.o: asm/seff.S asm/seff_types.S | output - $(CC) $(CFLAGS) -I./asm -o output/seff_asm.o -c asm/seff.S +output/seff_asm.o: asm/seff-${ARCH}.S asm/seff_types.S | output + $(CC) $(CFLAGS) -I./asm -o output/seff_asm.o -c asm/seff-${ARCH}.S output/seff_mem.o: src/mem/seff_mem_${STACK_POLICY}.c src/mem/seff_mem.h src/seff_types.h | output $(CC) $(CFLAGS) -I./src -o output/seff_mem.o -c src/mem/seff_mem_${STACK_POLICY}.c diff --git a/asm/dwarf.S b/asm/dwarf.S index bfe4fed..71e2400 100644 --- a/asm/dwarf.S +++ b/asm/dwarf.S @@ -47,3 +47,5 @@ #define DW_REG_r14 14 #define DW_REG_r15 15 #define DW_REG_rip 16 + +#define DW_REG_aarch64_r26 26 diff --git a/asm/seff-aarch64.S b/asm/seff-aarch64.S new file mode 100644 index 0000000..c36b34b --- /dev/null +++ b/asm/seff-aarch64.S @@ -0,0 +1,257 @@ +# Copyright (c) 2023 Huawei Technologies Co., Ltd. 
+# Copyright (c) 2024 Brian Campbell, University of Edinburgh +# +# libseff is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# http://license.coscl.org.cn/MulanPSL2 +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR +# FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# +# Architecture-specific code for context-switching +# This file is for the System V (Linux/OSX) 64 bit architecture +# + +#include "dwarf.S" +#include "seff_types.S" + +#ifdef STACK_POLICY_SEGMENTED +#error Segmented stacks not currently supported on AArch64 +#endif + +.global seff_exit +.global seff_yield +.global seff_resume +.global coroutine_prelude + +.type seff_exit,%function +.type seff_yield,%function +.type seff_resume,%function +.type coroutine_prelude,%function + +.global _seff_current_coroutine +.global _seff_system_stack +.global _seff_paused_coroutine_stack + +.section .tbss + +_seff_current_coroutine: + .zero 8 +_seff_paused_coroutine_stack: + .zero 8 +_seff_system_stack: + .zero 8 + +.macro swap_registers + # Saved registers + # TODO: ldp/stp? 
+ ldr x9, [x0, seff_coroutine_t__resume_point + seff_cont_t__r19] + ldr x10, [x0, seff_coroutine_t__resume_point + seff_cont_t__r20] + ldr x11, [x0, seff_coroutine_t__resume_point + seff_cont_t__r21] + ldr x12, [x0, seff_coroutine_t__resume_point + seff_cont_t__r22] + ldr x13, [x0, seff_coroutine_t__resume_point + seff_cont_t__r23] + ldr x14, [x0, seff_coroutine_t__resume_point + seff_cont_t__r24] + ldr x15, [x0, seff_coroutine_t__resume_point + seff_cont_t__r25] + ldr x16, [x0, seff_coroutine_t__resume_point + seff_cont_t__r26] + ldr x17, [x0, seff_coroutine_t__resume_point + seff_cont_t__r27] + ldr x8, [x0, seff_coroutine_t__resume_point + seff_cont_t__r28] + + str x19, [x0, seff_coroutine_t__resume_point + seff_cont_t__r19] + str x20, [x0, seff_coroutine_t__resume_point + seff_cont_t__r20] + str x21, [x0, seff_coroutine_t__resume_point + seff_cont_t__r21] + str x22, [x0, seff_coroutine_t__resume_point + seff_cont_t__r22] + str x23, [x0, seff_coroutine_t__resume_point + seff_cont_t__r23] + str x24, [x0, seff_coroutine_t__resume_point + seff_cont_t__r24] + str x25, [x0, seff_coroutine_t__resume_point + seff_cont_t__r25] + str x26, [x0, seff_coroutine_t__resume_point + seff_cont_t__r26] + str x27, [x0, seff_coroutine_t__resume_point + seff_cont_t__r27] + str x28, [x0, seff_coroutine_t__resume_point + seff_cont_t__r28] + + mov x19, x9 + mov x20, x10 + mov x21, x11 + mov x22, x12 + mov x23, x13 + mov x24, x14 + mov x25, x15 + mov x26, x16 + mov x27, x17 + mov x28, x8 +.endm + +.macro swap_stack + ldr x10, [x0, seff_coroutine_t__resume_point + seff_cont_t__rbp] + ldr x11, [x0, seff_coroutine_t__resume_point + seff_cont_t__rsp] + + str x29, [x0, seff_coroutine_t__resume_point + seff_cont_t__rbp] + mov x29, sp + str x29, [x0, seff_coroutine_t__resume_point + seff_cont_t__rsp] + + mov x29, x10 + mov sp, x11 +.endm + +.text + +# void* seff_yield(seff_coroutine_t* self, void* arg) +seff_yield: + .cfi_startproc + # self in x0 + # arg1 in x1 + # arg2 in x2 + + swap_registers 
+ + # We get to pummel x3-x15 so we use those for fast swaps + + mrs x9, TPIDR_EL0 + + add x9, x9, :tprel_hi12:_seff_current_coroutine + add x9, x9, :tprel_lo12_nc:_seff_current_coroutine + # Keep x9 for later + ldr x4, [x9] + + ldr x6, [x0, seff_coroutine_t__parent_coroutine] + + # _seff_current_coroutine + str x6, [x9] + + str x4, [x0, seff_coroutine_t__resume_point + seff_cont_t__current_coroutine] + + # Save return pointer + mov x3, x30 + ldr x30, [x0, seff_coroutine_t__resume_point + seff_cont_t__ip] + str x3, [x0, seff_coroutine_t__resume_point + seff_cont_t__ip] + + swap_stack + + mov x0, x1 + mov x1, x2 + ret + .cfi_endproc +.size seff_yield, . - seff_yield + +# void* seff_resume(seff_coroutine_t* k, void* arg, effect_set handled) +seff_resume: + .cfi_startproc + # k in x0 + # arg in x1 + # handled in x2 + str x2, [x0, seff_coroutine_t__handled_effects] + + swap_registers + + # We get to pummel x3-x15 so we use those for fast swaps + + mrs x9, TPIDR_EL0 + + add x10, x9, :tprel_hi12:_seff_current_coroutine + add x10, x10, :tprel_lo12_nc:_seff_current_coroutine + ldr x4, [x10] + + ldr x6, [x0, seff_coroutine_t__resume_point + seff_cont_t__current_coroutine] + + str x6, [x10] /* _seff_current_coroutine */ + + str x4, [x0, seff_coroutine_t__parent_coroutine] + + # Save return pointer + mov x3, x30 + ldr x30, [x0, seff_coroutine_t__resume_point + seff_cont_t__ip] + str x3, [x0, seff_coroutine_t__resume_point + seff_cont_t__ip] + + # At this point, the stack is aligned on a 16-byte boundary, so we use + # this as the system stack if possible. Note that we need to do this + # before clobbering x4 + cbnz x4, seff_resume_after_store_system_stack + add x10, x9, :tprel_hi12:_seff_system_stack + add x10, x10, :tprel_lo12_nc:_seff_system_stack + mov x11, sp + str x11, [x10] +seff_resume_after_store_system_stack: + + # Stack + swap_stack + + mov x0, x1 + ret + .cfi_endproc +.size seff_resume, . 
- seff_resume + +# void coroutine_prelude(seff_coroutine_t *k, seff_start_fun_t *fn, void *arg); +# k in x26 +# arg in x27 +# fn in x28 +coroutine_prelude: + .cfi_startproc + # needed or else gdb does not allow switching frames to a lower address in the backtrace + .cfi_signal_frame + + # At this point we are already at a new stack frame, we need to recover the CFA from x26 + .cfi_escape DW_CFA_def_cfa_expression, 2, DW_OP_breg(DW_REG_aarch64_r26), seff_coroutine_t__resume_point + # And then every other register from its offset to CFA + .cfi_offset x30, seff_cont_t__ip + .cfi_offset sp, seff_cont_t__rsp + .cfi_offset x29, seff_cont_t__rbp + .cfi_offset x19, seff_cont_t__r19 + .cfi_offset x20, seff_cont_t__r20 + .cfi_offset x21, seff_cont_t__r21 + .cfi_offset x22, seff_cont_t__r22 + .cfi_offset x23, seff_cont_t__r23 + .cfi_offset x24, seff_cont_t__r24 + .cfi_offset x25, seff_cont_t__r25 + .cfi_offset x26, seff_cont_t__r26 + .cfi_offset x27, seff_cont_t__r27 + .cfi_offset x28, seff_cont_t__r28 + + mov x0, x27 + # x26 is a callee-save register, so the debugger will continue to find the correct CFA + + blr x28 + # FALLTHROUGH TO SEFF_EXIT + # set the return value to (seff_result_t){ 0xFF...FF, x0 }, where x0 holds the value returned by fn + mov x2, x0 + mov x0, x26 + mov x1, #RETURN_EFFECT_ID + .cfi_endproc +.size coroutine_prelude, . 
- coroutine_prelude + +# void seff_exit(seff_coroutine_t* k, effect_id eff_id, void* result) +seff_exit: + .cfi_startproc + # k in x0 + # eff_id in x1 + # result in x2 + mov x9, #seff_coroutine_state_t__FINISHED + str x9, [x0, seff_coroutine_t__state] + + # TODO: it would be faster to avoid swapping, since we're never restarting this coroutine + swap_registers + + ldr x9, [x0, seff_coroutine_t__parent_coroutine] + mrs x10, TPIDR_EL0 + add x10, x10, :tprel_hi12:_seff_current_coroutine + add x10, x10, :tprel_lo12_nc:_seff_current_coroutine + str x9, [x10] + + ldr x9, [x0, seff_coroutine_t__resume_point + seff_cont_t__ip] + + ldr x29, [x0, seff_coroutine_t__resume_point + seff_cont_t__rbp] + ldr x10, [x0, seff_coroutine_t__resume_point + seff_cont_t__rsp] + mov sp, x10 + +#ifndef NDEBUG + # Cleaning up + str xzr, [x0, seff_coroutine_t__resume_point + seff_cont_t__ip] + + str xzr, [x0, seff_coroutine_t__resume_point + seff_cont_t__rbp] + str xzr, [x0, seff_coroutine_t__resume_point + seff_cont_t__rsp] +#endif + + mov x0, x1 + mov x1, x2 + br x9 + .cfi_endproc +.size seff_exit, . 
- seff_exit diff --git a/asm/seff.S b/asm/seff-x86-64.S similarity index 100% rename from asm/seff.S rename to asm/seff-x86-64.S diff --git a/decls/seff_types.py b/decls/seff_types.py index 45ca78f..a09bb85 100644 --- a/decls/seff_types.py +++ b/decls/seff_types.py @@ -20,7 +20,8 @@ parser = argparse.ArgumentParser() x86_64 = 'x86-64' -parser.add_argument('--arch', choices=[x86_64], default=x86_64) +aarch64 = 'aarch64' +parser.add_argument('--arch', choices=[x86_64, aarch64], default=x86_64) segmented = 'segmented' fixed = 'fixed' @@ -38,6 +39,11 @@ if args.arch == x86_64: arch = Architecture(64) generate.arch = arch + Defn('SEFF_ARCH_X86_64') +elif args.arch == aarch64: + arch = Architecture(64) + generate.arch = arch + Defn('SEFF_ARCH_AARCH64') else: print(f"Fatal error: unsupported architecture {args.arch}") exit() @@ -73,14 +79,33 @@ Field('rsp', ptr(void)), Field('rbp', ptr(void)), - Field('ip', ptr(void)), - Field('rbx', ptr(void)), - Field('r12', ptr(void)), - Field('r13', ptr(void)), - Field('r14', ptr(void)), - Field('r15', ptr(void)), ] +if args.arch == x86_64: + cont_fields.extend([ + Field('rbx', ptr(void)), + Field('r12', ptr(void)), + Field('r13', ptr(void)), + Field('r14', ptr(void)), + Field('r15', ptr(void)), + ]) +elif args.arch == aarch64: + cont_fields.extend([ + Field('r19', ptr(void)), + Field('r20', ptr(void)), + Field('r21', ptr(void)), + Field('r22', ptr(void)), + Field('r23', ptr(void)), + Field('r24', ptr(void)), + Field('r25', ptr(void)), + Field('r26', ptr(void)), + Field('r27', ptr(void)), + Field('r28', ptr(void)), + ]) +else: + print(f"Fatal error: unsupported architecture {args.arch}") + exit() + if stack_policy == segmented: cont_fields.insert(0, Field('stack_top', ptr(void))) cont = Struct('seff_cont_t', *cont_fields) diff --git a/src/seff.c b/src/seff.c index bbe124e..222533a 100644 --- a/src/seff.c +++ b/src/seff.c @@ -94,6 +94,7 @@ bool seff_coroutine_init_sized( seff_coroutine_t *k, seff_start_fun_t *fn, void *arg, size_t 
frame_size) { char *rsp; // Overhead needed by coroutine_prelude (since they make a call) + // TODO: architecture dependent? size_t overhead = 16; seff_frame_ptr_t stack = init_stack_frame(frame_size + overhead, &rsp); if (!stack) { @@ -108,11 +109,26 @@ bool seff_coroutine_init_sized( k->resume_point.stack_top = stack_top; #endif k->resume_point.current_coroutine = k; +#ifdef SEFF_ARCH_X86_64 k->resume_point.rbx = (void *)0xcacabbbb; k->resume_point.r12 = (void *)0xcaca1212; k->resume_point.r13 = (void *)k; k->resume_point.r14 = (void *)arg; k->resume_point.r15 = (void *)fn; +#elif defined(SEFF_ARCH_AARCH64) + k->resume_point.r19 = (void *)0xcaca1919; + k->resume_point.r20 = (void *)0xcaca2020; + k->resume_point.r21 = (void *)0xcaca2121; + k->resume_point.r22 = (void *)0xcaca2222; + k->resume_point.r23 = (void *)0xcaca2323; + k->resume_point.r24 = (void *)0xcaca2424; + k->resume_point.r25 = (void *)0xcaca2525; + k->resume_point.r26 = (void *)k; + k->resume_point.r27 = (void *)arg; + k->resume_point.r28 = (void *)fn; +#else +#error Architecture not supported in seff_coroutine_init_sized +#endif k->state = PAUSED; /* diff --git a/src/seff.h b/src/seff.h index b332552..eb478dd 100644 --- a/src/seff.h +++ b/src/seff.h @@ -81,6 +81,7 @@ E __attribute__((noreturn, no_split_stack)) void seff_exit( E __attribute__((noreturn)) void seff_throw(effect_id eff_id, void *payload); // TODO: this is architecture specific +#ifdef SEFF_ARCH_X86_64 #define MAKE_SYSCALL_WRAPPER(ret, fn, ...) \ ret __attribute__((no_split_stack)) fn##_syscall_wrapper(__VA_ARGS__); \ __asm__(#fn "_syscall_wrapper:" \ @@ -96,6 +97,14 @@ E __attribute__((noreturn)) void seff_throw(effect_id eff_id, void *payload); "movq %fs:_seff_paused_coroutine_stack_top@TPOFF, %rcx;", "", \ "") "movq %rcx, %fs:0x70;" \ "retq;") +#elif defined(SEFF_ARCH_AARCH64) +#define MAKE_SYSCALL_WRAPPER(ret, fn, ...) 
\ + ret __attribute__((no_split_stack)) fn##_syscall_wrapper(__VA_ARGS__); \ + __asm__(#fn "_syscall_wrapper:" \ + "b " #fn) +#else +#error Architecture not supported for MAKE_SYSCALL_WRAPPER +#endif #define EFF_ID(name) __##name##_eff_id #define EFF_PAYLOAD_T(name) __##name##_eff_payload diff --git a/tests/stack_aligned.c b/tests/stack_aligned.c index bef1f80..a49abbd 100644 --- a/tests/stack_aligned.c +++ b/tests/stack_aligned.c @@ -6,7 +6,11 @@ void *fn(void *arg); __asm__("fn:" +#ifdef SEFF_ARCH_X86_64 "movq %rsp, %rax;" +#else + "mov x0, sp;" +#endif "ret;"); int main(void) { diff --git a/tests/stack_red_zone.c b/tests/stack_red_zone.c index 4fae022..36f3abc 100644 --- a/tests/stack_red_zone.c +++ b/tests/stack_red_zone.c @@ -4,6 +4,7 @@ #include #include +#ifdef STACK_POLICY_SEGMENTED void *fn(void *arg); __asm__("fn:" "movq $0xFFFFFFFFFFFFFFFF, -0x08(%rsp);" @@ -37,3 +38,8 @@ int main(void) { return (uintptr_t)k->frame_ptr->canary; } +#else +int main(void) { + return 0; +} +#endif From bde1484adb35e28df34913cd6578953a610ef05f Mon Sep 17 00:00:00 2001 From: Brian Campbell Date: Sat, 21 Sep 2024 20:33:53 +0100 Subject: [PATCH 2/5] Fix fibonacci_filter_generator example --- tests/fibonacci_filter_generator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fibonacci_filter_generator.c b/tests/fibonacci_filter_generator.c index 59c8ee3..437b8fb 100644 --- a/tests/fibonacci_filter_generator.c +++ b/tests/fibonacci_filter_generator.c @@ -62,7 +62,7 @@ int main(void) { int64_t next_elt; for (int i = 0; i < 1000; i++) { seff_request_t req = seff_resume_handling_all(filter_fib, fib); - if (seff_finished(req) == FINISHED) { + if (seff_finished(req)) { break; } else { next_elt = (int64_t)req.payload; From 6eac53d5951511b2e504088134b8e20d682acc8a Mon Sep 17 00:00:00 2001 From: Brian Campbell Date: Sat, 21 Sep 2024 20:34:26 +0100 Subject: [PATCH 3/5] Comment out epoll stuff for BSD compatibility --- utils/net.c | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/utils/net.c b/utils/net.c index 69e384c..dfac2fb 100644 --- a/utils/net.c +++ b/utils/net.c @@ -22,7 +22,7 @@ #include #include #include -#include <sys/epoll.h> +// #include <sys/epoll.h> #include #include #include @@ -123,7 +123,7 @@ typedef enum : uint32_t { WRITE = POLLOUT, HANGUP = POLLHUP, ERROR = POLLERR, - ET = EPOLLET + // ET = EPOLLET } event_t; bool poll_await_condition(void *_arg) { @@ -134,7 +134,7 @@ bool poll_await_condition(void *_arg) { short poll_await(int fd, uint32_t awaiting_events) { // Only valid with epoll - assert(!(awaiting_events & EPOLLET)); + //assert(!(awaiting_events & EPOLLET)); struct pollfd polls; polls.events = (short)awaiting_events; polls.fd = fd; From bf451227655f3cb99d74e5bd4f03d8766a898857 Mon Sep 17 00:00:00 2001 From: Brian Campbell Date: Wed, 25 Sep 2024 17:30:05 +0100 Subject: [PATCH 4/5] Change test with alloca to use stdlib.h While alloca.h works on typical Linux systems, stdlib.h also works on BSD --- tests/coroutine_nested_syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/coroutine_nested_syscall.c b/tests/coroutine_nested_syscall.c index 2e434be..2a60780 100644 --- a/tests/coroutine_nested_syscall.c +++ b/tests/coroutine_nested_syscall.c @@ -1,5 +1,5 @@ #include "seff.h" -#include <alloca.h> +#include <stdlib.h> #include void heavy_hello(void) { From 4dbaff1d13c6f106b3bc61d19b41324c862dee55 Mon Sep 17 00:00:00 2001 From: Brian Campbell Date: Mon, 30 Sep 2024 15:50:53 +0100 Subject: [PATCH 5/5] Fix header comment --- asm/seff-aarch64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asm/seff-aarch64.S b/asm/seff-aarch64.S index c36b34b..42a59f4 100644 --- a/asm/seff-aarch64.S +++ b/asm/seff-aarch64.S @@ -11,7 +11,7 @@ # See the Mulan PSL v2 for more details. # # Architecture-specific code for context-switching -# This file is for the System V (Linux/OSX) 64 bit architecture +# This file is for Arm's AArch64 instruction set architecture # #include "dwarf.S"