diff --git a/include/l4/ipc.h b/include/l4/ipc.h
index cfccae99..50ee23d4 100644
--- a/include/l4/ipc.h
+++ b/include/l4/ipc.h
@@ -9,7 +9,13 @@
 #define IPC_TI_MAP_GRANT 0x8
 #define IPC_TI_GRANT 0x2
 
-#define IPC_MR_COUNT 16
+/* Total MR capacity with short message buffer:
+ * MR0-MR7: 8 words in registers (ctx.regs[0-7])
+ * MR8-MR39: 32 words in msg_buffer[0-31]
+ * MR40-MR47: 8 words in UTCB (utcb->mr[0-7])
+ * Total: 48 MRs = 192 bytes
+ */
+#define IPC_MR_COUNT 48
 
 typedef union {
     struct {
diff --git a/include/l4/utcb.h b/include/l4/utcb.h
index 8c425796..d60a071c 100644
--- a/include/l4/utcb.h
+++ b/include/l4/utcb.h
@@ -25,8 +25,15 @@ struct utcb {
     uint32_t thread_word_1;
     uint32_t thread_word_2;
     /* +12w */
-    uint32_t mr[8]; /* MRs 8-15 (0-8 are laying in
-                       r4..r11 [thread's context]) */
+    /* Message Registers (MR) mapping with short message buffer:
+     * MR0-MR7: Hardware registers R4-R11 (ctx.regs[0-7]) - 32 bytes
+     * MR8-MR39: Short message buffer (tcb->msg_buffer[0-31]) - 128 bytes
+     * MR40-MR47: UTCB overflow (mr[0-7]) - 32 bytes
+     *
+     * Total message capacity: 192 bytes (48 words)
+     * Fastpath capacity: 160 bytes (40 words, MR0-MR39)
+     */
+    uint32_t mr[8]; /* MRs 40-47 (overflow beyond short buffer) */
     /* +20w */
     uint32_t br[8];
     /* +28w */
diff --git a/include/platform/ipc-fastpath.h b/include/platform/ipc-fastpath.h
new file mode 100644
index 00000000..87bdab79
--- /dev/null
+++ b/include/platform/ipc-fastpath.h
@@ -0,0 +1,204 @@
+/* Copyright (c) 2026 The F9 Microkernel Project. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef PLATFORM_IPC_FASTPATH_H_
+#define PLATFORM_IPC_FASTPATH_H_
+
+#include <error.h>
+#include <ipc.h>
+#include <l4/ipc.h>
+#include <l4/utcb.h>
+#include <platform/armv7m.h>
+#include <platform/irq.h>
+#include <sched.h>
+#include <thread.h>
+
+/**
+ * Fastpath IPC optimization for ARM Cortex-M.
+ *
+ * Bypasses softirq scheduling for simple short-message IPC by performing
+ * direct register transfer in the SVC handler context.
+ *
+ * CRITICAL: Message registers (MR0-MR7) are in R4-R11, NOT R0-R7.
+ * - R0-R3: Syscall parameters (to_tid, from_tid, timeout, unused)
+ * - R4-R11: Message registers (MR0-MR7) via ctx.regs[0-7]
+ *
+ * This implementation uses a NAKED wrapper to capture R4-R11 immediately
+ * before any compiler-generated prologue can clobber them.
+ *
+ * Implementation is in header as static inline for zero call overhead.
+ */
+
+/**
+ * ipc_fastpath_copy_mrs() - Copy message registers to receiver
+ * @saved_mrs: Saved message registers R4-R11 (MR0-MR7)
+ * @sender: Source thread (for msg_buffer access)
+ * @receiver: Destination thread
+ * @n_untyped: Number of untyped words to copy (0-39)
+ *
+ * Copies MR0-MR{n_untyped} from sender to receiver:
+ * - MR0-MR7: From saved_mrs to receiver->ctx.regs[0-7]
+ * - MR8-MR39: From sender->msg_buffer to receiver->msg_buffer (NEW)
+ *
+ * WCET: ~20 cycles (MR0-MR7) + ~100 cycles (MR8-MR39, if used)
+ */
+static inline void ipc_fastpath_copy_mrs(volatile uint32_t *saved_mrs,
+                                         struct tcb *sender,
+                                         struct tcb *receiver,
+                                         int n_untyped)
+{
+    int count = n_untyped + 1; /* +1 for tag in MR0 */
+    int i;
+
+    /* Phase 1: Copy MR0-MR7 from saved registers (R4-R11) */
+    for (i = 0; i < count && i < 8; i++)
+        receiver->ctx.regs[i] = saved_mrs[i];
+
+    /* Phase 2: Copy MR8-MR39 from sender's msg_buffer (if needed) */
+    if (count > 8) {
+        int buf_count = count - 8; /* Number of words in buffer */
+        if (buf_count > 32)
+            buf_count = 32; /* Clamp to buffer size */
+
+        for (i = 0; i < buf_count; i++)
+            receiver->msg_buffer[i] = sender->msg_buffer[i];
+    }
+}
+
+/**
+ * ipc_fastpath_helper() - Fastpath IPC implementation (C helper)
+ * @caller: Current thread attempting IPC
+ * @svc_param: SVC stack frame (R0-R3, R12, LR, PC, xPSR)
+ * @saved_mrs: Pre-saved R4-R11 message registers
+ *
+ * Called by the naked wrapper after message registers have been captured.
+ * Returns 1 if fastpath succeeded, 0 to fall back to slowpath.
+ */
+static inline int ipc_fastpath_helper(struct tcb *caller,
+                                      uint32_t *svc_param,
+                                      volatile uint32_t *saved_mrs)
+{
+    struct tcb *to_thr;
+    l4_thread_t to_tid, from_tid;
+    ipc_msg_tag_t tag;
+
+    /* Extract IPC parameters from hardware stack (R0-R3) */
+    to_tid = svc_param[REG_R0];
+    from_tid = svc_param[REG_R1];
+
+    /* Extract tag from saved MR0 (R4), NOT from R0! */
+    tag.raw = saved_mrs[0];
+
+    /* Fastpath Eligibility Check */
+
+    /* Criterion 1: Simple send (to_tid valid, from_tid = NILTHREAD) */
+    if (to_tid == L4_NILTHREAD || from_tid != L4_NILTHREAD)
+        return 0; /* Slowpath: receive-only or send+receive */
+
+    /* Criterion 2: No typed items (no MapItems/GrantItems) */
+    if (tag.s.n_typed != 0)
+        return 0; /* Slowpath: requires map_area() processing */
+
+    /* Criterion 3: Short message (fits in MR0-MR39: registers + buffer)
+     * MR0-MR7: 8 words × 4 bytes = 32 bytes (registers)
+     * MR8-MR39: 32 words × 4 bytes = 128 bytes (buffer)
+     * Total capacity: 160 bytes (40 words)
+     */
+    if (tag.s.n_untyped > 39)
+        return 0; /* Slowpath: requires UTCB access */
+
+    /* Criterion 4: Receiver exists and is blocked waiting */
+    to_thr = thread_by_globalid(to_tid);
+    if (!to_thr || to_thr->state != T_RECV_BLOCKED)
+        return 0; /* Slowpath: receiver not ready */
+
+    /* Criterion 5: Receiver is waiting for us */
+    if (to_thr->ipc_from != L4_ANYTHREAD &&
+        to_thr->ipc_from != caller->t_globalid)
+        return 0; /* Slowpath: receiver waiting for someone else */
+
+    /* Criterion 6: Special thread handling */
+    if (to_tid == TID_TO_GLOBALID(THREAD_LOG) ||
+        to_tid == TID_TO_GLOBALID(THREAD_IRQ_REQUEST))
+        return 0; /* Slowpath: special kernel threads */
+
+    /* Criterion 7: Thread start protocol */
+    if (tag.raw == 0x00000005)
+        return 0; /* Slowpath: thread initialization */
+
+    /* All criteria met - Execute Fastpath */
+
+    /* Phase 0: Dequeue caller (will re-enqueue later) */
+    extern void sched_dequeue(struct tcb *);
+    sched_dequeue(caller);
+
+    /* Phase 1: Copy message registers from sender to receiver
+     * - MR0-MR7: From saved registers (R4-R11)
+     * - MR8-MR39: From sender's msg_buffer to receiver's msg_buffer (if
+     *   needed)
+     */
+    ipc_fastpath_copy_mrs(saved_mrs, caller, to_thr, tag.s.n_untyped);
+
+    /* Phase 2: Update receiver context */
+    /* Set R0 to sender ID (IPC protocol) */
+    ((uint32_t *) to_thr->ctx.sp)[REG_R0] = caller->t_globalid;
+    to_thr->utcb->sender = caller->t_globalid;
+
+    /* Phase 3: Update thread states */
+
+    /* Clear timeout events (no timeout in fastpath) */
+    caller->timeout_event = 0;
+    to_thr->timeout_event = 0;
+
+    /* Receiver becomes runnable with IPC priority boost */
+    to_thr->state = T_RUNNABLE;
+    to_thr->ipc_from = L4_NILTHREAD;
+    sched_set_priority(to_thr, SCHED_PRIO_IPC);
+    sched_enqueue(to_thr);
+
+    /* Caller continues (send-only, no reply expected)
+     * Fastpath only handles from_tid==NILTHREAD (simple send).
+     * For L4_Call (send+receive), slowpath handles blocking.
+     *
+     * Re-enqueue caller (was dequeued at SVC entry).
+     * It's safe to enqueue current thread - sched has double-enqueue
+     * protection.
+     */
+    caller->state = T_RUNNABLE;
+    sched_enqueue(caller);
+
+    /* Phase 4: Request context switch via PendSV */
+    /* DON'T do immediate switch - let PendSV handle it normally */
+    request_schedule();
+
+    return 1; /* Fastpath succeeded */
+}
+
+/**
+ * ipc_try_fastpath() - IPC fastpath using pre-saved R4-R11
+ * @caller: Current thread attempting IPC
+ * @svc_param: SVC stack frame (R0-R3, R12, LR, PC, xPSR)
+ *
+ * Reads message registers from __irq_saved_regs which were saved by
+ * SVC_HANDLER before any C code ran, ensuring MR0-MR7 are untouched.
+ *
+ * Returns:
+ *   1 if fastpath succeeded (caller should skip slowpath)
+ *   0 if fastpath unavailable (caller must use slowpath)
+ *
+ * Eligibility criteria:
+ * - Simple send (to_tid valid, from_tid == NILTHREAD)
+ * - Short message (n_untyped <= 39, n_typed == 0)
+ * - Receiver ready (T_RECV_BLOCKED, waiting for caller or ANYTHREAD)
+ */
+static inline int ipc_try_fastpath(struct tcb *caller, uint32_t *svc_param)
+{
+    extern volatile uint32_t __irq_saved_regs[8];
+
+    /* Read from global __irq_saved_regs saved by SVC_HANDLER */
+    return ipc_fastpath_helper(caller, svc_param, __irq_saved_regs);
+}
+
+#endif /* PLATFORM_IPC_FASTPATH_H_ */
diff --git a/include/platform/irq.h b/include/platform/irq.h
index efeeed58..5500e2d5 100644
--- a/include/platform/irq.h
+++ b/include/platform/irq.h
@@ -317,6 +317,38 @@ extern volatile uint32_t __irq_saved_regs[8];
         request_schedule();                                             \
         irq_return();                                                   \
     }
+
+/*
+ * SVC_HANDLER - Specialized handler for SVC exceptions requiring R4-R11
+ * preservation.
+ *
+ * Unlike IRQ_HANDLER, this variant saves R4-R11 to __irq_saved_regs BEFORE
+ * calling the C handler, ensuring message registers are captured untouched.
+ *
+ * This is critical for IPC fastpath optimization where user message registers
+ * (MR0-MR7 in R4-R11) must be available before any C code runs.
+ *
+ * IMPORTANT: Must restore R4-R11 before returning since the C handler may
+ * clobber these registers.
+ *
+ * Usage: SVC_HANDLER(svc_handler, __svc_handler);
+ */
+#define SVC_HANDLER(name, sub)                                          \
+    void name(void) __NAKED;                                            \
+    void name(void)                                                     \
+    {                                                                   \
+        irq_enter();                                                    \
+        irq_save_regs_only();                                           \
+        sub();                                                          \
+        request_schedule();                                             \
+        /* Restore R4-R11 before returning */                           \
+        __asm__ __volatile__(                                           \
+            "ldr r0, =__irq_saved_regs\n\t"                             \
+            "ldm r0, {r4-r11}" ::                                       \
+            : "r0", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",   \
+              "memory");                                                \
+        irq_return();                                                   \
+    }
 
 extern volatile tcb_t *current;
 
 #endif /* PLATFORM_IRQ_H_ */
diff --git a/include/thread.h b/include/thread.h
index 5f21a3f3..21b20d61 100644
--- a/include/thread.h
+++ b/include/thread.h
@@ -166,6 +166,22 @@ struct tcb {
 
     uint8_t notify_pending;
     uint8_t _notify_pad[1]; /* Alignment padding */
+
+    /* Short message buffer for IPC fastpath optimization.
+     * Extends fastpath coverage from 32 bytes (registers only) to 160 bytes.
+     * Maps to L4 virtual registers MR8-MR39 (32 additional 4-byte registers).
+     *
+     * Memory layout:
+     * - MR0-MR7: ctx.regs[0-7] (32 bytes, R4-R11 hardware registers)
+     * - MR8-MR39: msg_buffer[0-31] (128 bytes, TCB-embedded buffer)
+     * - MR40-MR47: utcb->mr[0-7] (32 bytes, UTCB overflow)
+     *
+     * Fastpath eligibility: n_untyped <= 39 (160 bytes total)
+     * Expected fastpath coverage: 70% → 95%
+     *
+     * RAM impact: 128 bytes × 20 threads = 2.5 KB (1.3% of 192KB)
+     */
+    uint32_t msg_buffer[32];
 };
 
 typedef struct tcb tcb_t;
diff --git a/kernel/ipc.c b/kernel/ipc.c
index a6340c90..3a75e979 100644
--- a/kernel/ipc.c
+++ b/kernel/ipc.c
@@ -35,19 +35,33 @@ static inline void thread_make_runnable(tcb_t *thr)
     sched_enqueue(thr);
 }
 
+/* Read message register with short buffer support.
+ * MR0-MR7: Hardware registers R4-R11 (ctx.regs[0-7])
+ * MR8-MR39: Short message buffer (msg_buffer[0-31]) - NEW
+ * MR40-MR47: UTCB overflow (utcb->mr[0-7])
+ */
 uint32_t ipc_read_mr(tcb_t *from, int i)
 {
-    if (i >= 8)
-        return from->utcb->mr[i - 8];
-    return from->ctx.regs[i];
+    if (i < 8)
+        return from->ctx.regs[i];
+    if (i < 40)
+        return from->msg_buffer[i - 8];
+    return from->utcb->mr[i - 40];
 }
 
+/* Write message register with short buffer support.
+ * MR0-MR7: Hardware registers R4-R11 (ctx.regs[0-7])
+ * MR8-MR39: Short message buffer (msg_buffer[0-31]) - NEW
+ * MR40-MR47: UTCB overflow (utcb->mr[0-7])
+ */
 void ipc_write_mr(tcb_t *to, int i, uint32_t data)
 {
-    if (i >= 8)
-        to->utcb->mr[i - 8] = data;
-    else
+    if (i < 8)
         to->ctx.regs[i] = data;
+    else if (i < 40)
+        to->msg_buffer[i - 8] = data;
+    else
+        to->utcb->mr[i - 40] = data;
 }
 
 static void user_ipc_error(tcb_t *thr, enum user_error_t error)
@@ -351,19 +365,19 @@ void sys_ipc(uint32_t *param1)
     } else if (to_thr && to_thr->state == T_INACTIVE &&
                GLOBALID_TO_TID(to_thr->utcb->t_pager) ==
                    GLOBALID_TO_TID(caller->t_globalid)) {
-        if (ipc_read_mr(caller, 0) == 0x00000005) {
-            /* mr1: thread func, mr2: stack addr,
-             * mr3: stack size
-             * mr4: thread entry, mr5: thread args
-             * thread start protocol */
+        uint32_t tag = ipc_read_mr(caller, 0);
+        if (tag == 0x00000005) {
+            /* Thread start protocol from pager:
+             * mr1: thread_container (wrapper), mr2: sp,
+             * mr3: stack size, mr4: entry point, mr5: entry arg */
+            uint32_t mr1_container = ipc_read_mr(caller, 1);
             memptr_t sp = ipc_read_mr(caller, 2);
             size_t stack_size = ipc_read_mr(caller, 3);
+            uint32_t entry_point = ipc_read_mr(caller, 4);
+            uint32_t entry_arg = ipc_read_mr(caller, 5);
             uint32_t regs[4]; /* r0, r1, r2, r3 */
 
-            dbg_printf(DL_IPC, "IPC: %t thread start sp:%p stack_size:%p\n",
-                       to_tid, sp, stack_size);
-
             /* Security check: Ensure stack is in user-writable memory */
             int pid = mempool_search(sp - stack_size, stack_size);
             mempool_t *mp = mempool_getbyid(pid);
@@ -387,10 +401,12 @@ void sys_ipc(uint32_t *param1)
 
             regs[REG_R0] = (uint32_t) &kip;
             regs[REG_R1] = (uint32_t) to_thr->utcb;
-            regs[REG_R2] = ipc_read_mr(caller, 4);
-            regs[REG_R3] = ipc_read_mr(caller, 5);
-            thread_init_ctx((void *) sp, (void *) ipc_read_mr(caller, 1),
-                            regs, to_thr);
+            regs[REG_R2] =
+                entry_point; /* Actual entry passed to container */
+            regs[REG_R3] = entry_arg;
+
+            thread_init_ctx((void *) sp, (void *) mr1_container, regs,
+                            to_thr);
 
             thread_make_runnable(caller);
diff --git a/kernel/syscall.c b/kernel/syscall.c
index cb21964c..a3c3aa66 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -10,6 +10,7 @@
 #include <debug.h>
 #include <ipc.h>
 #include <memory.h>
+#include <platform/ipc-fastpath.h>
 #include <softirq.h>
 #include <syscall.h>
 #include <thread.h>
@@ -28,24 +29,82 @@
 
 tcb_t *caller;
 
-void __svc_handler(void)
+/* Always returns 0; fastpath and slowpath both use PendSV for context switching
+ */
+int __svc_handler(void)
 {
     extern tcb_t *kernel;
+    uint32_t *svc_param;
+    uint8_t svc_num;
 
     /* Kernel requests context switch, satisfy it */
     if (thread_current() == kernel)
-        return;
+        return 0;
 
     caller = thread_current();
-    /* Dequeue before blocking (strict queue invariant) */
-    sched_dequeue(caller);
-    caller->state = T_SVC_BLOCKED;
 
-    softirq_schedule(SYSCALL_SOFTIRQ);
+    /* CRITICAL: Read SVC frame from CURRENT PSP, not stale ctx.sp!
+     * ctx.sp is only updated during context switch, but user thread may have
+     * pushed/popped since then. Hardware SVC frame is on the CURRENT stack.
+     */
+    uint32_t psp;
+    __asm__ __volatile__("mrs %0, psp" : "=r"(psp));
+    svc_param = (uint32_t *) psp;
+    svc_num = ((char *) svc_param[REG_PC])[-2];
+
+    if (svc_num == SYS_IPC) {
+        extern volatile uint32_t __irq_saved_regs[8];
+
+        /* Try fastpath with saved message registers */
+        if (ipc_fastpath_helper(caller, svc_param, __irq_saved_regs)) {
+            /* Fastpath succeeded - MRs copied, threads enqueued, PendSV
+             * requested. Caller is already T_RUNNABLE and enqueued by fastpath.
+             * Just return normally - PendSV will do the context switch. */
+            return 0; /* Normal return, PendSV will switch */
+        }
+
+        /* Fastpath failed, use slowpath */
+        /* Slowpath will dequeue caller in softirq handler */
+        sched_dequeue(caller);
+        caller->state = T_SVC_BLOCKED;
+        softirq_schedule(SYSCALL_SOFTIRQ);
+        return 0;
+    } else {
+        /* Non-IPC syscall */
+        sched_dequeue(caller);
+        caller->state = T_SVC_BLOCKED;
+        softirq_schedule(SYSCALL_SOFTIRQ);
+        return 0;
+    }
 }
 
-IRQ_HANDLER(svc_handler, __svc_handler);
+/* Custom SVC handler with fastpath support.
+ *
+ * NOTE: R4-R11 must be captured in a single asm block together with the
+ * LR save so the compiler cannot insert register-clobbering code between
+ * exception entry and the save.  Two words (r12, lr) are pushed so that
+ * SP stays 8-byte aligned across the call into C, as required by AAPCS
+ * (exception entry guarantees 8-byte alignment; a one-word push would
+ * break it).  r12 is hardware-stacked on exception entry, so using it as
+ * a scratch save slot is harmless.
+ */
+void svc_handler(void) __NAKED;
+void svc_handler(void)
+{
+    /* Save LR (keeping SP 8-byte aligned) and R4-R11 before any C code */
+    __asm__ __volatile__(
+        "push {r12, lr}\n\t"
+        "ldr r0, =__irq_saved_regs\n\t"
+        "stm r0, {r4-r11}" ::
+        : "r0", "memory");
+
+    /* Call C handler - always returns 0 (context switch via PendSV) */
+    __svc_handler();
+
+    /* Restore R4-R11 BEFORE returning so PendSV saves original values */
+    __asm__ __volatile__(
+        "ldr r0, =__irq_saved_regs\n\t"
+        "ldm r0, {r4-r11}\n\t"
+        /* Now set PendSV and return normally */
+        "ldr r0, =0xE000ED04\n\t"  /* SCB_ICSR */
+        "ldr r1, [r0]\n\t"
+        "orr r1, #0x10000000\n\t"  /* SCB_ICSR_PENDSVSET */
+        "str r1, [r0]\n\t"
+        "pop {r12, lr}\n\t"
+        "bx lr" ::
+        : "r0", "r1", "memory");
+}
 
 void syscall_init()
 {
diff --git a/kernel/user-log.c b/kernel/user-log.c
index 53544774..165a450a 100644
--- a/kernel/user-log.c
+++ b/kernel/user-log.c
@@ -4,6 +4,7 @@
  */
 
 #include <debug.h>
+#include <memory.h>
 #include <thread.h>
 #include <user-log.h>
 
@@ -12,12 +13,46 @@ void user_log(tcb_t *from)
     char *format = (char *) from->ctx.regs[1];
     va_list *va = (va_list *) from->ctx.regs[2];
 
-    /* Debug: validate pointers before use */
-    if (!format || (uint32_t) format < 0x08000000 ||
-        (uint32_t) format > 0x20020000) {
-        dbg_printf(DL_KDB, "[ULOG: bad fmt %p]\n", format);
+    /* CRITICAL: Validate user pointers before dereferencing!
+     * User can pass arbitrary pointers that could cause kernel fault.
+     * Check that pointers are in valid mapped memory.
+     */
+    memptr_t format_addr = (memptr_t) format;
+    memptr_t va_addr = (memptr_t) va;
+
+    /* Check format pointer is in valid executable memory (code section)
+     * Format strings must be in UTEXT (MP_UX) or KTEXT (MP_KX), not arbitrary
+     * kernel data. This prevents malicious users from using kernel data
+     * structures as format strings to leak kernel information.
+     */
+    int format_pool = mempool_search(format_addr, 1);
+    if (format_pool < 0) {
+        dbg_printf(DL_KDB, "[user_log] Invalid format pointer: %p\n", format);
+        return;
+    }
+
+    mempool_t *pool = mempool_getbyid(format_pool);
+    if (!pool || !(pool->flags & (MP_UX | MP_KX))) {
+        dbg_printf(DL_KDB,
+                   "[user_log] Format pointer not in executable memory: %p\n",
+                   format);
+        return;
+    }
+
+    /* Check va_list pointer is in valid user-accessible memory (stack/data) */
+    int va_pool = mempool_search(va_addr, sizeof(va_list));
+    if (va_pool < 0) {
+        dbg_printf(DL_KDB, "[user_log] Invalid va_list pointer: %p\n", va);
+        return;
+    }
+
+    pool = mempool_getbyid(va_pool);
+    if (!pool || !(pool->flags & (MP_UR | MP_UW))) {
+        dbg_printf(DL_KDB,
+                   "[user_log] va_list pointer not user-accessible: %p\n", va);
         return;
     }
 
+    /* Pointers validated, safe to dereference */
     dbg_vprintf(DL_KDB, format, *va);
 }
diff --git a/mk/generic.mk b/mk/generic.mk
index fa7a3dd3..5dbff476 100644
--- a/mk/generic.mk
+++ b/mk/generic.mk
@@ -105,6 +105,25 @@ qemu: $(out)/$(PROJECT).bin
	-killall -q qemu-system-arm
	$(QEMU) -M netduinoplus2 -nographic -kernel $(out)/$(PROJECT).elf -serial mon:stdio
 
+# QEMU with GDB debugging
+# Terminal 1: make qemu-gdb
+# Terminal 2: make gdb-attach
+.PHONY: qemu-gdb
+qemu-gdb: $(out)/$(PROJECT).bin
+	@echo "Starting QEMU with GDB server on port 1234..."
+	@echo "In another terminal, run: make gdb-attach"
+	@echo "Press Ctrl+C to exit"
+	-killall -q qemu-system-arm
+	$(QEMU) -M netduinoplus2 -nographic -kernel $(out)/$(PROJECT).elf -s -S
+
+.PHONY: gdb-attach
+gdb-attach: $(out)/$(PROJECT).elf
+	@echo "Connecting to QEMU GDB server..."
+	arm-none-eabi-gdb $(out)/$(PROJECT).elf \
+		-ex "target remote :1234" \
+		-ex "layout src" \
+		-ex "layout regs"
+
 # QEMU automated testing
 # Usage: make run-tests (test suite)
 #        make run-tests FAULT=mpu (MPU fault test)
diff --git a/user/lib/l4/pager.c b/user/lib/l4/pager.c
index 46d4d5a3..8cf15f5f 100644
--- a/user/lib/l4/pager.c
+++ b/user/lib/l4/pager.c
@@ -173,6 +173,9 @@ static void start_thread(L4_ThreadId_t t,
 {
     L4_Msg_t msg;
 
+    /* CRITICAL: Do NOT call functions after L4_MsgLoad!
+     * Any function call (including printf) will clobber R4-R11 (MR0-MR7)!
+     */
     L4_MsgClear(&msg);
     L4_MsgAppendWord(&msg, (L4_Word_t) thread_container);
     L4_MsgAppendWord(&msg, sp);
diff --git a/user/lib/l4/platform/syscalls.c b/user/lib/l4/platform/syscalls.c
index 246d7f65..ee99dbfa 100644
--- a/user/lib/l4/platform/syscalls.c
+++ b/user/lib/l4/platform/syscalls.c
@@ -127,10 +127,16 @@ L4_MsgTag_t L4_Ipc(L4_ThreadId_t to,
     register L4_Word_t r1 __asm__("r1") = FromSpecifier.raw;
     register L4_Word_t r2 __asm__("r2") = Timeouts;
 
+    /* CRITICAL: Declare R4-R11 as clobbered to force compiler to preserve
+     * global register variables (__L4_MR0-__L4_MR7) across the SVC call.
+     * Without this, compiler may generate code that uses R4-R11 as scratch
+     * registers before the SVC, corrupting the message registers.
+     */
     __asm__ __volatile__("svc %[syscall_num]\n"
                         : "+r"(r0)
                         : "r"(r1), "r"(r2), [syscall_num] "i"(SYS_IPC)
-                        : "memory");
+                        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
+                          "r11");
 
    result.raw = __L4_MR0;