From f0f547cbd3a08aa11cdb8a4a910ab9bba21b6f71 Mon Sep 17 00:00:00 2001
From: Luke Gruber
Date: Thu, 4 Sep 2025 13:02:46 -0400
Subject: [PATCH 1/3] Make VM lock per fiber instead of per ractor.

Having it per-ractor is weird because it means two threads in the same
ractor can both acquire the lock (if it's through the re-entrant
version). I don't think that's how it was envisioned to work.

Also, it's currently broken if you raise an exception while the lock is
held, because `rb_ec_vm_lock_rec(ec)` gives you the state of the
ractor's lock_rec, and that is what gets saved on `tag.lock_rec`. This
value doesn't necessarily reflect the state of that thread or fiber's
lock_rec (or even whether that thread or fiber locked the VM lock at
all), so we can't know whether we should release it or how many levels
to release.

I changed it to work per-fiber, like Ruby mutexes. Now we can trust
that the saved value is correct for each fiber, and other threads and
fibers trying to acquire the VM lock will be blocked by the fiber that
acquired it.

Also, there was a "bug" (I'm not sure it can happen) where acquiring
the VM lock and then calling rb_fork could deadlock. After a fork we
should reset ractor.sync.lock_rec to 0 and lock_owner to NULL, in case
the VM lock was held above the `rb_fork` call site.
---
 thread_pthread.c |  9 +++++++++
 vm_core.h        |  9 +++++----
 vm_sync.c        | 25 ++++++++++++++++---------
 3 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/thread_pthread.c b/thread_pthread.c
index 730ecb54163f7d..a4aa493965598e 100644
--- a/thread_pthread.c
+++ b/thread_pthread.c
@@ -1419,8 +1419,10 @@ rb_ractor_sched_barrier_start(rb_vm_t *vm, rb_ractor_t *cr)
 
         // release VM lock
         lock_rec = vm->ractor.sync.lock_rec;
+        rb_fiber_t *fiber = vm->ractor.sync.lock_owner_fiber;
         vm->ractor.sync.lock_rec = 0;
         vm->ractor.sync.lock_owner = NULL;
+        vm->ractor.sync.lock_owner_fiber = NULL;
         rb_native_mutex_unlock(&vm->ractor.sync.lock);
 
         // interrupts all running threads
@@ -1450,6 +1452,7 @@ rb_ractor_sched_barrier_start(rb_vm_t *vm, rb_ractor_t *cr)
         rb_native_mutex_lock(&vm->ractor.sync.lock);
         vm->ractor.sync.lock_rec = lock_rec;
         vm->ractor.sync.lock_owner = cr;
+        vm->ractor.sync.lock_owner_fiber = fiber;
     }
 
     // do not release ractor_sched_lock and threre is no newly added (resumed) thread
@@ -1583,6 +1586,12 @@ thread_sched_atfork(struct rb_thread_sched *sched)
     }
     vm->ractor.sched.running_cnt = 0;
 
+    vm->ractor.sync.lock_rec = 0;
+    th->ec->tag->lock_rec = 0;
+    vm->ractor.sync.lock_owner = NULL;
+    vm->ractor.sync.lock_owner_fiber = NULL;
+    rb_native_mutex_initialize(&vm->ractor.sync.lock);
+    rb_native_mutex_initialize(&vm->ractor.sched.lock);
 #if VM_CHECK_MODE > 0
     vm->ractor.sched.lock_owner = NULL;
diff --git a/vm_core.h b/vm_core.h
index 9986d3e9231163..ebcb031a92f635 100644
--- a/vm_core.h
+++ b/vm_core.h
@@ -676,6 +676,7 @@ typedef struct rb_vm_struct {
         // monitor
         rb_nativethread_lock_t lock;
         struct rb_ractor_struct *lock_owner;
+        struct rb_fiber_struct *lock_owner_fiber;
         unsigned int lock_rec;
 
         // join at exit
@@ -2073,10 +2074,10 @@ void rb_ec_vm_lock_rec_release(const rb_execution_context_t *ec,
 /* This technically is a data race, as it's checked without the lock, however we
  * check against a value only our own thread will write. */
 NO_SANITIZE("thread", static inline bool
-vm_locked_by_ractor_p(rb_vm_t *vm, rb_ractor_t *cr))
+vm_locked_by_fiber_p(rb_vm_t *vm, rb_fiber_t *cur_f))
 {
-    VM_ASSERT(cr == GET_RACTOR());
-    return vm->ractor.sync.lock_owner == cr;
+    VM_ASSERT(cur_f == GET_THREAD()->ec->fiber_ptr);
+    return vm->ractor.sync.lock_owner_fiber == cur_f;
 }
 
 static inline unsigned int
@@ -2084,7 +2085,7 @@ rb_ec_vm_lock_rec(const rb_execution_context_t *ec)
 {
     rb_vm_t *vm = rb_ec_vm_ptr(ec);
 
-    if (!vm_locked_by_ractor_p(vm, rb_ec_ractor_ptr(ec))) {
+    if (!vm_locked_by_fiber_p(vm, ec->fiber_ptr)) {
         return 0;
     }
     else {
diff --git a/vm_sync.c b/vm_sync.c
index ba311a00e97838..9254fccd437647 100644
--- a/vm_sync.c
+++ b/vm_sync.c
@@ -12,7 +12,7 @@ void rb_ractor_sched_barrier_end(rb_vm_t *vm, rb_ractor_t *cr);
 static bool
 vm_locked(rb_vm_t *vm)
 {
-    return vm_locked_by_ractor_p(vm, GET_RACTOR());
+    return vm_locked_by_fiber_p(vm, GET_THREAD()->ec->fiber_ptr);
 }
 
 #if RUBY_DEBUG > 0
@@ -62,7 +62,7 @@ vm_need_barrier(bool no_barrier, const rb_ractor_t *cr, const rb_vm_t *vm)
 }
 
 static void
-vm_lock_enter(rb_ractor_t *cr, rb_vm_t *vm, bool locked, bool no_barrier, unsigned int *lev APPEND_LOCATION_ARGS)
+vm_lock_enter(rb_ractor_t *cr, rb_fiber_t *fiber, rb_vm_t *vm, bool locked, bool no_barrier, unsigned int *lev APPEND_LOCATION_ARGS)
 {
     RUBY_DEBUG_LOG2(file, line, "start locked:%d", locked);
 
@@ -77,6 +77,7 @@ vm_lock_enter(rb_ractor_t *cr, rb_vm_t *vm, bool locked, bool no_barrier, unsign
         // lock
         rb_native_mutex_lock(&vm->ractor.sync.lock);
         VM_ASSERT(vm->ractor.sync.lock_owner == NULL);
+        VM_ASSERT(vm->ractor.sync.lock_owner_fiber == NULL);
         VM_ASSERT(vm->ractor.sync.lock_rec == 0);
 
         // barrier
@@ -93,7 +94,9 @@ vm_lock_enter(rb_ractor_t *cr, rb_vm_t *vm, bool locked, bool no_barrier, unsign
 
         VM_ASSERT(vm->ractor.sync.lock_rec == 0);
         VM_ASSERT(vm->ractor.sync.lock_owner == NULL);
+        VM_ASSERT(vm->ractor.sync.lock_owner_fiber == NULL);
         vm->ractor.sync.lock_owner = cr;
+        vm->ractor.sync.lock_owner_fiber = fiber;
     }
 
     vm->ractor.sync.lock_rec++;
@@ -130,6 +133,7 @@ vm_lock_leave(rb_vm_t *vm, bool no_barrier, unsigned int *lev APPEND_LOCATION_AR
 
     if (vm->ractor.sync.lock_rec == 0) {
         vm->ractor.sync.lock_owner = NULL;
+        vm->ractor.sync.lock_owner_fiber = NULL;
         rb_native_mutex_unlock(&vm->ractor.sync.lock);
     }
 }
@@ -139,10 +143,10 @@ rb_vm_lock_enter_body(unsigned int *lev APPEND_LOCATION_ARGS)
 {
     rb_vm_t *vm = GET_VM();
     if (vm_locked(vm)) {
-        vm_lock_enter(NULL, vm, true, false, lev APPEND_LOCATION_PARAMS);
+        vm_lock_enter(NULL, NULL, vm, true, false, lev APPEND_LOCATION_PARAMS);
     }
     else {
-        vm_lock_enter(GET_RACTOR(), vm, false, false, lev APPEND_LOCATION_PARAMS);
+        vm_lock_enter(GET_RACTOR(), GET_THREAD()->ec->fiber_ptr, vm, false, false, lev APPEND_LOCATION_PARAMS);
     }
 }
 
@@ -151,10 +155,10 @@ rb_vm_lock_enter_body_nb(unsigned int *lev APPEND_LOCATION_ARGS)
 {
     rb_vm_t *vm = GET_VM();
     if (vm_locked(vm)) {
-        vm_lock_enter(NULL, vm, true, true, lev APPEND_LOCATION_PARAMS);
+        vm_lock_enter(NULL, NULL, vm, true, true, lev APPEND_LOCATION_PARAMS);
     }
     else {
-        vm_lock_enter(GET_RACTOR(), vm, false, true, lev APPEND_LOCATION_PARAMS);
+        vm_lock_enter(GET_RACTOR(), GET_THREAD()->ec->fiber_ptr, vm, false, true, lev APPEND_LOCATION_PARAMS);
    }
 }
 
@@ -162,7 +166,7 @@ void
 rb_vm_lock_enter_body_cr(rb_ractor_t *cr, unsigned int *lev APPEND_LOCATION_ARGS)
 {
     rb_vm_t *vm = GET_VM();
-    vm_lock_enter(cr, vm, vm_locked(vm), false, lev APPEND_LOCATION_PARAMS);
+    vm_lock_enter(cr, GET_THREAD()->ec->fiber_ptr, vm, vm_locked(vm), false, lev APPEND_LOCATION_PARAMS);
 }
 
 void
@@ -174,7 +178,7 @@ rb_vm_lock_leave_body_nb(unsigned int *lev APPEND_LOCATION_ARGS)
 void
 rb_vm_lock_leave_body(unsigned int *lev APPEND_LOCATION_ARGS)
 {
-    vm_lock_leave(GET_VM(), false, lev APPEND_LOCATION_PARAMS);
+    vm_lock_leave(GET_VM(), false, lev APPEND_LOCATION_PARAMS);
 }
 
 void
@@ -183,7 +187,7 @@ rb_vm_lock_body(LOCATION_ARGS)
     rb_vm_t *vm = GET_VM();
     ASSERT_vm_unlocking();
 
-    vm_lock_enter(GET_RACTOR(), vm, false, false, &vm->ractor.sync.lock_rec APPEND_LOCATION_PARAMS);
+    vm_lock_enter(GET_RACTOR(), GET_THREAD()->ec->fiber_ptr, vm, false, false, &vm->ractor.sync.lock_rec APPEND_LOCATION_PARAMS);
 }
 
 void
@@ -201,9 +205,11 @@ vm_cond_wait(rb_vm_t *vm, rb_nativethread_cond_t *cond, unsigned long msec)
     ASSERT_vm_locking();
     unsigned int lock_rec = vm->ractor.sync.lock_rec;
     rb_ractor_t *cr = vm->ractor.sync.lock_owner;
+    rb_fiber_t *fiber = vm->ractor.sync.lock_owner_fiber;
 
     vm->ractor.sync.lock_rec = 0;
     vm->ractor.sync.lock_owner = NULL;
+    vm->ractor.sync.lock_owner_fiber = NULL;
     if (msec > 0) {
         rb_native_cond_timedwait(cond, &vm->ractor.sync.lock, msec);
     }
@@ -212,6 +218,7 @@ vm_cond_wait(rb_vm_t *vm, rb_nativethread_cond_t *cond, unsigned long msec)
     }
     vm->ractor.sync.lock_rec = lock_rec;
     vm->ractor.sync.lock_owner = cr;
+    vm->ractor.sync.lock_owner_fiber = fiber;
 }
 
 void

From 04dcf9242138ddd8bddaafb68f41709a41cb79af Mon Sep 17 00:00:00 2001
From: Luke Gruber
Date: Mon, 8 Sep 2025 11:50:30 -0400
Subject: [PATCH 2/3] Remove references to vm->ractor.sync.lock_owner

---
 thread_pthread.c | 10 +++-------
 thread_sync.c    |  2 --
 thread_win32.c   |  6 +++---
 vm_core.h        |  2 +-
 vm_sync.c        | 18 +++++-------------
 5 files changed, 12 insertions(+), 26 deletions(-)

diff --git a/thread_pthread.c b/thread_pthread.c
index a4aa493965598e..5cbd6b214a18e3 100644
--- a/thread_pthread.c
+++ b/thread_pthread.c
@@ -1401,7 +1401,7 @@ void
 rb_ractor_sched_barrier_start(rb_vm_t *vm, rb_ractor_t *cr)
 {
     VM_ASSERT(cr == GET_RACTOR());
-    VM_ASSERT(vm->ractor.sync.lock_owner == cr); // VM is locked
+    VM_ASSERT(rb_fiber_threadptr(vm->ractor.sync.lock_owner_fiber)->ractor == cr); // VM is locked
     VM_ASSERT(!vm->ractor.sched.barrier_waiting);
     VM_ASSERT(vm->ractor.sched.barrier_waiting_cnt == 0);
     VM_ASSERT(vm->ractor.sched.barrier_ractor == NULL);
@@ -1421,7 +1421,6 @@ rb_ractor_sched_barrier_start(rb_vm_t *vm, rb_ractor_t *cr)
         lock_rec = vm->ractor.sync.lock_rec;
         rb_fiber_t *fiber = vm->ractor.sync.lock_owner_fiber;
         vm->ractor.sync.lock_rec = 0;
-        vm->ractor.sync.lock_owner = NULL;
         vm->ractor.sync.lock_owner_fiber = NULL;
         rb_native_mutex_unlock(&vm->ractor.sync.lock);
 
@@ -1451,7 +1450,6 @@ rb_ractor_sched_barrier_start(rb_vm_t *vm, rb_ractor_t *cr)
         // acquire VM lock
         rb_native_mutex_lock(&vm->ractor.sync.lock);
         vm->ractor.sync.lock_rec = lock_rec;
-        vm->ractor.sync.lock_owner = cr;
         vm->ractor.sync.lock_owner_fiber = fiber;
     }
 
@@ -1503,11 +1501,10 @@ ractor_sched_barrier_join_wait_locked(rb_vm_t *vm, rb_thread_t *th)
 }
 
 void
-rb_ractor_sched_barrier_join(rb_vm_t *vm, rb_ractor_t *cr)
+rb_ractor_sched_barrier_join(rb_vm_t *vm, rb_ractor_t *cr, rb_fiber_t *fiber)
 {
     VM_ASSERT(cr->threads.sched.running != NULL); // running ractor
     VM_ASSERT(cr == GET_RACTOR());
-    VM_ASSERT(vm->ractor.sync.lock_owner == NULL); // VM is locked, but owner == NULL
     VM_ASSERT(vm->ractor.sched.barrier_waiting); // VM needs barrier sync
 
 #if USE_RUBY_DEBUG_LOG || VM_CHECK_MODE > 0
@@ -1588,14 +1585,13 @@ thread_sched_atfork(struct rb_thread_sched *sched)
 
     vm->ractor.sync.lock_rec = 0;
     th->ec->tag->lock_rec = 0;
-    vm->ractor.sync.lock_owner = NULL;
     vm->ractor.sync.lock_owner_fiber = NULL;
     rb_native_mutex_initialize(&vm->ractor.sync.lock);
     rb_native_mutex_initialize(&vm->ractor.sched.lock);
 #if VM_CHECK_MODE > 0
-    vm->ractor.sched.lock_owner = NULL;
     vm->ractor.sched.locked = false;
+    vm->ractor.sched.lock_owner = NULL;
 #endif
 
     // rb_native_cond_destroy(&vm->ractor.sched.cond);
diff --git a/thread_sync.c b/thread_sync.c
index dd54e8267195cd..18d8f506168f26 100644
--- a/thread_sync.c
+++ b/thread_sync.c
@@ -121,8 +121,6 @@ rb_mutex_num_waiting(rb_mutex_t *mutex)
     return n;
 }
 
-rb_thread_t* rb_fiber_threadptr(const rb_fiber_t *fiber);
-
 static void
 mutex_free(void *ptr)
 {
diff --git a/thread_win32.c b/thread_win32.c
index 576f617e8d5c2a..68cb3ce2419ec0 100644
--- a/thread_win32.c
+++ b/thread_win32.c
@@ -968,9 +968,9 @@ rb_ractor_sched_barrier_start(rb_vm_t *vm, rb_ractor_t *cr)
 }
 
 void
-rb_ractor_sched_barrier_join(rb_vm_t *vm, rb_ractor_t *cr)
+rb_ractor_sched_barrier_join(rb_vm_t *vm, rb_ractor_t *cr, rb_fiber_t *fiber)
 {
-    vm->ractor.sync.lock_owner = cr;
+    vm->ractor.sync.lock_owner_fiber = fiber;
     unsigned int barrier_cnt = vm->ractor.sync.barrier_cnt;
     rb_thread_t *th = GET_THREAD();
     bool running;
@@ -1005,7 +1005,7 @@ rb_ractor_sched_barrier_join(rb_vm_t *vm, rb_ractor_t *cr)
         rb_vm_ractor_blocking_cnt_dec(vm, cr, __FILE__, __LINE__);
     }
 
-    vm->ractor.sync.lock_owner = NULL;
+    vm->ractor.sync.lock_owner_fiber = NULL;
 }
 
 bool
diff --git a/vm_core.h b/vm_core.h
index ebcb031a92f635..51e66c7a9baddf 100644
--- a/vm_core.h
+++ b/vm_core.h
@@ -675,7 +675,6 @@ typedef struct rb_vm_struct {
     struct {
         // monitor
         rb_nativethread_lock_t lock;
-        struct rb_ractor_struct *lock_owner;
         struct rb_fiber_struct *lock_owner_fiber;
         unsigned int lock_rec;
 
@@ -2151,6 +2150,7 @@ void rb_execution_context_mark(const rb_execution_context_t *ec);
 void rb_fiber_close(rb_fiber_t *fib);
 void Init_native_thread(rb_thread_t *th);
 int rb_vm_check_ints_blocking(rb_execution_context_t *ec);
+rb_thread_t* rb_fiber_threadptr(const rb_fiber_t *fiber);
 
 // vm_sync.h
 void rb_vm_cond_wait(rb_vm_t *vm, rb_nativethread_cond_t *cond);
diff --git a/vm_sync.c b/vm_sync.c
index 9254fccd437647..f37b25fabec31e 100644
--- a/vm_sync.c
+++ b/vm_sync.c
@@ -6,7 +6,7 @@
 #include "vm_debug.h"
 
 void rb_ractor_sched_barrier_start(rb_vm_t *vm, rb_ractor_t *cr);
-void rb_ractor_sched_barrier_join(rb_vm_t *vm, rb_ractor_t *cr);
+void rb_ractor_sched_barrier_join(rb_vm_t *vm, rb_ractor_t *cr, rb_fiber_t *fiber);
 void rb_ractor_sched_barrier_end(rb_vm_t *vm, rb_ractor_t *cr);
 
 static bool
@@ -76,7 +76,6 @@ vm_lock_enter(rb_ractor_t *cr, rb_fiber_t *fiber, rb_vm_t *vm, bool locked, bool
 #endif
         // lock
         rb_native_mutex_lock(&vm->ractor.sync.lock);
-        VM_ASSERT(vm->ractor.sync.lock_owner == NULL);
         VM_ASSERT(vm->ractor.sync.lock_owner_fiber == NULL);
         VM_ASSERT(vm->ractor.sync.lock_rec == 0);
 
@@ -88,14 +87,12 @@ vm_lock_enter(rb_ractor_t *cr, rb_fiber_t *fiber, rb_vm_t *vm, bool locked, bool
             do {
                 VM_ASSERT(vm_need_barrier_waiting(vm));
                 RUBY_DEBUG_LOG("barrier serial:%u", vm->ractor.sched.barrier_serial);
-                rb_ractor_sched_barrier_join(vm, cr);
+                rb_ractor_sched_barrier_join(vm, cr, fiber);
             } while (vm_need_barrier_waiting(vm));
         }
 
         VM_ASSERT(vm->ractor.sync.lock_rec == 0);
-        VM_ASSERT(vm->ractor.sync.lock_owner == NULL);
         VM_ASSERT(vm->ractor.sync.lock_owner_fiber == NULL);
-        vm->ractor.sync.lock_owner = cr;
         vm->ractor.sync.lock_owner_fiber = fiber;
     }
 
@@ -103,13 +100,13 @@ vm_lock_enter(rb_ractor_t *cr, rb_fiber_t *fiber, rb_vm_t *vm, bool locked, bool
     *lev = vm->ractor.sync.lock_rec;
 
     RUBY_DEBUG_LOG2(file, line,
                     "rec:%u owner:%u", vm->ractor.sync.lock_rec,
-                    (unsigned int)rb_ractor_id(vm->ractor.sync.lock_owner));
+                    (unsigned int)rb_ractor_id(cr));
 }
 
 static void
 vm_lock_leave(rb_vm_t *vm, bool no_barrier, unsigned int *lev APPEND_LOCATION_ARGS)
 {
-    MAYBE_UNUSED(rb_ractor_t *cr = vm->ractor.sync.lock_owner);
+    MAYBE_UNUSED(rb_ractor_t *cr = rb_fiber_threadptr(vm->ractor.sync.lock_owner_fiber)->ractor);
 
     RUBY_DEBUG_LOG2(file, line, "rec:%u owner:%u%s",
                     vm->ractor.sync.lock_rec,
                     (unsigned int)rb_ractor_id(cr),
@@ -132,7 +129,6 @@ vm_lock_leave(rb_vm_t *vm, bool no_barrier, unsigned int *lev APPEND_LOCATION_AR
     *lev = vm->ractor.sync.lock_rec;
 
     if (vm->ractor.sync.lock_rec == 0) {
-        vm->ractor.sync.lock_owner = NULL;
         vm->ractor.sync.lock_owner_fiber = NULL;
         rb_native_mutex_unlock(&vm->ractor.sync.lock);
     }
@@ -204,11 +200,9 @@ vm_cond_wait(rb_vm_t *vm, rb_nativethread_cond_t *cond, unsigned long msec)
 {
     ASSERT_vm_locking();
     unsigned int lock_rec = vm->ractor.sync.lock_rec;
-    rb_ractor_t *cr = vm->ractor.sync.lock_owner;
     rb_fiber_t *fiber = vm->ractor.sync.lock_owner_fiber;
 
     vm->ractor.sync.lock_rec = 0;
-    vm->ractor.sync.lock_owner = NULL;
     vm->ractor.sync.lock_owner_fiber = NULL;
     if (msec > 0) {
         rb_native_cond_timedwait(cond, &vm->ractor.sync.lock, msec);
@@ -217,7 +211,6 @@ vm_cond_wait(rb_vm_t *vm, rb_nativethread_cond_t *cond, unsigned long msec)
         rb_native_cond_wait(cond, &vm->ractor.sync.lock);
     }
     vm->ractor.sync.lock_rec = lock_rec;
-    vm->ractor.sync.lock_owner = cr;
     vm->ractor.sync.lock_owner_fiber = fiber;
 }
 
@@ -249,12 +242,11 @@ rb_vm_barrier(void)
     RB_DEBUG_COUNTER_INC(vm_sync_barrier);
 
     if (!rb_multi_ractor_p()) {
-        // no other ractors
         return;
     }
     else {
         rb_vm_t *vm = GET_VM();
-        rb_ractor_t *cr = vm->ractor.sync.lock_owner;
+        rb_ractor_t *cr = rb_fiber_threadptr(vm->ractor.sync.lock_owner_fiber)->ractor;
 
         ASSERT_vm_locking();
         VM_ASSERT(cr == GET_RACTOR());

From dbd40ef3ba4082c97e4d7b54d1017effd39133ee Mon Sep 17 00:00:00 2001
From: Luke Gruber
Date: Mon, 8 Sep 2025 13:44:58 -0400
Subject: [PATCH 3/3] Move an assertion to an earlier point in rb_vm_barrier

---
 vm_sync.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vm_sync.c b/vm_sync.c
index f37b25fabec31e..491fe66754268b 100644
--- a/vm_sync.c
+++ b/vm_sync.c
@@ -245,10 +245,9 @@ rb_vm_barrier(void)
         return;
     }
     else {
+        ASSERT_vm_locking();
         rb_vm_t *vm = GET_VM();
         rb_ractor_t *cr = rb_fiber_threadptr(vm->ractor.sync.lock_owner_fiber)->ractor;
-
-        ASSERT_vm_locking();
         VM_ASSERT(cr == GET_RACTOR());
         VM_ASSERT(rb_ractor_status_p(cr, ractor_running));