diff --git a/CMakeLists.txt b/CMakeLists.txt
index cad25ef65..e017d1e7a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,7 @@ option(SNMALLOC_IPO "Link with IPO/LTO support" OFF)
 option(SNMALLOC_BENCHMARK_INDIVIDUAL_MITIGATIONS "Build tests and ld_preload for individual mitigations" OFF)
 option(SNMALLOC_ENABLE_DYNAMIC_LOADING "Build such that snmalloc can be dynamically loaded. This is not required for LD_PRELOAD, and will harm performance if enabled." OFF)
 option(SNMALLOC_ENABLE_WAIT_ON_ADDRESS "Use wait on address backoff strategy if it is available" ON)
+option(SNMALLOC_PTHREAD_FORK_PROTECTION "Guard against forking while allocator locks are held using pthread_atfork hooks" OFF)
 option(SNMALLOC_ENABLE_FUZZING "Enable fuzzing instrumentation tests" OFF)
 option(SNMALLOC_USE_SELF_VENDORED_STL "Avoid using system STL" OFF)
 # Options that apply only if we're not building the header-only library
@@ -133,6 +134,9 @@ int main() {
 }
 " SNMALLOC_PTHREAD_ATFORK_WORKS)

+if (SNMALLOC_PTHREAD_FORK_PROTECTION AND NOT SNMALLOC_PTHREAD_ATFORK_WORKS)
+  message(FATAL_ERROR "SNMALLOC_PTHREAD_FORK_PROTECTION requires working pthread_atfork support")
+endif()

 if (NOT MSVC AND NOT (SNMALLOC_CLEANUP STREQUAL CXX11_DESTRUCTORS))
   # If the target compiler doesn't support -nostdlib++ then we must enable C at
@@ -333,6 +337,7 @@ endfunction()
 add_as_define(SNMALLOC_QEMU_WORKAROUND)
 add_as_define(SNMALLOC_TRACING)
 add_as_define(SNMALLOC_CI_BUILD)
+add_as_define(SNMALLOC_PTHREAD_FORK_PROTECTION)
 add_as_define(SNMALLOC_PLATFORM_HAS_GETENTROPY)
 add_as_define(SNMALLOC_PTHREAD_ATFORK_WORKS)
 add_as_define(SNMALLOC_HAS_LINUX_RANDOM_H)
diff --git a/src/snmalloc/ds_aal/flaglock.h b/src/snmalloc/ds_aal/flaglock.h
index a939f8e49..c55b80bb1 100644
--- a/src/snmalloc/ds_aal/flaglock.h
+++ b/src/snmalloc/ds_aal/flaglock.h
@@ -116,7 +116,8 @@ namespace snmalloc
   public:
     FlagLock(FlagWord& lock) : lock(lock)
     {
-      while (lock.flag.exchange(true, stl::memory_order_acquire))
+      while (
+        SNMALLOC_UNLIKELY(lock.flag.exchange(true, stl::memory_order_acquire)))
       {
         // assert_not_owned_by_current_thread is only called when the first
         // acquiring is failed; which means the lock is already held somewhere
@@ -124,7 +125,7 @@
         lock.assert_not_owned_by_current_thread();
         // This loop is better for spin-waiting because it won't issue
         // expensive write operation (xchg for example).
-        while (lock.flag.load(stl::memory_order_relaxed))
+        while (SNMALLOC_UNLIKELY(lock.flag.load(stl::memory_order_relaxed)))
         {
           Aal::pause();
         }
diff --git a/src/snmalloc/ds_aal/prevent_fork.h b/src/snmalloc/ds_aal/prevent_fork.h
index 2ef0c2465..6e7635cb7 100644
--- a/src/snmalloc/ds_aal/prevent_fork.h
+++ b/src/snmalloc/ds_aal/prevent_fork.h
@@ -10,6 +10,8 @@

 namespace snmalloc
 {
+
+#ifdef SNMALLOC_PTHREAD_FORK_PROTECTION
   // This is a simple implementation of a class that can be
   // used to prevent a process from forking. Holding a lock
   // in the allocator while forking can lead to deadlocks.
@@ -43,7 +45,7 @@
    // calls would be ignored.
    static void ensure_init()
    {
-#ifdef SNMALLOC_PTHREAD_ATFORK_WORKS
+# ifdef SNMALLOC_PTHREAD_ATFORK_WORKS
      static stl::Atomic<bool> initialised{false};

      if (initialised.load(stl::memory_order_acquire))
@@ -51,7 +53,7 @@
      pthread_atfork(prefork, postfork_parent, postfork_child);

      initialised.store(true, stl::memory_order_release);
-#endif
+# endif
    };

  public:
@@ -158,4 +160,15 @@
      threads_preventing_fork--;
    }
  };
+#else
+  // The fork protection can cost a lot and it is generally not required.
+  // This is a dummy implementation of the PreventFork class that does nothing.
+  class PreventFork
+  {
+  public:
+    PreventFork() {}
+
+    ~PreventFork() {}
+  };
+#endif
 } // namespace snmalloc
\ No newline at end of file
diff --git a/src/snmalloc/global/threadalloc.h b/src/snmalloc/global/threadalloc.h
index e49c6d5c5..d037995e5 100644
--- a/src/snmalloc/global/threadalloc.h
+++ b/src/snmalloc/global/threadalloc.h
@@ -93,7 +93,7 @@
     // we need to record if we are already in that state as we will not
     // receive another teardown call, so each operation needs to release
     // the underlying data structures after the call.
-    static inline thread_local bool teardown_called{false};
+    static inline thread_local size_t times_teardown_called{0};

   public:
     /**
@@ -114,8 +114,9 @@
       if (alloc == &default_alloc)
         return;

-      teardown_called = true;
-      alloc->flush();
+      times_teardown_called++;
+      if (bits::is_pow2(times_teardown_called) || times_teardown_called < 128)
+        alloc->flush();
       AllocPool::release(alloc);
       alloc = const_cast(&default_alloc);
     }
@@ -131,7 +132,7 @@
     template<typename Restart, typename... Args>
     SNMALLOC_SLOW_PATH static auto check_init_slow(Restart r, Args... args)
     {
-      bool post_teardown = teardown_called;
+      bool post_teardown = times_teardown_called > 0;

       alloc = AllocPool::acquire();
diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h
index d83ab9dc7..5ec7bf1f3 100644
--- a/src/snmalloc/mem/corealloc.h
+++ b/src/snmalloc/mem/corealloc.h
@@ -1169,6 +1169,10 @@
     template<bool check_slabs = false>
     SNMALLOC_SLOW_PATH void dealloc_local_slabs(smallsizeclass_t sizeclass)
     {
+      if constexpr (!check_slabs)
+        if (alloc_classes[sizeclass].unused == 0)
+          return;
+
       // Return unused slabs of sizeclass_t back to global allocator
       alloc_classes[sizeclass].available.iterate([this, sizeclass](auto* meta) {
         auto domesticate =
@@ -1420,18 +1424,20 @@
       for (smallsizeclass_t sizeclass = 0; sizeclass < NUM_SMALL_SIZECLASSES;
            sizeclass++)
       {
-        dealloc_local_slabs(sizeclass);
+        dealloc_local_slabs(sizeclass);
       }

-      laden.iterate(
-        [domesticate](BackendSlabMetadata* meta) SNMALLOC_FAST_PATH_LAMBDA {
-          if (!meta->is_large())
-          {
-            meta->free_queue.validate(
-              freelist::Object::key_root, meta->as_key_tweak(), domesticate);
-          }
-        });
-
+      if constexpr (mitigations(freelist_teardown_validate))
+      {
+        laden.iterate(
+          [domesticate](BackendSlabMetadata* meta) SNMALLOC_FAST_PATH_LAMBDA {
+            if (!meta->is_large())
+            {
+              meta->free_queue.validate(
+                freelist::Object::key_root, meta->as_key_tweak(), domesticate);
+            }
+          });
+      }

       // Set the remote_dealloc_cache to immediately slow path.
       remote_dealloc_cache.capacity = 0;
diff --git a/src/snmalloc/mem/pooled.h b/src/snmalloc/mem/pooled.h
index 489d72f9d..ca4e94101 100644
--- a/src/snmalloc/mem/pooled.h
+++ b/src/snmalloc/mem/pooled.h
@@ -51,13 +51,21 @@
   public:
     void set_in_use()
     {
-      if (in_use.exchange(true))
+#ifndef NDEBUG
+      if (in_use.exchange(true, stl::memory_order_acq_rel))
         error("Critical error: double use of Pooled Type!");
+#else
+      in_use.store(true, stl::memory_order_relaxed);
+#endif
     }

     void reset_in_use()
     {
-      in_use.store(false);
+#ifndef NDEBUG
+      in_use.store(false, stl::memory_order_release);
+#else
+      in_use.store(false, stl::memory_order_relaxed);
+#endif
     }

     bool debug_is_in_use()
diff --git a/src/test/func/protect_fork/protect_fork.cc b/src/test/func/protect_fork/protect_fork.cc
index 28a8f4c7a..77227b202 100644
--- a/src/test/func/protect_fork/protect_fork.cc
+++ b/src/test/func/protect_fork/protect_fork.cc
@@ -1,6 +1,4 @@
 #include
-#include <snmalloc/snmalloc.h>
-
 #ifndef SNMALLOC_PTHREAD_ATFORK_WORKS
 int main()
 {
@@ -9,7 +7,9 @@
 }
 #else

+# define SNMALLOC_PTHREAD_FORK_PROTECTION
 # include
+# include <snmalloc/snmalloc.h>
 # include

 void simulate_allocation()
diff --git a/src/test/perf/post_teardown/post-teardown.cc b/src/test/perf/post_teardown/post-teardown.cc
new file mode 100644
index 000000000..ca9296d71
--- /dev/null
+++ b/src/test/perf/post_teardown/post-teardown.cc
@@ -0,0 +1,50 @@
+#include <iostream>
+#include <snmalloc/snmalloc.h>
+#include <test/measuretime.h>
+#include <test/setup.h>
+#include <vector>
+
+using namespace snmalloc;
+
+void fill(std::vector<void*>& out, size_t count, size_t size)
+{
+  out.reserve(count);
+  for (size_t i = 0; i < count; i++)
+  {
+    out.push_back(snmalloc::alloc(size));
+  }
+}
+
+void drain(const char* label, std::vector<void*>& vec, size_t size)
+{
+  MeasureTime m;
+  m << label << " (" << vec.size() << " x " << size << " B)";
+  for (void* p : vec)
+  {
+    snmalloc::dealloc(p, size);
+  }
+  vec.clear();
+}
+
+int main(int, char**)
+{
+  setup();
+  // Issue #809: perf when many objects are freed after the allocator has
+  // already been finalised (e.g. static/global teardown). Keep counts equal
+  // for baseline and post-teardown to isolate the teardown cost.
+  constexpr size_t alloc_count = 1 << 18;
+  constexpr size_t obj_size = 64;
+
+  std::vector<void*> ptrs;
+  fill(ptrs, alloc_count, obj_size);
+  drain("Baseline dealloc before finalise", ptrs, obj_size);
+
+  // Simulate the allocator already being torn down before remaining frees
+  // (post-main / static destruction path from #809).
+  ThreadAlloc::teardown();
+
+  fill(ptrs, alloc_count, obj_size);
+  drain("Immediate dealloc after teardown", ptrs, obj_size);
+
+  return 0;
+}
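
Note on the new `SNMALLOC_PTHREAD_FORK_PROTECTION` option: with it OFF (the default), `prevent_fork.h` compiles the do-nothing `PreventFork` stub added above, so lock paths pay no fork-protection cost; with it ON, CMake insists on working `pthread_atfork` support (the new `FATAL_ERROR` check) and the real guard registers `prefork`/`postfork_parent`/`postfork_child` handlers and tracks how many threads are inside allocator critical sections (`threads_preventing_fork`). The sketch below is a minimal, self-contained illustration of that general pattern only, not snmalloc's implementation; all identifiers in it (`PreventForkGuard`, `threads_in_critical_section`, `fork_in_progress`) are hypothetical.

```cpp
// Minimal sketch of a pthread_atfork-based "don't fork while a lock is held"
// guard.  Illustrative only; identifiers here are hypothetical.
#include <atomic>
#include <pthread.h>

namespace sketch
{
  // Threads currently inside an allocator critical section.
  std::atomic<int> threads_in_critical_section{0};
  // Set by the prefork handler; new critical sections back off while true.
  std::atomic<bool> fork_in_progress{false};

  void prefork()
  {
    fork_in_progress.store(true, std::memory_order_release);
    // Wait for in-flight critical sections to drain so the child cannot
    // inherit a lock owned by a thread that will not exist after fork().
    while (threads_in_critical_section.load(std::memory_order_acquire) != 0)
    {
    }
  }

  void postfork() // installed for both parent and child
  {
    fork_in_progress.store(false, std::memory_order_release);
  }

  void ensure_atfork_registered()
  {
    // Thread-safe one-time registration via a magic static.
    static bool registered = [] {
      pthread_atfork(prefork, postfork, postfork);
      return true;
    }();
    (void)registered;
  }

  // RAII guard: while one of these is alive, fork() is delayed.
  struct PreventForkGuard
  {
    PreventForkGuard()
    {
      ensure_atfork_registered();
      for (;;)
      {
        threads_in_critical_section.fetch_add(1, std::memory_order_acq_rel);
        if (!fork_in_progress.load(std::memory_order_acquire))
          return; // safe to enter the critical section
        // A fork is in flight: back out and wait for it to finish.
        threads_in_critical_section.fetch_sub(1, std::memory_order_acq_rel);
        while (fork_in_progress.load(std::memory_order_acquire))
        {
        }
      }
    }

    ~PreventForkGuard()
    {
      threads_in_critical_section.fetch_sub(1, std::memory_order_acq_rel);
    }
  };
} // namespace sketch

int main()
{
  // Typically taken around an allocator critical section; in snmalloc the
  // real guard is only compiled in when SNMALLOC_PTHREAD_FORK_PROTECTION is
  // defined.
  sketch::PreventForkGuard guard;
  // ... work that must not be forked over ...
  return 0;
}
```

In a build, the feature is opted into with `-DSNMALLOC_PTHREAD_FORK_PROTECTION=ON`; the protect_fork test defines the macro itself before including snmalloc so it always exercises the real guard.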
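The `ThreadAlloc::teardown()` change replaces the `teardown_called` flag with a counter and only flushes when `bits::is_pow2(times_teardown_called) || times_teardown_called < 128` holds, so a thread that keeps being revived and torn down again (the issue #809 pattern) flushes on each of its first 127 teardowns but only on power-of-two counts after that. The small standalone program below just counts how often that condition fires; it is illustrative, not part of the patch, and uses a local `is_pow2` stand-in for `bits::is_pow2`.

```cpp
#include <cstddef>
#include <iostream>

// Local stand-in for snmalloc's bits::is_pow2 (illustrative only).
static bool is_pow2(size_t x)
{
  return x != 0 && (x & (x - 1)) == 0;
}

int main()
{
  size_t flushes = 0;
  for (size_t times_teardown_called = 1; times_teardown_called <= 1024;
       times_teardown_called++)
  {
    // The flush throttle from ThreadAlloc::teardown in this patch.
    if (is_pow2(times_teardown_called) || times_teardown_called < 128)
      flushes++;
  }
  std::cout << "flushes in first 1024 teardowns: " << flushes << "\n";
  return 0;
}
```

Within the first 1024 teardowns this flushes 131 times: on calls 1-127, then only on 128, 256, 512 and 1024.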