diff --git a/CMakeLists.txt b/CMakeLists.txt
index cad25ef65..e017d1e7a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,7 @@ option(SNMALLOC_IPO "Link with IPO/LTO support" OFF)
 option(SNMALLOC_BENCHMARK_INDIVIDUAL_MITIGATIONS "Build tests and ld_preload for individual mitigations" OFF)
 option(SNMALLOC_ENABLE_DYNAMIC_LOADING "Build such that snmalloc can be dynamically loaded. This is not required for LD_PRELOAD, and will harm performance if enabled." OFF)
 option(SNMALLOC_ENABLE_WAIT_ON_ADDRESS "Use wait on address backoff strategy if it is available" ON)
+option(SNMALLOC_PTHREAD_FORK_PROTECTION "Guard against forking while allocator locks are held using pthread_atfork hooks" OFF)
 option(SNMALLOC_ENABLE_FUZZING "Enable fuzzing instrumentation tests" OFF)
 option(SNMALLOC_USE_SELF_VENDORED_STL "Avoid using system STL" OFF)
 # Options that apply only if we're not building the header-only library
@@ -133,6 +134,9 @@ int main() {
 }
 " SNMALLOC_PTHREAD_ATFORK_WORKS)

+if (SNMALLOC_PTHREAD_FORK_PROTECTION AND NOT SNMALLOC_PTHREAD_ATFORK_WORKS)
+  message(FATAL_ERROR "SNMALLOC_PTHREAD_FORK_PROTECTION requires working pthread_atfork support")
+endif()

 if (NOT MSVC AND NOT (SNMALLOC_CLEANUP STREQUAL CXX11_DESTRUCTORS))
   # If the target compiler doesn't support -nostdlib++ then we must enable C at
@@ -333,6 +337,7 @@ endfunction()
 add_as_define(SNMALLOC_QEMU_WORKAROUND)
 add_as_define(SNMALLOC_TRACING)
 add_as_define(SNMALLOC_CI_BUILD)
+add_as_define(SNMALLOC_PTHREAD_FORK_PROTECTION)
 add_as_define(SNMALLOC_PLATFORM_HAS_GETENTROPY)
 add_as_define(SNMALLOC_PTHREAD_ATFORK_WORKS)
 add_as_define(SNMALLOC_HAS_LINUX_RANDOM_H)
diff --git a/src/snmalloc/ds_aal/flaglock.h b/src/snmalloc/ds_aal/flaglock.h
index a939f8e49..c55b80bb1 100644
--- a/src/snmalloc/ds_aal/flaglock.h
+++ b/src/snmalloc/ds_aal/flaglock.h
@@ -116,7 +116,8 @@ namespace snmalloc
   public:
     FlagLock(FlagWord& lock) : lock(lock)
     {
-      while (lock.flag.exchange(true, stl::memory_order_acquire))
+      while (
+        SNMALLOC_UNLIKELY(lock.flag.exchange(true, stl::memory_order_acquire)))
       {
         // assert_not_owned_by_current_thread is only called when the first
         // acquiring is failed; which means the lock is already held somewhere
@@ -124,7 +125,7 @@
         lock.assert_not_owned_by_current_thread();
         // This loop is better for spin-waiting because it won't issue
         // expensive write operation (xchg for example).
-        while (lock.flag.load(stl::memory_order_relaxed))
+        while (SNMALLOC_UNLIKELY(lock.flag.load(stl::memory_order_relaxed)))
         {
           Aal::pause();
         }
diff --git a/src/snmalloc/ds_aal/prevent_fork.h b/src/snmalloc/ds_aal/prevent_fork.h
index 2ef0c2465..6e7635cb7 100644
--- a/src/snmalloc/ds_aal/prevent_fork.h
+++ b/src/snmalloc/ds_aal/prevent_fork.h
@@ -10,6 +10,8 @@

 namespace snmalloc
 {
+
+#ifdef SNMALLOC_PTHREAD_FORK_PROTECTION
   // This is a simple implementation of a class that can be
   // used to prevent a process from forking. Holding a lock
   // in the allocator while forking can lead to deadlocks.
@@ -43,7 +45,7 @@
    // calls would be ignored.
    static void ensure_init()
    {
-#ifdef SNMALLOC_PTHREAD_ATFORK_WORKS
+# ifdef SNMALLOC_PTHREAD_ATFORK_WORKS
      static stl::Atomic<bool> initialised{false};

      if (initialised.load(stl::memory_order_acquire))
@@ -51,7 +53,7 @@
      pthread_atfork(prefork, postfork_parent, postfork_child);

      initialised.store(true, stl::memory_order_release);
-#endif
+# endif
    };

  public:
@@ -158,4 +160,15 @@
      threads_preventing_fork--;
    }
  };
+#else
+  // The fork protection can cost a lot and it is generally not required.
+  // This is a dummy implementation of the PreventFork class that does nothing.
+  class PreventFork
+  {
+  public:
+    PreventFork() {}
+
+    ~PreventFork() {}
+  };
+#endif
 } // namespace snmalloc
\ No newline at end of file
diff --git a/src/snmalloc/global/threadalloc.h b/src/snmalloc/global/threadalloc.h
index e49c6d5c5..d037995e5 100644
--- a/src/snmalloc/global/threadalloc.h
+++ b/src/snmalloc/global/threadalloc.h
@@ -93,7 +93,7 @@
     // we need to record if we are already in that state as we will not
     // receive another teardown call, so each operation needs to release
     // the underlying data structures after the call.
-    static inline thread_local bool teardown_called{false};
+    static inline thread_local size_t times_teardown_called{0};

   public:
     /**
@@ -114,8 +114,9 @@
       if (alloc == &default_alloc)
         return;

-      teardown_called = true;
-      alloc->flush();
+      times_teardown_called++;
+      if (bits::is_pow2(times_teardown_called) || times_teardown_called < 128)
+        alloc->flush();
       AllocPool::release(alloc);
       alloc = const_cast(&default_alloc);
     }
@@ -131,7 +132,7 @@
     template<typename Restart, typename... Args>
     SNMALLOC_SLOW_PATH static auto check_init_slow(Restart r, Args... args)
     {
-      bool post_teardown = teardown_called;
+      bool post_teardown = times_teardown_called > 0;

       alloc = AllocPool::acquire();
diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h
index d83ab9dc7..5ec7bf1f3 100644
--- a/src/snmalloc/mem/corealloc.h
+++ b/src/snmalloc/mem/corealloc.h
@@ -1169,6 +1169,10 @@
     template<bool check_slabs = false>
     SNMALLOC_SLOW_PATH void dealloc_local_slabs(smallsizeclass_t sizeclass)
     {
+      if constexpr (!check_slabs)
+        if (alloc_classes[sizeclass].unused == 0)
+          return;
+
       // Return unused slabs of sizeclass_t back to global allocator
       alloc_classes[sizeclass].available.iterate([this, sizeclass](auto* meta) {
         auto domesticate =
@@ -1420,18 +1424,20 @@
       for (smallsizeclass_t sizeclass = 0; sizeclass < NUM_SMALL_SIZECLASSES;
            sizeclass++)
       {
-        dealloc_local_slabs(sizeclass);
+        dealloc_local_slabs(sizeclass);
       }

-      laden.iterate(
-        [domesticate](BackendSlabMetadata* meta) SNMALLOC_FAST_PATH_LAMBDA {
-          if (!meta->is_large())
-          {
-            meta->free_queue.validate(
-              freelist::Object::key_root, meta->as_key_tweak(), domesticate);
-          }
-        });
-
+      if constexpr (mitigations(freelist_teardown_validate))
+      {
+        laden.iterate(
+          [domesticate](BackendSlabMetadata* meta) SNMALLOC_FAST_PATH_LAMBDA {
+            if (!meta->is_large())
+            {
+              meta->free_queue.validate(
+                freelist::Object::key_root, meta->as_key_tweak(), domesticate);
+            }
+          });
+      }

       // Set the remote_dealloc_cache to immediately slow path.
       remote_dealloc_cache.capacity = 0;
diff --git a/src/snmalloc/mem/pooled.h b/src/snmalloc/mem/pooled.h
index 489d72f9d..ca4e94101 100644
--- a/src/snmalloc/mem/pooled.h
+++ b/src/snmalloc/mem/pooled.h
@@ -51,13 +51,21 @@
   public:
     void set_in_use()
     {
-      if (in_use.exchange(true))
+#ifndef NDEBUG
+      if (in_use.exchange(true, stl::memory_order_acq_rel))
         error("Critical error: double use of Pooled Type!");
+#else
+      in_use.store(true, stl::memory_order_relaxed);
+#endif
     }

     void reset_in_use()
     {
-      in_use.store(false);
+#ifndef NDEBUG
+      in_use.store(false, stl::memory_order_release);
+#else
+      in_use.store(false, stl::memory_order_relaxed);
+#endif
     }

     bool debug_is_in_use()
diff --git a/src/test/func/protect_fork/protect_fork.cc b/src/test/func/protect_fork/protect_fork.cc
index 28a8f4c7a..77227b202 100644
--- a/src/test/func/protect_fork/protect_fork.cc
+++ b/src/test/func/protect_fork/protect_fork.cc
@@ -1,6 +1,4 @@
 #include
-#include <snmalloc/snmalloc.h>
-
 #ifndef SNMALLOC_PTHREAD_ATFORK_WORKS
 int main()
 {
@@ -9,7 +7,9 @@
 }
 #else

+# define SNMALLOC_PTHREAD_FORK_PROTECTION
 # include
+# include <snmalloc/snmalloc.h>
 # include

 void simulate_allocation()
diff --git a/src/test/perf/post_teardown/post-teardown.cc b/src/test/perf/post_teardown/post-teardown.cc
new file mode 100644
index 000000000..ca9296d71
--- /dev/null
+++ b/src/test/perf/post_teardown/post-teardown.cc
@@ -0,0 +1,50 @@
+#include <iostream>
+#include <snmalloc/snmalloc.h>
+#include <test/measuretime.h>
+#include <test/setup.h>
+#include <vector>
+
+using namespace snmalloc;
+
+void fill(std::vector<void*>& out, size_t count, size_t size)
+{
+  out.reserve(count);
+  for (size_t i = 0; i < count; i++)
+  {
+    out.push_back(snmalloc::alloc(size));
+  }
+}
+
+void drain(const char* label, std::vector<void*>& vec, size_t size)
+{
+  MeasureTime m;
+  m << label << " (" << vec.size() << " x " << size << " B)";
+  for (void* p : vec)
+  {
+    snmalloc::dealloc(p, size);
+  }
+  vec.clear();
+}
+
+int main(int, char**)
+{
+  setup();
+  // Issue #809: perf when many objects are freed after the allocator has
+  // already been finalised (e.g. static/global teardown). Keep counts equal
+  // for baseline and post-teardown to isolate the teardown cost.
+  constexpr size_t alloc_count = 1 << 18;
+  constexpr size_t obj_size = 64;
+
+  std::vector<void*> ptrs;
+  fill(ptrs, alloc_count, obj_size);
+  drain("Baseline dealloc before finalise", ptrs, obj_size);
+
+  // Simulate the allocator already being torn down before remaining frees
+  // (post-main / static destruction path from #809).
+  ThreadAlloc::teardown();
+
+  fill(ptrs, alloc_count, obj_size);
+  drain("Immediate dealloc after teardown", ptrs, obj_size);
+
+  return 0;
+}
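
Note on the new `SNMALLOC_PTHREAD_FORK_PROTECTION` option: with it OFF (the default), `prevent_fork.h` compiles the do-nothing `PreventFork` stub added above, so lock paths pay no fork-protection cost; with it ON, CMake insists on working `pthread_atfork` support (the new `FATAL_ERROR` check) and the real guard registers `prefork`/`postfork_parent`/`postfork_child` handlers and tracks how many threads are inside allocator critical sections (`threads_preventing_fork`). The sketch below is a minimal, self-contained illustration of that general pattern only, not snmalloc's implementation; all identifiers in it (`PreventForkGuard`, `threads_in_critical_section`, `fork_in_progress`) are hypothetical.

```cpp
// Minimal sketch of a pthread_atfork-based "don't fork while a lock is held"
// guard.  Illustrative only; identifiers here are hypothetical.
#include <atomic>
#include <pthread.h>

namespace sketch
{
  // Threads currently inside an allocator critical section.
  std::atomic<int> threads_in_critical_section{0};
  // Set by the prefork handler; new critical sections back off while true.
  std::atomic<bool> fork_in_progress{false};

  void prefork()
  {
    fork_in_progress.store(true, std::memory_order_release);
    // Wait for in-flight critical sections to drain so the child cannot
    // inherit a lock owned by a thread that will not exist after fork().
    while (threads_in_critical_section.load(std::memory_order_acquire) != 0)
    {
    }
  }

  void postfork() // installed for both parent and child
  {
    fork_in_progress.store(false, std::memory_order_release);
  }

  void ensure_atfork_registered()
  {
    // Thread-safe one-time registration via a magic static.
    static bool registered = [] {
      pthread_atfork(prefork, postfork, postfork);
      return true;
    }();
    (void)registered;
  }

  // RAII guard: while one of these is alive, fork() is delayed.
  struct PreventForkGuard
  {
    PreventForkGuard()
    {
      ensure_atfork_registered();
      for (;;)
      {
        threads_in_critical_section.fetch_add(1, std::memory_order_acq_rel);
        if (!fork_in_progress.load(std::memory_order_acquire))
          return; // safe to enter the critical section
        // A fork is in flight: back out and wait for it to finish.
        threads_in_critical_section.fetch_sub(1, std::memory_order_acq_rel);
        while (fork_in_progress.load(std::memory_order_acquire))
        {
        }
      }
    }

    ~PreventForkGuard()
    {
      threads_in_critical_section.fetch_sub(1, std::memory_order_acq_rel);
    }
  };
} // namespace sketch

int main()
{
  // Typically taken around an allocator critical section; in snmalloc the
  // real guard is only compiled in when SNMALLOC_PTHREAD_FORK_PROTECTION is
  // defined.
  sketch::PreventForkGuard guard;
  // ... work that must not be forked over ...
  return 0;
}
```

In a build, the feature is opted into with `-DSNMALLOC_PTHREAD_FORK_PROTECTION=ON`; the protect_fork test defines the macro itself before including snmalloc so it always exercises the real guard.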
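The `ThreadAlloc::teardown()` change replaces the `teardown_called` flag with a counter and only flushes when `bits::is_pow2(times_teardown_called) || times_teardown_called < 128` holds, so a thread that keeps being revived and torn down again (the issue #809 pattern) flushes on each of its first 127 teardowns but only on power-of-two counts after that. The small standalone program below just counts how often that condition fires; it is illustrative, not part of the patch, and uses a local `is_pow2` stand-in for `bits::is_pow2`.

```cpp
#include <cstddef>
#include <iostream>

// Local stand-in for snmalloc's bits::is_pow2 (illustrative only).
static bool is_pow2(size_t x)
{
  return x != 0 && (x & (x - 1)) == 0;
}

int main()
{
  size_t flushes = 0;
  for (size_t times_teardown_called = 1; times_teardown_called <= 1024;
       times_teardown_called++)
  {
    // The flush throttle from ThreadAlloc::teardown in this patch.
    if (is_pow2(times_teardown_called) || times_teardown_called < 128)
      flushes++;
  }
  std::cout << "flushes in first 1024 teardowns: " << flushes << "\n";
  return 0;
}
```

Within the first 1024 teardowns this flushes 131 times: on calls 1-127, then only on 128, 256, 512 and 1024.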