From d498cc75b570348c4d4be2ef5c2aba61d060650d Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Wed, 11 Feb 2026 08:26:48 -0600
Subject: [PATCH 1/5] Sparse matrix release queue

---
 lib/mkl/wrappers_sparse.jl | 40 ++++++++++++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/lib/mkl/wrappers_sparse.jl b/lib/mkl/wrappers_sparse.jl
index bb17907a..0fc39091 100644
--- a/lib/mkl/wrappers_sparse.jl
+++ b/lib/mkl/wrappers_sparse.jl
@@ -1,16 +1,41 @@
+# Deferred release queue for sparse matrix handles.
+# Finalizers run at arbitrary GC points, but onemklXsparse_release_matrix_handle submits
+# work to the SYCL queue. Using the same queue from a finalizer and from regular code
+# concurrently is not safe and causes ZE_RESULT_ERROR_DEVICE_LOST / ZE_RESULT_ERROR_UNKNOWN.
+# Instead, finalizers only enqueue handles here; flush_deferred_sparse_releases() frees them.
+const _deferred_sparse_handles = Vector{matrix_handle_t}()
+const _deferred_sparse_handles_lock = ReentrantLock()
+
 function sparse_release_matrix_handle(A::oneAbstractSparseMatrix)
-    return if A.handle !== nothing
+    if A.handle !== nothing
+        lock(_deferred_sparse_handles_lock) do
+            push!(_deferred_sparse_handles, A.handle)
+        end
+    end
+end
+
+function flush_deferred_sparse_releases()
+    handles = lock(_deferred_sparse_handles_lock) do
+        if isempty(_deferred_sparse_handles)
+            return matrix_handle_t[]
+        end
+        h = copy(_deferred_sparse_handles)
+        empty!(_deferred_sparse_handles)
+        return h
+    end
+    isempty(handles) && return
+    dev = device()
+    ctx = context()
+    queue = global_queue(ctx, dev)
+    for handle in handles
         try
-            queue = global_queue(context(A.nzVal), device(A.nzVal))
-            handle_ptr = Ref{matrix_handle_t}(A.handle)
+            handle_ptr = Ref{matrix_handle_t}(handle)
             onemklXsparse_release_matrix_handle(sycl_queue(queue), handle_ptr)
-            # Only synchronize after successful release to ensure completion
-            synchronize(queue)
         catch err
-            # Don't let finalizer errors crash the program
             @warn "Error releasing sparse matrix handle" exception = err
         end
     end
+    synchronize(queue)
 end
 
 for (fname, elty, intty) in ((:onemklSsparse_set_csr_data   , :Float32   , :Int32),
@@ -27,6 +52,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_csr_data   , :Float32   , :Int3
                 rowPtr::oneVector{$intty}, colVal::oneVector{$intty},
                 nzVal::oneVector{$elty}, dims::NTuple{2, Int}
             )
+            flush_deferred_sparse_releases()
             handle_ptr = Ref{matrix_handle_t}()
             onemklXsparse_init_matrix_handle(handle_ptr)
             m, n = dims
@@ -47,6 +73,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_csr_data   , :Float32   , :Int3
                 colPtr::oneVector{$intty}, rowVal::oneVector{$intty},
                 nzVal::oneVector{$elty}, dims::NTuple{2, Int}
             )
+            flush_deferred_sparse_releases()
             queue = global_queue(context(nzVal), device(nzVal))
             handle_ptr = Ref{matrix_handle_t}()
             onemklXsparse_init_matrix_handle(handle_ptr)
@@ -106,6 +133,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_coo_data   , :Float32   , :Int3
                              (:onemklZsparse_set_coo_data_64, :ComplexF64, :Int64))
     @eval begin
         function oneSparseMatrixCOO(A::SparseMatrixCSC{$elty, $intty})
+            flush_deferred_sparse_releases()
             handle_ptr = Ref{matrix_handle_t}()
             onemklXsparse_init_matrix_handle(handle_ptr)
             m, n = size(A)

From cfb8cd1b2abd2d99e8634e902e585e75199f592a Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Wed, 11 Feb 2026 08:29:04 -0600
Subject: [PATCH 2/5] Memory tracking for GC

---
 src/pool.jl | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/src/pool.jl b/src/pool.jl
index a2bfc250..8b776eea 100644
--- a/src/pool.jl
+++ b/src/pool.jl
@@ -1,8 +1,38 @@
+# Track total allocated GPU memory (device + shared buffers) for proactive GC.
+# This mirrors AMDGPU.jl's approach: trigger GC before OOM so that finalizers
+# can free stale GPU buffers that Julia's GC hasn't collected yet (Julia's GC
+# only sees CPU memory pressure, not GPU memory pressure).
+const _allocated_bytes = Threads.Atomic{Int64}(0)
+const _total_mem_cache = Threads.Atomic{Int64}(0)
+
+function _get_total_mem(dev)
+    cached = _total_mem_cache[]
+    cached > 0 && return cached
+    total = only(oneL0.memory_properties(dev)).totalSize
+    Threads.atomic_cas!(_total_mem_cache, Int64(0), Int64(total))
+    return _total_mem_cache[]
+end
+
+function _maybe_gc(dev, bytes)
+    allocated = _allocated_bytes[]
+    allocated <= 0 && return
+    total_mem = _get_total_mem(dev)
+    if allocated + bytes > total_mem * 0.8
+        # Full GC to collect old-generation objects whose finalizers free GPU memory.
+        # GC.gc(false) only does minor collection which won't reclaim promoted objects.
+        GC.gc(true)
+    elseif allocated + bytes > total_mem * 0.4
+        GC.gc(false)
+    end
+end
+
 function allocate(::Type{oneL0.DeviceBuffer}, ctx, dev, bytes::Int, alignment::Int)
     bytes == 0 && return oneL0.DeviceBuffer(ZE_NULL, bytes, ctx, dev)
 
+    _maybe_gc(dev, bytes)
     buf = device_alloc(ctx, dev, bytes, alignment)
     make_resident(ctx, dev, buf)
+    Threads.atomic_add!(_allocated_bytes, Int64(bytes))
 
     return buf
 end
@@ -12,8 +42,10 @@ function allocate(::Type{oneL0.SharedBuffer}, ctx, dev, bytes::Int, alignment::I
 
     # TODO: support cross-device shared buffers (by setting `dev=nothing`)
 
+    _maybe_gc(dev, bytes)
     buf = shared_alloc(ctx, dev, bytes, alignment)
     make_resident(ctx, dev, buf)
+    Threads.atomic_add!(_allocated_bytes, Int64(bytes))
 
     return buf
 end
@@ -26,6 +58,10 @@ end
 function release(buf::oneL0.AbstractBuffer)
     sizeof(buf) == 0 && return
 
+    if buf isa oneL0.DeviceBuffer || buf isa oneL0.SharedBuffer
+        Threads.atomic_sub!(_allocated_bytes, Int64(sizeof(buf)))
+    end
+
     # XXX: is it necessary to evice memory if we are going to free it?
     #      this is racy, because eviction is not queue-ordered, and
     #      we don't want to synchronize inside what could have been a

From ed03fb0c9e9c378f7d63125cbe50e5d0b4fc8a31 Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Wed, 11 Feb 2026 13:31:40 -0600
Subject: [PATCH 3/5] Flush deferred sparse releases during memory reclaim

---
 lib/level-zero/utils.jl | 24 ++++++++++++++++++++++++
 lib/mkl/oneMKL.jl       |  5 +++++
 src/pool.jl             |  5 ++++-
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/lib/level-zero/utils.jl b/lib/level-zero/utils.jl
index 48a45d07..eaabea58 100644
--- a/lib/level-zero/utils.jl
+++ b/lib/level-zero/utils.jl
@@ -1,5 +1,23 @@
 isdebug(group) = Base.CoreLogging.current_logger_for_env(Base.CoreLogging.Debug, group, oneL0) !== nothing
 
+# Callbacks invoked during memory reclamation (e.g., flushing deferred MKL sparse handle
+# releases). Other modules such as oneMKL register cleanup functions here; they run from
+# retry_reclaim and proactive GC, and any exception a callback throws is discarded.
+const _reclaim_callbacks = Function[]
+
+function register_reclaim_callback!(f::Function)
+    push!(_reclaim_callbacks, f)
+end
+
+function _run_reclaim_callbacks()
+    for cb in _reclaim_callbacks
+        try
+            cb()
+        catch
+        end
+    end
+end
+
 function retry_reclaim(f, isfailed)
     ret = f()
 
@@ -11,6 +29,12 @@ function retry_reclaim(f, isfailed)
                 GC.gc(false)
             elseif phase == 2
                 GC.gc(true)
+            elseif phase == 3
+                # The GC passes above may have run finalizers that only queued their
+                # resource releases (e.g., MKL sparse handles). Run the registered
+                # callbacks to perform those releases, then do one more full GC.
+                _run_reclaim_callbacks()
+                GC.gc(true)
             else
                 break
             end
diff --git a/lib/mkl/oneMKL.jl b/lib/mkl/oneMKL.jl
index 533edf04..64f54358 100644
--- a/lib/mkl/oneMKL.jl
+++ b/lib/mkl/oneMKL.jl
@@ -31,6 +31,11 @@ include("linalg.jl")
 include("interfaces.jl")
 include("fft.jl")
 
+# Register the deferred sparse handle flush as a memory reclaim callback so that OOM
+# recovery (retry_reclaim) and proactive GC (_maybe_gc) can release the sparse matrix
+# handles (and any MKL buffers tied to them) whose release was deferred by finalizers.
+oneL0.register_reclaim_callback!(flush_deferred_sparse_releases)
+
 function version()
     major = Ref{Int64}()
     minor = Ref{Int64}()
diff --git a/src/pool.jl b/src/pool.jl
index 8b776eea..58cefb82 100644
--- a/src/pool.jl
+++ b/src/pool.jl
@@ -18,8 +18,11 @@ function _maybe_gc(dev, bytes)
     allocated <= 0 && return
     total_mem = _get_total_mem(dev)
     if allocated + bytes > total_mem * 0.8
+        # Flush deferred resource releases (e.g., MKL sparse handles) from previous GC
+        # cycles first — these are safe to release now because they were deferred earlier.
+        # Do this BEFORE GC to avoid racing with new finalizers.
+        oneL0._run_reclaim_callbacks()
         # Full GC to collect old-generation objects whose finalizers free GPU memory.
-        # GC.gc(false) only does minor collection which won't reclaim promoted objects.
         GC.gc(true)
     elseif allocated + bytes > total_mem * 0.4
         GC.gc(false)

From 2f1a2fbf3c66720885dfb0c7db31d575553ead3b Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Wed, 11 Feb 2026 13:34:30 -0600
Subject: [PATCH 4/5] Format

---
 lib/level-zero/utils.jl    | 3 ++-
 lib/mkl/wrappers_sparse.jl | 4 ++--
 src/pool.jl                | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/level-zero/utils.jl b/lib/level-zero/utils.jl
index eaabea58..bf0153b8 100644
--- a/lib/level-zero/utils.jl
+++ b/lib/level-zero/utils.jl
@@ -6,7 +6,7 @@ isdebug(group) = Base.CoreLogging.current_logger_for_env(Base.CoreLogging.Debug,
 const _reclaim_callbacks = Function[]
 
 function register_reclaim_callback!(f::Function)
-    push!(_reclaim_callbacks, f)
+    return push!(_reclaim_callbacks, f)
 end
 
 function _run_reclaim_callbacks()
@@ -16,6 +16,7 @@ function _run_reclaim_callbacks()
         catch
         end
     end
+    return
 end
 
 function retry_reclaim(f, isfailed)
diff --git a/lib/mkl/wrappers_sparse.jl b/lib/mkl/wrappers_sparse.jl
index 0fc39091..8e58956b 100644
--- a/lib/mkl/wrappers_sparse.jl
+++ b/lib/mkl/wrappers_sparse.jl
@@ -7,7 +7,7 @@ const _deferred_sparse_handles = Vector{matrix_handle_t}()
 const _deferred_sparse_handles_lock = ReentrantLock()
 
 function sparse_release_matrix_handle(A::oneAbstractSparseMatrix)
-    if A.handle !== nothing
+    return if A.handle !== nothing
         lock(_deferred_sparse_handles_lock) do
             push!(_deferred_sparse_handles, A.handle)
         end
@@ -35,7 +35,7 @@ function flush_deferred_sparse_releases()
             @warn "Error releasing sparse matrix handle" exception = err
         end
     end
-    synchronize(queue)
+    return synchronize(queue)
 end
 
 for (fname, elty, intty) in ((:onemklSsparse_set_csr_data   , :Float32   , :Int32),
diff --git a/src/pool.jl b/src/pool.jl
index 58cefb82..165a7f07 100644
--- a/src/pool.jl
+++ b/src/pool.jl
@@ -17,7 +17,7 @@ function _maybe_gc(dev, bytes)
     allocated = _allocated_bytes[]
     allocated <= 0 && return
     total_mem = _get_total_mem(dev)
-    if allocated + bytes > total_mem * 0.8
+    return if allocated + bytes > total_mem * 0.8
         # Flush deferred resource releases (e.g., MKL sparse handles) from previous GC
         # cycles first — these are safe to release now because they were deferred earlier.
         # Do this BEFORE GC to avoid racing with new finalizers.

From 802bb5075eeef0d2bd4fdda359fde654bc352f46 Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Wed, 11 Feb 2026 13:38:15 -0600
Subject: [PATCH 5/5] Bump version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index bd0179c2..f84b9c27 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "oneAPI"
 uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 authors = ["Tim Besard <tim.besard@gmail.com>", "Alexis Montoison", "Michel Schanen <michel.schanen@gmail.com>"]
-version = "2.6.0"
+version = "2.6.1"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"