From d498cc75b570348c4d4be2ef5c2aba61d060650d Mon Sep 17 00:00:00 2001
From: Michel Schanen
Date: Wed, 11 Feb 2026 08:26:48 -0600
Subject: [PATCH 1/5] Sparse matrix release queue

---
 lib/mkl/wrappers_sparse.jl | 40 ++++++++++++++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 6 deletions(-)

Review notes (not part of the commit message):
- sparse_release_matrix_handle() now takes a blocking lock(). The header
  comment added below says it runs from finalizers; the Julia manual's
  pattern for finalizers is `islocked(lk) || !trylock(lk)` followed by
  re-registering the finalizer, because a contended lock() may need a task
  switch. TODO confirm how this function is registered and whether the
  lock can be contended at that point.
- flush_deferred_sparse_releases() releases every handle on the queue of the
  *current* device()/context(); the removed code used context(A.nzVal) and
  device(A.nzVal). With several devices or contexts this can select a
  different queue than before -- TODO confirm that this is acceptable.

diff --git a/lib/mkl/wrappers_sparse.jl b/lib/mkl/wrappers_sparse.jl
index bb17907a..0fc39091 100644
--- a/lib/mkl/wrappers_sparse.jl
+++ b/lib/mkl/wrappers_sparse.jl
@@ -1,16 +1,41 @@
+# Deferred release queue for sparse matrix handles.
+# Finalizers run on the GC thread, but onemklXsparse_release_matrix_handle submits
+# work to the SYCL queue. Using the same queue from the GC thread and the main thread
+# concurrently is not safe and causes ZE_RESULT_ERROR_DEVICE_LOST / ZE_RESULT_ERROR_UNKNOWN.
+# Instead, finalizers push handles here and they are released on the main thread.
+const _deferred_sparse_handles = Vector{matrix_handle_t}()
+const _deferred_sparse_handles_lock = ReentrantLock()
+
 function sparse_release_matrix_handle(A::oneAbstractSparseMatrix)
-    return if A.handle !== nothing
+    if A.handle !== nothing
+        lock(_deferred_sparse_handles_lock) do
+            push!(_deferred_sparse_handles, A.handle)
+        end
+    end
+end
+
+function flush_deferred_sparse_releases()
+    handles = lock(_deferred_sparse_handles_lock) do
+        if isempty(_deferred_sparse_handles)
+            return matrix_handle_t[]
+        end
+        h = copy(_deferred_sparse_handles)
+        empty!(_deferred_sparse_handles)
+        return h
+    end
+    isempty(handles) && return
+    dev = device()
+    ctx = context()
+    queue = global_queue(ctx, dev)
+    for handle in handles
         try
-            queue = global_queue(context(A.nzVal), device(A.nzVal))
-            handle_ptr = Ref{matrix_handle_t}(A.handle)
+            handle_ptr = Ref{matrix_handle_t}(handle)
             onemklXsparse_release_matrix_handle(sycl_queue(queue), handle_ptr)
-            # Only synchronize after successful release to ensure completion
-            synchronize(queue)
         catch err
-            # Don't let finalizer errors crash the program
             @warn "Error releasing sparse matrix handle" exception = err
         end
     end
+    synchronize(queue)
 end
 
 for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int32),
@@ -27,6 +52,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int3
                 rowPtr::oneVector{$intty}, colVal::oneVector{$intty},
                 nzVal::oneVector{$elty}, dims::NTuple{2, Int}
             )
+            flush_deferred_sparse_releases()
             handle_ptr = Ref{matrix_handle_t}()
             onemklXsparse_init_matrix_handle(handle_ptr)
             m, n = dims
@@ -47,6 +73,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int3
                 colPtr::oneVector{$intty}, rowVal::oneVector{$intty},
                 nzVal::oneVector{$elty}, dims::NTuple{2, Int}
             )
+            flush_deferred_sparse_releases()
             queue = global_queue(context(nzVal), device(nzVal))
             handle_ptr = Ref{matrix_handle_t}()
             onemklXsparse_init_matrix_handle(handle_ptr)
@@ -106,6 +133,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_coo_data , :Float32 , :Int3
                              (:onemklZsparse_set_coo_data_64, :ComplexF64, :Int64))
     @eval begin
         function oneSparseMatrixCOO(A::SparseMatrixCSC{$elty, $intty})
+            flush_deferred_sparse_releases()
             handle_ptr = Ref{matrix_handle_t}()
             onemklXsparse_init_matrix_handle(handle_ptr)
             m, n = size(A)

From cfb8cd1b2abd2d99e8634e902e585e75199f592a Mon Sep 17 00:00:00 2001
From: Michel Schanen
Date: Wed, 11 Feb 2026 08:29:04 -0600
Subject: [PATCH 2/5] Memory tracking for GC

---
 src/pool.jl | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

Review notes (not part of the commit message):
- _allocated_bytes and _total_mem_cache are single process-wide counters,
  while _get_total_mem(dev) takes a device. The first device queried fixes
  the cached total for every later device, and allocations on all devices
  are summed into one number before being compared against it.
- only(oneL0.memory_properties(dev)) throws unless exactly one entry is
  returned -- TODO confirm that holds for all supported devices.
- Once usage exceeds 40% of the cached total, every allocate() call runs
  GC.gc(false) (GC.gc(true) above 80%); there is no rate limiting.
- release() decrements the counter for Device/Shared buffers only, which
  matches the two allocate() methods touched here.

diff --git a/src/pool.jl b/src/pool.jl
index a2bfc250..8b776eea 100644
--- a/src/pool.jl
+++ b/src/pool.jl
@@ -1,8 +1,38 @@
+# Track total allocated GPU memory (device + shared buffers) for proactive GC.
+# This mirrors AMDGPU.jl's approach: trigger GC before OOM so that finalizers
+# can free stale GPU buffers that Julia's GC hasn't collected yet (Julia's GC
+# only sees CPU memory pressure, not GPU memory pressure).
+const _allocated_bytes = Threads.Atomic{Int64}(0)
+const _total_mem_cache = Threads.Atomic{Int64}(0)
+
+function _get_total_mem(dev)
+    cached = _total_mem_cache[]
+    cached > 0 && return cached
+    total = only(oneL0.memory_properties(dev)).totalSize
+    Threads.atomic_cas!(_total_mem_cache, Int64(0), Int64(total))
+    return _total_mem_cache[]
+end
+
+function _maybe_gc(dev, bytes)
+    allocated = _allocated_bytes[]
+    allocated <= 0 && return
+    total_mem = _get_total_mem(dev)
+    if allocated + bytes > total_mem * 0.8
+        # Full GC to collect old-generation objects whose finalizers free GPU memory.
+        # GC.gc(false) only does minor collection which won't reclaim promoted objects.
+        GC.gc(true)
+    elseif allocated + bytes > total_mem * 0.4
+        GC.gc(false)
+    end
+end
+
 function allocate(::Type{oneL0.DeviceBuffer}, ctx, dev, bytes::Int, alignment::Int)
     bytes == 0 && return oneL0.DeviceBuffer(ZE_NULL, bytes, ctx, dev)
 
+    _maybe_gc(dev, bytes)
     buf = device_alloc(ctx, dev, bytes, alignment)
     make_resident(ctx, dev, buf)
+    Threads.atomic_add!(_allocated_bytes, Int64(bytes))
 
     return buf
 end
@@ -12,8 +42,10 @@ function allocate(::Type{oneL0.SharedBuffer}, ctx, dev, bytes::Int, alignment::I
 
     # TODO: support cross-device shared buffers (by setting `dev=nothing`)
 
+    _maybe_gc(dev, bytes)
     buf = shared_alloc(ctx, dev, bytes, alignment)
     make_resident(ctx, dev, buf)
+    Threads.atomic_add!(_allocated_bytes, Int64(bytes))
 
     return buf
 end
@@ -26,6 +58,10 @@ end
 function release(buf::oneL0.AbstractBuffer)
     sizeof(buf) == 0 && return
 
+    if buf isa oneL0.DeviceBuffer || buf isa oneL0.SharedBuffer
+        Threads.atomic_sub!(_allocated_bytes, Int64(sizeof(buf)))
+    end
+
     # XXX: is it necessary to evice memory if we are going to free it?
     #      this is racy, because eviction is not queue-ordered, and
     #      we don't want to synchronize inside what could have been a

From ed03fb0c9e9c378f7d63125cbe50e5d0b4fc8a31 Mon Sep 17 00:00:00 2001
From: Michel Schanen
Date: Wed, 11 Feb 2026 13:31:40 -0600
Subject: [PATCH 3/5] Flush deferred sparse releases during memory reclaim

---
 lib/level-zero/utils.jl | 25 +++++++++++++++++++++++++
 lib/mkl/oneMKL.jl       |  5 +++++
 src/pool.jl             |  5 ++++-
 3 files changed, 34 insertions(+), 1 deletion(-)

Review notes (not part of the commit message):
- Changed in review: _run_reclaim_callbacks() used a bare `catch` with an
  empty body, so a failing callback was discarded without any trace. It now
  logs the error with @warn and continues with the remaining callbacks.
  The diffstat and hunk offsets of this patch, and the second utils.jl hunk
  of patch 4/5, were adjusted for the extra line; the `index` blob ids were
  not recomputed.
- _reclaim_callbacks is a plain Vector mutated by register_reclaim_callback!
  without a lock; presumably registration only happens while modules load
  -- TODO confirm.

diff --git a/lib/level-zero/utils.jl b/lib/level-zero/utils.jl
index 48a45d07..eaabea58 100644
--- a/lib/level-zero/utils.jl
+++ b/lib/level-zero/utils.jl
@@ -1,5 +1,24 @@
 isdebug(group) = Base.CoreLogging.current_logger_for_env(Base.CoreLogging.Debug, group, oneL0) !== nothing
 
+# Registered callbacks invoked during memory reclamation (e.g., flushing deferred MKL
+# sparse handle releases). Extensions like oneMKL can register cleanup functions here
+# so they run when Level Zero reports OOM or when proactive GC fires.
+const _reclaim_callbacks = Function[]
+
+function register_reclaim_callback!(f::Function)
+    push!(_reclaim_callbacks, f)
+end
+
+function _run_reclaim_callbacks()
+    for cb in _reclaim_callbacks
+        try
+            cb()
+        catch err
+            @warn "Memory reclaim callback failed" exception = (err, catch_backtrace())
+        end
+    end
+end
+
 function retry_reclaim(f, isfailed)
     ret = f()
 
@@ -11,6 +30,12 @@ function retry_reclaim(f, isfailed)
                 GC.gc(false)
             elseif phase == 2
                 GC.gc(true)
+            elseif phase == 3
+                # After GC, finalizers may have deferred resource releases (e.g., MKL
+                # sparse handles). Flush them now, then GC again to free the memory
+                # those releases made available.
+                _run_reclaim_callbacks()
+                GC.gc(true)
             else
                 break
             end
diff --git a/lib/mkl/oneMKL.jl b/lib/mkl/oneMKL.jl
index 533edf04..64f54358 100644
--- a/lib/mkl/oneMKL.jl
+++ b/lib/mkl/oneMKL.jl
@@ -31,6 +31,11 @@ include("linalg.jl")
 include("interfaces.jl")
 include("fft.jl")
 
+# Register deferred sparse handle flush as a memory reclaim callback so that OOM
+# recovery (retry_reclaim) and proactive GC (_maybe_gc) can free MKL internal buffers
+# associated with sparse matrix handles that were deferred from finalizer threads.
+oneL0.register_reclaim_callback!(flush_deferred_sparse_releases)
+
 function version()
     major = Ref{Int64}()
     minor = Ref{Int64}()
diff --git a/src/pool.jl b/src/pool.jl
index 8b776eea..58cefb82 100644
--- a/src/pool.jl
+++ b/src/pool.jl
@@ -18,8 +18,11 @@ function _maybe_gc(dev, bytes)
     allocated <= 0 && return
     total_mem = _get_total_mem(dev)
     if allocated + bytes > total_mem * 0.8
+        # Flush deferred resource releases (e.g., MKL sparse handles) from previous GC
+        # cycles first — these are safe to release now because they were deferred earlier.
+        # Do this BEFORE GC to avoid racing with new finalizers.
+        oneL0._run_reclaim_callbacks()
         # Full GC to collect old-generation objects whose finalizers free GPU memory.
-        # GC.gc(false) only does minor collection which won't reclaim promoted objects.
         GC.gc(true)
     elseif allocated + bytes > total_mem * 0.4
         GC.gc(false)

From 2f1a2fbf3c66720885dfb0c7db31d575553ead3b Mon Sep 17 00:00:00 2001
From: Michel Schanen
Date: Wed, 11 Feb 2026 13:34:30 -0600
Subject: [PATCH 4/5] Format

---
 lib/level-zero/utils.jl    | 3 ++-
 lib/mkl/wrappers_sparse.jl | 4 ++--
 src/pool.jl                | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

Review notes (not part of the commit message):
- The second utils.jl hunk was re-anchored (context and offsets) to follow
  the @warn line introduced in patch 3/5 during review.

diff --git a/lib/level-zero/utils.jl b/lib/level-zero/utils.jl
index eaabea58..bf0153b8 100644
--- a/lib/level-zero/utils.jl
+++ b/lib/level-zero/utils.jl
@@ -6,7 +6,7 @@ isdebug(group) = Base.CoreLogging.current_logger_for_env(Base.CoreLogging.Debug,
 const _reclaim_callbacks = Function[]
 
 function register_reclaim_callback!(f::Function)
-    push!(_reclaim_callbacks, f)
+    return push!(_reclaim_callbacks, f)
 end
 
 function _run_reclaim_callbacks()
@@ -17,6 +17,7 @@ function _run_reclaim_callbacks()
             @warn "Memory reclaim callback failed" exception = (err, catch_backtrace())
         end
     end
+    return
 end
 
 function retry_reclaim(f, isfailed)
diff --git a/lib/mkl/wrappers_sparse.jl b/lib/mkl/wrappers_sparse.jl
index 0fc39091..8e58956b 100644
--- a/lib/mkl/wrappers_sparse.jl
+++ b/lib/mkl/wrappers_sparse.jl
@@ -7,7 +7,7 @@ const _deferred_sparse_handles = Vector{matrix_handle_t}()
 const _deferred_sparse_handles_lock = ReentrantLock()
 
 function sparse_release_matrix_handle(A::oneAbstractSparseMatrix)
-    if A.handle !== nothing
+    return if A.handle !== nothing
         lock(_deferred_sparse_handles_lock) do
             push!(_deferred_sparse_handles, A.handle)
         end
@@ -35,7 +35,7 @@ function flush_deferred_sparse_releases()
             @warn "Error releasing sparse matrix handle" exception = err
         end
     end
-    synchronize(queue)
+    return synchronize(queue)
 end
 
 for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int32),
diff --git a/src/pool.jl b/src/pool.jl
index 58cefb82..165a7f07 100644
--- a/src/pool.jl
+++ b/src/pool.jl
@@ -17,7 +17,7 @@ function _maybe_gc(dev, bytes)
     allocated = _allocated_bytes[]
     allocated <= 0 && return
     total_mem = _get_total_mem(dev)
-    if allocated + bytes > total_mem * 0.8
+    return if allocated + bytes > total_mem * 0.8
         # Flush deferred resource releases (e.g., MKL sparse handles) from previous GC
         # cycles first — these are safe to release now because they were deferred earlier.
         # Do this BEFORE GC to avoid racing with new finalizers.

From 802bb5075eeef0d2bd4fdda359fde654bc352f46 Mon Sep 17 00:00:00 2001
From: Michel Schanen
Date: Wed, 11 Feb 2026 13:38:15 -0600
Subject: [PATCH 5/5] Bump version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index bd0179c2..f84b9c27 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "oneAPI"
 uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 authors = ["Tim Besard ", "Alexis Montoison", "Michel Schanen "]
-version = "2.6.0"
+version = "2.6.1"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"