2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "oneAPI"
uuid = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
authors = ["Tim Besard <tim.besard@gmail.com>", "Alexis Montoison", "Michel Schanen <michel.schanen@gmail.com>"]
version = "2.6.0"
version = "2.6.1"

[deps]
AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
25 changes: 25 additions & 0 deletions lib/level-zero/utils.jl
@@ -1,5 +1,24 @@
isdebug(group) = Base.CoreLogging.current_logger_for_env(Base.CoreLogging.Debug, group, oneL0) !== nothing

# Registered callbacks invoked during memory reclamation (e.g., flushing deferred MKL
# sparse handle releases). Extensions like oneMKL can register cleanup functions here
# so they run when Level Zero reports OOM or when proactive GC fires.
const _reclaim_callbacks = Function[]

function register_reclaim_callback!(f::Function)
    return push!(_reclaim_callbacks, f)
end

function _run_reclaim_callbacks()
    for cb in _reclaim_callbacks
        try
            cb()
        catch
        end
    end
    return
end
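For illustration, a minimal sketch of how an extension is expected to use this registry; the callback body below is hypothetical, and only `register_reclaim_callback!` and `_run_reclaim_callbacks` come from this PR:

```julia
# Hypothetical extension code: register a cleanup hook once at load time.
oneL0.register_reclaim_callback!() do
    # e.g. drain a queue of native handles whose release was deferred
    # from finalizer threads
end

# The pool/OOM machinery then invokes every registered hook; exceptions
# are swallowed so one failing callback cannot block the others.
oneL0._run_reclaim_callbacks()
```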

function retry_reclaim(f, isfailed)
    ret = f()

@@ -11,6 +30,12 @@ function retry_reclaim(f, isfailed)
            GC.gc(false)
        elseif phase == 2
            GC.gc(true)
        elseif phase == 3
            # After GC, finalizers may have deferred resource releases (e.g., MKL
            # sparse handles). Flush them now, then GC again to free the memory
            # those releases made available.
            _run_reclaim_callbacks()
            GC.gc(true)
        else
            break
        end
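Taken together with the surrounding loop, the escalation order is: incremental GC (phase 1), full GC (phase 2), reclaim callbacks plus another full GC (phase 3), then give up. A hedged usage sketch, where `try_alloc` and `is_oom` are illustrative stand-ins rather than oneAPI.jl APIs:

```julia
# Retry an allocation, reclaiming more memory on each failed round.
ret = oneL0.retry_reclaim(is_oom) do
    try_alloc()   # returns a status; is_oom(status) decides whether to retry
end
```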
5 changes: 5 additions & 0 deletions lib/mkl/oneMKL.jl
@@ -31,6 +31,11 @@ include("linalg.jl")
include("interfaces.jl")
include("fft.jl")

# Register deferred sparse handle flush as a memory reclaim callback so that OOM
# recovery (retry_reclaim) and proactive GC (_maybe_gc) can free MKL internal buffers
# associated with sparse matrix handles that were deferred from finalizer threads.
oneL0.register_reclaim_callback!(flush_deferred_sparse_releases)

function version()
    major = Ref{Int64}()
    minor = Ref{Int64}()
38 changes: 33 additions & 5 deletions lib/mkl/wrappers_sparse.jl
@@ -1,16 +1,41 @@
# Deferred release queue for sparse matrix handles.
# Finalizers run on the GC thread, but onemklXsparse_release_matrix_handle submits
# work to the SYCL queue. Using the same queue from the GC thread and the main thread
# concurrently is not safe and causes ZE_RESULT_ERROR_DEVICE_LOST / ZE_RESULT_ERROR_UNKNOWN.
# Instead, finalizers push handles here and they are released on the main thread.
const _deferred_sparse_handles = Vector{matrix_handle_t}()
const _deferred_sparse_handles_lock = ReentrantLock()

function sparse_release_matrix_handle(A::oneAbstractSparseMatrix)
    return if A.handle !== nothing
        lock(_deferred_sparse_handles_lock) do
            push!(_deferred_sparse_handles, A.handle)
        end
    end
end

function flush_deferred_sparse_releases()
    handles = lock(_deferred_sparse_handles_lock) do
        if isempty(_deferred_sparse_handles)
            return matrix_handle_t[]
        end
        h = copy(_deferred_sparse_handles)
        empty!(_deferred_sparse_handles)
        return h
    end
    isempty(handles) && return
    dev = device()
    ctx = context()
    queue = global_queue(ctx, dev)
    for handle in handles
        try
            handle_ptr = Ref{matrix_handle_t}(handle)
            onemklXsparse_release_matrix_handle(sycl_queue(queue), handle_ptr)
        catch err
            # Don't let finalizer errors crash the program
            @warn "Error releasing sparse matrix handle" exception = err
        end
    end
    return synchronize(queue)
end
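The defer-and-flush pattern above is the heart of the fix and generalizes beyond MKL. A self-contained sketch with plain integers standing in for `matrix_handle_t`; all names here are illustrative:

```julia
const pending = Int[]                 # stands in for the deferred handle queue
const pending_lock = ReentrantLock()

# Safe to call from a finalizer/GC thread: it only touches the locked queue,
# never the SYCL queue.
defer_release(h::Int) = lock(() -> push!(pending, h), pending_lock)

# Called from the thread that owns the device queue.
function flush_pending(release::Function)
    batch = lock(pending_lock) do
        b = copy(pending)
        empty!(pending)
        b
    end
    foreach(release, batch)           # perform the actual releases here
end
```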

for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int32),
@@ -27,6 +52,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int3
                rowPtr::oneVector{$intty}, colVal::oneVector{$intty},
                nzVal::oneVector{$elty}, dims::NTuple{2, Int}
            )
            flush_deferred_sparse_releases()
            handle_ptr = Ref{matrix_handle_t}()
            onemklXsparse_init_matrix_handle(handle_ptr)
            m, n = dims
@@ -47,6 +73,7 @@
                colPtr::oneVector{$intty}, rowVal::oneVector{$intty},
                nzVal::oneVector{$elty}, dims::NTuple{2, Int}
            )
            flush_deferred_sparse_releases()
            queue = global_queue(context(nzVal), device(nzVal))
            handle_ptr = Ref{matrix_handle_t}()
            onemklXsparse_init_matrix_handle(handle_ptr)
@@ -106,6 +133,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_coo_data , :Float32 , :Int3
                             (:onemklZsparse_set_coo_data_64, :ComplexF64, :Int64))
    @eval begin
        function oneSparseMatrixCOO(A::SparseMatrixCSC{$elty, $intty})
            flush_deferred_sparse_releases()
            handle_ptr = Ref{matrix_handle_t}()
            onemklXsparse_init_matrix_handle(handle_ptr)
            m, n = size(A)
39 changes: 39 additions & 0 deletions src/pool.jl
@@ -1,8 +1,41 @@
# Track total allocated GPU memory (device + shared buffers) for proactive GC.
# This mirrors AMDGPU.jl's approach: trigger GC before OOM so that finalizers
# can free stale GPU buffers that Julia's GC hasn't collected yet (Julia's GC
# only sees CPU memory pressure, not GPU memory pressure).
const _allocated_bytes = Threads.Atomic{Int64}(0)
const _total_mem_cache = Threads.Atomic{Int64}(0)

function _get_total_mem(dev)
    cached = _total_mem_cache[]
    cached > 0 && return cached
    total = only(oneL0.memory_properties(dev)).totalSize
    Threads.atomic_cas!(_total_mem_cache, Int64(0), Int64(total))
    return _total_mem_cache[]
end
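`_get_total_mem` caches the device query with a compare-and-swap, so concurrent first callers race benignly: at most one winner publishes the value and every caller re-reads the atomic afterwards. The same pattern in isolation, with the query function as a stand-in:

```julia
const total_cache = Threads.Atomic{Int64}(0)

function cached_total(query::Function)
    c = total_cache[]
    c > 0 && return c                  # fast path once populated
    # Losing racers fail the CAS and leave the winner's value intact.
    Threads.atomic_cas!(total_cache, Int64(0), Int64(query()))
    return total_cache[]
end
```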

function _maybe_gc(dev, bytes)
    allocated = _allocated_bytes[]
    allocated <= 0 && return
    total_mem = _get_total_mem(dev)
    return if allocated + bytes > total_mem * 0.8
        # Flush deferred resource releases (e.g., MKL sparse handles) from previous
        # GC cycles first; these are safe to release now because they were deferred
        # earlier. Do this BEFORE GC to avoid racing with new finalizers.
        oneL0._run_reclaim_callbacks()
        # Full GC to collect old-generation objects whose finalizers free GPU memory.
        GC.gc(true)
    elseif allocated + bytes > total_mem * 0.4
        GC.gc(false)
    end
end
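To make the thresholds concrete, a worked example with an illustrative device size: on a 16 GiB device the incremental-GC threshold sits at 6.4 GiB (40%) and the callbacks-plus-full-GC threshold at 12.8 GiB (80%):

```julia
total     = 16 * 2^30    # 16 GiB device (illustrative)
allocated = 12 * 2^30    # running total tracked in _allocated_bytes
request   =  2 * 2^30    # incoming allocation
@assert allocated + request > total * 0.8   # 14 GiB > 12.8 GiB: full-GC path
```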

function allocate(::Type{oneL0.DeviceBuffer}, ctx, dev, bytes::Int, alignment::Int)
    bytes == 0 && return oneL0.DeviceBuffer(ZE_NULL, bytes, ctx, dev)

    _maybe_gc(dev, bytes)
    buf = device_alloc(ctx, dev, bytes, alignment)
    make_resident(ctx, dev, buf)
    Threads.atomic_add!(_allocated_bytes, Int64(bytes))

    return buf
end
@@ -12,8 +45,10 @@ function allocate(::Type{oneL0.SharedBuffer}, ctx, dev, bytes::Int, alignment::I

    # TODO: support cross-device shared buffers (by setting `dev=nothing`)

    _maybe_gc(dev, bytes)
    buf = shared_alloc(ctx, dev, bytes, alignment)
    make_resident(ctx, dev, buf)
    Threads.atomic_add!(_allocated_bytes, Int64(bytes))

    return buf
end
@@ -26,6 +61,10 @@ end
function release(buf::oneL0.AbstractBuffer)
    sizeof(buf) == 0 && return

    if buf isa oneL0.DeviceBuffer || buf isa oneL0.SharedBuffer
        Threads.atomic_sub!(_allocated_bytes, Int64(sizeof(buf)))
    end

    # XXX: is it necessary to evict memory if we are going to free it?
    # this is racy, because eviction is not queue-ordered, and
    # we don't want to synchronize inside what could have been a