Implement minimal device_id support

caffeinism · caffeinism · commit 56dd2dc18cd7 · 2026-02-05T01:39:33.000+09:00
diff --git a/av/_hwdevice_registry.py b/av/_hwdevice_registry.py
@@ -0,0 +1,12 @@
+_cuda_hwdevice_data_ptr_to_device_id: dict[int, int] = {}
+
+
+def register_cuda_hwdevice_data_ptr(hwdevice_data_ptr: int, device_id: int) -> None:  # remember which device_id a pointer value belongs to
+    if hwdevice_data_ptr:  # a falsy pointer value (0 / NULL) is silently ignored, nothing is stored
+        _cuda_hwdevice_data_ptr_to_device_id[int(hwdevice_data_ptr)] = int(device_id)  # registering the same pointer again overwrites the earlier id
+
+
+def lookup_cuda_device_id(hwdevice_data_ptr: int) -> int:  # device_id previously registered for this pointer value, else 0
+    if not hwdevice_data_ptr:  # a falsy pointer value (0 / NULL) is answered with 0 without consulting the map
+        return 0
+    return _cuda_hwdevice_data_ptr_to_device_id.get(int(hwdevice_data_ptr), 0)  # pointers that were never registered also yield 0
diff --git a/av/codec/hwaccel.py b/av/codec/hwaccel.py
@@ -8,6 +8,8 @@
 from cython.cimports.av.error import err_check
 from cython.cimports.av.video.format import get_video_format
 
+import av._hwdevice_registry as _hwreg
+
 
 class HWDeviceType(IntEnum):
     none = lib.AV_HWDEVICE_TYPE_NONE
@@ -112,6 +114,9 @@ def __init__(
         flags=None,
         output_format="sw",
     ):
+        if isinstance(device, int):
+            device = str(device)
+
         if isinstance(device_type, HWDeviceType):
             self._device_type = device_type
         elif isinstance(device_type, str):
@@ -131,7 +136,10 @@ def __init__(
 
         self._device = device
         self.allow_software_fallback = allow_software_fallback
+
         self.options = {} if not options else dict(options)
+        if self._device_type == HWDeviceType.cuda and self.output_format == "hw":
+            self.options.setdefault("primary_ctx", "1")
         self.flags = 0 if not flags else flags
         self.ptr = cython.NULL
         self.config = None
@@ -164,6 +172,19 @@ def _initialize_hw_context(self, codec: Codec):
             )
         )
 
+        if config.ptr.device_type == lib.AV_HWDEVICE_TYPE_CUDA:
+            device_id = 0
+            if self._device:
+                try:
+                    device_id = int(self._device)
+                except ValueError:
+                    device_id = 0
+
+            _hwreg.register_cuda_hwdevice_data_ptr(
+                cython.cast(cython.size_t, self.ptr.data),
+                device_id,
+            )
+
     def create(self, codec: Codec):
         """Create a new hardware accelerator context with the given codec"""
         if self.ptr:
diff --git a/av/video/frame.py b/av/video/frame.py
@@ -4,7 +4,7 @@
 import cython
 import cython.cimports.libav as lib
 from cython.cimports.av.dictionary import Dictionary
-from cython.cimports.av.dlpack import DLManagedTensor, kDLCUDA, kDLUInt
+from cython.cimports.av.dlpack import DLManagedTensor, kDLCUDA, kDLUInt, kDLCPU
 from cython.cimports.av.error import err_check
 from cython.cimports.av.hwcontext import (
     AVHWFramesContext,
@@ -23,6 +23,7 @@
 )
 from cython.cimports.libc.stdint import int64_t, uint8_t
 
+import av._hwdevice_registry as _hwreg
 
 _cuda_device_ctx_cache = {}
 _cuda_frames_ctx_cache = {}
@@ -67,8 +68,12 @@ def _dlpack_avbuffer_free(
         managed.deleter(managed)
 
 @cython.cfunc
-def _get_cuda_device_ctx(device_id: cython.int) -> cython.pointer[lib.AVBufferRef]:
-    cached = _cuda_device_ctx_cache.get(device_id)
+def _get_cuda_device_ctx(
+    device_id: cython.int,
+    primary_ctx: cython.bint,
+) -> cython.pointer[lib.AVBufferRef]:
+    key = (int(device_id), int(primary_ctx))
+    cached = _cuda_device_ctx_cache.get(key)
     if cached is not None:
         return cython.cast(
             cython.pointer[lib.AVBufferRef],
@@ -78,7 +83,7 @@ def _get_cuda_device_ctx(device_id: cython.int) -> cython.pointer[lib.AVBufferRe
     device_ref: cython.pointer[lib.AVBufferRef] = cython.NULL
     device_bytes = str(device_id).encode()
     c_device: cython.p_char = device_bytes
-    options: Dictionary = Dictionary({"primary_ctx": "1"})
+    options: Dictionary = Dictionary({"primary_ctx": "1" if primary_ctx else "0"})
 
     err_check(
         lib.av_hwdevice_ctx_create(
@@ -90,25 +95,31 @@ def _get_cuda_device_ctx(device_id: cython.int) -> cython.pointer[lib.AVBufferRe
         )
     )
 
-    _cuda_device_ctx_cache[device_id] = cython.cast(cython.size_t, device_ref)
+    _hwreg.register_cuda_hwdevice_data_ptr(
+        cython.cast(cython.size_t, device_ref.data),
+        device_id,
+    )
+
+    _cuda_device_ctx_cache[key] = cython.cast(cython.size_t, device_ref)
     return device_ref
 
 @cython.cfunc
 def _get_cuda_frames_ctx(
     device_id: cython.int,
+    primary_ctx: cython.bint,
     sw_fmt: lib.AVPixelFormat,
     width: cython.int,
     height: cython.int,
 ) -> cython.pointer[lib.AVBufferRef]:
-    key = (device_id, int(sw_fmt), int(width), int(height))
+    key = (int(device_id), int(primary_ctx), int(sw_fmt), int(width), int(height))
     cached = _cuda_frames_ctx_cache.get(key)
     if cached is not None:
         return cython.cast(
             cython.pointer[lib.AVBufferRef],
             cython.cast(cython.size_t, cached),
         )
 
-    device_ref = _get_cuda_device_ctx(device_id)
+    device_ref = _get_cuda_device_ctx(device_id, primary_ctx)
     frames_ref = av_hwframe_ctx_alloc(device_ref)
     if frames_ref == cython.NULL:
         raise MemoryError("av_hwframe_ctx_alloc() failed")
@@ -1330,6 +1341,7 @@ def from_dlpack(
         height: int = 0,
         stream=None,
         device_id: int | None = None,
+        primary_ctx: bool = True,
     ):
         if not isinstance(planes, (tuple, list)):
             planes = (planes,)
@@ -1356,18 +1368,30 @@ def from_dlpack(
             m0 = _consume_dlpack(planes[0], stream)
             m1 = _consume_dlpack(planes[1], stream)
 
-            if m0.dl_tensor.device.device_type != kDLCUDA or m1.dl_tensor.device.device_type != kDLCUDA:
-                raise TypeError("only CUDA DLPack tensors are supported")
+            dev_type0 = m0.dl_tensor.device.device_type
+            dev_type1 = m1.dl_tensor.device.device_type
+            if dev_type0 != dev_type1:
+                raise ValueError("plane tensors must have the same device_type")
+            if dev_type0 not in {kDLCUDA, kDLCPU}:
+                raise NotImplementedError("only CPU and CUDA DLPack tensors are supported")
 
             dev0 = m0.dl_tensor.device.device_id
             dev1 = m1.dl_tensor.device.device_id
             if dev0 != dev1:
                 raise ValueError("plane tensors must be on the same CUDA device")
-
-            if device_id is None:
-                device_id = dev0
-            elif device_id != dev0:
-                raise ValueError("device_id does not match the DLPack tensor device_id")
+            if dev_type0 == kDLCUDA:
+                if dev0 != dev1:
+                    raise ValueError("plane tensors must be on the same CUDA device")
+                if device_id is None:
+                    device_id = dev0
+                elif device_id != dev0:
+                    raise ValueError("device_id does not match the DLPack tensor device_id")
+            else:
+                if device_id not in (None, 0):
+                    raise ValueError("device_id must be 0 for CPU tensors")
+                device_id = 0
+            if dev_type0 == kDLCPU and (dev0 != 0 or dev1 != 0):
+                raise ValueError("CPU DLPack tensors must have device_id == 0")
 
             if (
                 m0.dl_tensor.dtype.code != kDLUInt
@@ -1443,16 +1467,24 @@ def from_dlpack(
             uv_linesize = cython.cast(int, uv_pitch_elems * itemsize)
             uv_size = cython.cast(int, uv_linesize * (height // 2))
 
-            frames_ref = _get_cuda_frames_ctx(device_id, sw_fmt, width, height)
-
             frame = alloc_video_frame()
             frame.ptr.width = width
             frame.ptr.height = height
-            frame.ptr.format = get_pix_fmt(b"cuda")
-
-            frame.ptr.hw_frames_ctx = lib.av_buffer_ref(frames_ref)
-            if frame.ptr.hw_frames_ctx == cython.NULL:
-                raise MemoryError("av_buffer_ref(hw_frames_ctx) failed")
+            if dev_type0 == kDLCUDA:
+                if primary_ctx is None:
+                    primary_ctx = True
+                if not isinstance(primary_ctx, (bool, int)):
+                    raise TypeError("primary_ctx must be a bool")
+                primary_ctx = bool(primary_ctx)
+
+                frames_ref = _get_cuda_frames_ctx(device_id, primary_ctx, sw_fmt, width, height)
+
+                frame.ptr.format = get_pix_fmt(b"cuda")
+                frame.ptr.hw_frames_ctx = lib.av_buffer_ref(frames_ref)
+                if frame.ptr.hw_frames_ctx == cython.NULL:
+                    raise MemoryError("av_buffer_ref(hw_frames_ctx) failed")
+            else:
+                frame.ptr.format = sw_fmt
 
             y_ptr = cython.cast(cython.pointer[uint8_t], m0.dl_tensor.data) + cython.cast(
                 cython.size_t, m0.dl_tensor.byte_offset
diff --git a/av/video/frame.pyi b/av/video/frame.pyi
@@ -92,4 +92,5 @@ class VideoFrame(Frame):
         height: int = 0,
         stream: int | None = None,
         device_id: int | None = None,
+        primary_ctx: bool = True,
     ) -> "VideoFrame": ...
diff --git a/av/video/plane.py b/av/video/plane.py
@@ -1,7 +1,7 @@
 import cython
 import cython.cimports.libav as lib
 from cython.cimports.av.buffer import Buffer
-from cython.cimports.av.dlpack import DLManagedTensor, kDLCUDA, kDLUInt
+from cython.cimports.av.dlpack import DLManagedTensor, kDLCPU, kDLCUDA, kDLUInt
 from cython.cimports.av.error import err_check
 from cython.cimports.av.hwcontext import AVHWFramesContext
 from cython.cimports.av.video.format import get_pix_fmt, get_video_format
@@ -17,6 +17,8 @@
 from cython.cimports.libc.stdint import int64_t
 from cython.cimports.libc.stdlib import free, malloc
 
+import av._hwdevice_registry as _hwreg
+
 
 @cython.cclass
 class VideoPlane(Plane):
@@ -79,22 +81,44 @@ def __getbuffer__(self, view: cython.pointer[Py_buffer], flags: cython.int):
         PyBuffer_FillInfo(view, self, self._buffer_ptr(), self._buffer_size(), 0, flags)
 
     def __dlpack_device__(self):
-        if not self.frame.ptr.hw_frames_ctx:
-            raise TypeError("DLPack export is only supported for hardware frames")
-        if cython.cast(lib.AVPixelFormat, self.frame.ptr.format) != get_pix_fmt(b"cuda"):
-            raise NotImplementedError("DLPack export is only implemented for CUDA hw frames")
-        return (kDLCUDA, 0)
+        if self.frame.ptr.hw_frames_ctx:  # frame carries a hw frames context: only the "cuda" pixel format is handled
+            if cython.cast(lib.AVPixelFormat, self.frame.ptr.format) != get_pix_fmt(b"cuda"):
+                raise NotImplementedError("DLPack export is only implemented for CUDA hw frames")
+
+            frames_ctx: cython.pointer[AVHWFramesContext] = cython.cast(
+                cython.pointer[AVHWFramesContext], self.frame.ptr.hw_frames_ctx.data
+            )
+            device_id = _hwreg.lookup_cuda_device_id(  # keyed by the device_ref data pointer; an unregistered pointer is reported as 0
+                cython.cast(cython.size_t, frames_ctx.device_ref.data)
+            )
+            return (kDLCUDA, device_id)
+
+        return (kDLCPU, 0)  # no hw frames context: the plane is reported as a CPU tensor with device id 0
 
     def __dlpack__(self, stream=None):
-        if not self.frame.ptr.hw_frames_ctx:
-            raise TypeError("DLPack export is only supported for hardware frames")
-        if cython.cast(lib.AVPixelFormat, self.frame.ptr.format) != get_pix_fmt(b"cuda"):
-            raise NotImplementedError("DLPack export is only implemented for CUDA hw frames")
+        if self.frame.ptr.buf[0] == cython.NULL:
+            raise TypeError("DLPack export requires a refcounted AVFrame (frame.buf[0] is NULL)")
 
-        frames_ctx: cython.pointer[AVHWFramesContext] = cython.cast(
-            cython.pointer[AVHWFramesContext], self.frame.ptr.hw_frames_ctx.data
-        )
-        sw_fmt = frames_ctx.sw_format
+        device_type: cython.int
+        device_id: cython.int
+        sw_fmt: lib.AVPixelFormat
+
+        if self.frame.ptr.hw_frames_ctx:
+            if cython.cast(lib.AVPixelFormat, self.frame.ptr.format) != get_pix_fmt(b"cuda"):
+                raise NotImplementedError("DLPack export is only implemented for CUDA hw frames")
+
+            frames_ctx: cython.pointer[AVHWFramesContext] = cython.cast(
+                cython.pointer[AVHWFramesContext], self.frame.ptr.hw_frames_ctx.data
+            )
+            sw_fmt = frames_ctx.sw_format
+            device_type = kDLCUDA
+            device_id = _hwreg.lookup_cuda_device_id(
+                cython.cast(cython.size_t, frames_ctx.device_ref.data)
+            )
+        else:
+            sw_fmt = cython.cast(lib.AVPixelFormat, self.frame.ptr.format)
+            device_type = kDLCPU
+            device_id = 0
 
         line_size = self.line_size
         if line_size < 0:
@@ -206,8 +230,8 @@ def __dlpack__(self, stream=None):
             raise MemoryError("malloc() failed")
 
         managed.dl_tensor.data = cython.cast(cython.p_void, frame_ref.data[self.index])
-        managed.dl_tensor.device.device_type = kDLCUDA
-        managed.dl_tensor.device.device_id = 0
+        managed.dl_tensor.device.device_type = device_type
+        managed.dl_tensor.device.device_id = device_id
         managed.dl_tensor.ndim = ndim
         managed.dl_tensor.dtype.code = kDLUInt
         managed.dl_tensor.dtype.bits = bits
diff --git a/include/libavcodec/avcodec.pxd b/include/libavcodec/avcodec.pxd
@@ -353,7 +353,7 @@ cdef extern from "libavcodec/avcodec.h" nogil:
         int64_t pkt_dts
         void *opaque
         int sample_rate
-        AVBufferRef *buf[4]
+        AVBufferRef *buf[8]
         AVBufferRef **extended_buf
         int nb_extended_buf