From d3687839a787af6c0e233e57077751c4f06b44c3 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 30 Jan 2026 13:30:38 -0500 Subject: [PATCH 1/5] save Signed-off-by: Andrew Duffy --- vortex-cuda/kernels/patches.cu | 27 ++++++++++++++++++++++ vortex-cuda/src/kernel/mod.rs | 1 + vortex-cuda/src/kernel/patches/mod.rs | 32 +++++++++++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 vortex-cuda/kernels/patches.cu create mode 100644 vortex-cuda/src/kernel/patches/mod.rs diff --git a/vortex-cuda/kernels/patches.cu b/vortex-cuda/kernels/patches.cu new file mode 100644 index 00000000000..f237ccee057 --- /dev/null +++ b/vortex-cuda/kernels/patches.cu @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +// Apply patches to a source array +template +__device__ void patches_apply_inplace( + ValuesT *const values, + const IndexT *const patchIndices, + const ValueT *const patchValues, + uint64_t valuesLen, + uint64_t patchesLen, +) { + const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx > patchesLen) { + return; + } + + const IndexT patchIdx = patchIndices[idx]; + const ValueT patchVal = patchValues[idx]; + + const size_t valueIdx = static_cast(patchIdx); + values[valueIdx] = patchVal; +} + +#define GENERATE_PATCHES_KERNEL(ValuesT, IndicesT) \ +extern "C" __global__ patches_apply_inplace \ No newline at end of file diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index 51ba8cc55bd..4f90ce894f8 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -23,6 +23,7 @@ use vortex_utils::aliases::dash_map::DashMap; mod arrays; mod encodings; mod filter; +mod patches; mod slice; pub use arrays::DictExecutor; diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs new file mode 100644 index 00000000000..f2819900fe1 --- /dev/null +++ b/vortex-cuda/src/kernel/patches/mod.rs @@ -0,0 +1,32 @@ +// 
SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::patches::Patches; +use vortex_dtype::NativePType; +use vortex_error::VortexResult; + +#[derive(Debug)] +pub struct PatchesExecutor; + +pub(crate) async fn execute_patches( + patches: Patches, + array: Canonical, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let len = array.len(); + let values = array.into_primitive(); + + todo!() + + // Based on the typed indices and values instead...we can apply those + // launch_cuda_kernel!( + // execution_ctx: ctx, + // module: "patches", + // ptypes: &[ValuesT::PTYPE, IndicesT::PTYPE], + // launch_args: [], + // event_recording: CU_EVENT_DISABLE_TIMING, + // array_len: + // ) +} From c12d59893eddf3bbfddf38c7c9b6043ed7d6286a Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 30 Jan 2026 20:59:59 +0000 Subject: [PATCH 2/5] feat[cuda]: patches application kernel Signed-off-by: Andrew Duffy --- vortex-cuda/benches/dict_cuda.rs | 2 +- vortex-cuda/benches/for_cuda.rs | 8 +- vortex-cuda/kernels/patches.cu | 27 ----- vortex-cuda/kernels/src/patches.cu | 55 +++++++++++ vortex-cuda/src/kernel/arrays/dict.rs | 2 +- vortex-cuda/src/kernel/encodings/alp.rs | 60 +++++++++--- vortex-cuda/src/kernel/mod.rs | 2 +- vortex-cuda/src/kernel/patches/mod.rs | 125 +++++++++++++++++++----- 8 files changed, 209 insertions(+), 72 deletions(-) delete mode 100644 vortex-cuda/kernels/patches.cu create mode 100644 vortex-cuda/kernels/src/patches.cu diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs index 0142cb8842d..74b7f30c0e6 100644 --- a/vortex-cuda/benches/dict_cuda.rs +++ b/vortex-cuda/benches/dict_cuda.rs @@ -101,7 +101,7 @@ fn launch_dict_kernel_timed -__device__ void patches_apply_inplace( - ValuesT *const values, - const IndexT *const patchIndices, - const ValueT *const patchValues, - uint64_t valuesLen, - uint64_t patchesLen, -) { 
- const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (idx > patchesLen) { - return; - } - - const IndexT patchIdx = patchIndices[idx]; - const ValueT patchVal = patchValues[idx]; - - const size_t valueIdx = static_cast(patchIdx); - values[valueIdx] = patchVal; -} - -#define GENERATE_PATCHES_KERNEL(ValuesT, IndicesT) \ -extern "C" __global__ patches_apply_inplace \ No newline at end of file diff --git a/vortex-cuda/kernels/src/patches.cu b/vortex-cuda/kernels/src/patches.cu new file mode 100644 index 00000000000..3358f1950c5 --- /dev/null +++ b/vortex-cuda/kernels/src/patches.cu @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#include + +// Apply patches to a source array +template +__device__ void patches( + ValueT *const values, + const IndexT *const patchIndices, + const ValueT *const patchValues, + uint64_t patchesLen +) { + const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx > patchesLen) { + return; + } + + const IndexT patchIdx = patchIndices[idx]; + const ValueT patchVal = patchValues[idx]; + + const size_t valueIdx = static_cast(patchIdx); + values[valueIdx] = patchVal; +} + +#define GENERATE_PATCHES_KERNEL(ValueT, value_suffix, IndexT, index_suffix) \ +extern "C" __global__ void patches_##value_suffix##_##index_suffix( \ + ValueT *const values, \ + const IndexT *const patchIndices, \ + const ValueT *const patchValues, \ + uint64_t patchesLen \ +) { \ + patches(values, patchIndices, patchValues, patchesLen); \ +} + +#define GENERATE_PATCHES_KERNEL_FOR_VALUE(ValueT, value_suffix) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint8_t, u8) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint16_t, u16) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint32_t, u32) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint64_t, u64) + + +GENERATE_PATCHES_KERNEL_FOR_VALUE(uint8_t, u8) +GENERATE_PATCHES_KERNEL_FOR_VALUE(uint16_t, u16) 
+GENERATE_PATCHES_KERNEL_FOR_VALUE(uint32_t, u32) +GENERATE_PATCHES_KERNEL_FOR_VALUE(uint64_t, u64) + +GENERATE_PATCHES_KERNEL_FOR_VALUE(int8_t, i8) +GENERATE_PATCHES_KERNEL_FOR_VALUE(int16_t, i16) +GENERATE_PATCHES_KERNEL_FOR_VALUE(int32_t, i32) +GENERATE_PATCHES_KERNEL_FOR_VALUE(int64_t, i64) + +GENERATE_PATCHES_KERNEL_FOR_VALUE(float, f32) +GENERATE_PATCHES_KERNEL_FOR_VALUE(double, f64) diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs index c9cb27f2cd3..02b02dca2c9 100644 --- a/vortex-cuda/src/kernel/arrays/dict.rs +++ b/vortex-cuda/src/kernel/arrays/dict.rs @@ -129,7 +129,7 @@ async fn execute_dict_prim_typed(patches.clone(), output_buf, ctx).await? + }) + } else { + output_buf + }; - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?; + // TODO(aduffy): scatter patch values validity. There are several places we'll need to start + // handling validity. - // Build result with newly allocated buffer let output_handle = BufferHandle::new_device(Arc::new(output_buf)); Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( output_handle, @@ -117,8 +132,10 @@ mod tests { use vortex_array::IntoArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::assert_arrays_eq; - use vortex_array::validity::Validity::NonNullable; + use vortex_array::patches::Patches; + use vortex_array::validity::Validity; use vortex_buffer::Buffer; + use vortex_buffer::buffer; use vortex_error::VortexExpect; use vortex_session::VortexSession; @@ -138,13 +155,24 @@ mod tests { let encoded_data: Vec = vec![100, 200, 300, 400, 500]; let exponents = Exponents { e: 0, f: 2 }; // multiply by 100 + // Patches + let patches = Patches::new( + 5, + 0, + PrimitiveArray::new(buffer![0u32, 4u32], Validity::NonNullable).into_array(), + PrimitiveArray::new(buffer![0.0f32, 999f32], Validity::NonNullable).into_array(), + None, + ) + .unwrap(); + let alp_array = ALPArray::try_new( - 
PrimitiveArray::new(Buffer::from(encoded_data.clone()), NonNullable).into_array(), + PrimitiveArray::new(Buffer::from(encoded_data.clone()), Validity::NonNullable) + .into_array(), exponents, - None, + Some(patches), )?; - let cpu_result = alp_array.to_canonical()?; + let cpu_result = alp_array.to_canonical()?.into_array(); let gpu_result = ALPExecutor .execute(alp_array.to_array(), &mut cuda_ctx) @@ -154,7 +182,7 @@ mod tests { .await? .into_array(); - assert_arrays_eq!(cpu_result.into_array(), gpu_result); + assert_arrays_eq!(cpu_result, gpu_result); Ok(()) } diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index 4f90ce894f8..afba4453db8 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -66,7 +66,7 @@ macro_rules! launch_cuda_kernel { array_len: $len:expr ) => {{ use ::cudarc::driver::PushKernelArg as _; - let cuda_function = $ctx.load_function($module, $ptypes)?; + let cuda_function = $ctx.load_function_ptype($module, $ptypes)?; let mut launch_builder = $ctx.launch_builder(&cuda_function); $( diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs index f2819900fe1..34fecd7594c 100644 --- a/vortex-cuda/src/kernel/patches/mod.rs +++ b/vortex-cuda/src/kernel/patches/mod.rs @@ -1,32 +1,113 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use vortex_array::Canonical; -use vortex_array::ExecutionCtx; +use cudarc::driver::DeviceRepr; +use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; +use vortex_array::arrays::PrimitiveArrayParts; use vortex_array::patches::Patches; +use vortex_cuda_macros::cuda_tests; use vortex_dtype::NativePType; use vortex_error::VortexResult; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; -#[derive(Debug)] -pub struct PatchesExecutor; +use crate::CudaDeviceBuffer; +use crate::CudaExecutionCtx; +use crate::executor::CudaArrayExt; +use crate::launch_cuda_kernel; 
-pub(crate) async fn execute_patches( +/// Apply a set of patches in-place onto a [`CudaDeviceBuffer`] holding `ValuesT`. +pub(crate) async fn execute_patches< + ValuesT: NativePType + DeviceRepr, + IndicesT: NativePType + DeviceRepr, +>( patches: Patches, - array: Canonical, - ctx: &mut ExecutionCtx, -) -> VortexResult { - let len = array.len(); - let values = array.into_primitive(); - - todo!() - - // Based on the typed indices and values instead...we can apply those - // launch_cuda_kernel!( - // execution_ctx: ctx, - // module: "patches", - // ptypes: &[ValuesT::PTYPE, IndicesT::PTYPE], - // launch_args: [], - // event_recording: CU_EVENT_DISABLE_TIMING, - // array_len: - // ) + target: CudaDeviceBuffer, + ctx: &mut CudaExecutionCtx, +) -> VortexResult { + let indices = patches.indices().clone(); + let values = patches.values().clone(); + drop(patches); + + let indices = indices.execute_cuda(ctx).await?.into_primitive(); + let values = values.execute_cuda(ctx).await?.into_primitive(); + + vortex_ensure!( + indices.ptype() == IndicesT::PTYPE, + "expected PType {} for patch indices, was {}", + IndicesT::PTYPE, + indices.ptype() + ); + + vortex_ensure!( + values.ptype() == ValuesT::PTYPE, + "expected PType {} for patch values, was {}", + ValuesT::PTYPE, + values.ptype() + ); + + let patches_len = indices.len(); + let patches_len_u64 = patches_len as u64; + + let PrimitiveArrayParts { + buffer: indices_buffer, + .. + } = indices.into_parts(); + + let PrimitiveArrayParts { + buffer: values_buffer, + validity: values_validity, + .. + } = values.into_parts(); + + let d_patch_indices = if indices_buffer.is_on_device() { + indices_buffer + } else { + ctx.move_to_device::(indices_buffer)?.await? + }; + + let d_patch_values = if values_buffer.is_on_device() { + values_buffer + } else { + ctx.move_to_device::(values_buffer)?.await? 
+ }; + + let d_patch_indices_buf = d_patch_indices + .as_device() + .as_any() + .downcast_ref::() + .ok_or_else(|| vortex_err!("d_patch_indices must be CudaDeviceBuffer"))?; + + let d_patch_values_buf = d_patch_values + .as_device() + .as_any() + .downcast_ref::() + .ok_or_else(|| vortex_err!("d_patch_values must be CudaDeviceBuffer"))?; + + let d_target_view = target.as_view::(); + let d_patch_indices_view = d_patch_indices_buf.as_view::(); + let d_patch_values_view = d_patch_values_buf.as_view::(); + + // kernel arg order for patches is values, patchIndices, patchValues, patchesLen + let _events = launch_cuda_kernel!( + execution_ctx: ctx, + module: "patches", + ptypes: &[ValuesT::PTYPE, IndicesT::PTYPE], + launch_args: [ + d_target_view, + d_patch_indices_view, + d_patch_values_view, + patches_len_u64, + ], + event_recording: CU_EVENT_DISABLE_TIMING, + array_len: patches_len + ); + + Ok(target) +} + +#[cuda_tests] +mod tests { + #[test] + fn test_impl() {} } From 617ade825a0b919780ae5a1e4657aeda467bab10 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 30 Jan 2026 21:15:27 +0000 Subject: [PATCH 3/5] BP Signed-off-by: Andrew Duffy --- vortex-cuda/src/device_buffer.rs | 3 +- vortex-cuda/src/kernel/encodings/bitpacked.rs | 82 ++++++++-- vortex-cuda/src/kernel/patches/mod.rs | 140 +++++++++++++++--- 3 files changed, 189 insertions(+), 36 deletions(-) diff --git a/vortex-cuda/src/device_buffer.rs b/vortex-cuda/src/device_buffer.rs index 92b8fe08233..520f4551e96 100644 --- a/vortex-cuda/src/device_buffer.rs +++ b/vortex-cuda/src/device_buffer.rs @@ -27,6 +27,7 @@ use crate::stream::await_stream_callback; /// A [`DeviceBuffer`] wrapping a CUDA GPU allocation. /// /// Like the host `BufferHandle` variant, all slicing/referencing works in terms of byte units. 
+#[derive(Clone)] pub struct CudaDeviceBuffer { allocation: Arc, /// Offset in bytes from the start of the allocation @@ -39,8 +40,6 @@ pub struct CudaDeviceBuffer { alignment: Alignment, } -// We can call the sys methods, it's just a lot of extra code...fuck that lol - mod private { use std::fmt::Debug; use std::sync::Arc; diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index 635ee11a57b..c1e905aff77 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -2,6 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use std::fmt::Debug; +use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; @@ -16,6 +17,8 @@ use vortex_array::buffer::DeviceBufferExt; use vortex_cuda_macros::cuda_tests; use vortex_dtype::NativePType; use vortex_dtype::match_each_integer_ptype; +use vortex_dtype::match_each_unsigned_integer_ptype; +use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_ensure; use vortex_error::vortex_err; @@ -29,6 +32,7 @@ use crate::CudaDeviceBuffer; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; use crate::kernel::launch_cuda_kernel_with_config; +use crate::kernel::patches::execute_patches; /// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression. 
#[derive(Debug)] @@ -74,7 +78,6 @@ where } = array.into_parts(); vortex_ensure!(len > 0, "Non empty array"); - vortex_ensure!(patches.is_none(), "Patches not supported"); let offset = offset as usize; let device_input: BufferHandle = if packed.is_on_device() { @@ -97,27 +100,46 @@ where let thread_count = if bits == 64 { 16 } else { 32 }; let suffixes: [&str; _] = [&format!("{bit_width}bw"), &format!("{thread_count}t")]; let cuda_function = ctx.load_function(&format!("bit_unpack_{}", bits), &suffixes)?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - // Build launch args: input, output, f, e, length - launch_builder.arg(&input_view); - launch_builder.arg(&output_view); + { + let mut launch_builder = ctx.launch_builder(&cuda_function); - let num_blocks = u32::try_from(len.div_ceil(1024))?; + // Build launch args: input, output, f, e, length + launch_builder.arg(&input_view); + launch_builder.arg(&output_view); - let config = LaunchConfig { - grid_dim: (num_blocks, 1, 1), - block_dim: (thread_count, 1, 1), - shared_mem_bytes: 0, - }; + let num_blocks = u32::try_from(len.div_ceil(1024))?; + + let config = LaunchConfig { + grid_dim: (num_blocks, 1, 1), + block_dim: (thread_count, 1, 1), + shared_mem_bytes: 0, + }; - // Launch kernel - let _cuda_events = - launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_DISABLE_TIMING)?; + // Launch kernel + let _cuda_events = + launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_DISABLE_TIMING)?; + } + + let output_handle = match patches { + None => BufferHandle::new_device(output_buf.slice_typed::(offset..(offset + len))), + Some(p) => { + let output_buf = output_buf.slice_typed::(offset..(offset + len)); + let buf = output_buf + .as_any() + .downcast_ref::() + .vortex_expect("we created this as CudaDeviceBuffer") + .clone(); + + let patched_buf = match_each_unsigned_integer_ptype!(p.indices_ptype()?, |I| { + execute_patches::(p, buf, ctx).await? 
+ }); + + BufferHandle::new_device(Arc::new(patched_buf)) + } + }; // Build result with newly allocated buffer - let output_handle = - BufferHandle::new_device(output_buf.slice_typed::(offset..(offset + len))); Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( output_handle, A::PTYPE, @@ -141,6 +163,34 @@ mod tests { use crate::CanonicalCudaExt; use crate::session::CudaSession; + #[test] + fn test_patches() -> VortexResult<()> { + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context"); + + let array = PrimitiveArray::new((0u16..=513).collect::>(), NonNullable); + + // Last two items should be patched + let bp_with_patches = BitPackedArray::encode(array.as_ref(), 9)?; + assert!(bp_with_patches.patches().is_some()); + + let cpu_result = bp_with_patches.to_canonical()?.into_array(); + + let gpu_result = block_on(async { + BitPackedExecutor + .execute(bp_with_patches.to_array(), &mut cuda_ctx) + .await + .vortex_expect("GPU decompression failed") + .into_host() + .await + .map(|a| a.into_array()) + })?; + + assert_arrays_eq!(cpu_result, gpu_result); + + Ok(()) + } + #[rstest] #[case::bw_1(1)] #[case::bw_2(2)] diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs index 34fecd7594c..9922c7bf56d 100644 --- a/vortex-cuda/src/kernel/patches/mod.rs +++ b/vortex-cuda/src/kernel/patches/mod.rs @@ -5,12 +5,14 @@ use cudarc::driver::DeviceRepr; use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_array::arrays::PrimitiveArrayParts; use vortex_array::patches::Patches; +use vortex_array::validity::Validity; +use vortex_array::vtable::ValidityHelper; use vortex_cuda_macros::cuda_tests; use vortex_dtype::NativePType; use vortex_error::VortexResult; use vortex_error::vortex_ensure; -use vortex_error::vortex_err; +use crate::CudaBufferExt; use crate::CudaDeviceBuffer; use crate::CudaExecutionCtx; use crate::executor::CudaArrayExt; @@ -32,6 
+34,15 @@ pub(crate) async fn execute_patches< let indices = indices.execute_cuda(ctx).await?.into_primitive(); let values = values.execute_cuda(ctx).await?.into_primitive(); + let supported = matches!( + values.validity(), + Validity::NonNullable | Validity::AllValid + ); + vortex_ensure!( + supported, + "Applying patches with null values not currently supported on the GPU" + ); + vortex_ensure!( indices.ptype() == IndicesT::PTYPE, "expected PType {} for patch indices, was {}", @@ -56,7 +67,6 @@ pub(crate) async fn execute_patches< let PrimitiveArrayParts { buffer: values_buffer, - validity: values_validity, .. } = values.into_parts(); @@ -72,21 +82,9 @@ pub(crate) async fn execute_patches< ctx.move_to_device::(values_buffer)?.await? }; - let d_patch_indices_buf = d_patch_indices - .as_device() - .as_any() - .downcast_ref::() - .ok_or_else(|| vortex_err!("d_patch_indices must be CudaDeviceBuffer"))?; - - let d_patch_values_buf = d_patch_values - .as_device() - .as_any() - .downcast_ref::() - .ok_or_else(|| vortex_err!("d_patch_values must be CudaDeviceBuffer"))?; - let d_target_view = target.as_view::(); - let d_patch_indices_view = d_patch_indices_buf.as_view::(); - let d_patch_values_view = d_patch_values_buf.as_view::(); + let d_patch_indices_view = d_patch_indices.cuda_view::()?; + let d_patch_values_view = d_patch_values.cuda_view::()?; // kernel arg order for patches is values, patchIndices, patchValues, patchesLen let _events = launch_cuda_kernel!( @@ -108,6 +106,112 @@ pub(crate) async fn execute_patches< #[cuda_tests] mod tests { - #[test] - fn test_impl() {} + use std::sync::Arc; + + use cudarc::driver::DeviceRepr; + use vortex_array::IntoArray; + use vortex_array::ToCanonical; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::PrimitiveArrayParts; + use vortex_array::assert_arrays_eq; + use vortex_array::buffer::BufferHandle; + use vortex_array::compute::cast; + use vortex_array::patches::Patches; + use 
vortex_array::validity::Validity; + use vortex_buffer::buffer; + use vortex_dtype::DType; + use vortex_dtype::NativePType; + use vortex_dtype::Nullability; + use vortex_session::VortexSession; + + use crate::CanonicalCudaExt; + use crate::CudaDeviceBuffer; + use crate::CudaSession; + use crate::kernel::patches::execute_patches; + + #[tokio::test] + async fn test_patches() { + test_case::().await; + test_case::().await; + test_case::().await; + test_case::().await; + + test_case::().await; + test_case::().await; + test_case::().await; + test_case::().await; + + test_case::().await; + test_case::().await; + } + + async fn test_case() { + full_test_case::().await; + full_test_case::().await; + full_test_case::().await; + full_test_case::().await; + } + + async fn full_test_case() { + let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty()).unwrap(); + + let values = PrimitiveArray::from_iter(0..128); + let values = force_cast::(values); + + let patch_idx = PrimitiveArray::new(buffer![0, 8, 16, 32], Validity::NonNullable); + let patch_idx = force_cast::(patch_idx); + + let patch_val = PrimitiveArray::new(buffer![99, 99, 99, 99], Validity::NonNullable); + let patch_val = force_cast::(patch_val); + + // Copy all to GPU + let patches = + Patches::new(128, 0, patch_idx.into_array(), patch_val.into_array(), None).unwrap(); + + let cpu_result = values.clone().patch(&patches).unwrap(); + + let PrimitiveArrayParts { + buffer: cuda_buffer, + .. 
+ } = values.into_parts(); + + let handle = ctx + .move_to_device::(cuda_buffer) + .unwrap() + .await + .unwrap(); + let device_buf = handle + .as_device() + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + + let patched_buf = execute_patches::(patches, device_buf, &mut ctx) + .await + .unwrap(); + + let gpu_result = PrimitiveArray::from_buffer_handle( + BufferHandle::new_device(Arc::new(patched_buf)), + Values::PTYPE, + Validity::NonNullable, + ) + .to_canonical() + .unwrap() + .into_host() + .await + .unwrap() + .into_primitive(); + + assert_arrays_eq!(cpu_result, gpu_result); + } + + fn force_cast(array: PrimitiveArray) -> PrimitiveArray { + cast( + array.as_ref(), + &DType::Primitive(T::PTYPE, Nullability::NonNullable), + ) + .unwrap() + .to_primitive() + } } From 7371736875db42c9ad2ee857898de5d3e2f9bbc3 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 2 Feb 2026 09:56:38 -0500 Subject: [PATCH 4/5] fix Signed-off-by: Andrew Duffy --- vortex-cuda/kernels/src/patches.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vortex-cuda/kernels/src/patches.cu b/vortex-cuda/kernels/src/patches.cu index 3358f1950c5..2fecc926569 100644 --- a/vortex-cuda/kernels/src/patches.cu +++ b/vortex-cuda/kernels/src/patches.cu @@ -3,6 +3,8 @@ #include +// TODO(aduffy): this is very naive. In the future we need to +// transpose the patches, see G-ALP paper. 
// Apply patches to a source array template __device__ void patches( @@ -13,7 +15,7 @@ __device__ void patches( ) { const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx > patchesLen) { + if (idx >= patchesLen) { return; } From 7e9717e626b7e7cdd1781189b45b50bb2ee4f3d6 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 2 Feb 2026 10:01:24 -0500 Subject: [PATCH 5/5] start/stop fix Signed-off-by: Andrew Duffy --- vortex-cuda/kernels/src/config.cuh | 7 +++++++ vortex-cuda/kernels/src/patches.cu | 18 +++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/vortex-cuda/kernels/src/config.cuh b/vortex-cuda/kernels/src/config.cuh index de679a24a6f..cf7fba2e848 100644 --- a/vortex-cuda/kernels/src/config.cuh +++ b/vortex-cuda/kernels/src/config.cuh @@ -3,6 +3,8 @@ #pragma once +#include + // Kernel launch configuration constants. // Must match the Rust launch config in src/kernel/mod.rs. // @@ -10,3 +12,8 @@ // elements_per_block = 64 * 32 = 2048 // grid_dim = ceil(array_len / 2048) constexpr uint32_t ELEMENTS_PER_THREAD = 32; + +#define MIN(a, b) (((a) < (b)) ? (a) : (b)) + +#define START_ELEM(idx, len) MIN((idx) * ELEMENTS_PER_THREAD, (len)) +#define STOP_ELEM(idx, len) MIN(START_ELEM(idx, len) + ELEMENTS_PER_THREAD, (len)) diff --git a/vortex-cuda/kernels/src/patches.cu b/vortex-cuda/kernels/src/patches.cu index 2fecc926569..fc5b611a828 100644 --- a/vortex-cuda/kernels/src/patches.cu +++ b/vortex-cuda/kernels/src/patches.cu @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -#include +#include "config.cuh" // TODO(aduffy): this is very naive. In the future we need to // transpose the patches, see G-ALP paper. 
@@ -13,17 +13,21 @@ __device__ void patches(
     const ValueT *const patchValues,
     uint64_t patchesLen
 ) {
-    const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    // Widen before multiply: blockIdx.x * blockDim.x overflows uint32 for very large grids.
+    const uint64_t worker = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    const uint64_t startElem = START_ELEM(worker, patchesLen);
+    const uint64_t stopElem = STOP_ELEM(worker, patchesLen);
 
-    if (idx >= patchesLen) {
+    if (startElem >= patchesLen) {
         return;
     }
 
-    const IndexT patchIdx = patchIndices[idx];
-    const ValueT patchVal = patchValues[idx];
+    for (uint64_t idx = startElem; idx < stopElem; idx++) {
+        const IndexT patchIdx = patchIndices[idx];
+        const ValueT patchVal = patchValues[idx];
 
-    const size_t valueIdx = static_cast<size_t>(patchIdx);
-    values[valueIdx] = patchVal;
+        const size_t valueIdx = static_cast<size_t>(patchIdx);
+        values[valueIdx] = patchVal;
+    }
 }
 
 #define GENERATE_PATCHES_KERNEL(ValueT, value_suffix, IndexT, index_suffix) \