From d3687839a787af6c0e233e57077751c4f06b44c3 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 30 Jan 2026 13:30:38 -0500 Subject: [PATCH 1/5] save Signed-off-by: Andrew Duffy --- vortex-cuda/kernels/patches.cu | 27 ++++++++++++++++++++++ vortex-cuda/src/kernel/mod.rs | 1 + vortex-cuda/src/kernel/patches/mod.rs | 32 +++++++++++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 vortex-cuda/kernels/patches.cu create mode 100644 vortex-cuda/src/kernel/patches/mod.rs diff --git a/vortex-cuda/kernels/patches.cu b/vortex-cuda/kernels/patches.cu new file mode 100644 index 00000000000..f237ccee057 --- /dev/null +++ b/vortex-cuda/kernels/patches.cu @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +// Apply patches to a source array +template +__device__ void patches_apply_inplace( + ValuesT *const values, + const IndexT *const patchIndices, + const ValueT *const patchValues, + uint64_t valuesLen, + uint64_t patchesLen, +) { + const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx > patchesLen) { + return; + } + + const IndexT patchIdx = patchIndices[idx]; + const ValueT patchVal = patchValues[idx]; + + const size_t valueIdx = static_cast(patchIdx); + values[valueIdx] = patchVal; +} + +#define GENERATE_PATCHES_KERNEL(ValuesT, IndicesT) \ +extern "C" __global__ patches_apply_inplace \ No newline at end of file diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index 51ba8cc55bd..4f90ce894f8 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -23,6 +23,7 @@ use vortex_utils::aliases::dash_map::DashMap; mod arrays; mod encodings; mod filter; +mod patches; mod slice; pub use arrays::DictExecutor; diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs new file mode 100644 index 00000000000..f2819900fe1 --- /dev/null +++ b/vortex-cuda/src/kernel/patches/mod.rs @@ -0,0 +1,32 @@ +// 
SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::patches::Patches; +use vortex_dtype::NativePType; +use vortex_error::VortexResult; + +#[derive(Debug)] +pub struct PatchesExecutor; + +pub(crate) async fn execute_patches( + patches: Patches, + array: Canonical, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let len = array.len(); + let values = array.into_primitive(); + + todo!() + + // Based on the typed indices and values instead...we can apply those + // launch_cuda_kernel!( + // execution_ctx: ctx, + // module: "patches", + // ptypes: &[ValuesT::PTYPE, IndicesT::PTYPE], + // launch_args: [], + // event_recording: CU_EVENT_DISABLE_TIMING, + // array_len: + // ) +} From c12d59893eddf3bbfddf38c7c9b6043ed7d6286a Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 30 Jan 2026 20:59:59 +0000 Subject: [PATCH 2/5] feat[cuda]: patches application kernel Signed-off-by: Andrew Duffy --- vortex-cuda/benches/dict_cuda.rs | 2 +- vortex-cuda/benches/for_cuda.rs | 8 +- vortex-cuda/kernels/patches.cu | 27 ----- vortex-cuda/kernels/src/patches.cu | 55 +++++++++++ vortex-cuda/src/kernel/arrays/dict.rs | 2 +- vortex-cuda/src/kernel/encodings/alp.rs | 60 +++++++++--- vortex-cuda/src/kernel/mod.rs | 2 +- vortex-cuda/src/kernel/patches/mod.rs | 125 +++++++++++++++++++----- 8 files changed, 209 insertions(+), 72 deletions(-) delete mode 100644 vortex-cuda/kernels/patches.cu create mode 100644 vortex-cuda/kernels/src/patches.cu diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs index 0142cb8842d..74b7f30c0e6 100644 --- a/vortex-cuda/benches/dict_cuda.rs +++ b/vortex-cuda/benches/dict_cuda.rs @@ -101,7 +101,7 @@ fn launch_dict_kernel_timed -__device__ void patches_apply_inplace( - ValuesT *const values, - const IndexT *const patchIndices, - const ValueT *const patchValues, - uint64_t valuesLen, - uint64_t patchesLen, -) { 
- const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (idx > patchesLen) { - return; - } - - const IndexT patchIdx = patchIndices[idx]; - const ValueT patchVal = patchValues[idx]; - - const size_t valueIdx = static_cast(patchIdx); - values[valueIdx] = patchVal; -} - -#define GENERATE_PATCHES_KERNEL(ValuesT, IndicesT) \ -extern "C" __global__ patches_apply_inplace \ No newline at end of file diff --git a/vortex-cuda/kernels/src/patches.cu b/vortex-cuda/kernels/src/patches.cu new file mode 100644 index 00000000000..3358f1950c5 --- /dev/null +++ b/vortex-cuda/kernels/src/patches.cu @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#include + +// Apply patches to a source array +template +__device__ void patches( + ValueT *const values, + const IndexT *const patchIndices, + const ValueT *const patchValues, + uint64_t patchesLen +) { + const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx > patchesLen) { + return; + } + + const IndexT patchIdx = patchIndices[idx]; + const ValueT patchVal = patchValues[idx]; + + const size_t valueIdx = static_cast(patchIdx); + values[valueIdx] = patchVal; +} + +#define GENERATE_PATCHES_KERNEL(ValueT, value_suffix, IndexT, index_suffix) \ +extern "C" __global__ void patches_##value_suffix##_##index_suffix( \ + ValueT *const values, \ + const IndexT *const patchIndices, \ + const ValueT *const patchValues, \ + uint64_t patchesLen \ +) { \ + patches(values, patchIndices, patchValues, patchesLen); \ +} + +#define GENERATE_PATCHES_KERNEL_FOR_VALUE(ValueT, value_suffix) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint8_t, u8) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint16_t, u16) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint32_t, u32) \ + GENERATE_PATCHES_KERNEL(ValueT, value_suffix, uint64_t, u64) + + +GENERATE_PATCHES_KERNEL_FOR_VALUE(uint8_t, u8) +GENERATE_PATCHES_KERNEL_FOR_VALUE(uint16_t, u16) 
+GENERATE_PATCHES_KERNEL_FOR_VALUE(uint32_t, u32) +GENERATE_PATCHES_KERNEL_FOR_VALUE(uint64_t, u64) + +GENERATE_PATCHES_KERNEL_FOR_VALUE(int8_t, i8) +GENERATE_PATCHES_KERNEL_FOR_VALUE(int16_t, i16) +GENERATE_PATCHES_KERNEL_FOR_VALUE(int32_t, i32) +GENERATE_PATCHES_KERNEL_FOR_VALUE(int64_t, i64) + +GENERATE_PATCHES_KERNEL_FOR_VALUE(float, f32) +GENERATE_PATCHES_KERNEL_FOR_VALUE(double, f64) diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs index c9cb27f2cd3..02b02dca2c9 100644 --- a/vortex-cuda/src/kernel/arrays/dict.rs +++ b/vortex-cuda/src/kernel/arrays/dict.rs @@ -129,7 +129,7 @@ async fn execute_dict_prim_typed(patches.clone(), output_buf, ctx).await? + }) + } else { + output_buf + }; - // Launch kernel - let _cuda_events = - launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?; + // TODO(aduffy): scatter patch values validity. There are several places we'll need to start + // handling validity. - // Build result with newly allocated buffer let output_handle = BufferHandle::new_device(Arc::new(output_buf)); Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( output_handle, @@ -117,8 +132,10 @@ mod tests { use vortex_array::IntoArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::assert_arrays_eq; - use vortex_array::validity::Validity::NonNullable; + use vortex_array::patches::Patches; + use vortex_array::validity::Validity; use vortex_buffer::Buffer; + use vortex_buffer::buffer; use vortex_error::VortexExpect; use vortex_session::VortexSession; @@ -138,13 +155,24 @@ mod tests { let encoded_data: Vec = vec![100, 200, 300, 400, 500]; let exponents = Exponents { e: 0, f: 2 }; // multiply by 100 + // Patches + let patches = Patches::new( + 5, + 0, + PrimitiveArray::new(buffer![0u32, 4u32], Validity::NonNullable).into_array(), + PrimitiveArray::new(buffer![0.0f32, 999f32], Validity::NonNullable).into_array(), + None, + ) + .unwrap(); + let alp_array = ALPArray::try_new( - 
PrimitiveArray::new(Buffer::from(encoded_data.clone()), NonNullable).into_array(), + PrimitiveArray::new(Buffer::from(encoded_data.clone()), Validity::NonNullable) + .into_array(), exponents, - None, + Some(patches), )?; - let cpu_result = alp_array.to_canonical()?; + let cpu_result = alp_array.to_canonical()?.into_array(); let gpu_result = ALPExecutor .execute(alp_array.to_array(), &mut cuda_ctx) @@ -154,7 +182,7 @@ mod tests { .await? .into_array(); - assert_arrays_eq!(cpu_result.into_array(), gpu_result); + assert_arrays_eq!(cpu_result, gpu_result); Ok(()) } diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index 4f90ce894f8..afba4453db8 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -66,7 +66,7 @@ macro_rules! launch_cuda_kernel { array_len: $len:expr ) => {{ use ::cudarc::driver::PushKernelArg as _; - let cuda_function = $ctx.load_function($module, $ptypes)?; + let cuda_function = $ctx.load_function_ptype($module, $ptypes)?; let mut launch_builder = $ctx.launch_builder(&cuda_function); $( diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs index f2819900fe1..34fecd7594c 100644 --- a/vortex-cuda/src/kernel/patches/mod.rs +++ b/vortex-cuda/src/kernel/patches/mod.rs @@ -1,32 +1,113 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use vortex_array::Canonical; -use vortex_array::ExecutionCtx; +use cudarc::driver::DeviceRepr; +use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; +use vortex_array::arrays::PrimitiveArrayParts; use vortex_array::patches::Patches; +use vortex_cuda_macros::cuda_tests; use vortex_dtype::NativePType; use vortex_error::VortexResult; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; -#[derive(Debug)] -pub struct PatchesExecutor; +use crate::CudaDeviceBuffer; +use crate::CudaExecutionCtx; +use crate::executor::CudaArrayExt; +use crate::launch_cuda_kernel; 
-pub(crate) async fn execute_patches( +/// Apply a set of patches in-place onto a [`CudaDeviceBuffer`] holding `ValuesT`. +pub(crate) async fn execute_patches< + ValuesT: NativePType + DeviceRepr, + IndicesT: NativePType + DeviceRepr, +>( patches: Patches, - array: Canonical, - ctx: &mut ExecutionCtx, -) -> VortexResult { - let len = array.len(); - let values = array.into_primitive(); - - todo!() - - // Based on the typed indices and values instead...we can apply those - // launch_cuda_kernel!( - // execution_ctx: ctx, - // module: "patches", - // ptypes: &[ValuesT::PTYPE, IndicesT::PTYPE], - // launch_args: [], - // event_recording: CU_EVENT_DISABLE_TIMING, - // array_len: - // ) + target: CudaDeviceBuffer, + ctx: &mut CudaExecutionCtx, +) -> VortexResult { + let indices = patches.indices().clone(); + let values = patches.values().clone(); + drop(patches); + + let indices = indices.execute_cuda(ctx).await?.into_primitive(); + let values = values.execute_cuda(ctx).await?.into_primitive(); + + vortex_ensure!( + indices.ptype() == IndicesT::PTYPE, + "expected PType {} for patch indices, was {}", + IndicesT::PTYPE, + indices.ptype() + ); + + vortex_ensure!( + values.ptype() == ValuesT::PTYPE, + "expected PType {} for patch values, was {}", + ValuesT::PTYPE, + values.ptype() + ); + + let patches_len = indices.len(); + let patches_len_u64 = patches_len as u64; + + let PrimitiveArrayParts { + buffer: indices_buffer, + .. + } = indices.into_parts(); + + let PrimitiveArrayParts { + buffer: values_buffer, + validity: values_validity, + .. + } = values.into_parts(); + + let d_patch_indices = if indices_buffer.is_on_device() { + indices_buffer + } else { + ctx.move_to_device::(indices_buffer)?.await? + }; + + let d_patch_values = if values_buffer.is_on_device() { + values_buffer + } else { + ctx.move_to_device::(values_buffer)?.await? 
+ }; + + let d_patch_indices_buf = d_patch_indices + .as_device() + .as_any() + .downcast_ref::() + .ok_or_else(|| vortex_err!("d_patch_indices must be CudaDeviceBuffer"))?; + + let d_patch_values_buf = d_patch_values + .as_device() + .as_any() + .downcast_ref::() + .ok_or_else(|| vortex_err!("d_patch_values must be CudaDeviceBuffer"))?; + + let d_target_view = target.as_view::(); + let d_patch_indices_view = d_patch_indices_buf.as_view::(); + let d_patch_values_view = d_patch_values_buf.as_view::(); + + // kernel arg order for patches is values, patchIndices, patchValues, patchesLen + let _events = launch_cuda_kernel!( + execution_ctx: ctx, + module: "patches", + ptypes: &[ValuesT::PTYPE, IndicesT::PTYPE], + launch_args: [ + d_target_view, + d_patch_indices_view, + d_patch_values_view, + patches_len_u64, + ], + event_recording: CU_EVENT_DISABLE_TIMING, + array_len: patches_len + ); + + Ok(target) +} + +#[cuda_tests] +mod tests { + #[test] + fn test_impl() {} } From 617ade825a0b919780ae5a1e4657aeda467bab10 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 30 Jan 2026 21:15:27 +0000 Subject: [PATCH 3/5] BP Signed-off-by: Andrew Duffy --- vortex-cuda/src/device_buffer.rs | 3 +- vortex-cuda/src/kernel/encodings/bitpacked.rs | 82 ++++++++-- vortex-cuda/src/kernel/patches/mod.rs | 140 +++++++++++++++--- 3 files changed, 189 insertions(+), 36 deletions(-) diff --git a/vortex-cuda/src/device_buffer.rs b/vortex-cuda/src/device_buffer.rs index 92b8fe08233..520f4551e96 100644 --- a/vortex-cuda/src/device_buffer.rs +++ b/vortex-cuda/src/device_buffer.rs @@ -27,6 +27,7 @@ use crate::stream::await_stream_callback; /// A [`DeviceBuffer`] wrapping a CUDA GPU allocation. /// /// Like the host `BufferHandle` variant, all slicing/referencing works in terms of byte units. 
+#[derive(Clone)] pub struct CudaDeviceBuffer { allocation: Arc, /// Offset in bytes from the start of the allocation @@ -39,8 +40,6 @@ pub struct CudaDeviceBuffer { alignment: Alignment, } -// We can call the sys methods, it's just a lot of extra code...fuck that lol - mod private { use std::fmt::Debug; use std::sync::Arc; diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index 635ee11a57b..c1e905aff77 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -2,6 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use std::fmt::Debug; +use std::sync::Arc; use async_trait::async_trait; use cudarc::driver::DeviceRepr; @@ -16,6 +17,8 @@ use vortex_array::buffer::DeviceBufferExt; use vortex_cuda_macros::cuda_tests; use vortex_dtype::NativePType; use vortex_dtype::match_each_integer_ptype; +use vortex_dtype::match_each_unsigned_integer_ptype; +use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_ensure; use vortex_error::vortex_err; @@ -29,6 +32,7 @@ use crate::CudaDeviceBuffer; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; use crate::kernel::launch_cuda_kernel_with_config; +use crate::kernel::patches::execute_patches; /// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression. 
#[derive(Debug)] @@ -74,7 +78,6 @@ where } = array.into_parts(); vortex_ensure!(len > 0, "Non empty array"); - vortex_ensure!(patches.is_none(), "Patches not supported"); let offset = offset as usize; let device_input: BufferHandle = if packed.is_on_device() { @@ -97,27 +100,46 @@ where let thread_count = if bits == 64 { 16 } else { 32 }; let suffixes: [&str; _] = [&format!("{bit_width}bw"), &format!("{thread_count}t")]; let cuda_function = ctx.load_function(&format!("bit_unpack_{}", bits), &suffixes)?; - let mut launch_builder = ctx.launch_builder(&cuda_function); - // Build launch args: input, output, f, e, length - launch_builder.arg(&input_view); - launch_builder.arg(&output_view); + { + let mut launch_builder = ctx.launch_builder(&cuda_function); - let num_blocks = u32::try_from(len.div_ceil(1024))?; + // Build launch args: input, output, f, e, length + launch_builder.arg(&input_view); + launch_builder.arg(&output_view); - let config = LaunchConfig { - grid_dim: (num_blocks, 1, 1), - block_dim: (thread_count, 1, 1), - shared_mem_bytes: 0, - }; + let num_blocks = u32::try_from(len.div_ceil(1024))?; + + let config = LaunchConfig { + grid_dim: (num_blocks, 1, 1), + block_dim: (thread_count, 1, 1), + shared_mem_bytes: 0, + }; - // Launch kernel - let _cuda_events = - launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_DISABLE_TIMING)?; + // Launch kernel + let _cuda_events = + launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_DISABLE_TIMING)?; + } + + let output_handle = match patches { + None => BufferHandle::new_device(output_buf.slice_typed::(offset..(offset + len))), + Some(p) => { + let output_buf = output_buf.slice_typed::(offset..(offset + len)); + let buf = output_buf + .as_any() + .downcast_ref::() + .vortex_expect("we created this as CudaDeviceBuffer") + .clone(); + + let patched_buf = match_each_unsigned_integer_ptype!(p.indices_ptype()?, |I| { + execute_patches::(p, buf, ctx).await? 
+ }); + + BufferHandle::new_device(Arc::new(patched_buf)) + } + }; // Build result with newly allocated buffer - let output_handle = - BufferHandle::new_device(output_buf.slice_typed::(offset..(offset + len))); Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( output_handle, A::PTYPE, @@ -141,6 +163,34 @@ mod tests { use crate::CanonicalCudaExt; use crate::session::CudaSession; + #[test] + fn test_patches() -> VortexResult<()> { + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) + .vortex_expect("failed to create execution context"); + + let array = PrimitiveArray::new((0u16..=513).collect::>(), NonNullable); + + // Last two items should be patched + let bp_with_patches = BitPackedArray::encode(array.as_ref(), 9)?; + assert!(bp_with_patches.patches().is_some()); + + let cpu_result = bp_with_patches.to_canonical()?.into_array(); + + let gpu_result = block_on(async { + BitPackedExecutor + .execute(bp_with_patches.to_array(), &mut cuda_ctx) + .await + .vortex_expect("GPU decompression failed") + .into_host() + .await + .map(|a| a.into_array()) + })?; + + assert_arrays_eq!(cpu_result, gpu_result); + + Ok(()) + } + #[rstest] #[case::bw_1(1)] #[case::bw_2(2)] diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs index 34fecd7594c..9922c7bf56d 100644 --- a/vortex-cuda/src/kernel/patches/mod.rs +++ b/vortex-cuda/src/kernel/patches/mod.rs @@ -5,12 +5,14 @@ use cudarc::driver::DeviceRepr; use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; use vortex_array::arrays::PrimitiveArrayParts; use vortex_array::patches::Patches; +use vortex_array::validity::Validity; +use vortex_array::vtable::ValidityHelper; use vortex_cuda_macros::cuda_tests; use vortex_dtype::NativePType; use vortex_error::VortexResult; use vortex_error::vortex_ensure; -use vortex_error::vortex_err; +use crate::CudaBufferExt; use crate::CudaDeviceBuffer; use crate::CudaExecutionCtx; use crate::executor::CudaArrayExt; @@ -32,6 
+34,15 @@ pub(crate) async fn execute_patches< let indices = indices.execute_cuda(ctx).await?.into_primitive(); let values = values.execute_cuda(ctx).await?.into_primitive(); + let supported = matches!( + values.validity(), + Validity::NonNullable | Validity::AllValid + ); + vortex_ensure!( + supported, + "Applying patches with null values not currently supported on the GPU" + ); + vortex_ensure!( indices.ptype() == IndicesT::PTYPE, "expected PType {} for patch indices, was {}", @@ -56,7 +67,6 @@ pub(crate) async fn execute_patches< let PrimitiveArrayParts { buffer: values_buffer, - validity: values_validity, .. } = values.into_parts(); @@ -72,21 +82,9 @@ pub(crate) async fn execute_patches< ctx.move_to_device::(values_buffer)?.await? }; - let d_patch_indices_buf = d_patch_indices - .as_device() - .as_any() - .downcast_ref::() - .ok_or_else(|| vortex_err!("d_patch_indices must be CudaDeviceBuffer"))?; - - let d_patch_values_buf = d_patch_values - .as_device() - .as_any() - .downcast_ref::() - .ok_or_else(|| vortex_err!("d_patch_values must be CudaDeviceBuffer"))?; - let d_target_view = target.as_view::(); - let d_patch_indices_view = d_patch_indices_buf.as_view::(); - let d_patch_values_view = d_patch_values_buf.as_view::(); + let d_patch_indices_view = d_patch_indices.cuda_view::()?; + let d_patch_values_view = d_patch_values.cuda_view::()?; // kernel arg order for patches is values, patchIndices, patchValues, patchesLen let _events = launch_cuda_kernel!( @@ -108,6 +106,112 @@ pub(crate) async fn execute_patches< #[cuda_tests] mod tests { - #[test] - fn test_impl() {} + use std::sync::Arc; + + use cudarc::driver::DeviceRepr; + use vortex_array::IntoArray; + use vortex_array::ToCanonical; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::PrimitiveArrayParts; + use vortex_array::assert_arrays_eq; + use vortex_array::buffer::BufferHandle; + use vortex_array::compute::cast; + use vortex_array::patches::Patches; + use 
vortex_array::validity::Validity; + use vortex_buffer::buffer; + use vortex_dtype::DType; + use vortex_dtype::NativePType; + use vortex_dtype::Nullability; + use vortex_session::VortexSession; + + use crate::CanonicalCudaExt; + use crate::CudaDeviceBuffer; + use crate::CudaSession; + use crate::kernel::patches::execute_patches; + + #[tokio::test] + async fn test_patches() { + test_case::().await; + test_case::().await; + test_case::().await; + test_case::().await; + + test_case::().await; + test_case::().await; + test_case::().await; + test_case::().await; + + test_case::().await; + test_case::().await; + } + + async fn test_case() { + full_test_case::().await; + full_test_case::().await; + full_test_case::().await; + full_test_case::().await; + } + + async fn full_test_case() { + let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty()).unwrap(); + + let values = PrimitiveArray::from_iter(0..128); + let values = force_cast::(values); + + let patch_idx = PrimitiveArray::new(buffer![0, 8, 16, 32], Validity::NonNullable); + let patch_idx = force_cast::(patch_idx); + + let patch_val = PrimitiveArray::new(buffer![99, 99, 99, 99], Validity::NonNullable); + let patch_val = force_cast::(patch_val); + + // Copy all to GPU + let patches = + Patches::new(128, 0, patch_idx.into_array(), patch_val.into_array(), None).unwrap(); + + let cpu_result = values.clone().patch(&patches).unwrap(); + + let PrimitiveArrayParts { + buffer: cuda_buffer, + .. 
+ } = values.into_parts(); + + let handle = ctx + .move_to_device::(cuda_buffer) + .unwrap() + .await + .unwrap(); + let device_buf = handle + .as_device() + .as_any() + .downcast_ref::() + .unwrap() + .clone(); + + let patched_buf = execute_patches::(patches, device_buf, &mut ctx) + .await + .unwrap(); + + let gpu_result = PrimitiveArray::from_buffer_handle( + BufferHandle::new_device(Arc::new(patched_buf)), + Values::PTYPE, + Validity::NonNullable, + ) + .to_canonical() + .unwrap() + .into_host() + .await + .unwrap() + .into_primitive(); + + assert_arrays_eq!(cpu_result, gpu_result); + } + + fn force_cast(array: PrimitiveArray) -> PrimitiveArray { + cast( + array.as_ref(), + &DType::Primitive(T::PTYPE, Nullability::NonNullable), + ) + .unwrap() + .to_primitive() + } } From 7371736875db42c9ad2ee857898de5d3e2f9bbc3 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 2 Feb 2026 09:56:38 -0500 Subject: [PATCH 4/5] fix Signed-off-by: Andrew Duffy --- vortex-cuda/kernels/src/patches.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vortex-cuda/kernels/src/patches.cu b/vortex-cuda/kernels/src/patches.cu index 3358f1950c5..2fecc926569 100644 --- a/vortex-cuda/kernels/src/patches.cu +++ b/vortex-cuda/kernels/src/patches.cu @@ -3,6 +3,8 @@ #include +// TODO(aduffy): this is very naive. In the future we need to +// transpose the patches, see G-ALP paper. 
// Apply patches to a source array template __device__ void patches( @@ -13,7 +15,7 @@ __device__ void patches( ) { const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx > patchesLen) { + if (idx >= patchesLen) { return; } From 7e9717e626b7e7cdd1781189b45b50bb2ee4f3d6 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 2 Feb 2026 10:01:24 -0500 Subject: [PATCH 5/5] start/stop fix Signed-off-by: Andrew Duffy --- vortex-cuda/kernels/src/config.cuh | 7 +++++++ vortex-cuda/kernels/src/patches.cu | 18 +++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/vortex-cuda/kernels/src/config.cuh b/vortex-cuda/kernels/src/config.cuh index de679a24a6f..cf7fba2e848 100644 --- a/vortex-cuda/kernels/src/config.cuh +++ b/vortex-cuda/kernels/src/config.cuh @@ -3,6 +3,8 @@ #pragma once +#include + // Kernel launch configuration constants. // Must match the Rust launch config in src/kernel/mod.rs. // @@ -10,3 +12,8 @@ // elements_per_block = 64 * 32 = 2048 // grid_dim = ceil(array_len / 2048) constexpr uint32_t ELEMENTS_PER_THREAD = 32; + +#define MIN(a, b) (((a) < (b)) ? (a) : (b)) + +#define START_ELEM(idx, len) MIN((idx) * ELEMENTS_PER_THREAD, (len)) +#define STOP_ELEM(idx, len) MIN(START_ELEM(idx, len) + ELEMENTS_PER_THREAD, (len)) diff --git a/vortex-cuda/kernels/src/patches.cu b/vortex-cuda/kernels/src/patches.cu index 2fecc926569..fc5b611a828 100644 --- a/vortex-cuda/kernels/src/patches.cu +++ b/vortex-cuda/kernels/src/patches.cu @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -#include +#include "config.cuh" // TODO(aduffy): this is very naive. In the future we need to // transpose the patches, see G-ALP paper. 
@@ -13,17 +13,21 @@ __device__ void patches(
     const ValueT *const patchValues,
     uint64_t patchesLen
 ) {
-    const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    // Widen before multiply: blockIdx.x * blockDim.x overflows uint32 for very large grids.
+    const uint64_t worker = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    const uint64_t startElem = START_ELEM(worker, patchesLen);
+    const uint64_t stopElem = STOP_ELEM(worker, patchesLen);
 
-    if (idx >= patchesLen) {
+    if (startElem >= patchesLen) {
         return;
     }
 
-    const IndexT patchIdx = patchIndices[idx];
-    const ValueT patchVal = patchValues[idx];
+    for (uint64_t idx = startElem; idx < stopElem; idx++) {
+        const IndexT patchIdx = patchIndices[idx];
+        const ValueT patchVal = patchValues[idx];
 
-    const size_t valueIdx = static_cast<size_t>(patchIdx);
-    values[valueIdx] = patchVal;
+        const size_t valueIdx = static_cast<size_t>(patchIdx);
+        values[valueIdx] = patchVal;
+    }
 }
 
 #define GENERATE_PATCHES_KERNEL(ValueT, value_suffix, IndexT, index_suffix) \