From f910ac742eafc5ab334f7881eee10bfffa60789a Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 30 Jan 2026 23:02:07 +0000 Subject: [PATCH 1/3] feat[cuda]: sequence Signed-off-by: Andrew Duffy --- Cargo.lock | 4 + encodings/sequence/src/array.rs | 33 ++++- encodings/sequence/src/kernel.rs | 20 +-- encodings/sequence/src/lib.rs | 1 + vortex-cuda/Cargo.toml | 3 +- vortex-cuda/kernels/src/sequence.cu | 40 +++++ vortex-cuda/src/kernel/encodings/mod.rs | 2 + vortex-cuda/src/kernel/encodings/sequence.rs | 147 +++++++++++++++++++ vortex-cuda/src/lib.rs | 3 + 9 files changed, 235 insertions(+), 18 deletions(-) create mode 100644 vortex-cuda/kernels/src/sequence.cu create mode 100644 vortex-cuda/src/kernel/encodings/sequence.rs diff --git a/Cargo.lock b/Cargo.lock index de836acc39c..dc7ce54711f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2270,6 +2270,7 @@ version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3aa12038120eb13347a6ae2ffab1d34efe78150125108627fd85044dd4d6ff1e" dependencies = [ + "half", "libloading 0.8.9", ] @@ -4441,6 +4442,8 @@ dependencies = [ "cfg-if", "crunchy", "num-traits", + "rand 0.9.2", + "rand_distr 0.5.1", "zerocopy", ] @@ -10409,6 +10412,7 @@ dependencies = [ "vortex-mask", "vortex-nvcomp", "vortex-scalar", + "vortex-sequence", "vortex-session", "vortex-utils", "vortex-zigzag", diff --git a/encodings/sequence/src/array.rs b/encodings/sequence/src/array.rs index f99e2d812ab..36ad094d3be 100644 --- a/encodings/sequence/src/array.rs +++ b/encodings/sequence/src/array.rs @@ -59,13 +59,22 @@ pub struct SequenceMetadata { multiplier: Option, } +/// Components of [`SequenceArray`]. +pub struct SequenceArrayParts { + pub base: PValue, + pub multiplier: PValue, + pub len: usize, + pub ptype: PType, + pub nullability: Nullability, +} + #[derive(Clone, Debug)] /// An array representing the equation `A[i] = base + i * multiplier`. pub struct SequenceArray { base: PValue, multiplier: PValue, dtype: DType, - pub(crate) length: usize, + pub(crate) len: usize, stats_set: ArrayStats, } @@ -124,7 +133,7 @@ impl SequenceArray { base, multiplier, dtype, - length, + len: length, // TODO(joe): add stats, on construct or on use? stats_set: Default::default(), } @@ -164,7 +173,7 @@ impl SequenceArray { } pub(crate) fn index_value(&self, idx: usize) -> PValue { - assert!(idx < self.length, "index_value({idx}): index out of bounds"); + assert!(idx < self.len, "index_value({idx}): index out of bounds"); match_each_native_ptype!(self.ptype(), |P| { let base = self.base.cast::

(); @@ -177,9 +186,19 @@ impl SequenceArray { /// Returns the validated final value of a sequence array pub fn last(&self) -> PValue { - Self::try_last(self.base, self.multiplier, self.ptype(), self.length) + Self::try_last(self.base, self.multiplier, self.ptype(), self.len) .vortex_expect("validated array") } + + pub fn into_parts(self) -> SequenceArrayParts { + SequenceArrayParts { + base: self.base, + multiplier: self.multiplier, + len: self.len, + ptype: self.dtype.as_ptype(), + nullability: self.dtype.nullability(), + } + } } impl VTable for SequenceVTable { @@ -355,7 +374,7 @@ fn execute_iter>( impl BaseArrayVTable for SequenceVTable { fn len(array: &SequenceArray) -> usize { - array.length + array.len } fn dtype(array: &SequenceArray) -> &DType { @@ -374,14 +393,14 @@ impl BaseArrayVTable for SequenceVTable { array.base.hash(state); array.multiplier.hash(state); array.dtype.hash(state); - array.length.hash(state); + array.len.hash(state); } fn array_eq(array: &SequenceArray, other: &SequenceArray, _precision: Precision) -> bool { array.base == other.base && array.multiplier == other.multiplier && array.dtype == other.dtype - && array.length == other.length + && array.len == other.len } } diff --git a/encodings/sequence/src/kernel.rs b/encodings/sequence/src/kernel.rs index b13d5d491e7..586cfe629f7 100644 --- a/encodings/sequence/src/kernel.rs +++ b/encodings/sequence/src/kernel.rs @@ -89,7 +89,7 @@ impl ExecuteParentKernel for SequenceCompareKernel { // Constant is null - result is all null for comparisons let nullability = array.dtype().nullability() | constant.dtype().nullability(); let result_array = - ConstantArray::new(Scalar::null(DType::Bool(nullability)), array.length).to_array(); + ConstantArray::new(Scalar::null(DType::Bool(nullability)), array.len).to_array(); return Ok(Some(result_array.execute(ctx)?)); }; @@ -125,22 +125,22 @@ fn compare_eq_neq( // Check if there exists an integer solution to const = base + idx * multiplier let Some(set_idx) = - find_intersection_scalar(array.base(), array.multiplier(), array.length, constant) + find_intersection_scalar(array.base(), array.multiplier(), array.len, constant) else { let result_array = ConstantArray::new( Scalar::new(DType::Bool(nullability), not_match_val.into()), - array.length, + array.len, ) .to_array(); return Ok(Some(result_array.execute(ctx)?)); }; let idx = set_idx as u64; - let len = array.length as u64; + let len = array.len as u64; if len == 1 && set_idx == 0 { let result_array = ConstantArray::new( Scalar::new(DType::Bool(nullability), match_val.into()), - array.length, + array.len, ) .to_array(); return Ok(Some(result_array.execute(ctx)?)); @@ -179,7 +179,7 @@ fn compare_ordering( let transition = find_transition_point( array.base(), array.multiplier(), - array.length, + array.len, constant, operator, ); @@ -187,23 +187,23 @@ fn compare_ordering( let result_array = match transition { Transition::AllTrue => ConstantArray::new( Scalar::new(DType::Bool(nullability), true.into()), - array.length, + array.len, ) .to_array(), Transition::AllFalse => ConstantArray::new( Scalar::new(DType::Bool(nullability), false.into()), - array.length, + array.len, ) .to_array(), Transition::FalseToTrue(idx) => { // [0..idx) is false, [idx..len) is true - let ends = buffer![idx as u64, array.length as u64].into_array(); + let ends = buffer![idx as u64, array.len as u64].into_array(); let values = BoolArray::new(bitbuffer![false, true], nullability.into()).into_array(); RunEndArray::try_new(ends, values)?.into_array() } Transition::TrueToFalse(idx) => { // [0..idx) is true, [idx..len) is false - let ends = buffer![idx as u64, array.length as u64].into_array(); + let ends = buffer![idx as u64, array.len as u64].into_array(); let values = BoolArray::new(bitbuffer![true, false], nullability.into()).into_array(); RunEndArray::try_new(ends, values)?.into_array() } diff --git a/encodings/sequence/src/lib.rs b/encodings/sequence/src/lib.rs index 1b099edcc9b..7dabc831b99 100644 --- a/encodings/sequence/src/lib.rs +++ b/encodings/sequence/src/lib.rs @@ -9,6 +9,7 @@ mod kernel; /// Represents the equation A\[i\] = a * i + b. /// This can be used for compression, fast comparisons and also for row ids. pub use array::SequenceArray; +pub use array::SequenceArrayParts; /// Represents the equation A\[i\] = a * i + b. /// This can be used for compression, fast comparisons and also for row ids. pub use array::SequenceVTable; diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml index b2f5a0bc1cf..99a481e1720 100644 --- a/vortex-cuda/Cargo.toml +++ b/vortex-cuda/Cargo.toml @@ -23,7 +23,7 @@ _test-harness = [] [dependencies] arc-swap = { workspace = true } async-trait = { workspace = true } -cudarc = { workspace = true } +cudarc = { workspace = true, features = ["f16"] } fastlanes = { workspace = true } futures = { workspace = true, features = ["executor"] } kanal = { workspace = true } @@ -41,6 +41,7 @@ vortex-fastlanes = { workspace = true } vortex-io = { workspace = true } vortex-mask = { workspace = true } vortex-nvcomp = { path = "nvcomp" } +vortex-sequence = { workspace = true } vortex-session = { workspace = true } vortex-utils = { workspace = true } vortex-zigzag = { workspace = true } diff --git a/vortex-cuda/kernels/src/sequence.cu b/vortex-cuda/kernels/src/sequence.cu new file mode 100644 index 00000000000..8aa73855acc --- /dev/null +++ b/vortex-cuda/kernels/src/sequence.cu @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#include + +template +__device__ void sequence( + ValueT *const output, + ValueT base, + ValueT multiplier, + uint64_t len +) { + const uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= len) { + return; + } + + output[idx] = static_cast(idx) * multiplier + base; +} + +#define GENERATE_KERNEL(ValueT, suffix) \ +extern "C" __global__ void sequence_##suffix( \ + ValueT *const output, \ + ValueT base, \ + ValueT multiplier, \ + uint64_t len \ +) { \ + sequence(output, base, multiplier, len); \ +} + +GENERATE_KERNEL(uint8_t, u8); +GENERATE_KERNEL(uint16_t, u16); +GENERATE_KERNEL(uint32_t, u32); +GENERATE_KERNEL(uint64_t, u64); +GENERATE_KERNEL(int8_t, i8); +GENERATE_KERNEL(int16_t, i16); +GENERATE_KERNEL(int32_t, i32); +GENERATE_KERNEL(int64_t, i64); +GENERATE_KERNEL(float, f32); +GENERATE_KERNEL(double, f64); diff --git a/vortex-cuda/src/kernel/encodings/mod.rs b/vortex-cuda/src/kernel/encodings/mod.rs index addb877543c..ed38a71b023 100644 --- a/vortex-cuda/src/kernel/encodings/mod.rs +++ b/vortex-cuda/src/kernel/encodings/mod.rs @@ -5,6 +5,7 @@ mod alp; mod bitpacked; mod decimal_byte_parts; mod for_; +mod sequence; mod zigzag; mod zstd; @@ -12,6 +13,7 @@ pub use alp::ALPExecutor; pub use bitpacked::BitPackedExecutor; pub use decimal_byte_parts::DecimalBytePartsExecutor; pub use for_::FoRExecutor; +pub use sequence::SequenceExecutor; pub use zigzag::ZigZagExecutor; pub use zstd::ZstdExecutor; pub use zstd::ZstdKernelPrep; diff --git a/vortex-cuda/src/kernel/encodings/sequence.rs b/vortex-cuda/src/kernel/encodings/sequence.rs new file mode 100644 index 00000000000..d098a02d971 --- /dev/null +++ b/vortex-cuda/src/kernel/encodings/sequence.rs @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use async_trait::async_trait; +use cudarc::driver::DeviceRepr; +use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::buffer::BufferHandle; +use vortex_cuda_macros::cuda_tests; +use vortex_dtype::NativePType; +use vortex_dtype::Nullability; +use vortex_dtype::match_each_native_ptype; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_sequence::SequenceArrayParts; +use vortex_sequence::SequenceVTable; + +use crate::CudaDeviceBuffer; +use crate::CudaExecutionCtx; +use crate::executor::CudaExecute; +use crate::launch_cuda_kernel; + +/// CUDA execution for `SequenceArray`. +#[derive(Debug)] +pub struct SequenceExecutor; + +#[async_trait] +impl CudaExecute for SequenceExecutor { + async fn execute( + &self, + array: ArrayRef, + ctx: &mut CudaExecutionCtx, + ) -> VortexResult { + let array = array + .try_into::() + .map_err(|_| vortex_err!("SequenceExecutor can only accept SequenceArray"))?; + + let SequenceArrayParts { + base, + multiplier, + len, + ptype, + nullability, + } = array.into_parts(); + + match_each_native_ptype!(ptype, |P| { + let base = base.cast::

(); + let multiplier = multiplier.cast::

(); + + execute_typed::

(base, multiplier, len, nullability, ctx).await + }) + } +} + +async fn execute_typed( + base: T, + multiplier: T, + len: usize, + nullability: Nullability, + ctx: &mut CudaExecutionCtx, +) -> VortexResult { + let buffer = ctx.device_alloc::(len)?; + + let len_u64 = len as u64; + + let _events = launch_cuda_kernel!( + execution_ctx: ctx, + module: "sequence", + ptypes: &[T::PTYPE.to_string().as_str()], + launch_args: [buffer, base, multiplier, len_u64], + event_recording: CU_EVENT_DISABLE_TIMING, + array_len: len + ); + + let output_buf = BufferHandle::new_device(Arc::new(CudaDeviceBuffer::new(buffer))); + + Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( + output_buf, + T::PTYPE, + nullability.into(), + ))) +} + +#[cuda_tests] +mod tests { + use futures::executor::block_on; + use rstest::rstest; + use vortex_array::IntoArray; + use vortex_array::assert_arrays_eq; + use vortex_dtype::NativePType; + use vortex_dtype::Nullability; + use vortex_scalar::PValue; + use vortex_sequence::SequenceArray; + use vortex_session::VortexSession; + + use crate::CanonicalCudaExt; + use crate::CudaSession; + use crate::executor::CudaExecute; + use crate::kernel::encodings::sequence::SequenceExecutor; + + #[rstest] + #[case::u8(10u8, 2u8, 10)] + #[case::u16(10u16, 2u16, 100)] + #[case::u32(10u32, 2u32, 1000)] + #[case::u64(100u64, 20u64, 500)] + fn test_sequence>( + #[case] base: T, + #[case] multiplier: T, + #[case] len: usize, + ) { + block_on( + async move { test_ptype::(base, multiplier, len, Nullability::NonNullable).await }, + ); + + block_on( + async move { test_ptype::(base, multiplier, len, Nullability::Nullable).await }, + ); + } + + async fn test_ptype>( + base: P, + multiplier: P, + len: usize, + nullability: Nullability, + ) { + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()).unwrap(); + + let array = SequenceArray::typed_new(base, multiplier, nullability, len).unwrap(); + + let cpu_result = array.to_canonical().unwrap().into_array(); + + let gpu_result = SequenceExecutor + .execute(array.into_array(), &mut cuda_ctx) + .await + .unwrap() + .into_host() + .await + .unwrap() + .into_array(); + + assert_arrays_eq!(cpu_result, gpu_result); + } +} diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index c914fe1b749..b464281fde8 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -42,9 +42,11 @@ use vortex_decimal_byte_parts::DecimalBytePartsVTable; use vortex_fastlanes::BitPackedVTable; use vortex_fastlanes::FoRVTable; pub use vortex_nvcomp as nvcomp; +use vortex_sequence::SequenceVTable; use vortex_zigzag::ZigZagVTable; use vortex_zstd::ZstdVTable; +use crate::kernel::SequenceExecutor; use crate::kernel::SliceExecutor; /// Checks if CUDA is available on the system by looking for nvcc. @@ -63,6 +65,7 @@ pub fn initialize_cuda(session: &CudaSession) { session.register_kernel(DecimalBytePartsVTable::ID, &DecimalBytePartsExecutor); session.register_kernel(DictVTable::ID, &DictExecutor); session.register_kernel(FoRVTable::ID, &FoRExecutor); + session.register_kernel(SequenceVTable::ID, &SequenceExecutor); session.register_kernel(ZigZagVTable::ID, &ZigZagExecutor); session.register_kernel(ZstdVTable::ID, &ZstdExecutor); From 8db99a3a9e7f708196f06a22d24b41e31e295207 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Sun, 1 Feb 2026 13:00:16 -0500 Subject: [PATCH 2/3] properly handle ELEMENTS_PER_THREAD Signed-off-by: Andrew Duffy --- vortex-cuda/kernels/src/sequence.cu | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vortex-cuda/kernels/src/sequence.cu b/vortex-cuda/kernels/src/sequence.cu index 8aa73855acc..1b58d62fa93 100644 --- a/vortex-cuda/kernels/src/sequence.cu +++ b/vortex-cuda/kernels/src/sequence.cu @@ -1,8 +1,11 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +#include "config.cuh" #include +#define MIN(a, b) (((a) < (b)) : (a) : (b)) + template __device__ void sequence( ValueT *const output, @@ -10,12 +13,14 @@ __device__ void sequence( ValueT multiplier, uint64_t len ) { - const uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= len) { - return; - } + const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x; - output[idx] = static_cast(idx) * multiplier + base; + const uint64_t elemStart = MIN(worker * ELEMENTS_PER_THREAD, len); + const uint64_t elemEnd = MIN(elemStart + ELEMENTS_PER_THREAD, len); + + for (uint64_t idx = elemStart; idx < elemEnd; idx++) { + output[idx] = static_cast(idx) * multiplier + base; + } } #define GENERATE_KERNEL(ValueT, suffix) \ From cab43e199c843300f8341ee2a3648b05c5f6770e Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 2 Feb 2026 10:43:52 -0500 Subject: [PATCH 3/3] fix Signed-off-by: Andrew Duffy --- vortex-cuda/kernels/src/patches.cu | 2 +- vortex-cuda/kernels/src/sequence.cu | 9 +++------ vortex-cuda/src/kernel/encodings/sequence.rs | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/vortex-cuda/kernels/src/patches.cu b/vortex-cuda/kernels/src/patches.cu index fc5b611a828..108c0e1c914 100644 --- a/vortex-cuda/kernels/src/patches.cu +++ b/vortex-cuda/kernels/src/patches.cu @@ -15,7 +15,7 @@ __device__ void patches( ) { const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x; const uint64_t startElem = START_ELEM(worker, patchesLen); - const uint64_t stopElem = START_ELEM(worker, patchesLen); + const uint64_t stopElem = STOP_ELEM(worker, patchesLen); if (startElem >= patchesLen) { return; diff --git a/vortex-cuda/kernels/src/sequence.cu b/vortex-cuda/kernels/src/sequence.cu index 1b58d62fa93..eb9df20b46e 100644 --- a/vortex-cuda/kernels/src/sequence.cu +++ b/vortex-cuda/kernels/src/sequence.cu @@ -2,9 +2,6 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors #include "config.cuh" -#include - -#define MIN(a, b) (((a) < (b)) : (a) : (b)) template __device__ void sequence( @@ -15,10 +12,10 @@ __device__ void sequence( ) { const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x; - const uint64_t elemStart = MIN(worker * ELEMENTS_PER_THREAD, len); - const uint64_t elemEnd = MIN(elemStart + ELEMENTS_PER_THREAD, len); + const uint64_t startElem = START_ELEM(worker, len); + const uint64_t stopElem = STOP_ELEM(worker, len); - for (uint64_t idx = elemStart; idx < elemEnd; idx++) { + for (uint64_t idx = startElem; idx < stopElem; idx++) { output[idx] = static_cast(idx) * multiplier + base; } } diff --git a/vortex-cuda/src/kernel/encodings/sequence.rs b/vortex-cuda/src/kernel/encodings/sequence.rs index d098a02d971..daf08bb9654 100644 --- a/vortex-cuda/src/kernel/encodings/sequence.rs +++ b/vortex-cuda/src/kernel/encodings/sequence.rs @@ -70,7 +70,7 @@ async fn execute_typed( let _events = launch_cuda_kernel!( execution_ctx: ctx, module: "sequence", - ptypes: &[T::PTYPE.to_string().as_str()], + ptypes: &[T::PTYPE], launch_args: [buffer, base, multiplier, len_u64], event_recording: CU_EVENT_DISABLE_TIMING, array_len: len