From f910ac742eafc5ab334f7881eee10bfffa60789a Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andrew@a10y.dev>
Date: Fri, 30 Jan 2026 23:02:07 +0000
Subject: [PATCH 1/3] feat[cuda]: sequence

Signed-off-by: Andrew Duffy <andrew@a10y.dev>
---
 Cargo.lock                                   |   4 +
 encodings/sequence/src/array.rs              |  33 ++++-
 encodings/sequence/src/kernel.rs             |  20 +--
 encodings/sequence/src/lib.rs                |   1 +
 vortex-cuda/Cargo.toml                       |   3 +-
 vortex-cuda/kernels/src/sequence.cu          |  40 +++++
 vortex-cuda/src/kernel/encodings/mod.rs      |   2 +
 vortex-cuda/src/kernel/encodings/sequence.rs | 147 +++++++++++++++++++
 vortex-cuda/src/lib.rs                       |   3 +
 9 files changed, 235 insertions(+), 18 deletions(-)
 create mode 100644 vortex-cuda/kernels/src/sequence.cu
 create mode 100644 vortex-cuda/src/kernel/encodings/sequence.rs
diff --git a/Cargo.lock b/Cargo.lock
index de836acc39c..dc7ce54711f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2270,6 +2270,7 @@ version = "0.18.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3aa12038120eb13347a6ae2ffab1d34efe78150125108627fd85044dd4d6ff1e"
 dependencies = [
+ "half",
  "libloading 0.8.9",
 ]
 
@@ -4441,6 +4442,8 @@ dependencies = [
  "cfg-if",
  "crunchy",
  "num-traits",
+ "rand 0.9.2",
+ "rand_distr 0.5.1",
  "zerocopy",
 ]
 
@@ -10409,6 +10412,7 @@ dependencies = [
  "vortex-mask",
  "vortex-nvcomp",
  "vortex-scalar",
+ "vortex-sequence",
  "vortex-session",
  "vortex-utils",
  "vortex-zigzag",
diff --git a/encodings/sequence/src/array.rs b/encodings/sequence/src/array.rs
index f99e2d812ab..36ad094d3be 100644
--- a/encodings/sequence/src/array.rs
+++ b/encodings/sequence/src/array.rs
@@ -59,13 +59,22 @@ pub struct SequenceMetadata {
     multiplier: Option<vortex_proto::scalar::ScalarValue>,
 }
 
+/// Components of [`SequenceArray`].
+pub struct SequenceArrayParts {
+    pub base: PValue,
+    pub multiplier: PValue,
+    pub len: usize,
+    pub ptype: PType,
+    pub nullability: Nullability,
+}
+
 #[derive(Clone, Debug)]
 /// An array representing the equation `A[i] = base + i * multiplier`.
 pub struct SequenceArray {
     base: PValue,
     multiplier: PValue,
     dtype: DType,
-    pub(crate) length: usize,
+    pub(crate) len: usize,
     stats_set: ArrayStats,
 }
 
@@ -124,7 +133,7 @@ impl SequenceArray {
             base,
             multiplier,
             dtype,
-            length,
+            len: length,
             // TODO(joe): add stats, on construct or on use?
             stats_set: Default::default(),
         }
@@ -164,7 +173,7 @@ impl SequenceArray {
     }
 
     pub(crate) fn index_value(&self, idx: usize) -> PValue {
-        assert!(idx < self.length, "index_value({idx}): index out of bounds");
+        assert!(idx < self.len, "index_value({idx}): index out of bounds");
 
         match_each_native_ptype!(self.ptype(), |P| {
             let base = self.base.cast::<P>();
@@ -177,9 +186,19 @@ impl SequenceArray {
 
     /// Returns the validated final value of a sequence array
     pub fn last(&self) -> PValue {
-        Self::try_last(self.base, self.multiplier, self.ptype(), self.length)
+        Self::try_last(self.base, self.multiplier, self.ptype(), self.len)
             .vortex_expect("validated array")
     }
+
+    pub fn into_parts(self) -> SequenceArrayParts {
+        SequenceArrayParts {
+            base: self.base,
+            multiplier: self.multiplier,
+            len: self.len,
+            ptype: self.dtype.as_ptype(),
+            nullability: self.dtype.nullability(),
+        }
+    }
 }
 
 impl VTable for SequenceVTable {
@@ -355,7 +374,7 @@ fn execute_iter<P: NativePType, I: Iterator<Item = usize>>(
 
 impl BaseArrayVTable<SequenceVTable> for SequenceVTable {
     fn len(array: &SequenceArray) -> usize {
-        array.length
+        array.len
     }
 
     fn dtype(array: &SequenceArray) -> &DType {
@@ -374,14 +393,14 @@ impl BaseArrayVTable<SequenceVTable> for SequenceVTable {
         array.base.hash(state);
         array.multiplier.hash(state);
         array.dtype.hash(state);
-        array.length.hash(state);
+        array.len.hash(state);
     }
 
     fn array_eq(array: &SequenceArray, other: &SequenceArray, _precision: Precision) -> bool {
         array.base == other.base
             && array.multiplier == other.multiplier
             && array.dtype == other.dtype
-            && array.length == other.length
+            && array.len == other.len
     }
 }
 
diff --git a/encodings/sequence/src/kernel.rs b/encodings/sequence/src/kernel.rs
index b13d5d491e7..586cfe629f7 100644
--- a/encodings/sequence/src/kernel.rs
+++ b/encodings/sequence/src/kernel.rs
@@ -89,7 +89,7 @@ impl ExecuteParentKernel<SequenceVTable> for SequenceCompareKernel {
             // Constant is null - result is all null for comparisons
             let nullability = array.dtype().nullability() | constant.dtype().nullability();
             let result_array =
-                ConstantArray::new(Scalar::null(DType::Bool(nullability)), array.length).to_array();
+                ConstantArray::new(Scalar::null(DType::Bool(nullability)), array.len).to_array();
             return Ok(Some(result_array.execute(ctx)?));
         };
 
@@ -125,22 +125,22 @@ fn compare_eq_neq(
 
     // Check if there exists an integer solution to const = base + idx * multiplier
     let Some(set_idx) =
-        find_intersection_scalar(array.base(), array.multiplier(), array.length, constant)
+        find_intersection_scalar(array.base(), array.multiplier(), array.len, constant)
     else {
         let result_array = ConstantArray::new(
             Scalar::new(DType::Bool(nullability), not_match_val.into()),
-            array.length,
+            array.len,
         )
         .to_array();
         return Ok(Some(result_array.execute(ctx)?));
     };
     let idx = set_idx as u64;
-    let len = array.length as u64;
+    let len = array.len as u64;
 
     if len == 1 && set_idx == 0 {
         let result_array = ConstantArray::new(
             Scalar::new(DType::Bool(nullability), match_val.into()),
-            array.length,
+            array.len,
         )
         .to_array();
         return Ok(Some(result_array.execute(ctx)?));
@@ -179,7 +179,7 @@ fn compare_ordering(
     let transition = find_transition_point(
         array.base(),
         array.multiplier(),
-        array.length,
+        array.len,
         constant,
         operator,
     );
@@ -187,23 +187,23 @@ fn compare_ordering(
     let result_array = match transition {
         Transition::AllTrue => ConstantArray::new(
             Scalar::new(DType::Bool(nullability), true.into()),
-            array.length,
+            array.len,
         )
         .to_array(),
         Transition::AllFalse => ConstantArray::new(
             Scalar::new(DType::Bool(nullability), false.into()),
-            array.length,
+            array.len,
         )
         .to_array(),
         Transition::FalseToTrue(idx) => {
             // [0..idx) is false, [idx..len) is true
-            let ends = buffer![idx as u64, array.length as u64].into_array();
+            let ends = buffer![idx as u64, array.len as u64].into_array();
             let values = BoolArray::new(bitbuffer![false, true], nullability.into()).into_array();
             RunEndArray::try_new(ends, values)?.into_array()
         }
         Transition::TrueToFalse(idx) => {
             // [0..idx) is true, [idx..len) is false
-            let ends = buffer![idx as u64, array.length as u64].into_array();
+            let ends = buffer![idx as u64, array.len as u64].into_array();
             let values = BoolArray::new(bitbuffer![true, false], nullability.into()).into_array();
             RunEndArray::try_new(ends, values)?.into_array()
         }
diff --git a/encodings/sequence/src/lib.rs b/encodings/sequence/src/lib.rs
index 1b099edcc9b..7dabc831b99 100644
--- a/encodings/sequence/src/lib.rs
+++ b/encodings/sequence/src/lib.rs
@@ -9,6 +9,7 @@ mod kernel;
 /// Represents the equation A\[i\] = a * i + b.
 /// This can be used for compression, fast comparisons and also for row ids.
 pub use array::SequenceArray;
+pub use array::SequenceArrayParts;
 /// Represents the equation A\[i\] = a * i + b.
 /// This can be used for compression, fast comparisons and also for row ids.
 pub use array::SequenceVTable;
diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml
index b2f5a0bc1cf..99a481e1720 100644
--- a/vortex-cuda/Cargo.toml
+++ b/vortex-cuda/Cargo.toml
@@ -23,7 +23,7 @@ _test-harness = []
 [dependencies]
 arc-swap = { workspace = true }
 async-trait = { workspace = true }
-cudarc = { workspace = true }
+cudarc = { workspace = true, features = ["f16"] }
 fastlanes = { workspace = true }
 futures = { workspace = true, features = ["executor"] }
 kanal = { workspace = true }
@@ -41,6 +41,7 @@ vortex-fastlanes = { workspace = true }
 vortex-io = { workspace = true }
 vortex-mask = { workspace = true }
 vortex-nvcomp = { path = "nvcomp" }
+vortex-sequence = { workspace = true }
 vortex-session = { workspace = true }
 vortex-utils = { workspace = true }
 vortex-zigzag = { workspace = true }
diff --git a/vortex-cuda/kernels/src/sequence.cu b/vortex-cuda/kernels/src/sequence.cu
new file mode 100644
index 00000000000..8aa73855acc
--- /dev/null
+++ b/vortex-cuda/kernels/src/sequence.cu
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#include <stdint.h>
+
+template<typename ValueT>
+__device__ void sequence(
+    ValueT *const output,
+    ValueT base,
+    ValueT multiplier,
+    uint64_t len
+) {
+    const uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= len) {
+        return;
+    }
+
+    output[idx] = static_cast<ValueT>(idx) * multiplier + base;
+}
+
+#define GENERATE_KERNEL(ValueT, suffix) \
+extern "C" __global__ void sequence_##suffix( \
+    ValueT *const output, \
+    ValueT base, \
+    ValueT multiplier, \
+    uint64_t len \
+) { \
+    sequence(output, base, multiplier, len); \
+}
+
+GENERATE_KERNEL(uint8_t, u8);
+GENERATE_KERNEL(uint16_t, u16);
+GENERATE_KERNEL(uint32_t, u32);
+GENERATE_KERNEL(uint64_t, u64);
+GENERATE_KERNEL(int8_t, i8);
+GENERATE_KERNEL(int16_t, i16);
+GENERATE_KERNEL(int32_t, i32);
+GENERATE_KERNEL(int64_t, i64);
+GENERATE_KERNEL(float, f32);
+GENERATE_KERNEL(double, f64);
diff --git a/vortex-cuda/src/kernel/encodings/mod.rs b/vortex-cuda/src/kernel/encodings/mod.rs
index addb877543c..ed38a71b023 100644
--- a/vortex-cuda/src/kernel/encodings/mod.rs
+++ b/vortex-cuda/src/kernel/encodings/mod.rs
@@ -5,6 +5,7 @@ mod alp;
 mod bitpacked;
 mod decimal_byte_parts;
 mod for_;
+mod sequence;
 mod zigzag;
 mod zstd;
 
@@ -12,6 +13,7 @@ pub use alp::ALPExecutor;
 pub use bitpacked::BitPackedExecutor;
 pub use decimal_byte_parts::DecimalBytePartsExecutor;
 pub use for_::FoRExecutor;
+pub use sequence::SequenceExecutor;
 pub use zigzag::ZigZagExecutor;
 pub use zstd::ZstdExecutor;
 pub use zstd::ZstdKernelPrep;
diff --git a/vortex-cuda/src/kernel/encodings/sequence.rs b/vortex-cuda/src/kernel/encodings/sequence.rs
new file mode 100644
index 00000000000..d098a02d971
--- /dev/null
+++ b/vortex-cuda/src/kernel/encodings/sequence.rs
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use cudarc::driver::DeviceRepr;
+use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING;
+use vortex_array::ArrayRef;
+use vortex_array::Canonical;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::buffer::BufferHandle;
+use vortex_cuda_macros::cuda_tests;
+use vortex_dtype::NativePType;
+use vortex_dtype::Nullability;
+use vortex_dtype::match_each_native_ptype;
+use vortex_error::VortexResult;
+use vortex_error::vortex_err;
+use vortex_sequence::SequenceArrayParts;
+use vortex_sequence::SequenceVTable;
+
+use crate::CudaDeviceBuffer;
+use crate::CudaExecutionCtx;
+use crate::executor::CudaExecute;
+use crate::launch_cuda_kernel;
+
+/// CUDA execution for `SequenceArray`.
+#[derive(Debug)]
+pub struct SequenceExecutor;
+
+#[async_trait]
+impl CudaExecute for SequenceExecutor {
+    async fn execute(
+        &self,
+        array: ArrayRef,
+        ctx: &mut CudaExecutionCtx,
+    ) -> VortexResult<Canonical> {
+        let array = array
+            .try_into::<SequenceVTable>()
+            .map_err(|_| vortex_err!("SequenceExecutor can only accept SequenceArray"))?;
+
+        let SequenceArrayParts {
+            base,
+            multiplier,
+            len,
+            ptype,
+            nullability,
+        } = array.into_parts();
+
+        match_each_native_ptype!(ptype, |P| {
+            let base = base.cast::<P>();
+            let multiplier = multiplier.cast::<P>();
+
+            execute_typed::<P>(base, multiplier, len, nullability, ctx).await
+        })
+    }
+}
+
+async fn execute_typed<T: NativePType + DeviceRepr>(
+    base: T,
+    multiplier: T,
+    len: usize,
+    nullability: Nullability,
+    ctx: &mut CudaExecutionCtx,
+) -> VortexResult<Canonical> {
+    let buffer = ctx.device_alloc::<T>(len)?;
+
+    let len_u64 = len as u64;
+
+    let _events = launch_cuda_kernel!(
+        execution_ctx: ctx,
+        module: "sequence",
+        ptypes: &[T::PTYPE.to_string().as_str()],
+        launch_args: [buffer, base, multiplier, len_u64],
+        event_recording: CU_EVENT_DISABLE_TIMING,
+        array_len: len
+    );
+
+    let output_buf = BufferHandle::new_device(Arc::new(CudaDeviceBuffer::new(buffer)));
+
+    Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
+        output_buf,
+        T::PTYPE,
+        nullability.into(),
+    )))
+}
+
+#[cuda_tests]
+mod tests {
+    use futures::executor::block_on;
+    use rstest::rstest;
+    use vortex_array::IntoArray;
+    use vortex_array::assert_arrays_eq;
+    use vortex_dtype::NativePType;
+    use vortex_dtype::Nullability;
+    use vortex_scalar::PValue;
+    use vortex_sequence::SequenceArray;
+    use vortex_session::VortexSession;
+
+    use crate::CanonicalCudaExt;
+    use crate::CudaSession;
+    use crate::executor::CudaExecute;
+    use crate::kernel::encodings::sequence::SequenceExecutor;
+
+    #[rstest]
+    #[case::u8(10u8, 2u8, 10)]
+    #[case::u16(10u16, 2u16, 100)]
+    #[case::u32(10u32, 2u32, 1000)]
+    #[case::u64(100u64, 20u64, 500)]
+    fn test_sequence<T: NativePType + Into<PValue>>(
+        #[case] base: T,
+        #[case] multiplier: T,
+        #[case] len: usize,
+    ) {
+        block_on(
+            async move { test_ptype::<T>(base, multiplier, len, Nullability::NonNullable).await },
+        );
+
+        block_on(
+            async move { test_ptype::<T>(base, multiplier, len, Nullability::Nullable).await },
+        );
+    }
+
+    async fn test_ptype<P: NativePType + Into<PValue>>(
+        base: P,
+        multiplier: P,
+        len: usize,
+        nullability: Nullability,
+    ) {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()).unwrap();
+
+        let array = SequenceArray::typed_new(base, multiplier, nullability, len).unwrap();
+
+        let cpu_result = array.to_canonical().unwrap().into_array();
+
+        let gpu_result = SequenceExecutor
+            .execute(array.into_array(), &mut cuda_ctx)
+            .await
+            .unwrap()
+            .into_host()
+            .await
+            .unwrap()
+            .into_array();
+
+        assert_arrays_eq!(cpu_result, gpu_result);
+    }
+}
diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs
index c914fe1b749..b464281fde8 100644
--- a/vortex-cuda/src/lib.rs
+++ b/vortex-cuda/src/lib.rs
@@ -42,9 +42,11 @@ use vortex_decimal_byte_parts::DecimalBytePartsVTable;
 use vortex_fastlanes::BitPackedVTable;
 use vortex_fastlanes::FoRVTable;
 pub use vortex_nvcomp as nvcomp;
+use vortex_sequence::SequenceVTable;
 use vortex_zigzag::ZigZagVTable;
 use vortex_zstd::ZstdVTable;
 
+use crate::kernel::SequenceExecutor;
 use crate::kernel::SliceExecutor;
 
 /// Checks if CUDA is available on the system by looking for nvcc.
@@ -63,6 +65,7 @@ pub fn initialize_cuda(session: &CudaSession) {
     session.register_kernel(DecimalBytePartsVTable::ID, &DecimalBytePartsExecutor);
     session.register_kernel(DictVTable::ID, &DictExecutor);
     session.register_kernel(FoRVTable::ID, &FoRExecutor);
+    session.register_kernel(SequenceVTable::ID, &SequenceExecutor);
     session.register_kernel(ZigZagVTable::ID, &ZigZagExecutor);
     session.register_kernel(ZstdVTable::ID, &ZstdExecutor);
 

From 8db99a3a9e7f708196f06a22d24b41e31e295207 Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andrew@a10y.dev>
Date: Sun, 1 Feb 2026 13:00:16 -0500
Subject: [PATCH 2/3] properly handle ELEMENTS_PER_THREAD

Signed-off-by: Andrew Duffy <andrew@a10y.dev>
---
 vortex-cuda/kernels/src/sequence.cu | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/vortex-cuda/kernels/src/sequence.cu b/vortex-cuda/kernels/src/sequence.cu
index 8aa73855acc..1b58d62fa93 100644
--- a/vortex-cuda/kernels/src/sequence.cu
+++ b/vortex-cuda/kernels/src/sequence.cu
@@ -1,8 +1,11 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+#include "config.cuh"
 #include <stdint.h>
 
+#define MIN(a, b) (((a) < (b)) : (a) : (b))
+
 template<typename ValueT>
 __device__ void sequence(
     ValueT *const output,
@@ -10,12 +13,14 @@ __device__ void sequence(
     ValueT multiplier,
     uint64_t len
 ) {
-    const uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= len) {
-        return;
-    }
+    const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x;
 
-    output[idx] = static_cast<ValueT>(idx) * multiplier + base;
+    const uint64_t elemStart = MIN(worker * ELEMENTS_PER_THREAD, len);
+    const uint64_t elemEnd = MIN(elemStart + ELEMENTS_PER_THREAD, len);
+
+    for (uint64_t idx = elemStart; idx < elemEnd; idx++) {
+        output[idx] = static_cast<ValueT>(idx) * multiplier + base;
+    }
 }
 
 #define GENERATE_KERNEL(ValueT, suffix) \

From cab43e199c843300f8341ee2a3648b05c5f6770e Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andrew@a10y.dev>
Date: Mon, 2 Feb 2026 10:43:52 -0500
Subject: [PATCH 3/3] fix

Signed-off-by: Andrew Duffy <andrew@a10y.dev>
---
 vortex-cuda/kernels/src/patches.cu           | 2 +-
 vortex-cuda/kernels/src/sequence.cu          | 9 +++------
 vortex-cuda/src/kernel/encodings/sequence.rs | 2 +-
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/vortex-cuda/kernels/src/patches.cu b/vortex-cuda/kernels/src/patches.cu
index fc5b611a828..108c0e1c914 100644
--- a/vortex-cuda/kernels/src/patches.cu
+++ b/vortex-cuda/kernels/src/patches.cu
@@ -15,7 +15,7 @@ __device__ void patches(
 ) {
     const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x;
     const uint64_t startElem = START_ELEM(worker, patchesLen);
-    const uint64_t stopElem = START_ELEM(worker, patchesLen);
+    const uint64_t stopElem = STOP_ELEM(worker, patchesLen);
 
     if (startElem >= patchesLen) {
         return;
diff --git a/vortex-cuda/kernels/src/sequence.cu b/vortex-cuda/kernels/src/sequence.cu
index 1b58d62fa93..eb9df20b46e 100644
--- a/vortex-cuda/kernels/src/sequence.cu
+++ b/vortex-cuda/kernels/src/sequence.cu
@@ -2,9 +2,6 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 #include "config.cuh"
-#include <stdint.h>
-
-#define MIN(a, b) (((a) < (b)) : (a) : (b))
 
 template<typename ValueT>
 __device__ void sequence(
@@ -15,10 +12,10 @@ __device__ void sequence(
 ) {
     const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x;
 
-    const uint64_t elemStart = MIN(worker * ELEMENTS_PER_THREAD, len);
-    const uint64_t elemEnd = MIN(elemStart + ELEMENTS_PER_THREAD, len);
+    const uint64_t startElem = START_ELEM(worker, len);
+    const uint64_t stopElem = STOP_ELEM(worker, len);
 
-    for (uint64_t idx = elemStart; idx < elemEnd; idx++) {
+    for (uint64_t idx = startElem; idx < stopElem; idx++) {
         output[idx] = static_cast<ValueT>(idx) * multiplier + base;
     }
 }
diff --git a/vortex-cuda/src/kernel/encodings/sequence.rs b/vortex-cuda/src/kernel/encodings/sequence.rs
index d098a02d971..daf08bb9654 100644
--- a/vortex-cuda/src/kernel/encodings/sequence.rs
+++ b/vortex-cuda/src/kernel/encodings/sequence.rs
@@ -70,7 +70,7 @@ async fn execute_typed<T: NativePType + DeviceRepr>(
     let _events = launch_cuda_kernel!(
         execution_ctx: ctx,
         module: "sequence",
-        ptypes: &[T::PTYPE.to_string().as_str()],
+        ptypes: &[T::PTYPE],
         launch_args: [buffer, base, multiplier, len_u64],
         event_recording: CU_EVENT_DISABLE_TIMING,
         array_len: len