diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
index 9a07cd281a2..1c48a31c7df 100644
--- a/.github/workflows/fuzz.yml
+++ b/.github/workflows/fuzz.yml
@@ -21,6 +21,8 @@ jobs:
     uses: ./.github/workflows/run-fuzzer.yml
     with:
       fuzz_target: file_io
+      family: "m8g.large"
+      image: "ubuntu24-full-arm64"
     secrets:
       R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
       R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
@@ -69,6 +71,8 @@ jobs:
     uses: ./.github/workflows/run-fuzzer.yml
     with:
       fuzz_target: array_ops
+      family: "m8g.large"
+      image: "ubuntu24-full-arm64"
     secrets:
       R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
       R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
@@ -103,6 +107,45 @@ jobs:
     uses: ./.github/workflows/run-fuzzer.yml
     with:
       fuzz_target: compress_roundtrip
+      family: "m8g.large"
+      image: "ubuntu24-full-arm64"
     secrets:
       R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
       R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
+
+  # ============================================================================
+  # GPU Compress Fuzzer (CUDA)
+  # ============================================================================
+  gpu_compress_fuzz:
+    name: "GPU Compress Fuzz"
+    uses: ./.github/workflows/run-fuzzer.yml  # shared reusable fuzzing workflow
+    with:
+      fuzz_target: compress_gpu
+      family: "g4dn"  # overrides the reusable workflow's default of m8g.large
+      image: "ubuntu24-gpu-x64"  # overrides the default ubuntu24-full-arm64 image
+      extra_features: "cuda"  # forwarded to `cargo fuzz run` as `--features cuda`
+    secrets:
+      R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
+      R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
+
+#  report-gpu-compress-fuzz-failures:
+#    name: "Report GPU Compress Fuzz Failures"
+#    needs: gpu_compress_fuzz
+#    if: always() && needs.gpu_compress_fuzz.outputs.crashes_found == 'true'
+#    permissions:
+#      issues: write
+#      contents: read
+#      id-token: write
+#      pull-requests: read
+#    uses: ./.github/workflows/report-fuzz-crash.yml
+#    with:
+#      fuzz_target: compress_gpu
+#      crash_file: ${{ needs.gpu_compress_fuzz.outputs.first_crash_name }}
+#      artifact_url: ${{ needs.gpu_compress_fuzz.outputs.artifact_url }}
+#      artifact_name: compress_gpu-crash-artifacts
+#      logs_artifact_name: compress_gpu-logs
+#      branch: ${{ github.ref_name }}
+#      commit: ${{ github.sha }}
+#    secrets:
+#      claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
+#      gh_token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/run-fuzzer.yml b/.github/workflows/run-fuzzer.yml
index 302555d7b54..c649b951256 100644
--- a/.github/workflows/run-fuzzer.yml
+++ b/.github/workflows/run-fuzzer.yml
@@ -12,6 +12,21 @@ on:
         required: false
         type: number
         default: 7200
+      family:
+        description: "Runner family (e.g., m8g.large for CPU, g5+g4dn+g6 for GPU)"
+        required: false
+        type: string
+        default: "m8g.large"
+      image:
+        description: "Runner image (e.g., ubuntu24-full-arm64, ubuntu24-gpu-x64)"
+        required: false
+        type: string
+        default: "ubuntu24-full-arm64"
+      extra_features:
+        description: "Extra cargo features to enable (e.g., cuda)"
+        required: false
+        type: string
+        default: ""
     outputs:
       crashes_found:
         description: "Whether crashes were found"
@@ -34,8 +49,8 @@ jobs:
     timeout-minutes: 230  # almost 4 hours
     runs-on:
       - runs-on=${{ github.run_id }}
-      - family=m8g.large
-      - image=ubuntu24-full-arm64
+      - family=${{ inputs.family }}
+      - image=${{ inputs.image }}
       - disk=large
       - extras=s3-cache
       - tag=${{ inputs.fuzz_target }}-fuzz
@@ -43,11 +58,6 @@ jobs:
       crashes_found: ${{ steps.check.outputs.crashes_found }}
       first_crash_name: ${{ steps.check.outputs.first_crash_name }}
       artifact_url: ${{ steps.upload_artifacts.outputs.artifact-url }}
-    env:
-      AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
-      AWS_REGION: "us-east-1"
-      AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com"
     steps:
       - uses: runs-on/action@v2
         with:
@@ -70,6 +80,11 @@ jobs:
 
       - name: Restore corpus
         shell: bash
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
+          AWS_REGION: "us-east-1"
+          AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com"
         run: |
           CORPUS_KEY="${{ inputs.fuzz_target }}_corpus.tar.zst"
           CORPUS_DIR="fuzz/corpus/${{ inputs.fuzz_target }}"
@@ -99,8 +114,13 @@ jobs:
       - name: Run fuzzing target
         id: fuzz
         run: |
+          FEATURES_FLAG=""
+          if [ -n "${{ inputs.extra_features }}" ]; then
+            FEATURES_FLAG="--features ${{ inputs.extra_features }}"
+          fi
           RUSTFLAGS="--cfg vortex_nightly" RUST_BACKTRACE=1 \
             cargo +nightly fuzz run --release --debug-assertions \
+            $FEATURES_FLAG \
             ${{ inputs.fuzz_target }} -- \
             -max_total_time=${{ inputs.max_time }} -rss_limit_mb=0 \
             2>&1 | tee fuzz_output.log
@@ -149,6 +169,11 @@ jobs:
 
       - name: Persist corpus
         shell: bash
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
+          AWS_REGION: "us-east-1"
+          AWS_ENDPOINT_URL: "https://01e9655179bbec953276890b183039bc.r2.cloudflarestorage.com"
         run: |
           CORPUS_KEY="${{ inputs.fuzz_target }}_corpus.tar.zst"
           CORPUS_DIR="fuzz/corpus/${{ inputs.fuzz_target }}"
diff --git a/Cargo.lock b/Cargo.lock
index 55984f4d702..43b31e21b6e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10639,10 +10639,12 @@ dependencies = [
  "itertools 0.14.0",
  "libfuzzer-sys",
  "strum 0.27.2",
+ "tokio",
  "vortex",
  "vortex-array",
  "vortex-btrblocks",
  "vortex-buffer",
+ "vortex-cuda",
  "vortex-dtype",
  "vortex-error",
  "vortex-file",
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index a9e2119bdb4..52281611140 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -22,6 +22,7 @@ default = ["native"]
 native = ["libfuzzer-sys", "zstd", "vortex-file", "vortex/files"]
 wasmfuzz = []
 zstd = ["vortex/zstd"]
+cuda = ["vortex-cuda", "tokio"]
 
 [dependencies]
 # Always needed - arbitrary is used for input generation
@@ -48,6 +49,10 @@ vortex-utils = { workspace = true }
 libfuzzer-sys = { workspace = true, optional = true }
 vortex-file = { workspace = true, optional = true }
 
+# GPU support dependencies (optional, only for CUDA fuzzing)
+vortex-cuda = { path = "../vortex-cuda", optional = true }
+tokio = { workspace = true, features = ["rt", "macros"], optional = true }
+
 [lints]
 workspace = true
 
@@ -82,3 +87,11 @@ name = "compress_roundtrip"
 path = "fuzz_targets/compress_roundtrip.rs"
 test = false
 required-features = ["native"]
+
+[[bin]]
+bench = false
+doc = false
+name = "compress_gpu"
+path = "fuzz_targets/compress_gpu.rs"
+test = false
+required-features = ["native", "cuda"]
diff --git a/fuzz/build.rs b/fuzz/build.rs
new file mode 100644
index 00000000000..4d2804cdb2e
--- /dev/null
+++ b/fuzz/build.rs
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::process::Command;
+
+fn main() {
+    // Declare the cfg so rustc doesn't warn about unexpected cfg.
+    println!("cargo::rustc-check-cfg=cfg(cuda_available)");
+
+    // Cargo exports CARGO_FEATURE_CUDA and CARGO_CFG_TARGET_OS to build scripts. `cfg!` would
+    // describe the platform this script was compiled for, not the target being built.
+    let cuda_feature = std::env::var_os("CARGO_FEATURE_CUDA").is_some();
+    let linux_target = std::env::var("CARGO_CFG_TARGET_OS").is_ok_and(|os| os == "linux");
+    // `vortex-cuda` is an optional dependency enabled by the `cuda` feature, so only advertise
+    // CUDA when that feature is on, the target is Linux, and nvcc is installed.
+    if cuda_feature && linux_target && has_nvcc() {
+        println!("cargo::rustc-cfg=cuda_available");
+    }
+}
+
+/// Reports whether `nvcc --version` can be spawned and exits successfully.
+fn has_nvcc() -> bool {
+    // A spawn failure (e.g. binary not on PATH) and a non-zero exit both count as "no nvcc".
+    let probe = Command::new("nvcc").arg("--version").output();
+    matches!(probe, Ok(out) if out.status.success())
+}
diff --git a/fuzz/fuzz_targets/compress_gpu.rs b/fuzz/fuzz_targets/compress_gpu.rs
new file mode 100644
index 00000000000..eea7ff6c5e3
--- /dev/null
+++ b/fuzz/fuzz_targets/compress_gpu.rs
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![no_main]
+#![allow(clippy::unwrap_used, clippy::result_large_err)]
+
+use libfuzzer_sys::Corpus;
+use libfuzzer_sys::fuzz_target;
+use vortex_error::vortex_panic;
+use vortex_fuzz::FuzzCompressGpu;
+use vortex_fuzz::run_compress_gpu;
+
+fuzz_target!(|fuzz: FuzzCompressGpu| -> Corpus {
+    // The GPU fuzzer is async; drive it to completion on a single-threaded tokio runtime.
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+    let outcome = runtime.block_on(run_compress_gpu(fuzz));
+
+    // `true` keeps the input in the corpus, `false` rejects it, and an error aborts the run.
+    match outcome {
+        Ok(true) => Corpus::Keep,
+        Ok(false) => Corpus::Reject,
+        Err(e) => vortex_panic!("{e}"),
+    }
+});
diff --git a/fuzz/src/gpu/mod.rs b/fuzz/src/gpu/mod.rs
new file mode 100644
index 00000000000..fac00bbe1a3
--- /dev/null
+++ b/fuzz/src/gpu/mod.rs
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! GPU fuzzer module for testing CUDA decompression.
+//!
+//! This module generates arbitrary instances of GPU-supported compressed encodings,
+//! then verifies that GPU decompression produces the same results as CPU decompression.
+
+use arbitrary::Arbitrary;
+use arbitrary::Result;
+use arbitrary::Unstructured;
+use vortex_array::ArrayRef;
+use vortex_array::IntoArray;
+use vortex_array::arrays::ArbitraryDictArray;
+use vortex_dtype::Nullability;
+use vortex_dtype::PType;
+
+use crate::error::VortexFuzzResult;
+
+/// Which GPU-supported encoding to generate.
+#[derive(Debug, Clone, Copy)]
+pub enum GpuEncodingKind {
+    /// Dictionary encoding with GPU take support.
+    Dict,
+}
+
+impl<'a> Arbitrary<'a> for GpuEncodingKind {
+    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
+        // Dict is the only kind so far; widen the range when more are added.
+        Ok(match u.int_in_range(0..=0)? {
+            0 => Self::Dict,
+            _ => unreachable!(),
+        })
+    }
+}
+
+/// Input for the GPU decompression fuzzer.
+#[derive(Debug)]
+pub struct FuzzCompressGpu {
+    /// The generated array handed to the fuzzer.
+    pub array: ArrayRef,
+}
+
+impl<'a> Arbitrary<'a> for FuzzCompressGpu {
+    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
+        // Draw the encoding kind first, then build an array of that kind from the same input.
+        let array = match GpuEncodingKind::arbitrary(u)? {
+            GpuEncodingKind::Dict => {
+                // Dict already has Arbitrary support; the values are restricted to a primitive
+                // dtype for GPU compatibility.
+                let value_dtype = arbitrary_gpu_primitive_dtype(u)?;
+                let dict = ArbitraryDictArray::with_dtype(u, &value_dtype, None)?.0;
+                dict.into_array()
+            }
+        };
+
+        Ok(Self { array })
+    }
+}
+
+/// Generate a random primitive DType suitable for GPU operations.
+fn arbitrary_gpu_primitive_dtype(u: &mut Unstructured) -> Result<vortex_dtype::DType> {
+    // Selector table: position i is the ptype chosen when the drawn integer equals i.
+    const PTYPES: [PType; 10] = [
+        // Unsigned integers.
+        PType::U8, PType::U16, PType::U32, PType::U64,
+        // Signed integers.
+        PType::I8, PType::I16, PType::I32, PType::I64,
+        // Floating point.
+        PType::F32, PType::F64,
+    ];
+
+    // Draw nullability first, then the selector, so input bytes are consumed in that order.
+    let nullability: Nullability = u.arbitrary()?;
+    let selector = u.int_in_range(0..=PTYPES.len() - 1)?;
+
+    Ok(vortex_dtype::DType::Primitive(PTYPES[selector], nullability))
+}
+
+/// Run the GPU decompression fuzzer.
+///
+/// This function:
+/// 1. Decompresses the array on CPU (reference)
+/// 2. Decompresses the array on GPU
+/// 3. Copies GPU result back to host using `CanonicalCudaExt::to_host`
+/// 4. Compares the dtype, the length, and then every element of the two results
+///
+/// Returns:
+/// - `Ok(true)` - test passed, keep in corpus
+/// - `Ok(false)` - test skipped (e.g., no CUDA), reject from corpus
+/// - `Err(_)` - a bug was found
+///
+/// # Panics
+///
+/// Panics via `vortex_expect` if the execution context cannot be created.
+#[cfg(cuda_available)]
+#[allow(clippy::result_large_err)]
+pub async fn run_compress_gpu(fuzz: FuzzCompressGpu) -> VortexFuzzResult<bool> {
+    use vortex_array::Array;
+    use vortex_cuda::CanonicalCudaExt;
+    use vortex_cuda::executor::CudaArrayExt;
+    use vortex_cuda::initialize_cuda;
+    // NOTE(review): vortex-cuda declares `session` as a private module; confirm this path
+    // resolves, or import the trait from wherever the crate re-exports it.
+    use vortex_cuda::session::CudaSessionExt;
+    // `vortex_expect` below is a trait method, so the trait has to be in scope.
+    use vortex_error::VortexExpect;
+    use vortex_session::VortexSession;
+
+    use crate::error::Backtrace;
+    use crate::error::VortexFuzzError;
+
+    // Every Vortex failure is reported the same way: the error itself plus a backtrace
+    // captured at the moment the failure is observed.
+    let vortex_failure = |e| VortexFuzzError::VortexError(e, Backtrace::capture());
+
+    let FuzzCompressGpu { array } = fuzz;
+
+    // Store original properties for error reporting
+    let original_len = array.len();
+
+    // 1. CPU decompression (reference)
+    let cpu_canonical = array.to_canonical().map_err(vortex_failure)?;
+
+    // 2. Create CUDA execution context
+    let session = VortexSession::empty();
+    initialize_cuda(session.cuda_session().as_ref());
+
+    let mut cuda_ctx = session
+        .create_execution_ctx()
+        .vortex_expect("cannot create session");
+
+    // 3. GPU decompression (the input is cloned because it is still needed for error reports)
+    let gpu_canonical = array
+        .clone()
+        .execute_cuda(&mut cuda_ctx)
+        .await
+        .map_err(vortex_failure)?;
+
+    // 4. Copy GPU result back to host using CanonicalCudaExt
+    let gpu_host_canonical = gpu_canonical.to_host().await.map_err(vortex_failure)?;
+
+    // 5. Compare canonicals
+    let cpu_array = cpu_canonical.into_array();
+    let gpu_array = gpu_host_canonical.into_array();
+
+    // NOTE(review): the literal `0` passed to the error variants below is presumably a step
+    // index (this fuzzer has a single step) — confirm against `VortexFuzzError`.
+    // Verify dtype is preserved
+    if cpu_array.dtype() != gpu_array.dtype() {
+        return Err(VortexFuzzError::DTypeMismatch(
+            cpu_array,
+            gpu_array,
+            0,
+            Backtrace::capture(),
+        ));
+    }
+
+    // Verify length is preserved
+    if original_len != gpu_array.len() {
+        return Err(VortexFuzzError::LengthMismatch(
+            original_len,
+            gpu_array.len(),
+            array,
+            gpu_array,
+            0,
+            Backtrace::capture(),
+        ));
+    }
+
+    // Compare element by element
+    for i in 0..original_len {
+        // A failing `scalar_at` is reported as a fuzz error, like any other Vortex failure.
+        let cpu_scalar = cpu_array.scalar_at(i).map_err(vortex_failure)?;
+        let gpu_scalar = gpu_array.scalar_at(i).map_err(vortex_failure)?;
+
+        if cpu_scalar != gpu_scalar {
+            return Err(VortexFuzzError::ArrayNotEqual(
+                cpu_scalar,
+                gpu_scalar,
+                i,
+                cpu_array,
+                gpu_array,
+                0,
+                Backtrace::capture(),
+            ));
+        }
+    }
+
+    // All checks passed: keep this input in the corpus.
+    Ok(true)
+}
+
+/// No-op fallback when CUDA is not available: every input is rejected from the corpus.
+#[cfg(not(cuda_available))]
+#[allow(clippy::result_large_err)]
+pub async fn run_compress_gpu(_fuzz: FuzzCompressGpu) -> VortexFuzzResult<bool> {
+    Ok(false)
+}
diff --git a/fuzz/src/lib.rs b/fuzz/src/lib.rs
index dd0b3022642..91c75d4018d 100644
--- a/fuzz/src/lib.rs
+++ b/fuzz/src/lib.rs
@@ -10,6 +10,10 @@ pub mod error;
 // File module only available for native builds (requires vortex-file which uses tokio)
 #[cfg(not(target_arch = "wasm32"))]
 pub mod file;
+
+// GPU fuzzer module (needs the `cuda` feature; it degrades to a no-op without a CUDA toolchain)
+#[cfg(feature = "cuda")]
+pub mod gpu;
 pub use array::Action;
 pub use array::CompressorStrategy;
 pub use array::ExpectedValue;
@@ -20,6 +24,10 @@ pub use compress::FuzzCompressRoundtrip;
 pub use compress::run_compress_roundtrip;
 #[cfg(not(target_arch = "wasm32"))]
 pub use file::FuzzFileAction;
+#[cfg(feature = "cuda")]
+pub use gpu::FuzzCompressGpu;
+#[cfg(feature = "cuda")]
+pub use gpu::run_compress_gpu;
 
 // Runtime initialization - platform-specific
 #[cfg(not(target_arch = "wasm32"))]
diff --git a/vortex-array/src/arrays/primitive/compute/take/avx2.rs b/vortex-array/src/arrays/primitive/compute/take/avx2.rs
index c330a9226a3..5f46d626718 100644
--- a/vortex-array/src/arrays/primitive/compute/take/avx2.rs
+++ b/vortex-array/src/arrays/primitive/compute/take/avx2.rs
@@ -48,6 +48,7 @@ impl TakeImpl for TakeKernelAVX2 {
 ///
 /// The caller must ensure that if the validity has a length, it is the same length as the indices,
 /// and that the `avx2` feature is enabled.
+#[allow(unused)]
 #[target_feature(enable = "avx2")]
 unsafe fn take_primitive_avx2<V, I>(
     values: &[V],
diff --git a/vortex-cuda/src/canonical.rs b/vortex-cuda/src/canonical.rs
new file mode 100644
index 00000000000..7306c80aec1
--- /dev/null
+++ b/vortex-cuda/src/canonical.rs
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use async_trait::async_trait;
+use vortex_array::Canonical;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::BoolArrayParts;
+use vortex_array::arrays::DecimalArray;
+use vortex_array::arrays::DecimalArrayParts;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::PrimitiveArrayParts;
+use vortex_array::buffer::BufferHandle;
+use vortex_error::VortexResult;
+
+/// Extension trait that moves canonical array data from the device to the host.
+#[async_trait]
+pub trait CanonicalCudaExt {
+    async fn to_host(self) -> VortexResult<Self>
+    where
+        Self: Sized;
+}
+
+#[async_trait]
+impl CanonicalCudaExt for Canonical {
+    async fn to_host(self) -> VortexResult<Self> {
+        match self {
+            n @ Canonical::Null(_) => Ok(n),
+            Canonical::Bool(bool) => {
+                // NOTE: the bits are passed through as-is; copy them to the host once a buffer
+                // handle is added. Validity is not copied to the host by any arm here yet.
+                let BoolArrayParts { bits, validity, .. } = bool.into_parts();
+                Ok(Canonical::Bool(BoolArray::from_bit_buffer(bits, validity)))
+            }
+            Canonical::Primitive(prim) => {
+                let PrimitiveArrayParts {
+                    ptype,
+                    buffer,
+                    validity,
+                    ..
+                } = prim.into_parts();
+                Ok(Canonical::Primitive(PrimitiveArray::from_byte_buffer(
+                    buffer.try_into_host()?.await?,
+                    ptype,
+                    validity,
+                )))
+            }
+            Canonical::Decimal(decimal) => {
+                let DecimalArrayParts {
+                    decimal_dtype,
+                    values,
+                    values_type,
+                    validity,
+                    ..
+                } = decimal.into_parts();
+                Ok(Canonical::Decimal(unsafe {
+                    DecimalArray::new_unchecked_handle(
+                        BufferHandle::new_host(values.try_into_host()?.await?),
+                        values_type,
+                        decimal_dtype,
+                        validity,
+                    )
+                }))
+            }
+            _ => vortex_error::vortex_bail!("to_host: unsupported canonical variant"),
+        }
+    }
+}
diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs
index d77be685b20..c331dc80103 100644
--- a/vortex-cuda/src/lib.rs
+++ b/vortex-cuda/src/lib.rs
@@ -3,12 +3,14 @@
 
 //! CUDA support for Vortex arrays.
 
+mod canonical;
 mod device_buffer;
 pub mod executor;
 mod kernel;
 mod session;
 mod stream;
 
+pub use canonical::CanonicalCudaExt;
 pub use device_buffer::CudaBufferExt;
 pub use device_buffer::CudaDeviceBuffer;
 pub use executor::CudaExecutionCtx;