From 11351bf1e9f3959b78ce1f4483dfd10aef4b46e7 Mon Sep 17 00:00:00 2001
From: Onur Satici <onursatici@gmail.com>
Date: Tue, 27 Jan 2026 00:39:18 +0000
Subject: [PATCH 1/5] allocators

Signed-off-by: Onur Satici <onursatici@gmail.com>
Signed-off-by: Onur Satici <onur@spiraldb.com>
---
 Cargo.lock                          |   4 +
 vortex-array/src/serde.rs           |  27 +-
 vortex-cuda/Cargo.toml              |   3 +
 vortex-cuda/nvcomp/build.rs         | 142 ++++++++---
 vortex-cuda/src/lib.rs              |   7 +
 vortex-cuda/src/pinned.rs           | 372 ++++++++++++++++++++++++++++
 vortex-cuda/src/pinned_allocator.rs | 165 ++++++++++++
 vortex-cuda/src/session.rs          |  14 +-
 vortex-file/src/open.rs             |  49 +++-
 vortex-file/src/read/driver.rs      |   4 +-
 vortex-file/src/read/request.rs     |  33 ++-
 vortex-file/src/segments/source.rs  |  35 ++-
 vortex-io/Cargo.toml                |   1 +
 vortex-io/src/allocator.rs          |  52 ++++
 vortex-io/src/file/object_store.rs  |  70 ++++++
 vortex-io/src/file/std_file.rs      |  25 +-
 vortex-io/src/lib.rs                |   4 +
 vortex-io/src/read.rs               |  71 ++++++
 vortex-io/src/runtime/tests.rs      |  23 ++
 vortex-io/src/write_target.rs       |  39 +++
 20 files changed, 1060 insertions(+), 80 deletions(-)
 create mode 100644 vortex-cuda/src/pinned.rs
 create mode 100644 vortex-cuda/src/pinned_allocator.rs
 create mode 100644 vortex-io/src/allocator.rs
 create mode 100644 vortex-io/src/write_target.rs
diff --git a/Cargo.lock b/Cargo.lock
index 62a1aa354cd..ff122aced98 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10335,10 +10335,12 @@ name = "vortex-cuda"
 version = "0.1.0"
 dependencies = [
  "async-trait",
+ "bytes",
  "codspeed-criterion-compat-walltime",
  "cudarc",
  "futures",
  "kanal",
+ "parking_lot",
  "rstest",
  "tokio",
  "tracing",
@@ -10350,6 +10352,7 @@ dependencies = [
  "vortex-dtype",
  "vortex-error",
  "vortex-fastlanes",
+ "vortex-io",
  "vortex-mask",
  "vortex-nvcomp",
  "vortex-scalar",
@@ -10688,6 +10691,7 @@ dependencies = [
  "tempfile",
  "tokio",
  "tracing",
+ "vortex-array",
  "vortex-buffer",
  "vortex-error",
  "vortex-metrics",
diff --git a/vortex-array/src/serde.rs b/vortex-array/src/serde.rs
index a3aa423b177..06aa687068a 100644
--- a/vortex-array/src/serde.rs
+++ b/vortex-array/src/serde.rs
@@ -490,10 +490,13 @@ impl ArrayParts {
         array_tree: ByteBuffer,
         segment: BufferHandle,
     ) -> VortexResult<Self> {
-        // TODO: this can also work with device buffers.
-        let segment = segment.try_to_host_sync()?;
-        // We align each buffer individually, so we remove alignment requirements on the buffer.
-        let segment = segment.aligned(Alignment::none());
+        // We align each buffer individually, so we remove alignment requirements on the segment
+        // for host-resident buffers. Device buffers are sliced directly.
+        let segment = if let Some(host) = segment.as_host_opt() {
+            BufferHandle::new_host(host.clone().aligned(Alignment::none()))
+        } else {
+            segment
+        };
 
         let fb_buffer = FlatBuffer::align_from(array_tree);
 
@@ -515,12 +518,18 @@ impl ArrayParts {
                     let buffer_len = fb_buf.length() as usize;
 
                     // Extract a buffer and ensure it's aligned, copying if necessary
-                    let buffer = segment
-                        .slice(offset..(offset + buffer_len))
-                        .aligned(Alignment::from_exponent(fb_buf.alignment_exponent()));
-
+                    let buffer = segment.slice(offset..(offset + buffer_len));
+                    let buffer = if let Some(host) = buffer.as_host_opt() {
+                        BufferHandle::new_host(
+                            host.clone().aligned(Alignment::from_exponent(
+                                fb_buf.alignment_exponent(),
+                            )),
+                        )
+                    } else {
+                        buffer
+                    };
                     offset += buffer_len;
-                    BufferHandle::new_host(buffer)
+                    buffer
                 })
                 .collect();
 
diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml
index a7b7bd6d4b0..cf4c85f4567 100644
--- a/vortex-cuda/Cargo.toml
+++ b/vortex-cuda/Cargo.toml
@@ -22,9 +22,11 @@ _test-harness = []
 
 [dependencies]
 async-trait = { workspace = true }
+bytes = { workspace = true }
 cudarc = { workspace = true }
 futures = { workspace = true, features = ["executor"] }
 kanal = { workspace = true }
+parking_lot = { workspace = true }
 tracing = { workspace = true }
 vortex-alp = { workspace = true }
 vortex-array = { workspace = true }
@@ -35,6 +37,7 @@ vortex-error = { workspace = true }
 vortex-fastlanes = { workspace = true }
 vortex-mask = { workspace = true }
 vortex-nvcomp = { path = "nvcomp" }
+vortex-io = { workspace = true }
 vortex-session = { workspace = true }
 vortex-utils = { workspace = true }
 vortex-zigzag = { workspace = true }
diff --git a/vortex-cuda/nvcomp/build.rs b/vortex-cuda/nvcomp/build.rs
index 8ada54e46ad..d3f7c37543a 100644
--- a/vortex-cuda/nvcomp/build.rs
+++ b/vortex-cuda/nvcomp/build.rs
@@ -32,6 +32,67 @@ typedef int cudaError_t;
 #define cudaSuccess 0
 "#;
 
+/// Minimal nvCOMP headers for non-Linux platforms to allow bindgen to run.
+const NVCOMP_STUB_HEADER: &str = r#"
+#pragma once
+#include <stddef.h>
+#include "cuda_runtime.h"
+
+typedef enum nvcompStatus_t {
+    nvcompSuccess = 0,
+    nvcompErrorInvalidValue = 1,
+    nvcompErrorNotSupported = 2,
+    nvcompErrorCannotDecompress = 3,
+    nvcompErrorBadChecksum = 4,
+    nvcompErrorCannotVerifyChecksums = 5,
+    nvcompErrorOutputBufferTooSmall = 6,
+    nvcompErrorWrongHeaderLength = 7,
+    nvcompErrorAlignment = 8,
+    nvcompErrorChunkSizeTooLarge = 9,
+    nvcompErrorCannotCompress = 10,
+    nvcompErrorWrongInputLength = 11,
+    nvcompErrorBatchSizeTooLarge = 12,
+    nvcompErrorCudaError = 13,
+    nvcompErrorInternal = 14
+} nvcompStatus_t;
+
+typedef enum nvcompDecompressBackend_t {
+    NVCOMP_DECOMPRESS_BACKEND_DEFAULT = 0,
+    NVCOMP_DECOMPRESS_BACKEND_HARDWARE = 1,
+    NVCOMP_DECOMPRESS_BACKEND_CUDA = 2
+} nvcompDecompressBackend_t;
+
+typedef struct nvcompBatchedZstdDecompressOpts_t {
+    nvcompDecompressBackend_t backend;
+    unsigned char reserved[60];
+} nvcompBatchedZstdDecompressOpts_t;
+"#;
+
+const NVCOMP_ZSTD_STUB_HEADER: &str = r#"
+#pragma once
+#include "nvcomp.h"
+
+nvcompStatus_t nvcompBatchedZstdDecompressGetTempSizeAsync(
+    size_t numChunks,
+    size_t maxUncompressedChunkBytes,
+    nvcompBatchedZstdDecompressOpts_t opts,
+    size_t* tempBytes,
+    size_t maxTotalUncompressedBytes);
+
+nvcompStatus_t nvcompBatchedZstdDecompressAsync(
+    const void* const* device_compressed_ptrs,
+    const size_t* device_compressed_bytes,
+    const size_t* device_uncompressed_bytes,
+    size_t* device_actual_uncompressed_bytes,
+    size_t num_chunks,
+    void* device_temp_ptr,
+    size_t temp_bytes,
+    void* const* device_uncompressed_ptrs,
+    nvcompBatchedZstdDecompressOpts_t opts,
+    nvcompStatus_t* device_statuses,
+    cudaStream_t stream);
+"#;
+
 fn main() {
     // Declare the cfg so rustc doesn't warn about unexpected cfg.
     println!("cargo::rustc-check-cfg=cfg(cuda_available)");
@@ -45,49 +106,60 @@ fn main() {
     fs::create_dir_all(&cuda_stub_dir).unwrap();
     fs::write(cuda_stub_dir.join("cuda_runtime.h"), CUDA_RUNTIME_STUB).unwrap();
 
-    let (os, arch) = match (env::consts::OS, env::consts::ARCH) {
-        ("linux", "x86_64") => ("linux", "x86_64"),
-        ("linux", "aarch64") => ("linux", "sbsa"),
-        // Fall back to linux-x86_64 to generate bindings for any platform.
-        _ => ("linux", "x86_64"),
-    };
-
-    let archive_name = format!("nvcomp-{os}-{arch}-{NVCOMP_VERSION}_{CUDA_VERSION}-archive");
-    let url = format!(
-        "https://developer.download.nvidia.com/compute/nvcomp/redist/nvcomp/{os}-{arch}/{archive_name}.tar.xz"
-    );
+    let is_linux = env::consts::OS == "linux";
+    let include_dir = if is_linux {
+        let (os, arch) = match (env::consts::OS, env::consts::ARCH) {
+            ("linux", "x86_64") => ("linux", "x86_64"),
+            ("linux", "aarch64") => ("linux", "sbsa"),
+            _ => ("linux", "x86_64"),
+        };
+
+        let archive_name = format!("nvcomp-{os}-{arch}-{NVCOMP_VERSION}_{CUDA_VERSION}-archive");
+        let url = format!(
+            "https://developer.download.nvidia.com/compute/nvcomp/redist/nvcomp/{os}-{arch}/{archive_name}.tar.xz"
+        );
 
-    let include_dir = nvcomp_dir.join("include");
+        let include_dir = nvcomp_dir.join("include");
 
-    if !include_dir.exists() {
-        let response = reqwest::blocking::get(&url)
-            .unwrap_or_else(|e| panic!("Failed to download nvCOMP: {e}"));
+        if !include_dir.exists() {
+            let response = reqwest::blocking::get(&url)
+                .unwrap_or_else(|e| panic!("Failed to download nvCOMP: {e}"));
 
-        assert!(
-            response.status().is_success(),
-            "Failed to download nvCOMP: HTTP {}",
-            response.status()
-        );
+            assert!(
+                response.status().is_success(),
+                "Failed to download nvCOMP: HTTP {}",
+                response.status()
+            );
 
-        let bytes = response.bytes().unwrap();
+            let bytes = response.bytes().unwrap();
 
-        // Extract tar.xz archive.
-        let cursor = Cursor::new(bytes.as_ref());
-        let xz = XzDecoder::new(cursor);
-        let mut archive = tar::Archive::new(xz);
+            // Extract tar.xz archive.
+            let cursor = Cursor::new(bytes.as_ref());
+            let xz = XzDecoder::new(cursor);
+            let mut archive = tar::Archive::new(xz);
 
-        let temp_dir = nvcomp_dir.with_extension("tmp");
-        fs::create_dir_all(&temp_dir).unwrap();
-        archive.unpack(&temp_dir).unwrap();
+            let temp_dir = nvcomp_dir.with_extension("tmp");
+            fs::create_dir_all(&temp_dir).unwrap();
+            archive.unpack(&temp_dir).unwrap();
 
-        // Move extracted content.
-        let extracted = temp_dir.join(&archive_name);
-        if nvcomp_dir.exists() {
-            fs::remove_dir_all(&nvcomp_dir).unwrap();
+            // Move extracted content.
+            let extracted = temp_dir.join(&archive_name);
+            if nvcomp_dir.exists() {
+                fs::remove_dir_all(&nvcomp_dir).unwrap();
+            }
+            fs::rename(&extracted, &nvcomp_dir).unwrap();
+            fs::remove_dir_all(&temp_dir).ok();
         }
-        fs::rename(&extracted, &nvcomp_dir).unwrap();
-        fs::remove_dir_all(&temp_dir).ok();
-    }
+
+        include_dir
+    } else {
+        let stub_include = out_dir.join("nvcomp-stub").join("include");
+        let stub_nvcomp = stub_include.join("nvcomp");
+        fs::create_dir_all(&stub_nvcomp).unwrap();
+        fs::write(stub_include.join("nvcomp.h"), NVCOMP_STUB_HEADER).unwrap();
+        fs::write(stub_nvcomp.join("zstd.h"), NVCOMP_ZSTD_STUB_HEADER).unwrap();
+        stub_include
+    };
 
     // Functions are loaded at runtime via libloading to avoid link-time symbol resolution.
     let bindings = bindgen::Builder::default()
diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs
index da448e9b065..08b9806b38f 100644
--- a/vortex-cuda/src/lib.rs
+++ b/vortex-cuda/src/lib.rs
@@ -9,6 +9,8 @@ mod canonical;
 mod device_buffer;
 pub mod executor;
 mod kernel;
+mod pinned;
+mod pinned_allocator;
 mod session;
 mod stream;
 
@@ -17,6 +19,12 @@ pub use device_buffer::CudaBufferExt;
 pub use device_buffer::CudaDeviceBuffer;
 pub use executor::CudaExecutionCtx;
 pub use executor::CudaKernelEvents;
+pub use pinned::PinnedByteBuffer;
+pub use pinned::PinnedByteBufferPool;
+pub use pinned::PinnedPoolStats;
+pub use pinned::PooledPinnedBuffer;
+pub use pinned_allocator::PinnedBufferAllocator;
+pub use pinned_allocator::PinnedDeviceAllocator;
 use kernel::ALPExecutor;
 use kernel::DecimalBytePartsExecutor;
 use kernel::DictExecutor;
diff --git a/vortex-cuda/src/pinned.rs b/vortex-cuda/src/pinned.rs
new file mode 100644
index 00000000000..948bf7bfbb5
--- /dev/null
+++ b/vortex-cuda/src/pinned.rs
@@ -0,0 +1,372 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::sync::Arc;
+
+use bytes::Bytes;
+use cudarc::driver::CudaContext;
+use cudarc::driver::CudaStream;
+use cudarc::driver::HostSlice;
+use cudarc::driver::PinnedHostSlice;
+use cudarc::driver::SyncOnDrop;
+use parking_lot::Mutex;
+use vortex_buffer::ByteBuffer;
+use vortex_error::VortexResult;
+use vortex_error::vortex_err;
+use vortex_error::vortex_panic;
+use vortex_utils::aliases::hash_map::HashMap;
+
+/// A page-locked host buffer allocated by CUDA.
+///
+/// This is intended as a staging buffer for H2D transfers. Contents are uninitialized after
+/// allocation.
+pub struct PinnedByteBuffer {
+    inner: PinnedHostSlice<u8>,
+}
+
+#[allow(clippy::same_name_method)]
+impl PinnedByteBuffer {
+    /// Allocate a page-locked host buffer of `len` bytes without initializing it.
+    ///
+    /// # Safety
+    /// The bytes of the returned buffer are uninitialized; write them before any read.
+    pub unsafe fn uninit(ctx: &Arc<CudaContext>, len: usize) -> VortexResult<Self> {
+        // SAFETY: the write-before-read obligation is passed on to our caller (see above).
+        let allocated = unsafe { ctx.alloc_pinned::<u8>(len) };
+        allocated
+            .map(|inner| Self { inner })
+            .map_err(|e| vortex_err!("failed to allocate pinned host buffer: {e}"))
+    }
+
+    /// Returns the length of the buffer in bytes.
+    pub fn len(&self) -> usize {
+        self.inner.len()
+    }
+
+    /// Returns true if the buffer is empty.
+    pub fn is_empty(&self) -> bool {
+        self.inner.is_empty()
+    }
+
+    /// Returns the buffer as an immutable slice.
+    pub fn as_slice(&self) -> VortexResult<&[u8]> {
+        self.inner
+            .as_slice()
+            .map_err(|e| vortex_err!("failed to access pinned host buffer: {e}"))
+    }
+
+    /// Returns the buffer as a mutable slice.
+    pub fn as_mut_slice(&mut self) -> VortexResult<&mut [u8]> {
+        self.inner
+            .as_mut_slice()
+            .map_err(|e| vortex_err!("failed to access pinned host buffer: {e}"))
+    }
+
+    /// Returns a raw pointer to the buffer.
+    pub fn as_ptr(&self) -> VortexResult<*const u8> {
+        self.inner
+            .as_ptr()
+            .map_err(|e| vortex_err!("failed to access pinned host buffer: {e}"))
+    }
+
+    /// Returns a mutable raw pointer to the buffer.
+    pub fn as_mut_ptr(&mut self) -> VortexResult<*mut u8> {
+        self.inner
+            .as_mut_ptr()
+            .map_err(|e| vortex_err!("failed to access pinned host buffer: {e}"))
+    }
+
+    /// Returns the CUDA context that owns this allocation.
+    pub fn context(&self) -> &Arc<CudaContext> {
+        self.inner.context()
+    }
+}
+
+#[allow(clippy::same_name_method)]
+impl HostSlice<u8> for PinnedByteBuffer {
+    fn len(&self) -> usize {
+        self.len()
+    }
+
+    unsafe fn stream_synced_slice<'a>(
+        &'a self,
+        stream: &'a CudaStream,
+    ) -> (&'a [u8], SyncOnDrop<'a>) {
+        unsafe { <PinnedHostSlice<u8> as HostSlice<u8>>::stream_synced_slice(&self.inner, stream) }
+    }
+
+    unsafe fn stream_synced_mut_slice<'a>(
+        &'a mut self,
+        stream: &'a CudaStream,
+    ) -> (&'a mut [u8], SyncOnDrop<'a>) {
+        unsafe {
+            <PinnedHostSlice<u8> as HostSlice<u8>>::stream_synced_mut_slice(&mut self.inner, stream)
+        }
+    }
+}
+
+/// A simple pinned host buffer pool keyed by allocation size.
+pub struct PinnedByteBufferPool {
+    ctx: Arc<CudaContext>,
+    max_keep_per_size: usize,
+    buckets: Mutex<HashMap<usize, Vec<PinnedByteBuffer>>>,
+    hits: std::sync::atomic::AtomicU64,
+    misses: std::sync::atomic::AtomicU64,
+    allocs: std::sync::atomic::AtomicU64,
+    puts: std::sync::atomic::AtomicU64,
+}
+
+impl PinnedByteBufferPool {
+    /// Create a new pool with default limits.
+    pub fn new(ctx: Arc<CudaContext>) -> Self {
+        Self::with_limits(ctx, 4)
+    }
+
+    /// Create a new pool with a maximum number of cached buffers per size.
+    pub fn with_limits(ctx: Arc<CudaContext>, max_keep_per_size: usize) -> Self {
+        Self {
+            ctx,
+            max_keep_per_size: max_keep_per_size.max(1),
+            buckets: Mutex::new(HashMap::new()),
+            hits: std::sync::atomic::AtomicU64::new(0),
+            misses: std::sync::atomic::AtomicU64::new(0),
+            allocs: std::sync::atomic::AtomicU64::new(0),
+            puts: std::sync::atomic::AtomicU64::new(0),
+        }
+    }
+
+    /// Acquire a pinned buffer of `len` bytes, reusing a cached buffer of that exact size if any.
+    pub fn get(&self, len: usize) -> VortexResult<PinnedByteBuffer> {
+        // The lock guard is a temporary of this one statement, so the pool is unlocked again
+        // before the pinned allocation below and concurrent `get`/`put` calls are not blocked.
+        let cached = self.buckets.lock().get_mut(&len).and_then(Vec::pop);
+        if let Some(buf) = cached {
+            self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+            return Ok(buf);
+        }
+        self.misses.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        self.allocs.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        // SAFETY: NOTE(review) relies on callers writing the buffer before reading it — confirm.
+        unsafe { PinnedByteBuffer::uninit(&self.ctx, len) }
+    }
+
+    /// Hand `buf` back to the pool; it is dropped instead if its size bucket is already full.
+    pub fn put(&self, buf: PinnedByteBuffer) -> VortexResult<()> {
+        let size = buf.len();
+        let mut guard = self.buckets.lock();
+        let slot = guard.entry(size).or_default();
+        if slot.len() < self.max_keep_per_size {
+            slot.push(buf);
+        }
+        self.puts.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        Ok(())
+    }
+
+    /// Get a pooled pinned buffer that will be returned to the pool on drop.
+    pub fn get_pooled(self: &Arc<Self>, len: usize) -> VortexResult<PooledPinnedBuffer> {
+        let inner = self.get(len)?;
+        Ok(PooledPinnedBuffer {
+            inner: Some(inner),
+            pool: self.clone(),
+        })
+    }
+
+    /// Snapshot pool reuse statistics.
+    pub fn stats(&self) -> PinnedPoolStats {
+        PinnedPoolStats {
+            hits: self.hits.load(std::sync::atomic::Ordering::Relaxed),
+            misses: self.misses.load(std::sync::atomic::Ordering::Relaxed),
+            allocs: self.allocs.load(std::sync::atomic::Ordering::Relaxed),
+            puts: self.puts.load(std::sync::atomic::Ordering::Relaxed),
+        }
+    }
+
+    /// Reset pool reuse statistics.
+    pub fn reset_stats(&self) {
+        self.hits.store(0, std::sync::atomic::Ordering::Relaxed);
+        self.misses.store(0, std::sync::atomic::Ordering::Relaxed);
+        self.allocs.store(0, std::sync::atomic::Ordering::Relaxed);
+        self.puts.store(0, std::sync::atomic::Ordering::Relaxed);
+    }
+}
+
+/// Reuse counters for a pinned buffer pool.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct PinnedPoolStats {
+    pub hits: u64,
+    pub misses: u64,
+    pub allocs: u64,
+    pub puts: u64,
+}
+
+/// A pinned buffer that is returned to its pool when dropped.
+///
+/// This wrapper owns a [`PinnedByteBuffer`] and ensures it gets returned to the
+/// [`PinnedByteBufferPool`] when the buffer is no longer needed. This enables efficient
+/// buffer reuse for I/O operations.
+pub struct PooledPinnedBuffer {
+    inner: Option<PinnedByteBuffer>,
+    pool: Arc<PinnedByteBufferPool>,
+}
+
+#[allow(clippy::same_name_method)]
+impl PooledPinnedBuffer {
+    /// Create a new pooled buffer.
+    pub fn new(inner: PinnedByteBuffer, pool: Arc<PinnedByteBufferPool>) -> Self {
+        Self {
+            inner: Some(inner),
+            pool,
+        }
+    }
+
+    /// Returns the length of the buffer in bytes.
+    pub fn len(&self) -> usize {
+        self.inner
+            .as_ref()
+            .map(|b| b.len())
+            .unwrap_or_else(|| vortex_panic!("buffer already consumed"))
+    }
+
+    /// Returns true if the buffer is empty.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns the buffer as an immutable slice.
+    pub fn as_slice(&self) -> &[u8] {
+        let inner = self
+            .inner
+            .as_ref()
+            .unwrap_or_else(|| vortex_panic!("buffer already consumed"));
+        inner
+            .as_slice()
+            .unwrap_or_else(|e| vortex_panic!("failed to access pinned host buffer: {e}"))
+    }
+
+    /// Returns the buffer as a mutable slice.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the buffer has already been consumed or if the CUDA context is invalid.
+    pub fn as_mut_slice(&mut self) -> &mut [u8] {
+        let inner = self
+            .inner
+            .as_mut()
+            .unwrap_or_else(|| vortex_panic!("buffer already consumed"));
+        inner
+            .as_mut_slice()
+            .unwrap_or_else(|e| vortex_panic!("failed to access pinned host buffer: {e}"))
+    }
+
+    /// Convert this pooled buffer into a [`ByteBuffer`].
+    ///
+    /// The returned buffer will return the underlying pinned memory to the pool when dropped.
+    /// This enables zero-copy conversion to the standard Vortex buffer type while maintaining
+    /// pool-based memory reuse.
+    pub fn into_byte_buffer(mut self) -> ByteBuffer {
+        let inner = self
+            .inner
+            .take()
+            .unwrap_or_else(|| vortex_panic!("buffer already consumed"));
+        let len = inner.len();
+        let pool = self.pool.clone();
+
+        // Create a wrapper that will return the buffer to the pool on drop
+        let wrapper = PooledPinnedBufferOwner::new(inner, pool);
+
+        // Use Bytes::from_owner to create a Bytes that owns the wrapper
+        let bytes = Bytes::from_owner(wrapper);
+
+        // The ByteBuffer should have the full length
+        assert_eq!(bytes.len(), len);
+
+        ByteBuffer::from(bytes)
+    }
+}
+
+#[allow(clippy::same_name_method)]
+impl HostSlice<u8> for PooledPinnedBuffer {
+    fn len(&self) -> usize {
+        self.len()
+    }
+
+    unsafe fn stream_synced_slice<'a>(
+        &'a self,
+        stream: &'a CudaStream,
+    ) -> (&'a [u8], SyncOnDrop<'a>) {
+        let inner = self
+            .inner
+            .as_ref()
+            .unwrap_or_else(|| vortex_panic!("buffer already consumed"));
+        unsafe { HostSlice::stream_synced_slice(inner, stream) }
+    }
+
+    unsafe fn stream_synced_mut_slice<'a>(
+        &'a mut self,
+        stream: &'a CudaStream,
+    ) -> (&'a mut [u8], SyncOnDrop<'a>) {
+        let inner = self
+            .inner
+            .as_mut()
+            .unwrap_or_else(|| vortex_panic!("buffer already consumed"));
+        unsafe { HostSlice::stream_synced_mut_slice(inner, stream) }
+    }
+}
+
+impl Drop for PooledPinnedBuffer {
+    fn drop(&mut self) {
+        if let Some(inner) = self.inner.take() {
+            // Return the buffer to the pool, ignoring errors
+            drop(self.pool.put(inner));
+        }
+    }
+}
+
+/// Internal wrapper that owns a PinnedByteBuffer and returns it to the pool on drop.
+///
+/// This is used by `Bytes::from_owner` to manage the lifecycle of pooled pinned buffers.
+struct PooledPinnedBufferOwner {
+    // We use Option so we can take the buffer out in Drop
+    inner: Option<PinnedByteBuffer>,
+    // Cached pointer and length for AsRef implementation
+    ptr: *const u8,
+    len: usize,
+    pool: Arc<PinnedByteBufferPool>,
+}
+
+// SAFETY: The pinned buffer is allocated by CUDA and is safe to send across threads.
+// The pointer is derived from the buffer and remains valid as long as the buffer exists.
+unsafe impl Send for PooledPinnedBufferOwner {}
+unsafe impl Sync for PooledPinnedBufferOwner {}
+
+impl PooledPinnedBufferOwner {
+    fn new(inner: PinnedByteBuffer, pool: Arc<PinnedByteBufferPool>) -> Self {
+        let ptr = inner
+            .as_ptr()
+            .unwrap_or_else(|e| vortex_panic!("failed to get pointer to pinned buffer: {e}"));
+        let len = inner.len();
+        Self {
+            inner: Some(inner),
+            ptr,
+            len,
+            pool,
+        }
+    }
+}
+
+impl AsRef<[u8]> for PooledPinnedBufferOwner {
+    fn as_ref(&self) -> &[u8] {
+        // SAFETY: `ptr`/`len` were captured from `inner` in `new`, and `inner` is only taken
+        // out (and handed back to the pool) in `Drop`, so it is still owned by `self` here.
+        unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
+    }
+}
+
+impl Drop for PooledPinnedBufferOwner {
+    fn drop(&mut self) {
+        // Take the buffer out and return it to the pool
+        if let Some(buffer) = self.inner.take() {
+            drop(self.pool.put(buffer));
+        }
+    }
+}
diff --git a/vortex-cuda/src/pinned_allocator.rs b/vortex-cuda/src/pinned_allocator.rs
new file mode 100644
index 00000000000..6658094f5d3
--- /dev/null
+++ b/vortex-cuda/src/pinned_allocator.rs
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::sync::Arc;
+
+use cudarc::driver::CudaStream;
+use cudarc::driver::DevicePtrMut;
+use cudarc::driver::result::memcpy_htod_async;
+use futures::future::BoxFuture;
+use futures::FutureExt;
+use vortex_array::buffer::BufferHandle;
+use vortex_buffer::Alignment;
+use vortex_error::VortexResult;
+use vortex_error::vortex_err;
+use vortex_io::BufferAllocator;
+use vortex_io::WriteTarget;
+use vortex_session::VortexSession;
+
+use crate::PinnedByteBufferPool;
+use crate::PooledPinnedBuffer;
+use crate::device_buffer::CudaDeviceBuffer;
+use crate::session::CudaSessionExt;
+use crate::stream::await_stream_callback;
+
+/// Allocator that sources buffers from a CUDA pinned pool.
+pub struct PinnedBufferAllocator {
+    pool: Arc<PinnedByteBufferPool>,
+}
+
+impl PinnedBufferAllocator {
+    pub fn new(pool: Arc<PinnedByteBufferPool>) -> Self {
+        Self { pool }
+    }
+}
+
+impl BufferAllocator for PinnedBufferAllocator {
+    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn WriteTarget>> {
+        let buffer = self.pool.get_pooled(len)?;
+        Ok(Box::new(AlignedPinnedWriteTarget::new(buffer, alignment)))
+    }
+}
+
+struct AlignedPinnedWriteTarget {
+    buffer: PooledPinnedBuffer,
+    alignment: Alignment,
+}
+
+impl AlignedPinnedWriteTarget {
+    fn new(buffer: PooledPinnedBuffer, alignment: Alignment) -> Self {
+        Self { buffer, alignment }
+    }
+}
+
+impl WriteTarget for AlignedPinnedWriteTarget {
+    fn as_mut_slice(&mut self) -> &mut [u8] {
+        self.buffer.as_mut_slice()
+    }
+
+    fn len(&self) -> usize {
+        self.buffer.len()
+    }
+
+    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        async move {
+            let ptr = self.buffer.as_slice().as_ptr() as usize;
+            let align = *self.alignment;
+            // CUDA pinned allocations don't accept an explicit alignment request,
+            // so we validate the actual pointer alignment after allocation.
+            if align > 1 && ptr % align != 0 {
+                return Err(vortex_err!(
+                    "Pinned host buffer not aligned to {} (ptr=0x{:x})",
+                    align,
+                    ptr
+                ));
+            }
+            Ok(BufferHandle::new_host(self.buffer.into_byte_buffer()))
+        }
+        .boxed()
+    }
+}
+
+/// Allocator that reads into pinned buffers and transfers to device memory.
+pub struct PinnedDeviceAllocator {
+    pool: Arc<PinnedByteBufferPool>,
+    stream: Arc<CudaStream>,
+}
+
+impl PinnedDeviceAllocator {
+    pub fn new(pool: Arc<PinnedByteBufferPool>, stream: Arc<CudaStream>) -> Self {
+        Self { pool, stream }
+    }
+
+    pub fn from_session(
+        pool: Arc<PinnedByteBufferPool>,
+        session: &VortexSession,
+    ) -> VortexResult<Self> {
+        let stream = session.cuda_session().new_stream()?;
+        Ok(Self::new(pool, stream))
+    }
+}
+
+impl BufferAllocator for PinnedDeviceAllocator {
+    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn WriteTarget>> {
+        let buffer = self.pool.get_pooled(len)?;
+        Ok(Box::new(PinnedDeviceWriteTarget {
+            buffer,
+            stream: self.stream.clone(),
+            alignment,
+        }))
+    }
+}
+
+struct PinnedDeviceWriteTarget {
+    buffer: PooledPinnedBuffer,
+    stream: Arc<CudaStream>,
+    alignment: Alignment,
+}
+
+impl WriteTarget for PinnedDeviceWriteTarget {
+    fn as_mut_slice(&mut self) -> &mut [u8] {
+        self.buffer.as_mut_slice()
+    }
+
+    fn len(&self) -> usize {
+        self.buffer.len()
+    }
+
+    /// Copy the pinned host bytes into a new device allocation and return it as a device handle.
+    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        let len = self.buffer.len();
+        let stream = self.stream.clone();
+        let host = self.buffer;
+        let alignment = self.alignment;
+        async move {
+            let ptr = host.as_slice().as_ptr() as usize;
+            let align = *alignment;
+            // Pinned allocations take no alignment request, so validate the actual pointer.
+            if align > 1 && ptr % align != 0 {
+                return Err(vortex_err!(
+                    "Pinned host buffer not aligned to {} (ptr=0x{:x})",
+                    align,
+                    ptr
+                ));
+            }
+
+            let mut device = unsafe { stream.alloc::<u8>(len) }
+                .map_err(|e| vortex_err!("Failed to allocate device memory: {e}"))?;
+
+            let device_ptr = device.device_ptr_mut(&stream).0;
+            let host_slice = host.as_slice();
+            unsafe {
+                memcpy_htod_async(device_ptr, host_slice, stream.cu_stream())
+                    .map_err(|e| vortex_err!("Failed to schedule H2D copy: {e}"))?;
+            }
+
+            await_stream_callback(&stream).await?;
+
+            // `host` must outlive the copy. NOTE(review): if this future is dropped at the await
+            // above, or it errors, `host` returns to the pool, possibly mid-copy — please confirm.
+            let _keep_alive = host;
+            Ok(BufferHandle::new_device(Arc::new(CudaDeviceBuffer::new(device))))
+        }
+        .boxed()
+    }
+}
diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs
index c83128def3e..088c2465088 100644
--- a/vortex-cuda/src/session.rs
+++ b/vortex-cuda/src/session.rs
@@ -5,6 +5,7 @@ use std::fmt::Debug;
 use std::sync::Arc;
 
 use cudarc::driver::CudaContext;
+use cudarc::driver::CudaStream;
 use vortex_array::VortexSessionExecute;
 use vortex_array::vtable::ArrayId;
 use vortex_error::VortexResult;
@@ -42,17 +43,20 @@ impl CudaSession {
     pub fn create_execution_ctx(
         vortex_session: &vortex_session::VortexSession,
     ) -> VortexResult<CudaExecutionCtx> {
-        let stream = vortex_session
-            .cuda_session()
-            .context
-            .new_stream()
-            .map_err(|e| vortex_err!("Failed to create CUDA stream: {}", e))?;
+        let stream = vortex_session.cuda_session().new_stream()?;
         Ok(CudaExecutionCtx::new(
             stream,
             vortex_session.create_execution_ctx(),
         ))
     }
 
+    /// Create a new CUDA stream.
+    pub fn new_stream(&self) -> VortexResult<Arc<CudaStream>> {
+        self.context
+            .new_stream()
+            .map_err(|e| vortex_err!("Failed to create CUDA stream: {}", e))
+    }
+
     /// Registers CUDA support for an array encoding.
     ///
     /// # Arguments
diff --git a/vortex-file/src/open.rs b/vortex-file/src/open.rs
index 0f9123fe480..4d9c566daf8 100644
--- a/vortex-file/src/open.rs
+++ b/vortex-file/src/open.rs
@@ -12,6 +12,8 @@ use vortex_dtype::DType;
 use vortex_error::VortexError;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
+use vortex_io::BufferAllocator;
+use vortex_io::InstrumentedReadAt;
 use vortex_io::VortexReadAt;
 use vortex_io::session::RuntimeSessionExt;
 use vortex_layout::segments::NoOpSegmentCache;
@@ -51,6 +53,8 @@ pub struct VortexOpenOptions {
     footer: Option<Footer>,
     /// The segments read during the initial read.
     initial_read_segments: RwLock<HashMap<SegmentId, ByteBuffer>>,
+    /// Optional allocator for read buffers.
+    allocator: Option<Arc<dyn BufferAllocator>>,
     /// A metrics registry for the file.
     metrics: Option<VortexMetrics>,
 }
@@ -66,6 +70,7 @@ pub trait OpenOptionsSessionExt: ArraySessionExt + LayoutSessionExt + RuntimeSes
             dtype: None,
             footer: None,
             initial_read_segments: Default::default(),
+            allocator: None,
             metrics: None,
         }
     }
@@ -125,6 +130,12 @@ impl VortexOpenOptions {
         self
     }
 
+    /// Sets a custom [`BufferAllocator`] used to allocate buffers for reads of this file.
+    pub fn with_allocator(mut self, allocator: Arc<dyn BufferAllocator>) -> Self {
+        self.allocator = Some(allocator);
+        self
+    }
+
     /// Open a Vortex file using the provided I/O source.
     ///
     /// This is the most common way to open a [`VortexFile`] and tends to provide the best
@@ -156,11 +167,13 @@ impl VortexOpenOptions {
 
     /// An API for opening a [`VortexFile`] using any [`VortexReadAt`] implementation.
     pub async fn open_read<R: VortexReadAt + Clone>(self, reader: R) -> VortexResult<VortexFile> {
-        let metrics = VortexMetrics::default();
+        let metrics = self.metrics.clone().unwrap_or_default();
+        let reader = InstrumentedReadAt::new(reader, &metrics);
+        let reader: Arc<dyn VortexReadAt> = Arc::new(reader);
         let footer = if let Some(footer) = self.footer {
             footer
         } else {
-            self.read_footer(&reader).await?
+            self.read_footer(reader.as_ref()).await?
         };
 
         let segment_cache = Arc::new(SegmentCacheMetrics::new(
@@ -172,12 +185,15 @@ impl VortexOpenOptions {
         ));
 
         // Create a segment source backed by the VortexRead implementation.
-        let segment_source = Arc::new(SharedSegmentSource::new(FileSegmentSource::open(
-            footer.segment_map().clone(),
-            reader,
-            self.session.handle(),
-            metrics.clone(),
-        )));
+        let segment_source = Arc::new(SharedSegmentSource::new(
+            FileSegmentSource::open_with_allocator(
+                footer.segment_map().clone(),
+                reader,
+                self.session.handle(),
+                metrics.clone(),
+                self.allocator.clone(),
+            ),
+        ));
 
         // Wrap up the segment source to first resolve segments from the initial read cache.
         let segment_source = Arc::new(SegmentCacheSourceAdapter::new(
@@ -286,6 +302,7 @@ mod tests {
     use std::sync::atomic::Ordering;
 
     use futures::future::BoxFuture;
+    use vortex_array::buffer::BufferHandle;
     use vortex_array::IntoArray;
     use vortex_array::expr::session::ExprSession;
     use vortex_array::session::ArraySession;
@@ -326,6 +343,22 @@ mod tests {
             self.inner.read_at(offset, length, alignment)
         }
 
+        // Records the size of each requested read, then forwards to the wrapped reader.
+        fn read_at_into(
+            &self,
+            offset: u64,
+            target: Box<dyn vortex_io::WriteTarget>,
+        ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+            let requested = target.len();
+            let relaxed = Ordering::Relaxed;
+            self.total_read.fetch_add(requested, relaxed);
+            // Only the first request size is kept: the exchange succeeds while the slot is 0.
+            let _ = self
+                .first_read_len
+                .compare_exchange(0, requested, relaxed, relaxed);
+            self.inner.read_at_into(offset, target)
+        }
+
         fn concurrency(&self) -> usize {
             self.inner.concurrency()
         }
diff --git a/vortex-file/src/read/driver.rs b/vortex-file/src/read/driver.rs
index f097385445e..18cdfea6a70 100644
--- a/vortex-file/src/read/driver.rs
+++ b/vortex-file/src/read/driver.rs
@@ -326,7 +326,7 @@ mod tests {
     use futures::StreamExt;
     use futures::stream;
     use vortex_buffer::Alignment;
-    use vortex_buffer::ByteBuffer;
+    use vortex_array::buffer::BufferHandle;
     use vortex_error::VortexResult;
 
     use super::*;
@@ -336,7 +336,7 @@ mod tests {
         id: usize,
         offset: u64,
         length: usize,
-    ) -> (ReadRequest, oneshot::Receiver<VortexResult<ByteBuffer>>) {
+    ) -> (ReadRequest, oneshot::Receiver<VortexResult<BufferHandle>>) {
         let (tx, rx) = oneshot::channel();
         (
             ReadRequest {
diff --git a/vortex-file/src/read/request.rs b/vortex-file/src/read/request.rs
index 256cb95851d..8d146a32d61 100644
--- a/vortex-file/src/read/request.rs
+++ b/vortex-file/src/read/request.rs
@@ -7,8 +7,8 @@ use std::fmt::Formatter;
 use std::ops::Range;
 use std::sync::Arc;
 
+use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
-use vortex_buffer::ByteBuffer;
 use vortex_error::VortexError;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
@@ -51,7 +51,7 @@ impl IoRequest {
     }
 
     /// Resolves the request with the given result.
-    pub fn resolve(self, result: VortexResult<ByteBuffer>) {
+    pub fn resolve(self, result: VortexResult<BufferHandle>) {
         match self.0 {
             IoRequestInner::Single(req) => req.resolve(result),
             IoRequestInner::Coalesced(req) => req.resolve(result),
@@ -90,7 +90,7 @@ pub struct ReadRequest {
     pub(crate) offset: u64,
     pub(crate) length: usize,
     pub(crate) alignment: Alignment,
-    pub(crate) callback: oneshot::Sender<VortexResult<ByteBuffer>>,
+    pub(crate) callback: oneshot::Sender<VortexResult<BufferHandle>>,
 }
 
 impl Debug for ReadRequest {
@@ -106,7 +106,7 @@ impl Debug for ReadRequest {
 }
 
 impl ReadRequest {
-    pub(crate) fn resolve(self, result: VortexResult<ByteBuffer>) {
+    pub(crate) fn resolve(self, result: VortexResult<BufferHandle>) {
         if let Err(e) = self.callback.send(result) {
             tracing::debug!("ReadRequest {} dropped before resolving: {e}", self.id);
         }
@@ -132,16 +132,25 @@ impl Debug for CoalescedRequest {
 }
 
 impl CoalescedRequest {
-    pub fn resolve(self, result: VortexResult<ByteBuffer>) {
+    pub fn resolve(self, result: VortexResult<BufferHandle>) {
         match result {
             Ok(buffer) => {
-                let buffer = buffer.aligned(Alignment::none());
-                for req in self.requests.into_iter() {
-                    let start = usize::try_from(req.offset - self.range.start)
-                        .vortex_expect("invalid offset");
-                    let end = start + req.length;
-                    let slice = buffer.slice(start..end).aligned(req.alignment);
-                    req.resolve(Ok(slice));
+                if let Some(host) = buffer.as_host_opt() {
+                    let host = host.clone().aligned(Alignment::none());
+                    for req in self.requests.into_iter() {
+                        let start = usize::try_from(req.offset - self.range.start)
+                            .vortex_expect("invalid offset");
+                        let end = start + req.length;
+                        let slice = host.slice(start..end).aligned(req.alignment);
+                        req.resolve(Ok(BufferHandle::new_host(slice)));
+                    }
+                } else {
+                    for req in self.requests.into_iter() {
+                        let start = usize::try_from(req.offset - self.range.start)
+                            .vortex_expect("invalid offset");
+                        let end = start + req.length;
+                        req.resolve(Ok(buffer.slice(start..end)));
+                    }
                 }
             }
             Err(e) => {
diff --git a/vortex-file/src/segments/source.rs b/vortex-file/src/segments/source.rs
index a1072af9998..04efe6117de 100644
--- a/vortex-file/src/segments/source.rs
+++ b/vortex-file/src/segments/source.rs
@@ -14,9 +14,9 @@ use futures::StreamExt;
 use futures::channel::mpsc;
 use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
-use vortex_buffer::ByteBuffer;
 use vortex_error::VortexResult;
 use vortex_error::vortex_err;
+use vortex_io::BufferAllocator;
 use vortex_io::VortexReadAt;
 use vortex_io::runtime::Handle;
 use vortex_layout::segments::SegmentFuture;
@@ -71,6 +71,16 @@ impl FileSegmentSource {
         reader: R,
         handle: Handle,
         metrics: VortexMetrics,
+    ) -> Self {
+        Self::open_with_allocator(segments, Arc::new(reader), handle, metrics, None)
+    }
+
+    pub fn open_with_allocator(
+        segments: Arc<[SegmentSpec]>,
+        source: Arc<dyn VortexReadAt>,
+        handle: Handle,
+        metrics: VortexMetrics,
+        allocator: Option<Arc<dyn BufferAllocator>>,
     ) -> Self {
         let (send, recv) = mpsc::unbounded();
 
@@ -87,6 +97,7 @@ impl FileSegmentSource {
             config
         });
         let concurrency = reader.concurrency();
+        let allocator = allocator.clone();
 
         let drive_fut = async move {
             let stream = IoRequestStream::new(
@@ -99,11 +110,20 @@ impl FileSegmentSource {
 
             stream
                 .map(move |req| {
-                    let source = reader.clone();
+                    let source = source.clone();
+                    let allocator = allocator.clone();
                     async move {
-                        let result = source
-                            .read_at(req.offset(), req.len(), req.alignment())
-                            .await;
+                        let result = if let Some(allocator) = allocator {
+                            match allocator.allocate(req.len(), req.alignment()) {
+                                Ok(target) => source.read_at_into(req.offset(), target).await,
+                                Err(e) => Err(e),
+                            }
+                        } else {
+                            source
+                                .read_at(req.offset(), req.len(), req.alignment())
+                                .await
+                                .map(BufferHandle::new_host)
+                        };
                         req.resolve(result);
                     }
                 })
@@ -162,7 +182,6 @@ impl SegmentSource for FileSegmentSource {
             maybe_fut
                 .ok_or_else(|| vortex_err!("Missing segment: {}", id))?
                 .await
-                .map(BufferHandle::new_host)
         }
         .boxed()
     }
@@ -174,13 +193,13 @@ impl SegmentSource for FileSegmentSource {
 /// If dropped, the read request will be canceled where possible.
 struct ReadFuture {
     id: usize,
-    recv: oneshot::Receiver<VortexResult<ByteBuffer>>,
+    recv: oneshot::Receiver<VortexResult<BufferHandle>>,
     polled: bool,
     events: mpsc::UnboundedSender<ReadEvent>,
 }
 
 impl Future for ReadFuture {
-    type Output = VortexResult<ByteBuffer>;
+    type Output = VortexResult<BufferHandle>;
 
     fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
         if !self.polled {
diff --git a/vortex-io/Cargo.toml b/vortex-io/Cargo.toml
index cef1c69e351..5a98bd28528 100644
--- a/vortex-io/Cargo.toml
+++ b/vortex-io/Cargo.toml
@@ -35,6 +35,7 @@ handle = "1.0.2"
 tokio = { workspace = true, features = ["io-util", "rt", "sync"] }
 tracing = { workspace = true }
 vortex-buffer = { workspace = true }
+vortex-array = { workspace = true }
 vortex-error = { workspace = true }
 vortex-metrics = { workspace = true }
 vortex-session = { workspace = true }
diff --git a/vortex-io/src/allocator.rs b/vortex-io/src/allocator.rs
new file mode 100644
index 00000000000..89cdc637409
--- /dev/null
+++ b/vortex-io/src/allocator.rs
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::sync::atomic::AtomicU64;
+use std::sync::atomic::Ordering;
+
+use vortex_buffer::Alignment;
+use vortex_buffer::ByteBufferMut;
+use vortex_error::VortexResult;
+
+use crate::WriteTarget;
+
+/// Allocates buffers for I/O reads.
+pub trait BufferAllocator: Send + Sync + 'static {
+    /// Allocate a buffer for the requested length and alignment.
+    ///
+    /// Readers take the number of bytes to read from the returned target's length, so the
+    /// target should report exactly `len` bytes.
+    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn WriteTarget>>;
+}
+
+/// The default allocator that uses `ByteBufferMut`.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct DefaultAllocator;
+
+/// Allocation counters for the default allocator.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct DefaultAllocStats {
+    /// Number of allocations since the counters were last reset.
+    pub count: u64,
+    /// Total bytes requested since the counters were last reset.
+    pub bytes: u64,
+}
+
+// Process-wide counters updated by every `DefaultAllocator::allocate` call.
+static DEFAULT_ALLOC_COUNT: AtomicU64 = AtomicU64::new(0);
+static DEFAULT_ALLOC_BYTES: AtomicU64 = AtomicU64::new(0);
+
+/// Returns the current allocation counters for the default allocator.
+///
+/// The two counters are loaded separately, so the pair may not be a consistent snapshot
+/// while allocations are running concurrently.
+pub fn default_alloc_stats() -> DefaultAllocStats {
+    DefaultAllocStats {
+        count: DEFAULT_ALLOC_COUNT.load(Ordering::Relaxed),
+        bytes: DEFAULT_ALLOC_BYTES.load(Ordering::Relaxed),
+    }
+}
+
+/// Resets the allocation counters for the default allocator to zero.
+pub fn reset_default_alloc_stats() {
+    DEFAULT_ALLOC_COUNT.store(0, Ordering::Relaxed);
+    DEFAULT_ALLOC_BYTES.store(0, Ordering::Relaxed);
+}
+
+impl BufferAllocator for DefaultAllocator {
+    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn WriteTarget>> {
+        DEFAULT_ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
+        DEFAULT_ALLOC_BYTES.fetch_add(len as u64, Ordering::Relaxed);
+        let mut buffer = ByteBufferMut::with_capacity_aligned(len, alignment);
+        // SAFETY: relies on `with_capacity_aligned` having reserved at least `len` bytes, so the
+        // new length stays within the allocation. The bytes are left uninitialized; readers are
+        // expected to overwrite all of them before the buffer is finalized.
+        unsafe { buffer.set_len(len) };
+        Ok(Box::new(buffer))
+    }
+}
diff --git a/vortex-io/src/file/object_store.rs b/vortex-io/src/file/object_store.rs
index 0d09cbdcd2b..882b3ef8db6 100644
--- a/vortex-io/src/file/object_store.rs
+++ b/vortex-io/src/file/object_store.rs
@@ -13,6 +13,7 @@ use object_store::GetRange;
 use object_store::GetResultPayload;
 use object_store::ObjectStore;
 use object_store::path::Path as ObjectPath;
+use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
 use vortex_buffer::ByteBuffer;
 use vortex_buffer::ByteBufferMut;
@@ -22,6 +23,7 @@ use vortex_error::vortex_ensure;
 
 use crate::CoalesceConfig;
 use crate::VortexReadAt;
+use crate::WriteTarget;
 #[cfg(not(target_arch = "wasm32"))]
 use crate::file::std_file::read_exact_at;
 use crate::runtime::Handle;
@@ -165,4 +167,72 @@ impl VortexReadAt for ObjectStoreSource {
         })
         .boxed()
     }
+
+    fn read_at_into(
+        &self,
+        offset: u64,
+        mut target: Box<dyn WriteTarget>,
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        let expected = target.len();
+        let range = offset..(offset + expected as u64);
+        let store = self.store.clone();
+        let path = self.path.clone();
+        let handle = self.handle.clone();
+
+        Compat::new(async move {
+            // Request exactly the byte range that the target can hold.
+            let options = GetOptions {
+                range: Some(GetRange::Bounded(range.clone())),
+                ..Default::default()
+            };
+            let response = store.get_opts(&path, options).await?;
+
+            match response.payload {
+                #[cfg(not(target_arch = "wasm32"))]
+                GetResultPayload::File(file, _) => {
+                    // File payload: run the blocking positional read via `spawn_blocking`.
+                    target = handle
+                        .spawn_blocking(move || {
+                            read_exact_at(&file, target.as_mut_slice(), range.start)?;
+                            Ok::<_, io::Error>(target)
+                        })
+                        .await
+                        .map_err(io::Error::other)?;
+                }
+                #[cfg(target_arch = "wasm32")]
+                GetResultPayload::File(..) => {
+                    unreachable!("File payload not supported on wasm32")
+                }
+                GetResultPayload::Stream(mut byte_stream) => {
+                    // Append each chunk at the running offset, rejecting any overrun.
+                    let mut written = 0usize;
+                    while let Some(chunk) = byte_stream.next().await {
+                        let chunk = chunk?;
+                        let next = written + chunk.len();
+                        vortex_ensure!(
+                            next <= expected,
+                            "Object store stream returned more bytes than expected (expected {} bytes, got at least {} bytes, range: {:?})",
+                            expected,
+                            next,
+                            range
+                        );
+                        target.as_mut_slice()[written..next].copy_from_slice(&chunk);
+                        written = next;
+                    }
+
+                    // A short stream would leave the tail of the target unwritten.
+                    vortex_ensure!(
+                        written == expected,
+                        "Object store stream returned {} bytes but expected {} bytes (range: {:?})",
+                        written,
+                        expected,
+                        range
+                    );
+                }
+            }
+
+            target.into_handle().await
+        })
+        .boxed()
+    }
 }
diff --git a/vortex-io/src/file/std_file.rs b/vortex-io/src/file/std_file.rs
index 56abd56eb60..68a6982cb50 100644
--- a/vortex-io/src/file/std_file.rs
+++ b/vortex-io/src/file/std_file.rs
@@ -2,6 +2,7 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use std::fs::File;
+use std::io;
 #[cfg(all(not(unix), not(windows)))]
 use std::io::Read;
 #[cfg(all(not(unix), not(windows)))]
@@ -15,6 +16,7 @@ use std::sync::Arc;
 
 use futures::FutureExt;
 use futures::future::BoxFuture;
+use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
 use vortex_buffer::ByteBuffer;
 use vortex_buffer::ByteBufferMut;
@@ -22,12 +24,13 @@ use vortex_error::VortexResult;
 
 use crate::CoalesceConfig;
 use crate::VortexReadAt;
+use crate::WriteTarget;
 use crate::runtime::Handle;
 
 /// Read exactly `buffer.len()` bytes from `file` starting at `offset`.
 /// This is a platform-specific helper that uses the most efficient method available.
 #[cfg(not(target_arch = "wasm32"))]
-pub(crate) fn read_exact_at(file: &File, buffer: &mut [u8], offset: u64) -> std::io::Result<()> {
+pub(crate) fn read_exact_at(file: &File, buffer: &mut [u8], offset: u64) -> io::Result<()> {
     #[cfg(unix)]
     {
         file.read_exact_at(buffer, offset)
@@ -122,4 +125,24 @@ impl VortexReadAt for FileReadAdapter {
         }
         .boxed()
     }
+
+    fn read_at_into(
+        &self,
+        offset: u64,
+        mut target: Box<dyn WriteTarget>,
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        let (file, handle) = (self.file.clone(), self.handle.clone());
+        // The positional read blocks, so it is handed to `spawn_blocking`; the closure returns
+        // the filled target so that it can be finalized afterwards.
+        let fill = move || {
+            read_exact_at(&file, target.as_mut_slice(), offset)?;
+            Ok::<_, io::Error>(target)
+        };
+        async move {
+            let filled = handle.spawn_blocking(fill).await;
+            let target = filled.map_err(io::Error::other)?;
+            target.into_handle().await
+        }
+        .boxed()
+    }
 }
diff --git a/vortex-io/src/lib.rs b/vortex-io/src/lib.rs
index 6a08c821c8f..f7706f63244 100644
--- a/vortex-io/src/lib.rs
+++ b/vortex-io/src/lib.rs
@@ -10,13 +10,16 @@
 //! This crate provides core traits for positioned and streaming IO, and via feature
 //! flags implements the core traits for several common async runtimes and backing stores.
 
+pub use allocator::*;
 pub use io_buf::*;
 pub use limit::*;
 #[cfg(feature = "object_store")]
 pub use object_store::*;
 pub use read::*;
 pub use write::*;
+pub use write_target::*;
 
+mod allocator;
 pub mod file;
 mod io_buf;
 pub mod kanal_ext;
@@ -29,3 +32,4 @@ pub mod session;
 #[cfg(feature = "tokio")]
 mod tokio;
 mod write;
+mod write_target;
diff --git a/vortex-io/src/read.rs b/vortex-io/src/read.rs
index fbcbd697d45..2c7007fbfbf 100644
--- a/vortex-io/src/read.rs
+++ b/vortex-io/src/read.rs
@@ -5,6 +5,7 @@ use std::sync::Arc;
 
 use futures::FutureExt;
 use futures::future::BoxFuture;
+use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
 use vortex_buffer::ByteBuffer;
 use vortex_error::VortexExpect;
@@ -15,6 +16,7 @@ use vortex_metrics::Histogram;
 use vortex_metrics::Timer;
 use vortex_metrics::VortexMetrics;
 
+use crate::WriteTarget;
 /// Configuration for coalescing nearby I/O requests into single operations.
 #[derive(Clone, Copy, Debug)]
 pub struct CoalesceConfig {
@@ -81,6 +83,13 @@ pub trait VortexReadAt: Send + Sync + 'static {
         length: usize,
         alignment: Alignment,
     ) -> BoxFuture<'static, VortexResult<ByteBuffer>>;
+
+    /// Reads `target.len()` bytes starting at `offset` into `target` and finalizes the target.
+    fn read_at_into(
+        &self,
+        offset: u64,
+        target: Box<dyn WriteTarget>,
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>>;
 }
 
 impl VortexReadAt for Arc<dyn VortexReadAt> {
@@ -108,6 +117,14 @@ impl VortexReadAt for Arc<dyn VortexReadAt> {
     ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
         self.as_ref().read_at(offset, length, alignment)
     }
+
+    fn read_at_into(
+        &self,
+        offset: u64,
+        target: Box<dyn WriteTarget>,
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        self.as_ref().read_at_into(offset, target)
+    }
 }
 
 impl<R: VortexReadAt> VortexReadAt for Arc<R> {
@@ -136,6 +153,14 @@ impl<R: VortexReadAt> VortexReadAt for Arc<R> {
         self.as_ref().read_at(offset, length, alignment)
     }
 
+    fn read_at_into(
+        &self,
+        offset: u64,
+        target: Box<dyn WriteTarget>,
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        self.as_ref().read_at_into(offset, target)
+    }
+
     // fn drive(self: Arc<Self>, requests: BoxStream<'static, IoRequest>) -> BoxFuture<'static, ()> {
     //     // Delegate to the inner implementation's drive
     //     let inner: Arc<R> = Arc::clone(&self);
@@ -176,6 +201,32 @@ impl VortexReadAt for ByteBuffer {
         }
         .boxed()
     }
+
+    fn read_at_into(
+        &self,
+        offset: u64,
+        mut target: Box<dyn WriteTarget>,
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        let buffer = self.clone();
+        async move {
+            let start = usize::try_from(offset).vortex_expect("start too big for usize");
+            // Saturate rather than overflow so an oversized request fails the bounds check below.
+            let end = start.saturating_add(target.len());
+            if end > buffer.len() {
+                vortex_bail!(
+                    "Requested range {}..{} out of bounds for buffer of length {}",
+                    start,
+                    end,
+                    buffer.len()
+                );
+            }
+            target
+                .as_mut_slice()
+                .copy_from_slice(&buffer.as_ref()[start..end]);
+            target.into_handle().await
+        }
+        .boxed()
+    }
 }
 
 /// A wrapper that instruments a [`VortexReadAt`] with metrics.
@@ -261,6 +312,26 @@ impl<T: VortexReadAt + Clone> VortexReadAt for InstrumentedReadAt<T> {
         }
         .boxed()
     }
+
+    fn read_at_into(
+        &self,
+        offset: u64,
+        target: Box<dyn WriteTarget>,
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        // Capture the size first: `target` is moved into the inner read below.
+        let length = target.len() as i64;
+        let read_fut = self.read.read_at_into(offset, target);
+        let (durations, sizes) = (self.durations.clone(), self.sizes.clone());
+        let total_size = self.total_size.clone();
+        async move {
+            let _timer = durations.time();
+            let result = read_fut.await;
+            sizes.update(length);
+            total_size.add(length);
+            result
+        }
+        .boxed()
+    }
 }
 
 #[cfg(test)]
diff --git a/vortex-io/src/runtime/tests.rs b/vortex-io/src/runtime/tests.rs
index 10832633983..73923607896 100644
--- a/vortex-io/src/runtime/tests.rs
+++ b/vortex-io/src/runtime/tests.rs
@@ -11,12 +11,14 @@ use std::sync::atomic::Ordering;
 use futures::FutureExt;
 use futures::future::BoxFuture;
 use tempfile::NamedTempFile;
+use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
 use vortex_buffer::ByteBuffer;
 use vortex_buffer::ByteBufferMut;
 use vortex_error::VortexResult;
 
 use crate::VortexReadAt;
+use crate::WriteTarget;
 use crate::file::std_file::FileReadAdapter;
 use crate::runtime::single::block_on;
 use crate::runtime::tokio::TokioRuntime;
@@ -257,6 +259,27 @@ impl VortexReadAt for CountingReadAt {
         }
         .boxed()
     }
+
+    fn read_at_into(
+        &self,
+        offset: u64,
+        mut target: Box<dyn WriteTarget>,
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        // Count every call, including ones that later fail the bounds check.
+        self.read_count.fetch_add(1, Ordering::SeqCst);
+        let data = self.data.clone();
+        async move {
+            let start = offset as usize;
+            let end = start + target.len();
+            if end > data.len() {
+                return Err(vortex_error::vortex_err!("Read out of bounds"));
+            }
+            let src = &data.as_slice()[start..end];
+            target.as_mut_slice().copy_from_slice(src);
+            target.into_handle().await
+        }
+        .boxed()
+    }
 }
 
 #[tokio::test]
diff --git a/vortex-io/src/write_target.rs b/vortex-io/src/write_target.rs
new file mode 100644
index 00000000000..ddffecfa27c
--- /dev/null
+++ b/vortex-io/src/write_target.rs
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use futures::FutureExt;
+use futures::future::BoxFuture;
+use vortex_array::buffer::BufferHandle;
+use vortex_buffer::ByteBufferMut;
+use vortex_error::VortexResult;
+
+/// A destination for I/O reads that can be finalized into a [`BufferHandle`].
+///
+/// Readers fill the slice returned by [`WriteTarget::as_mut_slice`] and then call
+/// [`WriteTarget::into_handle`] to obtain the finished buffer.
+pub trait WriteTarget: Send + 'static {
+    /// Returns the buffer as a mutable slice.
+    fn as_mut_slice(&mut self) -> &mut [u8];
+
+    /// Returns the length of the buffer in bytes.
+    fn len(&self) -> usize;
+
+    /// Returns true if the buffer is empty.
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Finalize the target into a buffer handle.
+    ///
+    /// The result is a future so that an implementation can finish asynchronous work (for
+    /// example, a copy to another device) before the handle is handed out.
+    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>>;
+}
+
+/// Host-memory target backed by `ByteBufferMut`; finalizing freezes it into a host handle.
+impl WriteTarget for ByteBufferMut {
+    fn as_mut_slice(&mut self) -> &mut [u8] {
+        self.as_mut()
+    }
+
+    fn len(&self) -> usize {
+        ByteBufferMut::len(self)
+    }
+
+    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        async move { Ok(BufferHandle::new_host(self.freeze())) }.boxed()
+    }
+}

From 5764ffeb3a59f22f1978508a3527e8ab927af842 Mon Sep 17 00:00:00 2001
From: Onur Satici <onur@spiraldb.com>
Date: Tue, 27 Jan 2026 15:48:39 +0000
Subject: [PATCH 2/5] no pools yet

Signed-off-by: Onur Satici <onur@spiraldb.com>
---
 Cargo.lock                                  |   2 -
 vortex-cuda/Cargo.toml                      |   2 -
 vortex-cuda/nvcomp/build.rs                 | 142 ++------
 vortex-cuda/src/host_to_device_allocator.rs | 102 ++++++
 vortex-cuda/src/lib.rs                      |   9 +-
 vortex-cuda/src/pinned.rs                   | 372 --------------------
 vortex-cuda/src/pinned_allocator.rs         | 165 ---------
 7 files changed, 139 insertions(+), 655 deletions(-)
 create mode 100644 vortex-cuda/src/host_to_device_allocator.rs
 delete mode 100644 vortex-cuda/src/pinned.rs
 delete mode 100644 vortex-cuda/src/pinned_allocator.rs

diff --git a/Cargo.lock b/Cargo.lock
index ff122aced98..cb1f7236af5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10335,12 +10335,10 @@ name = "vortex-cuda"
 version = "0.1.0"
 dependencies = [
  "async-trait",
- "bytes",
  "codspeed-criterion-compat-walltime",
  "cudarc",
  "futures",
  "kanal",
- "parking_lot",
  "rstest",
  "tokio",
  "tracing",
diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml
index cf4c85f4567..0e8fe6fc58c 100644
--- a/vortex-cuda/Cargo.toml
+++ b/vortex-cuda/Cargo.toml
@@ -22,11 +22,9 @@ _test-harness = []
 
 [dependencies]
 async-trait = { workspace = true }
-bytes = { workspace = true }
 cudarc = { workspace = true }
 futures = { workspace = true, features = ["executor"] }
 kanal = { workspace = true }
-parking_lot = { workspace = true }
 tracing = { workspace = true }
 vortex-alp = { workspace = true }
 vortex-array = { workspace = true }
diff --git a/vortex-cuda/nvcomp/build.rs b/vortex-cuda/nvcomp/build.rs
index d3f7c37543a..8ada54e46ad 100644
--- a/vortex-cuda/nvcomp/build.rs
+++ b/vortex-cuda/nvcomp/build.rs
@@ -32,67 +32,6 @@ typedef int cudaError_t;
 #define cudaSuccess 0
 "#;
 
-/// Minimal nvCOMP headers for non-Linux platforms to allow bindgen to run.
-const NVCOMP_STUB_HEADER: &str = r#"
-#pragma once
-#include <stddef.h>
-#include "cuda_runtime.h"
-
-typedef enum nvcompStatus_t {
-    nvcompSuccess = 0,
-    nvcompErrorInvalidValue = 1,
-    nvcompErrorNotSupported = 2,
-    nvcompErrorCannotDecompress = 3,
-    nvcompErrorBadChecksum = 4,
-    nvcompErrorCannotVerifyChecksums = 5,
-    nvcompErrorOutputBufferTooSmall = 6,
-    nvcompErrorWrongHeaderLength = 7,
-    nvcompErrorAlignment = 8,
-    nvcompErrorChunkSizeTooLarge = 9,
-    nvcompErrorCannotCompress = 10,
-    nvcompErrorWrongInputLength = 11,
-    nvcompErrorBatchSizeTooLarge = 12,
-    nvcompErrorCudaError = 13,
-    nvcompErrorInternal = 14
-} nvcompStatus_t;
-
-typedef enum nvcompDecompressBackend_t {
-    NVCOMP_DECOMPRESS_BACKEND_DEFAULT = 0,
-    NVCOMP_DECOMPRESS_BACKEND_HARDWARE = 1,
-    NVCOMP_DECOMPRESS_BACKEND_CUDA = 2
-} nvcompDecompressBackend_t;
-
-typedef struct nvcompBatchedZstdDecompressOpts_t {
-    nvcompDecompressBackend_t backend;
-    unsigned char reserved[60];
-} nvcompBatchedZstdDecompressOpts_t;
-"#;
-
-const NVCOMP_ZSTD_STUB_HEADER: &str = r#"
-#pragma once
-#include "nvcomp.h"
-
-nvcompStatus_t nvcompBatchedZstdDecompressGetTempSizeAsync(
-    size_t numChunks,
-    size_t maxUncompressedChunkBytes,
-    nvcompBatchedZstdDecompressOpts_t opts,
-    size_t* tempBytes,
-    size_t maxTotalUncompressedBytes);
-
-nvcompStatus_t nvcompBatchedZstdDecompressAsync(
-    const void* const* device_compressed_ptrs,
-    const size_t* device_compressed_bytes,
-    const size_t* device_uncompressed_bytes,
-    size_t* device_actual_uncompressed_bytes,
-    size_t num_chunks,
-    void* device_temp_ptr,
-    size_t temp_bytes,
-    void* const* device_uncompressed_ptrs,
-    nvcompBatchedZstdDecompressOpts_t opts,
-    nvcompStatus_t* device_statuses,
-    cudaStream_t stream);
-"#;
-
 fn main() {
     // Declare the cfg so rustc doesn't warn about unexpected cfg.
     println!("cargo::rustc-check-cfg=cfg(cuda_available)");
@@ -106,60 +45,49 @@ fn main() {
     fs::create_dir_all(&cuda_stub_dir).unwrap();
     fs::write(cuda_stub_dir.join("cuda_runtime.h"), CUDA_RUNTIME_STUB).unwrap();
 
-    let is_linux = env::consts::OS == "linux";
-    let include_dir = if is_linux {
-        let (os, arch) = match (env::consts::OS, env::consts::ARCH) {
-            ("linux", "x86_64") => ("linux", "x86_64"),
-            ("linux", "aarch64") => ("linux", "sbsa"),
-            _ => ("linux", "x86_64"),
-        };
-
-        let archive_name = format!("nvcomp-{os}-{arch}-{NVCOMP_VERSION}_{CUDA_VERSION}-archive");
-        let url = format!(
-            "https://developer.download.nvidia.com/compute/nvcomp/redist/nvcomp/{os}-{arch}/{archive_name}.tar.xz"
-        );
+    let (os, arch) = match (env::consts::OS, env::consts::ARCH) {
+        ("linux", "x86_64") => ("linux", "x86_64"),
+        ("linux", "aarch64") => ("linux", "sbsa"),
+        // Fall back to linux-x86_64 to generate bindings for any platform.
+        _ => ("linux", "x86_64"),
+    };
 
-        let include_dir = nvcomp_dir.join("include");
+    let archive_name = format!("nvcomp-{os}-{arch}-{NVCOMP_VERSION}_{CUDA_VERSION}-archive");
+    let url = format!(
+        "https://developer.download.nvidia.com/compute/nvcomp/redist/nvcomp/{os}-{arch}/{archive_name}.tar.xz"
+    );
 
-        if !include_dir.exists() {
-            let response = reqwest::blocking::get(&url)
-                .unwrap_or_else(|e| panic!("Failed to download nvCOMP: {e}"));
+    let include_dir = nvcomp_dir.join("include");
 
-            assert!(
-                response.status().is_success(),
-                "Failed to download nvCOMP: HTTP {}",
-                response.status()
-            );
+    if !include_dir.exists() {
+        let response = reqwest::blocking::get(&url)
+            .unwrap_or_else(|e| panic!("Failed to download nvCOMP: {e}"));
 
-            let bytes = response.bytes().unwrap();
+        assert!(
+            response.status().is_success(),
+            "Failed to download nvCOMP: HTTP {}",
+            response.status()
+        );
 
-            // Extract tar.xz archive.
-            let cursor = Cursor::new(bytes.as_ref());
-            let xz = XzDecoder::new(cursor);
-            let mut archive = tar::Archive::new(xz);
+        let bytes = response.bytes().unwrap();
 
-            let temp_dir = nvcomp_dir.with_extension("tmp");
-            fs::create_dir_all(&temp_dir).unwrap();
-            archive.unpack(&temp_dir).unwrap();
+        // Extract tar.xz archive.
+        let cursor = Cursor::new(bytes.as_ref());
+        let xz = XzDecoder::new(cursor);
+        let mut archive = tar::Archive::new(xz);
 
-            // Move extracted content.
-            let extracted = temp_dir.join(&archive_name);
-            if nvcomp_dir.exists() {
-                fs::remove_dir_all(&nvcomp_dir).unwrap();
-            }
-            fs::rename(&extracted, &nvcomp_dir).unwrap();
-            fs::remove_dir_all(&temp_dir).ok();
-        }
+        let temp_dir = nvcomp_dir.with_extension("tmp");
+        fs::create_dir_all(&temp_dir).unwrap();
+        archive.unpack(&temp_dir).unwrap();
 
-        include_dir
-    } else {
-        let stub_include = out_dir.join("nvcomp-stub").join("include");
-        let stub_nvcomp = stub_include.join("nvcomp");
-        fs::create_dir_all(&stub_nvcomp).unwrap();
-        fs::write(stub_include.join("nvcomp.h"), NVCOMP_STUB_HEADER).unwrap();
-        fs::write(stub_nvcomp.join("zstd.h"), NVCOMP_ZSTD_STUB_HEADER).unwrap();
-        stub_include
-    };
+        // Move extracted content.
+        let extracted = temp_dir.join(&archive_name);
+        if nvcomp_dir.exists() {
+            fs::remove_dir_all(&nvcomp_dir).unwrap();
+        }
+        fs::rename(&extracted, &nvcomp_dir).unwrap();
+        fs::remove_dir_all(&temp_dir).ok();
+    }
 
     // Functions are loaded at runtime via libloading to avoid link-time symbol resolution.
     let bindings = bindgen::Builder::default()
diff --git a/vortex-cuda/src/host_to_device_allocator.rs b/vortex-cuda/src/host_to_device_allocator.rs
new file mode 100644
index 00000000000..70e3c5130c8
--- /dev/null
+++ b/vortex-cuda/src/host_to_device_allocator.rs
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::sync::Arc;
+
+use cudarc::driver::CudaStream;
+use cudarc::driver::DevicePtrMut;
+use cudarc::driver::result::memcpy_htod_async;
+use futures::future::BoxFuture;
+use futures::FutureExt;
+use vortex_array::buffer::BufferHandle;
+use vortex_buffer::Alignment;
+use vortex_buffer::ByteBufferMut;
+use vortex_error::VortexResult;
+use vortex_error::vortex_err;
+use vortex_io::BufferAllocator;
+use vortex_io::WriteTarget;
+use vortex_session::VortexSession;
+
+use crate::device_buffer::CudaDeviceBuffer;
+use crate::session::CudaSessionExt;
+use crate::stream::await_stream_callback;
+
+/// Allocator that reads into host buffers and copies to device memory.
+///
+/// Every write target it hands out is staged in an ordinary host buffer; converting the target
+/// into a [`BufferHandle`] allocates device memory on the allocator's stream and schedules a
+/// host-to-device copy on that same stream.
+pub struct HostToDeviceAllocator {
+    /// Stream used for device allocations and host-to-device copies.
+    stream: Arc<CudaStream>,
+}
+
+impl HostToDeviceAllocator {
+    /// Creates an allocator that issues its device allocations and copies on `stream`.
+    pub fn new(stream: Arc<CudaStream>) -> Self {
+        Self { stream }
+    }
+
+    /// Creates an allocator backed by a new stream obtained from the session's CUDA session.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error from `new_stream` if the stream cannot be created.
+    pub fn from_session(session: &VortexSession) -> VortexResult<Self> {
+        let stream = session.cuda_session().new_stream()?;
+        Ok(Self::new(stream))
+    }
+}
+
+impl BufferAllocator for HostToDeviceAllocator {
+    /// Returns a host-staged write target of `len` bytes with the requested `alignment`.
+    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn WriteTarget>> {
+        // Zero-fill the staging buffer rather than calling `set_len` on fresh capacity:
+        // `as_mut_slice` hands the bytes out as `&mut [u8]`, which must never refer to
+        // uninitialized memory.
+        let buffer = ByteBufferMut::zeroed_aligned(len, alignment);
+        Ok(Box::new(NaiveDeviceWriteTarget {
+            buffer,
+            stream: self.stream.clone(),
+            alignment,
+        }))
+    }
+}
+
+/// Host staging buffer that is copied to device memory when turned into a [`BufferHandle`].
+struct NaiveDeviceWriteTarget {
+    /// Host-side bytes that the reader fills in.
+    buffer: ByteBufferMut,
+    /// Stream on which the device allocation and the copy are issued.
+    stream: Arc<CudaStream>,
+    /// Alignment requested at allocation time; re-checked before the copy.
+    alignment: Alignment,
+}
+
+impl WriteTarget for NaiveDeviceWriteTarget {
+    /// Returns the host staging bytes for the reader to fill.
+    fn as_mut_slice(&mut self) -> &mut [u8] {
+        self.buffer.as_mut()
+    }
+
+    /// Returns the size of the staging buffer in bytes.
+    fn len(&self) -> usize {
+        self.buffer.len()
+    }
+
+    /// Copies the staged bytes to newly allocated device memory and returns a device handle.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the host pointer does not satisfy the requested alignment, if the device
+    /// allocation fails, if the copy cannot be scheduled, or if waiting on the stream fails.
+    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        // Move the fields out of the box instead of cloning the stream handle.
+        let Self {
+            buffer: host,
+            stream,
+            alignment,
+        } = *self;
+        async move {
+            // Reject a staging pointer that does not satisfy the requested alignment.
+            let ptr = host.as_ref().as_ptr() as usize;
+            let align = *alignment;
+            if align > 1 && ptr % align != 0 {
+                return Err(vortex_err!(
+                    "Host buffer not aligned to {} (ptr=0x{:x})",
+                    align,
+                    ptr
+                ));
+            }
+
+            let len = host.len();
+            // SAFETY: `alloc` returns uninitialized device memory; the copy scheduled below
+            // writes all `len` bytes before the buffer is handed out.
+            let mut device = unsafe { stream.alloc::<u8>(len) }
+                .map_err(|e| vortex_err!("Failed to allocate device memory: {e}"))?;
+
+            let device_ptr = device.device_ptr_mut(&stream).0;
+            let host_slice = host.as_ref();
+            // SAFETY: `device_ptr` addresses the `len` bytes allocated above, and `host_slice`
+            // borrows `host`, which this future owns until after the stream wait below.
+            unsafe {
+                memcpy_htod_async(device_ptr, host_slice, stream.cu_stream())
+                    .map_err(|e| vortex_err!("Failed to schedule H2D copy: {e}"))?;
+            }
+
+            // NOTE(review): presumably resolves once the work queued on `stream` so far has
+            // finished; confirm against `await_stream_callback`.
+            await_stream_callback(&stream).await?;
+
+            // Keep the host buffer alive until the copy completes.
+            let _keep_alive = host;
+
+            Ok(BufferHandle::new_device(Arc::new(CudaDeviceBuffer::new(device))))
+        }
+        .boxed()
+    }
+}
diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs
index 08b9806b38f..df87a0e2520 100644
--- a/vortex-cuda/src/lib.rs
+++ b/vortex-cuda/src/lib.rs
@@ -9,8 +9,7 @@ mod canonical;
 mod device_buffer;
 pub mod executor;
 mod kernel;
-mod pinned;
-mod pinned_allocator;
+mod host_to_device_allocator;
 mod session;
 mod stream;
 
@@ -19,11 +18,7 @@ pub use device_buffer::CudaBufferExt;
 pub use device_buffer::CudaDeviceBuffer;
 pub use executor::CudaExecutionCtx;
 pub use executor::CudaKernelEvents;
-pub use pinned::PinnedByteBuffer;
-pub use pinned::PinnedByteBufferPool;
-pub use pinned::PooledPinnedBuffer;
-pub use pinned_allocator::PinnedBufferAllocator;
-pub use pinned_allocator::PinnedDeviceAllocator;
+pub use host_to_device_allocator::HostToDeviceAllocator;
 use kernel::ALPExecutor;
 use kernel::DecimalBytePartsExecutor;
 use kernel::DictExecutor;
diff --git a/vortex-cuda/src/pinned.rs b/vortex-cuda/src/pinned.rs
deleted file mode 100644
index 948bf7bfbb5..00000000000
--- a/vortex-cuda/src/pinned.rs
+++ /dev/null
@@ -1,372 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-use std::sync::Arc;
-
-use bytes::Bytes;
-use cudarc::driver::CudaContext;
-use cudarc::driver::CudaStream;
-use cudarc::driver::HostSlice;
-use cudarc::driver::PinnedHostSlice;
-use cudarc::driver::SyncOnDrop;
-use parking_lot::Mutex;
-use vortex_buffer::ByteBuffer;
-use vortex_error::VortexResult;
-use vortex_error::vortex_err;
-use vortex_error::vortex_panic;
-use vortex_utils::aliases::hash_map::HashMap;
-
-/// A page-locked host buffer allocated by CUDA.
-///
-/// This is intended as a staging buffer for H2D transfers. Contents are uninitialized after
-/// allocation.
-pub struct PinnedByteBuffer {
-    inner: PinnedHostSlice<u8>,
-}
-
-#[allow(clippy::same_name_method)]
-impl PinnedByteBuffer {
-    /// Allocate a pinned host buffer with uninitialized contents.
-    ///
-    /// # Safety
-    /// The returned buffer's contents are uninitialized. The caller must initialize before read.
-    pub unsafe fn uninit(ctx: &Arc<CudaContext>, len: usize) -> VortexResult<Self> {
-        let inner = unsafe {
-            ctx.alloc_pinned::<u8>(len)
-                .map_err(|e| vortex_err!("failed to allocate pinned host buffer: {e}"))?
-        };
-        Ok(Self { inner })
-    }
-
-    /// Returns the length of the buffer in bytes.
-    pub fn len(&self) -> usize {
-        self.inner.len()
-    }
-
-    /// Returns true if the buffer is empty.
-    pub fn is_empty(&self) -> bool {
-        self.inner.is_empty()
-    }
-
-    /// Returns the buffer as an immutable slice.
-    pub fn as_slice(&self) -> VortexResult<&[u8]> {
-        self.inner
-            .as_slice()
-            .map_err(|e| vortex_err!("failed to access pinned host buffer: {e}"))
-    }
-
-    /// Returns the buffer as a mutable slice.
-    pub fn as_mut_slice(&mut self) -> VortexResult<&mut [u8]> {
-        self.inner
-            .as_mut_slice()
-            .map_err(|e| vortex_err!("failed to access pinned host buffer: {e}"))
-    }
-
-    /// Returns a raw pointer to the buffer.
-    pub fn as_ptr(&self) -> VortexResult<*const u8> {
-        self.inner
-            .as_ptr()
-            .map_err(|e| vortex_err!("failed to access pinned host buffer: {e}"))
-    }
-
-    /// Returns a mutable raw pointer to the buffer.
-    pub fn as_mut_ptr(&mut self) -> VortexResult<*mut u8> {
-        self.inner
-            .as_mut_ptr()
-            .map_err(|e| vortex_err!("failed to access pinned host buffer: {e}"))
-    }
-
-    /// Returns the CUDA context that owns this allocation.
-    pub fn context(&self) -> &Arc<CudaContext> {
-        self.inner.context()
-    }
-}
-
-#[allow(clippy::same_name_method)]
-impl HostSlice<u8> for PinnedByteBuffer {
-    fn len(&self) -> usize {
-        self.len()
-    }
-
-    unsafe fn stream_synced_slice<'a>(
-        &'a self,
-        stream: &'a CudaStream,
-    ) -> (&'a [u8], SyncOnDrop<'a>) {
-        unsafe { <PinnedHostSlice<u8> as HostSlice<u8>>::stream_synced_slice(&self.inner, stream) }
-    }
-
-    unsafe fn stream_synced_mut_slice<'a>(
-        &'a mut self,
-        stream: &'a CudaStream,
-    ) -> (&'a mut [u8], SyncOnDrop<'a>) {
-        unsafe {
-            <PinnedHostSlice<u8> as HostSlice<u8>>::stream_synced_mut_slice(&mut self.inner, stream)
-        }
-    }
-}
-
-/// A simple pinned host buffer pool keyed by allocation size.
-pub struct PinnedByteBufferPool {
-    ctx: Arc<CudaContext>,
-    max_keep_per_size: usize,
-    buckets: Mutex<HashMap<usize, Vec<PinnedByteBuffer>>>,
-    hits: std::sync::atomic::AtomicU64,
-    misses: std::sync::atomic::AtomicU64,
-    allocs: std::sync::atomic::AtomicU64,
-    puts: std::sync::atomic::AtomicU64,
-}
-
-impl PinnedByteBufferPool {
-    /// Create a new pool with default limits.
-    pub fn new(ctx: Arc<CudaContext>) -> Self {
-        Self::with_limits(ctx, 4)
-    }
-
-    /// Create a new pool with a maximum number of cached buffers per size.
-    pub fn with_limits(ctx: Arc<CudaContext>, max_keep_per_size: usize) -> Self {
-        Self {
-            ctx,
-            max_keep_per_size: max_keep_per_size.max(1),
-            buckets: Mutex::new(HashMap::new()),
-            hits: std::sync::atomic::AtomicU64::new(0),
-            misses: std::sync::atomic::AtomicU64::new(0),
-            allocs: std::sync::atomic::AtomicU64::new(0),
-            puts: std::sync::atomic::AtomicU64::new(0),
-        }
-    }
-
-    /// Acquire a pinned buffer of the given size in bytes.
-    pub fn get(&self, len: usize) -> VortexResult<PinnedByteBuffer> {
-        let mut buckets = self.buckets.lock();
-        if let Some(bucket) = buckets.get_mut(&len)
-            && let Some(buf) = bucket.pop()
-        {
-            self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-            return Ok(buf);
-        }
-        self.misses
-            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-        self.allocs.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-        unsafe { PinnedByteBuffer::uninit(&self.ctx, len) }
-    }
-
-    /// Return a buffer to the pool.
-    pub fn put(&self, buf: PinnedByteBuffer) -> VortexResult<()> {
-        let len = buf.len();
-        let mut buckets = self.buckets.lock();
-        let bucket = buckets.entry(len).or_default();
-        if bucket.len() < self.max_keep_per_size {
-            bucket.push(buf);
-        }
-        self.puts.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-        Ok(())
-    }
-
-    /// Get a pooled pinned buffer that will be returned to the pool on drop.
-    pub fn get_pooled(self: &Arc<Self>, len: usize) -> VortexResult<PooledPinnedBuffer> {
-        let inner = self.get(len)?;
-        Ok(PooledPinnedBuffer {
-            inner: Some(inner),
-            pool: self.clone(),
-        })
-    }
-
-    /// Snapshot pool reuse statistics.
-    pub fn stats(&self) -> PinnedPoolStats {
-        PinnedPoolStats {
-            hits: self.hits.load(std::sync::atomic::Ordering::Relaxed),
-            misses: self.misses.load(std::sync::atomic::Ordering::Relaxed),
-            allocs: self.allocs.load(std::sync::atomic::Ordering::Relaxed),
-            puts: self.puts.load(std::sync::atomic::Ordering::Relaxed),
-        }
-    }
-
-    /// Reset pool reuse statistics.
-    pub fn reset_stats(&self) {
-        self.hits.store(0, std::sync::atomic::Ordering::Relaxed);
-        self.misses.store(0, std::sync::atomic::Ordering::Relaxed);
-        self.allocs.store(0, std::sync::atomic::Ordering::Relaxed);
-        self.puts.store(0, std::sync::atomic::Ordering::Relaxed);
-    }
-}
-
-/// Reuse counters for a pinned buffer pool.
-#[derive(Clone, Copy, Debug, Default)]
-pub struct PinnedPoolStats {
-    pub hits: u64,
-    pub misses: u64,
-    pub allocs: u64,
-    pub puts: u64,
-}
-
-/// A pinned buffer that is returned to its pool when dropped.
-///
-/// This wrapper owns a [`PinnedByteBuffer`] and ensures it gets returned to the
-/// [`PinnedByteBufferPool`] when the buffer is no longer needed. This enables efficient
-/// buffer reuse for I/O operations.
-pub struct PooledPinnedBuffer {
-    inner: Option<PinnedByteBuffer>,
-    pool: Arc<PinnedByteBufferPool>,
-}
-
-#[allow(clippy::same_name_method)]
-impl PooledPinnedBuffer {
-    /// Create a new pooled buffer.
-    pub fn new(inner: PinnedByteBuffer, pool: Arc<PinnedByteBufferPool>) -> Self {
-        Self {
-            inner: Some(inner),
-            pool,
-        }
-    }
-
-    /// Returns the length of the buffer in bytes.
-    pub fn len(&self) -> usize {
-        self.inner
-            .as_ref()
-            .map(|b| b.len())
-            .unwrap_or_else(|| vortex_panic!("buffer already consumed"))
-    }
-
-    /// Returns true if the buffer is empty.
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
-    /// Returns the buffer as an immutable slice.
-    pub fn as_slice(&self) -> &[u8] {
-        let inner = self
-            .inner
-            .as_ref()
-            .unwrap_or_else(|| vortex_panic!("buffer already consumed"));
-        inner
-            .as_slice()
-            .unwrap_or_else(|e| vortex_panic!("failed to access pinned host buffer: {e}"))
-    }
-
-    /// Returns the buffer as a mutable slice.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the buffer has already been consumed or if the CUDA context is invalid.
-    pub fn as_mut_slice(&mut self) -> &mut [u8] {
-        let inner = self
-            .inner
-            .as_mut()
-            .unwrap_or_else(|| vortex_panic!("buffer already consumed"));
-        inner
-            .as_mut_slice()
-            .unwrap_or_else(|e| vortex_panic!("failed to access pinned host buffer: {e}"))
-    }
-
-    /// Convert this pooled buffer into a [`ByteBuffer`].
-    ///
-    /// The returned buffer will return the underlying pinned memory to the pool when dropped.
-    /// This enables zero-copy conversion to the standard Vortex buffer type while maintaining
-    /// pool-based memory reuse.
-    pub fn into_byte_buffer(mut self) -> ByteBuffer {
-        let inner = self
-            .inner
-            .take()
-            .unwrap_or_else(|| vortex_panic!("buffer already consumed"));
-        let len = inner.len();
-        let pool = self.pool.clone();
-
-        // Create a wrapper that will return the buffer to the pool on drop
-        let wrapper = PooledPinnedBufferOwner::new(inner, pool);
-
-        // Use Bytes::from_owner to create a Bytes that owns the wrapper
-        let bytes = Bytes::from_owner(wrapper);
-
-        // The ByteBuffer should have the full length
-        assert_eq!(bytes.len(), len);
-
-        ByteBuffer::from(bytes)
-    }
-}
-
-#[allow(clippy::same_name_method)]
-impl HostSlice<u8> for PooledPinnedBuffer {
-    fn len(&self) -> usize {
-        self.len()
-    }
-
-    unsafe fn stream_synced_slice<'a>(
-        &'a self,
-        stream: &'a CudaStream,
-    ) -> (&'a [u8], SyncOnDrop<'a>) {
-        let inner = self
-            .inner
-            .as_ref()
-            .unwrap_or_else(|| vortex_panic!("buffer already consumed"));
-        unsafe { HostSlice::stream_synced_slice(inner, stream) }
-    }
-
-    unsafe fn stream_synced_mut_slice<'a>(
-        &'a mut self,
-        stream: &'a CudaStream,
-    ) -> (&'a mut [u8], SyncOnDrop<'a>) {
-        let inner = self
-            .inner
-            .as_mut()
-            .unwrap_or_else(|| vortex_panic!("buffer already consumed"));
-        unsafe { HostSlice::stream_synced_mut_slice(inner, stream) }
-    }
-}
-
-impl Drop for PooledPinnedBuffer {
-    fn drop(&mut self) {
-        if let Some(inner) = self.inner.take() {
-            // Return the buffer to the pool, ignoring errors
-            drop(self.pool.put(inner));
-        }
-    }
-}
-
-/// Internal wrapper that owns a PinnedByteBuffer and returns it to the pool on drop.
-///
-/// This is used by `Bytes::from_owner` to manage the lifecycle of pooled pinned buffers.
-struct PooledPinnedBufferOwner {
-    // We use Option so we can take the buffer out in Drop
-    inner: Option<PinnedByteBuffer>,
-    // Cached pointer and length for AsRef implementation
-    ptr: *const u8,
-    len: usize,
-    pool: Arc<PinnedByteBufferPool>,
-}
-
-// SAFETY: The pinned buffer is allocated by CUDA and is safe to send across threads.
-// The pointer is derived from the buffer and remains valid as long as the buffer exists.
-unsafe impl Send for PooledPinnedBufferOwner {}
-unsafe impl Sync for PooledPinnedBufferOwner {}
-
-impl PooledPinnedBufferOwner {
-    fn new(inner: PinnedByteBuffer, pool: Arc<PinnedByteBufferPool>) -> Self {
-        let ptr = inner
-            .as_ptr()
-            .unwrap_or_else(|e| vortex_panic!("failed to get pointer to pinned buffer: {e}"));
-        let len = inner.len();
-        Self {
-            inner: Some(inner),
-            ptr,
-            len,
-            pool,
-        }
-    }
-}
-
-impl AsRef<[u8]> for PooledPinnedBufferOwner {
-    fn as_ref(&self) -> &[u8] {
-        // SAFETY: The pointer and length were captured when the buffer was created
-        // and remain valid as long as this struct exists (buffer is in the Mutex).
-        unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
-    }
-}
-
-impl Drop for PooledPinnedBufferOwner {
-    fn drop(&mut self) {
-        // Take the buffer out and return it to the pool
-        if let Some(buffer) = self.inner.take() {
-            drop(self.pool.put(buffer));
-        }
-    }
-}
diff --git a/vortex-cuda/src/pinned_allocator.rs b/vortex-cuda/src/pinned_allocator.rs
deleted file mode 100644
index 6658094f5d3..00000000000
--- a/vortex-cuda/src/pinned_allocator.rs
+++ /dev/null
@@ -1,165 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-use std::sync::Arc;
-
-use cudarc::driver::CudaStream;
-use cudarc::driver::DevicePtrMut;
-use cudarc::driver::result::memcpy_htod_async;
-use futures::future::BoxFuture;
-use futures::FutureExt;
-use vortex_array::buffer::BufferHandle;
-use vortex_buffer::Alignment;
-use vortex_error::VortexResult;
-use vortex_error::vortex_err;
-use vortex_io::BufferAllocator;
-use vortex_io::WriteTarget;
-use vortex_session::VortexSession;
-
-use crate::PinnedByteBufferPool;
-use crate::PooledPinnedBuffer;
-use crate::device_buffer::CudaDeviceBuffer;
-use crate::session::CudaSessionExt;
-use crate::stream::await_stream_callback;
-
-/// Allocator that sources buffers from a CUDA pinned pool.
-pub struct PinnedBufferAllocator {
-    pool: Arc<PinnedByteBufferPool>,
-}
-
-impl PinnedBufferAllocator {
-    pub fn new(pool: Arc<PinnedByteBufferPool>) -> Self {
-        Self { pool }
-    }
-}
-
-impl BufferAllocator for PinnedBufferAllocator {
-    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn WriteTarget>> {
-        let buffer = self.pool.get_pooled(len)?;
-        Ok(Box::new(AlignedPinnedWriteTarget::new(buffer, alignment)))
-    }
-}
-
-struct AlignedPinnedWriteTarget {
-    buffer: PooledPinnedBuffer,
-    alignment: Alignment,
-}
-
-impl AlignedPinnedWriteTarget {
-    fn new(buffer: PooledPinnedBuffer, alignment: Alignment) -> Self {
-        Self { buffer, alignment }
-    }
-}
-
-impl WriteTarget for AlignedPinnedWriteTarget {
-    fn as_mut_slice(&mut self) -> &mut [u8] {
-        self.buffer.as_mut_slice()
-    }
-
-    fn len(&self) -> usize {
-        self.buffer.len()
-    }
-
-    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>> {
-        async move {
-            let ptr = self.buffer.as_slice().as_ptr() as usize;
-            let align = *self.alignment;
-            // CUDA pinned allocations don't accept an explicit alignment request,
-            // so we validate the actual pointer alignment after allocation.
-            if align > 1 && ptr % align != 0 {
-                return Err(vortex_err!(
-                    "Pinned host buffer not aligned to {} (ptr=0x{:x})",
-                    align,
-                    ptr
-                ));
-            }
-            Ok(BufferHandle::new_host(self.buffer.into_byte_buffer()))
-        }
-        .boxed()
-    }
-}
-
-/// Allocator that reads into pinned buffers and transfers to device memory.
-pub struct PinnedDeviceAllocator {
-    pool: Arc<PinnedByteBufferPool>,
-    stream: Arc<CudaStream>,
-}
-
-impl PinnedDeviceAllocator {
-    pub fn new(pool: Arc<PinnedByteBufferPool>, stream: Arc<CudaStream>) -> Self {
-        Self { pool, stream }
-    }
-
-    pub fn from_session(
-        pool: Arc<PinnedByteBufferPool>,
-        session: &VortexSession,
-    ) -> VortexResult<Self> {
-        let stream = session.cuda_session().new_stream()?;
-        Ok(Self::new(pool, stream))
-    }
-}
-
-impl BufferAllocator for PinnedDeviceAllocator {
-    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn WriteTarget>> {
-        let buffer = self.pool.get_pooled(len)?;
-        Ok(Box::new(PinnedDeviceWriteTarget {
-            buffer,
-            stream: self.stream.clone(),
-            alignment,
-        }))
-    }
-}
-
-struct PinnedDeviceWriteTarget {
-    buffer: PooledPinnedBuffer,
-    stream: Arc<CudaStream>,
-    alignment: Alignment,
-}
-
-impl WriteTarget for PinnedDeviceWriteTarget {
-    fn as_mut_slice(&mut self) -> &mut [u8] {
-        self.buffer.as_mut_slice()
-    }
-
-    fn len(&self) -> usize {
-        self.buffer.len()
-    }
-
-    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>> {
-        let len = self.buffer.len();
-        let stream = self.stream.clone();
-        let host = self.buffer;
-        let alignment = self.alignment;
-        async move {
-            let ptr = host.as_slice().as_ptr() as usize;
-            let align = *alignment;
-            // CUDA pinned allocations don't accept an explicit alignment request,
-            // so we validate the actual pointer alignment after allocation.
-            if align > 1 && ptr % align != 0 {
-                return Err(vortex_err!(
-                    "Pinned host buffer not aligned to {} (ptr=0x{:x})",
-                    align,
-                    ptr
-                ));
-            }
-
-            let mut device = unsafe { stream.alloc::<u8>(len) }
-                .map_err(|e| vortex_err!("Failed to allocate device memory: {e}"))?;
-
-            let device_ptr = device.device_ptr_mut(&stream).0;
-            let host_slice = host.as_slice();
-            unsafe {
-                memcpy_htod_async(device_ptr, host_slice, stream.cu_stream())
-                    .map_err(|e| vortex_err!("Failed to schedule H2D copy: {e}"))?;
-            }
-
-            await_stream_callback(&stream).await?;
-
-            // Keep the host buffer alive until the copy completes.
-            let _keep_alive = host;
-
-            Ok(BufferHandle::new_device(Arc::new(CudaDeviceBuffer::new(device))))
-        }
-        .boxed()
-    }
-}

From 613e21cabe975f1f17d21d9b3864f5955d4687b9 Mon Sep 17 00:00:00 2001
From: Onur Satici <onur@spiraldb.com>
Date: Tue, 27 Jan 2026 17:27:16 +0000
Subject: [PATCH 3/5] device buffer alignment + read region

Signed-off-by: Onur Satici <onur@spiraldb.com>
---
 vortex-array/src/arrays/decimal/vtable/mod.rs |  12 +-
 .../src/arrays/primitive/vtable/mod.rs        |  17 +--
 vortex-array/src/buffer.rs                    |  38 +++++
 vortex-array/src/serde.rs                     |  23 +--
 vortex-cuda/benches/dict_cuda.rs              |  13 +-
 vortex-cuda/src/device_buffer.rs              |  17 ++-
 vortex-cuda/src/executor.rs                   |   3 +-
 vortex-cuda/src/host_to_device_allocator.rs   |  29 ++--
 vortex-cuda/src/kernel/arrays/dict.rs         |   5 +-
 vortex-cuda/src/kernel/encodings/alp.rs       |   3 +-
 vortex-cuda/src/kernel/encodings/zstd.rs      |   2 +-
 vortex-file/src/open.rs                       |  36 ++---
 vortex-file/src/read/request.rs               |  37 +++--
 vortex-file/src/segments/source.rs            |  29 +---
 vortex-io/src/allocator.rs                    |   6 +-
 vortex-io/src/file/object_store.rs            |  74 +--------
 vortex-io/src/file/std_file.rs                |  26 +---
 vortex-io/src/lib.rs                          |   4 +-
 vortex-io/src/read.rs                         | 140 +++++++++---------
 vortex-io/src/read_target.rs                  |  67 +++++++++
 vortex-io/src/runtime/tests.rs                |  62 ++++----
 vortex-io/src/write_target.rs                 |  39 -----
 22 files changed, 313 insertions(+), 369 deletions(-)
 create mode 100644 vortex-io/src/read_target.rs
 delete mode 100644 vortex-io/src/write_target.rs

diff --git a/vortex-array/src/arrays/decimal/vtable/mod.rs b/vortex-array/src/arrays/decimal/vtable/mod.rs
index f253902813f..7ae29978423 100644
--- a/vortex-array/src/arrays/decimal/vtable/mod.rs
+++ b/vortex-array/src/arrays/decimal/vtable/mod.rs
@@ -107,13 +107,11 @@ impl VTable for DecimalVTable {
 
         match_each_decimal_value_type!(metadata.values_type(), |D| {
             // Check and reinterpret-cast the buffer
-            if let Some(buffer) = values.as_host_opt() {
-                vortex_ensure!(
-                    buffer.is_aligned(Alignment::of::<D>()),
-                    "DecimalArray buffer not aligned for values type {:?}",
-                    D::DECIMAL_TYPE
-                );
-            }
+            vortex_ensure!(
+                values.alignment().is_aligned_to(Alignment::of::<D>()),
+                "DecimalArray buffer not aligned for values type {:?}",
+                D::DECIMAL_TYPE
+            );
             DecimalArray::try_new_handle(values, metadata.values_type(), *decimal_dtype, validity)
         })
     }
diff --git a/vortex-array/src/arrays/primitive/vtable/mod.rs b/vortex-array/src/arrays/primitive/vtable/mod.rs
index cf702ca3dad..459f24b433e 100644
--- a/vortex-array/src/arrays/primitive/vtable/mod.rs
+++ b/vortex-array/src/arrays/primitive/vtable/mod.rs
@@ -101,16 +101,13 @@ impl VTable for PrimitiveVTable {
             );
         }
 
-        // For host buffers, we eagerly check alignment on construction.
-        // TODO(aduffy): check for device buffers. CUDA buffers are generally 256-byte aligned,
-        //  but not sure about other devices.
-        if let Some(host_buf) = buffer.as_host_opt() {
-            vortex_ensure!(
-                host_buf.is_aligned(Alignment::new(ptype.byte_width())),
-                "PrimitiveArray::build: Buffer must be aligned to {}",
-                ptype.byte_width()
-            );
-        }
+        vortex_ensure!(
+            buffer
+                .alignment()
+                .is_aligned_to(Alignment::new(ptype.byte_width())),
+            "PrimitiveArray::build: Buffer must be aligned to {}",
+            ptype.byte_width()
+        );
 
         // SAFETY: checked ahead of time
         unsafe {
diff --git a/vortex-array/src/buffer.rs b/vortex-array/src/buffer.rs
index cf209b75f80..35cbf5a233b 100644
--- a/vortex-array/src/buffer.rs
+++ b/vortex-array/src/buffer.rs
@@ -14,6 +14,7 @@ use vortex_buffer::Alignment;
 use vortex_buffer::ByteBuffer;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
 use vortex_utils::dyn_traits::DynEq;
 use vortex_utils::dyn_traits::DynHash;
 
@@ -50,6 +51,9 @@ pub trait DeviceBuffer: 'static + Send + Sync + Debug + DynEq + DynHash {
     /// Returns the length of the buffer in bytes.
     fn len(&self) -> usize;
 
+    /// Returns the alignment of the buffer.
+    fn alignment(&self) -> Alignment;
+
     /// Returns true if the buffer is empty.
     fn is_empty(&self) -> bool {
         self.len() == 0
@@ -130,6 +134,40 @@ impl BufferHandle {
         }
     }
 
+    /// Returns the alignment reported by the underlying host or device buffer.
+    pub fn alignment(&self) -> Alignment {
+        match &self.0 {
+            Inner::Host(bytes) => bytes.alignment(),
+            Inner::Device(device) => device.alignment(),
+        }
+    }
+
+    /// Returns true if the alignment reported by [`Self::alignment`] satisfies `alignment`.
+    pub fn is_aligned(&self, alignment: Alignment) -> bool {
+        self.alignment().is_aligned_to(alignment)
+    }
+
+    /// Returns a handle to this buffer that satisfies the requested `alignment`.
+    ///
+    /// Host buffers go through `aligned(alignment)` and are copied if necessary. Device buffers
+    /// are never moved: an already-aligned handle is cloned, anything else is an error.
+    pub fn ensure_aligned(&self, alignment: Alignment) -> VortexResult<Self> {
+        match &self.0 {
+            Inner::Host(buffer) => Ok(BufferHandle::new_host(buffer.clone().aligned(alignment))),
+            Inner::Device(device) => {
+                if device.alignment().is_aligned_to(alignment) {
+                    Ok(self.clone())
+                } else {
+                    vortex_bail!(
+                        "Device buffer alignment {} does not satisfy required alignment {}",
+                        device.alignment(),
+                        alignment
+                    );
+                }
+            }
+        }
+    }
+
     /// Check if the buffer is empty.
     pub fn is_empty(&self) -> bool {
         self.len() == 0
diff --git a/vortex-array/src/serde.rs b/vortex-array/src/serde.rs
index 06aa687068a..87b786b3717 100644
--- a/vortex-array/src/serde.rs
+++ b/vortex-array/src/serde.rs
@@ -492,11 +492,7 @@ impl ArrayParts {
     ) -> VortexResult<Self> {
         // We align each buffer individually, so we remove alignment requirements on the segment
         // for host-resident buffers. Device buffers are sliced directly.
-        let segment = if let Some(host) = segment.as_host_opt() {
-            BufferHandle::new_host(host.clone().aligned(Alignment::none()))
-        } else {
-            segment
-        };
+        let segment = segment.ensure_aligned(Alignment::none())?;
 
         let fb_buffer = FlatBuffer::align_from(array_tree);
 
@@ -507,7 +503,7 @@ impl ArrayParts {
             let flatbuffer_loc = fb_root._tab.loc();
 
             let mut offset = 0;
-            let buffers: Arc<[_]> = fb_array
+            let buffers: VortexResult<Vec<_>> = fb_array
                 .buffers()
                 .unwrap_or_default()
                 .iter()
@@ -519,19 +515,14 @@ impl ArrayParts {
 
                     // Extract a buffer and ensure it's aligned, copying if necessary
                     let buffer = segment.slice(offset..(offset + buffer_len));
-                    let buffer = if let Some(host) = buffer.as_host_opt() {
-                        BufferHandle::new_host(
-                            host.clone().aligned(Alignment::from_exponent(
-                                fb_buf.alignment_exponent(),
-                            )),
-                        )
-                    } else {
-                        buffer
-                    };
+                    let buffer = buffer.ensure_aligned(Alignment::from_exponent(
+                        fb_buf.alignment_exponent(),
+                    ))?;
                     offset += buffer_len;
-                    buffer
+                    Ok(buffer)
                 })
                 .collect();
+            let buffers: Arc<[_]> = buffers?.into();
 
             (flatbuffer_loc, buffers)
         };
diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs
index d22b1acc326..779e0258778 100644
--- a/vortex-cuda/benches/dict_cuda.rs
+++ b/vortex-cuda/benches/dict_cuda.rs
@@ -17,6 +17,7 @@ use vortex_array::IntoArray;
 use vortex_array::arrays::DictArray;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::validity::Validity::NonNullable;
+use vortex_buffer::Alignment;
 use vortex_buffer::Buffer;
 use vortex_cuda::CudaBufferExt;
 use vortex_cuda::CudaDeviceBuffer;
@@ -144,7 +145,8 @@ fn benchmark_dict_u32_u8(c: &mut Criterion) {
                         let output_slice = cuda_ctx
                             .device_alloc::<u32>(dict_array.len())
                             .vortex_expect("failed to allocate output");
-                        let output_device = CudaDeviceBuffer::new(output_slice);
+                        let output_device =
+                            CudaDeviceBuffer::new(output_slice, Alignment::of::<u32>());
 
                         let kernel_time = launch_dict_kernel_timed(
                             codes_device
@@ -207,7 +209,8 @@ fn benchmark_dict_u32_u16(c: &mut Criterion) {
                         let output_slice = cuda_ctx
                             .device_alloc::<u32>(dict_array.len())
                             .vortex_expect("failed to allocate output");
-                        let output_device = CudaDeviceBuffer::new(output_slice);
+                        let output_device =
+                            CudaDeviceBuffer::new(output_slice, Alignment::of::<u32>());
 
                         let kernel_time = launch_dict_kernel_timed(
                             codes_device
@@ -270,7 +273,8 @@ fn benchmark_dict_u64_u8(c: &mut Criterion) {
                         let output_slice = cuda_ctx
                             .device_alloc::<u64>(dict_array.len())
                             .vortex_expect("failed to allocate output");
-                        let output_device = CudaDeviceBuffer::new(output_slice);
+                        let output_device =
+                            CudaDeviceBuffer::new(output_slice, Alignment::of::<u64>());
 
                         let kernel_time = launch_dict_kernel_timed(
                             codes_device
@@ -333,7 +337,8 @@ fn benchmark_dict_u64_u32(c: &mut Criterion) {
                         let output_slice = cuda_ctx
                             .device_alloc::<u64>(dict_array.len())
                             .vortex_expect("failed to allocate output");
-                        let output_device = CudaDeviceBuffer::new(output_slice);
+                        let output_device =
+                            CudaDeviceBuffer::new(output_slice, Alignment::of::<u64>());
 
                         let kernel_time = launch_dict_kernel_timed(
                             codes_device
diff --git a/vortex-cuda/src/device_buffer.rs b/vortex-cuda/src/device_buffer.rs
index 2cc6517324d..c7beec10cce 100644
--- a/vortex-cuda/src/device_buffer.rs
+++ b/vortex-cuda/src/device_buffer.rs
@@ -27,11 +27,12 @@ pub struct CudaDeviceBuffer<T> {
     offset: usize,
     len: usize,
     device_ptr: u64,
+    alignment: Alignment,
 }
 
 impl<T: DeviceRepr> CudaDeviceBuffer<T> {
     /// Creates a new CUDA device buffer from a [`CudaSlice`].
-    pub fn new(cuda_slice: CudaSlice<T>) -> Self {
+    pub fn new(cuda_slice: CudaSlice<T>, alignment: Alignment) -> Self {
         let len = cuda_slice.len();
         let device_ptr = cuda_slice.device_ptr(cuda_slice.stream()).0;
 
@@ -40,6 +41,7 @@ impl<T: DeviceRepr> CudaDeviceBuffer<T> {
             offset: 0,
             len,
             device_ptr,
+            alignment,
         }
     }
 
@@ -109,6 +111,10 @@ impl<T: DeviceRepr + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T>
         self.len * size_of::<T>()
     }
 
+    fn alignment(&self) -> Alignment {
+        self.alignment
+    }
+
     /// Synchronous copy of CUDA device to host memory.
     ///
     /// The copy is not started before other operations on the streams are completed.
@@ -185,6 +191,14 @@ impl<T: DeviceRepr + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T>
     fn slice(&self, range: Range<usize>) -> Arc<dyn DeviceBuffer> {
         let new_offset = self.offset + range.start;
         let new_len = range.end - range.start;
+        let byte_offset = new_offset * size_of::<T>();
+        let alignment = if byte_offset == 0 {
+            self.alignment
+        } else {
+            let offset_alignment = 1usize << byte_offset.trailing_zeros();
+            let max_alignment = *self.alignment;
+            Alignment::new(offset_alignment.min(max_alignment))
+        };
 
         assert!(
             range.end <= self.len,
@@ -198,6 +212,7 @@ impl<T: DeviceRepr + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T>
             offset: new_offset,
             len: new_len,
             device_ptr: self.device_ptr,
+            alignment,
         })
     }
 
diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs
index 63b2af86675..eb34d412a61 100644
--- a/vortex-cuda/src/executor.rs
+++ b/vortex-cuda/src/executor.rs
@@ -20,6 +20,7 @@ use vortex_array::ArrayRef;
 use vortex_array::Canonical;
 use vortex_array::ExecutionCtx;
 use vortex_array::buffer::BufferHandle;
+use vortex_buffer::Alignment;
 use vortex_buffer::Buffer;
 use vortex_dtype::PType;
 use vortex_error::VortexResult;
@@ -176,7 +177,7 @@ impl CudaExecutionCtx {
                 .map_err(|e| vortex_err!("Failed to schedule async copy to device: {}", e))?;
         }
 
-        let cuda_buf = CudaDeviceBuffer::new(cuda_slice);
+        let cuda_buf = CudaDeviceBuffer::new(cuda_slice, Alignment::of::<T>());
         let stream = Arc::clone(&self.stream);
 
         Ok(Box::pin(async move {
diff --git a/vortex-cuda/src/host_to_device_allocator.rs b/vortex-cuda/src/host_to_device_allocator.rs
index 70e3c5130c8..add76ebf7d2 100644
--- a/vortex-cuda/src/host_to_device_allocator.rs
+++ b/vortex-cuda/src/host_to_device_allocator.rs
@@ -14,7 +14,8 @@ use vortex_buffer::ByteBufferMut;
 use vortex_error::VortexResult;
 use vortex_error::vortex_err;
 use vortex_io::BufferAllocator;
-use vortex_io::WriteTarget;
+use vortex_io::ReadRegion;
+use vortex_io::ReadTarget;
 use vortex_session::VortexSession;
 
 use crate::device_buffer::CudaDeviceBuffer;
@@ -38,7 +39,7 @@ impl HostToDeviceAllocator {
 }
 
 impl BufferAllocator for HostToDeviceAllocator {
-    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn WriteTarget>> {
+    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn ReadTarget>> {
         let mut buffer = ByteBufferMut::with_capacity_aligned(len, alignment);
         unsafe { buffer.set_len(len) };
         Ok(Box::new(NaiveDeviceWriteTarget {
@@ -55,30 +56,20 @@ struct NaiveDeviceWriteTarget {
     alignment: Alignment,
 }
 
-impl WriteTarget for NaiveDeviceWriteTarget {
-    fn as_mut_slice(&mut self) -> &mut [u8] {
-        self.buffer.as_mut()
-    }
-
+impl ReadTarget for NaiveDeviceWriteTarget {
     fn len(&self) -> usize {
         self.buffer.len()
     }
 
+    fn region(&mut self) -> ReadRegion<'_> {
+        ReadRegion::HostSlice(self.buffer.as_mut())
+    }
+
     fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>> {
         let stream = self.stream.clone();
         let alignment = self.alignment;
         let host = self.buffer;
         async move {
-            let ptr = host.as_ref().as_ptr() as usize;
-            let align = *alignment;
-            if align > 1 && ptr % align != 0 {
-                return Err(vortex_err!(
-                    "Host buffer not aligned to {} (ptr=0x{:x})",
-                    align,
-                    ptr
-                ));
-            }
-
             let len = host.len();
             let mut device = unsafe { stream.alloc::<u8>(len) }
                 .map_err(|e| vortex_err!("Failed to allocate device memory: {e}"))?;
@@ -95,7 +86,9 @@ impl WriteTarget for NaiveDeviceWriteTarget {
             // Keep the host buffer alive until the copy completes.
             let _keep_alive = host;
 
-            Ok(BufferHandle::new_device(Arc::new(CudaDeviceBuffer::new(device))))
+            Ok(BufferHandle::new_device(Arc::new(CudaDeviceBuffer::new(
+                device, alignment,
+            ))))
         }
         .boxed()
     }
diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs
index 20d1a6c6425..084d9f5bdce 100644
--- a/vortex-cuda/src/kernel/arrays/dict.rs
+++ b/vortex-cuda/src/kernel/arrays/dict.rs
@@ -16,6 +16,7 @@ use vortex_array::arrays::DictVTable;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::PrimitiveArrayParts;
 use vortex_array::buffer::BufferHandle;
+use vortex_buffer::Alignment;
 use vortex_dtype::DType;
 use vortex_dtype::DecimalType;
 use vortex_dtype::NativeDecimalType;
@@ -116,7 +117,7 @@ async fn execute_dict_prim_typed<V: DeviceRepr + NativePType, I: DeviceRepr + Na
 
     // Allocate output buffer on device
     let output_slice = ctx.device_alloc::<V>(codes_len)?;
-    let output_device = CudaDeviceBuffer::new(output_slice);
+    let output_device = CudaDeviceBuffer::new(output_slice, Alignment::of::<V>());
 
     // Get views for kernel launch
     let values_view = values_device.cuda_view::<V>()?;
@@ -225,7 +226,7 @@ async fn execute_dict_decimal_typed<
 
     // Allocate output buffer on device (codes_len * value_byte_width bytes)
     let output_slice = ctx.device_alloc::<V>(codes_len)?;
-    let output_device = CudaDeviceBuffer::new(output_slice);
+    let output_device = CudaDeviceBuffer::new(output_slice, Alignment::of::<V>());
 
     // Get views for kernel launch
     let values_view = values_device.cuda_view::<V>()?;
diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs
index f8c40f73a34..57bfc826dda 100644
--- a/vortex-cuda/src/kernel/encodings/alp.rs
+++ b/vortex-cuda/src/kernel/encodings/alp.rs
@@ -18,6 +18,7 @@ use vortex_array::Canonical;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::PrimitiveArrayParts;
 use vortex_array::buffer::BufferHandle;
+use vortex_buffer::Alignment;
 use vortex_dtype::NativePType;
 use vortex_error::VortexResult;
 use vortex_error::vortex_err;
@@ -83,7 +84,7 @@ where
 
     // Allocate output buffer
     let output_slice = ctx.device_alloc::<A>(array_len)?;
-    let output_buf = CudaDeviceBuffer::new(output_slice);
+    let output_buf = CudaDeviceBuffer::new(output_slice, Alignment::of::<A>());
     let output_view = output_buf.as_view();
 
     let array_len_u64 = array_len as u64;
diff --git a/vortex-cuda/src/kernel/encodings/zstd.rs b/vortex-cuda/src/kernel/encodings/zstd.rs
index 508990897d8..1aa8b2741fa 100644
--- a/vortex-cuda/src/kernel/encodings/zstd.rs
+++ b/vortex-cuda/src/kernel/encodings/zstd.rs
@@ -269,7 +269,7 @@ async fn decode_zstd(array: ZstdArray, ctx: &mut CudaExecutionCtx) -> VortexResu
     // self-contained. They neither have any parent or child encodings.
     //
     // TODO(0ax1): Don't copy back to host once VarBinView supports buffer handles.
-    let host_buffer = CudaDeviceBuffer::new(exec.device_output)
+    let host_buffer = CudaDeviceBuffer::new(exec.device_output, Alignment::of::<u8>())
         .copy_to_host(Alignment::new(1))?
         .await?;
 
diff --git a/vortex-file/src/open.rs b/vortex-file/src/open.rs
index 4d9c566daf8..eab011f4625 100644
--- a/vortex-file/src/open.rs
+++ b/vortex-file/src/open.rs
@@ -12,6 +12,7 @@ use vortex_dtype::DType;
 use vortex_error::VortexError;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
+use vortex_io::AllocatingReadAt;
 use vortex_io::BufferAllocator;
 use vortex_io::InstrumentedReadAt;
 use vortex_io::VortexReadAt;
@@ -169,7 +170,11 @@ impl VortexOpenOptions {
     pub async fn open_read<R: VortexReadAt + Clone>(self, reader: R) -> VortexResult<VortexFile> {
         let metrics = self.metrics.clone().unwrap_or_default();
         let reader = InstrumentedReadAt::new(reader, &metrics);
-        let reader: Arc<dyn VortexReadAt> = Arc::new(reader);
+        let reader: Arc<dyn VortexReadAt> = if let Some(allocator) = &self.allocator {
+            Arc::new(AllocatingReadAt::new(reader, allocator.clone()))
+        } else {
+            Arc::new(reader)
+        };
         let footer = if let Some(footer) = self.footer {
             footer
         } else {
@@ -186,12 +191,11 @@ impl VortexOpenOptions {
 
         // Create a segment source backed by the VortexRead implementation.
         let segment_source = Arc::new(SharedSegmentSource::new(
-            FileSegmentSource::open_with_allocator(
+            FileSegmentSource::open(
                 footer.segment_map().clone(),
                 reader,
                 self.session.handle(),
                 metrics.clone(),
-                self.allocator.clone(),
             ),
         ));
 
@@ -225,7 +229,8 @@ impl VortexOpenOptions {
         let initial_offset = file_size - initial_read_size as u64;
         let initial_read: ByteBuffer = read
             .read_at(initial_offset, initial_read_size, Alignment::none())
-            .await?;
+            .await?
+            .try_into_host_sync()?;
 
         let mut deserializer = Footer::deserializer(initial_read, self.session.clone())
             .with_size(file_size)
@@ -234,7 +239,10 @@ impl VortexOpenOptions {
         let footer = loop {
             match deserializer.deserialize()? {
                 DeserializeStep::NeedMoreData { offset, len } => {
-                    let more_data = read.read_at(offset, len, Alignment::none()).await?;
+                    let more_data = read
+                        .read_at(offset, len, Alignment::none())
+                        .await?
+                        .try_into_host_sync()?;
                     deserializer.prefix_data(more_data);
                 }
                 DeserializeStep::NeedFileSize => unreachable!("We passed file_size above"),
@@ -332,23 +340,7 @@ mod tests {
             offset: u64,
             length: usize,
             alignment: Alignment,
-        ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
-            self.total_read.fetch_add(length, Ordering::Relaxed);
-            let _ = self.first_read_len.compare_exchange(
-                0,
-                length,
-                Ordering::Relaxed,
-                Ordering::Relaxed,
-            );
-            self.inner.read_at(offset, length, alignment)
-        }
-
-        fn read_at_into(
-            &self,
-            offset: u64,
-            target: Box<dyn vortex_io::WriteTarget>,
         ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
-            let length = target.len();
             self.total_read.fetch_add(length, Ordering::Relaxed);
             let _ = self.first_read_len.compare_exchange(
                 0,
@@ -356,7 +348,7 @@ mod tests {
                 Ordering::Relaxed,
                 Ordering::Relaxed,
             );
-            self.inner.read_at_into(offset, target)
+            self.inner.read_at(offset, length, alignment)
         }
 
         fn concurrency(&self) -> usize {
diff --git a/vortex-file/src/read/request.rs b/vortex-file/src/read/request.rs
index 8d146a32d61..cdd71670070 100644
--- a/vortex-file/src/read/request.rs
+++ b/vortex-file/src/read/request.rs
@@ -135,22 +135,29 @@ impl CoalescedRequest {
     pub fn resolve(self, result: VortexResult<BufferHandle>) {
         match result {
             Ok(buffer) => {
-                if let Some(host) = buffer.as_host_opt() {
-                    let host = host.clone().aligned(Alignment::none());
-                    for req in self.requests.into_iter() {
-                        let start = usize::try_from(req.offset - self.range.start)
-                            .vortex_expect("invalid offset");
-                        let end = start + req.length;
-                        let slice = host.slice(start..end).aligned(req.alignment);
-                        req.resolve(Ok(BufferHandle::new_host(slice)));
-                    }
-                } else {
-                    for req in self.requests.into_iter() {
-                        let start = usize::try_from(req.offset - self.range.start)
-                            .vortex_expect("invalid offset");
-                        let end = start + req.length;
-                        req.resolve(Ok(buffer.slice(start..end)));
+                let base = match buffer.ensure_aligned(Alignment::none()) {
+                    Ok(base) => base,
+                    Err(e) => {
+                        let e = Arc::new(e);
+                        for req in self.requests.into_iter() {
+                            req.resolve(Err(VortexError::from(e.clone())));
+                        }
+                        return;
                     }
+                };
+
+                for req in self.requests.into_iter() {
+                    let start = usize::try_from(req.offset - self.range.start)
+                        .vortex_expect("invalid offset");
+                    let end = start + req.length;
+                    let slice = match base.slice(start..end).ensure_aligned(req.alignment) {
+                        Ok(slice) => slice,
+                        Err(e) => {
+                            req.resolve(Err(e));
+                            continue;
+                        }
+                    };
+                    req.resolve(Ok(slice));
                 }
             }
             Err(e) => {
diff --git a/vortex-file/src/segments/source.rs b/vortex-file/src/segments/source.rs
index 04efe6117de..344f805f516 100644
--- a/vortex-file/src/segments/source.rs
+++ b/vortex-file/src/segments/source.rs
@@ -16,7 +16,6 @@ use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
 use vortex_error::VortexResult;
 use vortex_error::vortex_err;
-use vortex_io::BufferAllocator;
 use vortex_io::VortexReadAt;
 use vortex_io::runtime::Handle;
 use vortex_layout::segments::SegmentFuture;
@@ -71,16 +70,6 @@ impl FileSegmentSource {
         reader: R,
         handle: Handle,
         metrics: VortexMetrics,
-    ) -> Self {
-        Self::open_with_allocator(segments, Arc::new(reader), handle, metrics, None)
-    }
-
-    pub fn open_with_allocator(
-        segments: Arc<[SegmentSpec]>,
-        source: Arc<dyn VortexReadAt>,
-        handle: Handle,
-        metrics: VortexMetrics,
-        allocator: Option<Arc<dyn BufferAllocator>>,
     ) -> Self {
         let (send, recv) = mpsc::unbounded();
 
@@ -97,7 +86,6 @@ impl FileSegmentSource {
             config
         });
         let concurrency = reader.concurrency();
-        let allocator = allocator.clone();
 
         let drive_fut = async move {
             let stream = IoRequestStream::new(
@@ -110,20 +98,11 @@ impl FileSegmentSource {
 
             stream
                 .map(move |req| {
-                    let source = source.clone();
-                    let allocator = allocator.clone();
+                    let reader = reader.clone();
                     async move {
-                        let result = if let Some(allocator) = allocator {
-                            match allocator.allocate(req.len(), req.alignment()) {
-                                Ok(target) => source.read_at_into(req.offset(), target).await,
-                                Err(e) => Err(e),
-                            }
-                        } else {
-                            source
-                                .read_at(req.offset(), req.len(), req.alignment())
-                                .await
-                                .map(BufferHandle::new_host)
-                        };
+                        let result = reader
+                            .read_at(req.offset(), req.len(), req.alignment())
+                            .await;
                         req.resolve(result);
                     }
                 })
diff --git a/vortex-io/src/allocator.rs b/vortex-io/src/allocator.rs
index 89cdc637409..2fa2d56844d 100644
--- a/vortex-io/src/allocator.rs
+++ b/vortex-io/src/allocator.rs
@@ -8,12 +8,12 @@ use vortex_buffer::Alignment;
 use vortex_buffer::ByteBufferMut;
 use vortex_error::VortexResult;
 
-use crate::WriteTarget;
+use crate::ReadTarget;
 
 /// Allocates buffers for I/O reads.
 pub trait BufferAllocator: Send + Sync + 'static {
     /// Allocate a buffer for the requested length and alignment.
-    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn WriteTarget>>;
+    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn ReadTarget>>;
 }
 
 /// The default allocator that uses `ByteBufferMut`.
@@ -42,7 +42,7 @@ pub fn reset_default_alloc_stats() {
 }
 
 impl BufferAllocator for DefaultAllocator {
-    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn WriteTarget>> {
+    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn ReadTarget>> {
         DEFAULT_ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
         DEFAULT_ALLOC_BYTES.fetch_add(len as u64, Ordering::Relaxed);
         let mut buffer = ByteBufferMut::with_capacity_aligned(len, alignment);
diff --git a/vortex-io/src/file/object_store.rs b/vortex-io/src/file/object_store.rs
index 882b3ef8db6..80c8a9343fe 100644
--- a/vortex-io/src/file/object_store.rs
+++ b/vortex-io/src/file/object_store.rs
@@ -15,7 +15,6 @@ use object_store::ObjectStore;
 use object_store::path::Path as ObjectPath;
 use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
-use vortex_buffer::ByteBuffer;
 use vortex_buffer::ByteBufferMut;
 use vortex_error::VortexError;
 use vortex_error::VortexResult;
@@ -23,7 +22,6 @@ use vortex_error::vortex_ensure;
 
 use crate::CoalesceConfig;
 use crate::VortexReadAt;
-use crate::WriteTarget;
 #[cfg(not(target_arch = "wasm32"))]
 use crate::file::std_file::read_exact_at;
 use crate::runtime::Handle;
@@ -110,7 +108,7 @@ impl VortexReadAt for ObjectStoreSource {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
         let store = self.store.clone();
         let path = self.path.clone();
         let handle = self.handle.clone();
@@ -163,75 +161,7 @@ impl VortexReadAt for ObjectStoreSource {
                 }
             };
 
-            Ok(buffer.freeze())
-        })
-        .boxed()
-    }
-
-    fn read_at_into(
-        &self,
-        offset: u64,
-        mut target: Box<dyn WriteTarget>,
-    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
-        let store = self.store.clone();
-        let path = self.path.clone();
-        let handle = self.handle.clone();
-        let length = target.len();
-        let range = offset..(offset + length as u64);
-
-        Compat::new(async move {
-            let response = store
-                .get_opts(
-                    &path,
-                    GetOptions {
-                        range: Some(GetRange::Bounded(range.clone())),
-                        ..Default::default()
-                    },
-                )
-                .await?;
-
-            match response.payload {
-                #[cfg(not(target_arch = "wasm32"))]
-                GetResultPayload::File(file, _) => {
-                    target = handle
-                        .spawn_blocking(move || {
-                            read_exact_at(&file, target.as_mut_slice(), range.start)?;
-                            Ok::<_, io::Error>(target)
-                        })
-                        .await
-                        .map_err(io::Error::other)?;
-                }
-                #[cfg(target_arch = "wasm32")]
-                GetResultPayload::File(..) => {
-                    unreachable!("File payload not supported on wasm32")
-                }
-                GetResultPayload::Stream(mut byte_stream) => {
-                    let mut filled = 0usize;
-                    while let Some(bytes) = byte_stream.next().await {
-                        let bytes = bytes?;
-                        let end = filled + bytes.len();
-                        vortex_ensure!(
-                            end <= length,
-                            "Object store stream returned more bytes than expected (expected {} bytes, got at least {} bytes, range: {:?})",
-                            length,
-                            end,
-                            range
-                        );
-                        target.as_mut_slice()[filled..end].copy_from_slice(&bytes);
-                        filled = end;
-                    }
-
-                    vortex_ensure!(
-                        filled == length,
-                        "Object store stream returned {} bytes but expected {} bytes (range: {:?})",
-                        filled,
-                        length,
-                        range
-                    );
-                }
-            }
-
-            target.into_handle().await
+            Ok(BufferHandle::new_host(buffer.freeze()))
         })
         .boxed()
     }
diff --git a/vortex-io/src/file/std_file.rs b/vortex-io/src/file/std_file.rs
index 68a6982cb50..77417aea659 100644
--- a/vortex-io/src/file/std_file.rs
+++ b/vortex-io/src/file/std_file.rs
@@ -18,13 +18,11 @@ use futures::FutureExt;
 use futures::future::BoxFuture;
 use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
-use vortex_buffer::ByteBuffer;
 use vortex_buffer::ByteBufferMut;
 use vortex_error::VortexResult;
 
 use crate::CoalesceConfig;
 use crate::VortexReadAt;
-use crate::WriteTarget;
 use crate::runtime::Handle;
 
 /// Read exactly `buffer.len()` bytes from `file` starting at `offset`.
@@ -110,7 +108,7 @@ impl VortexReadAt for FileReadAdapter {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
         let file = self.file.clone();
         let handle = self.handle.clone();
         async move {
@@ -119,29 +117,9 @@ impl VortexReadAt for FileReadAdapter {
                     let mut buffer = ByteBufferMut::with_capacity_aligned(length, alignment);
                     unsafe { buffer.set_len(length) };
                     read_exact_at(&file, &mut buffer, offset)?;
-                    Ok(buffer.freeze())
-                })
-                .await
-        }
-        .boxed()
-    }
-
-    fn read_at_into(
-        &self,
-        offset: u64,
-        mut target: Box<dyn WriteTarget>,
-    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
-        let file = self.file.clone();
-        let handle = self.handle.clone();
-        async move {
-            let target = handle
-                .spawn_blocking(move || {
-                    read_exact_at(&file, target.as_mut_slice(), offset)?;
-                    Ok::<_, io::Error>(target)
+                    Ok(BufferHandle::new_host(buffer.freeze()))
                 })
                 .await
-                .map_err(io::Error::other)?;
-            target.into_handle().await
         }
         .boxed()
     }
diff --git a/vortex-io/src/lib.rs b/vortex-io/src/lib.rs
index f7706f63244..d0120108b2e 100644
--- a/vortex-io/src/lib.rs
+++ b/vortex-io/src/lib.rs
@@ -17,7 +17,7 @@ pub use limit::*;
 pub use object_store::*;
 pub use read::*;
 pub use write::*;
-pub use write_target::*;
+pub use read_target::*;
 
 mod allocator;
 pub mod file;
@@ -27,9 +27,9 @@ mod limit;
 #[cfg(feature = "object_store")]
 mod object_store;
 mod read;
+mod read_target;
 pub mod runtime;
 pub mod session;
 #[cfg(feature = "tokio")]
 mod tokio;
 mod write;
-mod write_target;
diff --git a/vortex-io/src/read.rs b/vortex-io/src/read.rs
index 2c7007fbfbf..01f408c3be9 100644
--- a/vortex-io/src/read.rs
+++ b/vortex-io/src/read.rs
@@ -11,12 +11,15 @@ use vortex_buffer::ByteBuffer;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
+use vortex_error::vortex_err;
 use vortex_metrics::Counter;
 use vortex_metrics::Histogram;
 use vortex_metrics::Timer;
 use vortex_metrics::VortexMetrics;
 
-use crate::WriteTarget;
+use crate::BufferAllocator;
+use crate::ReadRegion;
+
 /// Configuration for coalescing nearby I/O requests into single operations.
 #[derive(Clone, Copy, Debug)]
 pub struct CoalesceConfig {
@@ -73,7 +76,7 @@ pub trait VortexReadAt: Send + Sync + 'static {
     /// Asynchronously get the number of bytes of the underlying source.
     fn size(&self) -> BoxFuture<'static, VortexResult<u64>>;
 
-    /// Request an asynchronous positional read. Results will be returned as a [`ByteBuffer`].
+    /// Request an asynchronous positional read. Results will be returned as a [`BufferHandle`].
     ///
     /// If the reader does not have the requested number of bytes, the returned Future will complete
     /// with an [`UnexpectedEof`][std::io::ErrorKind::UnexpectedEof] error.
@@ -82,13 +85,6 @@ pub trait VortexReadAt: Send + Sync + 'static {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>>;
-
-    /// Read into a pre-allocated target buffer.
-    fn read_at_into(
-        &self,
-        offset: u64,
-        target: Box<dyn WriteTarget>,
     ) -> BoxFuture<'static, VortexResult<BufferHandle>>;
 }
 
@@ -114,16 +110,8 @@ impl VortexReadAt for Arc<dyn VortexReadAt> {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
-        self.as_ref().read_at(offset, length, alignment)
-    }
-
-    fn read_at_into(
-        &self,
-        offset: u64,
-        target: Box<dyn WriteTarget>,
     ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
-        self.as_ref().read_at_into(offset, target)
+        self.as_ref().read_at(offset, length, alignment)
     }
 }
 
@@ -149,16 +137,8 @@ impl<R: VortexReadAt> VortexReadAt for Arc<R> {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
-        self.as_ref().read_at(offset, length, alignment)
-    }
-
-    fn read_at_into(
-        &self,
-        offset: u64,
-        target: Box<dyn WriteTarget>,
     ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
-        self.as_ref().read_at_into(offset, target)
+        self.as_ref().read_at(offset, length, alignment)
     }
 
     // fn drive(self: Arc<Self>, requests: BoxStream<'static, IoRequest>) -> BoxFuture<'static, ()> {
@@ -183,7 +163,7 @@ impl VortexReadAt for ByteBuffer {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
         let buffer = self.clone();
         async move {
             let start = usize::try_from(offset).vortex_expect("start too big for usize");
@@ -197,33 +177,9 @@ impl VortexReadAt for ByteBuffer {
                     buffer.len()
                 );
             }
-            Ok(buffer.slice_unaligned(start..end).aligned(alignment))
-        }
-        .boxed()
-    }
-
-    fn read_at_into(
-        &self,
-        offset: u64,
-        mut target: Box<dyn WriteTarget>,
-    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
-        let buffer = self.clone();
-        async move {
-            let start = usize::try_from(offset).vortex_expect("start too big for usize");
-            let end = usize::try_from(offset + target.len() as u64)
-                .vortex_expect("end too big for usize");
-            if end > buffer.len() {
-                vortex_bail!(
-                    "Requested range {}..{} out of bounds for buffer of length {}",
-                    start,
-                    end,
-                    buffer.len()
-                );
-            }
-            target
-                .as_mut_slice()
-                .copy_from_slice(&buffer.as_ref()[start..end]);
-            target.into_handle().await
+            Ok(BufferHandle::new_host(
+                buffer.slice_unaligned(start..end).aligned(alignment),
+            ))
         }
         .boxed()
     }
@@ -238,6 +194,19 @@ pub struct InstrumentedReadAt<T: VortexReadAt + Clone> {
     durations: Arc<Timer>,
 }
 
+/// A wrapper that uses an allocator to produce the returned buffer handle.
+#[derive(Clone)]
+pub struct AllocatingReadAt<T: VortexReadAt + Clone> {
+    read: T,
+    allocator: Arc<dyn BufferAllocator>,
+}
+
+impl<T: VortexReadAt + Clone> AllocatingReadAt<T> {
+    pub fn new(read: T, allocator: Arc<dyn BufferAllocator>) -> Self {
+        Self { read, allocator }
+    }
+}
+
 impl<T: VortexReadAt + Clone> InstrumentedReadAt<T> {
     pub fn new(read: T, metrics: &VortexMetrics) -> Self {
         Self {
@@ -298,7 +267,7 @@ impl<T: VortexReadAt + Clone> VortexReadAt for InstrumentedReadAt<T> {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
         let durations = self.durations.clone();
         let sizes = self.sizes.clone();
         let total_size = self.total_size.clone();
@@ -312,23 +281,54 @@ impl<T: VortexReadAt + Clone> VortexReadAt for InstrumentedReadAt<T> {
         }
         .boxed()
     }
+}
 
-    fn read_at_into(
+impl<T: VortexReadAt + Clone> VortexReadAt for AllocatingReadAt<T> {
+    fn uri(&self) -> Option<&Arc<str>> {
+        self.read.uri()
+    }
+
+    fn coalesce_config(&self) -> Option<CoalesceConfig> {
+        self.read.coalesce_config()
+    }
+
+    fn concurrency(&self) -> usize {
+        self.read.concurrency()
+    }
+
+    fn size(&self) -> BoxFuture<'static, VortexResult<u64>> {
+        self.read.size()
+    }
+
+    fn read_at(
         &self,
         offset: u64,
-        target: Box<dyn WriteTarget>,
+        length: usize,
+        alignment: Alignment,
     ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
-        let durations = self.durations.clone();
-        let sizes = self.sizes.clone();
-        let total_size = self.total_size.clone();
-        let length = target.len();
-        let read_fut = self.read.read_at_into(offset, target);
+        let read = self.read.clone();
+        let allocator = self.allocator.clone();
         async move {
-            let _timer = durations.time();
-            let result = read_fut.await;
-            sizes.update(length as i64);
-            total_size.add(length as i64);
-            result
+            let handle = read.read_at(offset, length, alignment).await?;
+            if handle.is_on_device() {
+                return Ok(handle);
+            }
+
+            let host = handle
+                .as_host_opt()
+                .ok_or_else(|| vortex_err!("expected host buffer"))?;
+            let mut target = allocator.allocate(length, alignment)?;
+            match target.region() {
+                ReadRegion::HostSlice(slice) => {
+                    slice.copy_from_slice(host.as_slice());
+                }
+                ReadRegion::Registered(_) | ReadRegion::Device(_) => {
+                    return Err(vortex_err!(
+                        "AllocatingReadAt does not support non-host read regions"
+                    ));
+                }
+            }
+            target.into_handle().await
         }
         .boxed()
     }
@@ -362,7 +362,7 @@ mod tests {
         let data = ByteBuffer::from(vec![1, 2, 3, 4, 5]);
 
         let result = data.read_at(1, 3, Alignment::none()).await.unwrap();
-        assert_eq!(result.as_ref(), &[2, 3, 4]);
+        assert_eq!(result.to_host_sync().as_ref(), &[2, 3, 4]);
     }
 
     #[tokio::test]
@@ -378,7 +378,7 @@ mod tests {
         let data = Arc::new(ByteBuffer::from(vec![1, 2, 3, 4, 5]));
 
         let result = data.read_at(2, 3, Alignment::none()).await.unwrap();
-        assert_eq!(result.as_ref(), &[3, 4, 5]);
+        assert_eq!(result.to_host_sync().as_ref(), &[3, 4, 5]);
 
         let size = data.size().await.unwrap();
         assert_eq!(size, 5);
diff --git a/vortex-io/src/read_target.rs b/vortex-io/src/read_target.rs
new file mode 100644
index 00000000000..62fd23441b6
--- /dev/null
+++ b/vortex-io/src/read_target.rs
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::marker::PhantomData;
+
+use futures::FutureExt;
+use futures::future::BoxFuture;
+use vortex_array::buffer::BufferHandle;
+use vortex_buffer::ByteBufferMut;
+use vortex_error::VortexResult;
+
+/// A destination memory region for reads.
+pub enum ReadRegion<'a> {
+    /// A standard host slice that can be written by the CPU.
+    HostSlice(&'a mut [u8]),
+    /// A registered host memory region suitable for RDMA writes.
+    Registered(RegisteredRegion<'a>),
+    /// A device memory region suitable for GPU-direct or other device DMA.
+    Device(DeviceRegion<'a>),
+}
+
+/// A registered host memory region suitable for RDMA writes.
+pub struct RegisteredRegion<'a> {
+    pub ptr: *mut u8,
+    pub len: usize,
+    pub lkey: u32,
+    pub rkey: u32,
+    pub(crate) _lifetime: PhantomData<&'a mut [u8]>,
+}
+
+/// A device memory region suitable for device DMA.
+pub struct DeviceRegion<'a> {
+    pub ptr: *mut u8,
+    pub len: usize,
+    pub(crate) _lifetime: PhantomData<&'a mut [u8]>,
+}
+
+/// A destination for I/O reads that can be finalized into a [`BufferHandle`].
+pub trait ReadTarget: Send + 'static {
+    /// Returns the length of the buffer in bytes.
+    fn len(&self) -> usize;
+
+    /// Returns true if the buffer is empty.
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns the writable region for this target.
+    fn region(&mut self) -> ReadRegion<'_>;
+
+    /// Finalize the target into a buffer handle.
+    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>>;
+}
+
+impl ReadTarget for ByteBufferMut {
+    fn len(&self) -> usize {
+        ByteBufferMut::len(self)
+    }
+
+    fn region(&mut self) -> ReadRegion<'_> {
+        ReadRegion::HostSlice(self.as_mut())
+    }
+
+    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        async move { Ok(BufferHandle::new_host(self.freeze())) }.boxed()
+    }
+}
diff --git a/vortex-io/src/runtime/tests.rs b/vortex-io/src/runtime/tests.rs
index 73923607896..928fb476406 100644
--- a/vortex-io/src/runtime/tests.rs
+++ b/vortex-io/src/runtime/tests.rs
@@ -18,7 +18,6 @@ use vortex_buffer::ByteBufferMut;
 use vortex_error::VortexResult;
 
 use crate::VortexReadAt;
-use crate::WriteTarget;
 use crate::file::std_file::FileReadAdapter;
 use crate::runtime::single::block_on;
 use crate::runtime::tokio::TokioRuntime;
@@ -44,7 +43,7 @@ fn test_file_read_with_single_thread_runtime() {
                 .await
                 .unwrap();
             assert_eq!(
-                result.as_slice(),
+                result.to_host_sync().as_slice(),
                 &TEST_DATA[TEST_OFFSET as usize..][..TEST_LEN]
             );
 
@@ -53,7 +52,7 @@ fn test_file_read_with_single_thread_runtime() {
                 .read_at(0, TEST_DATA.len(), Alignment::new(1))
                 .await
                 .unwrap();
-            assert_eq!(full.as_slice(), TEST_DATA);
+            assert_eq!(full.to_host_sync().as_slice(), TEST_DATA);
 
             "success"
         }
@@ -72,7 +71,7 @@ async fn test_file_read_with_tokio_runtime() {
         .await
         .unwrap();
     assert_eq!(
-        result.as_slice(),
+        result.to_host_sync().as_slice(),
         &TEST_DATA[TEST_OFFSET as usize..][..TEST_LEN]
     );
 
@@ -81,7 +80,7 @@ async fn test_file_read_with_tokio_runtime() {
         .read_at(0, TEST_DATA.len(), Alignment::new(1))
         .await
         .unwrap();
-    assert_eq!(full.as_slice(), TEST_DATA);
+    assert_eq!(full.to_host_sync().as_slice(), TEST_DATA);
 }
 
 // ============================================================================
@@ -109,7 +108,7 @@ fn test_file_read_with_real_file_single_thread() {
                 .await
                 .unwrap();
             assert_eq!(
-                result.as_slice(),
+                result.to_host_sync().as_slice(),
                 &TEST_DATA[TEST_OFFSET as usize..][..TEST_LEN]
             );
 
@@ -118,7 +117,7 @@ fn test_file_read_with_real_file_single_thread() {
                 .read_at(0, TEST_DATA.len(), Alignment::new(1))
                 .await
                 .unwrap();
-            assert_eq!(full.as_slice(), TEST_DATA);
+            assert_eq!(full.to_host_sync().as_slice(), TEST_DATA);
 
             "success"
         }
@@ -146,7 +145,7 @@ async fn test_file_read_with_real_file_tokio() {
         .await
         .unwrap();
     assert_eq!(
-        result.as_slice(),
+        result.to_host_sync().as_slice(),
         &TEST_DATA[TEST_OFFSET as usize..][..TEST_LEN]
     );
 
@@ -155,7 +154,7 @@ async fn test_file_read_with_real_file_tokio() {
         .read_at(0, TEST_DATA.len(), Alignment::new(1))
         .await
         .unwrap();
-    assert_eq!(full.as_slice(), TEST_DATA);
+    assert_eq!(full.to_host_sync().as_slice(), TEST_DATA);
 }
 
 // ============================================================================
@@ -176,10 +175,22 @@ async fn test_concurrent_reads() {
 
     let results = futures::future::join_all(futures).await;
 
-    assert_eq!(results[0].as_ref().unwrap().as_slice(), &TEST_DATA[0..5]);
-    assert_eq!(results[1].as_ref().unwrap().as_slice(), &TEST_DATA[5..10]);
-    assert_eq!(results[2].as_ref().unwrap().as_slice(), &TEST_DATA[10..15]);
-    assert_eq!(results[3].as_ref().unwrap().as_slice(), &TEST_DATA[15..20]);
+    assert_eq!(
+        results[0].as_ref().unwrap().to_host_sync().as_slice(),
+        &TEST_DATA[0..5]
+    );
+    assert_eq!(
+        results[1].as_ref().unwrap().to_host_sync().as_slice(),
+        &TEST_DATA[5..10]
+    );
+    assert_eq!(
+        results[2].as_ref().unwrap().to_host_sync().as_slice(),
+        &TEST_DATA[10..15]
+    );
+    assert_eq!(
+        results[3].as_ref().unwrap().to_host_sync().as_slice(),
+        &TEST_DATA[15..20]
+    );
 }
 
 // ============================================================================
@@ -242,7 +253,7 @@ impl VortexReadAt for CountingReadAt {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
         self.read_count.fetch_add(1, Ordering::SeqCst);
         let data = self.data.clone();
         async move {
@@ -255,28 +266,7 @@ impl VortexReadAt for CountingReadAt {
             buffer
                 .as_mut_slice()
                 .copy_from_slice(&data.as_slice()[start..start + length]);
-            Ok(buffer.freeze())
-        }
-        .boxed()
-    }
-
-    fn read_at_into(
-        &self,
-        offset: u64,
-        mut target: Box<dyn WriteTarget>,
-    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
-        self.read_count.fetch_add(1, Ordering::SeqCst);
-        let data = self.data.clone();
-        async move {
-            let start = offset as usize;
-            let length = target.len();
-            if start + length > data.len() {
-                return Err(vortex_error::vortex_err!("Read out of bounds"));
-            }
-            target
-                .as_mut_slice()
-                .copy_from_slice(&data.as_slice()[start..start + length]);
-            target.into_handle().await
+            Ok(BufferHandle::new_host(buffer.freeze()))
         }
         .boxed()
     }
diff --git a/vortex-io/src/write_target.rs b/vortex-io/src/write_target.rs
deleted file mode 100644
index ddffecfa27c..00000000000
--- a/vortex-io/src/write_target.rs
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-use futures::future::BoxFuture;
-use futures::FutureExt;
-use vortex_array::buffer::BufferHandle;
-use vortex_buffer::ByteBufferMut;
-use vortex_error::VortexResult;
-
-/// A destination for I/O reads that can be finalized into a [`BufferHandle`].
-pub trait WriteTarget: Send + 'static {
-    /// Returns the buffer as a mutable slice.
-    fn as_mut_slice(&mut self) -> &mut [u8];
-
-    /// Returns the length of the buffer in bytes.
-    fn len(&self) -> usize;
-
-    /// Returns true if the buffer is empty.
-    fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
-    /// Finalize the target into a buffer handle.
-    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>>;
-}
-
-impl WriteTarget for ByteBufferMut {
-    fn as_mut_slice(&mut self) -> &mut [u8] {
-        self.as_mut()
-    }
-
-    fn len(&self) -> usize {
-        ByteBufferMut::len(self)
-    }
-
-    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>> {
-        async move { Ok(BufferHandle::new_host(self.freeze())) }.boxed()
-    }
-}

From d665f60ef09ff0e02d12c507a6ac71e844b6e7db Mon Sep 17 00:00:00 2001
From: Onur Satici <onur@spiraldb.com>
Date: Tue, 27 Jan 2026 17:47:48 +0000
Subject: [PATCH 4/5] rename ReadTarget/ReadRegion to WriteDestination/WriteRegion

Signed-off-by: Onur Satici <onur@spiraldb.com>
---
 vortex-cuda/src/host_to_device_allocator.rs      | 16 ++++++++++------
 vortex-io/src/allocator.rs                       |  6 +++---
 vortex-io/src/lib.rs                             |  4 ++--
 vortex-io/src/read.rs                            |  6 +++---
 .../src/{read_target.rs => write_destination.rs} | 14 +++++++-------
 5 files changed, 25 insertions(+), 21 deletions(-)
 rename vortex-io/src/{read_target.rs => write_destination.rs} (85%)

diff --git a/vortex-cuda/src/host_to_device_allocator.rs b/vortex-cuda/src/host_to_device_allocator.rs
index add76ebf7d2..3b37c06f260 100644
--- a/vortex-cuda/src/host_to_device_allocator.rs
+++ b/vortex-cuda/src/host_to_device_allocator.rs
@@ -14,8 +14,8 @@ use vortex_buffer::ByteBufferMut;
 use vortex_error::VortexResult;
 use vortex_error::vortex_err;
 use vortex_io::BufferAllocator;
-use vortex_io::ReadRegion;
-use vortex_io::ReadTarget;
+use vortex_io::WriteDestination;
+use vortex_io::WriteRegion;
 use vortex_session::VortexSession;
 
 use crate::device_buffer::CudaDeviceBuffer;
@@ -39,7 +39,11 @@ impl HostToDeviceAllocator {
 }
 
 impl BufferAllocator for HostToDeviceAllocator {
-    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn ReadTarget>> {
+    fn allocate(
+        &self,
+        len: usize,
+        alignment: Alignment,
+    ) -> VortexResult<Box<dyn WriteDestination>> {
         let mut buffer = ByteBufferMut::with_capacity_aligned(len, alignment);
         unsafe { buffer.set_len(len) };
         Ok(Box::new(NaiveDeviceWriteTarget {
@@ -56,13 +60,13 @@ struct NaiveDeviceWriteTarget {
     alignment: Alignment,
 }
 
-impl ReadTarget for NaiveDeviceWriteTarget {
+impl WriteDestination for NaiveDeviceWriteTarget {
     fn len(&self) -> usize {
         self.buffer.len()
     }
 
-    fn region(&mut self) -> ReadRegion<'_> {
-        ReadRegion::HostSlice(self.buffer.as_mut())
+    fn region(&mut self) -> WriteRegion<'_> {
+        WriteRegion::HostSlice(self.buffer.as_mut())
     }
 
     fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>> {
diff --git a/vortex-io/src/allocator.rs b/vortex-io/src/allocator.rs
index 2fa2d56844d..b042fad8dd1 100644
--- a/vortex-io/src/allocator.rs
+++ b/vortex-io/src/allocator.rs
@@ -8,12 +8,12 @@ use vortex_buffer::Alignment;
 use vortex_buffer::ByteBufferMut;
 use vortex_error::VortexResult;
 
-use crate::ReadTarget;
+use crate::WriteDestination;
 
 /// Allocates buffers for I/O reads.
 pub trait BufferAllocator: Send + Sync + 'static {
     /// Allocate a buffer for the requested length and alignment.
-    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn ReadTarget>>;
+    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn WriteDestination>>;
 }
 
 /// The default allocator that uses `ByteBufferMut`.
@@ -42,7 +42,7 @@ pub fn reset_default_alloc_stats() {
 }
 
 impl BufferAllocator for DefaultAllocator {
-    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn ReadTarget>> {
+    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn WriteDestination>> {
         DEFAULT_ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
         DEFAULT_ALLOC_BYTES.fetch_add(len as u64, Ordering::Relaxed);
         let mut buffer = ByteBufferMut::with_capacity_aligned(len, alignment);
diff --git a/vortex-io/src/lib.rs b/vortex-io/src/lib.rs
index d0120108b2e..afc3598cf01 100644
--- a/vortex-io/src/lib.rs
+++ b/vortex-io/src/lib.rs
@@ -17,7 +17,7 @@ pub use limit::*;
 pub use object_store::*;
 pub use read::*;
 pub use write::*;
-pub use read_target::*;
+pub use write_destination::*;
 
 mod allocator;
 pub mod file;
@@ -27,7 +27,7 @@ mod limit;
 #[cfg(feature = "object_store")]
 mod object_store;
 mod read;
-mod read_target;
+mod write_destination;
 pub mod runtime;
 pub mod session;
 #[cfg(feature = "tokio")]
diff --git a/vortex-io/src/read.rs b/vortex-io/src/read.rs
index 01f408c3be9..858599d3060 100644
--- a/vortex-io/src/read.rs
+++ b/vortex-io/src/read.rs
@@ -18,7 +18,7 @@ use vortex_metrics::Timer;
 use vortex_metrics::VortexMetrics;
 
 use crate::BufferAllocator;
-use crate::ReadRegion;
+use crate::WriteRegion;
 
 /// Configuration for coalescing nearby I/O requests into single operations.
 #[derive(Clone, Copy, Debug)]
@@ -319,10 +319,10 @@ impl<T: VortexReadAt + Clone> VortexReadAt for AllocatingReadAt<T> {
                 .ok_or_else(|| vortex_err!("expected host buffer"))?;
             let mut target = allocator.allocate(length, alignment)?;
             match target.region() {
-                ReadRegion::HostSlice(slice) => {
+                WriteRegion::HostSlice(slice) => {
                     slice.copy_from_slice(host.as_slice());
                 }
-                ReadRegion::Registered(_) | ReadRegion::Device(_) => {
+                WriteRegion::Registered(_) | WriteRegion::Device(_) => {
                     return Err(vortex_err!(
                         "AllocatingReadAt does not support non-host read regions"
                     ));
diff --git a/vortex-io/src/read_target.rs b/vortex-io/src/write_destination.rs
similarity index 85%
rename from vortex-io/src/read_target.rs
rename to vortex-io/src/write_destination.rs
index 62fd23441b6..ea811005f11 100644
--- a/vortex-io/src/read_target.rs
+++ b/vortex-io/src/write_destination.rs
@@ -9,8 +9,8 @@ use vortex_array::buffer::BufferHandle;
 use vortex_buffer::ByteBufferMut;
 use vortex_error::VortexResult;
 
-/// A destination memory region for reads.
-pub enum ReadRegion<'a> {
+/// A destination memory region for writes.
+pub enum WriteRegion<'a> {
     /// A standard host slice that can be written by the CPU.
     HostSlice(&'a mut [u8]),
     /// A registered host memory region suitable for RDMA writes.
@@ -36,7 +36,7 @@ pub struct DeviceRegion<'a> {
 }
 
 /// A destination for I/O reads that can be finalized into a [`BufferHandle`].
-pub trait ReadTarget: Send + 'static {
+pub trait WriteDestination: Send + 'static {
     /// Returns the length of the buffer in bytes.
     fn len(&self) -> usize;
 
@@ -46,19 +46,19 @@ pub trait ReadTarget: Send + 'static {
     }
 
     /// Returns the writable region for this target.
-    fn region(&mut self) -> ReadRegion<'_>;
+    fn region(&mut self) -> WriteRegion<'_>;
 
     /// Finalize the target into a buffer handle.
     fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>>;
 }
 
-impl ReadTarget for ByteBufferMut {
+impl WriteDestination for ByteBufferMut {
     fn len(&self) -> usize {
         ByteBufferMut::len(self)
     }
 
-    fn region(&mut self) -> ReadRegion<'_> {
-        ReadRegion::HostSlice(self.as_mut())
+    fn region(&mut self) -> WriteRegion<'_> {
+        WriteRegion::HostSlice(self.as_mut())
     }
 
     fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>> {

From 069d998a45bdcbe39d2d3f9f760bd4124daaf96b Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Wed, 28 Jan 2026 11:38:07 +0000
Subject: [PATCH 5/5] update

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/src/serde.rs                   | 11 ++++-------
 vortex-cuda/benches/dict_cuda.rs            | 12 ++++--------
 vortex-cuda/src/device_buffer.rs            | 16 ++++++++++++----
 vortex-cuda/src/executor.rs                 |  8 +++++++-
 vortex-cuda/src/host_to_device_allocator.rs |  9 +++++----
 vortex-cuda/src/kernel/arrays/dict.rs       |  4 ++--
 vortex-cuda/src/kernel/encodings/alp.rs     |  2 +-
 vortex-cuda/src/kernel/encodings/zstd.rs    |  2 +-
 vortex-layout/src/layouts/chunked/reader.rs |  5 ++++-
 vortex-layout/src/layouts/flat/reader.rs    |  2 ++
 vortex-layout/src/layouts/struct_/reader.rs |  9 ++++++---
 11 files changed, 48 insertions(+), 32 deletions(-)

diff --git a/vortex-array/src/serde.rs b/vortex-array/src/serde.rs
index 87b786b3717..cba1f2fd0df 100644
--- a/vortex-array/src/serde.rs
+++ b/vortex-array/src/serde.rs
@@ -503,7 +503,7 @@ impl ArrayParts {
             let flatbuffer_loc = fb_root._tab.loc();
 
             let mut offset = 0;
-            let buffers: VortexResult<Vec<_>> = fb_array
+            let buffers = fb_array
                 .buffers()
                 .unwrap_or_default()
                 .iter()
@@ -515,15 +515,12 @@ impl ArrayParts {
 
                     // Extract a buffer and ensure it's aligned, copying if necessary
                     let buffer = segment.slice(offset..(offset + buffer_len));
-                    let buffer = buffer.ensure_aligned(Alignment::from_exponent(
-                        fb_buf.alignment_exponent(),
-                    ))?;
+                    let buffer = buffer
+                        .ensure_aligned(Alignment::from_exponent(fb_buf.alignment_exponent()))?;
                     offset += buffer_len;
                     Ok(buffer)
                 })
-                .collect();
-            let buffers: Arc<[_]> = buffers?.into();
-
+                .collect::<VortexResult<Arc<[_]>>>()?;
             (flatbuffer_loc, buffers)
         };
 
diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs
index 779e0258778..ecadf1b0955 100644
--- a/vortex-cuda/benches/dict_cuda.rs
+++ b/vortex-cuda/benches/dict_cuda.rs
@@ -145,8 +145,7 @@ fn benchmark_dict_u32_u8(c: &mut Criterion) {
                         let output_slice = cuda_ctx
                             .device_alloc::<u32>(dict_array.len())
                             .vortex_expect("failed to allocate output");
-                        let output_device =
-                            CudaDeviceBuffer::new(output_slice, Alignment::of::<u32>());
+                        let output_device = CudaDeviceBuffer::new(output_slice);
 
                         let kernel_time = launch_dict_kernel_timed(
                             codes_device
@@ -209,8 +208,7 @@ fn benchmark_dict_u32_u16(c: &mut Criterion) {
                         let output_slice = cuda_ctx
                             .device_alloc::<u32>(dict_array.len())
                             .vortex_expect("failed to allocate output");
-                        let output_device =
-                            CudaDeviceBuffer::new(output_slice, Alignment::of::<u32>());
+                        let output_device = CudaDeviceBuffer::new(output_slice);
 
                         let kernel_time = launch_dict_kernel_timed(
                             codes_device
@@ -273,8 +271,7 @@ fn benchmark_dict_u64_u8(c: &mut Criterion) {
                         let output_slice = cuda_ctx
                             .device_alloc::<u64>(dict_array.len())
                             .vortex_expect("failed to allocate output");
-                        let output_device =
-                            CudaDeviceBuffer::new(output_slice, Alignment::of::<u64>());
+                        let output_device = CudaDeviceBuffer::new(output_slice);
 
                         let kernel_time = launch_dict_kernel_timed(
                             codes_device
@@ -337,8 +334,7 @@ fn benchmark_dict_u64_u32(c: &mut Criterion) {
                         let output_slice = cuda_ctx
                             .device_alloc::<u64>(dict_array.len())
                             .vortex_expect("failed to allocate output");
-                        let output_device =
-                            CudaDeviceBuffer::new(output_slice, Alignment::of::<u64>());
+                        let output_device = CudaDeviceBuffer::new(output_slice);
 
                         let kernel_time = launch_dict_kernel_timed(
                             codes_device
diff --git a/vortex-cuda/src/device_buffer.rs b/vortex-cuda/src/device_buffer.rs
index c7beec10cce..f97b032935c 100644
--- a/vortex-cuda/src/device_buffer.rs
+++ b/vortex-cuda/src/device_buffer.rs
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use std::cmp::min;
 use std::fmt::Debug;
 use std::ops::Range;
 use std::sync::Arc;
@@ -16,6 +17,7 @@ use vortex_array::buffer::DeviceBuffer;
 use vortex_buffer::Alignment;
 use vortex_buffer::BufferMut;
 use vortex_buffer::ByteBuffer;
+use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_error::vortex_err;
 
@@ -32,7 +34,12 @@ pub struct CudaDeviceBuffer<T> {
 
 impl<T: DeviceRepr> CudaDeviceBuffer<T> {
     /// Creates a new CUDA device buffer from a [`CudaSlice`].
-    pub fn new(cuda_slice: CudaSlice<T>, alignment: Alignment) -> Self {
+    pub fn new(cuda_slice: CudaSlice<T>) -> Self {
+        Self::new_aligned(cuda_slice, Alignment::of::<T>())
+    }
+
+    pub fn new_aligned(cuda_slice: CudaSlice<T>, alignment: Alignment) -> Self {
+        assert!(alignment.is_aligned_to(Alignment::of::<T>()));
         let len = cuda_slice.len();
         let device_ptr = cuda_slice.device_ptr(cuda_slice.stream()).0;
 
@@ -195,9 +202,10 @@ impl<T: DeviceRepr + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T>
         let alignment = if byte_offset == 0 {
             self.alignment
         } else {
-            let offset_alignment = 1usize << byte_offset.trailing_zeros();
-            let max_alignment = *self.alignment;
-            Alignment::new(offset_alignment.min(max_alignment))
+            Alignment::from_exponent(
+                u8::try_from((self.device_ptr + byte_offset as u64).trailing_zeros())
+                    .vortex_expect("impossible"),
+            )
         };
 
         assert!(
diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs
index eb34d412a61..dc7a6e1bcc7 100644
--- a/vortex-cuda/src/executor.rs
+++ b/vortex-cuda/src/executor.rs
@@ -23,6 +23,7 @@ use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
 use vortex_buffer::Buffer;
 use vortex_dtype::PType;
+use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_error::vortex_err;
 
@@ -177,7 +178,12 @@ impl CudaExecutionCtx {
                 .map_err(|e| vortex_err!("Failed to schedule async copy to device: {}", e))?;
         }
 
-        let cuda_buf = CudaDeviceBuffer::new(cuda_slice, Alignment::of::<T>());
+        let cuda_buf = CudaDeviceBuffer::new_aligned(
+            cuda_slice,
+            Alignment::from_exponent(
+                u8::try_from(device_ptr.trailing_zeros()).vortex_expect("aligment over 2^2^8??"),
+            ),
+        );
         let stream = Arc::clone(&self.stream);
 
         Ok(Box::pin(async move {
diff --git a/vortex-cuda/src/host_to_device_allocator.rs b/vortex-cuda/src/host_to_device_allocator.rs
index 3b37c06f260..d7d7147d317 100644
--- a/vortex-cuda/src/host_to_device_allocator.rs
+++ b/vortex-cuda/src/host_to_device_allocator.rs
@@ -6,8 +6,8 @@ use std::sync::Arc;
 use cudarc::driver::CudaStream;
 use cudarc::driver::DevicePtrMut;
 use cudarc::driver::result::memcpy_htod_async;
-use futures::future::BoxFuture;
 use futures::FutureExt;
+use futures::future::BoxFuture;
 use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
 use vortex_buffer::ByteBufferMut;
@@ -45,6 +45,7 @@ impl BufferAllocator for HostToDeviceAllocator {
         alignment: Alignment,
     ) -> VortexResult<Box<dyn WriteDestination>> {
         let mut buffer = ByteBufferMut::with_capacity_aligned(len, alignment);
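+        // SAFETY(review): `len` equals the requested capacity, so the length is in bounds; the bytes are presumably uninitialized and must be fully written before any read. TODO confirm.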
         unsafe { buffer.set_len(len) };
         Ok(Box::new(NaiveDeviceWriteTarget {
             buffer,
@@ -90,9 +91,9 @@ impl WriteDestination for NaiveDeviceWriteTarget {
             // Keep the host buffer alive until the copy completes.
             let _keep_alive = host;
 
-            Ok(BufferHandle::new_device(Arc::new(CudaDeviceBuffer::new(
-                device, alignment,
-            ))))
+            Ok(BufferHandle::new_device(Arc::new(
+                CudaDeviceBuffer::new_aligned(device, alignment),
+            )))
         }
         .boxed()
     }
diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs
index 084d9f5bdce..2ef5933a327 100644
--- a/vortex-cuda/src/kernel/arrays/dict.rs
+++ b/vortex-cuda/src/kernel/arrays/dict.rs
@@ -117,7 +117,7 @@ async fn execute_dict_prim_typed<V: DeviceRepr + NativePType, I: DeviceRepr + Na
 
     // Allocate output buffer on device
     let output_slice = ctx.device_alloc::<V>(codes_len)?;
-    let output_device = CudaDeviceBuffer::new(output_slice, Alignment::of::<V>());
+    let output_device = CudaDeviceBuffer::new(output_slice);
 
     // Get views for kernel launch
     let values_view = values_device.cuda_view::<V>()?;
@@ -226,7 +226,7 @@ async fn execute_dict_decimal_typed<
 
     // Allocate output buffer on device (codes_len * value_byte_width bytes)
     let output_slice = ctx.device_alloc::<V>(codes_len)?;
-    let output_device = CudaDeviceBuffer::new(output_slice, Alignment::of::<V>());
+    let output_device = CudaDeviceBuffer::new(output_slice);
 
     // Get views for kernel launch
     let values_view = values_device.cuda_view::<V>()?;
diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs
index 57bfc826dda..b6095bb2464 100644
--- a/vortex-cuda/src/kernel/encodings/alp.rs
+++ b/vortex-cuda/src/kernel/encodings/alp.rs
@@ -84,7 +84,7 @@ where
 
     // Allocate output buffer
     let output_slice = ctx.device_alloc::<A>(array_len)?;
-    let output_buf = CudaDeviceBuffer::new(output_slice, Alignment::of::<A>());
+    let output_buf = CudaDeviceBuffer::new(output_slice);
     let output_view = output_buf.as_view();
 
     let array_len_u64 = array_len as u64;
diff --git a/vortex-cuda/src/kernel/encodings/zstd.rs b/vortex-cuda/src/kernel/encodings/zstd.rs
index 1aa8b2741fa..508990897d8 100644
--- a/vortex-cuda/src/kernel/encodings/zstd.rs
+++ b/vortex-cuda/src/kernel/encodings/zstd.rs
@@ -269,7 +269,7 @@ async fn decode_zstd(array: ZstdArray, ctx: &mut CudaExecutionCtx) -> VortexResu
     // self-contained. They neither have any parent or child encodings.
     //
     // TODO(0ax1): Don't copy back to host once VarBinView supports buffer handles.
-    let host_buffer = CudaDeviceBuffer::new(exec.device_output, Alignment::of::<u8>())
+    let host_buffer = CudaDeviceBuffer::new(exec.device_output)
         .copy_to_host(Alignment::new(1))?
         .await?;
 
diff --git a/vortex-layout/src/layouts/chunked/reader.rs b/vortex-layout/src/layouts/chunked/reader.rs
index d5bc645ff94..2eef7d25986 100644
--- a/vortex-layout/src/layouts/chunked/reader.rs
+++ b/vortex-layout/src/layouts/chunked/reader.rs
@@ -301,7 +301,10 @@ impl LayoutReader for ChunkedReader {
             }
 
             // Combine the arrays.
-            Ok(ChunkedArray::try_new(chunks, dtype)?.to_array())
+            let x = ChunkedArray::try_new(chunks, dtype)?.to_array();
+            println!("{}", x.display_tree());
+
+            Ok(x)
         }
         .boxed())
     }
diff --git a/vortex-layout/src/layouts/flat/reader.rs b/vortex-layout/src/layouts/flat/reader.rs
index 786bc0c6f42..923877e2474 100644
--- a/vortex-layout/src/layouts/flat/reader.rs
+++ b/vortex-layout/src/layouts/flat/reader.rs
@@ -214,6 +214,8 @@ impl LayoutReader for FlatReader {
             // Evaluate the projection expression.
             array = array.apply(&expr)?;
 
+            println!("array {}", array.display_tree());
+
             Ok(array)
         }
         .boxed())
diff --git a/vortex-layout/src/layouts/struct_/reader.rs b/vortex-layout/src/layouts/struct_/reader.rs
index 81096e8212a..3b74d7bd603 100644
--- a/vortex-layout/src/layouts/struct_/reader.rs
+++ b/vortex-layout/src/layouts/struct_/reader.rs
@@ -346,7 +346,7 @@ impl LayoutReader for StructReader {
                 let mask = Mask::from_buffer(validity.to_bool().to_bit_buffer().not());
 
                 // If root expression was a pack, then we apply the validity to each child field
-                if is_pack_merge {
+                let res = if is_pack_merge {
                     let struct_array = array.to_struct();
                     let masked_fields: Vec<ArrayRef> = struct_array
                         .unmasked_fields()
@@ -365,9 +365,12 @@ impl LayoutReader for StructReader {
                     // If the root expression was not a pack or merge, e.g. if it's something like
                     // a get_item, then we apply the validity directly to the result
                     vortex_array::compute::mask(array.as_ref(), &mask)
-                }
+                };
+                res
             } else {
-                projected.await
+                projected
+                    .await
+                    .inspect(|a| println!("ret array {}", a.display_tree()))
             }
         }))
     }