diff --git a/Cargo.lock b/Cargo.lock
index 62a1aa354cd..cb1f7236af5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10350,6 +10350,7 @@ dependencies = [
  "vortex-dtype",
  "vortex-error",
  "vortex-fastlanes",
+ "vortex-io",
  "vortex-mask",
  "vortex-nvcomp",
  "vortex-scalar",
@@ -10688,6 +10689,7 @@ dependencies = [
  "tempfile",
  "tokio",
  "tracing",
+ "vortex-array",
  "vortex-buffer",
  "vortex-error",
  "vortex-metrics",
diff --git a/vortex-array/src/arrays/decimal/vtable/mod.rs b/vortex-array/src/arrays/decimal/vtable/mod.rs
index f253902813f..7ae29978423 100644
--- a/vortex-array/src/arrays/decimal/vtable/mod.rs
+++ b/vortex-array/src/arrays/decimal/vtable/mod.rs
@@ -107,13 +107,11 @@ impl VTable for DecimalVTable {
 
         match_each_decimal_value_type!(metadata.values_type(), |D| {
             // Check and reinterpret-cast the buffer
-            if let Some(buffer) = values.as_host_opt() {
-                vortex_ensure!(
-                    buffer.is_aligned(Alignment::of::<D>()),
-                    "DecimalArray buffer not aligned for values type {:?}",
-                    D::DECIMAL_TYPE
-                );
-            }
+            vortex_ensure!(
+                values.alignment().is_aligned_to(Alignment::of::<D>()),
+                "DecimalArray buffer not aligned for values type {:?}",
+                D::DECIMAL_TYPE
+            );
             DecimalArray::try_new_handle(values, metadata.values_type(), *decimal_dtype, validity)
         })
     }
diff --git a/vortex-array/src/arrays/primitive/vtable/mod.rs b/vortex-array/src/arrays/primitive/vtable/mod.rs
index cf702ca3dad..459f24b433e 100644
--- a/vortex-array/src/arrays/primitive/vtable/mod.rs
+++ b/vortex-array/src/arrays/primitive/vtable/mod.rs
@@ -101,16 +101,13 @@ impl VTable for PrimitiveVTable {
             );
         }
 
-        // For host buffers, we eagerly check alignment on construction.
-        // TODO(aduffy): check for device buffers. CUDA buffers are generally 256-byte aligned,
-        //  but not sure about other devices.
-        if let Some(host_buf) = buffer.as_host_opt() {
-            vortex_ensure!(
-                host_buf.is_aligned(Alignment::new(ptype.byte_width())),
-                "PrimitiveArray::build: Buffer must be aligned to {}",
-                ptype.byte_width()
-            );
-        }
+        vortex_ensure!(
+            buffer
+                .alignment()
+                .is_aligned_to(Alignment::new(ptype.byte_width())),
+            "PrimitiveArray::build: Buffer must be aligned to {}",
+            ptype.byte_width()
+        );
 
         // SAFETY: checked ahead of time
         unsafe {
diff --git a/vortex-array/src/buffer.rs b/vortex-array/src/buffer.rs
index cf209b75f80..35cbf5a233b 100644
--- a/vortex-array/src/buffer.rs
+++ b/vortex-array/src/buffer.rs
@@ -14,6 +14,7 @@ use vortex_buffer::Alignment;
 use vortex_buffer::ByteBuffer;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
 use vortex_utils::dyn_traits::DynEq;
 use vortex_utils::dyn_traits::DynHash;
 
@@ -50,6 +51,9 @@ pub trait DeviceBuffer: 'static + Send + Sync + Debug + DynEq + DynHash {
     /// Returns the length of the buffer in bytes.
     fn len(&self) -> usize;
 
+    /// Returns the alignment of the buffer.
+    fn alignment(&self) -> Alignment;
+
     /// Returns true if the buffer is empty.
     fn is_empty(&self) -> bool {
         self.len() == 0
@@ -130,6 +134,40 @@ impl BufferHandle {
         }
     }
 
+    /// Reports the alignment of the underlying host or device buffer.
+    pub fn alignment(&self) -> Alignment {
+        match &self.0 {
+            Inner::Device(dev) => dev.alignment(),
+            Inner::Host(host) => host.alignment(),
+        }
+    }
+
+    /// Returns true if this buffer's reported alignment satisfies `alignment`.
+    pub fn is_aligned(&self, alignment: Alignment) -> bool {
+        self.alignment().is_aligned_to(alignment)
+    }
+
+    /// Return a handle whose buffer satisfies `alignment`.
+    ///
+    /// Host buffers are re-aligned (copied if necessary); device buffers are never copied and
+    /// produce an error when their alignment does not satisfy the request.
+    pub fn ensure_aligned(&self, alignment: Alignment) -> VortexResult<Self> {
+        let device = match &self.0 {
+            Inner::Host(host) => {
+                return Ok(BufferHandle::new_host(host.clone().aligned(alignment)));
+            }
+            Inner::Device(device) => device,
+        };
+        if device.alignment().is_aligned_to(alignment) {
+            return Ok(self.clone());
+        }
+        vortex_bail!(
+            "Device buffer alignment {} does not satisfy required alignment {}",
+            device.alignment(),
+            alignment
+        );
+    }
+
     /// Check if the buffer is empty.
     pub fn is_empty(&self) -> bool {
         self.len() == 0
diff --git a/vortex-array/src/serde.rs b/vortex-array/src/serde.rs
index a3aa423b177..cba1f2fd0df 100644
--- a/vortex-array/src/serde.rs
+++ b/vortex-array/src/serde.rs
@@ -490,10 +490,9 @@ impl ArrayParts {
         array_tree: ByteBuffer,
         segment: BufferHandle,
     ) -> VortexResult<Self> {
-        // TODO: this can also work with device buffers.
-        let segment = segment.try_to_host_sync()?;
-        // We align each buffer individually, so we remove alignment requirements on the buffer.
-        let segment = segment.aligned(Alignment::none());
+        // We align each buffer individually, so we remove alignment requirements on the segment
+        // for host-resident buffers. Device buffers are sliced directly.
+        let segment = segment.ensure_aligned(Alignment::none())?;
 
         let fb_buffer = FlatBuffer::align_from(array_tree);
 
@@ -504,7 +503,7 @@ impl ArrayParts {
             let flatbuffer_loc = fb_root._tab.loc();
 
             let mut offset = 0;
-            let buffers: Arc<[_]> = fb_array
+            let buffers = fb_array
                 .buffers()
                 .unwrap_or_default()
                 .iter()
@@ -515,15 +514,13 @@ impl ArrayParts {
                     let buffer_len = fb_buf.length() as usize;
 
                     // Extract a buffer and ensure it's aligned, copying if necessary
-                    let buffer = segment
-                        .slice(offset..(offset + buffer_len))
-                        .aligned(Alignment::from_exponent(fb_buf.alignment_exponent()));
-
+                    let buffer = segment.slice(offset..(offset + buffer_len));
+                    let buffer = buffer
+                        .ensure_aligned(Alignment::from_exponent(fb_buf.alignment_exponent()))?;
                     offset += buffer_len;
-                    BufferHandle::new_host(buffer)
+                    Ok(buffer)
                 })
-                .collect();
-
+                .collect::<VortexResult<Arc<[_]>>>()?;
             (flatbuffer_loc, buffers)
         };
 
diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml
index a7b7bd6d4b0..0e8fe6fc58c 100644
--- a/vortex-cuda/Cargo.toml
+++ b/vortex-cuda/Cargo.toml
@@ -35,6 +35,7 @@ vortex-error = { workspace = true }
 vortex-fastlanes = { workspace = true }
 vortex-mask = { workspace = true }
 vortex-nvcomp = { path = "nvcomp" }
+vortex-io = { workspace = true }
 vortex-session = { workspace = true }
 vortex-utils = { workspace = true }
 vortex-zigzag = { workspace = true }
diff --git a/vortex-cuda/benches/dict_cuda.rs b/vortex-cuda/benches/dict_cuda.rs
index d22b1acc326..ecadf1b0955 100644
--- a/vortex-cuda/benches/dict_cuda.rs
+++ b/vortex-cuda/benches/dict_cuda.rs
@@ -17,6 +17,7 @@ use vortex_array::IntoArray;
 use vortex_array::arrays::DictArray;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::validity::Validity::NonNullable;
+use vortex_buffer::Alignment;
 use vortex_buffer::Buffer;
 use vortex_cuda::CudaBufferExt;
 use vortex_cuda::CudaDeviceBuffer;
diff --git a/vortex-cuda/src/device_buffer.rs b/vortex-cuda/src/device_buffer.rs
index 2cc6517324d..f97b032935c 100644
--- a/vortex-cuda/src/device_buffer.rs
+++ b/vortex-cuda/src/device_buffer.rs
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use std::cmp::min;
 use std::fmt::Debug;
 use std::ops::Range;
 use std::sync::Arc;
@@ -16,6 +17,7 @@ use vortex_array::buffer::DeviceBuffer;
 use vortex_buffer::Alignment;
 use vortex_buffer::BufferMut;
 use vortex_buffer::ByteBuffer;
+use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_error::vortex_err;
 
@@ -27,11 +29,17 @@ pub struct CudaDeviceBuffer<T> {
     offset: usize,
     len: usize,
     device_ptr: u64,
+    alignment: Alignment,
 }
 
 impl<T: DeviceRepr> CudaDeviceBuffer<T> {
     /// Creates a new CUDA device buffer from a [`CudaSlice`].
     pub fn new(cuda_slice: CudaSlice<T>) -> Self {
+        Self::new_aligned(cuda_slice, Alignment::of::<T>())
+    }
+
+    pub fn new_aligned(cuda_slice: CudaSlice<T>, alignment: Alignment) -> Self {
+        assert!(alignment.is_aligned_to(Alignment::of::<T>()));
         let len = cuda_slice.len();
         let device_ptr = cuda_slice.device_ptr(cuda_slice.stream()).0;
 
@@ -40,6 +48,7 @@ impl<T: DeviceRepr> CudaDeviceBuffer<T> {
             offset: 0,
             len,
             device_ptr,
+            alignment,
         }
     }
 
@@ -109,6 +118,10 @@ impl<T: DeviceRepr + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T>
         self.len * size_of::<T>()
     }
 
+    fn alignment(&self) -> Alignment {
+        self.alignment
+    }
+
     /// Synchronous copy of CUDA device to host memory.
     ///
     /// The copy is not started before other operations on the streams are completed.
@@ -185,6 +198,15 @@ impl<T: DeviceRepr + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T>
     fn slice(&self, range: Range<usize>) -> Arc<dyn DeviceBuffer> {
         let new_offset = self.offset + range.start;
         let new_len = range.end - range.start;
+        let byte_offset = new_offset * size_of::<T>();
+        let alignment = if byte_offset == 0 {
+            self.alignment
+        } else {
+            Alignment::from_exponent(
+                u8::try_from((self.device_ptr + byte_offset as u64).trailing_zeros())
+                    .vortex_expect("impossible"),
+            )
+        };
 
         assert!(
             range.end <= self.len,
@@ -198,6 +220,7 @@ impl<T: DeviceRepr + Send + Sync + 'static> DeviceBuffer for CudaDeviceBuffer<T>
             offset: new_offset,
             len: new_len,
             device_ptr: self.device_ptr,
+            alignment,
         })
     }
 
diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs
index 63b2af86675..dc7a6e1bcc7 100644
--- a/vortex-cuda/src/executor.rs
+++ b/vortex-cuda/src/executor.rs
@@ -20,8 +20,10 @@ use vortex_array::ArrayRef;
 use vortex_array::Canonical;
 use vortex_array::ExecutionCtx;
 use vortex_array::buffer::BufferHandle;
+use vortex_buffer::Alignment;
 use vortex_buffer::Buffer;
 use vortex_dtype::PType;
+use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_error::vortex_err;
 
@@ -176,7 +178,12 @@ impl CudaExecutionCtx {
                 .map_err(|e| vortex_err!("Failed to schedule async copy to device: {}", e))?;
         }
 
-        let cuda_buf = CudaDeviceBuffer::new(cuda_slice);
+        let cuda_buf = CudaDeviceBuffer::new_aligned(
+            cuda_slice,
+            Alignment::from_exponent(
+                u8::try_from(device_ptr.trailing_zeros()).vortex_expect("aligment over 2^2^8??"),
+            ),
+        );
         let stream = Arc::clone(&self.stream);
 
         Ok(Box::pin(async move {
diff --git a/vortex-cuda/src/host_to_device_allocator.rs b/vortex-cuda/src/host_to_device_allocator.rs
new file mode 100644
index 00000000000..d7d7147d317
--- /dev/null
+++ b/vortex-cuda/src/host_to_device_allocator.rs
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Buffer allocator that stages reads in host memory before copying them to a CUDA device.
+
+use std::sync::Arc;
+
+use cudarc::driver::CudaStream;
+use cudarc::driver::DevicePtrMut;
+use cudarc::driver::result::memcpy_htod_async;
+use futures::FutureExt;
+use futures::future::BoxFuture;
+use vortex_array::buffer::BufferHandle;
+use vortex_buffer::Alignment;
+use vortex_buffer::ByteBufferMut;
+use vortex_error::VortexResult;
+use vortex_error::vortex_err;
+use vortex_io::BufferAllocator;
+use vortex_io::WriteDestination;
+use vortex_io::WriteRegion;
+use vortex_session::VortexSession;
+
+use crate::device_buffer::CudaDeviceBuffer;
+use crate::session::CudaSessionExt;
+use crate::stream::await_stream_callback;
+
+/// Allocator that reads into host buffers and copies to device memory.
+///
+/// Every write destination it hands out shares the allocator's CUDA stream.
+pub struct HostToDeviceAllocator {
+    stream: Arc<CudaStream>,
+}
+
+impl HostToDeviceAllocator {
+    /// Creates an allocator that issues its device allocations and copies on `stream`.
+    pub fn new(stream: Arc<CudaStream>) -> Self {
+        Self { stream }
+    }
+
+    /// Creates an allocator on a new stream obtained from the session's CUDA session.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error from `new_stream` if the stream cannot be created.
+    pub fn from_session(session: &VortexSession) -> VortexResult<Self> {
+        let stream = session.cuda_session().new_stream()?;
+        Ok(Self::new(stream))
+    }
+}
+
+impl BufferAllocator for HostToDeviceAllocator {
+    /// Allocates a host staging buffer of `len` bytes with the requested `alignment`.
+    fn allocate(
+        &self,
+        len: usize,
+        alignment: Alignment,
+    ) -> VortexResult<Box<dyn WriteDestination>> {
+        let mut buffer = ByteBufferMut::with_capacity_aligned(len, alignment);
+        // SAFETY: the buffer was just created with capacity for `len` bytes. The bytes are
+        // exposed uninitialized, so the read must overwrite the whole region before it is used.
+        // NOTE(review): nothing visible here enforces that — confirm against the read path.
+        unsafe { buffer.set_len(len) };
+        Ok(Box::new(NaiveDeviceWriteTarget {
+            buffer,
+            stream: Arc::clone(&self.stream),
+            alignment,
+        }))
+    }
+}
+
+/// Host staging buffer that is copied to the device when converted into a handle.
+struct NaiveDeviceWriteTarget {
+    /// Host memory that the read is written into.
+    buffer: ByteBufferMut,
+    /// Stream used for the device allocation and the host-to-device copy.
+    stream: Arc<CudaStream>,
+    /// Alignment requested from the allocator for this buffer.
+    alignment: Alignment,
+}
+
+impl WriteDestination for NaiveDeviceWriteTarget {
+    fn len(&self) -> usize {
+        self.buffer.len()
+    }
+
+    fn region(&mut self) -> WriteRegion<'_> {
+        WriteRegion::HostSlice(self.buffer.as_mut())
+    }
+
+    /// Copies the staged host bytes into a new device allocation and wraps it in a handle.
+    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        // Move the fields out of the box; no extra clone of the stream handle is needed.
+        let Self {
+            buffer: host,
+            stream,
+            alignment,
+        } = *self;
+        async move {
+            let len = host.len();
+            // SAFETY: assumed unsafe because the allocation is uninitialized (TODO confirm);
+            // the copy below writes all `len` bytes before the buffer is exposed.
+            let mut device = unsafe { stream.alloc::<u8>(len) }
+                .map_err(|e| vortex_err!("Failed to allocate device memory: {e}"))?;
+
+            let device_ptr = device.device_ptr_mut(&stream).0;
+            let host_slice = host.as_ref();
+            // SAFETY: `device_ptr` refers to the `len`-byte allocation made above and
+            // `host_slice` is `len` bytes long; `host` is held until after the stream wait below.
+            unsafe {
+                memcpy_htod_async(device_ptr, host_slice, stream.cu_stream())
+                    .map_err(|e| vortex_err!("Failed to schedule H2D copy: {e}"))?;
+            }
+
+            await_stream_callback(&stream).await?;
+
+            // Keep the host buffer alive until the copy completes.
+            let _keep_alive = host;
+
+            // NOTE(review): `alignment` is the one the host buffer was allocated with; the device
+            // pointer itself is not checked against it here — TODO confirm it always holds.
+            Ok(BufferHandle::new_device(Arc::new(
+                CudaDeviceBuffer::new_aligned(device, alignment),
+            )))
+        }
+        .boxed()
+    }
+}
diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs
index 20d1a6c6425..2ef5933a327 100644
--- a/vortex-cuda/src/kernel/arrays/dict.rs
+++ b/vortex-cuda/src/kernel/arrays/dict.rs
@@ -16,6 +16,7 @@ use vortex_array::arrays::DictVTable;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::PrimitiveArrayParts;
 use vortex_array::buffer::BufferHandle;
+use vortex_buffer::Alignment;
 use vortex_dtype::DType;
 use vortex_dtype::DecimalType;
 use vortex_dtype::NativeDecimalType;
diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs
index f8c40f73a34..b6095bb2464 100644
--- a/vortex-cuda/src/kernel/encodings/alp.rs
+++ b/vortex-cuda/src/kernel/encodings/alp.rs
@@ -18,6 +18,7 @@ use vortex_array::Canonical;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::PrimitiveArrayParts;
 use vortex_array::buffer::BufferHandle;
+use vortex_buffer::Alignment;
 use vortex_dtype::NativePType;
 use vortex_error::VortexResult;
 use vortex_error::vortex_err;
diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs
index da448e9b065..df87a0e2520 100644
--- a/vortex-cuda/src/lib.rs
+++ b/vortex-cuda/src/lib.rs
@@ -9,6 +9,7 @@ mod canonical;
 mod device_buffer;
 pub mod executor;
 mod kernel;
+mod host_to_device_allocator;
 mod session;
 mod stream;
 
@@ -17,6 +18,7 @@ pub use device_buffer::CudaBufferExt;
 pub use device_buffer::CudaDeviceBuffer;
 pub use executor::CudaExecutionCtx;
 pub use executor::CudaKernelEvents;
+pub use host_to_device_allocator::HostToDeviceAllocator;
 use kernel::ALPExecutor;
 use kernel::DecimalBytePartsExecutor;
 use kernel::DictExecutor;
diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs
index c83128def3e..088c2465088 100644
--- a/vortex-cuda/src/session.rs
+++ b/vortex-cuda/src/session.rs
@@ -5,6 +5,7 @@ use std::fmt::Debug;
 use std::sync::Arc;
 
 use cudarc::driver::CudaContext;
+use cudarc::driver::CudaStream;
 use vortex_array::VortexSessionExecute;
 use vortex_array::vtable::ArrayId;
 use vortex_error::VortexResult;
@@ -42,17 +43,20 @@ impl CudaSession {
     pub fn create_execution_ctx(
         vortex_session: &vortex_session::VortexSession,
     ) -> VortexResult<CudaExecutionCtx> {
-        let stream = vortex_session
-            .cuda_session()
-            .context
-            .new_stream()
-            .map_err(|e| vortex_err!("Failed to create CUDA stream: {}", e))?;
+        let stream = vortex_session.cuda_session().new_stream()?;
         Ok(CudaExecutionCtx::new(
             stream,
             vortex_session.create_execution_ctx(),
         ))
     }
 
+    /// Create a new CUDA stream.
+    pub fn new_stream(&self) -> VortexResult<Arc<CudaStream>> {
+        self.context
+            .new_stream()
+            .map_err(|e| vortex_err!("Failed to create CUDA stream: {}", e))
+    }
+
     /// Registers CUDA support for an array encoding.
     ///
     /// # Arguments
diff --git a/vortex-file/src/open.rs b/vortex-file/src/open.rs
index 0f9123fe480..eab011f4625 100644
--- a/vortex-file/src/open.rs
+++ b/vortex-file/src/open.rs
@@ -12,6 +12,9 @@ use vortex_dtype::DType;
 use vortex_error::VortexError;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
+use vortex_io::AllocatingReadAt;
+use vortex_io::BufferAllocator;
+use vortex_io::InstrumentedReadAt;
 use vortex_io::VortexReadAt;
 use vortex_io::session::RuntimeSessionExt;
 use vortex_layout::segments::NoOpSegmentCache;
@@ -51,6 +54,8 @@ pub struct VortexOpenOptions {
     footer: Option<Footer>,
     /// The segments read during the initial read.
     initial_read_segments: RwLock<HashMap<SegmentId, ByteBuffer>>,
+    /// Optional allocator for read buffers.
+    allocator: Option<Arc<dyn BufferAllocator>>,
     /// A metrics registry for the file.
     metrics: Option<VortexMetrics>,
 }
@@ -66,6 +71,7 @@ pub trait OpenOptionsSessionExt: ArraySessionExt + LayoutSessionExt + RuntimeSes
             dtype: None,
             footer: None,
             initial_read_segments: Default::default(),
+            allocator: None,
             metrics: None,
         }
     }
@@ -125,6 +131,12 @@ impl VortexOpenOptions {
         self
     }
 
+    /// Configure a custom buffer allocator for reads.
+    pub fn with_allocator(mut self, allocator: Arc<dyn BufferAllocator>) -> Self {
+        self.allocator = Some(allocator);
+        self
+    }
+
     /// Open a Vortex file using the provided I/O source.
     ///
     /// This is the most common way to open a [`VortexFile`] and tends to provide the best
@@ -156,11 +168,17 @@ impl VortexOpenOptions {
 
     /// An API for opening a [`VortexFile`] using any [`VortexReadAt`] implementation.
     pub async fn open_read<R: VortexReadAt + Clone>(self, reader: R) -> VortexResult<VortexFile> {
-        let metrics = VortexMetrics::default();
+        let metrics = self.metrics.clone().unwrap_or_default();
+        let reader = InstrumentedReadAt::new(reader, &metrics);
+        let reader: Arc<dyn VortexReadAt> = if let Some(allocator) = &self.allocator {
+            Arc::new(AllocatingReadAt::new(reader, allocator.clone()))
+        } else {
+            Arc::new(reader)
+        };
         let footer = if let Some(footer) = self.footer {
             footer
         } else {
-            self.read_footer(&reader).await?
+            self.read_footer(reader.as_ref()).await?
         };
 
         let segment_cache = Arc::new(SegmentCacheMetrics::new(
@@ -172,12 +190,14 @@ impl VortexOpenOptions {
         ));
 
         // Create a segment source backed by the VortexRead implementation.
-        let segment_source = Arc::new(SharedSegmentSource::new(FileSegmentSource::open(
-            footer.segment_map().clone(),
-            reader,
-            self.session.handle(),
-            metrics.clone(),
-        )));
+        let segment_source = Arc::new(SharedSegmentSource::new(
+            FileSegmentSource::open(
+                footer.segment_map().clone(),
+                reader,
+                self.session.handle(),
+                metrics.clone(),
+            ),
+        ));
 
         // Wrap up the segment source to first resolve segments from the initial read cache.
         let segment_source = Arc::new(SegmentCacheSourceAdapter::new(
@@ -209,7 +229,8 @@ impl VortexOpenOptions {
         let initial_offset = file_size - initial_read_size as u64;
         let initial_read: ByteBuffer = read
             .read_at(initial_offset, initial_read_size, Alignment::none())
-            .await?;
+            .await?
+            .try_into_host_sync()?;
 
         let mut deserializer = Footer::deserializer(initial_read, self.session.clone())
             .with_size(file_size)
@@ -218,7 +239,10 @@ impl VortexOpenOptions {
         let footer = loop {
             match deserializer.deserialize()? {
                 DeserializeStep::NeedMoreData { offset, len } => {
-                    let more_data = read.read_at(offset, len, Alignment::none()).await?;
+                    let more_data = read
+                        .read_at(offset, len, Alignment::none())
+                        .await?
+                        .try_into_host_sync()?;
                     deserializer.prefix_data(more_data);
                 }
                 DeserializeStep::NeedFileSize => unreachable!("We passed file_size above"),
@@ -286,6 +310,7 @@ mod tests {
     use std::sync::atomic::Ordering;
 
     use futures::future::BoxFuture;
+    use vortex_array::buffer::BufferHandle;
     use vortex_array::IntoArray;
     use vortex_array::expr::session::ExprSession;
     use vortex_array::session::ArraySession;
@@ -315,7 +340,7 @@ mod tests {
             offset: u64,
             length: usize,
             alignment: Alignment,
-        ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
+        ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
             self.total_read.fetch_add(length, Ordering::Relaxed);
             let _ = self.first_read_len.compare_exchange(
                 0,
diff --git a/vortex-file/src/read/driver.rs b/vortex-file/src/read/driver.rs
index f097385445e..18cdfea6a70 100644
--- a/vortex-file/src/read/driver.rs
+++ b/vortex-file/src/read/driver.rs
@@ -326,7 +326,7 @@ mod tests {
     use futures::StreamExt;
     use futures::stream;
     use vortex_buffer::Alignment;
-    use vortex_buffer::ByteBuffer;
+    use vortex_array::buffer::BufferHandle;
     use vortex_error::VortexResult;
 
     use super::*;
@@ -336,7 +336,7 @@ mod tests {
         id: usize,
         offset: u64,
         length: usize,
-    ) -> (ReadRequest, oneshot::Receiver<VortexResult<ByteBuffer>>) {
+    ) -> (ReadRequest, oneshot::Receiver<VortexResult<BufferHandle>>) {
         let (tx, rx) = oneshot::channel();
         (
             ReadRequest {
diff --git a/vortex-file/src/read/request.rs b/vortex-file/src/read/request.rs
index 256cb95851d..cdd71670070 100644
--- a/vortex-file/src/read/request.rs
+++ b/vortex-file/src/read/request.rs
@@ -7,8 +7,8 @@ use std::fmt::Formatter;
 use std::ops::Range;
 use std::sync::Arc;
 
+use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
-use vortex_buffer::ByteBuffer;
 use vortex_error::VortexError;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
@@ -51,7 +51,7 @@ impl IoRequest {
     }
 
     /// Resolves the request with the given result.
-    pub fn resolve(self, result: VortexResult<ByteBuffer>) {
+    pub fn resolve(self, result: VortexResult<BufferHandle>) {
         match self.0 {
             IoRequestInner::Single(req) => req.resolve(result),
             IoRequestInner::Coalesced(req) => req.resolve(result),
@@ -90,7 +90,7 @@ pub struct ReadRequest {
     pub(crate) offset: u64,
     pub(crate) length: usize,
     pub(crate) alignment: Alignment,
-    pub(crate) callback: oneshot::Sender<VortexResult<ByteBuffer>>,
+    pub(crate) callback: oneshot::Sender<VortexResult<BufferHandle>>,
 }
 
 impl Debug for ReadRequest {
@@ -106,7 +106,7 @@ impl Debug for ReadRequest {
 }
 
 impl ReadRequest {
-    pub(crate) fn resolve(self, result: VortexResult<ByteBuffer>) {
+    pub(crate) fn resolve(self, result: VortexResult<BufferHandle>) {
         if let Err(e) = self.callback.send(result) {
             tracing::debug!("ReadRequest {} dropped before resolving: {e}", self.id);
         }
@@ -132,15 +132,31 @@ impl Debug for CoalescedRequest {
 }
 
 impl CoalescedRequest {
-    pub fn resolve(self, result: VortexResult<ByteBuffer>) {
+    pub fn resolve(self, result: VortexResult<BufferHandle>) {
         match result {
             Ok(buffer) => {
-                let buffer = buffer.aligned(Alignment::none());
+                let base = match buffer.ensure_aligned(Alignment::none()) {
+                    Ok(base) => base,
+                    Err(e) => {
+                        let e = Arc::new(e);
+                        for req in self.requests.into_iter() {
+                            req.resolve(Err(VortexError::from(e.clone())));
+                        }
+                        return;
+                    }
+                };
+
                 for req in self.requests.into_iter() {
                     let start = usize::try_from(req.offset - self.range.start)
                         .vortex_expect("invalid offset");
                     let end = start + req.length;
-                    let slice = buffer.slice(start..end).aligned(req.alignment);
+                    let slice = match base.slice(start..end).ensure_aligned(req.alignment) {
+                        Ok(slice) => slice,
+                        Err(e) => {
+                            req.resolve(Err(e));
+                            continue;
+                        }
+                    };
                     req.resolve(Ok(slice));
                 }
             }
diff --git a/vortex-file/src/segments/source.rs b/vortex-file/src/segments/source.rs
index a1072af9998..344f805f516 100644
--- a/vortex-file/src/segments/source.rs
+++ b/vortex-file/src/segments/source.rs
@@ -14,7 +14,6 @@ use futures::StreamExt;
 use futures::channel::mpsc;
 use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
-use vortex_buffer::ByteBuffer;
 use vortex_error::VortexResult;
 use vortex_error::vortex_err;
 use vortex_io::VortexReadAt;
@@ -99,9 +98,9 @@ impl FileSegmentSource {
 
             stream
                 .map(move |req| {
-                    let source = reader.clone();
+                    let reader = reader.clone();
                     async move {
-                        let result = source
+                        let result = reader
                             .read_at(req.offset(), req.len(), req.alignment())
                             .await;
                         req.resolve(result);
@@ -162,7 +161,6 @@ impl SegmentSource for FileSegmentSource {
             maybe_fut
                 .ok_or_else(|| vortex_err!("Missing segment: {}", id))?
                 .await
-                .map(BufferHandle::new_host)
         }
         .boxed()
     }
@@ -174,13 +172,13 @@ impl SegmentSource for FileSegmentSource {
 /// If dropped, the read request will be canceled where possible.
 struct ReadFuture {
     id: usize,
-    recv: oneshot::Receiver<VortexResult<ByteBuffer>>,
+    recv: oneshot::Receiver<VortexResult<BufferHandle>>,
     polled: bool,
     events: mpsc::UnboundedSender<ReadEvent>,
 }
 
 impl Future for ReadFuture {
-    type Output = VortexResult<ByteBuffer>;
+    type Output = VortexResult<BufferHandle>;
 
     fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
         if !self.polled {
diff --git a/vortex-io/Cargo.toml b/vortex-io/Cargo.toml
index cef1c69e351..5a98bd28528 100644
--- a/vortex-io/Cargo.toml
+++ b/vortex-io/Cargo.toml
@@ -35,6 +35,7 @@ handle = "1.0.2"
 tokio = { workspace = true, features = ["io-util", "rt", "sync"] }
 tracing = { workspace = true }
 vortex-buffer = { workspace = true }
+vortex-array = { workspace = true }
 vortex-error = { workspace = true }
 vortex-metrics = { workspace = true }
 vortex-session = { workspace = true }
diff --git a/vortex-io/src/allocator.rs b/vortex-io/src/allocator.rs
new file mode 100644
index 00000000000..b042fad8dd1
--- /dev/null
+++ b/vortex-io/src/allocator.rs
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Pluggable allocation of the buffers that I/O reads are written into.
+
+use std::sync::atomic::AtomicU64;
+use std::sync::atomic::Ordering;
+
+use vortex_buffer::Alignment;
+use vortex_buffer::ByteBufferMut;
+use vortex_error::VortexResult;
+
+use crate::WriteDestination;
+
+/// Allocates buffers for I/O reads.
+pub trait BufferAllocator: Send + Sync + 'static {
+    /// Allocate a buffer for the requested length and alignment.
+    fn allocate(&self, len: usize, alignment: Alignment) -> VortexResult<Box<dyn WriteDestination>>;
+}
+
+/// The default allocator that uses `ByteBufferMut`.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct DefaultAllocator;
+
+/// Allocation counters for the default allocator.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct DefaultAllocStats {
+    /// Number of `DefaultAllocator::allocate` calls since the last reset.
+    pub count: u64,
+    /// Sum of the requested lengths, in bytes, since the last reset.
+    pub bytes: u64,
+}
+
+// Process-wide counters shared by every `DefaultAllocator`; updated with relaxed ordering.
+static DEFAULT_ALLOC_COUNT: AtomicU64 = AtomicU64::new(0);
+static DEFAULT_ALLOC_BYTES: AtomicU64 = AtomicU64::new(0);
+
+/// Returns a snapshot of the process-wide [`DefaultAllocator`] counters.
+///
+/// The two counters are loaded separately, so a snapshot taken while allocations are in flight
+/// may pair a count with a byte total from a slightly different moment.
+pub fn default_alloc_stats() -> DefaultAllocStats {
+    DefaultAllocStats {
+        count: DEFAULT_ALLOC_COUNT.load(Ordering::Relaxed),
+        bytes: DEFAULT_ALLOC_BYTES.load(Ordering::Relaxed),
+    }
+}
+
+/// Resets the process-wide [`DefaultAllocator`] counters to zero.
+pub fn reset_default_alloc_stats() {
+    DEFAULT_ALLOC_COUNT.store(0, Ordering::Relaxed);
+    DEFAULT_ALLOC_BYTES.store(0, Ordering::Relaxed);
+}
+
+impl BufferAllocator for DefaultAllocator {
+    fn allocate(
+        &self,
+        len: usize,
+        alignment: Alignment,
+    ) -> VortexResult<Box<dyn WriteDestination>> {
+        DEFAULT_ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
+        // `usize` is at most 64 bits wide on supported targets, so this cast does not truncate.
+        DEFAULT_ALLOC_BYTES.fetch_add(len as u64, Ordering::Relaxed);
+        let mut buffer = ByteBufferMut::with_capacity_aligned(len, alignment);
+        // SAFETY: the buffer was just created with capacity for `len` bytes. The bytes are
+        // exposed uninitialized, so the reader is expected to overwrite the whole region.
+        unsafe { buffer.set_len(len) };
+        Ok(Box::new(buffer))
+    }
+}
diff --git a/vortex-io/src/file/object_store.rs b/vortex-io/src/file/object_store.rs
index 0d09cbdcd2b..80c8a9343fe 100644
--- a/vortex-io/src/file/object_store.rs
+++ b/vortex-io/src/file/object_store.rs
@@ -13,8 +13,8 @@ use object_store::GetRange;
 use object_store::GetResultPayload;
 use object_store::ObjectStore;
 use object_store::path::Path as ObjectPath;
+use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
-use vortex_buffer::ByteBuffer;
 use vortex_buffer::ByteBufferMut;
 use vortex_error::VortexError;
 use vortex_error::VortexResult;
@@ -108,7 +108,7 @@ impl VortexReadAt for ObjectStoreSource {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
         let store = self.store.clone();
         let path = self.path.clone();
         let handle = self.handle.clone();
@@ -161,7 +161,7 @@ impl VortexReadAt for ObjectStoreSource {
                 }
             };
 
-            Ok(buffer.freeze())
+            Ok(BufferHandle::new_host(buffer.freeze()))
         })
         .boxed()
     }
diff --git a/vortex-io/src/file/std_file.rs b/vortex-io/src/file/std_file.rs
index 56abd56eb60..77417aea659 100644
--- a/vortex-io/src/file/std_file.rs
+++ b/vortex-io/src/file/std_file.rs
@@ -2,6 +2,7 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use std::fs::File;
+use std::io;
 #[cfg(all(not(unix), not(windows)))]
 use std::io::Read;
 #[cfg(all(not(unix), not(windows)))]
@@ -15,8 +16,8 @@ use std::sync::Arc;
 
 use futures::FutureExt;
 use futures::future::BoxFuture;
+use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
-use vortex_buffer::ByteBuffer;
 use vortex_buffer::ByteBufferMut;
 use vortex_error::VortexResult;
 
@@ -27,7 +28,7 @@ use crate::runtime::Handle;
 /// Read exactly `buffer.len()` bytes from `file` starting at `offset`.
 /// This is a platform-specific helper that uses the most efficient method available.
 #[cfg(not(target_arch = "wasm32"))]
-pub(crate) fn read_exact_at(file: &File, buffer: &mut [u8], offset: u64) -> std::io::Result<()> {
+pub(crate) fn read_exact_at(file: &File, buffer: &mut [u8], offset: u64) -> io::Result<()> {
     #[cfg(unix)]
     {
         file.read_exact_at(buffer, offset)
@@ -107,7 +108,7 @@ impl VortexReadAt for FileReadAdapter {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
         let file = self.file.clone();
         let handle = self.handle.clone();
         async move {
@@ -116,7 +117,7 @@ impl VortexReadAt for FileReadAdapter {
                     let mut buffer = ByteBufferMut::with_capacity_aligned(length, alignment);
                     unsafe { buffer.set_len(length) };
                     read_exact_at(&file, &mut buffer, offset)?;
-                    Ok(buffer.freeze())
+                    Ok(BufferHandle::new_host(buffer.freeze()))
                 })
                 .await
         }
diff --git a/vortex-io/src/lib.rs b/vortex-io/src/lib.rs
index 6a08c821c8f..afc3598cf01 100644
--- a/vortex-io/src/lib.rs
+++ b/vortex-io/src/lib.rs
@@ -10,13 +10,16 @@
 //! This crate provides core traits for positioned and streaming IO, and via feature
 //! flags implements the core traits for several common async runtimes and backing stores.
 
+pub use allocator::*;
 pub use io_buf::*;
 pub use limit::*;
 #[cfg(feature = "object_store")]
 pub use object_store::*;
 pub use read::*;
 pub use write::*;
+pub use write_destination::*;
 
+mod allocator;
 pub mod file;
 mod io_buf;
 pub mod kanal_ext;
@@ -24,6 +27,7 @@ mod limit;
 #[cfg(feature = "object_store")]
 mod object_store;
 mod read;
+mod write_destination;
 pub mod runtime;
 pub mod session;
 #[cfg(feature = "tokio")]
diff --git a/vortex-io/src/read.rs b/vortex-io/src/read.rs
index fbcbd697d45..858599d3060 100644
--- a/vortex-io/src/read.rs
+++ b/vortex-io/src/read.rs
@@ -5,16 +5,21 @@ use std::sync::Arc;
 
 use futures::FutureExt;
 use futures::future::BoxFuture;
+use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
 use vortex_buffer::ByteBuffer;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
+use vortex_error::vortex_err;
 use vortex_metrics::Counter;
 use vortex_metrics::Histogram;
 use vortex_metrics::Timer;
 use vortex_metrics::VortexMetrics;
 
+use crate::BufferAllocator;
+use crate::WriteRegion;
+
 /// Configuration for coalescing nearby I/O requests into single operations.
 #[derive(Clone, Copy, Debug)]
 pub struct CoalesceConfig {
@@ -71,7 +76,7 @@ pub trait VortexReadAt: Send + Sync + 'static {
     /// Asynchronously get the number of bytes of the underlying source.
     fn size(&self) -> BoxFuture<'static, VortexResult<u64>>;
 
-    /// Request an asynchronous positional read. Results will be returned as a [`ByteBuffer`].
+    /// Request an asynchronous positional read. Results will be returned as a [`BufferHandle`].
     ///
     /// If the reader does not have the requested number of bytes, the returned Future will complete
     /// with an [`UnexpectedEof`][std::io::ErrorKind::UnexpectedEof] error.
@@ -80,7 +85,7 @@ pub trait VortexReadAt: Send + Sync + 'static {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>>;
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>>;
 }
 
 impl VortexReadAt for Arc<dyn VortexReadAt> {
@@ -105,7 +110,7 @@ impl VortexReadAt for Arc<dyn VortexReadAt> {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
         self.as_ref().read_at(offset, length, alignment)
     }
 }
@@ -132,7 +137,7 @@ impl<R: VortexReadAt> VortexReadAt for Arc<R> {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
         self.as_ref().read_at(offset, length, alignment)
     }
 
@@ -158,7 +163,7 @@ impl VortexReadAt for ByteBuffer {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
         let buffer = self.clone();
         async move {
             let start = usize::try_from(offset).vortex_expect("start too big for usize");
@@ -172,7 +177,9 @@ impl VortexReadAt for ByteBuffer {
                     buffer.len()
                 );
             }
-            Ok(buffer.slice_unaligned(start..end).aligned(alignment))
+            Ok(BufferHandle::new_host(
+                buffer.slice_unaligned(start..end).aligned(alignment),
+            ))
         }
         .boxed()
     }
@@ -187,6 +194,19 @@ pub struct InstrumentedReadAt<T: VortexReadAt + Clone> {
     durations: Arc<Timer>,
 }
 
+/// A [`VortexReadAt`] wrapper that returns host reads in buffers from a [`BufferAllocator`].
+#[derive(Clone)]
+pub struct AllocatingReadAt<T: VortexReadAt + Clone> {
+    read: T, // wrapped source that performs the actual read
+    allocator: Arc<dyn BufferAllocator>, // supplies the destination for each host read
+}
+
+impl<T: VortexReadAt + Clone> AllocatingReadAt<T> {
+    pub fn new(read: T, allocator: Arc<dyn BufferAllocator>) -> Self {
+        Self { read, allocator } // only stores its arguments; nothing is read or allocated here
+    }
+}
+
 impl<T: VortexReadAt + Clone> InstrumentedReadAt<T> {
     pub fn new(read: T, metrics: &VortexMetrics) -> Self {
         Self {
@@ -247,7 +267,7 @@ impl<T: VortexReadAt + Clone> VortexReadAt for InstrumentedReadAt<T> {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
         let durations = self.durations.clone();
         let sizes = self.sizes.clone();
         let total_size = self.total_size.clone();
@@ -263,6 +283,57 @@ impl<T: VortexReadAt + Clone> VortexReadAt for InstrumentedReadAt<T> {
     }
 }
 
+impl<T: VortexReadAt + Clone> VortexReadAt for AllocatingReadAt<T> {
+    fn uri(&self) -> Option<&Arc<str>> {
+        self.read.uri()
+    }
+
+    fn coalesce_config(&self) -> Option<CoalesceConfig> {
+        self.read.coalesce_config()
+    }
+
+    fn concurrency(&self) -> usize {
+        self.read.concurrency()
+    }
+
+    fn size(&self) -> BoxFuture<'static, VortexResult<u64>> {
+        self.read.size()
+    }
+
+    /// Reads via the inner source, then copies host data into an allocator-provided buffer.
+    fn read_at(
+        &self,
+        offset: u64,
+        length: usize,
+        alignment: Alignment,
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        let read = self.read.clone();
+        let allocator = self.allocator.clone();
+        async move {
+            let handle = read.read_at(offset, length, alignment).await?;
+            if handle.is_on_device() {
+                return Ok(handle); // device results are returned as-is, without a copy
+            }
+
+            let host = handle
+                .as_host_opt()
+                .ok_or_else(|| vortex_err!("expected host buffer"))?;
+            let mut target = allocator.allocate(length, alignment)?;
+            match target.region() {
+                WriteRegion::HostSlice(dst) if dst.len() == host.len() => {
+                    dst.copy_from_slice(host.as_slice()); // cannot panic: lengths match
+                }
+                WriteRegion::HostSlice(_) => vortex_bail!("allocated length != read length"),
+                WriteRegion::Registered(_) | WriteRegion::Device(_) => {
+                    vortex_bail!("AllocatingReadAt does not support non-host read regions")
+                }
+            }
+            target.into_handle().await
+        }
+        .boxed()
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use std::sync::Arc;
@@ -291,7 +362,7 @@ mod tests {
         let data = ByteBuffer::from(vec![1, 2, 3, 4, 5]);
 
         let result = data.read_at(1, 3, Alignment::none()).await.unwrap();
-        assert_eq!(result.as_ref(), &[2, 3, 4]);
+        assert_eq!(result.to_host_sync().as_ref(), &[2, 3, 4]);
     }
 
     #[tokio::test]
@@ -307,7 +378,7 @@ mod tests {
         let data = Arc::new(ByteBuffer::from(vec![1, 2, 3, 4, 5]));
 
         let result = data.read_at(2, 3, Alignment::none()).await.unwrap();
-        assert_eq!(result.as_ref(), &[3, 4, 5]);
+        assert_eq!(result.to_host_sync().as_ref(), &[3, 4, 5]);
 
         let size = data.size().await.unwrap();
         assert_eq!(size, 5);
diff --git a/vortex-io/src/runtime/tests.rs b/vortex-io/src/runtime/tests.rs
index 10832633983..928fb476406 100644
--- a/vortex-io/src/runtime/tests.rs
+++ b/vortex-io/src/runtime/tests.rs
@@ -11,6 +11,7 @@ use std::sync::atomic::Ordering;
 use futures::FutureExt;
 use futures::future::BoxFuture;
 use tempfile::NamedTempFile;
+use vortex_array::buffer::BufferHandle;
 use vortex_buffer::Alignment;
 use vortex_buffer::ByteBuffer;
 use vortex_buffer::ByteBufferMut;
@@ -42,7 +43,7 @@ fn test_file_read_with_single_thread_runtime() {
                 .await
                 .unwrap();
             assert_eq!(
-                result.as_slice(),
+                result.to_host_sync().as_slice(),
                 &TEST_DATA[TEST_OFFSET as usize..][..TEST_LEN]
             );
 
@@ -51,7 +52,7 @@ fn test_file_read_with_single_thread_runtime() {
                 .read_at(0, TEST_DATA.len(), Alignment::new(1))
                 .await
                 .unwrap();
-            assert_eq!(full.as_slice(), TEST_DATA);
+            assert_eq!(full.to_host_sync().as_slice(), TEST_DATA);
 
             "success"
         }
@@ -70,7 +71,7 @@ async fn test_file_read_with_tokio_runtime() {
         .await
         .unwrap();
     assert_eq!(
-        result.as_slice(),
+        result.to_host_sync().as_slice(),
         &TEST_DATA[TEST_OFFSET as usize..][..TEST_LEN]
     );
 
@@ -79,7 +80,7 @@ async fn test_file_read_with_tokio_runtime() {
         .read_at(0, TEST_DATA.len(), Alignment::new(1))
         .await
         .unwrap();
-    assert_eq!(full.as_slice(), TEST_DATA);
+    assert_eq!(full.to_host_sync().as_slice(), TEST_DATA);
 }
 
 // ============================================================================
@@ -107,7 +108,7 @@ fn test_file_read_with_real_file_single_thread() {
                 .await
                 .unwrap();
             assert_eq!(
-                result.as_slice(),
+                result.to_host_sync().as_slice(),
                 &TEST_DATA[TEST_OFFSET as usize..][..TEST_LEN]
             );
 
@@ -116,7 +117,7 @@ fn test_file_read_with_real_file_single_thread() {
                 .read_at(0, TEST_DATA.len(), Alignment::new(1))
                 .await
                 .unwrap();
-            assert_eq!(full.as_slice(), TEST_DATA);
+            assert_eq!(full.to_host_sync().as_slice(), TEST_DATA);
 
             "success"
         }
@@ -144,7 +145,7 @@ async fn test_file_read_with_real_file_tokio() {
         .await
         .unwrap();
     assert_eq!(
-        result.as_slice(),
+        result.to_host_sync().as_slice(),
         &TEST_DATA[TEST_OFFSET as usize..][..TEST_LEN]
     );
 
@@ -153,7 +154,7 @@ async fn test_file_read_with_real_file_tokio() {
         .read_at(0, TEST_DATA.len(), Alignment::new(1))
         .await
         .unwrap();
-    assert_eq!(full.as_slice(), TEST_DATA);
+    assert_eq!(full.to_host_sync().as_slice(), TEST_DATA);
 }
 
 // ============================================================================
@@ -174,10 +175,22 @@ async fn test_concurrent_reads() {
 
     let results = futures::future::join_all(futures).await;
 
-    assert_eq!(results[0].as_ref().unwrap().as_slice(), &TEST_DATA[0..5]);
-    assert_eq!(results[1].as_ref().unwrap().as_slice(), &TEST_DATA[5..10]);
-    assert_eq!(results[2].as_ref().unwrap().as_slice(), &TEST_DATA[10..15]);
-    assert_eq!(results[3].as_ref().unwrap().as_slice(), &TEST_DATA[15..20]);
+    assert_eq!(
+        results[0].as_ref().unwrap().to_host_sync().as_slice(),
+        &TEST_DATA[0..5]
+    );
+    assert_eq!(
+        results[1].as_ref().unwrap().to_host_sync().as_slice(),
+        &TEST_DATA[5..10]
+    );
+    assert_eq!(
+        results[2].as_ref().unwrap().to_host_sync().as_slice(),
+        &TEST_DATA[10..15]
+    );
+    assert_eq!(
+        results[3].as_ref().unwrap().to_host_sync().as_slice(),
+        &TEST_DATA[15..20]
+    );
 }
 
 // ============================================================================
@@ -240,7 +253,7 @@ impl VortexReadAt for CountingReadAt {
         offset: u64,
         length: usize,
         alignment: Alignment,
-    ) -> BoxFuture<'static, VortexResult<ByteBuffer>> {
+    ) -> BoxFuture<'static, VortexResult<BufferHandle>> {
         self.read_count.fetch_add(1, Ordering::SeqCst);
         let data = self.data.clone();
         async move {
@@ -253,7 +266,7 @@ impl VortexReadAt for CountingReadAt {
             buffer
                 .as_mut_slice()
                 .copy_from_slice(&data.as_slice()[start..start + length]);
-            Ok(buffer.freeze())
+            Ok(BufferHandle::new_host(buffer.freeze()))
         }
         .boxed()
     }
diff --git a/vortex-io/src/write_destination.rs b/vortex-io/src/write_destination.rs
new file mode 100644
index 00000000000..ea811005f11
--- /dev/null
+++ b/vortex-io/src/write_destination.rs
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::marker::PhantomData;
+
+use futures::FutureExt;
+use futures::future::BoxFuture;
+use vortex_array::buffer::BufferHandle;
+use vortex_buffer::ByteBufferMut;
+use vortex_error::VortexResult;
+
+/// A writable destination region borrowed for `'a`, tagged by the kind of memory it refers to.
+pub enum WriteRegion<'a> {
+    /// A standard host slice that can be written by the CPU.
+    HostSlice(&'a mut [u8]),
+    /// A registered host memory region suitable for RDMA writes.
+    Registered(RegisteredRegion<'a>),
+    /// A device memory region suitable for GPU-direct or other device DMA.
+    Device(DeviceRegion<'a>),
+}
+
+/// A registered host memory region suitable for RDMA writes.
+pub struct RegisteredRegion<'a> {
+    pub ptr: *mut u8, // raw start pointer of the region
+    pub len: usize, // region length, presumably in bytes (TODO confirm)
+    pub lkey: u32, // NOTE(review): presumably the RDMA local key; confirm
+    pub rkey: u32, // NOTE(review): presumably the RDMA remote key; confirm
+    pub(crate) _lifetime: PhantomData<&'a mut [u8]>, // crate-private marker carrying `'a`
+}
+
+/// A device memory region suitable for device DMA.
+pub struct DeviceRegion<'a> {
+    pub ptr: *mut u8, // raw start pointer of the region
+    pub len: usize, // region length, presumably in bytes (TODO confirm)
+    pub(crate) _lifetime: PhantomData<&'a mut [u8]>, // crate-private marker carrying `'a`
+}
+
+/// A destination for I/O reads that can be finalized into a [`BufferHandle`].
+pub trait WriteDestination: Send + 'static {
+    /// Returns the length of the destination buffer in bytes.
+    fn len(&self) -> usize;
+
+    /// Returns true if `len()` is zero (default implementation).
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns the writable region; `self` stays mutably borrowed while the region is alive.
+    fn region(&mut self) -> WriteRegion<'_>;
+
+    /// Consumes the boxed destination and asynchronously resolves to its [`BufferHandle`].
+    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>>;
+}
+
+impl WriteDestination for ByteBufferMut {
+    fn len(&self) -> usize {
+        ByteBufferMut::len(self) // explicit path: the buffer's own `len`, not this trait method
+    }
+
+    fn region(&mut self) -> WriteRegion<'_> {
+        WriteRegion::HostSlice(self.as_mut()) // expose the buffer's bytes as a plain host slice
+    }
+
+    fn into_handle(self: Box<Self>) -> BoxFuture<'static, VortexResult<BufferHandle>> {
+        async move { Ok(BufferHandle::new_host(self.freeze())) }.boxed() // freeze -> host handle
+    }
+}
diff --git a/vortex-layout/src/layouts/chunked/reader.rs b/vortex-layout/src/layouts/chunked/reader.rs
index d5bc645ff94..2eef7d25986 100644
--- a/vortex-layout/src/layouts/chunked/reader.rs
+++ b/vortex-layout/src/layouts/chunked/reader.rs
@@ -301,7 +301,10 @@ impl LayoutReader for ChunkedReader {
             }
 
             // Combine the arrays.
-            Ok(ChunkedArray::try_new(chunks, dtype)?.to_array())
+            // Library code must not write to stdout; hand the combined array straight back.
+            let combined = ChunkedArray::try_new(chunks, dtype)?.to_array();
+
+            Ok(combined)
         }
         .boxed())
     }
diff --git a/vortex-layout/src/layouts/flat/reader.rs b/vortex-layout/src/layouts/flat/reader.rs
index 786bc0c6f42..923877e2474 100644
--- a/vortex-layout/src/layouts/flat/reader.rs
+++ b/vortex-layout/src/layouts/flat/reader.rs
@@ -214,6 +214,8 @@ impl LayoutReader for FlatReader {
             // Evaluate the projection expression.
             array = array.apply(&expr)?;
 
+            // Return the projected array directly; library code must not print to stdout.
+
             Ok(array)
         }
         .boxed())
diff --git a/vortex-layout/src/layouts/struct_/reader.rs b/vortex-layout/src/layouts/struct_/reader.rs
index 81096e8212a..3b74d7bd603 100644
--- a/vortex-layout/src/layouts/struct_/reader.rs
+++ b/vortex-layout/src/layouts/struct_/reader.rs
@@ -346,7 +346,7 @@ impl LayoutReader for StructReader {
                 let mask = Mask::from_buffer(validity.to_bool().to_bit_buffer().not());
 
                 // If root expression was a pack, then we apply the validity to each child field
-                if is_pack_merge {
+                let res = if is_pack_merge {
                     let struct_array = array.to_struct();
                     let masked_fields: Vec<ArrayRef> = struct_array
                         .unmasked_fields()
@@ -365,9 +365,12 @@ impl LayoutReader for StructReader {
                     // If the root expression was not a pack or merge, e.g. if it's something like
                     // a get_item, then we apply the validity directly to the result
                     vortex_array::compute::mask(array.as_ref(), &mask)
-                }
+                };
+                res
             } else {
-                projected.await
+                // Forward the projected result unchanged. Library code must not print to stdout,
+                // and rendering `display_tree()` for every result is needless work.
+                projected.await
             }
         }))
     }