Skip to content

Commit 057acd5

Browse files
committed
is this...it?
1 parent b9c15c6 commit 057acd5

File tree

2 files changed

+88
-3
lines changed

2 files changed

+88
-3
lines changed

pandas/core/interchange/buffer.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,57 @@ def __repr__(self) -> str:
7676
)
7777
+ ")"
7878
)
79+
80+
81+
class PandasPyarrowBackedBuffer(Buffer):
    """
    Interchange-protocol buffer wrapping a pyarrow-style buffer object.

    Data in the buffer is guaranteed to be contiguous in memory.
    """

    def __init__(self, x: Any, allow_copy: bool = True) -> None:
        """
        Wrap an existing buffer object without copying it.

        Parameters
        ----------
        x : Any
            Object exposing ``size`` (length in bytes) and ``address``
            (integer memory address) attributes.
            NOTE(review): callers appear to pass ``pyarrow.Buffer`` objects
            obtained from ``chunks[0].buffers()`` -- confirm.
        allow_copy : bool, default True
            Accepted but currently unused: this class never copies the
            wrapped buffer.
            NOTE(review): presumably kept for parity with ``PandasBuffer``
            -- confirm.
        """
        # Keep a reference to the wrapped buffer as a private attribute so
        # the public properties can be derived from it on demand.
        self._x = x

    @property
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """
        return self._x.size

    @property
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """
        return self._x.address

    def __dlpack__(self) -> Any:
        """
        Represent this structure as DLPack interface.

        Delegates to the wrapped object's ``__dlpack__``; if the wrapped
        object does not provide one, the resulting ``AttributeError``
        propagates to the caller.
        """
        return self._x.__dlpack__()

    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        """
        return (DlpackDeviceType.CPU, None)

    def __repr__(self) -> str:
        """
        Return a debugging representation with size, pointer and device.
        """
        return (
            "PandasPyarrowBackedBuffer("
            + str(
                {
                    "bufsize": self.bufsize,
                    "ptr": self.ptr,
                    "device": self.__dlpack_device__()[0].name,
                }
            )
            + ")"
        )

pandas/core/interchange/column.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@
1717

1818
import pandas as pd
1919
from pandas.api.types import is_string_dtype
20-
from pandas.core.interchange.buffer import PandasBuffer
20+
from pandas.core.interchange.buffer import (
21+
PandasBuffer,
22+
PandasPyarrowBackedBuffer,
23+
)
2124
from pandas.core.interchange.dataframe_protocol import (
2225
Column,
2326
ColumnBuffers,
@@ -195,6 +198,17 @@ def describe_null(self):
195198
null_value = 1
196199
return column_null_dtype, null_value
197200
kind = self.dtype[0]
201+
if isinstance(self._col.dtype, ArrowDtype):
202+
column_null_dtype = ColumnNullType.USE_BITMASK
203+
null_value = 0
204+
if ~self._col.isna().any():
205+
try:
206+
null, value = _NULL_DESCRIPTION[kind]
207+
except KeyError as err:
208+
raise NotImplementedError(f"Data type {kind} not yet supported") from err
209+
210+
return null, value
211+
return column_null_dtype, null_value
198212
try:
199213
null, value = _NULL_DESCRIPTION[kind]
200214
except KeyError as err:
@@ -282,6 +296,16 @@ def _get_data_buffer(
282296
"""
283297
Return the buffer containing the data and the buffer's associated dtype.
284298
"""
299+
if isinstance(self._col.dtype, ArrowDtype):
300+
arr = self._col.array
301+
buffer = PandasPyarrowBackedBuffer(arr._pa_array.chunks[0].buffers()[1])
302+
dtype = (
303+
DtypeKind.BOOL,
304+
8,
305+
ArrowCTypes.BOOL,
306+
Endianness.NATIVE,
307+
) # note: currently only support native endianness
308+
return buffer, dtype
285309
if self.dtype[0] == DtypeKind.DATETIME:
286310
# self.dtype[2] is an ArrowCTypes.TIMESTAMP where the tz will make
287311
# it longer than 4 characters
@@ -305,8 +329,6 @@ def _get_data_buffer(
305329
arr = self._col.array
306330
if isinstance(self._col.dtype, BaseMaskedDtype):
307331
np_arr = arr._data # type: ignore[attr-defined]
308-
elif isinstance(self._col.dtype, ArrowDtype):
309-
raise NotImplementedError("ArrowDtype not handled yet")
310332
else:
311333
np_arr = arr._ndarray # type: ignore[attr-defined]
312334
buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
@@ -351,6 +373,15 @@ def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
351373
"""
352374
null, invalid = self.describe_null
353375

376+
if isinstance(self._col.dtype, ArrowDtype):
377+
arr = self._col.array
378+
buf = arr._pa_array.chunks[0].buffers()[0]
379+
dtype = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE)
380+
if buf is None:
381+
return buf, dtype
382+
buffer = PandasPyarrowBackedBuffer(buf)
383+
return buffer, dtype
384+
354385
if isinstance(self._col.dtype, BaseMaskedDtype):
355386
mask = self._col.array._mask # type: ignore[attr-defined]
356387
buffer = PandasBuffer(mask)

0 commit comments

Comments
 (0)