Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions python/pyarrow/src/arrow/python/numpy_convert.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@
namespace arrow {
namespace py {

// Fallback definition used only when the included NumPy headers do not
// already provide NPY_VSTRING.
// NOTE(review): 2056 is presumably the type number NumPy assigns to its
// variable-width StringDType -- confirm against the NumPy headers.
#ifndef NPY_VSTRING
# define NPY_VSTRING 2056
#endif

NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) {
PyAcquireGIL lock;
arr_ = ao;
Expand Down Expand Up @@ -122,6 +126,10 @@ Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar) {
return NumPyDtypeToArrow(descr);
}

// Reports whether `descr` is a descriptor whose type number is NPY_VSTRING.
// A null descriptor is never a string dtype.
bool IsStringDType(PyArray_Descr* descr) {
  if (descr == nullptr) {
    return false;
  }
  return descr->type_num == NPY_VSTRING;
}

Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype) {
if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) {
return Status::TypeError("Did not pass numpy.dtype object");
Expand All @@ -133,6 +141,10 @@ Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype) {
Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr) {
int type_num = fix_numpy_type_num(descr->type_num);

if (IsStringDType(descr)) {
return utf8();
}

switch (type_num) {
TO_ARROW_TYPE_CASE(BOOL, boolean);
TO_ARROW_TYPE_CASE(INT8, int8);
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/src/arrow/python/numpy_convert.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr);
ARROW_PYTHON_EXPORT
Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar);

/// \brief Return true if `descr` is non-null and its type number is NPY_VSTRING
ARROW_PYTHON_EXPORT bool IsStringDType(PyArray_Descr* descr);

ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
const std::vector<std::string>& dim_names,
std::shared_ptr<Tensor>* out);
Expand Down
145 changes: 145 additions & 0 deletions python/pyarrow/src/arrow/python/numpy_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <limits>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

Expand Down Expand Up @@ -59,6 +60,8 @@
#include "arrow/python/type_traits.h"
#include "arrow/python/vendored/pythoncapi_compat.h"

#include <numpy/arrayobject.h>

namespace arrow {

using internal::checked_cast;
Expand All @@ -74,6 +77,27 @@ using internal::NumPyTypeSize;

namespace {

#if NPY_ABI_VERSION >= 0x02000000
// Calls the function stored in slot 316 of the NumPy C-API table with `descr`
// and returns the allocator it yields.
// NOTE(review): slot 316 is presumably NpyString_acquire_allocator -- confirm
// against NumPy's generated API table.
inline npy_string_allocator* ArrowNpyString_acquire_allocator(
    const PyArray_StringDTypeObject* descr) {
  constexpr int kAcquireAllocatorSlot = 316;
  using AcquireFn = npy_string_allocator* (*)(const PyArray_StringDTypeObject*);
  const auto acquire = reinterpret_cast<AcquireFn>(PyArray_API[kAcquireAllocatorSlot]);
  return acquire(descr);
}

// Calls the function stored in slot 318 of the NumPy C-API table with
// `allocator`.
// NOTE(review): slot 318 is presumably NpyString_release_allocator -- confirm
// against NumPy's generated API table.
inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) {
  constexpr int kReleaseAllocatorSlot = 318;
  using ReleaseFn = void (*)(npy_string_allocator*);
  const auto release = reinterpret_cast<ReleaseFn>(PyArray_API[kReleaseAllocatorSlot]);
  release(allocator);
}

// Calls the function stored in slot 313 of the NumPy C-API table to unpack
// `packed` into `out`, and forwards its integer result unchanged.
// NOTE(review): slot 313 is presumably NpyString_load -- confirm against
// NumPy's generated API table.
inline int ArrowNpyString_load(npy_string_allocator* allocator,
                               const npy_packed_static_string* packed,
                               npy_static_string* out) {
  constexpr int kLoadSlot = 313;
  using LoadFn =
      int (*)(npy_string_allocator*, const npy_packed_static_string*, npy_static_string*);
  const auto load = reinterpret_cast<LoadFn>(PyArray_API[kLoadSlot]);
  return load(allocator, packed, out);
}
#endif

Status AllocateNullBitmap(MemoryPool* pool, int64_t length,
std::shared_ptr<ResizableBuffer>* out) {
int64_t null_bytes = bit_util::BytesForBits(length);
Expand Down Expand Up @@ -233,6 +257,13 @@ class NumPyConverter {
Status Visit(const LargeStringType& type);
Status Visit(const StringViewType& type);

#if NPY_ABI_VERSION >= 0x02000000
// Appends each element of a StringDType array to `builder`, writing a null
// for entries that are masked or that NumPy reports as null.
template <typename Builder>
Status AppendStringDTypeValues(Builder* builder);

// Converts a StringDType array to an Arrow string, large_string or
// string_view array (utf8 when no target type was given).
Status ConvertStringDType();
#endif

Status Visit(const StructType& type);

Status Visit(const FixedSizeBinaryType& type);
Expand Down Expand Up @@ -338,6 +369,16 @@ Status NumPyConverter::Convert() {
return Status::OK();
}

if (IsStringDType(dtype_)) {
#if NPY_ABI_VERSION >= 0x02000000
RETURN_NOT_OK(ConvertStringDType());
return Status::OK();
#else
return Status::NotImplemented(
"NumPy StringDType requires building PyArrow with NumPy >= 2.0");
#endif
}

if (type_ == nullptr) {
return Status::Invalid("Must pass data type for non-object arrays");
}
Expand Down Expand Up @@ -815,6 +856,110 @@ Status NumPyConverter::Visit(const StringViewType& type) {
return Status::OK();
}

#if NPY_ABI_VERSION >= 0x02000000
// Appends every element of the StringDType array `arr_` to `builder`.
//
// An element is appended as null when the optional `mask_` marks it, or when
// unpacking reports the packed string as null; otherwise its bytes are
// appended unchanged.
//
// \param[in,out] builder Arrow builder that receives the values
// \return Status::Invalid if the allocator cannot be acquired or a value
//   cannot be unpacked, a pending Python error if one is set, or any error
//   returned by the builder; Status::OK() otherwise.
template <typename Builder>
Status NumPyConverter::AppendStringDTypeValues(Builder* builder) {
  auto* descr = reinterpret_cast<PyArray_StringDTypeObject*>(dtype_);

  // NOTE: acquiring the allocator locks a mutex internally in NumPy.
  npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);
  if (allocator == nullptr) {
    return Status::Invalid("Failed to acquire NumPy StringDType allocator");
  }

  // Releases the allocator on every exit path. Copying is disabled so the
  // allocator can never be released twice.
  struct AllocatorGuard {
    npy_string_allocator* ptr;
    explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {}
    AllocatorGuard(const AllocatorGuard&) = delete;
    AllocatorGuard& operator=(const AllocatorGuard&) = delete;
    ~AllocatorGuard() {
      if (ptr != nullptr) {
        ArrowNpyString_release_allocator(ptr);
      }
    }
  } guard(allocator);

  const char* data = PyArray_BYTES(arr_);
  npy_static_string value = {0, nullptr};

  // Unpacks the element at index `i` and appends it (or a null) to `builder`.
  // Shared by the masked and unmasked loops below.
  auto append_element = [&](int64_t i) -> Status {
    const auto* packed =
        reinterpret_cast<const npy_packed_static_string*>(data + i * stride_);
    const int is_null = ArrowNpyString_load(allocator, packed, &value);
    if (is_null == -1) {
      // Prefer a pending Python exception, if any, over the generic message.
      RETURN_IF_PYERROR();
      return Status::Invalid("Failed to unpack NumPy StringDType value");
    }
    if (is_null) {
      RETURN_NOT_OK(builder->AppendNull());
    } else {
      RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size}));
    }
    return Status::OK();
  };

  if (mask_ != nullptr) {
    Ndarray1DIndexer<uint8_t> mask_values(mask_);
    for (int64_t i = 0; i < length_; ++i) {
      if (mask_values[i]) {
        // Masked entries are null without touching the packed string.
        RETURN_NOT_OK(builder->AppendNull());
      } else {
        RETURN_NOT_OK(append_element(i));
      }
    }
  } else {
    for (int64_t i = 0; i < length_; ++i) {
      RETURN_NOT_OK(append_element(i));
    }
  }

  return Status::OK();
}

// Converts the StringDType array held by this converter into Arrow string
// data and pushes the resulting array(s).
//
// If no target type was requested, utf8() is used. Only STRING, LARGE_STRING
// and STRING_VIEW targets are accepted; any other type yields a TypeError.
Status NumPyConverter::ConvertStringDType() {
  util::InitializeUTF8();

  if (type_ == nullptr) {
    type_ = utf8();
  }

  const auto target_id = type_->id();

  if (target_id == Type::STRING) {
    // Built through a chunked builder, so the result may consist of several
    // arrays; each one is pushed separately.
    arrow::internal::ChunkedStringBuilder chunked_builder(kBinaryChunksize, pool_);
    RETURN_NOT_OK(chunked_builder.Reserve(length_));
    RETURN_NOT_OK(AppendStringDTypeValues(&chunked_builder));

    ArrayVector pieces;
    RETURN_NOT_OK(chunked_builder.Finish(&pieces));
    for (const auto& piece : pieces) {
      RETURN_NOT_OK(PushArray(piece->data()));
    }
    return Status::OK();
  }

  if (target_id == Type::LARGE_STRING) {
    LargeStringBuilder large_builder(pool_);
    RETURN_NOT_OK(large_builder.Reserve(length_));
    RETURN_NOT_OK(AppendStringDTypeValues(&large_builder));
    return PushBuilderResult(&large_builder);
  }

  if (target_id == Type::STRING_VIEW) {
    StringViewBuilder view_builder(pool_);
    RETURN_NOT_OK(view_builder.Reserve(length_));
    RETURN_NOT_OK(AppendStringDTypeValues(&view_builder));
    return PushBuilderResult(&view_builder);
  }

  return Status::TypeError(
      "NumPy StringDType can only be converted to Arrow string types");
}
#endif

Status NumPyConverter::Visit(const StructType& type) {
std::vector<NumPyConverter> sub_converters;
std::vector<OwnedRefNoGIL> sub_arrays;
Expand Down
113 changes: 113 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2758,6 +2758,119 @@ def test_array_from_numpy_unicode(string_type):
assert arrow_arr.equals(expected)


@pytest.mark.numpy
def test_array_from_numpy_string_dtype():
    """Convert a NumPy StringDType array with the inferred type, with each
    explicit Arrow string type, and from an every-other-element slice."""
    np_dtypes = getattr(np, "dtypes", None)
    if np_dtypes is None:
        pytest.skip("NumPy dtypes module not available")

    string_dtype_cls = getattr(np_dtypes, "StringDType", None)
    if string_dtype_cls is None:
        pytest.skip("NumPy StringDType not available")

    string_dtype = string_dtype_cls()
    np_words = np.array(["some", "strings"], dtype=string_dtype)

    # Without an explicit type the result is inferred as utf8.
    inferred = pa.array(np_words)
    assert inferred.type == pa.utf8()
    assert inferred.to_pylist() == ["some", "strings"]

    # Each supported Arrow string type can be requested explicitly.
    for requested in (pa.string(), pa.large_string(), pa.string_view()):
        converted = pa.array(np_words, type=requested)
        assert converted.type == requested
        assert converted.to_pylist() == ["some", "strings"]

    # A slice taking every second element must convert the selected items.
    letters = np.array(["a", "b", "c", "d", "e"], dtype=string_dtype)
    every_other = letters[::2]
    sliced = pa.array(every_other)
    assert sliced.type == pa.utf8()
    assert sliced.to_pylist() == ["a", "c", "e"]


@pytest.mark.numpy
def test_numpy_stringdtype_thresholds_and_unicode():
    """Round-trip strings of several lengths, including non-ASCII text,
    from a NumPy StringDType array through pa.array()."""
    np_dtypes = getattr(np, "dtypes", None)
    if np_dtypes is None:
        pytest.skip("NumPy dtypes module not available")

    string_dtype_cls = getattr(np_dtypes, "StringDType", None)
    if string_dtype_cls is None:
        pytest.skip("NumPy StringDType not available")

    samples = [
        "hello",
        "a" * 100,
        "b" * 300,
        "árvíztűrő tükörfúrógép 🥐 你好",
        "🥐" * 200,
    ]

    np_samples = np.array(samples, dtype=string_dtype_cls())
    round_tripped = pa.array(np_samples).to_pylist()
    assert round_tripped == samples


@pytest.mark.numpy
def test_array_from_numpy_string_dtype_nulls_and_mask():
    """Nulls coming from StringDType(na_object=None) combine with an explicit
    mask when converting to Arrow."""
    np_dtypes = getattr(np, "dtypes", None)
    if np_dtypes is None:
        pytest.skip("NumPy dtypes module not available")

    string_dtype_cls = getattr(np_dtypes, "StringDType", None)
    if string_dtype_cls is None:
        pytest.skip("NumPy StringDType not available")

    # StringDType configured with None as its NA object.
    na_dtype = string_dtype_cls(na_object=None)
    np_values = np.array(["this array has", None, "as an entry"], dtype=na_dtype)

    unmasked = pa.array(np_values)
    assert unmasked.type == pa.utf8()
    assert unmasked.to_pylist() == ["this array has", None, "as an entry"]

    # The NA entry and the mask both contribute nulls:
    # - index 1 is null because of na_object / Python None
    # - index 2 is null because the mask says so
    last_masked = np.array([False, False, True], dtype=bool)
    masked = pa.array(np_values, mask=last_masked)
    assert masked.type == pa.utf8()
    assert masked.null_count == 2
    assert masked.to_pylist() == ["this array has", None, None]

    # Masking both ends leaves no valid entry at all.
    ends_masked = np.array([True, False, True], dtype=bool)
    fully_null = pa.array(np_values, mask=ends_masked)
    assert fully_null.to_pylist() == [None, None, None]


@pytest.mark.numpy
def test_array_from_numpy_string_dtype_string_sentinel_and_mask():
    """A string used as na_object converts to an Arrow null, and an explicit
    mask adds further nulls."""
    np_dtypes = getattr(np, "dtypes", None)
    if np_dtypes is None:
        pytest.skip("NumPy dtypes module not available")

    string_dtype_cls = getattr(np_dtypes, "StringDType", None)
    if string_dtype_cls is None:
        pytest.skip("NumPy StringDType not available")

    na_marker = "__placeholder__"
    marker_dtype = string_dtype_cls(na_object=na_marker)
    np_values = np.array(
        ["this array has", na_marker, "as an entry"], dtype=marker_dtype
    )

    unmasked = pa.array(np_values)
    assert unmasked.type == pa.utf8()
    assert unmasked.to_pylist() == ["this array has", None, "as an entry"]

    last_masked = np.array([False, False, True], dtype=bool)
    masked = pa.array(np_values, mask=last_masked)
    assert masked.to_pylist() == ["this array has", None, None]


@pytest.mark.numpy
def test_array_string_from_non_string():
# ARROW-5682 - when converting to string raise on non string-like dtype
Expand Down
Loading