diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc b/python/pyarrow/src/arrow/python/numpy_convert.cc
index 4113cc67d2f..facad8adfc8 100644
--- a/python/pyarrow/src/arrow/python/numpy_convert.cc
+++ b/python/pyarrow/src/arrow/python/numpy_convert.cc
@@ -37,6 +37,10 @@
 namespace arrow {
 namespace py {
 
+#ifndef NPY_VSTRING  // Fallback used only when the included NumPy headers do not define NPY_VSTRING.
+# define NPY_VSTRING 2056  // NOTE(review): presumably NumPy's type number for StringDType - confirm against the NumPy headers.
+#endif
+
 NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) {
   PyAcquireGIL lock;
   arr_ = ao;
@@ -122,6 +126,10 @@ Result> NumPyScalarToArrowDataType(PyObject* scalar) {
   return NumPyDtypeToArrow(descr);
 }
 
+bool IsStringDType(PyArray_Descr* descr) {  // True iff descr is non-null and its type_num equals NPY_VSTRING.
+  return descr != nullptr && descr->type_num == NPY_VSTRING;
+}
+
 Result> NumPyDtypeToArrow(PyObject* dtype) {
   if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) {
     return Status::TypeError("Did not pass numpy.dtype object");
@@ -133,6 +141,10 @@ Result> NumPyDtypeToArrow(PyObject* dtype) {
 Result> NumPyDtypeToArrow(PyArray_Descr* descr) {
   int type_num = fix_numpy_type_num(descr->type_num);
 
+  if (IsStringDType(descr)) {  // StringDType maps to Arrow utf8; handled before the switch on type_num.
+    return utf8();
+  }
+
   switch (type_num) {
     TO_ARROW_TYPE_CASE(BOOL, boolean);
     TO_ARROW_TYPE_CASE(INT8, int8);
diff --git a/python/pyarrow/src/arrow/python/numpy_convert.h b/python/pyarrow/src/arrow/python/numpy_convert.h
index 2d1086e1355..cac389d17a1 100644
--- a/python/pyarrow/src/arrow/python/numpy_convert.h
+++ b/python/pyarrow/src/arrow/python/numpy_convert.h
@@ -55,6 +55,8 @@ Result> NumPyDtypeToArrow(PyArray_Descr* descr);
 ARROW_PYTHON_EXPORT
 Result> NumPyScalarToArrowDataType(PyObject* scalar);
 
+ARROW_PYTHON_EXPORT bool IsStringDType(PyArray_Descr* descr);  // Whether descr is a dtype whose type_num is NPY_VSTRING; a null descr yields false.
+
 ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
                                            const std::vector& dim_names,
                                            std::shared_ptr* out);
diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc
index 5647e895d0f..dfbdd25a026 100644
--- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include  // NOTE(review): the header name is missing in this copy of the patch - restore it from upstream.
 #include
 #include
 
@@ -59,6 +60,8 @@
 #include "arrow/python/type_traits.h"
 #include "arrow/python/vendored/pythoncapi_compat.h"
 
+#include  // NOTE(review): the header name is missing in this copy of the patch - restore it from upstream.
+
 namespace arrow {
 
 using internal::checked_cast;
@@ -74,6 +77,27 @@ using internal::NumPyTypeSize;
 
 namespace {
 
+#if NPY_ABI_VERSION >= 0x02000000  // The wrappers below call NumPy through fixed slots of the PyArray_API table.
+inline npy_string_allocator* ArrowNpyString_acquire_allocator(
+    const PyArray_StringDTypeObject* descr) {
+  using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*);
+  return reinterpret_cast(PyArray_API[316])(descr);  // NOTE(review): slot 316 is presumably NpyString_acquire_allocator, and the cast target (presumably Func) is missing in this copy - confirm both.
+}
+
+inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) {
+  using Func = void (*)(npy_string_allocator*);
+  reinterpret_cast(PyArray_API[318])(allocator);  // NOTE(review): slot 318 is presumably NpyString_release_allocator - confirm against NumPy's C-API table.
+}
+
+inline int ArrowNpyString_load(npy_string_allocator* allocator,
+                               const npy_packed_static_string* packed,
+                               npy_static_string* out) {
+  using Func =
+      int (*)(npy_string_allocator*, const npy_packed_static_string*, npy_static_string*);
+  return reinterpret_cast(PyArray_API[313])(allocator, packed, out);  // NOTE(review): slot 313 is presumably NpyString_load - confirm against NumPy's C-API table.
+}
+#endif
+
 Status AllocateNullBitmap(MemoryPool* pool, int64_t length,
                           std::shared_ptr* out) {
   int64_t null_bytes = bit_util::BytesForBits(length);
@@ -233,6 +257,13 @@ class NumPyConverter {
   Status Visit(const LargeStringType& type);
   Status Visit(const StringViewType& type);
 
+#if NPY_ABI_VERSION >= 0x02000000
+  template  // NOTE(review): the template parameter list is missing in this copy (presumably a single type parameter named Builder) - restore from upstream.
+  Status AppendStringDTypeValues(Builder* builder);  // Appends length_ elements of arr_ to builder; masked or NA elements become nulls.
+
+  Status ConvertStringDType();  // Converts a StringDType array into an Arrow string, large_string or string_view result.
+#endif
+
   Status Visit(const StructType& type);
 
   Status Visit(const FixedSizeBinaryType& type);
@@ -338,6 +369,16 @@ Status NumPyConverter::Convert() {
     return Status::OK();
   }
 
+  if (IsStringDType(dtype_)) {  // Dispatched before the type_ == nullptr check that follows, so no explicit Arrow type is required for StringDType input.
+#if NPY_ABI_VERSION >= 0x02000000
+    RETURN_NOT_OK(ConvertStringDType());
+    return Status::OK();
+#else  // Built with NPY_ABI_VERSION below 0x02000000: the conversion is reported as not implemented.
+    return Status::NotImplemented(
+        "NumPy StringDType requires building PyArrow with NumPy >= 2.0");
+#endif
+  }
+
   if (type_ == nullptr) {
     return Status::Invalid("Must pass data type for non-object arrays");
   }
@@ -815,6 +856,110 @@ Status NumPyConverter::Visit(const StringViewType& type) {
   return Status::OK();
 }
 
+#if NPY_ABI_VERSION >= 0x02000000
+template  // NOTE(review): the template parameter list is missing in this copy (presumably a single type parameter named Builder) - restore from upstream.
+Status NumPyConverter::AppendStringDTypeValues(Builder* builder) {  // Reads length_ packed strings from arr_ and appends each to builder, or a null for masked / NA elements.
+  auto* descr = reinterpret_cast(dtype_);  // NOTE(review): the cast target type is missing in this copy; descr is passed on as a const PyArray_StringDTypeObject*.
+
+  npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr);
+  if (allocator == nullptr) {
+    return Status::Invalid("Failed to acquire NumPy StringDType allocator");
+  }
+
+  struct AllocatorGuard {  // Calls ArrowNpyString_release_allocator from its destructor, so every return below releases the allocator.
+    npy_string_allocator* ptr;
+    explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {}
+    ~AllocatorGuard() {
+      if (ptr != nullptr) {
+        ArrowNpyString_release_allocator(ptr);
+      }
+    }
+  } guard(allocator);
+
+  npy_static_string value = {0, nullptr};  // Reused as the output slot for every load call.
+  char* data = PyArray_BYTES(arr_);
+
+  if (mask_ != nullptr) {  // Masked path: a truthy mask entry appends a null without loading the element.
+    Ndarray1DIndexer mask_values(mask_);
+    for (int64_t i = 0; i < length_; ++i) {
+      if (mask_values[i]) {
+        RETURN_NOT_OK(builder->AppendNull());
+        continue;
+      }
+
+      const auto* packed =
+          reinterpret_cast(data + i * stride_);  // Element i is read at byte offset i * stride_ from the array's base pointer.
+      const int is_null = ArrowNpyString_load(allocator, packed, &value);
+      if (is_null == -1) {  // -1 is treated as a load failure: surface a pending Python error first, else a generic Invalid.
+        RETURN_IF_PYERROR();
+        return Status::Invalid("Failed to unpack NumPy StringDType value");
+      }
+      if (is_null) {  // Any other non-zero result is treated as an NA element.
+        RETURN_NOT_OK(builder->AppendNull());
+      } else {
+        RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size}));
+      }
+    }
+    return Status::OK();
+  }
+
+  for (int64_t i = 0; i < length_; ++i) {  // Unmasked path: same per-element handling, advancing data by stride_ each iteration.
+    const auto* packed = reinterpret_cast(data);
+    const int is_null = ArrowNpyString_load(allocator, packed, &value);
+    if (is_null == -1) {
+      RETURN_IF_PYERROR();
+      return Status::Invalid("Failed to unpack NumPy StringDType value");
+    }
+    if (is_null) {
+      RETURN_NOT_OK(builder->AppendNull());
+    } else {
+      RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size}));
+    }
+    data += stride_;
+  }
+
+  return Status::OK();
+}
+
+Status NumPyConverter::ConvertStringDType() {  // Builds the Arrow output for a StringDType array; type_ defaults to utf8 and any non-string target type is a TypeError.
+  util::InitializeUTF8();
+
+  if (type_ == nullptr) {
+    type_ = utf8();
+  }
+
+  switch (type_->id()) {
+    case Type::STRING: {  // Uses a chunked builder, so the result may be pushed as more than one array.
+      arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);
+      RETURN_NOT_OK(builder.Reserve(length_));
+      RETURN_NOT_OK(AppendStringDTypeValues(&builder));
+
+      ArrayVector chunks;
+      RETURN_NOT_OK(builder.Finish(&chunks));
+      for (const auto& chunk : chunks) {
+        RETURN_NOT_OK(PushArray(chunk->data()));
+      }
+      return Status::OK();
+    }
+    case Type::LARGE_STRING: {
+      LargeStringBuilder builder(pool_);
+      RETURN_NOT_OK(builder.Reserve(length_));
+      RETURN_NOT_OK(AppendStringDTypeValues(&builder));
+      return PushBuilderResult(&builder);
+    }
+    case Type::STRING_VIEW: {
+      StringViewBuilder builder(pool_);
+      RETURN_NOT_OK(builder.Reserve(length_));
+      RETURN_NOT_OK(AppendStringDTypeValues(&builder));
+      return PushBuilderResult(&builder);
+    }
+    default:
+      return Status::TypeError(
+          "NumPy StringDType can only be converted to Arrow string types");
+  }
+}
+#endif
+
 Status NumPyConverter::Visit(const StructType& type) {
   std::vector sub_converters;
   std::vector sub_arrays;
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index ec361159c5f..74ef81646ed 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -2758,6 +2758,119 @@ def test_array_from_numpy_unicode(string_type):
     assert arrow_arr.equals(expected)
 
 
+@pytest.mark.numpy
+def test_array_from_numpy_string_dtype():
+    dtypes_mod = getattr(np, "dtypes", None)
+    if dtypes_mod is None:
+        pytest.skip("NumPy dtypes module not available")
+
+    StringDType = getattr(dtypes_mod, "StringDType", None)
+    if StringDType is None:
+        pytest.skip("NumPy StringDType not available")
+
+    dtype = StringDType()
+
+    arr = np.array(["some", "strings"], dtype=dtype)
+
+    arrow_arr = pa.array(arr)
+
+    assert arrow_arr.type == pa.utf8()
+    assert arrow_arr.to_pylist() == ["some", "strings"]
+
+    arrow_arr = pa.array(arr, type=pa.string())
+    assert arrow_arr.type == pa.string()
+    assert arrow_arr.to_pylist() == ["some", "strings"]
+
+    arrow_arr = pa.array(arr, type=pa.large_string())
+    assert arrow_arr.type == pa.large_string()
+    assert arrow_arr.to_pylist() == ["some", "strings"]
+
+    arrow_arr = pa.array(arr, type=pa.string_view())
+    assert arrow_arr.type == pa.string_view()
+    assert arrow_arr.to_pylist() == ["some", "strings"]
+
+    arr_full = np.array(["a", "b", "c", "d", "e"], dtype=dtype)
+    arr = arr_full[::2]
+    arrow_arr = pa.array(arr)
+    assert arrow_arr.type == pa.utf8()
+    assert arrow_arr.to_pylist() == ["a", "c", "e"]
+
+
+@pytest.mark.numpy
+def test_numpy_stringdtype_thresholds_and_unicode():
+    dtypes_mod = getattr(np, "dtypes", None)
+    if dtypes_mod is None:
+        pytest.skip("NumPy dtypes module not available")
+
+    StringDType = getattr(dtypes_mod, "StringDType", None)
+    if StringDType is None:
+        pytest.skip("NumPy StringDType not available")
+
+    dtype = StringDType()
+
+    short = "hello"
+    medium = "a" * 100
+    long_ = "b" * 300
+    unicode_ = "árvíztűrő tükörfúrógép 🥐 你好"
+    long_unicode = "🥐" * 200
+
+    arr = np.array([short, medium, long_, unicode_, long_unicode], dtype=dtype)
+    assert pa.array(arr).to_pylist() == [short, medium, long_, unicode_, long_unicode]
+
+
+@pytest.mark.numpy
+def test_array_from_numpy_string_dtype_nulls_and_mask():
+    dtypes_mod = getattr(np, "dtypes", None)
+    if dtypes_mod is None:
+        pytest.skip("NumPy dtypes module not available")
+
+    StringDType = getattr(dtypes_mod, "StringDType", None)
+    if StringDType is None:
+        pytest.skip("NumPy StringDType not available")
+
+    # Real StringDType, use its NA sentinel
+    dtype = StringDType(na_object=None)
+    arr = np.array(["this array has", None, "as an entry"], dtype=dtype)
+
+    arrow_arr = pa.array(arr)
+    assert arrow_arr.type == pa.utf8()
+    assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"]
+
+    # Test interplay of NA sentinel and an explicit mask:
+    # - index 1 is null because of na_object / Python None
+    # - index 2 is forced null by the mask
+    mask = np.array([False, False, True], dtype=bool)
+    arrow_arr = pa.array(arr, mask=mask)
+    assert arrow_arr.type == pa.utf8()
+    assert arrow_arr.null_count == 2
+    assert arrow_arr.to_pylist() == ["this array has", None, None]
+
+    mask = np.array([True, False, True], dtype=bool)
+    assert pa.array(arr, mask=mask).to_pylist() == [None, None, None]
+
+
+@pytest.mark.numpy
+def test_array_from_numpy_string_dtype_string_sentinel_and_mask():
+    dtypes_mod = getattr(np, "dtypes", None)
+    if dtypes_mod is None:
+        pytest.skip("NumPy dtypes module not available")
+
+    StringDType = getattr(dtypes_mod, "StringDType", None)
+    if StringDType is None:
+        pytest.skip("NumPy StringDType not available")
+
+    sentinel = "__placeholder__"
+    dtype = StringDType(na_object=sentinel)
+    arr = np.array(["this array has", sentinel, "as an entry"], dtype=dtype)
+
+    arrow_arr = pa.array(arr)
+    assert arrow_arr.type == pa.utf8()
+    assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"]
+
+    mask = np.array([False, False, True], dtype=bool)
+    assert pa.array(arr, mask=mask).to_pylist() == ["this array has", None, None]
+
+
 @pytest.mark.numpy
 def test_array_string_from_non_string():
     # ARROW-5682 - when converting to string raise on non string-like dtype