From 3b49f62b04e1515f67f3f3ec7839a354a96975c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Sun, 7 Dec 2025 18:28:39 -0500 Subject: [PATCH 1/9] Fix StringDType helper declaration and initialize UTF8 --- .../pyarrow/src/arrow/python/numpy_convert.cc | 13 ++ .../pyarrow/src/arrow/python/numpy_convert.h | 2 + .../src/arrow/python/numpy_to_arrow.cc | 126 ++++++++++++++++++ python/pyarrow/tests/test_array.py | 36 +++++ 4 files changed, 177 insertions(+) diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc b/python/pyarrow/src/arrow/python/numpy_convert.cc index 4113cc67d2f..d5faef66193 100644 --- a/python/pyarrow/src/arrow/python/numpy_convert.cc +++ b/python/pyarrow/src/arrow/python/numpy_convert.cc @@ -122,6 +122,15 @@ Result> NumPyScalarToArrowDataType(PyObject* scalar) { return NumPyDtypeToArrow(descr); } +#if NPY_ABI_VERSION >= 0x02000000 +bool IsStringDType(PyArray_Descr* descr) { + // NumPy's variable-width StringDType exposes a dedicated dtype number. + return descr != nullptr && descr->type_num == NPY_VSTRING; +} +#else +bool IsStringDType(PyArray_Descr* /*descr*/) { return false; } +#endif + Result> NumPyDtypeToArrow(PyObject* dtype) { if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) { return Status::TypeError("Did not pass numpy.dtype object"); @@ -133,6 +142,10 @@ Result> NumPyDtypeToArrow(PyObject* dtype) { Result> NumPyDtypeToArrow(PyArray_Descr* descr) { int type_num = fix_numpy_type_num(descr->type_num); + if (IsStringDType(descr)) { + return utf8(); + } + switch (type_num) { TO_ARROW_TYPE_CASE(BOOL, boolean); TO_ARROW_TYPE_CASE(INT8, int8); diff --git a/python/pyarrow/src/arrow/python/numpy_convert.h b/python/pyarrow/src/arrow/python/numpy_convert.h index 2d1086e1355..cac389d17a1 100644 --- a/python/pyarrow/src/arrow/python/numpy_convert.h +++ b/python/pyarrow/src/arrow/python/numpy_convert.h @@ -55,6 +55,8 @@ Result> NumPyDtypeToArrow(PyArray_Descr* descr); ARROW_PYTHON_EXPORT Result> NumPyScalarToArrowDataType(PyObject* scalar); +ARROW_PYTHON_EXPORT bool IsStringDType(PyArray_Descr* descr); + ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, const std::vector& dim_names, std::shared_ptr* out); diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index 5647e895d0f..b4598d4f3b6 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,7 @@ #include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/scope_guard.h" #include "arrow/util/string.h" #include "arrow/util/utf8.h" #include "arrow/visit_type_inline.h" @@ -59,6 +61,10 @@ #include "arrow/python/type_traits.h" #include "arrow/python/vendored/pythoncapi_compat.h" +#if NPY_ABI_VERSION >= 0x02000000 +#include +#endif + namespace arrow { using internal::checked_cast; @@ -233,6 +239,13 @@ class NumPyConverter { Status Visit(const LargeStringType& type); Status Visit(const StringViewType& type); +#if NPY_ABI_VERSION >= 0x02000000 + template + Status AppendStringDTypeValues(Builder* builder); + + Status ConvertStringDType(); +#endif + Status Visit(const StructType& type); Status Visit(const FixedSizeBinaryType& type); @@ -338,6 +351,25 @@ Status NumPyConverter::Convert() { return Status::OK(); } + if (IsStringDType(dtype_)) { +#if NPY_ABI_VERSION >= 0x02000000 + RETURN_NOT_OK(ConvertStringDType()); + return Status::OK(); +#else + // Fall back to the generic Python sequence conversion path when the StringDType + // C API is unavailable. + PyConversionOptions py_options; + py_options.type = type_; + py_options.from_pandas = from_pandas_; + ARROW_ASSIGN_OR_RAISE( + auto chunked_array, + ConvertPySequence(reinterpret_cast(arr_), + reinterpret_cast(mask_), py_options, pool_)); + out_arrays_ = chunked_array->chunks(); + return Status::OK(); +#endif + } + if (type_ == nullptr) { return Status::Invalid("Must pass data type for non-object arrays"); } @@ -815,6 +847,100 @@ Status NumPyConverter::Visit(const StringViewType& type) { return Status::OK(); } +#if NPY_ABI_VERSION >= 0x02000000 + +template +Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { + auto* descr = reinterpret_cast(dtype_); + + PyAcquireGIL gil_lock; + + npy_string_allocator* allocator = NpyString_acquire_allocator(descr); + if (allocator == nullptr) { + return Status::Invalid("Failed to acquire NumPy StringDType allocator"); + } + + auto release_allocator = ::arrow::internal::MakeScopeGuard( + [&]() { NpyString_release_allocator(allocator); }); + + npy_static_string value = {0, nullptr}; + + auto append_value = [&](const npy_packed_static_string* packed) -> Status { + int rc = NpyString_load(allocator, packed, &value); + if (rc == -1) { + RETURN_IF_PYERROR(); + return Status::Invalid("Failed to unpack NumPy StringDType value"); + } + if (rc == 1) { + return builder->AppendNull(); + } + return builder->Append(std::string_view{value.buf, value.size}); + }; + + char* data = PyArray_BYTES(arr_); + + if (mask_ != nullptr) { + Ndarray1DIndexer mask_values(mask_); + for (int64_t i = 0; i < length_; ++i) { + if (mask_values[i]) { + RETURN_NOT_OK(builder->AppendNull()); + } else { + const auto* packed = + reinterpret_cast(data + i * stride_); + RETURN_NOT_OK(append_value(packed)); + } + } + } else { + for (int64_t i = 0; i < length_; ++i) { + const auto* packed = reinterpret_cast(data); + RETURN_NOT_OK(append_value(packed)); + data += stride_; + } + } + + return Status::OK(); +} + +Status NumPyConverter::ConvertStringDType() { + util::InitializeUTF8(); + + if (type_ == nullptr) { + type_ = utf8(); + } + + switch (type_->id()) { + case Type::STRING: { + internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_); + RETURN_NOT_OK(builder.Reserve(length_)); + RETURN_NOT_OK(AppendStringDTypeValues(&builder)); + + ArrayVector chunks; + RETURN_NOT_OK(builder.Finish(&chunks)); + for (const auto& chunk : chunks) { + RETURN_NOT_OK(PushArray(chunk->data())); + } + return Status::OK(); + } + case Type::LARGE_STRING: { + LargeStringBuilder builder(pool_); + RETURN_NOT_OK(builder.Reserve(length_)); + RETURN_NOT_OK(AppendStringDTypeValues(&builder)); + return PushBuilderResult(&builder); + } + case Type::STRING_VIEW: { + StringViewBuilder builder(pool_); + RETURN_NOT_OK(builder.Reserve(length_)); + RETURN_NOT_OK(AppendStringDTypeValues(&builder)); + return PushBuilderResult(&builder); + } + default: + return Status::TypeError( + "NumPy StringDType can only be converted to Arrow string types"); + } +} + +#endif + Status NumPyConverter::Visit(const StructType& type) { std::vector sub_converters; std::vector sub_arrays; diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index ec361159c5f..a83e65bdf1c 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2758,6 +2758,42 @@ def test_array_from_numpy_unicode(string_type): assert arrow_arr.equals(expected) +@pytest.mark.numpy +def test_array_from_numpy_string_dtype(): + StringDType = getattr(np.dtypes, "StringDType", None) + if StringDType is None: + pytest.skip("NumPy StringDType not available") + + arr = np.array(["some", "strings"], dtype=StringDType()) + + arrow_arr = pa.array(arr) + + assert arrow_arr.type == pa.utf8() + assert arrow_arr.to_pylist() == ["some", "strings"] + + arrow_arr = pa.array(arr, type=pa.large_string()) + assert arrow_arr.type == pa.large_string() + assert arrow_arr.to_pylist() == ["some", "strings"] + + +@pytest.mark.numpy +def test_array_from_numpy_string_dtype_nulls_and_mask(): + StringDType = getattr(np.dtypes, "StringDType", None) + if StringDType is None: + pytest.skip("NumPy StringDType not available") + + dtype = StringDType(na_object=None) + arr = np.array(["this array has", None, "as an entry"], dtype=dtype) + + arrow_arr = pa.array(arr) + assert arrow_arr.type == pa.utf8() + assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"] + + mask = np.array([False, True, False]) + arrow_arr = pa.array(arr, mask=mask) + assert arrow_arr.to_pylist() == ["this array has", None, None] + + @pytest.mark.numpy def test_array_string_from_non_string(): # ARROW-5682 - when converting to string raise on non string-like dtype From 6e4c3c64c4278fa3138320370deec07d05476720 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Sun, 7 Dec 2025 19:28:45 -0500 Subject: [PATCH 2/9] Fix NumPy string dtype allocator guard --- python/pyarrow/src/arrow/python/numpy_to_arrow.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index b4598d4f3b6..5a6be35f5f0 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -44,7 +44,6 @@ #include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" -#include "arrow/util/scope_guard.h" #include "arrow/util/string.h" #include "arrow/util/utf8.h" #include "arrow/visit_type_inline.h" @@ -860,8 +859,8 @@ Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { return Status::Invalid("Failed to acquire NumPy StringDType allocator"); } - auto release_allocator = ::arrow::internal::MakeScopeGuard( - [&]() { NpyString_release_allocator(allocator); }); + std::unique_ptr + allocator_guard(allocator, &NpyString_release_allocator); npy_static_string value = {0, nullptr}; From a90ea23f5f006e3b07c9bf7d54de5c186a0b88a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Sun, 7 Dec 2025 21:45:34 -0500 Subject: [PATCH 3/9] Remove StringDType header comment --- python/pyarrow/src/arrow/python/numpy_to_arrow.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index 5a6be35f5f0..7e624c62751 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -60,10 +60,6 @@ #include "arrow/python/type_traits.h" #include "arrow/python/vendored/pythoncapi_compat.h" -#if NPY_ABI_VERSION >= 0x02000000 -#include -#endif - namespace arrow { using internal::checked_cast; From 8729eb3ca37413b60c3aa3c86bfda8481e1d4319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Sun, 7 Dec 2025 22:36:49 -0500 Subject: [PATCH 4/9] Format numpy_to_arrow include --- .../src/arrow/python/numpy_to_arrow.cc | 22 ++++++++++++++----- python/pyarrow/tests/test_array.py | 14 ++++++++++++ 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index 7e624c62751..c6e9e549f14 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -60,6 +60,12 @@ #include "arrow/python/type_traits.h" #include "arrow/python/vendored/pythoncapi_compat.h" +#if NPY_ABI_VERSION >= 0x02000000 +// Needed for NpyString_acquire_allocator / NpyString_load / +// NpyString_release_allocator +# include +#endif + namespace arrow { using internal::checked_cast; @@ -848,22 +854,26 @@ template Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { auto* descr = reinterpret_cast(dtype_); - PyAcquireGIL gil_lock; - npy_string_allocator* allocator = NpyString_acquire_allocator(descr); if (allocator == nullptr) { return Status::Invalid("Failed to acquire NumPy StringDType allocator"); } - std::unique_ptr - allocator_guard(allocator, &NpyString_release_allocator); + struct AllocatorGuard { + npy_string_allocator* ptr; + explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {} + ~AllocatorGuard() { + if (ptr != nullptr) { + NpyString_release_allocator(ptr); + } + } + } guard(allocator); npy_static_string value = {0, nullptr}; auto append_value = [&](const npy_packed_static_string* packed) -> Status { int rc = NpyString_load(allocator, packed, &value); if (rc == -1) { - RETURN_IF_PYERROR(); return Status::Invalid("Failed to unpack NumPy StringDType value"); } if (rc == 1) { @@ -905,7 +915,7 @@ Status NumPyConverter::ConvertStringDType() { switch (type_->id()) { case Type::STRING: { - internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_); + arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_); RETURN_NOT_OK(builder.Reserve(length_)); RETURN_NOT_OK(AppendStringDTypeValues(&builder)); diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a83e65bdf1c..987c9f6621b 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2771,10 +2771,24 @@ def test_array_from_numpy_string_dtype(): assert arrow_arr.type == pa.utf8() assert arrow_arr.to_pylist() == ["some", "strings"] + arrow_arr = pa.array(arr, type=pa.string()) + assert arrow_arr.type == pa.string() + assert arrow_arr.to_pylist() == ["some", "strings"] + arrow_arr = pa.array(arr, type=pa.large_string()) assert arrow_arr.type == pa.large_string() assert arrow_arr.to_pylist() == ["some", "strings"] + arrow_arr = pa.array(arr, type=pa.string_view()) + assert arrow_arr.type == pa.string_view() + assert arrow_arr.to_pylist() == ["some", "strings"] + + arr_full = np.array(["a", "b", "c", "d", "e"], dtype=StringDType()) + arr = arr_full[::2] + arrow_arr = pa.array(arr) + assert arrow_arr.type == pa.utf8() + assert arrow_arr.to_pylist() == ["a", "c", "e"] + @pytest.mark.numpy def test_array_from_numpy_string_dtype_nulls_and_mask(): From f49ba675b6c55b6b7da283b3a33fb387359a2ad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Sun, 7 Dec 2025 23:38:07 -0500 Subject: [PATCH 5/9] Run clang-format on numpy_to_arrow --- .../src/arrow/python/numpy_to_arrow.cc | 38 +++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index c6e9e549f14..90d4a805d12 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -81,6 +81,37 @@ using internal::NumPyTypeSize; namespace { +#if NPY_ABI_VERSION >= 0x02000000 + +// NumPy exposes StringDType helpers in the C-API table from version 2.0 onward, +// but the corresponding macros are only available when compiling against a +// 2.0+ feature level. Arrow still targets an older feature level, so provide +// local wrappers that call the C-API entries directly. + +inline npy_string_allocator* ArrowNpyString_acquire_allocator( + const PyArray_StringDTypeObject* descr) { + using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*); + auto func = reinterpret_cast(PyArray_API[316]); + return func(descr); +} + +inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) { + using Func = void (*)(npy_string_allocator*); + auto func = reinterpret_cast(PyArray_API[318]); + func(allocator); +} + +inline int ArrowNpyString_load(npy_string_allocator* allocator, + const npy_packed_static_string* packed, + npy_static_string* out) { + using Func = + int (*)(npy_string_allocator*, const npy_packed_static_string*, npy_static_string*); + auto func = reinterpret_cast(PyArray_API[313]); + return func(allocator, packed, out); +} + +#endif // NPY_ABI_VERSION >= 0x02000000 + Status AllocateNullBitmap(MemoryPool* pool, int64_t length, std::shared_ptr* out) { int64_t null_bytes = bit_util::BytesForBits(length); @@ -854,7 +885,7 @@ template Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { auto* descr = reinterpret_cast(dtype_); - npy_string_allocator* allocator = NpyString_acquire_allocator(descr); + npy_string_allocator* allocator = ArrowNpyString_acquire_allocator(descr); if (allocator == nullptr) { return Status::Invalid("Failed to acquire NumPy StringDType allocator"); } @@ -864,7 +895,7 @@ Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { explicit AllocatorGuard(npy_string_allocator* p) : ptr(p) {} ~AllocatorGuard() { if (ptr != nullptr) { - NpyString_release_allocator(ptr); + ArrowNpyString_release_allocator(ptr); } } } guard(allocator); @@ -872,8 +903,9 @@ Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { npy_static_string value = {0, nullptr}; auto append_value = [&](const npy_packed_static_string* packed) -> Status { - int rc = NpyString_load(allocator, packed, &value); + int rc = ArrowNpyString_load(allocator, packed, &value); if (rc == -1) { + RETURN_IF_PYERROR(); return Status::Invalid("Failed to unpack NumPy StringDType value"); } if (rc == 1) { From 050ca867ad1a74d9b98f2aa1c321fc359562f875 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Mon, 8 Dec 2025 00:24:03 -0500 Subject: [PATCH 6/9] Handle missing NumPy dtypes module in StringDType tests --- python/pyarrow/tests/test_array.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 987c9f6621b..f4d85904b3a 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2760,11 +2760,17 @@ def test_array_from_numpy_unicode(string_type): @pytest.mark.numpy def test_array_from_numpy_string_dtype(): - StringDType = getattr(np.dtypes, "StringDType", None) + dtypes_mod = getattr(np, "dtypes", None) + if dtypes_mod is None: + pytest.skip("NumPy dtypes module not available") + + StringDType = getattr(dtypes_mod, "StringDType", None) if StringDType is None: pytest.skip("NumPy StringDType not available") - arr = np.array(["some", "strings"], dtype=StringDType()) + dtype = StringDType() + + arr = np.array(["some", "strings"], dtype=dtype) arrow_arr = pa.array(arr) @@ -2783,7 +2789,7 @@ def test_array_from_numpy_string_dtype(): assert arrow_arr.type == pa.string_view() assert arrow_arr.to_pylist() == ["some", "strings"] - arr_full = np.array(["a", "b", "c", "d", "e"], dtype=StringDType()) + arr_full = np.array(["a", "b", "c", "d", "e"], dtype=dtype) arr = arr_full[::2] arrow_arr = pa.array(arr) assert arrow_arr.type == pa.utf8() @@ -2792,10 +2798,15 @@ def test_array_from_numpy_string_dtype(): @pytest.mark.numpy def test_array_from_numpy_string_dtype_nulls_and_mask(): - StringDType = getattr(np.dtypes, "StringDType", None) + dtypes_mod = getattr(np, "dtypes", None) + if dtypes_mod is None: + pytest.skip("NumPy dtypes module not available") + + StringDType = getattr(dtypes_mod, "StringDType", None) if StringDType is None: pytest.skip("NumPy StringDType not available") + # Real StringDType, use its NA sentinel dtype = StringDType(na_object=None) arr = np.array(["this array has", None, "as an entry"], dtype=dtype) @@ -2803,7 +2814,10 @@ def test_array_from_numpy_string_dtype_nulls_and_mask(): assert arrow_arr.type == pa.utf8() assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"] - mask = np.array([False, True, False]) + # Test interplay of NA sentinel and an explicit mask: + # - index 1 is null because of na_object / Python None + # - index 2 is forced null by the mask + mask = np.array([False, False, True]) arrow_arr = pa.array(arr, mask=mask) assert arrow_arr.to_pylist() == ["this array has", None, None] From da255c9ec0f8ec0f09cede930064c508866e3faa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Fri, 12 Dec 2025 00:08:23 -0500 Subject: [PATCH 7/9] Make StringDType support unconditional --- .../pyarrow/src/arrow/python/numpy_convert.cc | 9 +- .../src/arrow/python/numpy_to_arrow.cc | 93 +++++++------------ python/pyarrow/tests/test_array.py | 28 ++++++ 3 files changed, 64 insertions(+), 66 deletions(-) diff --git a/python/pyarrow/src/arrow/python/numpy_convert.cc b/python/pyarrow/src/arrow/python/numpy_convert.cc index d5faef66193..facad8adfc8 100644 --- a/python/pyarrow/src/arrow/python/numpy_convert.cc +++ b/python/pyarrow/src/arrow/python/numpy_convert.cc @@ -37,6 +37,10 @@ namespace arrow { namespace py { +#ifndef NPY_VSTRING +# define NPY_VSTRING 2056 +#endif + NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) { PyAcquireGIL lock; arr_ = ao; @@ -122,14 +126,9 @@ Result> NumPyScalarToArrowDataType(PyObject* scalar) { return NumPyDtypeToArrow(descr); } -#if NPY_ABI_VERSION >= 0x02000000 bool IsStringDType(PyArray_Descr* descr) { - // NumPy's variable-width StringDType exposes a dedicated dtype number. return descr != nullptr && descr->type_num == NPY_VSTRING; } -#else -bool IsStringDType(PyArray_Descr* /*descr*/) { return false; } -#endif Result> NumPyDtypeToArrow(PyObject* dtype) { if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) { diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index 90d4a805d12..e39fdadea2f 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -60,11 +60,7 @@ #include "arrow/python/type_traits.h" #include "arrow/python/vendored/pythoncapi_compat.h" -#if NPY_ABI_VERSION >= 0x02000000 -// Needed for NpyString_acquire_allocator / NpyString_load / -// NpyString_release_allocator -# include -#endif +#include namespace arrow { @@ -81,24 +77,15 @@ using internal::NumPyTypeSize; namespace { -#if NPY_ABI_VERSION >= 0x02000000 - -// NumPy exposes StringDType helpers in the C-API table from version 2.0 onward, -// but the corresponding macros are only available when compiling against a -// 2.0+ feature level. Arrow still targets an older feature level, so provide -// local wrappers that call the C-API entries directly. - inline npy_string_allocator* ArrowNpyString_acquire_allocator( const PyArray_StringDTypeObject* descr) { using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*); - auto func = reinterpret_cast(PyArray_API[316]); - return func(descr); + return reinterpret_cast(PyArray_API[316])(descr); } inline void ArrowNpyString_release_allocator(npy_string_allocator* allocator) { using Func = void (*)(npy_string_allocator*); - auto func = reinterpret_cast(PyArray_API[318]); - func(allocator); + reinterpret_cast(PyArray_API[318])(allocator); } inline int ArrowNpyString_load(npy_string_allocator* allocator, @@ -106,12 +93,9 @@ inline int ArrowNpyString_load(npy_string_allocator* allocator, npy_static_string* out) { using Func = int (*)(npy_string_allocator*, const npy_packed_static_string*, npy_static_string*); - auto func = reinterpret_cast(PyArray_API[313]); - return func(allocator, packed, out); + return reinterpret_cast(PyArray_API[313])(allocator, packed, out); } -#endif // NPY_ABI_VERSION >= 0x02000000 - Status AllocateNullBitmap(MemoryPool* pool, int64_t length, std::shared_ptr* out) { int64_t null_bytes = bit_util::BytesForBits(length); @@ -271,12 +255,10 @@ class NumPyConverter { Status Visit(const LargeStringType& type); Status Visit(const StringViewType& type); -#if NPY_ABI_VERSION >= 0x02000000 template Status AppendStringDTypeValues(Builder* builder); Status ConvertStringDType(); -#endif Status Visit(const StructType& type); @@ -384,22 +366,8 @@ Status NumPyConverter::Convert() { } if (IsStringDType(dtype_)) { -#if NPY_ABI_VERSION >= 0x02000000 RETURN_NOT_OK(ConvertStringDType()); return Status::OK(); -#else - // Fall back to the generic Python sequence conversion path when the StringDType - // C API is unavailable. - PyConversionOptions py_options; - py_options.type = type_; - py_options.from_pandas = from_pandas_; - ARROW_ASSIGN_OR_RAISE( - auto chunked_array, - ConvertPySequence(reinterpret_cast(arr_), - reinterpret_cast(mask_), py_options, pool_)); - out_arrays_ = chunked_array->chunks(); - return Status::OK(); -#endif } if (type_ == nullptr) { @@ -879,8 +847,6 @@ Status NumPyConverter::Visit(const StringViewType& type) { return Status::OK(); } -#if NPY_ABI_VERSION >= 0x02000000 - template Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { auto* descr = reinterpret_cast(dtype_); @@ -901,19 +867,6 @@ Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { } guard(allocator); npy_static_string value = {0, nullptr}; - - auto append_value = [&](const npy_packed_static_string* packed) -> Status { - int rc = ArrowNpyString_load(allocator, packed, &value); - if (rc == -1) { - RETURN_IF_PYERROR(); - return Status::Invalid("Failed to unpack NumPy StringDType value"); - } - if (rc == 1) { - return builder->AppendNull(); - } - return builder->Append(std::string_view{value.buf, value.size}); - }; - char* data = PyArray_BYTES(arr_); if (mask_ != nullptr) { @@ -921,18 +874,38 @@ Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { for (int64_t i = 0; i < length_; ++i) { if (mask_values[i]) { RETURN_NOT_OK(builder->AppendNull()); + continue; + } + + const auto* packed = + reinterpret_cast(data + i * stride_); + const int is_null = ArrowNpyString_load(allocator, packed, &value); + if (is_null == -1) { + RETURN_IF_PYERROR(); + return Status::Invalid("Failed to unpack NumPy StringDType value"); + } + if (is_null) { + RETURN_NOT_OK(builder->AppendNull()); } else { - const auto* packed = - reinterpret_cast(data + i * stride_); - RETURN_NOT_OK(append_value(packed)); + RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size})); } } - } else { - for (int64_t i = 0; i < length_; ++i) { - const auto* packed = reinterpret_cast(data); - RETURN_NOT_OK(append_value(packed)); - data += stride_; + return Status::OK(); + } + + for (int64_t i = 0; i < length_; ++i) { + const auto* packed = reinterpret_cast(data); + const int is_null = ArrowNpyString_load(allocator, packed, &value); + if (is_null == -1) { + RETURN_IF_PYERROR(); + return Status::Invalid("Failed to unpack NumPy StringDType value"); + } + if (is_null) { + RETURN_NOT_OK(builder->AppendNull()); + } else { + RETURN_NOT_OK(builder->Append(std::string_view{value.buf, value.size})); } + data += stride_; } return Status::OK(); @@ -976,8 +949,6 @@ Status NumPyConverter::ConvertStringDType() { } } -#endif - Status NumPyConverter::Visit(const StructType& type) { std::vector sub_converters; std::vector sub_arrays; diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index f4d85904b3a..a7377477dbe 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2796,6 +2796,28 @@ def test_array_from_numpy_string_dtype(): assert arrow_arr.to_pylist() == ["a", "c", "e"] +@pytest.mark.numpy +def test_numpy_stringdtype_thresholds_and_unicode(): + dtypes_mod = getattr(np, "dtypes", None) + if dtypes_mod is None: + pytest.skip("NumPy dtypes module not available") + + StringDType = getattr(dtypes_mod, "StringDType", None) + if StringDType is None: + pytest.skip("NumPy StringDType not available") + + dtype = StringDType() + + short = "hello" + medium = "a" * 100 + long_ = "b" * 300 + unicode_ = "árvíztűrő tükörfúrógép 🥐 你好" + long_unicode = "🥐" * 200 + + arr = np.array([short, medium, long_, unicode_, long_unicode], dtype=dtype) + assert pa.array(arr).to_pylist() == [short, medium, long_, unicode_, long_unicode] + + @pytest.mark.numpy def test_array_from_numpy_string_dtype_nulls_and_mask(): dtypes_mod = getattr(np, "dtypes", None) @@ -2822,6 +2844,12 @@ def test_array_from_numpy_string_dtype_nulls_and_mask(): assert arrow_arr.to_pylist() == ["this array has", None, None] +@pytest.mark.numpy +def test_numpy_object_str_still_works(): + arr_obj = np.array(["x", "y", None], dtype=object) + assert pa.array(arr_obj).to_pylist() == ["x", "y", None] + + @pytest.mark.numpy def test_array_string_from_non_string(): # ARROW-5682 - when converting to string raise on non string-like dtype From 80a3aca59adb658533c2406920f3de8299c702ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Fri, 12 Dec 2025 00:38:55 -0500 Subject: [PATCH 8/9] Remove StringDType endif comments --- python/pyarrow/src/arrow/python/numpy_to_arrow.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index e39fdadea2f..b3e0dc0c17d 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -77,6 +77,7 @@ using internal::NumPyTypeSize; namespace { +#ifdef npy_string_allocator inline npy_string_allocator* ArrowNpyString_acquire_allocator( const PyArray_StringDTypeObject* descr) { using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*); @@ -95,6 +96,7 @@ inline int ArrowNpyString_load(npy_string_allocator* allocator, int (*)(npy_string_allocator*, const npy_packed_static_string*, npy_static_string*); return reinterpret_cast(PyArray_API[313])(allocator, packed, out); } +#endif Status AllocateNullBitmap(MemoryPool* pool, int64_t length, std::shared_ptr* out) { @@ -255,10 +257,12 @@ class NumPyConverter { Status Visit(const LargeStringType& type); Status Visit(const StringViewType& type); +#ifdef npy_string_allocator template Status AppendStringDTypeValues(Builder* builder); Status ConvertStringDType(); +#endif Status Visit(const StructType& type); @@ -366,8 +370,13 @@ Status NumPyConverter::Convert() { } if (IsStringDType(dtype_)) { +#ifdef npy_string_allocator RETURN_NOT_OK(ConvertStringDType()); return Status::OK(); +#else + return Status::NotImplemented( + "NumPy StringDType requires building PyArrow with NumPy >= 2.0"); +#endif } if (type_ == nullptr) { @@ -847,6 +856,7 @@ Status NumPyConverter::Visit(const StringViewType& type) { return Status::OK(); } +#ifdef npy_string_allocator template Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { auto* descr = reinterpret_cast(dtype_); @@ -948,6 +958,7 @@ Status NumPyConverter::ConvertStringDType() { "NumPy StringDType can only be converted to Arrow string types"); } } +#endif Status NumPyConverter::Visit(const StructType& type) { std::vector sub_converters; From bef2c71b3d45baae280a4496cb78382a9ffd2e37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81d=C3=A1m=20Lippai?= Date: Fri, 12 Dec 2025 01:23:33 -0500 Subject: [PATCH 9/9] Add StringDType mask coverage and sentinel test --- .../src/arrow/python/numpy_to_arrow.cc | 8 ++--- python/pyarrow/tests/test_array.py | 29 ++++++++++++++++--- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index b3e0dc0c17d..dfbdd25a026 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -77,7 +77,7 @@ using internal::NumPyTypeSize; namespace { -#ifdef npy_string_allocator +#if NPY_ABI_VERSION >= 0x02000000 inline npy_string_allocator* ArrowNpyString_acquire_allocator( const PyArray_StringDTypeObject* descr) { using Func = npy_string_allocator* (*)(const PyArray_StringDTypeObject*); @@ -257,7 +257,7 @@ class NumPyConverter { Status Visit(const LargeStringType& type); Status Visit(const StringViewType& type); -#ifdef npy_string_allocator +#if NPY_ABI_VERSION >= 0x02000000 template Status AppendStringDTypeValues(Builder* builder); @@ -370,7 +370,7 @@ Status NumPyConverter::Convert() { } if (IsStringDType(dtype_)) { -#ifdef npy_string_allocator +#if NPY_ABI_VERSION >= 0x02000000 RETURN_NOT_OK(ConvertStringDType()); return Status::OK(); #else @@ -856,7 +856,7 @@ Status NumPyConverter::Visit(const StringViewType& type) { return Status::OK(); } -#ifdef npy_string_allocator +#if NPY_ABI_VERSION >= 0x02000000 template Status NumPyConverter::AppendStringDTypeValues(Builder* builder) { auto* descr = reinterpret_cast(dtype_); diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a7377477dbe..74ef81646ed 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2839,15 +2839,36 @@ def test_array_from_numpy_string_dtype_nulls_and_mask(): # Test interplay of NA sentinel and an explicit mask: # - index 1 is null because of na_object / Python None # - index 2 is forced null by the mask - mask = np.array([False, False, True]) + mask = np.array([False, False, True], dtype=bool) arrow_arr = pa.array(arr, mask=mask) + assert arrow_arr.type == pa.utf8() + assert arrow_arr.null_count == 2 assert arrow_arr.to_pylist() == ["this array has", None, None] + mask = np.array([True, False, True], dtype=bool) + assert pa.array(arr, mask=mask).to_pylist() == [None, None, None] + @pytest.mark.numpy -def test_numpy_object_str_still_works(): - arr_obj = np.array(["x", "y", None], dtype=object) - assert pa.array(arr_obj).to_pylist() == ["x", "y", None] +def test_array_from_numpy_string_dtype_string_sentinel_and_mask(): + dtypes_mod = getattr(np, "dtypes", None) + if dtypes_mod is None: + pytest.skip("NumPy dtypes module not available") + + StringDType = getattr(dtypes_mod, "StringDType", None) + if StringDType is None: + pytest.skip("NumPy StringDType not available") + + sentinel = "__placeholder__" + dtype = StringDType(na_object=sentinel) + arr = np.array(["this array has", sentinel, "as an entry"], dtype=dtype) + + arrow_arr = pa.array(arr) + assert arrow_arr.type == pa.utf8() + assert arrow_arr.to_pylist() == ["this array has", None, "as an entry"] + + mask = np.array([False, False, True], dtype=bool) + assert pa.array(arr, mask=mask).to_pylist() == ["this array has", None, None] @pytest.mark.numpy