Skip to content

Commit 2a4aed2

Browse files
Merge remote-tracking branch 'upstream/main' into string-dtype-index-engine
2 parents a669d75 + 54c88a2 commit 2a4aed2

File tree

18 files changed

+184
-206
lines changed

18 files changed

+184
-206
lines changed

ci/code_checks.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
8484
-i "pandas.Timestamp.resolution PR02" \
8585
-i "pandas.Timestamp.tzinfo GL08" \
8686
-i "pandas.api.types.is_re_compilable PR07,SA01" \
87-
-i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
8887
-i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
8988
-i "pandas.arrays.IntegerArray SA01" \
9089
-i "pandas.arrays.IntervalArray.length SA01" \

doc/source/whatsnew/v2.3.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ Interval
118118

119119
Indexing
120120
^^^^^^^^
121-
-
121+
- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
122122
-
123123

124124
Missing

pandas/core/array_algos/replace.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,4 +151,6 @@ def re_replacer(s):
151151
if mask is None:
152152
values[:] = f(values)
153153
else:
154+
if values.ndim != mask.ndim:
155+
mask = np.broadcast_to(mask, values.shape)
154156
values[mask] = f(values[mask])

pandas/core/dtypes/common.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1785,16 +1785,22 @@ def pandas_dtype(dtype) -> DtypeObj:
17851785
17861786
Parameters
17871787
----------
1788-
dtype : object to be converted
1788+
dtype : object
1789+
The object to be converted into a dtype.
17891790
17901791
Returns
17911792
-------
17921793
np.dtype or a pandas dtype
1794+
The converted dtype, which can be either a numpy dtype or a pandas dtype.
17931795
17941796
Raises
17951797
------
17961798
TypeError if not a dtype
17971799
1800+
See Also
1801+
--------
1802+
api.types.is_dtype : Return true if the condition is satisfied for the arr_or_dtype.
1803+
17981804
Examples
17991805
--------
18001806
>>> pd.api.types.pandas_dtype(int)

pandas/core/indexes/base.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6560,7 +6560,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index:
65606560
"""
65616561
Analogue to maybe_cast_indexer for get_indexer instead of get_loc.
65626562
"""
6563-
return ensure_index(target)
6563+
target_index = ensure_index(target)
6564+
if (
6565+
not hasattr(target, "dtype")
6566+
and self.dtype == object
6567+
and target_index.dtype == "string"
6568+
):
6569+
# If we started with a list-like, avoid inference to string dtype if self
6570+
# is object dtype (coercing to string dtype will alter the missing values)
6571+
target_index = Index(target, dtype=self.dtype)
6572+
return target_index
65646573

65656574
@final
65666575
def _validate_indexer(

pandas/core/internals/blocks.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1688,6 +1688,13 @@ def where(self, other, cond) -> list[Block]:
16881688
if isinstance(self.dtype, (IntervalDtype, StringDtype)):
16891689
# TestSetitemFloatIntervalWithIntIntervalValues
16901690
blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False)
1691+
if (
1692+
self.ndim == 2
1693+
and isinstance(orig_cond, np.ndarray)
1694+
and orig_cond.ndim == 1
1695+
and not is_1d_only_ea_dtype(blk.dtype)
1696+
):
1697+
orig_cond = orig_cond[:, None]
16911698
return blk.where(orig_other, orig_cond)
16921699

16931700
elif isinstance(self, NDArrayBackedExtensionBlock):

pandas/io/_util.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
from __future__ import annotations
22

3-
from typing import TYPE_CHECKING
3+
from typing import (
4+
TYPE_CHECKING,
5+
Literal,
6+
)
47

58
import numpy as np
69

10+
from pandas._config import using_string_dtype
11+
12+
from pandas._libs import lib
713
from pandas.compat import pa_version_under18p0
814
from pandas.compat._optional import import_optional_dependency
915

@@ -12,6 +18,10 @@
1218
if TYPE_CHECKING:
1319
from collections.abc import Callable
1420

21+
import pyarrow
22+
23+
from pandas._typing import DtypeBackend
24+
1525

1626
def _arrow_dtype_mapping() -> dict:
1727
pa = import_optional_dependency("pyarrow")
@@ -33,7 +43,7 @@ def _arrow_dtype_mapping() -> dict:
3343
}
3444

3545

36-
def arrow_string_types_mapper() -> Callable:
46+
def _arrow_string_types_mapper() -> Callable:
3747
pa = import_optional_dependency("pyarrow")
3848

3949
mapping = {
@@ -44,3 +54,31 @@ def arrow_string_types_mapper() -> Callable:
4454
mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan)
4555

4656
return mapping.get
57+
58+
59+
def arrow_table_to_pandas(
60+
table: pyarrow.Table,
61+
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
62+
null_to_int64: bool = False,
63+
) -> pd.DataFrame:
64+
pa = import_optional_dependency("pyarrow")
65+
66+
types_mapper: type[pd.ArrowDtype] | None | Callable
67+
if dtype_backend == "numpy_nullable":
68+
mapping = _arrow_dtype_mapping()
69+
if null_to_int64:
70+
# Modify the default mapping to also map null to Int64
71+
# (to match other engines - only for CSV parser)
72+
mapping[pa.null()] = pd.Int64Dtype()
73+
types_mapper = mapping.get
74+
elif dtype_backend == "pyarrow":
75+
types_mapper = pd.ArrowDtype
76+
elif using_string_dtype():
77+
types_mapper = _arrow_string_types_mapper()
78+
elif dtype_backend is lib.no_default or dtype_backend == "numpy":
79+
types_mapper = None
80+
else:
81+
raise NotImplementedError
82+
83+
df = table.to_pandas(types_mapper=types_mapper)
84+
return df

pandas/io/feather_format.py

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,10 @@
1515
from pandas.util._decorators import doc
1616
from pandas.util._validators import check_dtype_backend
1717

18-
import pandas as pd
1918
from pandas.core.api import DataFrame
2019
from pandas.core.shared_docs import _shared_docs
2120

22-
from pandas.io._util import arrow_string_types_mapper
21+
from pandas.io._util import arrow_table_to_pandas
2322
from pandas.io.common import get_handle
2423

2524
if TYPE_CHECKING:
@@ -147,16 +146,4 @@ def read_feather(
147146
pa_table = feather.read_table(
148147
handles.handle, columns=columns, use_threads=bool(use_threads)
149148
)
150-
151-
if dtype_backend == "numpy_nullable":
152-
from pandas.io._util import _arrow_dtype_mapping
153-
154-
return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get)
155-
156-
elif dtype_backend == "pyarrow":
157-
return pa_table.to_pandas(types_mapper=pd.ArrowDtype)
158-
159-
elif using_string_dtype():
160-
return pa_table.to_pandas(types_mapper=arrow_string_types_mapper())
161-
else:
162-
raise NotImplementedError
149+
return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)

pandas/io/json/_json.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
from pandas.core.dtypes.dtypes import PeriodDtype
3737

3838
from pandas import (
39-
ArrowDtype,
4039
DataFrame,
4140
Index,
4241
MultiIndex,
@@ -48,6 +47,7 @@
4847
from pandas.core.reshape.concat import concat
4948
from pandas.core.shared_docs import _shared_docs
5049

50+
from pandas.io._util import arrow_table_to_pandas
5151
from pandas.io.common import (
5252
IOHandles,
5353
dedup_names,
@@ -940,18 +940,7 @@ def read(self) -> DataFrame | Series:
940940
if self.engine == "pyarrow":
941941
pyarrow_json = import_optional_dependency("pyarrow.json")
942942
pa_table = pyarrow_json.read_json(self.data)
943-
944-
mapping: type[ArrowDtype] | None | Callable
945-
if self.dtype_backend == "pyarrow":
946-
mapping = ArrowDtype
947-
elif self.dtype_backend == "numpy_nullable":
948-
from pandas.io._util import _arrow_dtype_mapping
949-
950-
mapping = _arrow_dtype_mapping().get
951-
else:
952-
mapping = None
953-
954-
return pa_table.to_pandas(types_mapper=mapping)
943+
return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)
955944
elif self.engine == "ujson":
956945
if self.lines:
957946
if self.chunksize:

pandas/io/orc.py

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,13 @@
99
Literal,
1010
)
1111

12-
from pandas._config import using_string_dtype
13-
1412
from pandas._libs import lib
1513
from pandas.compat._optional import import_optional_dependency
1614
from pandas.util._validators import check_dtype_backend
1715

18-
import pandas as pd
1916
from pandas.core.indexes.api import default_index
2017

21-
from pandas.io._util import arrow_string_types_mapper
18+
from pandas.io._util import arrow_table_to_pandas
2219
from pandas.io.common import (
2320
get_handle,
2421
is_fsspec_url,
@@ -127,21 +124,7 @@ def read_orc(
127124
pa_table = orc.read_table(
128125
source=source, columns=columns, filesystem=filesystem, **kwargs
129126
)
130-
if dtype_backend is not lib.no_default:
131-
if dtype_backend == "pyarrow":
132-
df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
133-
else:
134-
from pandas.io._util import _arrow_dtype_mapping
135-
136-
mapping = _arrow_dtype_mapping()
137-
df = pa_table.to_pandas(types_mapper=mapping.get)
138-
return df
139-
else:
140-
if using_string_dtype():
141-
types_mapper = arrow_string_types_mapper()
142-
else:
143-
types_mapper = None
144-
return pa_table.to_pandas(types_mapper=types_mapper)
127+
return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
145128

146129

147130
def to_orc(

0 commit comments

Comments (0)