Skip to content

Commit 2a4aed2

Browse files
Merge remote-tracking branch 'upstream/main' into string-dtype-index-engine
2 parents a669d75 + 54c88a2 commit 2a4aed2

File tree

18 files changed

+184
-206
lines changed

18 files changed

+184
-206
lines changed

ci/code_checks.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
8484
-i "pandas.Timestamp.resolution PR02" \
8585
-i "pandas.Timestamp.tzinfo GL08" \
8686
-i "pandas.api.types.is_re_compilable PR07,SA01" \
87-
-i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
8887
-i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
8988
-i "pandas.arrays.IntegerArray SA01" \
9089
-i "pandas.arrays.IntervalArray.length SA01" \

doc/source/whatsnew/v2.3.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ Interval
118118

119119
Indexing
120120
^^^^^^^^
121-
-
121+
- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
122122
-
123123

124124
Missing

pandas/core/array_algos/replace.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,4 +151,6 @@ def re_replacer(s):
151151
if mask is None:
152152
values[:] = f(values)
153153
else:
154+
if values.ndim != mask.ndim:
155+
mask = np.broadcast_to(mask, values.shape)
154156
values[mask] = f(values[mask])

pandas/core/dtypes/common.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1785,16 +1785,22 @@ def pandas_dtype(dtype) -> DtypeObj:
17851785
17861786
Parameters
17871787
----------
1788-
dtype : object to be converted
1788+
dtype : object
1789+
The object to be converted into a dtype.
17891790
17901791
Returns
17911792
-------
17921793
np.dtype or a pandas dtype
1794+
The converted dtype, which can be either a numpy dtype or a pandas dtype.
17931795
17941796
Raises
17951797
------
17961798
TypeError if not a dtype
17971799
1800+
See Also
1801+
--------
1802+
api.types.is_dtype : Return true if the condition is satisfied for the arr_or_dtype.
1803+
17981804
Examples
17991805
--------
18001806
>>> pd.api.types.pandas_dtype(int)

pandas/core/indexes/base.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6560,7 +6560,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index:
65606560
"""
65616561
Analogue to maybe_cast_indexer for get_indexer instead of get_loc.
65626562
"""
6563-
return ensure_index(target)
6563+
target_index = ensure_index(target)
6564+
if (
6565+
not hasattr(target, "dtype")
6566+
and self.dtype == object
6567+
and target_index.dtype == "string"
6568+
):
6569+
# If we started with a list-like, avoid inference to string dtype if self
6570+
# is object dtype (coercing to string dtype will alter the missing values)
6571+
target_index = Index(target, dtype=self.dtype)
6572+
return target_index
65646573

65656574
@final
65666575
def _validate_indexer(

pandas/core/internals/blocks.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1688,6 +1688,13 @@ def where(self, other, cond) -> list[Block]:
16881688
if isinstance(self.dtype, (IntervalDtype, StringDtype)):
16891689
# TestSetitemFloatIntervalWithIntIntervalValues
16901690
blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False)
1691+
if (
1692+
self.ndim == 2
1693+
and isinstance(orig_cond, np.ndarray)
1694+
and orig_cond.ndim == 1
1695+
and not is_1d_only_ea_dtype(blk.dtype)
1696+
):
1697+
orig_cond = orig_cond[:, None]
16911698
return blk.where(orig_other, orig_cond)
16921699

16931700
elif isinstance(self, NDArrayBackedExtensionBlock):

pandas/io/_util.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
from __future__ import annotations
22

3-
from typing import TYPE_CHECKING
3+
from typing import (
4+
TYPE_CHECKING,
5+
Literal,
6+
)
47

58
import numpy as np
69

10+
from pandas._config import using_string_dtype
11+
12+
from pandas._libs import lib
713
from pandas.compat import pa_version_under18p0
814
from pandas.compat._optional import import_optional_dependency
915

@@ -12,6 +18,10 @@
1218
if TYPE_CHECKING:
1319
from collections.abc import Callable
1420

21+
import pyarrow
22+
23+
from pandas._typing import DtypeBackend
24+
1525

1626
def _arrow_dtype_mapping() -> dict:
1727
pa = import_optional_dependency("pyarrow")
@@ -33,7 +43,7 @@ def _arrow_dtype_mapping() -> dict:
3343
}
3444

3545

36-
def arrow_string_types_mapper() -> Callable:
46+
def _arrow_string_types_mapper() -> Callable:
3747
pa = import_optional_dependency("pyarrow")
3848

3949
mapping = {
@@ -44,3 +54,31 @@ def arrow_string_types_mapper() -> Callable:
4454
mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan)
4555

4656
return mapping.get
57+
58+
59+
def arrow_table_to_pandas(
60+
table: pyarrow.Table,
61+
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
62+
null_to_int64: bool = False,
63+
) -> pd.DataFrame:
64+
pa = import_optional_dependency("pyarrow")
65+
66+
types_mapper: type[pd.ArrowDtype] | None | Callable
67+
if dtype_backend == "numpy_nullable":
68+
mapping = _arrow_dtype_mapping()
69+
if null_to_int64:
70+
# Modify the default mapping to also map null to Int64
71+
# (to match other engines - only for CSV parser)
72+
mapping[pa.null()] = pd.Int64Dtype()
73+
types_mapper = mapping.get
74+
elif dtype_backend == "pyarrow":
75+
types_mapper = pd.ArrowDtype
76+
elif using_string_dtype():
77+
types_mapper = _arrow_string_types_mapper()
78+
elif dtype_backend is lib.no_default or dtype_backend == "numpy":
79+
types_mapper = None
80+
else:
81+
raise NotImplementedError
82+
83+
df = table.to_pandas(types_mapper=types_mapper)
84+
return df

pandas/io/feather_format.py

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,10 @@
1515
from pandas.util._decorators import doc
1616
from pandas.util._validators import check_dtype_backend
1717

18-
import pandas as pd
1918
from pandas.core.api import DataFrame
2019
from pandas.core.shared_docs import _shared_docs
2120

22-
from pandas.io._util import arrow_string_types_mapper
21+
from pandas.io._util import arrow_table_to_pandas
2322
from pandas.io.common import get_handle
2423

2524
if TYPE_CHECKING:
@@ -147,16 +146,4 @@ def read_feather(
147146
pa_table = feather.read_table(
148147
handles.handle, columns=columns, use_threads=bool(use_threads)
149148
)
150-
151-
if dtype_backend == "numpy_nullable":
152-
from pandas.io._util import _arrow_dtype_mapping
153-
154-
return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get)
155-
156-
elif dtype_backend == "pyarrow":
157-
return pa_table.to_pandas(types_mapper=pd.ArrowDtype)
158-
159-
elif using_string_dtype():
160-
return pa_table.to_pandas(types_mapper=arrow_string_types_mapper())
161-
else:
162-
raise NotImplementedError
149+
return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)

pandas/io/json/_json.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
from pandas.core.dtypes.dtypes import PeriodDtype
3737

3838
from pandas import (
39-
ArrowDtype,
4039
DataFrame,
4140
Index,
4241
MultiIndex,
@@ -48,6 +47,7 @@
4847
from pandas.core.reshape.concat import concat
4948
from pandas.core.shared_docs import _shared_docs
5049

50+
from pandas.io._util import arrow_table_to_pandas
5151
from pandas.io.common import (
5252
IOHandles,
5353
dedup_names,
@@ -940,18 +940,7 @@ def read(self) -> DataFrame | Series:
940940
if self.engine == "pyarrow":
941941
pyarrow_json = import_optional_dependency("pyarrow.json")
942942
pa_table = pyarrow_json.read_json(self.data)
943-
944-
mapping: type[ArrowDtype] | None | Callable
945-
if self.dtype_backend == "pyarrow":
946-
mapping = ArrowDtype
947-
elif self.dtype_backend == "numpy_nullable":
948-
from pandas.io._util import _arrow_dtype_mapping
949-
950-
mapping = _arrow_dtype_mapping().get
951-
else:
952-
mapping = None
953-
954-
return pa_table.to_pandas(types_mapper=mapping)
943+
return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)
955944
elif self.engine == "ujson":
956945
if self.lines:
957946
if self.chunksize:

pandas/io/orc.py

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,13 @@
99
Literal,
1010
)
1111

12-
from pandas._config import using_string_dtype
13-
1412
from pandas._libs import lib
1513
from pandas.compat._optional import import_optional_dependency
1614
from pandas.util._validators import check_dtype_backend
1715

18-
import pandas as pd
1916
from pandas.core.indexes.api import default_index
2017

21-
from pandas.io._util import arrow_string_types_mapper
18+
from pandas.io._util import arrow_table_to_pandas
2219
from pandas.io.common import (
2320
get_handle,
2421
is_fsspec_url,
@@ -127,21 +124,7 @@ def read_orc(
127124
pa_table = orc.read_table(
128125
source=source, columns=columns, filesystem=filesystem, **kwargs
129126
)
130-
if dtype_backend is not lib.no_default:
131-
if dtype_backend == "pyarrow":
132-
df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
133-
else:
134-
from pandas.io._util import _arrow_dtype_mapping
135-
136-
mapping = _arrow_dtype_mapping()
137-
df = pa_table.to_pandas(types_mapper=mapping.get)
138-
return df
139-
else:
140-
if using_string_dtype():
141-
types_mapper = arrow_string_types_mapper()
142-
else:
143-
types_mapper = None
144-
return pa_table.to_pandas(types_mapper=types_mapper)
127+
return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
145128

146129

147130
def to_orc(

0 commit comments

Comments (0)