|
84 | 84 | ABCNumpyExtensionArray, |
85 | 85 | ABCSeries, |
86 | 86 | ) |
| 87 | +from pandas.core.dtypes.inference import is_re |
87 | 88 | from pandas.core.dtypes.missing import ( |
88 | 89 | is_valid_na_for_dtype, |
89 | 90 | isna, |
|
115 | 116 | PeriodArray, |
116 | 117 | TimedeltaArray, |
117 | 118 | ) |
| 119 | +from pandas.core.arrays.string_ import StringDtype |
118 | 120 | from pandas.core.base import PandasObject |
119 | 121 | import pandas.core.common as com |
120 | 122 | from pandas.core.computation import expressions |
@@ -476,7 +478,9 @@ def split_and_operate(self, func, *args, **kwargs) -> list[Block]: |
476 | 478 | # Up/Down-casting |
477 | 479 |
|
478 | 480 | @final |
479 | | - def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: |
| 481 | + def coerce_to_target_dtype( |
| 482 | + self, other, warn_on_upcast: bool = False, using_cow: bool = False |
| 483 | + ) -> Block: |
480 | 484 | """ |
481 | 485 | coerce the current block to a dtype compat for other |
482 | 486 | we will return a block, possibly object, and not raise |
@@ -528,7 +532,14 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: |
528 | 532 | f"{self.values.dtype}. Please report a bug at " |
529 | 533 | "https://github.com/pandas-dev/pandas/issues." |
530 | 534 | ) |
531 | | - return self.astype(new_dtype, copy=False) |
| 535 | + copy = False |
| 536 | + if ( |
| 537 | + not using_cow |
| 538 | + and isinstance(self.dtype, StringDtype) |
| 539 | + and self.dtype.storage == "python" |
| 540 | + ): |
| 541 | + copy = True |
| 542 | + return self.astype(new_dtype, copy=copy, using_cow=using_cow) |
532 | 543 |
|
533 | 544 | @final |
534 | 545 | def _maybe_downcast( |
@@ -879,7 +890,7 @@ def replace( |
879 | 890 | else: |
880 | 891 | return [self] if inplace else [self.copy()] |
881 | 892 |
|
882 | | - elif self._can_hold_element(value): |
| 893 | + elif self._can_hold_element(value) or (self.dtype == "string" and is_re(value)): |
883 | 894 | # TODO(CoW): Maybe split here as well into columns where mask has True |
884 | 895 | # and rest? |
885 | 896 | blk = self._maybe_copy(using_cow, inplace) |
@@ -926,12 +937,13 @@ def replace( |
926 | 937 | if value is None or value is NA: |
927 | 938 | blk = self.astype(np.dtype(object)) |
928 | 939 | else: |
929 | | - blk = self.coerce_to_target_dtype(value) |
| 940 | + blk = self.coerce_to_target_dtype(value, using_cow=using_cow) |
930 | 941 | return blk.replace( |
931 | 942 | to_replace=to_replace, |
932 | 943 | value=value, |
933 | 944 | inplace=True, |
934 | 945 | mask=mask, |
| 946 | + using_cow=using_cow, |
935 | 947 | ) |
936 | 948 |
|
937 | 949 | else: |
@@ -980,16 +992,26 @@ def _replace_regex( |
980 | 992 | ------- |
981 | 993 | List[Block] |
982 | 994 | """ |
983 | | - if not self._can_hold_element(to_replace): |
| 995 | + if not is_re(to_replace) and not self._can_hold_element(to_replace): |
984 | 996 | # i.e. only if self.is_object is True, but could in principle include a |
985 | 997 | # String ExtensionBlock |
986 | 998 | if using_cow: |
987 | 999 | return [self.copy(deep=False)] |
988 | 1000 | return [self] if inplace else [self.copy()] |
989 | 1001 |
|
990 | | - rx = re.compile(to_replace) |
| 1002 | + if is_re(to_replace) and self.dtype not in [object, "string"]: |
| 1003 | + # only object or string dtype can hold strings, and a regex object |
| 1004 | + # will only match strings |
| 1005 | + return [self.copy(deep=False)] |
991 | 1006 |
|
992 | | - block = self._maybe_copy(using_cow, inplace) |
| 1007 | + if not ( |
| 1008 | + self._can_hold_element(value) or (self.dtype == "string" and is_re(value)) |
| 1009 | + ): |
| 1010 | + block = self.astype(np.dtype(object)) |
| 1011 | + else: |
| 1012 | + block = self._maybe_copy(using_cow, inplace) |
| 1013 | + |
| 1014 | + rx = re.compile(to_replace) |
993 | 1015 |
|
994 | 1016 | replace_regex(block.values, rx, value, mask) |
995 | 1017 |
|
@@ -1048,7 +1070,9 @@ def replace_list( |
1048 | 1070 |
|
1049 | 1071 | # Exclude anything that we know we won't contain |
1050 | 1072 | pairs = [ |
1051 | | - (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) |
| 1073 | + (x, y) |
| 1074 | + for x, y in zip(src_list, dest_list) |
| 1075 | + if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x))) |
1052 | 1076 | ] |
1053 | 1077 | if not len(pairs): |
1054 | 1078 | if using_cow: |
@@ -1686,7 +1710,7 @@ def fillna( |
1686 | 1710 | return nbs |
1687 | 1711 |
|
1688 | 1712 | if limit is not None: |
1689 | | - mask[mask.cumsum(self.ndim - 1) > limit] = False |
| 1713 | + mask[mask.cumsum(self.values.ndim - 1) > limit] = False |
1690 | 1714 |
|
1691 | 1715 | if inplace: |
1692 | 1716 | nbs = self.putmask( |
@@ -2112,7 +2136,7 @@ def where( |
2112 | 2136 | res_values = arr._where(cond, other).T |
2113 | 2137 | except (ValueError, TypeError): |
2114 | 2138 | if self.ndim == 1 or self.shape[0] == 1: |
2115 | | - if isinstance(self.dtype, IntervalDtype): |
| 2139 | + if isinstance(self.dtype, (IntervalDtype, StringDtype)): |
2116 | 2140 | # TestSetitemFloatIntervalWithIntIntervalValues |
2117 | 2141 | blk = self.coerce_to_target_dtype(orig_other) |
2118 | 2142 | nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) |
@@ -2314,7 +2338,7 @@ def fillna( |
2314 | 2338 | using_cow: bool = False, |
2315 | 2339 | already_warned=None, |
2316 | 2340 | ) -> list[Block]: |
2317 | | - if isinstance(self.dtype, IntervalDtype): |
| 2341 | + if isinstance(self.dtype, (IntervalDtype, StringDtype)): |
2318 | 2342 | # Block.fillna handles coercion (test_fillna_interval) |
2319 | 2343 | return super().fillna( |
2320 | 2344 | value=value, |
|
0 commit comments