Skip to content

Commit 0995055

Browse files
TST (string dtype): resolve xfails for frame fillna and replace tests + fix bug in replace for string
1 parent 73da90c commit 0995055

File tree

3 files changed

+78
-80
lines changed

3 files changed

+78
-80
lines changed

pandas/core/array_algos/replace.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -151,4 +151,6 @@ def re_replacer(s):
151151
if mask is None:
152152
values[:] = f(values)
153153
else:
154+
if values.ndim != mask.ndim:
155+
mask = np.broadcast_to(mask, values.shape)
154156
values[mask] = f(values[mask])

pandas/tests/frame/methods/test_fillna.py

Lines changed: 27 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
from pandas import (
75
Categorical,
86
DataFrame,
@@ -65,15 +63,25 @@ def test_fillna_datetime(self, datetime_frame):
6563
with pytest.raises(TypeError, match=msg):
6664
datetime_frame.fillna()
6765

68-
# TODO(infer_string) test as actual error instead of xfail
69-
@pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string")
70-
def test_fillna_mixed_type(self, float_string_frame):
66+
def test_fillna_mixed_type(self, float_string_frame, using_infer_string):
7167
mf = float_string_frame
7268
mf.loc[mf.index[5:20], "foo"] = np.nan
7369
mf.loc[mf.index[-10:], "A"] = np.nan
74-
# TODO: make stronger assertion here, GH 25640
75-
mf.fillna(value=0)
76-
mf.ffill()
70+
71+
result = mf.ffill()
72+
assert (
73+
result.loc[result.index[-10:], "A"] == result.loc[result.index[-11], "A"]
74+
).all()
75+
assert (result.loc[result.index[5:20], "foo"] == "bar").all()
76+
77+
if using_infer_string:
78+
with pytest.raises(TypeError, match="Invalid value"):
79+
mf.fillna(value=0)
80+
81+
mf["foo"] = mf["foo"].astype("object")
82+
result = mf.fillna(value=0)
83+
assert (result.loc[result.index[-10:], "A"] == 0).all()
84+
assert (result.loc[result.index[5:20], "foo"] == 0).all()
7785

7886
def test_fillna_mixed_float(self, mixed_float_frame):
7987
# mixed numeric (but no float16)
@@ -84,28 +92,21 @@ def test_fillna_mixed_float(self, mixed_float_frame):
8492
result = mf.ffill()
8593
_check_mixed_float(result, dtype={"C": None})
8694

87-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
88-
def test_fillna_different_dtype(self, using_infer_string):
95+
def test_fillna_different_dtype(self):
8996
# with different dtype (GH#3386)
9097
df = DataFrame(
9198
[["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
9299
)
93100

94-
if using_infer_string:
95-
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
96-
result = df.fillna({2: "foo"})
97-
else:
98-
result = df.fillna({2: "foo"})
101+
result = df.fillna({2: "foo"})
99102
expected = DataFrame(
100103
[["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
101104
)
105+
# column is originally float (all-NaN) -> filling with string gives object dtype
106+
expected[2] = expected[2].astype("object")
102107
tm.assert_frame_equal(result, expected)
103108

104-
if using_infer_string:
105-
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
106-
return_value = df.fillna({2: "foo"}, inplace=True)
107-
else:
108-
return_value = df.fillna({2: "foo"}, inplace=True)
109+
return_value = df.fillna({2: "foo"}, inplace=True)
109110
tm.assert_frame_equal(df, expected)
110111
assert return_value is None
111112

@@ -276,8 +277,7 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns):
276277
expected["A"] = 0.0
277278
tm.assert_frame_equal(result, expected)
278279

279-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
280-
def test_fillna_dtype_conversion(self, using_infer_string):
280+
def test_fillna_dtype_conversion(self):
281281
# make sure that fillna on an empty frame works
282282
df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
283283
result = df.dtypes
@@ -292,7 +292,7 @@ def test_fillna_dtype_conversion(self, using_infer_string):
292292
# empty block
293293
df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
294294
result = df.fillna("nan")
295-
expected = DataFrame("nan", index=range(3), columns=["A", "B"])
295+
expected = DataFrame("nan", dtype="object", index=range(3), columns=["A", "B"])
296296
tm.assert_frame_equal(result, expected)
297297

298298
@pytest.mark.parametrize("val", ["", 1, np.nan, 1.0])
@@ -540,18 +540,10 @@ def test_fillna_col_reordering(self):
540540
filled = df.ffill()
541541
assert df.columns.tolist() == filled.columns.tolist()
542542

543-
# TODO(infer_string) test as actual error instead of xfail
544-
@pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string")
545-
def test_fill_corner(self, float_frame, float_string_frame):
546-
mf = float_string_frame
547-
mf.loc[mf.index[5:20], "foo"] = np.nan
548-
mf.loc[mf.index[-10:], "A"] = np.nan
549-
550-
filled = float_string_frame.fillna(value=0)
551-
assert (filled.loc[filled.index[5:20], "foo"] == 0).all()
552-
del float_string_frame["foo"]
553-
554-
float_frame.reindex(columns=[]).fillna(value=0)
543+
def test_fill_empty(self, float_frame):
544+
df = float_frame.reindex(columns=[])
545+
result = df.fillna(value=0)
546+
tm.assert_frame_equal(result, df)
555547

556548
def test_fillna_with_columns_and_limit(self):
557549
# GH40989

pandas/tests/frame/methods/test_replace.py

Lines changed: 49 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,6 @@
66
import numpy as np
77
import pytest
88

9-
from pandas._config import using_string_dtype
10-
119
import pandas as pd
1210
from pandas import (
1311
DataFrame,
@@ -30,7 +28,6 @@ def mix_abc() -> dict[str, list[float | str]]:
3028

3129

3230
class TestDataFrameReplace:
33-
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
3431
def test_replace_inplace(self, datetime_frame, float_string_frame):
3532
datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
3633
datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan
@@ -46,7 +43,9 @@ def test_replace_inplace(self, datetime_frame, float_string_frame):
4643
mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan
4744

4845
result = float_string_frame.replace(np.nan, 0)
49-
expected = float_string_frame.fillna(value=0)
46+
expected = float_string_frame.copy()
47+
expected["foo"] = expected["foo"].astype(object)
48+
expected = expected.fillna(value=0)
5049
tm.assert_frame_equal(result, expected)
5150

5251
tsframe = datetime_frame.copy()
@@ -291,22 +290,20 @@ def test_regex_replace_dict_nested_non_first_character(
291290
expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
292291
tm.assert_frame_equal(result, expected)
293292

294-
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
295293
def test_regex_replace_dict_nested_gh4115(self):
296-
df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2})
297-
expected = DataFrame(
298-
{"Type": Series([0, 1, 0, 0, 1], dtype=df.Type.dtype), "tmp": 2}
294+
df = DataFrame(
295+
{"Type": Series(["Q", "T", "Q", "Q", "T"], dtype=object), "tmp": 2}
299296
)
297+
expected = DataFrame({"Type": Series([0, 1, 0, 0, 1], dtype=object), "tmp": 2})
300298
result = df.replace({"Type": {"Q": 0, "T": 1}})
301299
tm.assert_frame_equal(result, expected)
302300

303-
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
304301
def test_regex_replace_list_to_scalar(self, mix_abc):
305302
df = DataFrame(mix_abc)
306303
expec = DataFrame(
307304
{
308305
"a": mix_abc["a"],
309-
"b": np.array([np.nan] * 4, dtype=object),
306+
"b": Series([np.nan] * 4, dtype="str"),
310307
"c": [np.nan, np.nan, np.nan, "d"],
311308
}
312309
)
@@ -326,7 +323,6 @@ def test_regex_replace_list_to_scalar(self, mix_abc):
326323
tm.assert_frame_equal(res2, expec)
327324
tm.assert_frame_equal(res3, expec)
328325

329-
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
330326
def test_regex_replace_str_to_numeric(self, mix_abc):
331327
# what happens when you try to replace a numeric value with a regex?
332328
df = DataFrame(mix_abc)
@@ -338,11 +334,12 @@ def test_regex_replace_str_to_numeric(self, mix_abc):
338334
return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True)
339335
assert return_value is None
340336
expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]})
337+
# TODO(infer_string)
338+
expec["c"] = expec["c"].astype(object)
341339
tm.assert_frame_equal(res, expec)
342340
tm.assert_frame_equal(res2, expec)
343341
tm.assert_frame_equal(res3, expec)
344342

345-
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
346343
def test_regex_replace_regex_list_to_numeric(self, mix_abc):
347344
df = DataFrame(mix_abc)
348345
res = df.replace([r"\s*\.\s*", "b"], 0, regex=True)
@@ -535,31 +532,37 @@ def test_replace_series_dict(self):
535532
result = df.replace(s, df.mean())
536533
tm.assert_frame_equal(result, expected)
537534

538-
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
539-
def test_replace_convert(self):
540-
# gh 3907
541-
df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]])
535+
def test_replace_convert(self, any_string_dtype):
536+
# gh 3907 (pandas >= 3.0 no longer converts dtypes)
537+
df = DataFrame(
538+
[["foo", "bar", "bah"], ["bar", "foo", "bah"]], dtype=any_string_dtype
539+
)
542540
m = {"foo": 1, "bar": 2, "bah": 3}
543541
rep = df.replace(m)
544-
expec = df.dtypes
545-
res = rep.dtypes
546-
tm.assert_series_equal(expec, res)
542+
assert (rep.dtypes == object).all()
547543

548-
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
549544
def test_replace_mixed(self, float_string_frame):
550545
mf = float_string_frame
551546
mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan
552547
mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan
553548

554549
result = float_string_frame.replace(np.nan, -18)
555-
expected = float_string_frame.fillna(value=-18)
550+
expected = float_string_frame.copy()
551+
expected["foo"] = expected["foo"].astype(object)
552+
expected = expected.fillna(value=-18)
556553
tm.assert_frame_equal(result, expected)
557-
tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame)
554+
expected2 = float_string_frame.copy()
555+
expected2["foo"] = expected2["foo"].astype(object)
556+
tm.assert_frame_equal(result.replace(-18, np.nan), expected2)
558557

559558
result = float_string_frame.replace(np.nan, -1e8)
560-
expected = float_string_frame.fillna(value=-1e8)
559+
expected = float_string_frame.copy()
560+
expected["foo"] = expected["foo"].astype(object)
561+
expected = expected.fillna(value=-1e8)
561562
tm.assert_frame_equal(result, expected)
562-
tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame)
563+
expected2 = float_string_frame.copy()
564+
expected2["foo"] = expected2["foo"].astype(object)
565+
tm.assert_frame_equal(result.replace(-1e8, np.nan), expected2)
563566

564567
def test_replace_mixed_int_block_upcasting(self):
565568
# int block upcasting
@@ -601,8 +604,7 @@ def test_replace_mixed_int_block_splitting(self):
601604
result = df.replace(0, 0.5)
602605
tm.assert_frame_equal(result, expected)
603606

604-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
605-
def test_replace_mixed2(self, using_infer_string):
607+
def test_replace_mixed2(self):
606608
# to object block upcasting
607609
df = DataFrame(
608610
{
@@ -621,7 +623,7 @@ def test_replace_mixed2(self, using_infer_string):
621623

622624
expected = DataFrame(
623625
{
624-
"A": Series(["foo", "bar"]),
626+
"A": Series(["foo", "bar"], dtype="object"),
625627
"B": Series([0, "foo"], dtype="object"),
626628
}
627629
)
@@ -917,16 +919,16 @@ def test_replace_limit(self):
917919
# TODO
918920
pass
919921

920-
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
921-
def test_replace_dict_no_regex(self):
922+
def test_replace_dict_no_regex(self, any_string_dtype):
922923
answer = Series(
923924
{
924925
0: "Strongly Agree",
925926
1: "Agree",
926927
2: "Neutral",
927928
3: "Disagree",
928929
4: "Strongly Disagree",
929-
}
930+
},
931+
dtype=any_string_dtype,
930932
)
931933
weights = {
932934
"Agree": 4,
@@ -935,19 +937,20 @@ def test_replace_dict_no_regex(self):
935937
"Strongly Agree": 5,
936938
"Strongly Disagree": 1,
937939
}
938-
expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}, dtype=answer.dtype)
940+
expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}, dtype=object)
939941
result = answer.replace(weights)
940942
tm.assert_series_equal(result, expected)
941943

942-
def test_replace_series_no_regex(self):
944+
def test_replace_series_no_regex(self, any_string_dtype):
943945
answer = Series(
944946
{
945947
0: "Strongly Agree",
946948
1: "Agree",
947949
2: "Neutral",
948950
3: "Disagree",
949951
4: "Strongly Disagree",
950-
}
952+
},
953+
dtype=any_string_dtype,
951954
)
952955
weights = Series(
953956
{
@@ -1043,16 +1046,15 @@ def test_nested_dict_overlapping_keys_replace_str(self):
10431046
expected = df.replace({"a": dict(zip(astr, bstr))})
10441047
tm.assert_frame_equal(result, expected)
10451048

1046-
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
1047-
def test_replace_swapping_bug(self, using_infer_string):
1049+
def test_replace_swapping_bug(self):
10481050
df = DataFrame({"a": [True, False, True]})
10491051
res = df.replace({"a": {True: "Y", False: "N"}})
1050-
expect = DataFrame({"a": ["Y", "N", "Y"]})
1052+
expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object)
10511053
tm.assert_frame_equal(res, expect)
10521054

10531055
df = DataFrame({"a": [0, 1, 0]})
10541056
res = df.replace({"a": {0: "Y", 1: "N"}})
1055-
expect = DataFrame({"a": ["Y", "N", "Y"]})
1057+
expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object)
10561058
tm.assert_frame_equal(res, expect)
10571059

10581060
def test_replace_datetimetz(self):
@@ -1186,7 +1188,7 @@ def test_replace_commutative(self, df, to_replace, exp):
11861188
)
11871189
def test_replace_replacer_dtype(self, replacer):
11881190
# GH26632
1189-
df = DataFrame(["a"])
1191+
df = DataFrame(["a"], dtype=object)
11901192
result = df.replace({"a": replacer, "b": replacer})
11911193
expected = DataFrame([replacer], dtype=object)
11921194
tm.assert_frame_equal(result, expected)
@@ -1266,7 +1268,6 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data):
12661268
assert return_value is None
12671269
tm.assert_frame_equal(df, expected)
12681270

1269-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
12701271
def test_replace_value_category_type(self):
12711272
"""
12721273
Test for #23305: to ensure category dtypes are maintained
@@ -1322,7 +1323,7 @@ def test_replace_value_category_type(self):
13221323
lambda x: x.astype("category").cat.rename_categories({"cat2": "catX"})
13231324
)
13241325

1325-
result = result.astype({"col1": "int64", "col3": "float64", "col5": "object"})
1326+
result = result.astype({"col1": "int64", "col3": "float64", "col5": "str"})
13261327
tm.assert_frame_equal(result, expected)
13271328

13281329
def test_replace_dict_category_type(self):
@@ -1363,12 +1364,11 @@ def test_replace_with_compiled_regex(self):
13631364
expected = DataFrame(["z", "b", "c"])
13641365
tm.assert_frame_equal(result, expected)
13651366

1366-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
13671367
def test_replace_intervals(self):
13681368
# https://github.com/pandas-dev/pandas/issues/35931
13691369
df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]})
13701370
result = df.replace({"a": {pd.Interval(0, 1): "x"}})
1371-
expected = DataFrame({"a": ["x", "x"]})
1371+
expected = DataFrame({"a": ["x", "x"]}, dtype=object)
13721372
tm.assert_frame_equal(result, expected)
13731373

13741374
def test_replace_unicode(self):
@@ -1468,17 +1468,21 @@ def test_regex_replace_scalar(
14681468
expected.loc[expected["a"] == ".", "a"] = expected_replace_val
14691469
tm.assert_frame_equal(result, expected)
14701470

1471-
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
14721471
@pytest.mark.parametrize("regex", [False, True])
14731472
def test_replace_regex_dtype_frame(self, regex):
14741473
# GH-48644
14751474
df1 = DataFrame({"A": ["0"], "B": ["0"]})
1476-
expected_df1 = DataFrame({"A": [1], "B": [1]}, dtype=df1.dtypes.iloc[0])
1475+
expected_df1 = DataFrame({"A": [1], "B": [1]}, dtype=object)
14771476
result_df1 = df1.replace(to_replace="0", value=1, regex=regex)
14781477
tm.assert_frame_equal(result_df1, expected_df1)
14791478

14801479
df2 = DataFrame({"A": ["0"], "B": ["1"]})
1481-
expected_df2 = DataFrame({"A": [1], "B": ["1"]}, dtype=df2.dtypes.iloc[0])
1480+
if regex:
1481+
# TODO(infer_string): both string columns get cast to object,
1482+
# while only needed for column A
1483+
expected_df2 = DataFrame({"A": [1], "B": ["1"]}, dtype=object)
1484+
else:
1485+
expected_df2 = DataFrame({"A": Series([1], dtype=object), "B": ["1"]})
14821486
result_df2 = df2.replace(to_replace="0", value=1, regex=regex)
14831487
tm.assert_frame_equal(result_df2, expected_df2)
14841488

0 commit comments

Comments
 (0)