Commit e156770

Merge remote-tracking branch 'upstream/main' into string-dtype-tests-frame-replace-fillna
2 parents f0fa390 + fba5f08 commit e156770

File tree

10 files changed (+58, -81 lines)


pandas/core/arrays/arrow/array.py

Lines changed: 5 additions & 1 deletion
@@ -1644,7 +1644,11 @@ def _accumulate(
             else:
                 data_to_accum = data_to_accum.cast(pa.int64())
 
-        result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
+        try:
+            result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
+        except pa.ArrowNotImplementedError as err:
+            msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
+            raise TypeError(msg) from err
 
         if convert_to_int:
             result = result.cast(pa_dtype)
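
With this change, an accumulation that pyarrow cannot perform surfaces as a pandas-level TypeError instead of leaking pa.lib.ArrowNotImplementedError. A minimal sketch of the new behaviour (the series and the exact message are illustrative; the wording depends on the dtype):

import pandas as pd

ser = pd.Series(["a", "b", "c"], dtype="string[pyarrow]")
try:
    ser.cumprod()  # pyarrow has no cumulative product kernel for strings
except TypeError as err:
    # e.g. "operation 'cumprod' not supported for dtype 'string'"
    print(err)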

pandas/tests/apply/test_invalid_arg.py

Lines changed: 10 additions & 20 deletions
@@ -218,18 +218,12 @@ def transform(row):
 def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string):
     # GH 21224
     if using_infer_string:
-        if df.dtypes.iloc[0].storage == "pyarrow":
-            import pyarrow as pa
-
-            # TODO(infer_string)
-            # should raise a proper TypeError instead of propagating the pyarrow error
-
-            expected = (expected, pa.lib.ArrowNotImplementedError)
-        else:
-            expected = (expected, NotImplementedError)
+        expected = (expected, NotImplementedError)
 
     msg = (
-        "can't multiply sequence by non-int of type 'str'|has no kernel|cannot perform"
+        "can't multiply sequence by non-int of type 'str'"
+        "|cannot perform cumprod with type str"  # NotImplementedError python backend
+        "|operation 'cumprod' not supported for dtype 'str'"  # TypeError pyarrow
     )
     warn = None if isinstance(func, str) else FutureWarning
     with pytest.raises(expected, match=msg):
@@ -259,16 +253,12 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri
     if func == "median" or func is np.nanmedian or func is np.median:
         msg = r"Cannot convert \['a' 'b' 'c'\] to numeric"
 
-    if using_infer_string:
-        if series.dtype.storage == "pyarrow":
-            import pyarrow as pa
-
-            # TODO(infer_string)
-            # should raise a proper TypeError instead of propagating the pyarrow error
-            expected = (expected, pa.lib.ArrowNotImplementedError)
-        else:
-            expected = (expected, NotImplementedError)
-        msg = msg + "|does not support|has no kernel|Cannot perform|cannot perform"
+    if using_infer_string and func == "cumprod":
+        expected = (expected, NotImplementedError)
+
+    msg = (
+        msg + "|does not support|has no kernel|Cannot perform|cannot perform|operation"
+    )
     warn = None if isinstance(func, str) else FutureWarning
 
     with pytest.raises(expected, match=msg):
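
The widened regex keeps a single pytest.raises covering all three paths: the object-dtype NumPy error, the NotImplementedError from the python string backend, and the new TypeError from the pyarrow backend. A self-contained sketch of the pattern (the frame is illustrative; which alternative matches depends on the active string backend):

import pandas as pd
import pytest

df = pd.DataFrame({"a": ["x", "y", "z"]})  # string data; dtype depends on infer_string

msg = (
    "can't multiply sequence by non-int of type 'str'"       # object dtype
    "|cannot perform cumprod with type str"                   # python backend, NotImplementedError
    "|operation 'cumprod' not supported for dtype 'str'"      # pyarrow backend, TypeError
)
with pytest.raises((TypeError, NotImplementedError), match=msg):
    df.cumprod()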

pandas/tests/apply/test_str.py

Lines changed: 8 additions & 5 deletions
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import WASM
 
 from pandas.core.dtypes.common import is_number
@@ -81,7 +79,6 @@ def test_apply_np_transformer(float_frame, op, how):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "series, func, expected",
     chain(
@@ -140,7 +137,6 @@ def test_agg_cython_table_series(series, func, expected):
     assert result == expected
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "series, func, expected",
     chain(
@@ -163,10 +159,17 @@ def test_agg_cython_table_series(series, func, expected):
         ),
     ),
 )
-def test_agg_cython_table_transform_series(series, func, expected):
+def test_agg_cython_table_transform_series(request, series, func, expected):
     # GH21224
     # test transforming functions in
     # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
+    if series.dtype == "string" and func == "cumsum":
+        request.applymarker(
+            pytest.mark.xfail(
+                raises=(TypeError, NotImplementedError),
+                reason="TODO(infer_string) cumsum not yet implemented for string",
+            )
+        )
     warn = None if isinstance(func, str) else FutureWarning
     with tm.assert_produces_warning(warn, match="is currently using Series.*"):
         result = series.agg(func)

pandas/tests/extension/test_arrow.py

Lines changed: 1 addition & 1 deletion
@@ -441,7 +441,7 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques
             request.applymarker(
                 pytest.mark.xfail(
                     reason=f"{all_numeric_accumulations} not implemented for {pa_type}",
-                    raises=NotImplementedError,
+                    raises=TypeError,
                 )
             )
 

pandas/tests/io/json/test_json_table_schema.py

Lines changed: 1 addition & 7 deletions
@@ -7,8 +7,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.core.dtypes.dtypes import (
     CategoricalDtype,
     DatetimeTZDtype,
@@ -27,10 +25,6 @@
     set_default_names,
 )
 
-pytestmark = pytest.mark.xfail(
-    using_string_dtype(), reason="TODO(infer_string)", strict=False
-)
-
 
 @pytest.fixture
 def df_schema():
@@ -127,7 +121,7 @@ def test_multiindex(self, df_schema, using_infer_string):
         expected["fields"][0] = {
             "name": "level_0",
             "type": "any",
-            "extDtype": "string",
+            "extDtype": "str",
         }
         expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"}
         assert result == expected
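
In the Table Schema output, columns using the future string dtype are now reported with extDtype "str" rather than "string". A hedged sketch, assuming the future.infer_string option is what enables the new dtype:

import pandas as pd
from pandas.io.json import build_table_schema

with pd.option_context("future.infer_string", True):
    df = pd.DataFrame({"B": ["a", "b"]})
    schema = build_table_schema(df, index=False)
    # the field is expected to look roughly like
    # {"name": "B", "type": "any", "extDtype": "str"}
    print(schema["fields"])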

pandas/tests/io/json/test_pandas.py

Lines changed: 4 additions & 10 deletions
@@ -84,7 +84,7 @@ def datetime_frame(self):
         # since that doesn't round-trip, see GH#33711
         df = DataFrame(
             np.random.default_rng(2).standard_normal((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=30, freq="B"),
         )
         df.index = df.index._with_freq(None)
@@ -184,7 +184,6 @@ def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame):
 
         assert_json_roundtrip_equal(result, expected, orient)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("dtype", [False, np.int64])
     @pytest.mark.parametrize("convert_axes", [True, False])
     def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame):
@@ -270,7 +269,6 @@ def test_roundtrip_empty(self, orient, convert_axes):
 
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("convert_axes", [True, False])
     def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame):
         # TODO: improve coverage with date_format parameter
@@ -698,7 +696,6 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string
 
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     @pytest.mark.parametrize("dtype", [False, None])
     def test_series_roundtrip_object(self, orient, dtype, object_series):
         data = StringIO(object_series.to_json(orient=orient))
@@ -710,6 +707,9 @@ def test_series_roundtrip_object(self, orient, dtype, object_series):
         if orient != "split":
             expected.name = None
 
+        if using_string_dtype():
+            expected = expected.astype("str")
+
         tm.assert_series_equal(result, expected)
 
     def test_series_roundtrip_empty(self, orient):
@@ -808,7 +808,6 @@ def test_path(self, float_frame, int_frame, datetime_frame):
             df.to_json(path)
             read_json(path)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_axis_dates(self, datetime_series, datetime_frame):
         # frame
         json = StringIO(datetime_frame.to_json())
@@ -821,7 +820,6 @@ def test_axis_dates(self, datetime_series, datetime_frame):
         tm.assert_series_equal(result, datetime_series, check_names=False)
         assert result.name is None
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_convert_dates(self, datetime_series, datetime_frame):
         # frame
         df = datetime_frame
@@ -912,7 +910,6 @@ def test_convert_dates_infer(self, infer_word):
         result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]]
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     @pytest.mark.parametrize(
         "date,date_unit",
         [
@@ -973,7 +970,6 @@ def test_date_format_series_raises(self, datetime_series):
         with pytest.raises(ValueError, match=msg):
             ts.to_json(date_format="iso", date_unit="foo")
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
    def test_date_unit(self, unit, datetime_frame):
        df = datetime_frame
        df["date"] = Timestamp("20130101 20:43:42").as_unit("ns")
@@ -1114,7 +1110,6 @@ def test_round_trip_exception(self, datapath):
         res = res.fillna(np.nan)
         tm.assert_frame_equal(res, df)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.network
     @pytest.mark.single_cpu
     @pytest.mark.parametrize(
@@ -1555,7 +1550,6 @@ def test_data_frame_size_after_to_json(self):
 
         assert size_before == size_after
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize(
         "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]]
     )
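
The added astype("str") reflects that, with the future string dtype enabled, object-dtype string values written to JSON come back as "str" rather than object, so the expected series has to be cast before comparison. A small hedged illustration (option name assumed to be future.infer_string):

from io import StringIO
import pandas as pd

ser = pd.Series(["x", "y"], dtype=object)
payload = StringIO(ser.to_json(orient="index"))
with pd.option_context("future.infer_string", True):
    result = pd.read_json(payload, typ="series", orient="index")
    print(result.dtype)  # expected: str, not object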

pandas/tests/io/test_clipboard.py

Lines changed: 6 additions & 7 deletions
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import (
     PyperclipException,
     PyperclipWindowsException,
@@ -26,10 +24,6 @@
     init_qt_clipboard,
 )
 
-pytestmark = pytest.mark.xfail(
-    using_string_dtype(), reason="TODO(infer_string)", strict=False
-)
-
 
 def build_kwargs(sep, excel):
     kwargs = {}
@@ -351,7 +345,7 @@ def test_raw_roundtrip(self, data):
 
     @pytest.mark.parametrize("engine", ["c", "python"])
     def test_read_clipboard_dtype_backend(
-        self, clipboard, string_storage, dtype_backend, engine
+        self, clipboard, string_storage, dtype_backend, engine, using_infer_string
     ):
         # GH#50502
         if dtype_backend == "pyarrow":
@@ -396,6 +390,11 @@ def test_read_clipboard_dtype_backend(
         )
         expected["g"] = ArrowExtensionArray(pa.array([None, None]))
 
+        if using_infer_string:
+            expected.columns = expected.columns.astype(
+                pd.StringDtype(string_storage, na_value=np.nan)
+            )
+
         tm.assert_frame_equal(result, expected)
 
     def test_invalid_dtype_backend(self):
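
Under infer_string, column labels read back from the clipboard come out as string dtype, so the expected frame's columns are cast to the StringDtype matching the active storage. A minimal sketch of that cast (the storage value is illustrative; assumes a pandas version where StringDtype accepts na_value):

import numpy as np
import pandas as pd

string_storage = "python"  # or "pyarrow"
expected = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
expected.columns = expected.columns.astype(
    pd.StringDtype(string_storage, na_value=np.nan)  # NaN-backed str dtype
)
print(expected.columns.dtype)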

pandas/tests/io/test_common.py

Lines changed: 15 additions & 18 deletions
@@ -140,7 +140,6 @@ def test_bytesiowrapper_returns_correct_bytes(self):
         assert result == data.encode("utf-8")
 
     # Test that pyarrow can handle a file opened with get_handle
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_get_handle_pyarrow_compat(self):
         pa_csv = pytest.importorskip("pyarrow.csv")
 
@@ -155,6 +154,8 @@ def test_get_handle_pyarrow_compat(self):
         s = StringIO(data)
         with icom.get_handle(s, "rb", is_text=False) as handles:
             df = pa_csv.read_csv(handles.handle).to_pandas()
+            # TODO will have to update this when pyarrow' to_pandas() is fixed
+            expected = expected.astype("object")
         tm.assert_frame_equal(df, expected)
         assert not s.closed
 
@@ -338,7 +339,6 @@ def test_read_fspath_all(self, reader, module, path, datapath):
             ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"),
         ],
     )
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_write_fspath_all(self, writer_name, writer_kwargs, module):
         if writer_name in ["to_latex"]:  # uses Styler implementation
             pytest.importorskip("jinja2")
@@ -365,7 +365,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module):
             expected = f_path.read()
             assert result == expected
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support")
     def test_write_fspath_hdf5(self):
         # Same test as write_fspath_all, except HDF5 files aren't
         # necessarily byte-for-byte identical for a given dataframe, so we'll
@@ -438,14 +438,13 @@ def test_unknown_engine(self):
         with tm.ensure_clean() as path:
             df = pd.DataFrame(
                 1.1 * np.arange(120).reshape((30, 4)),
-                columns=pd.Index(list("ABCD"), dtype=object),
-                index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+                columns=pd.Index(list("ABCD")),
+                index=pd.Index([f"i-{i}" for i in range(30)]),
             )
             df.to_csv(path)
             with pytest.raises(ValueError, match="Unknown engine"):
                 pd.read_csv(path, engine="pyt")
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_binary_mode(self):
         """
         'encoding' shouldn't be passed to 'open' in binary mode.
@@ -455,8 +454,8 @@ def test_binary_mode(self):
         with tm.ensure_clean() as path:
             df = pd.DataFrame(
                 1.1 * np.arange(120).reshape((30, 4)),
-                columns=pd.Index(list("ABCD"), dtype=object),
-                index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+                columns=pd.Index(list("ABCD")),
+                index=pd.Index([f"i-{i}" for i in range(30)]),
             )
             df.to_csv(path, mode="w+b")
             tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
@@ -473,8 +472,8 @@ def test_warning_missing_utf_bom(self, encoding, compression_):
         """
         df = pd.DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=pd.Index(list("ABCD"), dtype=object),
-            index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
         )
         with tm.ensure_clean() as path:
             with tm.assert_produces_warning(UnicodeWarning, match="byte order mark"):
@@ -504,15 +503,14 @@ def test_is_fsspec_url():
     assert icom.is_fsspec_url("RFC-3986+compliant.spec://something")
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize("encoding", [None, "utf-8"])
 @pytest.mark.parametrize("format", ["csv", "json"])
 def test_codecs_encoding(encoding, format):
     # GH39247
     expected = pd.DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=pd.Index(list("ABCD"), dtype=object),
-        index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
     )
     with tm.ensure_clean() as path:
         with codecs.open(path, mode="w", encoding=encoding) as handle:
@@ -525,13 +523,12 @@ def test_codecs_encoding(encoding, format):
     tm.assert_frame_equal(expected, df)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_codecs_get_writer_reader():
     # GH39247
     expected = pd.DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=pd.Index(list("ABCD"), dtype=object),
-        index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
    )
    with tm.ensure_clean() as path:
        with open(path, "wb") as handle:
@@ -556,8 +553,8 @@ def test_explicit_encoding(io_class, mode, msg):
     # wrong mode is requested
     expected = pd.DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=pd.Index(list("ABCD"), dtype=object),
-        index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
     )
     with io_class() as buffer:
         with pytest.raises(TypeError, match=msg):
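
Dropping the explicit dtype=object from these helper indexes lets the tests follow whichever inference mode is active: with future.infer_string enabled, a list of strings now produces a str-dtype Index instead of an object one. A small illustration:

import pandas as pd

with pd.option_context("future.infer_string", True):
    print(pd.Index(list("ABCD")).dtype)   # str
with pd.option_context("future.infer_string", False):
    print(pd.Index(list("ABCD")).dtype)   # object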
