diff --git a/bigframes/series.py b/bigframes/series.py index e11c60a999..64a986d1fa 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -2303,7 +2303,7 @@ def to_dict( *, allow_large_results: Optional[bool] = None, ) -> typing.Mapping: - return typing.cast(dict, self.to_pandas(allow_large_results=allow_large_results).to_dict(into)) # type: ignore + return typing.cast(dict, self.to_pandas(allow_large_results=allow_large_results).to_dict(into=into)) # type: ignore def to_excel( self, excel_writer, sheet_name="Sheet1", *, allow_large_results=None, **kwargs diff --git a/bigframes/testing/utils.py b/bigframes/testing/utils.py index ae93c00464..6679f53b2c 100644 --- a/bigframes/testing/utils.py +++ b/bigframes/testing/utils.py @@ -14,6 +14,7 @@ import base64 import decimal +import re from typing import Iterable, Optional, Sequence, Set, Union import geopandas as gpd # type: ignore @@ -69,6 +70,12 @@ ] +def pandas_major_version() -> int: + match = re.search(r"^v?(\d+)", pd.__version__.strip()) + assert match is not None + return int(match.group(1)) + + # Prefer this function for tests that run in both ordered and unordered mode def assert_dfs_equivalent(pd_df: pd.DataFrame, bf_df: bpd.DataFrame, **kwargs): bf_df_local = bf_df.to_pandas() @@ -83,7 +90,7 @@ def assert_series_equivalent(pd_series: pd.Series, bf_series: bpd.Series, **kwar def _normalize_all_nulls(col: pd.Series) -> pd.Series: - if col.dtype == bigframes.dtypes.FLOAT_DTYPE: + if col.dtype in (bigframes.dtypes.FLOAT_DTYPE, bigframes.dtypes.INT_DTYPE): col = col.astype("float64") if pd_types.is_object_dtype(col): col = col.fillna(float("nan")) @@ -134,6 +141,15 @@ def assert_series_equal( left = left.sort_index() right = right.sort_index() + if isinstance(left.index, pd.RangeIndex) or pd_types.is_integer_dtype( + left.index.dtype, + ): + left.index = left.index.astype("Int64") + if isinstance(right.index, pd.RangeIndex) or pd_types.is_integer_dtype( + right.index.dtype, + ): + right.index = 
right.index.astype("Int64") + if nulls_are_nan: left = _normalize_all_nulls(left) right = _normalize_all_nulls(right) diff --git a/tests/unit/core/test_groupby.py b/tests/unit/core/test_groupby.py index f3d9218123..4bef581b2f 100644 --- a/tests/unit/core/test_groupby.py +++ b/tests/unit/core/test_groupby.py @@ -18,6 +18,7 @@ import bigframes.core.utils as utils import bigframes.pandas as bpd +from bigframes.testing.utils import assert_series_equal pytest.importorskip("polars") pytest.importorskip("pandas", minversion="2.0.0") @@ -217,7 +218,7 @@ def test_groupby_series_iter_by_series(polars_session): bf_result = bf_group_series.to_pandas() pd_key, pd_result = pd_group assert bf_key == pd_key - pandas.testing.assert_series_equal( + assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -236,7 +237,7 @@ def test_groupby_series_iter_by_series_list_one_item(polars_session): bf_result = bf_group_series.to_pandas() pd_key, pd_result = pd_group assert bf_key == pd_key - pandas.testing.assert_series_equal( + assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -258,6 +259,6 @@ def test_groupby_series_iter_by_series_list_multiple(polars_session): bf_result = bf_group_series.to_pandas() pd_key, pd_result = pd_group assert bf_key == pd_key - pandas.testing.assert_series_equal( + assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index b8d251c88e..1c73d9dc6b 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -593,8 +593,8 @@ def test_drop_bigframes_index_with_na(scalars_dfs): scalars_pandas_df = scalars_pandas_df.copy() scalars_df = scalars_df.set_index("bytes_col") scalars_pandas_df = scalars_pandas_df.set_index("bytes_col") - drop_index = scalars_df.iloc[[3, 5]].index - drop_pandas_index = scalars_pandas_df.iloc[[3, 5]].index + drop_index = 
scalars_df.iloc[[2, 5]].index + drop_pandas_index = scalars_pandas_df.iloc[[2, 5]].index pd_result = scalars_pandas_df.drop(index=drop_pandas_index) # drop_pandas_index) bf_result = scalars_df.drop(index=drop_index).to_pandas() @@ -2682,9 +2682,10 @@ def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas() # pandas 3.0 does not automatically ffill anymore pd_result = scalars_pandas_df_index[col_names].ffill().pct_change(periods=periods) - pd.testing.assert_frame_equal( + assert_frame_equal( pd_result, bf_result, + nulls_are_nan=True, ) @@ -4297,8 +4298,13 @@ def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna): subset, normalize=normalize, ascending=ascending, dropna=dropna ) - pd.testing.assert_series_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False + assert_series_equal( + bf_result, + pd_result, + check_dtype=False, + check_index_type=False, + # different pandas versions inconsistent for tie-handling + ignore_order=True, ) diff --git a/tests/unit/test_local_engine.py b/tests/unit/test_local_engine.py index 8c8c2dcf0d..5f80e4928c 100644 --- a/tests/unit/test_local_engine.py +++ b/tests/unit/test_local_engine.py @@ -19,6 +19,7 @@ import bigframes import bigframes.pandas as bpd +from bigframes.testing.utils import assert_frame_equal, assert_series_equal pytest.importorskip("polars") pytest.importorskip("pandas", minversion="2.0.0") @@ -47,7 +48,7 @@ def test_polars_local_engine_series(polars_session: bigframes.Session): pd_series = pd.Series([1, 2, 3], dtype=bf_series.dtype) bf_result = bf_series.to_pandas() pd_result = pd_series - pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + assert_series_equal(bf_result, pd_result, check_index_type=False) def test_polars_local_engine_add( @@ -74,9 +75,9 @@ def test_polars_local_engine_filter(small_inline_frame: pd.DataFrame, polars_ses pd_df = 
small_inline_frame bf_df = bpd.DataFrame(pd_df, session=polars_session) - bf_result = bf_df.filter(bf_df["int2"] >= 1).to_pandas() - pd_result = pd_df.filter(pd_df["int2"] >= 1) # type: ignore - pandas.testing.assert_frame_equal(bf_result, pd_result) + bf_result = bf_df[bf_df["int2"] >= 1].to_pandas() + pd_result = pd_df[pd_df["int2"] >= 1] # type: ignore + assert_frame_equal(bf_result, pd_result) def test_polars_local_engine_series_rename_with_mapping(polars_session): @@ -88,7 +89,7 @@ def test_polars_local_engine_series_rename_with_mapping(polars_session): bf_result = bf_series.rename({1: 100, 2: 200, 3: 300}).to_pandas() pd_result = pd_series.rename({1: 100, 2: 200, 3: 300}) # pd default index is int64, bf is Int64 - pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + assert_series_equal(bf_result, pd_result, check_index_type=False) def test_polars_local_engine_series_rename_with_mapping_inplace(polars_session): @@ -103,7 +104,7 @@ def test_polars_local_engine_series_rename_with_mapping_inplace(polars_session): bf_result = bf_series.to_pandas() pd_result = pd_series # pd default index is int64, bf is Int64 - pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + assert_series_equal(bf_result, pd_result, check_index_type=False) def test_polars_local_engine_reset_index( @@ -129,11 +130,12 @@ def test_polars_local_engine_join_binop(polars_session): bf_result = (bf_df_1 + bf_df_2).to_pandas() pd_result = pd_df_1 + pd_df_2 # Sort since different join ordering - pandas.testing.assert_frame_equal( + assert_frame_equal( bf_result.sort_index(), pd_result.sort_index(), check_dtype=False, check_index_type=False, + nulls_are_nan=True, ) diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 9f1a247250..516a46d4dd 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -42,6 +42,7 @@ assert_series_equal, convert_pandas_dtypes, 
get_first_file_from_wildcard, + pandas_major_version, ) pytest.importorskip("polars") @@ -147,7 +148,7 @@ def test_series_construct_timestamps(): bf_result = series.Series(datetimes).to_pandas() pd_result = pd.Series(datetimes, dtype=pd.ArrowDtype(pa.timestamp("us"))) - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + assert_series_equal(bf_result, pd_result, check_index_type=False) def test_series_construct_copy_with_index(scalars_dfs): @@ -313,9 +314,7 @@ def test_series_construct_geodata(): series = bigframes.pandas.Series(pd_series) - pd.testing.assert_series_equal( - pd_series, series.to_pandas(), check_index_type=False - ) + assert_series_equal(pd_series, series.to_pandas(), check_index_type=False) @pytest.mark.parametrize( @@ -581,6 +580,8 @@ def test_series___getitem__(scalars_dfs, index_col, key): ), ) def test_series___getitem___with_int_key(scalars_dfs, key): + if pd.__version__.startswith("3."): + pytest.skip("pandas 3.0 dropped getitem with int key") col_name = "int64_too" index_col = "string_col" scalars_df, scalars_pandas_df = scalars_dfs @@ -835,7 +836,7 @@ def test_series_dropna(scalars_dfs, ignore_index): col_name = "string_col" bf_result = scalars_df[col_name].dropna(ignore_index=ignore_index).to_pandas() pd_result = scalars_pandas_df[col_name].dropna(ignore_index=ignore_index) - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + assert_series_equal(pd_result, bf_result, check_index_type=False) @pytest.mark.parametrize( @@ -1179,7 +1180,7 @@ def test_mods(scalars_dfs, col_x, col_y, method): else: bf_result = bf_series.astype("Float64").to_pandas() pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y]) - pd.testing.assert_series_equal(pd_result, bf_result) + assert_series_equal(pd_result, bf_result, nulls_are_nan=True) # We work around a pandas bug that doesn't handle correlating nullable dtypes by doing this @@ -1879,6 +1880,10 @@ def 
test_series_binop_w_other_types(scalars_dfs, other): bf_result = (scalars_df["int64_col"].head(3) + other).to_pandas() pd_result = scalars_pandas_df["int64_col"].head(3) + other + if isinstance(other, pd.Series): + # pandas 3.0 preserves series name; bigframes and earlier pandas do not + pd_result.index.name = bf_result.index.name + assert_series_equal( bf_result, pd_result, @@ -3962,7 +3967,7 @@ def test_string_astype_date(): pd_result = pd_series.astype("date32[day][pyarrow]") # type: ignore bf_result = bf_series.astype("date32[day][pyarrow]").to_pandas() - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + assert_series_equal(bf_result, pd_result, check_index_type=False) def test_string_astype_datetime(): @@ -3975,7 +3980,7 @@ def test_string_astype_datetime(): pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us"))) bf_result = bf_series.astype(pd.ArrowDtype(pa.timestamp("us"))).to_pandas() - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + assert_series_equal(bf_result, pd_result, check_index_type=False) def test_string_astype_timestamp(): @@ -3994,7 +3999,7 @@ def test_string_astype_timestamp(): pd.ArrowDtype(pa.timestamp("us", tz="UTC")) ).to_pandas() - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + assert_series_equal(bf_result, pd_result, check_index_type=False) @pytest.mark.skip(reason="AssertionError: Series are different") @@ -4615,7 +4620,7 @@ def test_apply_lambda(scalars_dfs, col, lambda_): bf_result = bf_col.apply(lambda_, by_row=False).to_pandas() pd_col = scalars_pandas_df[col] - if pd.__version__[:3] in ("2.2", "2.3"): + if pd.__version__[:3] in ("2.2", "2.3") or pandas_major_version() >= 3: pd_result = pd_col.apply(lambda_, by_row=False) else: pd_result = pd_col.apply(lambda_) @@ -4623,7 +4628,12 @@ def test_apply_lambda(scalars_dfs, col, lambda_): # ignore dtype check, which are Int64 and object respectively # Some columns implicitly convert to floating 
point. Use check_exact=False to ensure we're "close enough" assert_series_equal( - bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001 + bf_result, + pd_result, + check_dtype=False, + check_exact=False, + rtol=0.001, + nulls_are_nan=True, ) @@ -4805,7 +4815,12 @@ def foo(x): # ignore dtype check, which are Int64 and object respectively # Some columns implicitly convert to floating point. Use check_exact=False to ensure we're "close enough" assert_series_equal( - bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001 + bf_result, + pd_result, + check_dtype=False, + check_exact=False, + rtol=0.001, + nulls_are_nan=True, ) @@ -4924,7 +4939,7 @@ def test_series_explode_w_index(index, ignore_index): s = bigframes.pandas.Series(data, index=index) pd_s = pd.Series(data, index=index) # TODO(b/340885567): fix type error - pd.testing.assert_series_equal( + assert_series_equal( s.explode(ignore_index=ignore_index).to_pandas(), # type: ignore pd_s.explode(ignore_index=ignore_index).astype(pd.Float64Dtype()), # type: ignore check_index_type=False,