Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2303,7 +2303,7 @@ def to_dict(
*,
allow_large_results: Optional[bool] = None,
) -> typing.Mapping:
return typing.cast(dict, self.to_pandas(allow_large_results=allow_large_results).to_dict(into)) # type: ignore
return typing.cast(dict, self.to_pandas(allow_large_results=allow_large_results).to_dict(into=into)) # type: ignore

def to_excel(
self, excel_writer, sheet_name="Sheet1", *, allow_large_results=None, **kwargs
Expand Down
18 changes: 17 additions & 1 deletion bigframes/testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import base64
import decimal
import re
from typing import Iterable, Optional, Sequence, Set, Union

import geopandas as gpd # type: ignore
Expand Down Expand Up @@ -69,6 +70,12 @@
]


def pandas_major_version(version: Optional[str] = None) -> int:
    """Return the major version number of a pandas version string.

    Args:
        version: A version string such as ``"2.1.0"`` or ``"v3.0.0.dev0"``.
            Defaults to the installed ``pandas.__version__``.

    Returns:
        The leading major version as an ``int`` (e.g. ``2`` for ``"2.1.0"``).

    Raises:
        ValueError: If the string does not start with an optional ``v``
            followed by an integer.
    """
    if version is None:
        version = pd.__version__
    # Accept an optional leading "v" and ignore everything after the first
    # integer component (minor/patch/dev suffixes).
    match = re.search(r"^v?(\d+)", version.strip())
    if match is None:
        # Raise instead of assert: asserts are stripped under `python -O`,
        # and ValueError gives a actionable message for a malformed string.
        raise ValueError(f"could not parse pandas version string: {version!r}")
    return int(match.group(1))


# Prefer this function for tests that run in both ordered and unordered mode
def assert_dfs_equivalent(pd_df: pd.DataFrame, bf_df: bpd.DataFrame, **kwargs):
bf_df_local = bf_df.to_pandas()
Expand All @@ -83,7 +90,7 @@ def assert_series_equivalent(pd_series: pd.Series, bf_series: bpd.Series, **kwar


def _normalize_all_nulls(col: pd.Series) -> pd.Series:
if col.dtype == bigframes.dtypes.FLOAT_DTYPE:
if col.dtype in (bigframes.dtypes.FLOAT_DTYPE, bigframes.dtypes.INT_DTYPE):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[no action required] Pandas's mistreatment of the so-called nullable types does make me want to switch to pyarrow types sooner rather than later. Let's refresh our 3.0 plans soon.

col = col.astype("float64")
if pd_types.is_object_dtype(col):
col = col.fillna(float("nan"))
Expand Down Expand Up @@ -134,6 +141,15 @@ def assert_series_equal(
left = left.sort_index()
right = right.sort_index()

if isinstance(left.index, pd.RangeIndex) or pd_types.is_integer_dtype(
left.index.dtype,
):
left.index = left.index.astype("Int64")
if isinstance(right.index, pd.RangeIndex) or pd_types.is_integer_dtype(
right.index.dtype,
):
right.index = right.index.astype("Int64")

if nulls_are_nan:
left = _normalize_all_nulls(left)
right = _normalize_all_nulls(right)
Expand Down
7 changes: 4 additions & 3 deletions tests/unit/core/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import bigframes.core.utils as utils
import bigframes.pandas as bpd
from bigframes.testing.utils import assert_series_equal

pytest.importorskip("polars")
pytest.importorskip("pandas", minversion="2.0.0")
Expand Down Expand Up @@ -217,7 +218,7 @@ def test_groupby_series_iter_by_series(polars_session):
bf_result = bf_group_series.to_pandas()
pd_key, pd_result = pd_group
assert bf_key == pd_key
pandas.testing.assert_series_equal(
assert_series_equal(
bf_result, pd_result, check_dtype=False, check_index_type=False
)

Expand All @@ -236,7 +237,7 @@ def test_groupby_series_iter_by_series_list_one_item(polars_session):
bf_result = bf_group_series.to_pandas()
pd_key, pd_result = pd_group
assert bf_key == pd_key
pandas.testing.assert_series_equal(
assert_series_equal(
bf_result, pd_result, check_dtype=False, check_index_type=False
)

Expand All @@ -258,6 +259,6 @@ def test_groupby_series_iter_by_series_list_multiple(polars_session):
bf_result = bf_group_series.to_pandas()
pd_key, pd_result = pd_group
assert bf_key == pd_key
pandas.testing.assert_series_equal(
assert_series_equal(
bf_result, pd_result, check_dtype=False, check_index_type=False
)
16 changes: 11 additions & 5 deletions tests/unit/test_dataframe_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,8 +593,8 @@ def test_drop_bigframes_index_with_na(scalars_dfs):
scalars_pandas_df = scalars_pandas_df.copy()
scalars_df = scalars_df.set_index("bytes_col")
scalars_pandas_df = scalars_pandas_df.set_index("bytes_col")
drop_index = scalars_df.iloc[[3, 5]].index
drop_pandas_index = scalars_pandas_df.iloc[[3, 5]].index
drop_index = scalars_df.iloc[[2, 5]].index
drop_pandas_index = scalars_pandas_df.iloc[[2, 5]].index

pd_result = scalars_pandas_df.drop(index=drop_pandas_index) # drop_pandas_index)
bf_result = scalars_df.drop(index=drop_index).to_pandas()
Expand Down Expand Up @@ -2682,9 +2682,10 @@ def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods
bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas()
# pandas 3.0 does not automatically ffill anymore
pd_result = scalars_pandas_df_index[col_names].ffill().pct_change(periods=periods)
pd.testing.assert_frame_equal(
assert_frame_equal(
pd_result,
bf_result,
nulls_are_nan=True,
)


Expand Down Expand Up @@ -4297,8 +4298,13 @@ def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna):
subset, normalize=normalize, ascending=ascending, dropna=dropna
)

pd.testing.assert_series_equal(
bf_result, pd_result, check_dtype=False, check_index_type=False
assert_series_equal(
bf_result,
pd_result,
check_dtype=False,
check_index_type=False,
# different pandas versions inconsistent for tie-handling
ignore_order=True,
)


Expand Down
16 changes: 9 additions & 7 deletions tests/unit/test_local_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import bigframes
import bigframes.pandas as bpd
from bigframes.testing.utils import assert_frame_equal, assert_series_equal

pytest.importorskip("polars")
pytest.importorskip("pandas", minversion="2.0.0")
Expand Down Expand Up @@ -47,7 +48,7 @@ def test_polars_local_engine_series(polars_session: bigframes.Session):
pd_series = pd.Series([1, 2, 3], dtype=bf_series.dtype)
bf_result = bf_series.to_pandas()
pd_result = pd_series
pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
assert_series_equal(bf_result, pd_result, check_index_type=False)


def test_polars_local_engine_add(
Expand All @@ -74,9 +75,9 @@ def test_polars_local_engine_filter(small_inline_frame: pd.DataFrame, polars_ses
pd_df = small_inline_frame
bf_df = bpd.DataFrame(pd_df, session=polars_session)

bf_result = bf_df.filter(bf_df["int2"] >= 1).to_pandas()
pd_result = pd_df.filter(pd_df["int2"] >= 1) # type: ignore
pandas.testing.assert_frame_equal(bf_result, pd_result)
bf_result = bf_df[bf_df["int2"] >= 1].to_pandas()
pd_result = pd_df[pd_df["int2"] >= 1] # type: ignore
assert_frame_equal(bf_result, pd_result)


def test_polars_local_engine_series_rename_with_mapping(polars_session):
Expand All @@ -88,7 +89,7 @@ def test_polars_local_engine_series_rename_with_mapping(polars_session):
bf_result = bf_series.rename({1: 100, 2: 200, 3: 300}).to_pandas()
pd_result = pd_series.rename({1: 100, 2: 200, 3: 300})
# pd default index is int64, bf is Int64
pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
assert_series_equal(bf_result, pd_result, check_index_type=False)


def test_polars_local_engine_series_rename_with_mapping_inplace(polars_session):
Expand All @@ -103,7 +104,7 @@ def test_polars_local_engine_series_rename_with_mapping_inplace(polars_session):
bf_result = bf_series.to_pandas()
pd_result = pd_series
# pd default index is int64, bf is Int64
pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
assert_series_equal(bf_result, pd_result, check_index_type=False)


def test_polars_local_engine_reset_index(
Expand All @@ -129,11 +130,12 @@ def test_polars_local_engine_join_binop(polars_session):
bf_result = (bf_df_1 + bf_df_2).to_pandas()
pd_result = pd_df_1 + pd_df_2
# Sort since different join ordering
pandas.testing.assert_frame_equal(
assert_frame_equal(
bf_result.sort_index(),
pd_result.sort_index(),
check_dtype=False,
check_index_type=False,
nulls_are_nan=True,
)


Expand Down
41 changes: 28 additions & 13 deletions tests/unit/test_series_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
assert_series_equal,
convert_pandas_dtypes,
get_first_file_from_wildcard,
pandas_major_version,
)

pytest.importorskip("polars")
Expand Down Expand Up @@ -147,7 +148,7 @@ def test_series_construct_timestamps():
bf_result = series.Series(datetimes).to_pandas()
pd_result = pd.Series(datetimes, dtype=pd.ArrowDtype(pa.timestamp("us")))

pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
assert_series_equal(bf_result, pd_result, check_index_type=False)


def test_series_construct_copy_with_index(scalars_dfs):
Expand Down Expand Up @@ -313,9 +314,7 @@ def test_series_construct_geodata():

series = bigframes.pandas.Series(pd_series)

pd.testing.assert_series_equal(
pd_series, series.to_pandas(), check_index_type=False
)
assert_series_equal(pd_series, series.to_pandas(), check_index_type=False)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -581,6 +580,8 @@ def test_series___getitem__(scalars_dfs, index_col, key):
),
)
def test_series___getitem___with_int_key(scalars_dfs, key):
if pd.__version__.startswith("3."):
pytest.skip("pandas 3.0 dropped getitem with int key")
col_name = "int64_too"
index_col = "string_col"
scalars_df, scalars_pandas_df = scalars_dfs
Expand Down Expand Up @@ -835,7 +836,7 @@ def test_series_dropna(scalars_dfs, ignore_index):
col_name = "string_col"
bf_result = scalars_df[col_name].dropna(ignore_index=ignore_index).to_pandas()
pd_result = scalars_pandas_df[col_name].dropna(ignore_index=ignore_index)
pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False)
assert_series_equal(pd_result, bf_result, check_index_type=False)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -1179,7 +1180,7 @@ def test_mods(scalars_dfs, col_x, col_y, method):
else:
bf_result = bf_series.astype("Float64").to_pandas()
pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y])
pd.testing.assert_series_equal(pd_result, bf_result)
assert_series_equal(pd_result, bf_result, nulls_are_nan=True)


# We work around a pandas bug that doesn't handle correlating nullable dtypes by doing this
Expand Down Expand Up @@ -1879,6 +1880,10 @@ def test_series_binop_w_other_types(scalars_dfs, other):
bf_result = (scalars_df["int64_col"].head(3) + other).to_pandas()
pd_result = scalars_pandas_df["int64_col"].head(3) + other

if isinstance(other, pd.Series):
# pandas 3.0 preserves series name, bigframe, earlier pandas do not
pd_result.index.name = bf_result.index.name

assert_series_equal(
bf_result,
pd_result,
Expand Down Expand Up @@ -3962,7 +3967,7 @@ def test_string_astype_date():
pd_result = pd_series.astype("date32[day][pyarrow]") # type: ignore
bf_result = bf_series.astype("date32[day][pyarrow]").to_pandas()

pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
assert_series_equal(bf_result, pd_result, check_index_type=False)


def test_string_astype_datetime():
Expand All @@ -3975,7 +3980,7 @@ def test_string_astype_datetime():
pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us")))
bf_result = bf_series.astype(pd.ArrowDtype(pa.timestamp("us"))).to_pandas()

pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
assert_series_equal(bf_result, pd_result, check_index_type=False)


def test_string_astype_timestamp():
Expand All @@ -3994,7 +3999,7 @@ def test_string_astype_timestamp():
pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
).to_pandas()

pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
assert_series_equal(bf_result, pd_result, check_index_type=False)


@pytest.mark.skip(reason="AssertionError: Series are different")
Expand Down Expand Up @@ -4615,15 +4620,20 @@ def test_apply_lambda(scalars_dfs, col, lambda_):
bf_result = bf_col.apply(lambda_, by_row=False).to_pandas()

pd_col = scalars_pandas_df[col]
if pd.__version__[:3] in ("2.2", "2.3"):
if pd.__version__[:3] in ("2.2", "2.3") or pandas_major_version() >= 3:
pd_result = pd_col.apply(lambda_, by_row=False)
else:
pd_result = pd_col.apply(lambda_)

# ignore dtype check, which are Int64 and object respectively
# Some columns implicitly convert to floating point. Use check_exact=False to ensure we're "close enough"
assert_series_equal(
bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001
bf_result,
pd_result,
check_dtype=False,
check_exact=False,
rtol=0.001,
nulls_are_nan=True,
)


Expand Down Expand Up @@ -4805,7 +4815,12 @@ def foo(x):
# ignore dtype check, which are Int64 and object respectively
# Some columns implicitly convert to floating point. Use check_exact=False to ensure we're "close enough"
assert_series_equal(
bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001
bf_result,
pd_result,
check_dtype=False,
check_exact=False,
rtol=0.001,
nulls_are_nan=True,
)


Expand Down Expand Up @@ -4924,7 +4939,7 @@ def test_series_explode_w_index(index, ignore_index):
s = bigframes.pandas.Series(data, index=index)
pd_s = pd.Series(data, index=index)
# TODO(b/340885567): fix type error
pd.testing.assert_series_equal(
assert_series_equal(
s.explode(ignore_index=ignore_index).to_pandas(), # type: ignore
pd_s.explode(ignore_index=ignore_index).astype(pd.Float64Dtype()), # type: ignore
check_index_type=False,
Expand Down