From 7ff08da22d5b08520c57968a309463a8785d3a7d Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 11 Dec 2025 23:22:18 +0000 Subject: [PATCH 1/2] fix: Improve strictness of nan vs None usage --- bigframes/core/blocks.py | 2 +- bigframes/dataframe.py | 2 +- bigframes/series.py | 15 +++++++++------ bigframes/testing/utils.py | 19 ++++++++++++++++++- tests/system/small/test_dataframe.py | 15 +++++++++------ tests/system/small/test_series.py | 7 +++++-- tests/unit/test_dataframe_polars.py | 14 +++++++++----- tests/unit/test_series_polars.py | 14 +++++++------- 8 files changed, 59 insertions(+), 29 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 6f87e43821..df7c6dee43 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1295,7 +1295,7 @@ def aggregate_all_and_stack( as_array = ops.ToArrayOp().as_expr(*(col for col in self.value_columns)) reduced = ops.ArrayReduceOp(operation).as_expr(as_array) block, id = self.project_expr(reduced, None) - return block.select_column(id) + return block.select_column(id).with_column_labels(pd.Index([None])) def aggregate_size( self, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index b34c1cafd0..763e7c8662 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -5012,7 +5012,7 @@ def duplicated(self, subset=None, keep: str = "first") -> bigframes.series.Serie return bigframes.series.Series( block.select_column( indicator, - ) + ).with_column_labels(pandas.Index([None])), ) def rank( diff --git a/bigframes/series.py b/bigframes/series.py index 51d7cc76ee..e11c60a999 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -2653,9 +2653,10 @@ def _apply_unary_op( ) -> Series: """Applies a unary operator to the series.""" block, result_id = self._block.apply_unary_op( - self._value_column, op, result_label=self._name + self._value_column, + op, ) - return Series(block.select_column(result_id)) + return Series(block.select_column(result_id), name=self.name) # type: ignore def _apply_binary_op( self, @@ -2683,8 +2684,9 @@ def _apply_binary_op( expr = op.as_expr( other_col if reverse else self_col, self_col if reverse else other_col ) - block, result_id = block.project_expr(expr, name) - return Series(block.select_column(result_id)) + block, result_id = block.project_expr(expr) + block = block.select_column(result_id).with_column_labels([name]) + return Series(block) # type: ignore else: # Scalar binop name = self._name @@ -2692,8 +2694,9 @@ def _apply_binary_op( ex.const(other) if reverse else self._value_column, self._value_column if reverse else ex.const(other), ) - block, result_id = self._block.project_expr(expr, name) - return Series(block.select_column(result_id)) + block, result_id = self._block.project_expr(expr) + block = block.select_column(result_id).with_column_labels([name]) + return Series(block) # type: ignore def _apply_nary_op( self, diff --git a/bigframes/testing/utils.py b/bigframes/testing/utils.py index cf9c9fc031..1bc8f447c9 100644 --- a/bigframes/testing/utils.py +++ b/bigframes/testing/utils.py @@ -22,11 +22,13 @@ from google.cloud.functions_v2.types import functions import numpy as np import pandas as pd +import pandas.api.types as pd_types import pyarrow as pa # type: ignore import pytest from bigframes import operations as ops from bigframes.core import expression as ex +import bigframes.dtypes import bigframes.functions._utils as bff_utils import bigframes.pandas as bpd @@ -98,7 +100,12 @@ def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): def assert_series_equal( - left: pd.Series, right: pd.Series, ignore_order: bool = False, **kwargs + left: pd.Series, + right: pd.Series, + *, + ignore_order: bool = False, + nulls_are_nan: bool = True, + **kwargs, ): if ignore_order: if left.index.name is None: @@ -108,6 +115,16 @@ def assert_series_equal( left = left.sort_index() right = right.sort_index() + if nulls_are_nan: + if left.dtype == bigframes.dtypes.FLOAT_DTYPE: + left = left.astype("float64") + if right.dtype == bigframes.dtypes.FLOAT_DTYPE: + right = right.astype("float64") + if pd_types.is_object_dtype(left): + left = left.fillna(float("nan")) + if pd_types.is_object_dtype(right): + right = right.fillna(float("nan")) + pd.testing.assert_series_equal(left, right, **kwargs) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 19d3c67e19..48b7fd1888 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3547,7 +3547,8 @@ def test_dataframe_diff(scalars_df_index, scalars_pandas_df_index, periods): def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods): col_names = ["int64_too", "float64_col", "int64_col"] bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas() - pd_result = scalars_pandas_df_index[col_names].pct_change(periods=periods) + # pandas 3.0 does not automatically ffill anymore + pd_result = scalars_pandas_df_index[col_names].ffill().pct_change(periods=periods) pd.testing.assert_frame_equal( pd_result, bf_result, @@ -3657,8 +3658,12 @@ def test_df_transpose(): ) rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"]) - pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi) - bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi) + pd_df = pandas.DataFrame( + values, index=rows_multi, columns=columns_multi, dtype="Float64" + ) + bf_df = dataframe.DataFrame( + values, index=rows_multi, columns=columns_multi, dtype="Float64" + ) pd_result = pd_df.T bf_result = bf_df.T.to_pandas() @@ -4376,10 +4381,8 @@ def test_dataframe_aggregates_axis_1(scalars_df_index, scalars_pandas_df_index, bf_result = op(scalars_df_index[col_names]).to_pandas() pd_result = op(scalars_pandas_df_index[col_names]) - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") # Pandas has object index type - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + assert_series_equal(pd_result, bf_result, check_index_type=False, check_dtype=False) def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 6c681596f5..6dd3e8a6b1 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -801,6 +801,8 @@ def test_series_replace_dict(scalars_dfs, replacement_dict): ) def test_series_interpolate(method): pytest.importorskip("scipy") + if method == "pad" and pd.__version__.startswith("3."): + pytest.skip("pandas 3.0 dropped method='pad'") values = [None, 1, 2, None, None, 16, None] index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8] @@ -813,11 +815,12 @@ def test_series_interpolate(method): bf_result = bf_series.interpolate(method=method).to_pandas() # pd uses non-null types, while bf uses nullable types - pd.testing.assert_series_equal( + assert_series_equal( pd_result, bf_result, check_index_type=False, check_dtype=False, + nulls_are_nan=True, ) @@ -2730,7 +2733,7 @@ def test_diff(scalars_df_index, scalars_pandas_df_index, periods): def test_series_pct_change(scalars_df_index, scalars_pandas_df_index, periods): bf_result = scalars_df_index["int64_col"].pct_change(periods=periods).to_pandas() # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA - pd_result = scalars_pandas_df_index["int64_col"].pct_change(periods=periods) + pd_result = scalars_pandas_df_index["int64_col"].ffill().pct_change(periods=periods) pd.testing.assert_series_equal( bf_result, diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index b83380d789..b9c51ab1ec 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -2687,7 +2687,8 @@ def test_dataframe_diff(scalars_df_index, scalars_pandas_df_index, periods): def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods): col_names = ["int64_too", "float64_col", "int64_col"] bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas() - pd_result = scalars_pandas_df_index[col_names].pct_change(periods=periods) + # pandas 3.0 does not automatically ffill anymore + pd_result = scalars_pandas_df_index[col_names].ffill().pct_change(periods=periods) pd.testing.assert_frame_equal( pd_result, bf_result, @@ -2797,8 +2798,12 @@ def test_df_transpose(): ) rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"]) - pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi) - bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi) + pd_df = pandas.DataFrame( + values, index=rows_multi, columns=columns_multi, dtype="Float64" + ) + bf_df = dataframe.DataFrame( + values, index=rows_multi, columns=columns_multi, dtype="Float64" + ) pd_result = pd_df.T bf_result = bf_df.T.to_pandas() @@ -3386,9 +3391,8 @@ def test_dataframe_aggregates_axis_1(scalars_df_index, scalars_pandas_df_index, pd_result = op(scalars_pandas_df_index[col_names]) # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") # Pandas has object index type - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + assert_series_equal(pd_result, bf_result, check_index_type=False, check_dtype=False) @pytest.mark.parametrize( diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index cffeedea35..ece42fe3d4 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -798,6 +798,8 @@ def test_series_replace_dict(scalars_dfs, replacement_dict): ) def test_series_interpolate(method): pytest.importorskip("scipy") + if method == "pad" and pd.__version__.startswith("3."): + pytest.skip("pandas 3.0 dropped method='pad'") values = [None, 1, 2, None, None, 16, None] index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8] @@ -810,11 +812,12 @@ def test_series_interpolate(method): bf_result = bf_series.interpolate(method=method).to_pandas() # pd uses non-null types, while bf uses nullable types - pd.testing.assert_series_equal( + assert_series_equal( pd_result, bf_result, check_index_type=False, check_dtype=False, + nulls_are_nan=True, ) @@ -2739,12 +2742,9 @@ def test_diff(scalars_df_index, scalars_pandas_df_index, periods): def test_series_pct_change(scalars_df_index, scalars_pandas_df_index, periods): bf_result = scalars_df_index["int64_col"].pct_change(periods=periods).to_pandas() # cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA - pd_result = scalars_pandas_df_index["int64_col"].pct_change(periods=periods) + pd_result = scalars_pandas_df_index["int64_col"].ffill().pct_change(periods=periods) - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) + assert_series_equal(bf_result, pd_result, nulls_are_nan=True) @pytest.mark.skip( @@ -4696,7 +4696,7 @@ def wrapped(x): pd_result = pd_col.apply(wrapped) - assert_series_equal(bf_result, pd_result, check_dtype=False) + assert_series_equal(bf_result, pd_result, check_dtype=False, nulls_are_nan=True) @pytest.mark.parametrize( From cde4ff37852b71813e717c4cfc3404e955ed7bcb Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 12 Dec 2025 03:00:16 +0000 Subject: [PATCH 2/2] fix transpose test --- bigframes/testing/utils.py | 49 ++++--- .../large/functions/test_remote_function.py | 42 +++--- .../small/bigquery/test_vector_search.py | 4 +- .../small/functions/test_remote_function.py | 24 ++-- tests/system/small/ml/test_cluster.py | 4 +- tests/system/small/ml/test_core.py | 2 +- tests/system/small/ml/test_decomposition.py | 4 +- .../test_issue355_merge_after_filter.py | 4 +- tests/system/small/test_dataframe.py | 136 ++++++++---------- tests/system/small/test_dataframe_io.py | 8 +- tests/system/small/test_groupby.py | 6 +- tests/system/small/test_large_local_data.py | 10 +- tests/system/small/test_multiindex.py | 6 +- tests/system/small/test_pandas.py | 20 +-- tests/system/small/test_polars_execution.py | 6 +- tests/system/small/test_series.py | 14 +- tests/system/small/test_unordered.py | 16 +-- tests/unit/test_dataframe_polars.py | 114 ++++++--------- tests/unit/test_series_polars.py | 14 +- 19 files changed, 227 insertions(+), 256 deletions(-) diff --git a/bigframes/testing/utils.py b/bigframes/testing/utils.py index 1bc8f447c9..ae93c00464 100644 --- a/bigframes/testing/utils.py +++ b/bigframes/testing/utils.py @@ -73,7 +73,7 @@ def assert_dfs_equivalent(pd_df: pd.DataFrame, bf_df: bpd.DataFrame, **kwargs): bf_df_local = bf_df.to_pandas() ignore_order = not bf_df._session._strictly_ordered - assert_pandas_df_equal(bf_df_local, pd_df, ignore_order=ignore_order, **kwargs) + assert_frame_equal(bf_df_local, pd_df, ignore_order=ignore_order, **kwargs) def assert_series_equivalent(pd_series: pd.Series, bf_series: bpd.Series, **kwargs): @@ -82,21 +82,40 @@ def assert_series_equivalent(pd_series: pd.Series, bf_series: bpd.Series, **kwar assert_series_equal(bf_df_local, pd_series, ignore_order=ignore_order, **kwargs) -def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): +def _normalize_all_nulls(col: pd.Series) -> pd.Series: + if col.dtype == bigframes.dtypes.FLOAT_DTYPE: + col = col.astype("float64") + if pd_types.is_object_dtype(col): + col = col.fillna(float("nan")) + return col + + +def assert_frame_equal( + left: pd.DataFrame, + right: pd.DataFrame, + *, + ignore_order: bool = False, + nulls_are_nan: bool = True, + **kwargs, +): if ignore_order: # Sort by a column to get consistent results. - if df0.index.name != "rowindex": - df0 = df0.sort_values( - list(df0.columns.drop("geography_col", errors="ignore")) + if left.index.name != "rowindex": + left = left.sort_values( + list(left.columns.drop("geography_col", errors="ignore")) ).reset_index(drop=True) - df1 = df1.sort_values( - list(df1.columns.drop("geography_col", errors="ignore")) + right = right.sort_values( + list(right.columns.drop("geography_col", errors="ignore")) ).reset_index(drop=True) else: - df0 = df0.sort_index() - df1 = df1.sort_index() + left = left.sort_index() + right = right.sort_index() + + if nulls_are_nan: + left = left.apply(_normalize_all_nulls) + right = right.apply(_normalize_all_nulls) - pd.testing.assert_frame_equal(df0, df1, **kwargs) + pd.testing.assert_frame_equal(left, right, **kwargs) def assert_series_equal( @@ -116,14 +135,8 @@ def assert_series_equal( right = right.sort_index() if nulls_are_nan: - if left.dtype == bigframes.dtypes.FLOAT_DTYPE: - left = left.astype("float64") - if right.dtype == bigframes.dtypes.FLOAT_DTYPE: - right = right.astype("float64") - if pd_types.is_object_dtype(left): - left = left.fillna(float("nan")) - if pd_types.is_object_dtype(right): - right = right.fillna(float("nan")) + left = _normalize_all_nulls(left) + right = _normalize_all_nulls(right) pd.testing.assert_series_equal(left, right, **kwargs) diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index dae51c5b49..253bc7b617 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -36,7 +36,7 @@ import bigframes.pandas as bpd import bigframes.series from bigframes.testing.utils import ( - assert_pandas_df_equal, + assert_frame_equal, cleanup_function_assets, delete_cloud_function, get_cloud_functions, @@ -214,7 +214,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets(square, session.bqclient, session.cloudfunctionsclient) @@ -261,7 +261,7 @@ def add_one(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -349,7 +349,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets(square, session.bqclient, session.cloudfunctionsclient) @@ -403,7 +403,7 @@ def sign(num): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -453,7 +453,7 @@ def circumference(radius): pd_result_col = pd_result_col.astype(pandas.Float64Dtype()) pd_result = pd_float64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -503,7 +503,7 @@ def find_team(num): pd_result_col = pd_result_col.astype(pandas.StringDtype(storage="pyarrow")) pd_result = pd_float64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -591,7 +591,7 @@ def inner_test(): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) # Test that the remote function works as expected inner_test() @@ -683,7 +683,7 @@ def is_odd(num): pd_result_col = pd_int64_col.mask(is_odd) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -727,7 +727,7 @@ def is_odd(num): pd_result_col = pd_int64_col[pd_int64_col.notnull()].mask(is_odd, -1) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -770,7 +770,7 @@ def test_remote_udf_lambda(session, scalars_dfs, dataset_id, bq_cf_connection): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -829,7 +829,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -884,7 +884,7 @@ def pd_np_foo(x) -> None: # comparing for the purpose of this test pd_result.result = pd_result.result.astype(pandas.Float64Dtype()) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -928,7 +928,7 @@ def test_internal(rf, udf): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) # Create an explicit name for the remote function prefixer = test_utils.prefixer.Prefixer("foo", "") @@ -1109,7 +1109,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets(square, session.bqclient, session.cloudfunctionsclient) @@ -1150,7 +1150,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets(square, session.bqclient, session.cloudfunctionsclient) @@ -1225,7 +1225,7 @@ def square(x): pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) finally: # clean up the gcp assets created for the remote function cleanup_function_assets(square, session.bqclient, session.cloudfunctionsclient) @@ -1283,7 +1283,7 @@ def square_num(x): pd_result_col = pd_int64_col.apply(lambda x: x if x is None else x * x) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -1357,7 +1357,7 @@ def square_num(x): pd_result_col = pd_int64_col.apply(lambda x: x if x is None else x * x) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( @@ -1416,7 +1416,7 @@ def square_num(x): pd_result_col = df["num"].apply(lambda x: x if x is None else x * x) pd_result = df.assign(result=pd_result_col) - assert_pandas_df_equal( + assert_frame_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) @@ -1504,7 +1504,7 @@ def square_num(x): pd_result_col = pd_int64_col.apply(square_num) pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) finally: # clean up the gcp assets created for the remote function cleanup_function_assets( diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 3107795730..ff320731e2 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -23,7 +23,7 @@ import bigframes.bigquery as bbq import bigframes.pandas as bpd -from bigframes.testing.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_frame_equal # Need at least 5,000 rows to create a vector index. VECTOR_DF = pd.DataFrame( @@ -154,7 +154,7 @@ def test_vector_search_basic_params_with_df(): }, index=pd.Index([1, 0, 0, 1], dtype="Int64"), ) - assert_pandas_df_equal( + assert_frame_equal( expected.sort_values("id"), vector_search_result.sort_values("id"), check_dtype=False, diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 805505ecd5..1ee60dafd6 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -34,7 +34,7 @@ from bigframes.functions import _utils as bff_utils from bigframes.functions import function as bff import bigframes.session._io.bigquery -from bigframes.testing.utils import assert_pandas_df_equal, get_function_name +from bigframes.testing.utils import assert_frame_equal, get_function_name _prefixer = test_utils.prefixer.Prefixer("bigframes", "") @@ -159,7 +159,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -208,7 +208,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -300,7 +300,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -388,7 +388,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -437,7 +437,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -482,7 +482,7 @@ def square(x): pd_result_col = pd_result_col.astype(pd.Int64Dtype()) pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -517,7 +517,7 @@ def add_one(x): for col in pd_result: pd_result[col] = pd_result[col].astype(pd_int64_df_filtered[col].dtype) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -552,7 +552,7 @@ def add_one(x): for col in pd_result: pd_result[col] = pd_result[col].astype(pd_int64_df_filtered[col].dtype) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -585,7 +585,7 @@ def add_one(x): for col in pd_result: pd_result[col] = pd_result[col].astype(pd_int64_df[col].dtype) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.flaky(retries=2, delay=120) @@ -738,7 +738,7 @@ def square1(x): s2_result_col = int64_col_filtered.apply(square2) s2_result = int64_col_filtered.to_frame().assign(result=s2_result_col) - assert_pandas_df_equal(s1_result.to_pandas(), s2_result.to_pandas()) + assert_frame_equal(s1_result.to_pandas(), s2_result.to_pandas()) def test_read_gbq_function_runs_existing_udf(session): @@ -937,7 +937,7 @@ def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): indirect_df = indirect_df.assign(y=indirect_df.x.apply(square)) converted_indirect_df = indirect_df.to_pandas() - assert_pandas_df_equal( + assert_frame_equal( direct_df, converted_indirect_df, ignore_order=True, check_index_type=False ) diff --git a/tests/system/small/ml/test_cluster.py b/tests/system/small/ml/test_cluster.py index 4840329cda..2a5e979b30 100644 --- a/tests/system/small/ml/test_cluster.py +++ b/tests/system/small/ml/test_cluster.py @@ -16,7 +16,7 @@ from bigframes.ml import cluster import bigframes.pandas as bpd -from bigframes.testing.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_frame_equal _PD_NEW_PENGUINS = pd.DataFrame.from_dict( { @@ -71,7 +71,7 @@ def test_kmeans_predict(session, penguins_kmeans_model: cluster.KMeans): dtype="Int64", index=pd.Index(["test1", "test2", "test3", "test4"], dtype="string[pyarrow]"), ) - assert_pandas_df_equal(result, expected, ignore_order=True) + assert_frame_equal(result, expected, ignore_order=True) def test_kmeans_detect_anomalies( diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index ef62e5ddd3..9add4a4a53 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -233,7 +233,7 @@ def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlMo "cumulative_explained_variance_ratio": [0.469357, 0.651283, 0.812383], }, ) - utils.assert_pandas_df_equal( + utils.assert_frame_equal( result, expected, check_exact=False, diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index 10255003a1..297ee49739 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -180,7 +180,7 @@ def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA): "explained_variance": [3.278657, 1.270829, 1.125354], }, ) - bigframes.testing.utils.assert_pandas_df_equal( + bigframes.testing.utils.assert_frame_equal( result, expected, check_exact=False, @@ -200,7 +200,7 @@ def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA): "explained_variance_ratio": [0.469357, 0.181926, 0.1611], }, ) - bigframes.testing.utils.assert_pandas_df_equal( + bigframes.testing.utils.assert_frame_equal( result, expected, check_exact=False, diff --git a/tests/system/small/regression/test_issue355_merge_after_filter.py b/tests/system/small/regression/test_issue355_merge_after_filter.py index 1c3b6e4fe3..d3486810f7 100644 --- a/tests/system/small/regression/test_issue355_merge_after_filter.py +++ b/tests/system/small/regression/test_issue355_merge_after_filter.py @@ -15,7 +15,7 @@ import pandas as pd import pytest -from bigframes.testing.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_frame_equal @pytest.mark.parametrize( @@ -67,4 +67,4 @@ def test_merge_after_filter(baseball_schedules_df, merge_how): sort=True, ) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 48b7fd1888..cb6d742c88 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -35,7 +35,7 @@ import bigframes.series as series from bigframes.testing.utils import ( assert_dfs_equivalent, - assert_pandas_df_equal, + assert_frame_equal, assert_series_equal, assert_series_equivalent, ) @@ -263,7 +263,7 @@ def test_get_rows_with_slice(scalars_dfs, row_slice): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df[row_slice].to_pandas() pd_result = scalars_pandas_df[row_slice] - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_hasattr(scalars_dfs): @@ -290,7 +290,7 @@ def test_head_with_custom_column_labels( bf_df = scalars_df_index.rename(columns=rename_mapping).head(3) bf_result = bf_df.to_pandas(ordered=ordered) pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).head(3) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + assert_frame_equal(bf_result, pd_result, ignore_order=not ordered) def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): @@ -635,7 +635,7 @@ def test_drop_with_custom_column_labels(scalars_dfs): pd_result = scalars_pandas_df.rename(columns=rename_mapping).drop( columns=dropped_columns ) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_df_memory_usage(scalars_dfs): @@ -1024,7 +1024,7 @@ def test_take_df(scalars_dfs, indices, axis): bf_result = scalars_df.take(indices, axis=axis).to_pandas() pd_result = scalars_pandas_df.take(indices, axis=axis) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_filter_df(scalars_dfs): @@ -1036,7 +1036,7 @@ def test_filter_df(scalars_dfs): pd_bool_series = scalars_pandas_df["bool_col"] pd_result = scalars_pandas_df[pd_bool_series] - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_read_gbq_direct_to_batches_row_count(unordered_session): @@ -1057,7 +1057,7 @@ def test_df_to_pandas_batches(scalars_dfs): assert 6 == capped_unfiltered_batches.total_rows assert len(pd_result) == filtered_batches.total_rows - assert_pandas_df_equal(pd.concat(filtered_batches), pd_result) + assert_frame_equal(pd.concat(filtered_batches), pd_result) @pytest.mark.parametrize( @@ -1108,7 +1108,7 @@ def test_assign_new_column_w_literal(scalars_dfs, literal, expected_dtype): pd_result = scalars_pandas_df.assign(new_col=new_col_pd) pd_result["new_col"] = pd_result["new_col"].astype(expected_dtype) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_assign_new_column_w_loc(scalars_dfs): @@ -1312,7 +1312,7 @@ def test_assign_existing_column(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_assign_listlike_to_empty_df(session): @@ -1324,7 +1324,7 @@ def test_assign_listlike_to_empty_df(session): pd_result["new_col"] = pd_result["new_col"].astype("Int64") pd_result.index = pd_result.index.astype("Int64") - assert_pandas_df_equal(bf_result.to_pandas(), pd_result) + assert_frame_equal(bf_result.to_pandas(), pd_result) def test_assign_to_empty_df_multiindex_error(session): @@ -1358,7 +1358,7 @@ def test_assign_series(scalars_dfs, ordered): bf_result = df.to_pandas(ordered=ordered) pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name]) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + assert_frame_equal(bf_result, pd_result, ignore_order=not ordered) def test_assign_series_overwrite(scalars_dfs): @@ -1370,7 +1370,7 @@ def test_assign_series_overwrite(scalars_dfs): **{column_name: scalars_pandas_df[column_name] + 3} ) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_assign_sequential(scalars_dfs): @@ -1385,7 +1385,7 @@ def test_assign_sequential(scalars_dfs): pd_result["new_col"] = pd_result["new_col"].astype("Int64") pd_result["new_col2"] = pd_result["new_col2"].astype("Int64") - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) # Require an index so that the self-join is consistent each time. @@ -1419,7 +1419,7 @@ def test_assign_different_df( new_col=scalars_pandas_df_index[column_name] ) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_assign_different_df_w_loc( @@ -1470,7 +1470,7 @@ def test_assign_callable_lambda(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["new_col"] = pd_result["new_col"].astype("Int64") - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -1867,9 +1867,7 @@ def test_df_merge(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) + assert_frame_equal(bf_result, pd_result, ignore_order=True, check_index_type=False) @pytest.mark.parametrize( @@ -1902,9 +1900,7 @@ def test_df_merge_multi_key(scalars_dfs, left_on, right_on): sort=True, ) - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) + assert_frame_equal(bf_result, pd_result, ignore_order=True, check_index_type=False) @pytest.mark.parametrize( @@ -1934,9 +1930,7 @@ def test_merge_custom_col_name(scalars_dfs, merge_how): pandas_right_df = scalars_pandas_df[right_columns] pd_result = pandas_left_df.merge(pandas_right_df, merge_how, on, sort=True) - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) + assert_frame_equal(bf_result, pd_result, ignore_order=True, check_index_type=False) @pytest.mark.parametrize( @@ -1969,9 +1963,7 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) + assert_frame_equal(bf_result, pd_result, ignore_order=True, check_index_type=False) def test_self_merge_self_w_on_args(): @@ -2013,7 +2005,7 @@ def test_dataframe_round(scalars_dfs, decimals): bf_result = scalars_df.round(decimals).to_pandas() pd_result = scalars_pandas_df.round(decimals) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_get_dtypes(scalars_df_default_index): @@ -2495,7 +2487,7 @@ def test_df_pos(scalars_dfs): bf_result = (+scalars_df[["int64_col", "numeric_col"]]).to_pandas() pd_result = +scalars_pandas_df[["int64_col", "numeric_col"]] - assert_pandas_df_equal(pd_result, bf_result) + assert_frame_equal(pd_result, bf_result) def test_df_neg(scalars_dfs): @@ -2503,7 +2495,7 @@ def test_df_neg(scalars_dfs): bf_result = (-scalars_df[["int64_col", "numeric_col"]]).to_pandas() pd_result = -scalars_pandas_df[["int64_col", "numeric_col"]] - assert_pandas_df_equal(pd_result, bf_result) + assert_frame_equal(pd_result, bf_result) def test_df__abs__(scalars_dfs): @@ -2513,7 +2505,7 @@ def test_df__abs__(scalars_dfs): ).to_pandas() pd_result = abs(scalars_pandas_df[["int64_col", "numeric_col", "float64_col"]]) - assert_pandas_df_equal(pd_result, bf_result) + assert_frame_equal(pd_result, bf_result) def test_df_invert(scalars_dfs): @@ -2523,7 +2515,7 @@ def test_df_invert(scalars_dfs): bf_result = (~scalars_df[columns]).to_pandas() pd_result = ~scalars_pandas_df[columns] - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_df_isnull(scalars_dfs): @@ -2540,7 +2532,7 @@ def test_df_isnull(scalars_dfs): pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_df_notnull(scalars_dfs): @@ -2557,7 +2549,7 @@ def test_df_notnull(scalars_dfs): pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -2930,7 +2922,7 @@ def test_scalar_binop(scalars_dfs, op, other_scalar, reverse_operands): bf_result = maybe_reversed_op(scalars_df[columns], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df[columns], other_scalar) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_dataframe_string_radd_const(scalars_dfs): @@ -2946,7 +2938,7 @@ def test_dataframe_string_radd_const(scalars_dfs): bf_result = ("prefix" + scalars_df[columns]).to_pandas() pd_result = "prefix" + scalars_pandas_df[columns] - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize(("other_scalar"), [1, -2]) @@ -2958,7 +2950,7 @@ def test_mod(scalars_dfs, other_scalar): bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).to_pandas() pd_result = scalars_pandas_df[["int64_col", "int64_too"]] % other_scalar - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_scalar_binop_str_exception(scalars_dfs): @@ -3014,7 +3006,7 @@ def test_series_binop_axis_index( bf_result = op(scalars_df[df_columns], scalars_df[series_column]).to_pandas() pd_result = op(scalars_pandas_df[df_columns], scalars_pandas_df[series_column]) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -3042,7 +3034,7 @@ def test_listlike_binop_axis_1_in_memory_data(scalars_dfs, input): input = input.to_pandas() pd_result = scalars_pandas_df[df_columns].add(input, axis=1) - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_df_reverse_binop_pandas(scalars_dfs): @@ -3057,7 +3049,7 @@ def test_df_reverse_binop_pandas(scalars_dfs): bf_result = pd_series + scalars_df[df_columns].to_pandas() pd_result = pd_series + scalars_pandas_df[df_columns] - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_listlike_binop_axis_1_bf_index(scalars_dfs): @@ -3072,7 +3064,7 @@ def test_listlike_binop_axis_1_bf_index(scalars_dfs): ) pd_result = scalars_pandas_df[df_columns].add(pd.Index([1000, 2000, 3000]), axis=1) - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_binop_with_self_aggregate(scalars_dfs_maybe_ordered): @@ -3092,7 +3084,7 @@ def test_binop_with_self_aggregate(scalars_dfs_maybe_ordered): executions = execution_count_after - execution_count_before assert executions == 1 - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_binop_with_self_aggregate_w_index_reset(scalars_dfs_maybe_ordered): @@ -3113,9 +3105,7 @@ def test_binop_with_self_aggregate_w_index_reset(scalars_dfs_maybe_ordered): assert executions == 1 pd_result.index = pd_result.index.astype("Int64") - assert_pandas_df_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) + assert_frame_equal(bf_result, pd_result, check_dtype=False, check_index_type=False) @pytest.mark.parametrize( @@ -3183,7 +3173,7 @@ def test_series_binop_add_different_table( scalars_pandas_df_index[series_column], axis="index" ) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + assert_frame_equal(bf_result, pd_result, ignore_order=not ordered) # TODO(garrettwu): Test series binop with different index @@ -3216,7 +3206,7 @@ def test_join_same_table(scalars_dfs_maybe_ordered, how): pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) def test_join_incompatible_key_type_error(scalars_dfs): @@ -3244,7 +3234,7 @@ def test_join_different_table( pd_df_a = scalars_pandas_df_index[["string_col", "int64_col"]] pd_df_b = scalars_pandas_df_index.dropna()[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) @all_joins @@ -3363,7 +3353,7 @@ def test_join_param_on(scalars_dfs, how): pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) pd_df_b = pd_df[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) @all_joins @@ -3384,7 +3374,7 @@ def test_df_join_series(scalars_dfs, how): pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) pd_series_b = pd_df["float64_col"] pd_result = pd_df_a.join(pd_series_b, on="rowindex_2", how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( @@ -3658,17 +3648,13 @@ def test_df_transpose(): ) rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"]) - pd_df = pandas.DataFrame( - values, index=rows_multi, columns=columns_multi, dtype="Float64" - ) - bf_df = dataframe.DataFrame( - values, index=rows_multi, columns=columns_multi, dtype="Float64" - ) + pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi) + bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi) pd_result = pd_df.T bf_result = bf_df.T.to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + assert_frame_equal(pd_result, bf_result, check_dtype=False, nulls_are_nan=True) # type: ignore def test_df_transpose_error(): @@ -4016,7 +4002,7 @@ def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index, ordered): bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas(ordered=ordered) pd_result = scalars_pandas_df_index.iloc[1:].iloc[1:] - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + assert_frame_equal(bf_result, pd_result, ignore_order=not ordered) @pytest.mark.parametrize( @@ -4680,7 +4666,7 @@ def test_df_rows_filter_items(scalars_df_index, scalars_pandas_df_index): # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) # Ignore ordering as pandas order differently depending on version - assert_pandas_df_equal( + assert_frame_equal( bf_result, pd_result, ignore_order=True, @@ -4952,7 +4938,7 @@ def test_df_setattr_index(): pd_df.index = pandas.Index([4, 5]) bf_df.index = [4, 5] - assert_pandas_df_equal( + assert_frame_equal( pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False ) @@ -4967,7 +4953,7 @@ def test_df_setattr_columns(): bf_df.columns = pandas.Index([4, 5, 6]) - assert_pandas_df_equal( + assert_frame_equal( pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False ) @@ -4980,7 +4966,7 @@ def test_df_setattr_modify_column(): pd_df.my_column = [4, 5] bf_df.my_column = [4, 5] - assert_pandas_df_equal( + assert_frame_equal( pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False ) @@ -5262,9 +5248,7 @@ def test_df_from_dict_columns_orient(): data = {"a": [1, 2], "b": [3.3, 2.4]} bf_result = dataframe.DataFrame.from_dict(data, orient="columns").to_pandas() pd_result = pd.DataFrame.from_dict(data, orient="columns") - assert_pandas_df_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) + assert_frame_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) def test_df_from_dict_index_orient(): @@ -5273,9 +5257,7 @@ def test_df_from_dict_index_orient(): data, orient="index", columns=["col1", "col2"] ).to_pandas() pd_result = pd.DataFrame.from_dict(data, orient="index", columns=["col1", "col2"]) - assert_pandas_df_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) + assert_frame_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) def test_df_from_dict_tight_orient(): @@ -5289,9 +5271,7 @@ def test_df_from_dict_tight_orient(): bf_result = dataframe.DataFrame.from_dict(data, orient="tight").to_pandas() pd_result = pd.DataFrame.from_dict(data, orient="tight") - assert_pandas_df_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) + assert_frame_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) def test_df_from_records(): @@ -5301,9 +5281,7 @@ def test_df_from_records(): records, columns=["c1", "c2"] ).to_pandas() pd_result = pd.DataFrame.from_records(records, columns=["c1", "c2"]) - assert_pandas_df_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) + assert_frame_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) def test_df_to_dict(scalars_df_index, scalars_pandas_df_index): @@ -5650,7 +5628,7 @@ def test_assign_after_binop_row_joins(): bf_df["metric_diff"] = bf_df.metric1 - bf_df.metric2 pd_df["metric_diff"] = pd_df.metric1 - pd_df.metric2 - assert_pandas_df_equal(bf_df.to_pandas(), pd_df) + assert_frame_equal(bf_df.to_pandas(), pd_df) def test_df_cache_with_implicit_join(scalars_df_index): @@ -5819,7 +5797,7 @@ def test_query_complexity_repeated_joins( bf_result = bf_df.to_pandas() pd_result = pd_df - assert_pandas_df_equal(bf_result, pd_result, check_index_type=False) + assert_frame_equal(bf_result, pd_result, check_index_type=False) def test_query_complexity_repeated_subtrees( @@ -5833,7 +5811,7 @@ def test_query_complexity_repeated_subtrees( bf_df = bpd.concat(10 * [bf_df]).head(5) bf_result = bf_df.to_pandas() pd_result = pd_df - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.skipif( @@ -5850,7 +5828,7 @@ def test_query_complexity_repeated_analytic(scalars_df_index, scalars_pandas_df_ pd_df = pd_df.diff() bf_result = bf_df.to_pandas() pd_result = pd_df - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created): @@ -6051,7 +6029,7 @@ def test_resample_with_index( .resample(rule=rule, level=level, closed=closed, origin=origin, label=label) .min() ) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 4d4a144d0a..02acb8d8f2 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -1126,7 +1126,7 @@ def test_to_sql_query_unnamed_index_included( ) roundtrip = session.read_gbq(sql, index_col=idx_ids) roundtrip.index.names = [None] - utils.assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False) + utils.assert_frame_equal(roundtrip.to_pandas(), pd_df, check_index_type=False) def test_to_sql_query_named_index_included( @@ -1147,7 +1147,7 @@ def test_to_sql_query_named_index_included( columns="duration_col" ) roundtrip = session.read_gbq(sql, index_col=idx_ids) - utils.assert_pandas_df_equal(roundtrip.to_pandas(), pd_df) + utils.assert_frame_equal(roundtrip.to_pandas(), pd_df) def test_to_sql_query_unnamed_index_excluded( @@ -1164,7 +1164,7 @@ def test_to_sql_query_unnamed_index_excluded( columns="duration_col" ) roundtrip = session.read_gbq(sql) - utils.assert_pandas_df_equal( + utils.assert_frame_equal( roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True ) @@ -1187,7 +1187,7 @@ def test_to_sql_query_named_index_excluded( .drop(columns="duration_col") ) roundtrip = session.read_gbq(sql) - utils.assert_pandas_df_equal( + utils.assert_frame_equal( roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True ) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 2e09ffd1a6..579e7cd414 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -17,7 +17,7 @@ import pytest import bigframes.pandas as bpd -from bigframes.testing.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_frame_equal # ================= # DataFrame.groupby @@ -205,7 +205,7 @@ def test_dataframe_groupby_agg_string( pd_result = scalars_pandas_df_index[col_names].groupby("string_col").agg("count") bf_result_computed = bf_result.to_pandas(ordered=ordered) - assert_pandas_df_equal( + assert_frame_equal( pd_result, bf_result_computed, check_dtype=False, ignore_order=not ordered ) @@ -509,7 +509,7 @@ def test_dataframe_groupby_diff(scalars_df_index, scalars_pandas_df_index, order pd_result = scalars_pandas_df_index[col_names].groupby("string_col").diff(-1) bf_result_computed = bf_result.to_pandas(ordered=ordered) - assert_pandas_df_equal( + assert_frame_equal( pd_result, bf_result_computed, check_dtype=False, ignore_order=not ordered ) diff --git a/tests/system/small/test_large_local_data.py b/tests/system/small/test_large_local_data.py index 0c03a8b6a3..39885ea853 100644 --- a/tests/system/small/test_large_local_data.py +++ b/tests/system/small/test_large_local_data.py @@ -17,7 +17,7 @@ import pytest import bigframes -from bigframes.testing.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_frame_equal large_dataframe = pd.DataFrame(np.random.rand(10000, 10), dtype="Float64") large_dataframe.index = large_dataframe.index.astype("Int64") @@ -27,7 +27,7 @@ def test_read_pandas_defer_noop(session: bigframes.Session): pytest.importorskip("pandas", minversion="2.0.0") bf_df = session.read_pandas(large_dataframe, write_engine="_deferred") - assert_pandas_df_equal(large_dataframe, bf_df.to_pandas()) + assert_frame_equal(large_dataframe, bf_df.to_pandas()) def test_read_pandas_defer_cumsum(session: bigframes.Session): @@ -35,7 +35,7 @@ def test_read_pandas_defer_cumsum(session: bigframes.Session): bf_df = session.read_pandas(large_dataframe, write_engine="_deferred") bf_df = bf_df.cumsum() - assert_pandas_df_equal(large_dataframe.cumsum(), bf_df.to_pandas()) + assert_frame_equal(large_dataframe.cumsum(), bf_df.to_pandas()) def test_read_pandas_defer_cache_cumsum_cumsum(session: bigframes.Session): @@ -43,7 +43,7 @@ def test_read_pandas_defer_cache_cumsum_cumsum(session: bigframes.Session): bf_df = session.read_pandas(large_dataframe, write_engine="_deferred") bf_df = bf_df.cumsum().cache().cumsum() - assert_pandas_df_equal(large_dataframe.cumsum().cumsum(), bf_df.to_pandas()) + assert_frame_equal(large_dataframe.cumsum().cumsum(), bf_df.to_pandas()) def test_read_pandas_defer_peek(session: bigframes.Session): @@ -52,4 +52,4 @@ def test_read_pandas_defer_peek(session: bigframes.Session): bf_result = bf_df.peek(15) assert len(bf_result) == 15 - assert_pandas_df_equal(large_dataframe.loc[bf_result.index], bf_result) + assert_frame_equal(large_dataframe.loc[bf_result.index], bf_result) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 4233ed7aae..a28e02a54f 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -17,7 +17,7 @@ import pytest import bigframes.pandas as bpd -from bigframes.testing.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_frame_equal # Sample MultiIndex for testing DataFrames where() method. _MULTI_INDEX = pandas.MultiIndex.from_tuples( @@ -583,7 +583,7 @@ def test_multi_index_dataframe_join(scalars_dfs, how): (["bool_col", "rowindex_2"]) )[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) @all_joins @@ -604,7 +604,7 @@ def test_multi_index_dataframe_join_on(scalars_dfs, how): pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) pd_df_b = pd_df[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) def test_multi_index_dataframe_where_series_cond_none_other( diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index e3c5ace8a9..a1c0dc9851 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -21,7 +21,7 @@ import pytz import bigframes.pandas as bpd -from bigframes.testing.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_frame_equal @pytest.mark.parametrize( @@ -37,7 +37,7 @@ def test_concat_dataframe(scalars_dfs, ordered): bf_result = bf_result.to_pandas(ordered=ordered) pd_result = pd.concat(11 * [scalars_pandas_df]) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + assert_frame_equal(bf_result, pd_result, ignore_order=not ordered) def test_concat_dataframe_w_struct_cols(nested_structs_df, nested_structs_pandas_df): @@ -306,7 +306,7 @@ def test_merge(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( @@ -340,7 +340,7 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) def test_merge_cross(scalars_dfs): @@ -395,7 +395,7 @@ def test_merge_series(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) def test_merge_w_common_columns(scalars_dfs): @@ -413,7 +413,7 @@ def test_merge_w_common_columns(scalars_dfs): "inner", sort=True, ) - assert_pandas_df_equal(df.to_pandas(), pd_result, ignore_order=True) + assert_frame_equal(df.to_pandas(), pd_result, ignore_order=True) def test_merge_raises_error_when_no_common_columns(scalars_dfs): @@ -460,7 +460,7 @@ def test_crosstab_aligned_series(scalars_dfs): scalars_df["int64_col"], scalars_df["int64_too"] ).to_pandas() - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_crosstab_nondefault_func(scalars_dfs): @@ -479,7 +479,7 @@ def test_crosstab_nondefault_func(scalars_dfs): aggfunc="mean", ).to_pandas() - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_crosstab_multi_cols(scalars_dfs): @@ -498,7 +498,7 @@ def test_crosstab_multi_cols(scalars_dfs): colnames=["c", "d"], ).to_pandas() - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_crosstab_unaligned_series(scalars_dfs, session): @@ -513,7 +513,7 @@ def test_crosstab_unaligned_series(scalars_dfs, session): pd_result = pd.crosstab(scalars_pandas_df["int64_col"], other_pd_series) bf_result = bpd.crosstab(scalars_df["int64_col"], other_bf_series).to_pandas() - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) def _convert_pandas_category(pd_s: pd.Series): diff --git a/tests/system/small/test_polars_execution.py b/tests/system/small/test_polars_execution.py index 46eb59260b..1b58dc9d12 100644 --- a/tests/system/small/test_polars_execution.py +++ b/tests/system/small/test_polars_execution.py @@ -17,7 +17,7 @@ import bigframes import bigframes.bigquery -from bigframes.testing.utils import assert_pandas_df_equal +from bigframes.testing.utils import assert_frame_equal polars = pytest.importorskip("polars") @@ -40,7 +40,7 @@ def test_polar_execution_sorted(session_w_polars, scalars_pandas_df_index): bf_result = bf_df.sort_index(ascending=False)[["int64_too", "bool_col"]].to_pandas() assert session_w_polars._metrics.execution_count == execution_count_before - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_polar_execution_sorted_filtered(session_w_polars, scalars_pandas_df_index): @@ -57,7 +57,7 @@ def test_polar_execution_sorted_filtered(session_w_polars, scalars_pandas_df_ind ) assert session_w_polars._metrics.execution_count == execution_count_before - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_polar_execution_unsupported_sql_fallback( diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 6dd3e8a6b1..a95c9623e5 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -33,7 +33,7 @@ import bigframes.pandas import bigframes.series as series from bigframes.testing.utils import ( - assert_pandas_df_equal, + assert_frame_equal, assert_series_equal, get_first_file_from_wildcard, ) @@ -1774,7 +1774,7 @@ def test_take(scalars_dfs, indices): bf_result = scalars_df.take(indices).to_pandas() pd_result = scalars_pandas_df.take(indices) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_nested_filter(scalars_dfs): @@ -3423,7 +3423,7 @@ def test_to_frame(scalars_dfs): bf_result = scalars_df["int64_col"].to_frame().to_pandas() pd_result = scalars_pandas_df["int64_col"].to_frame() - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_to_frame_no_name(scalars_dfs): @@ -3432,7 +3432,7 @@ def test_to_frame_no_name(scalars_dfs): bf_result = scalars_df["int64_col"].rename(None).to_frame().to_pandas() pd_result = scalars_pandas_df["int64_col"].rename(None).to_frame() - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index): @@ -3676,7 +3676,7 @@ def test_mask_default_value(scalars_dfs): pd_col_masked = pd_col.mask(pd_col % 2 == 1) pd_result = pd_col.to_frame().assign(int64_col_masked=pd_col_masked) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_mask_custom_value(scalars_dfs): @@ -3694,7 +3694,7 @@ def test_mask_custom_value(scalars_dfs): # odd so should be left as is, but it is being masked in pandas. # Accidentally the bigframes bahavior matches, but it should be updated # after the resolution of https://github.com/pandas-dev/pandas/issues/52955 - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_mask_with_callable(scalars_df_index, scalars_pandas_df_index): @@ -4145,7 +4145,7 @@ def test_loc_bool_series_default_index( scalars_pandas_df_default_index.bool_col ] - assert_pandas_df_equal( + assert_frame_equal( bf_result.to_frame(), pd_result.to_frame(), ) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 07fdb215df..c7ff0ca1dd 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -19,7 +19,7 @@ import bigframes.exceptions import bigframes.pandas as bpd -from bigframes.testing.utils import assert_pandas_df_equal, assert_series_equal +from bigframes.testing.utils import assert_frame_equal, assert_series_equal def test_unordered_mode_sql_no_hash(unordered_session): @@ -48,7 +48,7 @@ def test_unordered_mode_cache_aggregate(unordered_session): bf_result = mean_diff.to_pandas(ordered=False) pd_result = pd_df - pd_df.mean() - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) # type: ignore def test_unordered_mode_series_peek(unordered_session): @@ -103,7 +103,7 @@ def test_unordered_mode_read_gbq(unordered_session): } ) # Don't need ignore_order as there is only 1 row - assert_pandas_df_equal(df.to_pandas(), expected, check_index_type=False) + assert_frame_equal(df.to_pandas(), expected, check_index_type=False) @pytest.mark.parametrize( @@ -124,7 +124,7 @@ def test_unordered_drop_duplicates(unordered_session, keep): bf_result = bf_df.drop_duplicates(keep=keep) pd_result = pd_df.drop_duplicates(keep=keep) - assert_pandas_df_equal(bf_result.to_pandas(), pd_result, ignore_order=True) + assert_frame_equal(bf_result.to_pandas(), pd_result, ignore_order=True) def test_unordered_reset_index(unordered_session): @@ -134,7 +134,7 @@ def test_unordered_reset_index(unordered_session): bf_result = bf_df.set_index("b").reset_index(drop=False) pd_result = pd_df.set_index("b").reset_index(drop=False) - assert_pandas_df_equal(bf_result.to_pandas(), pd_result) + assert_frame_equal(bf_result.to_pandas(), pd_result) def test_unordered_merge(unordered_session): @@ -146,7 +146,7 @@ def test_unordered_merge(unordered_session): bf_result = bf_df.merge(bf_df, left_on="a", right_on="c") pd_result = pd_df.merge(pd_df, left_on="a", right_on="c") - assert_pandas_df_equal(bf_result.to_pandas(), pd_result, ignore_order=True) + assert_frame_equal(bf_result.to_pandas(), pd_result, ignore_order=True) def test_unordered_drop_duplicates_ambiguous(unordered_session): @@ -167,7 +167,7 @@ def test_unordered_drop_duplicates_ambiguous(unordered_session): .drop_duplicates() ) - assert_pandas_df_equal(bf_result.to_pandas(), pd_result, ignore_order=True) + assert_frame_equal(bf_result.to_pandas(), pd_result, ignore_order=True) def test_unordered_mode_cache_preserves_order(unordered_session): @@ -181,7 +181,7 @@ def test_unordered_mode_cache_preserves_order(unordered_session): pd_result = pd_df.sort_values("b") # B is unique so unstrict order mode result here should be equivalent to strictly ordered - assert_pandas_df_equal(bf_result, pd_result, ignore_order=False) + assert_frame_equal(bf_result, pd_result, ignore_order=False) def test_unordered_mode_no_ordering_error(unordered_session): diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index b9c51ab1ec..40de3bcdc1 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -32,7 +32,7 @@ import bigframes.series as series from bigframes.testing.utils import ( assert_dfs_equivalent, - assert_pandas_df_equal, + assert_frame_equal, assert_series_equal, assert_series_equivalent, convert_pandas_dtypes, @@ -226,7 +226,7 @@ def test_get_rows_with_slice(scalars_dfs, row_slice): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df[row_slice].to_pandas() pd_result = scalars_pandas_df[row_slice] - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_hasattr(scalars_dfs): @@ -253,7 +253,7 @@ def test_head_with_custom_column_labels( bf_df = scalars_df_index.rename(columns=rename_mapping).head(3) bf_result = bf_df.to_pandas(ordered=ordered) pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).head(3) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + assert_frame_equal(bf_result, pd_result, ignore_order=not ordered) def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): @@ -492,7 +492,7 @@ def test_drop_with_custom_column_labels(scalars_dfs): pd_result = scalars_pandas_df.rename(columns=rename_mapping).drop( columns=dropped_columns ) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_df_memory_usage(scalars_dfs): @@ -799,7 +799,7 @@ def test_take_df(scalars_dfs, indices, axis): bf_result = scalars_df.take(indices, axis=axis).to_pandas() pd_result = scalars_pandas_df.take(indices, axis=axis) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_filter_df(scalars_dfs): @@ -811,7 +811,7 @@ def test_filter_df(scalars_dfs): pd_bool_series = scalars_pandas_df["bool_col"] pd_result = scalars_pandas_df[pd_bool_series] - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_assign_new_column(scalars_dfs): @@ -824,7 +824,7 @@ def test_assign_new_column(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["new_col"] = pd_result["new_col"].astype("Int64") - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_assign_new_column_w_loc(scalars_dfs): @@ -962,7 +962,7 @@ def test_assign_existing_column(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_assign_listlike_to_empty_df(session): @@ -974,7 +974,7 @@ def test_assign_listlike_to_empty_df(session): pd_result["new_col"] = pd_result["new_col"].astype("Int64") pd_result.index = pd_result.index.astype("Int64") - assert_pandas_df_equal(bf_result.to_pandas(), pd_result) + assert_frame_equal(bf_result.to_pandas(), pd_result) def test_assign_to_empty_df_multiindex_error(session): @@ -1008,7 +1008,7 @@ def test_assign_series(scalars_dfs, ordered): bf_result = df.to_pandas(ordered=ordered) pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name]) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + assert_frame_equal(bf_result, pd_result, ignore_order=not ordered) def test_assign_series_overwrite(scalars_dfs): @@ -1020,7 +1020,7 @@ def test_assign_series_overwrite(scalars_dfs): **{column_name: scalars_pandas_df[column_name] + 3} ) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_assign_sequential(scalars_dfs): @@ -1035,7 +1035,7 @@ def test_assign_sequential(scalars_dfs): pd_result["new_col"] = pd_result["new_col"].astype("Int64") pd_result["new_col2"] = pd_result["new_col2"].astype("Int64") - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) # Require an index so that the self-join is consistent each time. @@ -1069,7 +1069,7 @@ def test_assign_different_df( new_col=scalars_pandas_df_index[column_name] ) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_assign_different_df_w_loc( @@ -1120,7 +1120,7 @@ def test_assign_callable_lambda(scalars_dfs): # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. pd_result["new_col"] = pd_result["new_col"].astype("Int64") - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -1395,9 +1395,7 @@ def test_df_merge(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) + assert_frame_equal(bf_result, pd_result, ignore_order=True, check_index_type=False) @pytest.mark.parametrize( @@ -1431,9 +1429,7 @@ def test_df_merge_multi_key(scalars_dfs, left_on, right_on): sort=True, ) - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) + assert_frame_equal(bf_result, pd_result, ignore_order=True, check_index_type=False) @pytest.mark.parametrize( @@ -1463,9 +1459,7 @@ def test_merge_custom_col_name(scalars_dfs, merge_how): pandas_right_df = scalars_pandas_df[right_columns] pd_result = pandas_left_df.merge(pandas_right_df, merge_how, on, sort=True) - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) + assert_frame_equal(bf_result, pd_result, ignore_order=True, check_index_type=False) @pytest.mark.parametrize( @@ -1498,9 +1492,7 @@ def test_merge_left_on_right_on(scalars_dfs, merge_how): sort=True, ) - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) + assert_frame_equal(bf_result, pd_result, ignore_order=True, check_index_type=False) def test_shape(scalars_dfs): @@ -1800,7 +1792,7 @@ def test_df_pos(scalars_dfs): bf_result = (+scalars_df[["int64_col", "numeric_col"]]).to_pandas() pd_result = +scalars_pandas_df[["int64_col", "numeric_col"]] - assert_pandas_df_equal(pd_result, bf_result) + assert_frame_equal(pd_result, bf_result) def test_df_neg(scalars_dfs): @@ -1808,7 +1800,7 @@ def test_df_neg(scalars_dfs): bf_result = (-scalars_df[["int64_col", "numeric_col"]]).to_pandas() pd_result = -scalars_pandas_df[["int64_col", "numeric_col"]] - assert_pandas_df_equal(pd_result, bf_result) + assert_frame_equal(pd_result, bf_result) def test_df_invert(scalars_dfs): @@ -1818,7 +1810,7 @@ def test_df_invert(scalars_dfs): bf_result = (~scalars_df[columns]).to_pandas() pd_result = ~scalars_pandas_df[columns] - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_df_isnull(scalars_dfs): @@ -1835,7 +1827,7 @@ def test_df_isnull(scalars_dfs): pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_df_notnull(scalars_dfs): @@ -1852,7 +1844,7 @@ def test_df_notnull(scalars_dfs): pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -2190,7 +2182,7 @@ def test_scalar_binop(scalars_dfs, op, other_scalar, reverse_operands): bf_result = maybe_reversed_op(scalars_df[columns], other_scalar).to_pandas() pd_result = maybe_reversed_op(scalars_pandas_df[columns], other_scalar) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize(("other_scalar"), [1, -2]) @@ -2202,7 +2194,7 @@ def test_mod(scalars_dfs, other_scalar): bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).to_pandas() pd_result = scalars_pandas_df[["int64_col", "int64_too"]] % other_scalar - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_scalar_binop_str_exception(scalars_dfs): @@ -2258,7 +2250,7 @@ def test_series_binop_axis_index( bf_result = op(scalars_df[df_columns], scalars_df[series_column]).to_pandas() pd_result = op(scalars_pandas_df[df_columns], scalars_pandas_df[series_column]) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.parametrize( @@ -2286,7 +2278,7 @@ def test_listlike_binop_axis_1_in_memory_data(scalars_dfs, input): input = input.to_pandas() pd_result = scalars_pandas_df[df_columns].add(input, axis=1) - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_df_reverse_binop_pandas(scalars_dfs): @@ -2301,7 +2293,7 @@ def test_df_reverse_binop_pandas(scalars_dfs): bf_result = pd_series + scalars_df[df_columns].to_pandas() pd_result = pd_series + scalars_pandas_df[df_columns] - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_listlike_binop_axis_1_bf_index(scalars_dfs): @@ -2316,7 +2308,7 @@ def test_listlike_binop_axis_1_bf_index(scalars_dfs): ) pd_result = scalars_pandas_df[df_columns].add(pd.Index([1000, 2000, 3000]), axis=1) - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_binop_with_self_aggregate(session, scalars_dfs): @@ -2330,7 +2322,7 @@ def test_binop_with_self_aggregate(session, scalars_dfs): pd_df = scalars_pandas_df[df_columns] pd_result = pd_df - pd_df.mean() - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + assert_frame_equal(bf_result, pd_result, check_dtype=False) @pytest.mark.parametrize( @@ -2398,7 +2390,7 @@ def test_series_binop_add_different_table( scalars_pandas_df_index[series_column], axis="index" ) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + assert_frame_equal(bf_result, pd_result, ignore_order=not ordered) # TODO(garrettwu): Test series binop with different index @@ -2433,7 +2425,7 @@ def test_join_same_table(scalars_dfs, how): pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) @all_joins @@ -2446,7 +2438,7 @@ def test_join_different_table( pd_df_a = scalars_pandas_df_index[["string_col", "int64_col"]] pd_df_b = scalars_pandas_df_index.dropna()[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) @all_joins @@ -2503,7 +2495,7 @@ def test_join_param_on(scalars_dfs, how): pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) pd_df_b = pd_df[["float64_col"]] pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) @all_joins @@ -2524,7 +2516,7 @@ def test_df_join_series(scalars_dfs, how): pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) pd_series_b = pd_df["float64_col"] pd_result = pd_df_a.join(pd_series_b, on="rowindex_2", how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + assert_frame_equal(bf_result, pd_result, ignore_order=True) @pytest.mark.parametrize( @@ -2798,17 +2790,13 @@ def test_df_transpose(): ) rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"]) - pd_df = pandas.DataFrame( - values, index=rows_multi, columns=columns_multi, dtype="Float64" - ) - bf_df = dataframe.DataFrame( - values, index=rows_multi, columns=columns_multi, dtype="Float64" - ) + pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi) + bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi) pd_result = pd_df.T bf_result = bf_df.T.to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + assert_frame_equal(pd_result, bf_result, check_dtype=False, nulls_are_nan=True) def test_df_transpose_error(): @@ -3034,7 +3022,7 @@ def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index, ordered): bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas(ordered=ordered) pd_result = scalars_pandas_df_index.iloc[1:].iloc[1:] - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + assert_frame_equal(bf_result, pd_result, ignore_order=not ordered) @pytest.mark.parametrize( @@ -3802,7 +3790,7 @@ def test_df_setattr_index(): pd_df.index = pandas.Index([4, 5]) bf_df.index = [4, 5] - assert_pandas_df_equal( + assert_frame_equal( pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False ) @@ -3817,7 +3805,7 @@ def test_df_setattr_columns(): bf_df.columns = pandas.Index([4, 5, 6]) - assert_pandas_df_equal( + assert_frame_equal( pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False ) @@ -3830,7 +3818,7 @@ def test_df_setattr_modify_column(): pd_df.my_column = [4, 5] bf_df.my_column = [4, 5] - assert_pandas_df_equal( + assert_frame_equal( pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False ) @@ -4061,9 +4049,7 @@ def test_df_from_dict_columns_orient(): data = {"a": [1, 2], "b": [3.3, 2.4]} bf_result = dataframe.DataFrame.from_dict(data, orient="columns").to_pandas() pd_result = pd.DataFrame.from_dict(data, orient="columns") - assert_pandas_df_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) + assert_frame_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) def test_df_from_dict_index_orient(): @@ -4072,9 +4058,7 @@ def test_df_from_dict_index_orient(): data, orient="index", columns=["col1", "col2"] ).to_pandas() pd_result = pd.DataFrame.from_dict(data, orient="index", columns=["col1", "col2"]) - assert_pandas_df_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) + assert_frame_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) def test_df_from_dict_tight_orient(): @@ -4088,9 +4072,7 @@ def test_df_from_dict_tight_orient(): bf_result = dataframe.DataFrame.from_dict(data, orient="tight").to_pandas() pd_result = pd.DataFrame.from_dict(data, orient="tight") - assert_pandas_df_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) + assert_frame_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) def test_df_from_records(): @@ -4100,9 +4082,7 @@ def test_df_from_records(): records, columns=["c1", "c2"] ).to_pandas() pd_result = pd.DataFrame.from_records(records, columns=["c1", "c2"]) - assert_pandas_df_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) + assert_frame_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) def test_df_to_dict(scalars_df_index, scalars_pandas_df_index): @@ -4342,7 +4322,7 @@ def test_assign_after_binop_row_joins(): bf_df["metric_diff"] = bf_df.metric1 - bf_df.metric2 pd_df["metric_diff"] = pd_df.metric1 - pd_df.metric2 - assert_pandas_df_equal(bf_df.to_pandas(), pd_df) + assert_frame_equal(bf_df.to_pandas(), pd_df) def test_df_dot_inline(session): diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index ece42fe3d4..9f1a247250 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -38,7 +38,7 @@ import bigframes.pandas as bpd import bigframes.series as series from bigframes.testing.utils import ( - assert_pandas_df_equal, + assert_frame_equal, assert_series_equal, convert_pandas_dtypes, get_first_file_from_wildcard, @@ -1786,7 +1786,7 @@ def test_take(scalars_dfs, indices): bf_result = scalars_df.take(indices).to_pandas() pd_result = scalars_pandas_df.take(indices) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_nested_filter(scalars_dfs): @@ -3455,7 +3455,7 @@ def test_to_frame(scalars_dfs): bf_result = scalars_df["int64_col"].to_frame().to_pandas() pd_result = scalars_pandas_df["int64_col"].to_frame() - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_to_frame_no_name(scalars_dfs): @@ -3464,7 +3464,7 @@ def test_to_frame_no_name(scalars_dfs): bf_result = scalars_df["int64_col"].rename(None).to_frame().to_pandas() pd_result = scalars_pandas_df["int64_col"].rename(None).to_frame() - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) @pytest.mark.skip(reason="fixture 'gcs_folder' not found") @@ -3713,7 +3713,7 @@ def test_mask_default_value(scalars_dfs): pd_col_masked = pd_col.mask(pd_col % 2 == 1) pd_result = pd_col.to_frame().assign(int64_col_masked=pd_col_masked) - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_mask_custom_value(scalars_dfs): @@ -3731,7 +3731,7 @@ def test_mask_custom_value(scalars_dfs): # odd so should be left as is, but it is being masked in pandas. # Accidentally the bigframes bahavior matches, but it should be updated # after the resolution of https://github.com/pandas-dev/pandas/issues/52955 - assert_pandas_df_equal(bf_result, pd_result) + assert_frame_equal(bf_result, pd_result) def test_mask_with_callable(scalars_df_index, scalars_pandas_df_index): @@ -4194,7 +4194,7 @@ def test_loc_bool_series_default_index( scalars_pandas_df_default_index.bool_col ] - assert_pandas_df_equal( + assert_frame_equal( bf_result.to_frame(), pd_result.to_frame(), )