Skip to content

Commit 7ff08da

Browse files
fix: Improve strictness of nan vs None usage
1 parent 2526448 commit 7ff08da

File tree

8 files changed: 59 additions and 29 deletions.

bigframes/core/blocks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1295,7 +1295,7 @@ def aggregate_all_and_stack(
12951295
as_array = ops.ToArrayOp().as_expr(*(col for col in self.value_columns))
12961296
reduced = ops.ArrayReduceOp(operation).as_expr(as_array)
12971297
block, id = self.project_expr(reduced, None)
1298-
return block.select_column(id)
1298+
return block.select_column(id).with_column_labels(pd.Index([None]))
12991299

13001300
def aggregate_size(
13011301
self,

bigframes/dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5012,7 +5012,7 @@ def duplicated(self, subset=None, keep: str = "first") -> bigframes.series.Serie
50125012
return bigframes.series.Series(
50135013
block.select_column(
50145014
indicator,
5015-
)
5015+
).with_column_labels(pandas.Index([None])),
50165016
)
50175017

50185018
def rank(

bigframes/series.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2653,9 +2653,10 @@ def _apply_unary_op(
26532653
) -> Series:
26542654
"""Applies a unary operator to the series."""
26552655
block, result_id = self._block.apply_unary_op(
2656-
self._value_column, op, result_label=self._name
2656+
self._value_column,
2657+
op,
26572658
)
2658-
return Series(block.select_column(result_id))
2659+
return Series(block.select_column(result_id), name=self.name) # type: ignore
26592660

26602661
def _apply_binary_op(
26612662
self,
@@ -2683,17 +2684,19 @@ def _apply_binary_op(
26832684
expr = op.as_expr(
26842685
other_col if reverse else self_col, self_col if reverse else other_col
26852686
)
2686-
block, result_id = block.project_expr(expr, name)
2687-
return Series(block.select_column(result_id))
2687+
block, result_id = block.project_expr(expr)
2688+
block = block.select_column(result_id).with_column_labels([name])
2689+
return Series(block) # type: ignore
26882690

26892691
else: # Scalar binop
26902692
name = self._name
26912693
expr = op.as_expr(
26922694
ex.const(other) if reverse else self._value_column,
26932695
self._value_column if reverse else ex.const(other),
26942696
)
2695-
block, result_id = self._block.project_expr(expr, name)
2696-
return Series(block.select_column(result_id))
2697+
block, result_id = self._block.project_expr(expr)
2698+
block = block.select_column(result_id).with_column_labels([name])
2699+
return Series(block) # type: ignore
26972700

26982701
def _apply_nary_op(
26992702
self,

bigframes/testing/utils.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@
2222
from google.cloud.functions_v2.types import functions
2323
import numpy as np
2424
import pandas as pd
25+
import pandas.api.types as pd_types
2526
import pyarrow as pa # type: ignore
2627
import pytest
2728

2829
from bigframes import operations as ops
2930
from bigframes.core import expression as ex
31+
import bigframes.dtypes
3032
import bigframes.functions._utils as bff_utils
3133
import bigframes.pandas as bpd
3234

@@ -98,7 +100,12 @@ def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs):
98100

99101

100102
def assert_series_equal(
101-
left: pd.Series, right: pd.Series, ignore_order: bool = False, **kwargs
103+
left: pd.Series,
104+
right: pd.Series,
105+
*,
106+
ignore_order: bool = False,
107+
nulls_are_nan: bool = True,
108+
**kwargs,
102109
):
103110
if ignore_order:
104111
if left.index.name is None:
@@ -108,6 +115,16 @@ def assert_series_equal(
108115
left = left.sort_index()
109116
right = right.sort_index()
110117

118+
if nulls_are_nan:
119+
if left.dtype == bigframes.dtypes.FLOAT_DTYPE:
120+
left = left.astype("float64")
121+
if right.dtype == bigframes.dtypes.FLOAT_DTYPE:
122+
right = right.astype("float64")
123+
if pd_types.is_object_dtype(left):
124+
left = left.fillna(float("nan"))
125+
if pd_types.is_object_dtype(right):
126+
right = right.fillna(float("nan"))
127+
111128
pd.testing.assert_series_equal(left, right, **kwargs)
112129

113130

tests/system/small/test_dataframe.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3547,7 +3547,8 @@ def test_dataframe_diff(scalars_df_index, scalars_pandas_df_index, periods):
35473547
def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods):
35483548
col_names = ["int64_too", "float64_col", "int64_col"]
35493549
bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas()
3550-
pd_result = scalars_pandas_df_index[col_names].pct_change(periods=periods)
3550+
# pandas 3.0 does not automatically ffill anymore
3551+
pd_result = scalars_pandas_df_index[col_names].ffill().pct_change(periods=periods)
35513552
pd.testing.assert_frame_equal(
35523553
pd_result,
35533554
bf_result,
@@ -3657,8 +3658,12 @@ def test_df_transpose():
36573658
)
36583659
rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"])
36593660

3660-
pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi)
3661-
bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi)
3661+
pd_df = pandas.DataFrame(
3662+
values, index=rows_multi, columns=columns_multi, dtype="Float64"
3663+
)
3664+
bf_df = dataframe.DataFrame(
3665+
values, index=rows_multi, columns=columns_multi, dtype="Float64"
3666+
)
36623667

36633668
pd_result = pd_df.T
36643669
bf_result = bf_df.T.to_pandas()
@@ -4376,10 +4381,8 @@ def test_dataframe_aggregates_axis_1(scalars_df_index, scalars_pandas_df_index,
43764381
bf_result = op(scalars_df_index[col_names]).to_pandas()
43774382
pd_result = op(scalars_pandas_df_index[col_names])
43784383

4379-
# Pandas may produce narrower numeric types, but bigframes always produces Float64
4380-
pd_result = pd_result.astype("Float64")
43814384
# Pandas has object index type
4382-
pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False)
4385+
assert_series_equal(pd_result, bf_result, check_index_type=False, check_dtype=False)
43834386

43844387

43854388
def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index):

tests/system/small/test_series.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,8 @@ def test_series_replace_dict(scalars_dfs, replacement_dict):
801801
)
802802
def test_series_interpolate(method):
803803
pytest.importorskip("scipy")
804+
if method == "pad" and pd.__version__.startswith("3."):
805+
pytest.skip("pandas 3.0 dropped method='pad'")
804806

805807
values = [None, 1, 2, None, None, 16, None]
806808
index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8]
@@ -813,11 +815,12 @@ def test_series_interpolate(method):
813815
bf_result = bf_series.interpolate(method=method).to_pandas()
814816

815817
# pd uses non-null types, while bf uses nullable types
816-
pd.testing.assert_series_equal(
818+
assert_series_equal(
817819
pd_result,
818820
bf_result,
819821
check_index_type=False,
820822
check_dtype=False,
823+
nulls_are_nan=True,
821824
)
822825

823826

@@ -2730,7 +2733,7 @@ def test_diff(scalars_df_index, scalars_pandas_df_index, periods):
27302733
def test_series_pct_change(scalars_df_index, scalars_pandas_df_index, periods):
27312734
bf_result = scalars_df_index["int64_col"].pct_change(periods=periods).to_pandas()
27322735
# cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA
2733-
pd_result = scalars_pandas_df_index["int64_col"].pct_change(periods=periods)
2736+
pd_result = scalars_pandas_df_index["int64_col"].ffill().pct_change(periods=periods)
27342737

27352738
pd.testing.assert_series_equal(
27362739
bf_result,

tests/unit/test_dataframe_polars.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2687,7 +2687,8 @@ def test_dataframe_diff(scalars_df_index, scalars_pandas_df_index, periods):
26872687
def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods):
26882688
col_names = ["int64_too", "float64_col", "int64_col"]
26892689
bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas()
2690-
pd_result = scalars_pandas_df_index[col_names].pct_change(periods=periods)
2690+
# pandas 3.0 does not automatically ffill anymore
2691+
pd_result = scalars_pandas_df_index[col_names].ffill().pct_change(periods=periods)
26912692
pd.testing.assert_frame_equal(
26922693
pd_result,
26932694
bf_result,
@@ -2797,8 +2798,12 @@ def test_df_transpose():
27972798
)
27982799
rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"])
27992800

2800-
pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi)
2801-
bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi)
2801+
pd_df = pandas.DataFrame(
2802+
values, index=rows_multi, columns=columns_multi, dtype="Float64"
2803+
)
2804+
bf_df = dataframe.DataFrame(
2805+
values, index=rows_multi, columns=columns_multi, dtype="Float64"
2806+
)
28022807

28032808
pd_result = pd_df.T
28042809
bf_result = bf_df.T.to_pandas()
@@ -3386,9 +3391,8 @@ def test_dataframe_aggregates_axis_1(scalars_df_index, scalars_pandas_df_index,
33863391
pd_result = op(scalars_pandas_df_index[col_names])
33873392

33883393
# Pandas may produce narrower numeric types, but bigframes always produces Float64
3389-
pd_result = pd_result.astype("Float64")
33903394
# Pandas has object index type
3391-
pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False)
3395+
assert_series_equal(pd_result, bf_result, check_index_type=False, check_dtype=False)
33923396

33933397

33943398
@pytest.mark.parametrize(

tests/unit/test_series_polars.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -798,6 +798,8 @@ def test_series_replace_dict(scalars_dfs, replacement_dict):
798798
)
799799
def test_series_interpolate(method):
800800
pytest.importorskip("scipy")
801+
if method == "pad" and pd.__version__.startswith("3."):
802+
pytest.skip("pandas 3.0 dropped method='pad'")
801803

802804
values = [None, 1, 2, None, None, 16, None]
803805
index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8]
@@ -810,11 +812,12 @@ def test_series_interpolate(method):
810812
bf_result = bf_series.interpolate(method=method).to_pandas()
811813

812814
# pd uses non-null types, while bf uses nullable types
813-
pd.testing.assert_series_equal(
815+
assert_series_equal(
814816
pd_result,
815817
bf_result,
816818
check_index_type=False,
817819
check_dtype=False,
820+
nulls_are_nan=True,
818821
)
819822

820823

@@ -2739,12 +2742,9 @@ def test_diff(scalars_df_index, scalars_pandas_df_index, periods):
27392742
def test_series_pct_change(scalars_df_index, scalars_pandas_df_index, periods):
27402743
bf_result = scalars_df_index["int64_col"].pct_change(periods=periods).to_pandas()
27412744
# cumsum does not behave well on nullable ints in pandas, produces object type and never ignores NA
2742-
pd_result = scalars_pandas_df_index["int64_col"].pct_change(periods=periods)
2745+
pd_result = scalars_pandas_df_index["int64_col"].ffill().pct_change(periods=periods)
27432746

2744-
pd.testing.assert_series_equal(
2745-
bf_result,
2746-
pd_result,
2747-
)
2747+
assert_series_equal(bf_result, pd_result, nulls_are_nan=True)
27482748

27492749

27502750
@pytest.mark.skip(
@@ -4696,7 +4696,7 @@ def wrapped(x):
46964696

46974697
pd_result = pd_col.apply(wrapped)
46984698

4699-
assert_series_equal(bf_result, pd_result, check_dtype=False)
4699+
assert_series_equal(bf_result, pd_result, check_dtype=False, nulls_are_nan=True)
47004700

47014701

47024702
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)