From 4a8e0287a630226166f02f21b88e8e07f23bcca4 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 27 Jun 2025 21:48:44 +0000 Subject: [PATCH 01/13] support thresh in dropna --- bigframes/core/block_transforms.py | 53 ++++++++++++++----- bigframes/dataframe.py | 49 ++++++++++++----- tests/system/small/test_dataframe.py | 41 +++++++++----- .../bigframes_vendored/pandas/core/frame.py | 24 +++++++++ 4 files changed, 130 insertions(+), 37 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 09ef17dff5..939d41c885 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -523,6 +523,7 @@ def dropna( block: blocks.Block, column_ids: typing.Sequence[str], how: typing.Literal["all", "any"] = "any", + thresh: typing.Optional[int] = None, subset: Optional[typing.Sequence[str]] = None, ): """ @@ -531,18 +532,46 @@ def dropna( if subset is None: subset = column_ids - predicates = [ - ops.notnull_op.as_expr(column_id) - for column_id in column_ids - if column_id in subset - ] - if len(predicates) == 0: - return block - if how == "any": - predicate = functools.reduce(ops.and_op.as_expr, predicates) - else: # "all" - predicate = functools.reduce(ops.or_op.as_expr, predicates) - return block.filter(predicate) + if thresh is not None: + # Count non-null values per row + notnull_predicates = [ + ops.notnull_op.as_expr(column_id) + for column_id in column_ids + if column_id in subset + ] + + if len(notnull_predicates) == 0: + return block + + # Handle single predicate case + if len(notnull_predicates) == 1: + count_expr = ops.AsTypeOp(pd.Int64Dtype()).as_expr(notnull_predicates[0]) + else: + # Sum the boolean expressions to count non-null values + count_expr = functools.reduce( + lambda a, b: ops.add_op.as_expr( + ops.AsTypeOp(pd.Int64Dtype()).as_expr(a), + ops.AsTypeOp(pd.Int64Dtype()).as_expr(b), + ), + notnull_predicates, + ) + + # Filter rows where count >= thresh + thresh_predicate = ops.ge_op.as_expr(count_expr, ex.const(thresh)) + return block.filter(thresh_predicate) + else: + predicates = [ + ops.notnull_op.as_expr(column_id) + for column_id in column_ids + if column_id in subset + ] + if len(predicates) == 0: + return block + if how == "any": + predicate = functools.reduce(ops.and_op.as_expr, predicates) + else: # "all" + predicate = functools.reduce(ops.or_op.as_expr, predicates) + return block.filter(predicate) def nsmallest( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1884f0beff..5430371433 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2801,7 +2801,8 @@ def dropna( self, *, axis: int | str = 0, - how: str = "any", + how: typing.Literal["all", "any"] = "any", + thresh: typing.Optional[int] = None, subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None, inplace: bool = False, ignore_index=False, @@ -2810,6 +2811,10 @@ def dropna( raise NotImplementedError( f"'inplace'=True not supported. {constants.FEEDBACK_LINK}" ) + if thresh is not None and how != "any": + raise TypeError( + "You cannot set both the how and thresh arguments at the same time." + ) if how not in ("any", "all"): raise ValueError("'how' must be one of 'any', 'all'") @@ -2833,21 +2838,41 @@ def dropna( for id_ in self._block.label_to_col_id[label] ] - result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids) # type: ignore + result = block_ops.dropna( + self._block, + self._block.value_columns, + how=how, + thresh=thresh, + subset=subset_ids, + ) # type: ignore if ignore_index: result = result.reset_index() return DataFrame(result) else: - isnull_block = self._block.multi_apply_unary_op(ops.isnull_op) - if how == "any": - null_locations = DataFrame(isnull_block).any().to_pandas() - else: # 'all' - null_locations = DataFrame(isnull_block).all().to_pandas() - keep_columns = [ - col - for col, to_drop in zip(self._block.value_columns, null_locations) - if not to_drop - ] + if thresh is not None: + # Count non-null values per column + isnull_block = self._block.multi_apply_unary_op(ops.isnull_op) + notnull_block = self._block.multi_apply_unary_op(ops.notnull_op) + + # Sum non-null values for each column + notnull_counts = DataFrame(notnull_block).sum().to_pandas() + + keep_columns = [ + col + for col, count in zip(self._block.value_columns, notnull_counts) + if count >= thresh + ] + else: + isnull_block = self._block.multi_apply_unary_op(ops.isnull_op) + if how == "any": + null_locations = DataFrame(isnull_block).any().to_pandas() + else: # 'all' + null_locations = DataFrame(isnull_block).all().to_pandas() + keep_columns = [ + col + for col, to_drop in zip(self._block.value_columns, null_locations) + if not to_drop + ] return DataFrame(self._block.select_columns(keep_columns)) def any( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 8ea1259325..5f0c4f3b2a 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1196,26 +1196,41 @@ def test_assign_callable_lambda(scalars_dfs): @pytest.mark.parametrize( - ("axis", "how", "ignore_index", "subset"), + ("axis", "how", "ignore_index", "subset", "thresh"), [ - (0, "any", False, None), - (0, "any", True, None), - (0, "all", False, ["bool_col", "time_col"]), - (0, "any", False, ["bool_col", "time_col"]), - (0, "all", False, "time_col"), - (1, "any", False, None), - (1, "all", False, None), + (0, "any", False, None, None), + (0, "any", True, None, None), + (0, "all", False, ["bool_col", "time_col"], None), + (0, "any", False, ["bool_col", "time_col"], None), + (0, "all", False, "time_col", None), + (1, "any", False, None, None), + (1, "all", False, None, None), + (0, "any", False, None, 2), + (0, "any", True, None, 3), + (1, "any", False, None, 2), ], ) -def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset): +def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset, thresh): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) + + if thresh is not None: + df = scalars_df.dropna( + axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset + ) + pd_result = scalars_pandas_df.dropna( + axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset + ) + else: + df = scalars_df.dropna( + axis=axis, how=how, ignore_index=ignore_index, subset=subset + ) + pd_result = scalars_pandas_df.dropna( + axis=axis, how=how, ignore_index=ignore_index, subset=subset + ) + bf_result = df.to_pandas() - pd_result = scalars_pandas_df.dropna( - axis=axis, how=how, ignore_index=ignore_index, subset=subset - ) # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 40ab5a7352..3bd10281d9 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1762,6 +1762,7 @@ def dropna( *, axis: int | str = 0, how: str = "any", + thresh: Optional[int] = None, subset=None, inplace: bool = False, ignore_index=False, @@ -1812,6 +1813,25 @@ def dropna( [3 rows x 3 columns] + Keep rows with at least 2 non-null values. + + >>> df.dropna(thresh=2) + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip + + [2 rows x 3 columns] + + Keep columns with at least 2 non-null values: + + >>> df.dropna(axis='columns', thresh=2) + name toy + 0 Alfred + 1 Batman Batmobile + 2 Catwoman Bullwhip + + [3 rows x 2 columns] + Define in which columns to look for missing values. >>> df.dropna(subset=['name', 'toy']) @@ -1834,6 +1854,8 @@ def dropna( * 'any' : If any NA values are present, drop that row or column. * 'all' : If all values are NA, drop that row or column. + typing(int, optional): + Require that many non-NA values. Cannot be combined with how. subset (column label or sequence of labels, optional): Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include. @@ -1851,6 +1873,8 @@ def dropna( Raises: ValueError: If ``how`` is not one of ``any`` or ``all``. + TyperError: + If both ``how`` and ``thresh`` are specified. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From cbe83a96dcbd523f15b80a879be641ae08b0b7b4 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 30 Jun 2025 05:33:02 +0000 Subject: [PATCH 02/13] update docstring, and polish function --- bigframes/core/block_transforms.py | 37 ++++------ bigframes/dataframe.py | 5 +- tests/system/small/test_dataframe.py | 71 +++++++++++-------- .../bigframes_vendored/pandas/core/frame.py | 4 +- 4 files changed, 61 insertions(+), 56 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 939d41c885..cf0d8ec57f 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -532,20 +532,20 @@ def dropna( if subset is None: subset = column_ids - if thresh is not None: - # Count non-null values per row - notnull_predicates = [ - ops.notnull_op.as_expr(column_id) - for column_id in column_ids - if column_id in subset - ] + # Predicates to check for non-null values in the subset of columns + predicates = [ + ops.notnull_op.as_expr(column_id) + for column_id in column_ids + if column_id in subset + ] - if len(notnull_predicates) == 0: - return block + if len(predicates) == 0: + return block + if thresh is not None: # Handle single predicate case - if len(notnull_predicates) == 1: - count_expr = ops.AsTypeOp(pd.Int64Dtype()).as_expr(notnull_predicates[0]) + if len(predicates) == 1: + count_expr = ops.AsTypeOp(pd.Int64Dtype()).as_expr(predicates[0]) else: # Sum the boolean expressions to count non-null values count_expr = functools.reduce( @@ -553,25 +553,18 @@ def dropna( ops.AsTypeOp(pd.Int64Dtype()).as_expr(a), ops.AsTypeOp(pd.Int64Dtype()).as_expr(b), ), - notnull_predicates, + predicates, ) # Filter rows where count >= thresh - thresh_predicate = ops.ge_op.as_expr(count_expr, ex.const(thresh)) - return block.filter(thresh_predicate) + predicate = ops.ge_op.as_expr(count_expr, ex.const(thresh)) else: - predicates = [ - ops.notnull_op.as_expr(column_id) - for column_id in column_ids - if column_id in subset - ] - if len(predicates) == 0: - return block if how == "any": predicate = functools.reduce(ops.and_op.as_expr, predicates) else: # "all" predicate = functools.reduce(ops.or_op.as_expr, predicates) - return block.filter(predicate) + + return block.filter(predicate) def nsmallest( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5430371433..c3ea03045d 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2850,11 +2850,8 @@ def dropna( return DataFrame(result) else: if thresh is not None: - # Count non-null values per column - isnull_block = self._block.multi_apply_unary_op(ops.isnull_op) + # Keep columns with at least 'thresh' non-null values notnull_block = self._block.multi_apply_unary_op(ops.notnull_op) - - # Sum non-null values for each column notnull_counts = DataFrame(notnull_block).sum().to_pandas() keep_columns = [ diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 5f0c4f3b2a..70f551ef6f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1196,47 +1196,62 @@ def test_assign_callable_lambda(scalars_dfs): @pytest.mark.parametrize( - ("axis", "how", "ignore_index", "subset", "thresh"), + ("axis", "how", "ignore_index", "subset"), [ - (0, "any", False, None, None), - (0, "any", True, None, None), - (0, "all", False, ["bool_col", "time_col"], None), - (0, "any", False, ["bool_col", "time_col"], None), - (0, "all", False, "time_col", None), - (1, "any", False, None, None), - (1, "all", False, None, None), - (0, "any", False, None, 2), - (0, "any", True, None, 3), - (1, "any", False, None, 2), + (0, "any", False, None), + (0, "any", True, None), + (0, "all", False, ["bool_col", "time_col"]), + (0, "any", False, ["bool_col", "time_col"]), + (0, "all", False, "time_col"), + (1, "any", False, None), + (1, "all", False, None), ], ) -def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset, thresh): +def test_df_dropna_by_how(scalars_dfs, axis, how, ignore_index, subset): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs - - if thresh is not None: - df = scalars_df.dropna( - axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset - ) - pd_result = scalars_pandas_df.dropna( - axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset - ) - else: - df = scalars_df.dropna( - axis=axis, how=how, ignore_index=ignore_index, subset=subset - ) - pd_result = scalars_pandas_df.dropna( - axis=axis, how=how, ignore_index=ignore_index, subset=subset - ) - + df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) bf_result = df.to_pandas() + pd_result = scalars_pandas_df.dropna( + axis=axis, how=how, ignore_index=ignore_index, subset=subset + ) # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) pandas.testing.assert_frame_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("axis", "ignore_index", "subset", "thresh"), + [ + (0, False, None, 2), + (0, True, None, 3), + (1, False, None, 2), + ], +) +def test_df_dropna_by_thresh(scalars_dfs, axis, ignore_index, subset, thresh): + """ + Tests that dropna correctly keeps rows/columns with a minimum number + of non-null values. + """ + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + df_result = scalars_df.dropna( + axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset + ) + pd_result = scalars_pandas_df.dropna( + axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset + ) + + bf_result = df_result.to_pandas() + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_frame_equal(bf_result, pd_result) + + def test_df_dropna_range_columns(scalars_dfs): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 3bd10281d9..61abca74db 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1842,7 +1842,7 @@ def dropna( [2 rows x 3 columns] Args: - axis ({0 or 'index', 1 or 'columns'}, default 'columns'): + axis ({0 or 'index', 1 or 'columns'}, default 0): Determine if rows or columns which contain missing values are removed. @@ -1854,7 +1854,7 @@ def dropna( * 'any' : If any NA values are present, drop that row or column. * 'all' : If all values are NA, drop that row or column. - typing(int, optional): + thresh (int, optional): Require that many non-NA values. Cannot be combined with how. subset (column label or sequence of labels, optional): Labels along other axis to consider, e.g. if you are dropping From e1a0fafa784177f86a608022f8ecb52858763dd8 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 7 Jul 2025 16:30:32 +0000 Subject: [PATCH 03/13] fix mypy --- bigframes/core/block_transforms.py | 4 ++-- bigframes/dataframe.py | 20 +++++++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index cf0d8ec57f..cb7c1923cf 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -522,7 +522,7 @@ def rank( def dropna( block: blocks.Block, column_ids: typing.Sequence[str], - how: typing.Literal["all", "any"] = "any", + how: str = "any", thresh: typing.Optional[int] = None, subset: Optional[typing.Sequence[str]] = None, ): @@ -555,10 +555,10 @@ def dropna( ), predicates, ) - # Filter rows where count >= thresh predicate = ops.ge_op.as_expr(count_expr, ex.const(thresh)) else: + # Only handle 'how' parameter when thresh is not specified if how == "any": predicate = functools.reduce(ops.and_op.as_expr, predicates) else: # "all" diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c3ea03045d..1821c5791b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2801,7 +2801,7 @@ def dropna( self, *, axis: int | str = 0, - how: typing.Literal["all", "any"] = "any", + how: str = "any", thresh: typing.Optional[int] = None, subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None, inplace: bool = False, @@ -2811,12 +2811,18 @@ def dropna( raise NotImplementedError( f"'inplace'=True not supported. {constants.FEEDBACK_LINK}" ) - if thresh is not None and how != "any": - raise TypeError( - "You cannot set both the how and thresh arguments at the same time." - ) - if how not in ("any", "all"): - raise ValueError("'how' must be one of 'any', 'all'") + + # Check if both thresh and how are explicitly provided + if thresh is not None: + # cannot specify both thresh and how parameters + if how != "any": + raise TypeError( + "You cannot set both the how and thresh arguments at the same time." + ) + else: + # Only validate 'how' when thresh is not provided + if how not in ("any", "all"): + raise ValueError("'how' must be one of 'any', 'all'") axis_n = utils.get_axis_number(axis) From 03f86ba7b08162e913e03d882e064093e8e7844c Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 7 Jul 2025 20:31:35 +0000 Subject: [PATCH 04/13] add explicit handling for empty slices --- bigframes/core/rewrite/slices.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index bed3a8a3f3..c503a6d36d 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -76,6 +76,17 @@ def rewrite_slice(node: nodes.BigFrameNode): return node slice_def = (node.start, node.stop, node.step) + + # Handle empty slice cases exlicitly (e.g. [0:0]) + if ( + node.start is not None + and node.stop is not None + and node.start >= node.stop + and (node.step is None or node.step > 0) + ): + # Return empty result by filtering with impossible condition + return slice_as_filter(node.child, node.start, node.start, node.step or 1) + # no-op (eg. df[::1]) if slices.is_noop(slice_def, node.child.row_count): return node.child From 6a188588025da0bd2eddefdf56f3a06cdf0a46d3 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 7 Jul 2025 21:16:03 +0000 Subject: [PATCH 05/13] fix the kokoro issue with unreliable row count --- bigframes/core/rewrite/slices.py | 35 ++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index c503a6d36d..6703810404 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -77,15 +77,12 @@ def rewrite_slice(node: nodes.BigFrameNode): slice_def = (node.start, node.stop, node.step) - # Handle empty slice cases exlicitly (e.g. [0:0]) - if ( - node.start is not None - and node.stop is not None - and node.start >= node.stop - and (node.step is None or node.step > 0) - ): - # Return empty result by filtering with impossible condition - return slice_as_filter(node.child, node.start, node.start, node.step or 1) + # Handle empty slice cases explicitly before any row count dependent logic + # This ensures empty slices always return empty results regardless of statistics + if _is_empty_slice(node.start, node.stop, node.step): + # Create a filter that will always return empty results + # Use start=0, stop=0, step=1 to ensure empty result + return slice_as_filter(node.child, 0, 0, 1) # no-op (eg. df[::1]) if slices.is_noop(slice_def, node.child.row_count): @@ -100,6 +97,26 @@ def rewrite_slice(node: nodes.BigFrameNode): return slice_as_filter(node.child, *slice_def) +def _is_empty_slice(start, stop, step): + """Check if a slice will always return empty results.""" + if start is None or stop is None: + return False + + # Normalize step + if step is None: + step = 1 + + # For positive step, empty if start >= stop + # For negative step, empty if start <= stop + if step > 0: + return start >= stop + elif step < 0: + return start <= stop + else: + # step == 0 is invalid, but handle gracefully + return True + + def slice_as_filter( node: nodes.BigFrameNode, start: Optional[int], stop: Optional[int], step: int ) -> nodes.BigFrameNode: From dbcce52831458dc53b5ee8c22acf655527fafc59 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 7 Jul 2025 22:01:14 +0000 Subject: [PATCH 06/13] debug for row count issue in kokoro --- bigframes/core/rewrite/slices.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index 6703810404..b2278b356c 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -76,12 +76,14 @@ def rewrite_slice(node: nodes.BigFrameNode): return node slice_def = (node.start, node.stop, node.step) + print(f"DEBUG: Processing slice {slice_def}, row_count={node.child.row_count}") # Handle empty slice cases explicitly before any row count dependent logic # This ensures empty slices always return empty results regardless of statistics if _is_empty_slice(node.start, node.stop, node.step): # Create a filter that will always return empty results # Use start=0, stop=0, step=1 to ensure empty result + print("DEBUG: Detected empty slice, returning filtered result") return slice_as_filter(node.child, 0, 0, 1) # no-op (eg. df[::1]) From 236a6e54a84391f0ae8317eb529823cf0d39f024 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 7 Jul 2025 23:55:56 +0000 Subject: [PATCH 07/13] Revert "debug for row count issue in kokoro" This reverts commit 0141a7cda596b1da928c7c2d770653b9c6be10e7. --- bigframes/core/rewrite/slices.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index b2278b356c..6703810404 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -76,14 +76,12 @@ def rewrite_slice(node: nodes.BigFrameNode): return node slice_def = (node.start, node.stop, node.step) - print(f"DEBUG: Processing slice {slice_def}, row_count={node.child.row_count}") # Handle empty slice cases explicitly before any row count dependent logic # This ensures empty slices always return empty results regardless of statistics if _is_empty_slice(node.start, node.stop, node.step): # Create a filter that will always return empty results # Use start=0, stop=0, step=1 to ensure empty result - print("DEBUG: Detected empty slice, returning filtered result") return slice_as_filter(node.child, 0, 0, 1) # no-op (eg. df[::1]) From 8fd03f8364c5bf980ad82acd881aab7801c70f96 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 7 Jul 2025 23:56:01 +0000 Subject: [PATCH 08/13] Revert "fix the kokoro issue with unreliable row count" This reverts commit e3180f7f5b2dbcc8bc933545fc6c31c64cb92702. --- bigframes/core/rewrite/slices.py | 35 ++++++++------------------------ 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index 6703810404..c503a6d36d 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -77,12 +77,15 @@ def rewrite_slice(node: nodes.BigFrameNode): slice_def = (node.start, node.stop, node.step) - # Handle empty slice cases explicitly before any row count dependent logic - # This ensures empty slices always return empty results regardless of statistics - if _is_empty_slice(node.start, node.stop, node.step): - # Create a filter that will always return empty results - # Use start=0, stop=0, step=1 to ensure empty result - return slice_as_filter(node.child, 0, 0, 1) + # Handle empty slice cases exlicitly (e.g. [0:0]) + if ( + node.start is not None + and node.stop is not None + and node.start >= node.stop + and (node.step is None or node.step > 0) + ): + # Return empty result by filtering with impossible condition + return slice_as_filter(node.child, node.start, node.start, node.step or 1) # no-op (eg. df[::1]) if slices.is_noop(slice_def, node.child.row_count): @@ -97,26 +100,6 @@ def rewrite_slice(node: nodes.BigFrameNode): return slice_as_filter(node.child, *slice_def) -def _is_empty_slice(start, stop, step): - """Check if a slice will always return empty results.""" - if start is None or stop is None: - return False - - # Normalize step - if step is None: - step = 1 - - # For positive step, empty if start >= stop - # For negative step, empty if start <= stop - if step > 0: - return start >= stop - elif step < 0: - return start <= stop - else: - # step == 0 is invalid, but handle gracefully - return True - - def slice_as_filter( node: nodes.BigFrameNode, start: Optional[int], stop: Optional[int], step: int ) -> nodes.BigFrameNode: From 8c181949dc7b223bdb4fe299f00f653fbf0c2504 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 7 Jul 2025 23:56:03 +0000 Subject: [PATCH 09/13] Revert "add explicit handling for empty slices" This reverts commit d0d616b0eca4247dd3d7498b6413f92e72e46e72. --- bigframes/core/rewrite/slices.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index c503a6d36d..bed3a8a3f3 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -76,17 +76,6 @@ def rewrite_slice(node: nodes.BigFrameNode): return node slice_def = (node.start, node.stop, node.step) - - # Handle empty slice cases exlicitly (e.g. [0:0]) - if ( - node.start is not None - and node.stop is not None - and node.start >= node.stop - and (node.step is None or node.step > 0) - ): - # Return empty result by filtering with impossible condition - return slice_as_filter(node.child, node.start, node.start, node.step or 1) - # no-op (eg. df[::1]) if slices.is_noop(slice_def, node.child.row_count): return node.child From 8af0582987f7f9b8288de97220c6542854be4c0e Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 8 Jul 2025 00:26:17 +0000 Subject: [PATCH 10/13] Reapply "fix the kokoro issue with unreliable row count" This reverts commit b5c8f52c065652dbb8912ec9259fe57b27ae2a94. --- bigframes/core/rewrite/slices.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index bed3a8a3f3..6703810404 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -76,6 +76,14 @@ def rewrite_slice(node: nodes.BigFrameNode): return node slice_def = (node.start, node.stop, node.step) + + # Handle empty slice cases explicitly before any row count dependent logic + # This ensures empty slices always return empty results regardless of statistics + if _is_empty_slice(node.start, node.stop, node.step): + # Create a filter that will always return empty results + # Use start=0, stop=0, step=1 to ensure empty result + return slice_as_filter(node.child, 0, 0, 1) + # no-op (eg. df[::1]) if slices.is_noop(slice_def, node.child.row_count): return node.child @@ -89,6 +97,26 @@ def rewrite_slice(node: nodes.BigFrameNode): return slice_as_filter(node.child, *slice_def) +def _is_empty_slice(start, stop, step): + """Check if a slice will always return empty results.""" + if start is None or stop is None: + return False + + # Normalize step + if step is None: + step = 1 + + # For positive step, empty if start >= stop + # For negative step, empty if start <= stop + if step > 0: + return start >= stop + elif step < 0: + return start <= stop + else: + # step == 0 is invalid, but handle gracefully + return True + + def slice_as_filter( node: nodes.BigFrameNode, start: Optional[int], stop: Optional[int], step: int ) -> nodes.BigFrameNode: From 78755e20b38d86602fce2ffb57e1257e35d3f654 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 8 Jul 2025 20:43:23 +0000 Subject: [PATCH 11/13] Revert "Reapply "fix the kokoro issue with unreliable row count"" This reverts commit a6bc2c5a3b21113bcdfdbc3d81a9cc45ca259a12. --- bigframes/core/rewrite/slices.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index 6703810404..bed3a8a3f3 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -76,14 +76,6 @@ def rewrite_slice(node: nodes.BigFrameNode): return node slice_def = (node.start, node.stop, node.step) - - # Handle empty slice cases explicitly before any row count dependent logic - # This ensures empty slices always return empty results regardless of statistics - if _is_empty_slice(node.start, node.stop, node.step): - # Create a filter that will always return empty results - # Use start=0, stop=0, step=1 to ensure empty result - return slice_as_filter(node.child, 0, 0, 1) - # no-op (eg. df[::1]) if slices.is_noop(slice_def, node.child.row_count): return node.child @@ -97,26 +89,6 @@ def rewrite_slice(node: nodes.BigFrameNode): return slice_as_filter(node.child, *slice_def) -def _is_empty_slice(start, stop, step): - """Check if a slice will always return empty results.""" - if start is None or stop is None: - return False - - # Normalize step - if step is None: - step = 1 - - # For positive step, empty if start >= stop - # For negative step, empty if start <= stop - if step > 0: - return start >= stop - elif step < 0: - return start <= stop - else: - # step == 0 is invalid, but handle gracefully - return True - - def slice_as_filter( node: nodes.BigFrameNode, start: Optional[int], stop: Optional[int], step: int ) -> nodes.BigFrameNode: From 64985266ec84f9433c5b6d313cc2f85f31a906ff Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 8 Jul 2025 20:43:26 +0000 Subject: [PATCH 12/13] Reapply "add explicit handling for empty slices" This reverts commit 1a9a5a89fb23a4985a47a264dbb213e65cacbe38. --- bigframes/core/rewrite/slices.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index bed3a8a3f3..c503a6d36d 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -76,6 +76,17 @@ def rewrite_slice(node: nodes.BigFrameNode): return node slice_def = (node.start, node.stop, node.step) + + # Handle empty slice cases exlicitly (e.g. [0:0]) + if ( + node.start is not None + and node.stop is not None + and node.start >= node.stop + and (node.step is None or node.step > 0) + ): + # Return empty result by filtering with impossible condition + return slice_as_filter(node.child, node.start, node.start, node.step or 1) + # no-op (eg. df[::1]) if slices.is_noop(slice_def, node.child.row_count): return node.child From 0173100475fa9846f2bede71c7b61ef2ed536c72 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 8 Jul 2025 20:44:47 +0000 Subject: [PATCH 13/13] Revert "Reapply "add explicit handling for empty slices"" This reverts commit 42a08d1a6d5a5bf62f9342aa3ceaac7f62e49f3b. --- bigframes/core/rewrite/slices.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index c503a6d36d..bed3a8a3f3 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -76,17 +76,6 @@ def rewrite_slice(node: nodes.BigFrameNode): return node slice_def = (node.start, node.stop, node.step) - - # Handle empty slice cases exlicitly (e.g. [0:0]) - if ( - node.start is not None - and node.stop is not None - and node.start >= node.stop - and (node.step is None or node.step > 0) - ): - # Return empty result by filtering with impossible condition - return slice_as_filter(node.child, node.start, node.start, node.step or 1) - # no-op (eg. df[::1]) if slices.is_noop(slice_def, node.child.row_count): return node.child