diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 09ef17dff5..cb7c1923cf 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -522,7 +522,8 @@ def rank( def dropna( block: blocks.Block, column_ids: typing.Sequence[str], - how: typing.Literal["all", "any"] = "any", + how: str = "any", + thresh: typing.Optional[int] = None, subset: Optional[typing.Sequence[str]] = None, ): """ @@ -531,17 +532,38 @@ def dropna( if subset is None: subset = column_ids + # Predicates to check for non-null values in the subset of columns predicates = [ ops.notnull_op.as_expr(column_id) for column_id in column_ids if column_id in subset ] + if len(predicates) == 0: return block - if how == "any": - predicate = functools.reduce(ops.and_op.as_expr, predicates) - else: # "all" - predicate = functools.reduce(ops.or_op.as_expr, predicates) + + if thresh is not None: + # Handle single predicate case + if len(predicates) == 1: + count_expr = ops.AsTypeOp(pd.Int64Dtype()).as_expr(predicates[0]) + else: + # Sum the boolean expressions to count non-null values + count_expr = functools.reduce( + lambda a, b: ops.add_op.as_expr( + ops.AsTypeOp(pd.Int64Dtype()).as_expr(a), + ops.AsTypeOp(pd.Int64Dtype()).as_expr(b), + ), + predicates, + ) + # Filter rows where count >= thresh + predicate = ops.ge_op.as_expr(count_expr, ex.const(thresh)) + else: + # Only handle 'how' parameter when thresh is not specified + if how == "any": + predicate = functools.reduce(ops.and_op.as_expr, predicates) + else: # "all" + predicate = functools.reduce(ops.or_op.as_expr, predicates) + return block.filter(predicate) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1884f0beff..1821c5791b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2802,6 +2802,7 @@ def dropna( *, axis: int | str = 0, how: str = "any", + thresh: typing.Optional[int] = None, subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None, inplace: bool = False, ignore_index=False, @@ -2810,8 +2811,18 @@ def dropna( raise NotImplementedError( f"'inplace'=True not supported. {constants.FEEDBACK_LINK}" ) - if how not in ("any", "all"): - raise ValueError("'how' must be one of 'any', 'all'") + + # Check if both thresh and how are explicitly provided + if thresh is not None: + # cannot specify both thresh and how parameters + if how != "any": + raise TypeError( + "You cannot set both the how and thresh arguments at the same time." 
+ ) + else: + # Only validate 'how' when thresh is not provided + if how not in ("any", "all"): + raise ValueError("'how' must be one of 'any', 'all'") axis_n = utils.get_axis_number(axis) @@ -2833,21 +2844,38 @@ def dropna( for id_ in self._block.label_to_col_id[label] ] - result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids) # type: ignore + result = block_ops.dropna( + self._block, + self._block.value_columns, + how=how, + thresh=thresh, + subset=subset_ids, + ) # type: ignore if ignore_index: result = result.reset_index() return DataFrame(result) else: - isnull_block = self._block.multi_apply_unary_op(ops.isnull_op) - if how == "any": - null_locations = DataFrame(isnull_block).any().to_pandas() - else: # 'all' - null_locations = DataFrame(isnull_block).all().to_pandas() - keep_columns = [ - col - for col, to_drop in zip(self._block.value_columns, null_locations) - if not to_drop - ] + if thresh is not None: + # Keep columns with at least 'thresh' non-null values + notnull_block = self._block.multi_apply_unary_op(ops.notnull_op) + notnull_counts = DataFrame(notnull_block).sum().to_pandas() + + keep_columns = [ + col + for col, count in zip(self._block.value_columns, notnull_counts) + if count >= thresh + ] + else: + isnull_block = self._block.multi_apply_unary_op(ops.isnull_op) + if how == "any": + null_locations = DataFrame(isnull_block).any().to_pandas() + else: # 'all' + null_locations = DataFrame(isnull_block).all().to_pandas() + keep_columns = [ + col + for col, to_drop in zip(self._block.value_columns, null_locations) + if not to_drop + ] return DataFrame(self._block.select_columns(keep_columns)) def any( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 8ea1259325..70f551ef6f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1207,7 +1207,7 @@ def test_assign_callable_lambda(scalars_dfs): (1, "all", False, None), ], ) -def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset): +def test_df_dropna_by_how(scalars_dfs, axis, how, ignore_index, subset): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs @@ -1222,6 +1222,36 @@ def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset): pandas.testing.assert_frame_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("axis", "ignore_index", "subset", "thresh"), + [ + (0, False, None, 2), + (0, True, None, 3), + (1, False, None, 2), + ], +) +def test_df_dropna_by_thresh(scalars_dfs, axis, ignore_index, subset, thresh): + """ + Tests that dropna correctly keeps rows/columns with a minimum number + of non-null values. + """ + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + df_result = scalars_df.dropna( + axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset + ) + pd_result = scalars_pandas_df.dropna( + axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset + ) + + bf_result = df_result.to_pandas() + # Pandas uses int64 instead of Int64 (nullable) dtype. 
+    pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+    pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
 def test_df_dropna_range_columns(scalars_dfs):
     # TODO: supply a reason why this isn't compatible with pandas 1.x
     pytest.importorskip("pandas", minversion="2.0.0")
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 40ab5a7352..61abca74db 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1762,6 +1762,7 @@ def dropna(
         *,
         axis: int | str = 0,
         how: str = "any",
+        thresh: Optional[int] = None,
         subset=None,
         inplace: bool = False,
         ignore_index=False,
@@ -1812,6 +1813,25 @@ def dropna(
 
             [3 rows x 3 columns]
 
+        Keep rows with at least 2 non-null values.
+
+        >>> df.dropna(thresh=2)
+               name        toy       born
+        1    Batman  Batmobile 1940-04-25
+        2  Catwoman   Bullwhip
+
+        [2 rows x 3 columns]
+
+        Keep columns with at least 2 non-null values.
+
+        >>> df.dropna(axis='columns', thresh=2)
+               name        toy
+        0    Alfred
+        1    Batman  Batmobile
+        2  Catwoman   Bullwhip
+
+        [3 rows x 2 columns]
+
         Define in which columns to look for missing values.
 
         >>> df.dropna(subset=['name', 'toy'])
@@ -1822,7 +1842,7 @@ def dropna(
 
             [2 rows x 3 columns]
 
         Args:
-            axis ({0 or 'index', 1 or 'columns'}, default 'columns'):
+            axis ({0 or 'index', 1 or 'columns'}, default 0):
                 Determine if rows or columns which contain missing values are
                 removed.
@@ -1834,6 +1854,8 @@ def dropna(
 
                 * 'any' : If any NA values are present, drop that row or column.
                 * 'all' : If all values are NA, drop that row or column.
+            thresh (int, optional):
+                Require that many non-NA values. Cannot be combined with ``how``.
             subset (column label or sequence of labels, optional):
                 Labels along other axis to consider, e.g. if you are dropping rows
                 these would be a list of columns to include.
@@ -1851,6 +1873,8 @@ def dropna(
 
         Raises:
            ValueError:
                If ``how`` is not one of ``any`` or ``all``.
+            TypeError:
+                If both ``how`` and ``thresh`` are specified.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
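
As a quick illustration of the semantics the new ``thresh`` argument is meant to mirror, the sketch below uses plain pandas, reusing the Alfred/Batman/Catwoman frame from the docstring example above; the snippet is not part of the patch, and the exact printed output depends on the pandas version.

import pandas as pd

df = pd.DataFrame(
    {
        "name": ["Alfred", "Batman", "Catwoman"],
        "toy": [None, "Batmobile", "Bullwhip"],
        "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT],
    }
)

# Rows need at least 2 non-NA values to survive; Alfred (only 1) is dropped.
print(df.dropna(thresh=2))

# Columns need at least 2 non-NA values; 'born' (only 1) is dropped.
print(df.dropna(axis="columns", thresh=2))

# With a subset, only those columns are counted toward the threshold; only
# Batman has 2 non-NA values across 'toy' and 'born'.
print(df.dropna(thresh=2, subset=["toy", "born"]))

# Passing thresh together with an explicit how raises TypeError in pandas;
# the DataFrame.dropna change above raises the same error when how != "any".
try:
    df.dropna(how="all", thresh=1)
except TypeError as exc:
    print(exc)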