support thresh in dropna

shuoweil · shuoweil · commit ceccbaa312f7 · 2025-07-07T19:10:47.000Z
diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
@@ -523,6 +523,7 @@ def dropna(
     block: blocks.Block,
     column_ids: typing.Sequence[str],
     how: typing.Literal["all", "any"] = "any",
+    thresh: typing.Optional[int] = None,
     subset: Optional[typing.Sequence[str]] = None,
 ):
     """
@@ -531,18 +532,46 @@ def dropna(
     if subset is None:
         subset = column_ids
 
-    predicates = [
-        ops.notnull_op.as_expr(column_id)
-        for column_id in column_ids
-        if column_id in subset
-    ]
-    if len(predicates) == 0:
-        return block
-    if how == "any":
-        predicate = functools.reduce(ops.and_op.as_expr, predicates)
-    else:  # "all"
-        predicate = functools.reduce(ops.or_op.as_expr, predicates)
-    return block.filter(predicate)
+    if thresh is not None:
+        # Count non-null values per row
+        notnull_predicates = [
+            ops.notnull_op.as_expr(column_id)
+            for column_id in column_ids
+            if column_id in subset
+        ]
+
+        if len(notnull_predicates) == 0:
+            return block
+
+        # Handle single predicate case
+        if len(notnull_predicates) == 1:
+            count_expr = ops.AsTypeOp(pd.Int64Dtype()).as_expr(notnull_predicates[0])
+        else:
+            # Sum the boolean expressions to count non-null values
+            count_expr = functools.reduce(
+                lambda a, b: ops.add_op.as_expr(
+                    ops.AsTypeOp(pd.Int64Dtype()).as_expr(a),
+                    ops.AsTypeOp(pd.Int64Dtype()).as_expr(b),
+                ),
+                notnull_predicates,
+            )
+
+        # Filter rows where count >= thresh
+        thresh_predicate = ops.ge_op.as_expr(count_expr, ex.const(thresh))
+        return block.filter(thresh_predicate)
+    else:
+        predicates = [
+            ops.notnull_op.as_expr(column_id)
+            for column_id in column_ids
+            if column_id in subset
+        ]
+        if len(predicates) == 0:
+            return block
+        if how == "any":
+            predicate = functools.reduce(ops.and_op.as_expr, predicates)
+        else:  # "all"
+            predicate = functools.reduce(ops.or_op.as_expr, predicates)
+        return block.filter(predicate)
 
 
 def nsmallest(
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -2801,7 +2801,8 @@ def dropna(
         self,
         *,
         axis: int | str = 0,
-        how: str = "any",
+        how: typing.Literal["all", "any"] = "any",
+        thresh: typing.Optional[int] = None,
         subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None,
         inplace: bool = False,
         ignore_index=False,
@@ -2810,6 +2811,10 @@ def dropna(
             raise NotImplementedError(
                 f"'inplace'=True not supported. {constants.FEEDBACK_LINK}"
             )
+        if thresh is not None and how != "any":
+            raise TypeError(
+                "You cannot set both the how and thresh arguments at the same time."
+            )
         if how not in ("any", "all"):
             raise ValueError("'how' must be one of 'any', 'all'")
 
@@ -2833,21 +2838,41 @@ def dropna(
                     for id_ in self._block.label_to_col_id[label]
                 ]
 
-            result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids)  # type: ignore
+            result = block_ops.dropna(
+                self._block,
+                self._block.value_columns,
+                how=how,
+                thresh=thresh,
+                subset=subset_ids,
+            )  # type: ignore
             if ignore_index:
                 result = result.reset_index()
             return DataFrame(result)
         else:
-            isnull_block = self._block.multi_apply_unary_op(ops.isnull_op)
-            if how == "any":
-                null_locations = DataFrame(isnull_block).any().to_pandas()
-            else:  # 'all'
-                null_locations = DataFrame(isnull_block).all().to_pandas()
-            keep_columns = [
-                col
-                for col, to_drop in zip(self._block.value_columns, null_locations)
-                if not to_drop
-            ]
+            if thresh is not None:
+                # Count non-null values per column
+                isnull_block = self._block.multi_apply_unary_op(ops.isnull_op)
+                notnull_block = self._block.multi_apply_unary_op(ops.notnull_op)
+
+                # Sum non-null values for each column
+                notnull_counts = DataFrame(notnull_block).sum().to_pandas()
+
+                keep_columns = [
+                    col
+                    for col, count in zip(self._block.value_columns, notnull_counts)
+                    if count >= thresh
+                ]
+            else:
+                isnull_block = self._block.multi_apply_unary_op(ops.isnull_op)
+                if how == "any":
+                    null_locations = DataFrame(isnull_block).any().to_pandas()
+                else:  # 'all'
+                    null_locations = DataFrame(isnull_block).all().to_pandas()
+                keep_columns = [
+                    col
+                    for col, to_drop in zip(self._block.value_columns, null_locations)
+                    if not to_drop
+                ]
             return DataFrame(self._block.select_columns(keep_columns))
 
     def any(
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -1181,26 +1181,41 @@ def test_assign_callable_lambda(scalars_dfs):
 
 
 @pytest.mark.parametrize(
-    ("axis", "how", "ignore_index", "subset"),
+    ("axis", "how", "ignore_index", "subset", "thresh"),
     [
-        (0, "any", False, None),
-        (0, "any", True, None),
-        (0, "all", False, ["bool_col", "time_col"]),
-        (0, "any", False, ["bool_col", "time_col"]),
-        (0, "all", False, "time_col"),
-        (1, "any", False, None),
-        (1, "all", False, None),
+        (0, "any", False, None, None),
+        (0, "any", True, None, None),
+        (0, "all", False, ["bool_col", "time_col"], None),
+        (0, "any", False, ["bool_col", "time_col"], None),
+        (0, "all", False, "time_col", None),
+        (1, "any", False, None, None),
+        (1, "all", False, None, None),
+        (0, "any", False, None, 2),
+        (0, "any", True, None, 3),
+        (1, "any", False, None, 2),
     ],
 )
-def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset):
+def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset, thresh):
     # TODO: supply a reason why this isn't compatible with pandas 1.x
     pytest.importorskip("pandas", minversion="2.0.0")
     scalars_df, scalars_pandas_df = scalars_dfs
-    df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset)
+
+    if thresh is not None:
+        df = scalars_df.dropna(
+            axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
+        )
+        pd_result = scalars_pandas_df.dropna(
+            axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
+        )
+    else:
+        df = scalars_df.dropna(
+            axis=axis, how=how, ignore_index=ignore_index, subset=subset
+        )
+        pd_result = scalars_pandas_df.dropna(
+            axis=axis, how=how, ignore_index=ignore_index, subset=subset
+        )
+
     bf_result = df.to_pandas()
-    pd_result = scalars_pandas_df.dropna(
-        axis=axis, how=how, ignore_index=ignore_index, subset=subset
-    )
 
     # Pandas uses int64 instead of Int64 (nullable) dtype.
     pd_result.index = pd_result.index.astype(pd.Int64Dtype())
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1762,6 +1762,7 @@ def dropna(
         *,
         axis: int | str = 0,
         how: str = "any",
+        thresh: Optional[int] = None,
         subset=None,
         inplace: bool = False,
         ignore_index=False,
@@ -1812,6 +1813,25 @@ def dropna(
             <BLANKLINE>
             [3 rows x 3 columns]
 
+        Keep rows with at least 2 non-null values.
+
+            >>> df.dropna(thresh=2)
+                            name        toy        born
+            1    Batman  Batmobile  1940-04-25
+            2  Catwoman   Bullwhip        <NA>
+            <BLANKLINE>
+            [2 rows x 3 columns]
+
+        Keep columns with at least 2 non-null values.
+
+            >>> df.dropna(axis='columns', thresh=2)
+                name        toy
+            0    Alfred       <NA>
+            1    Batman  Batmobile
+            2  Catwoman   Bullwhip
+            <BLANKLINE>
+            [3 rows x 2 columns]
+
         Define in which columns to look for missing values.
 
             >>> df.dropna(subset=['name', 'toy'])
@@ -1834,6 +1854,8 @@ def dropna(
 
                 * 'any' : If any NA values are present, drop that row or column.
                 * 'all' : If all values are NA, drop that row or column.
+            thresh (int, optional):
+                Require that many non-NA values. Cannot be combined with ``how``.
             subset (column label or sequence of labels, optional):
                 Labels along other axis to consider, e.g. if you are dropping
                 rows these would be a list of columns to include.
@@ -1851,6 +1873,8 @@ def dropna(
         Raises:
             ValueError:
                 If ``how`` is not one of ``any`` or ``all``.
+            TypeError:
+                If both ``how`` and ``thresh`` are specified.
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)