Skip to content

Commit e81e5f4

Browse files
committed
Update docstring and polish function
1 parent bc20a31 commit e81e5f4

File tree

4 files changed

+61
-56
lines changed

4 files changed

+61
-56
lines changed

bigframes/core/block_transforms.py

Lines changed: 15 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -532,46 +532,39 @@ def dropna(
532532
if subset is None:
533533
subset = column_ids
534534

535-
if thresh is not None:
536-
# Count non-null values per row
537-
notnull_predicates = [
538-
ops.notnull_op.as_expr(column_id)
539-
for column_id in column_ids
540-
if column_id in subset
541-
]
535+
# Predicates to check for non-null values in the subset of columns
536+
predicates = [
537+
ops.notnull_op.as_expr(column_id)
538+
for column_id in column_ids
539+
if column_id in subset
540+
]
542541

543-
if len(notnull_predicates) == 0:
544-
return block
542+
if len(predicates) == 0:
543+
return block
545544

545+
if thresh is not None:
546546
# Handle single predicate case
547-
if len(notnull_predicates) == 1:
548-
count_expr = ops.AsTypeOp(pd.Int64Dtype()).as_expr(notnull_predicates[0])
547+
if len(predicates) == 1:
548+
count_expr = ops.AsTypeOp(pd.Int64Dtype()).as_expr(predicates[0])
549549
else:
550550
# Sum the boolean expressions to count non-null values
551551
count_expr = functools.reduce(
552552
lambda a, b: ops.add_op.as_expr(
553553
ops.AsTypeOp(pd.Int64Dtype()).as_expr(a),
554554
ops.AsTypeOp(pd.Int64Dtype()).as_expr(b),
555555
),
556-
notnull_predicates,
556+
predicates,
557557
)
558558

559559
# Filter rows where count >= thresh
560-
thresh_predicate = ops.ge_op.as_expr(count_expr, ex.const(thresh))
561-
return block.filter(thresh_predicate)
560+
predicate = ops.ge_op.as_expr(count_expr, ex.const(thresh))
562561
else:
563-
predicates = [
564-
ops.notnull_op.as_expr(column_id)
565-
for column_id in column_ids
566-
if column_id in subset
567-
]
568-
if len(predicates) == 0:
569-
return block
570562
if how == "any":
571563
predicate = functools.reduce(ops.and_op.as_expr, predicates)
572564
else: # "all"
573565
predicate = functools.reduce(ops.or_op.as_expr, predicates)
574-
return block.filter(predicate)
566+
567+
return block.filter(predicate)
575568

576569

577570
def nsmallest(

bigframes/dataframe.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2850,11 +2850,8 @@ def dropna(
28502850
return DataFrame(result)
28512851
else:
28522852
if thresh is not None:
2853-
# Count non-null values per column
2854-
isnull_block = self._block.multi_apply_unary_op(ops.isnull_op)
2853+
# Keep columns with at least 'thresh' non-null values
28552854
notnull_block = self._block.multi_apply_unary_op(ops.notnull_op)
2856-
2857-
# Sum non-null values for each column
28582855
notnull_counts = DataFrame(notnull_block).sum().to_pandas()
28592856

28602857
keep_columns = [

tests/system/small/test_dataframe.py

Lines changed: 43 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,47 +1181,62 @@ def test_assign_callable_lambda(scalars_dfs):
11811181

11821182

11831183
@pytest.mark.parametrize(
1184-
("axis", "how", "ignore_index", "subset", "thresh"),
1184+
("axis", "how", "ignore_index", "subset"),
11851185
[
1186-
(0, "any", False, None, None),
1187-
(0, "any", True, None, None),
1188-
(0, "all", False, ["bool_col", "time_col"], None),
1189-
(0, "any", False, ["bool_col", "time_col"], None),
1190-
(0, "all", False, "time_col", None),
1191-
(1, "any", False, None, None),
1192-
(1, "all", False, None, None),
1193-
(0, "any", False, None, 2),
1194-
(0, "any", True, None, 3),
1195-
(1, "any", False, None, 2),
1186+
(0, "any", False, None),
1187+
(0, "any", True, None),
1188+
(0, "all", False, ["bool_col", "time_col"]),
1189+
(0, "any", False, ["bool_col", "time_col"]),
1190+
(0, "all", False, "time_col"),
1191+
(1, "any", False, None),
1192+
(1, "all", False, None),
11961193
],
11971194
)
1198-
def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset, thresh):
1195+
def test_df_dropna_by_how(scalars_dfs, axis, how, ignore_index, subset):
11991196
# TODO: supply a reason why this isn't compatible with pandas 1.x
12001197
pytest.importorskip("pandas", minversion="2.0.0")
12011198
scalars_df, scalars_pandas_df = scalars_dfs
1202-
1203-
if thresh is not None:
1204-
df = scalars_df.dropna(
1205-
axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
1206-
)
1207-
pd_result = scalars_pandas_df.dropna(
1208-
axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
1209-
)
1210-
else:
1211-
df = scalars_df.dropna(
1212-
axis=axis, how=how, ignore_index=ignore_index, subset=subset
1213-
)
1214-
pd_result = scalars_pandas_df.dropna(
1215-
axis=axis, how=how, ignore_index=ignore_index, subset=subset
1216-
)
1217-
1199+
df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset)
12181200
bf_result = df.to_pandas()
1201+
pd_result = scalars_pandas_df.dropna(
1202+
axis=axis, how=how, ignore_index=ignore_index, subset=subset
1203+
)
12191204

12201205
# Pandas uses int64 instead of Int64 (nullable) dtype.
12211206
pd_result.index = pd_result.index.astype(pd.Int64Dtype())
12221207
pandas.testing.assert_frame_equal(bf_result, pd_result)
12231208

12241209

1210+
@pytest.mark.parametrize(
1211+
("axis", "ignore_index", "subset", "thresh"),
1212+
[
1213+
(0, False, None, 2),
1214+
(0, True, None, 3),
1215+
(1, False, None, 2),
1216+
],
1217+
)
1218+
def test_df_dropna_by_thresh(scalars_dfs, axis, ignore_index, subset, thresh):
1219+
"""
1220+
Tests that dropna correctly keeps rows/columns with a minimum number
1221+
of non-null values.
1222+
"""
1223+
# TODO: supply a reason why this isn't compatible with pandas 1.x
1224+
pytest.importorskip("pandas", minversion="2.0.0")
1225+
scalars_df, scalars_pandas_df = scalars_dfs
1226+
1227+
df_result = scalars_df.dropna(
1228+
axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
1229+
)
1230+
pd_result = scalars_pandas_df.dropna(
1231+
axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
1232+
)
1233+
1234+
bf_result = df_result.to_pandas()
1235+
# Pandas uses int64 instead of Int64 (nullable) dtype.
1236+
pd_result.index = pd_result.index.astype(pd.Int64Dtype())
1237+
pd.testing.assert_frame_equal(bf_result, pd_result)
1238+
1239+
12251240
def test_df_dropna_range_columns(scalars_dfs):
12261241
# TODO: supply a reason why this isn't compatible with pandas 1.x
12271242
pytest.importorskip("pandas", minversion="2.0.0")

third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1842,7 +1842,7 @@ def dropna(
18421842
[2 rows x 3 columns]
18431843
18441844
Args:
1845-
axis ({0 or 'index', 1 or 'columns'}, default 'columns'):
1845+
axis ({0 or 'index', 1 or 'columns'}, default 0):
18461846
Determine if rows or columns which contain missing values are
18471847
removed.
18481848
@@ -1854,7 +1854,7 @@ def dropna(
18541854
18551855
* 'any' : If any NA values are present, drop that row or column.
18561856
* 'all' : If all values are NA, drop that row or column.
1857-
typing(int, optional):
1857+
thresh (int, optional):
18581858
Require that many non-NA values. Cannot be combined with how.
18591859
subset (column label or sequence of labels, optional):
18601860
Labels along other axis to consider, e.g. if you are dropping

0 commit comments

Comments (0)