Skip to content

Commit e81e5f4

Browse files
committed
Update docstring and polish function
1 parent bc20a31 commit e81e5f4

File tree

4 files changed

+61
-56
lines changed

4 files changed

+61
-56
lines changed

bigframes/core/block_transforms.py

Lines changed: 15 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -532,46 +532,39 @@ def dropna(
532532
if subset is None:
533533
subset = column_ids
534534

535-
if thresh is not None:
536-
# Count non-null values per row
537-
notnull_predicates = [
538-
ops.notnull_op.as_expr(column_id)
539-
for column_id in column_ids
540-
if column_id in subset
541-
]
535+
# Predicates to check for non-null values in the subset of columns
536+
predicates = [
537+
ops.notnull_op.as_expr(column_id)
538+
for column_id in column_ids
539+
if column_id in subset
540+
]
542541

543-
if len(notnull_predicates) == 0:
544-
return block
542+
if len(predicates) == 0:
543+
return block
545544

545+
if thresh is not None:
546546
# Handle single predicate case
547-
if len(notnull_predicates) == 1:
548-
count_expr = ops.AsTypeOp(pd.Int64Dtype()).as_expr(notnull_predicates[0])
547+
if len(predicates) == 1:
548+
count_expr = ops.AsTypeOp(pd.Int64Dtype()).as_expr(predicates[0])
549549
else:
550550
# Sum the boolean expressions to count non-null values
551551
count_expr = functools.reduce(
552552
lambda a, b: ops.add_op.as_expr(
553553
ops.AsTypeOp(pd.Int64Dtype()).as_expr(a),
554554
ops.AsTypeOp(pd.Int64Dtype()).as_expr(b),
555555
),
556-
notnull_predicates,
556+
predicates,
557557
)
558558

559559
# Filter rows where count >= thresh
560-
thresh_predicate = ops.ge_op.as_expr(count_expr, ex.const(thresh))
561-
return block.filter(thresh_predicate)
560+
predicate = ops.ge_op.as_expr(count_expr, ex.const(thresh))
562561
else:
563-
predicates = [
564-
ops.notnull_op.as_expr(column_id)
565-
for column_id in column_ids
566-
if column_id in subset
567-
]
568-
if len(predicates) == 0:
569-
return block
570562
if how == "any":
571563
predicate = functools.reduce(ops.and_op.as_expr, predicates)
572564
else: # "all"
573565
predicate = functools.reduce(ops.or_op.as_expr, predicates)
574-
return block.filter(predicate)
566+
567+
return block.filter(predicate)
575568

576569

577570
def nsmallest(

bigframes/dataframe.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2850,11 +2850,8 @@ def dropna(
28502850
return DataFrame(result)
28512851
else:
28522852
if thresh is not None:
2853-
# Count non-null values per column
2854-
isnull_block = self._block.multi_apply_unary_op(ops.isnull_op)
2853+
# Keep columns with at least 'thresh' non-null values
28552854
notnull_block = self._block.multi_apply_unary_op(ops.notnull_op)
2856-
2857-
# Sum non-null values for each column
28582855
notnull_counts = DataFrame(notnull_block).sum().to_pandas()
28592856

28602857
keep_columns = [

tests/system/small/test_dataframe.py

Lines changed: 43 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,47 +1181,62 @@ def test_assign_callable_lambda(scalars_dfs):
11811181

11821182

11831183
@pytest.mark.parametrize(
1184-
("axis", "how", "ignore_index", "subset", "thresh"),
1184+
("axis", "how", "ignore_index", "subset"),
11851185
[
1186-
(0, "any", False, None, None),
1187-
(0, "any", True, None, None),
1188-
(0, "all", False, ["bool_col", "time_col"], None),
1189-
(0, "any", False, ["bool_col", "time_col"], None),
1190-
(0, "all", False, "time_col", None),
1191-
(1, "any", False, None, None),
1192-
(1, "all", False, None, None),
1193-
(0, "any", False, None, 2),
1194-
(0, "any", True, None, 3),
1195-
(1, "any", False, None, 2),
1186+
(0, "any", False, None),
1187+
(0, "any", True, None),
1188+
(0, "all", False, ["bool_col", "time_col"]),
1189+
(0, "any", False, ["bool_col", "time_col"]),
1190+
(0, "all", False, "time_col"),
1191+
(1, "any", False, None),
1192+
(1, "all", False, None),
11961193
],
11971194
)
1198-
def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset, thresh):
1195+
def test_df_dropna_by_how(scalars_dfs, axis, how, ignore_index, subset):
11991196
# TODO: supply a reason why this isn't compatible with pandas 1.x
12001197
pytest.importorskip("pandas", minversion="2.0.0")
12011198
scalars_df, scalars_pandas_df = scalars_dfs
1202-
1203-
if thresh is not None:
1204-
df = scalars_df.dropna(
1205-
axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
1206-
)
1207-
pd_result = scalars_pandas_df.dropna(
1208-
axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
1209-
)
1210-
else:
1211-
df = scalars_df.dropna(
1212-
axis=axis, how=how, ignore_index=ignore_index, subset=subset
1213-
)
1214-
pd_result = scalars_pandas_df.dropna(
1215-
axis=axis, how=how, ignore_index=ignore_index, subset=subset
1216-
)
1217-
1199+
df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset)
12181200
bf_result = df.to_pandas()
1201+
pd_result = scalars_pandas_df.dropna(
1202+
axis=axis, how=how, ignore_index=ignore_index, subset=subset
1203+
)
12191204

12201205
# Pandas uses int64 instead of Int64 (nullable) dtype.
12211206
pd_result.index = pd_result.index.astype(pd.Int64Dtype())
12221207
pandas.testing.assert_frame_equal(bf_result, pd_result)
12231208

12241209

1210+
@pytest.mark.parametrize(
1211+
("axis", "ignore_index", "subset", "thresh"),
1212+
[
1213+
(0, False, None, 2),
1214+
(0, True, None, 3),
1215+
(1, False, None, 2),
1216+
],
1217+
)
1218+
def test_df_dropna_by_thresh(scalars_dfs, axis, ignore_index, subset, thresh):
1219+
"""
1220+
Tests that dropna correctly keeps rows/columns with a minimum number
1221+
of non-null values.
1222+
"""
1223+
# TODO: supply a reason why this isn't compatible with pandas 1.x
1224+
pytest.importorskip("pandas", minversion="2.0.0")
1225+
scalars_df, scalars_pandas_df = scalars_dfs
1226+
1227+
df_result = scalars_df.dropna(
1228+
axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
1229+
)
1230+
pd_result = scalars_pandas_df.dropna(
1231+
axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset
1232+
)
1233+
1234+
bf_result = df_result.to_pandas()
1235+
# Pandas uses int64 instead of Int64 (nullable) dtype.
1236+
pd_result.index = pd_result.index.astype(pd.Int64Dtype())
1237+
pd.testing.assert_frame_equal(bf_result, pd_result)
1238+
1239+
12251240
def test_df_dropna_range_columns(scalars_dfs):
12261241
# TODO: supply a reason why this isn't compatible with pandas 1.x
12271242
pytest.importorskip("pandas", minversion="2.0.0")

third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1842,7 +1842,7 @@ def dropna(
18421842
[2 rows x 3 columns]
18431843
18441844
Args:
1845-
axis ({0 or 'index', 1 or 'columns'}, default 'columns'):
1845+
axis ({0 or 'index', 1 or 'columns'}, default 0):
18461846
Determine if rows or columns which contain missing values are
18471847
removed.
18481848
@@ -1854,7 +1854,7 @@ def dropna(
18541854
18551855
* 'any' : If any NA values are present, drop that row or column.
18561856
* 'all' : If all values are NA, drop that row or column.
1857-
typing(int, optional):
1857+
thresh (int, optional):
18581858
Require that many non-NA values. Cannot be combined with how.
18591859
subset (column label or sequence of labels, optional):
18601860
Labels along other axis to consider, e.g. if you are dropping

0 commit comments

Comments (0)