From 952e4461f388730486b707d682b373f558f68489 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 7 Aug 2025 18:26:21 +0000 Subject: [PATCH 1/2] feat: Add value_counts to GroupBy classes --- bigframes/core/block_transforms.py | 19 ++- bigframes/core/groupby/dataframe_group_by.py | 35 ++++- bigframes/core/groupby/series_group_by.py | 24 ++++ bigframes/core/indexes/base.py | 2 +- bigframes/dataframe.py | 2 +- bigframes/series.py | 2 +- tests/system/small/test_groupby.py | 73 ++++++++++ .../pandas/core/groupby/__init__.py | 125 ++++++++++++++++++ 8 files changed, 271 insertions(+), 11 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index cb7c1923cf..465728b0ef 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -355,24 +355,28 @@ def value_counts( normalize: bool = False, sort: bool = True, ascending: bool = False, - dropna: bool = True, + drop_na: bool = True, + grouping_keys: typing.Sequence[str] = (), ): - block, dummy = block.create_constant(1) + if grouping_keys and drop_na: + # only need this if grouping_keys is involved, otherwise the drop_na in the aggregation will handle it for us + block = dropna(block, columns, how="any") block, agg_ids = block.aggregate( - by_column_ids=columns, - aggregations=[ex.UnaryAggregation(agg_ops.count_op, ex.deref(dummy))], - dropna=dropna, + by_column_ids=(*grouping_keys, *columns), + aggregations=[ex.NullaryAggregation(agg_ops.size_op)], + dropna=drop_na and not grouping_keys, ) count_id = agg_ids[0] if normalize: - unbound_window = windows.unbound() + unbound_window = windows.unbound(grouping_keys=tuple(grouping_keys)) block, total_count_id = block.apply_window_op( count_id, agg_ops.sum_op, unbound_window ) block, count_id = block.apply_binary_op(count_id, total_count_id, ops.div_op) if sort: - block = block.order_by( + order_parts = [ordering.ascending_over(id) for id in grouping_keys] + order_parts.extend( [ 
ordering.OrderingExpression( ex.deref(count_id), @@ -382,6 +386,7 @@ def value_counts( ) ] ) + block = block.order_by(order_parts) return block.select_column(count_id).with_column_labels( ["proportion" if normalize else "count"] ) diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py index a2c4cf2867..ef76010918 100644 --- a/bigframes/core/groupby/dataframe_group_by.py +++ b/bigframes/core/groupby/dataframe_group_by.py @@ -16,7 +16,7 @@ import datetime import typing -from typing import Literal, Sequence, Tuple, Union +from typing import Literal, Optional, Sequence, Tuple, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby @@ -330,6 +330,39 @@ def diff(self, periods=1) -> series.Series: ) return self._apply_window_op(agg_ops.DiffOp(periods), window=window) + def value_counts( + self, + subset: Optional[Sequence[blocks.Label]] = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> Union[df.DataFrame, series.Series]: + if subset is None: + columns = self._selected_cols + else: + columns = [ + column + for column in self._block.value_columns + if self._block.col_id_to_label[column] in subset + ] + block = self._block + if self._dropna: # this drops null grouping columns + block = block_ops.dropna(block, self._by_col_ids) + block = block_ops.value_counts( + block, + columns, + normalize=normalize, + sort=sort, + ascending=ascending, + drop_na=dropna, # this drops null value columns + grouping_keys=self._by_col_ids, + ) + if self._as_index: + return series.Series(block) + else: + return series.Series(block).to_frame().reset_index(drop=False) + @validations.requires_ordering() def rolling( self, diff --git a/bigframes/core/groupby/series_group_by.py b/bigframes/core/groupby/series_group_by.py index a29bb45a32..1d34469104 100644 --- a/bigframes/core/groupby/series_group_by.py +++ 
b/bigframes/core/groupby/series_group_by.py @@ -195,6 +195,30 @@ def agg(self, func=None) -> typing.Union[df.DataFrame, series.Series]: aggregate = agg + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> Union[df.DataFrame, series.Series]: + columns = [self._value_column] + block = self._block + if self._dropna: # this drops null grouping columns + block = block_ops.dropna(block, self._by_col_ids) + block = block_ops.value_counts( + block, + columns, + normalize=normalize, + sort=sort, + ascending=ascending, + drop_na=dropna, # this drops null value columns + grouping_keys=self._by_col_ids, + ) + # TODO: once as_index=False supported, return DataFrame instead by resetting index + # with .to_frame().reset_index(drop=False) + return series.Series(block) + @validations.requires_ordering() def cumsum(self, *args, **kwargs) -> series.Series: return self._apply_window_op( diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 9ad201c73d..e022b3f151 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -489,7 +489,7 @@ def value_counts( self._block.index_columns, normalize=normalize, ascending=ascending, - dropna=dropna, + drop_na=dropna, ) import bigframes.series as series diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7de4bdbc91..abaa68af8f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2475,7 +2475,7 @@ def value_counts( normalize=normalize, sort=sort, ascending=ascending, - dropna=dropna, + drop_na=dropna, ) return bigframes.series.Series(block) diff --git a/bigframes/series.py b/bigframes/series.py index 3a1af0bb1d..bfc26adc38 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1631,7 +1631,7 @@ def value_counts( [self._value_column], normalize=normalize, ascending=ascending, - dropna=dropna, + drop_na=dropna, ) return Series(block) diff --git 
a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 0af173adc8..d67eb96ac7 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -582,6 +582,46 @@ def test_dataframe_groupby_nonnumeric_with_mean(): ) +@pytest.mark.parametrize( + ("subset", "normalize", "ascending", "dropna", "as_index"), + [ + (None, True, True, True, True), + (["int64_too", "int64_col"], False, False, False, False), + ], +) +def test_dataframe_groupby_value_counts( + scalars_df_index, + scalars_pandas_df_index, + subset, + normalize, + ascending, + dropna, + as_index, +): + col_names = ["float64_col", "int64_col", "bool_col", "int64_too"] + bf_result = ( + scalars_df_index[col_names] + .groupby("bool_col", as_index=as_index) + .value_counts( + subset=subset, normalize=normalize, ascending=ascending, dropna=dropna + ) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index[col_names] + .groupby("bool_col", as_index=as_index) + .value_counts( + subset=subset, normalize=normalize, ascending=ascending, dropna=dropna + ) + ) + + if as_index: + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + else: + pd_result.index = pd_result.index.astype("Int64") + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + + # ============== # Series.groupby # ============== @@ -768,3 +808,36 @@ def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q): pd.testing.assert_series_equal( pd_result, bf_result, check_dtype=False, check_index_type=False ) + + +@pytest.mark.parametrize( + ("normalize", "ascending", "dropna"), + [ + ( + True, + True, + True, + ), + ( + False, + False, + False, + ), + ], +) +def test_series_groupby_value_counts( + scalars_df_index, + scalars_pandas_df_index, + normalize, + ascending, + dropna, +): + bf_result = ( + scalars_df_index.groupby("bool_col")["string_col"] + .value_counts(normalize=normalize, ascending=ascending, dropna=dropna) + .to_pandas() + ) + 
pd_result = scalars_pandas_df_index.groupby("bool_col")["string_col"].value_counts( + normalize=normalize, ascending=ascending, dropna=dropna + ) + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index ebfbfa8830..18d5c4499b 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -1256,6 +1256,32 @@ def nunique(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ): + """ + Return a Series or DataFrame containing counts of unique rows. + + Args: + normalize (bool, default False): + Return proportions rather than frequencies. + sort (bool, default True): + Sort by frequencies. + ascending (bool, default False): + Sort in ascending order. + dropna (bool, default True): + Don't include counts of rows that contain NA values. + + Returns: + Series or DataFrame: + Series if the groupby as_index is True, otherwise DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + class DataFrameGroupBy(GroupBy): def agg(self, func, **kwargs): @@ -1406,3 +1432,102 @@ def nunique(self): Number of unique values within a BigQuery DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def value_counts( + self, + subset=None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ): + """ + Return a Series or DataFrame containing counts of unique rows. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 
'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... }) + + >>> df + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + [6 rows x 3 columns] + + >>> df.groupby('gender').value_counts() + gender education country + female high FR 1 + US 1 + male low FR 2 + US 1 + medium FR 1 + Name: count, dtype: Int64 + + >>> df.groupby('gender').value_counts(ascending=True) + gender education country + female high FR 1 + US 1 + male low US 1 + medium FR 1 + low FR 2 + Name: count, dtype: Int64 + + >>> df.groupby('gender').value_counts(normalize=True) + gender education country + female high FR 0.5 + US 0.5 + male low FR 0.5 + US 0.25 + medium FR 0.25 + Name: proportion, dtype: Float64 + + >>> df.groupby('gender', as_index=False).value_counts() + gender education country count + 0 female high FR 1 + 1 female high US 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + [5 rows x 4 columns] + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + gender education country proportion + 0 female high FR 0.5 + 1 female high US 0.5 + 2 male low FR 0.5 + 3 male low US 0.25 + 4 male medium FR 0.25 + + [5 rows x 4 columns] + + Args: + subset (list-like, optional): + Columns to use when counting unique combinations. + normalize (bool, default False): + Return proportions rather than frequencies. + sort (bool, default True): + Sort by frequencies. + ascending (bool, default False): + Sort in ascending order. + dropna (bool, default True): + Don't include counts of rows that contain NA values. + + Returns: + Series or DataFrame: + Series if the groupby as_index is True, otherwise DataFrame. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From fbbfa567a891e208f1932ef5a5d1881384c32348 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 7 Aug 2025 20:20:50 +0000 Subject: [PATCH 2/2] skip tests on pandas 1.x --- tests/system/small/test_groupby.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 1626d4a453..5c89363e9b 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -598,6 +598,8 @@ def test_dataframe_groupby_value_counts( dropna, as_index, ): + if pd.__version__.startswith("1."): + pytest.skip("pandas 1.x produces different column labels.") col_names = ["float64_col", "int64_col", "bool_col", "int64_too"] bf_result = ( scalars_df_index[col_names] @@ -885,6 +887,8 @@ def test_series_groupby_value_counts( ascending, dropna, ): + if pd.__version__.startswith("1."): + pytest.skip("pandas 1.x produces different column labels.") bf_result = ( scalars_df_index.groupby("bool_col")["string_col"] .value_counts(normalize=normalize, ascending=ascending, dropna=dropna)