From 952e4461f388730486b707d682b373f558f68489 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 7 Aug 2025 18:26:21 +0000 Subject: [PATCH 1/2] feat: Add value_counts to GroupBy classes --- bigframes/core/block_transforms.py | 19 ++- bigframes/core/groupby/dataframe_group_by.py | 35 ++++- bigframes/core/groupby/series_group_by.py | 24 ++++ bigframes/core/indexes/base.py | 2 +- bigframes/dataframe.py | 2 +- bigframes/series.py | 2 +- tests/system/small/test_groupby.py | 73 ++++++++++ .../pandas/core/groupby/__init__.py | 125 ++++++++++++++++++ 8 files changed, 271 insertions(+), 11 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index cb7c1923cf..465728b0ef 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -355,24 +355,28 @@ def value_counts( normalize: bool = False, sort: bool = True, ascending: bool = False, - dropna: bool = True, + drop_na: bool = True, + grouping_keys: typing.Sequence[str] = (), ): - block, dummy = block.create_constant(1) + if grouping_keys and drop_na: + # only need this if grouping_keys is involved, otherwise the drop_na in the aggregation will handle it for us + block = dropna(block, columns, how="any") block, agg_ids = block.aggregate( - by_column_ids=columns, - aggregations=[ex.UnaryAggregation(agg_ops.count_op, ex.deref(dummy))], - dropna=dropna, + by_column_ids=(*grouping_keys, *columns), + aggregations=[ex.NullaryAggregation(agg_ops.size_op)], + dropna=drop_na and not grouping_keys, ) count_id = agg_ids[0] if normalize: - unbound_window = windows.unbound() + unbound_window = windows.unbound(grouping_keys=tuple(grouping_keys)) block, total_count_id = block.apply_window_op( count_id, agg_ops.sum_op, unbound_window ) block, count_id = block.apply_binary_op(count_id, total_count_id, ops.div_op) if sort: - block = block.order_by( + order_parts = [ordering.ascending_over(id) for id in grouping_keys] + order_parts.extend( [ 
ordering.OrderingExpression( ex.deref(count_id), @@ -382,6 +386,7 @@ def value_counts( ) ] ) + block = block.order_by(order_parts) return block.select_column(count_id).with_column_labels( ["proportion" if normalize else "count"] ) diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py index a2c4cf2867..ef76010918 100644 --- a/bigframes/core/groupby/dataframe_group_by.py +++ b/bigframes/core/groupby/dataframe_group_by.py @@ -16,7 +16,7 @@ import datetime import typing -from typing import Literal, Sequence, Tuple, Union +from typing import Literal, Optional, Sequence, Tuple, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby @@ -330,6 +330,39 @@ def diff(self, periods=1) -> series.Series: ) return self._apply_window_op(agg_ops.DiffOp(periods), window=window) + def value_counts( + self, + subset: Optional[Sequence[blocks.Label]] = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> Union[df.DataFrame, series.Series]: + if subset is None: + columns = self._selected_cols + else: + columns = [ + column + for column in self._block.value_columns + if self._block.col_id_to_label[column] in subset + ] + block = self._block + if self._dropna: # this drops null grouping columns + block = block_ops.dropna(block, self._by_col_ids) + block = block_ops.value_counts( + block, + columns, + normalize=normalize, + sort=sort, + ascending=ascending, + drop_na=dropna, # this drops null value columns + grouping_keys=self._by_col_ids, + ) + if self._as_index: + return series.Series(block) + else: + return series.Series(block).to_frame().reset_index(drop=False) + @validations.requires_ordering() def rolling( self, diff --git a/bigframes/core/groupby/series_group_by.py b/bigframes/core/groupby/series_group_by.py index a29bb45a32..1d34469104 100644 --- a/bigframes/core/groupby/series_group_by.py +++ 
b/bigframes/core/groupby/series_group_by.py @@ -195,6 +195,30 @@ def agg(self, func=None) -> typing.Union[df.DataFrame, series.Series]: aggregate = agg + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> Union[df.DataFrame, series.Series]: + columns = [self._value_column] + block = self._block + if self._dropna: # this drops null grouping columns + block = block_ops.dropna(block, self._by_col_ids) + block = block_ops.value_counts( + block, + columns, + normalize=normalize, + sort=sort, + ascending=ascending, + drop_na=dropna, # this drops null value columns + grouping_keys=self._by_col_ids, + ) + # TODO: once as_index=False supported, return DataFrame instead by resetting index + # with .to_frame().reset_index(drop=False) + return series.Series(block) + @validations.requires_ordering() def cumsum(self, *args, **kwargs) -> series.Series: return self._apply_window_op( diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 9ad201c73d..e022b3f151 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -489,7 +489,7 @@ def value_counts( self._block.index_columns, normalize=normalize, ascending=ascending, - dropna=dropna, + drop_na=dropna, ) import bigframes.series as series diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7de4bdbc91..abaa68af8f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2475,7 +2475,7 @@ def value_counts( normalize=normalize, sort=sort, ascending=ascending, - dropna=dropna, + drop_na=dropna, ) return bigframes.series.Series(block) diff --git a/bigframes/series.py b/bigframes/series.py index 3a1af0bb1d..bfc26adc38 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1631,7 +1631,7 @@ def value_counts( [self._value_column], normalize=normalize, ascending=ascending, - dropna=dropna, + drop_na=dropna, ) return Series(block) diff --git 
a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 0af173adc8..d67eb96ac7 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -582,6 +582,46 @@ def test_dataframe_groupby_nonnumeric_with_mean(): ) +@pytest.mark.parametrize( + ("subset", "normalize", "ascending", "dropna", "as_index"), + [ + (None, True, True, True, True), + (["int64_too", "int64_col"], False, False, False, False), + ], +) +def test_dataframe_groupby_value_counts( + scalars_df_index, + scalars_pandas_df_index, + subset, + normalize, + ascending, + dropna, + as_index, +): + col_names = ["float64_col", "int64_col", "bool_col", "int64_too"] + bf_result = ( + scalars_df_index[col_names] + .groupby("bool_col", as_index=as_index) + .value_counts( + subset=subset, normalize=normalize, ascending=ascending, dropna=dropna + ) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index[col_names] + .groupby("bool_col", as_index=as_index) + .value_counts( + subset=subset, normalize=normalize, ascending=ascending, dropna=dropna + ) + ) + + if as_index: + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + else: + pd_result.index = pd_result.index.astype("Int64") + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + + # ============== # Series.groupby # ============== @@ -768,3 +808,36 @@ def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q): pd.testing.assert_series_equal( pd_result, bf_result, check_dtype=False, check_index_type=False ) + + +@pytest.mark.parametrize( + ("normalize", "ascending", "dropna"), + [ + ( + True, + True, + True, + ), + ( + False, + False, + False, + ), + ], +) +def test_series_groupby_value_counts( + scalars_df_index, + scalars_pandas_df_index, + normalize, + ascending, + dropna, +): + bf_result = ( + scalars_df_index.groupby("bool_col")["string_col"] + .value_counts(normalize=normalize, ascending=ascending, dropna=dropna) + .to_pandas() + ) + 
pd_result = scalars_pandas_df_index.groupby("bool_col")["string_col"].value_counts( + normalize=normalize, ascending=ascending, dropna=dropna + ) + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index ebfbfa8830..18d5c4499b 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -1256,6 +1256,32 @@ def nunique(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ): + """ + Return a Series or DataFrame containing counts of unique rows. + + Args: + normalize (bool, default False): + Return proportions rather than frequencies. + sort (bool, default True): + Sort by frequencies. + ascending (bool, default False): + Sort in ascending order. + dropna (bool, default True): + Don't include counts of rows that contain NA values. + + Returns: + Series or DataFrame: + Series if the groupby as_index is True, otherwise DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + class DataFrameGroupBy(GroupBy): def agg(self, func, **kwargs): @@ -1406,3 +1432,102 @@ def nunique(self): Number of unique values within a BigQuery DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def value_counts( + self, + subset=None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ): + """ + Return a Series or DataFrame containing counts of unique rows. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 
'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... }) + + >>> df + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + [6 rows x 3 columns] + + >>> df.groupby('gender').value_counts() + gender education country + female high FR 1 + US 1 + male low FR 2 + US 1 + medium FR 1 + Name: count, dtype: Int64 + + >>> df.groupby('gender').value_counts(ascending=True) + gender education country + female high FR 1 + US 1 + male low US 1 + medium FR 1 + low FR 2 + Name: count, dtype: Int64 + + >>> df.groupby('gender').value_counts(normalize=True) + gender education country + female high FR 0.5 + US 0.5 + male low FR 0.5 + US 0.25 + medium FR 0.25 + Name: proportion, dtype: Float64 + + >>> df.groupby('gender', as_index=False).value_counts() + gender education country count + 0 female high FR 1 + 1 female high US 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + [5 rows x 4 columns] + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + gender education country proportion + 0 female high FR 0.5 + 1 female high US 0.5 + 2 male low FR 0.5 + 3 male low US 0.25 + 4 male medium FR 0.25 + + [5 rows x 4 columns] + + Args: + subset (list-like, optional): + Columns to use when counting unique combinations. + normalize (bool, default False): + Return proportions rather than frequencies. + sort (bool, default True): + Sort by frequencies. + ascending (bool, default False): + Sort in ascending order. + dropna (bool, default True): + Don't include counts of rows that contain NA values. + + Returns: + Series or DataFrame: + Series if the groupby as_index is True, otherwise DataFrame. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From fbbfa567a891e208f1932ef5a5d1881384c32348 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 7 Aug 2025 20:20:50 +0000 Subject: [PATCH 2/2] skip tests on pandas 1.x --- tests/system/small/test_groupby.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 1626d4a453..5c89363e9b 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -598,6 +598,8 @@ def test_dataframe_groupby_value_counts( dropna, as_index, ): + if pd.__version__.startswith("1."): + pytest.skip("pandas 1.x produces different column labels.") col_names = ["float64_col", "int64_col", "bool_col", "int64_too"] bf_result = ( scalars_df_index[col_names] @@ -885,6 +887,8 @@ def test_series_groupby_value_counts( ascending, dropna, ): + if pd.__version__.startswith("1."): + pytest.skip("pandas 1.x produces different column labels.") bf_result = ( scalars_df_index.groupby("bool_col")["string_col"] .value_counts(normalize=normalize, ascending=ascending, dropna=dropna)