diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
index cb7c1923cf..465728b0ef 100644
--- a/bigframes/core/block_transforms.py
+++ b/bigframes/core/block_transforms.py
@@ -355,24 +355,28 @@ def value_counts(
     normalize: bool = False,
     sort: bool = True,
     ascending: bool = False,
-    dropna: bool = True,
+    drop_na: bool = True,
+    grouping_keys: typing.Sequence[str] = (),
 ):
-    block, dummy = block.create_constant(1)
+    if grouping_keys and drop_na:
+        # Only needed when grouping_keys are provided; otherwise the drop_na in the aggregation handles it for us.
+        block = dropna(block, columns, how="any")
     block, agg_ids = block.aggregate(
-        by_column_ids=columns,
-        aggregations=[ex.UnaryAggregation(agg_ops.count_op, ex.deref(dummy))],
-        dropna=dropna,
+        by_column_ids=(*grouping_keys, *columns),
+        aggregations=[ex.NullaryAggregation(agg_ops.size_op)],
+        dropna=drop_na and not grouping_keys,
     )
     count_id = agg_ids[0]
     if normalize:
-        unbound_window = windows.unbound()
+        unbound_window = windows.unbound(grouping_keys=tuple(grouping_keys))
         block, total_count_id = block.apply_window_op(
             count_id, agg_ops.sum_op, unbound_window
         )
         block, count_id = block.apply_binary_op(count_id, total_count_id, ops.div_op)
     if sort:
-        block = block.order_by(
+        order_parts = [ordering.ascending_over(id) for id in grouping_keys]
+        order_parts.extend(
             [
                 ordering.OrderingExpression(
                     ex.deref(count_id),
@@ -382,6 +386,7 @@
                 )
             ]
         )
+        block = block.order_by(order_parts)
     return block.select_column(count_id).with_column_labels(
         ["proportion" if normalize else "count"]
     )
diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py
index 40bedd93d6..e4e4b313f9 100644
--- a/bigframes/core/groupby/dataframe_group_by.py
+++ b/bigframes/core/groupby/dataframe_group_by.py
@@ -16,7 +16,7 @@
 import datetime
 import typing
-from typing import Literal, Sequence, Tuple, Union
+from typing import Literal, Optional, Sequence, Tuple, Union

 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -372,6 +372,39 @@ def diff(self, periods=1) -> series.Series:
         )
         return self._apply_window_op(agg_ops.DiffOp(periods), window=window)

+    def value_counts(
+        self,
+        subset: Optional[Sequence[blocks.Label]] = None,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        dropna: bool = True,
+    ) -> Union[df.DataFrame, series.Series]:
+        if subset is None:
+            columns = self._selected_cols
+        else:
+            columns = [
+                column
+                for column in self._block.value_columns
+                if self._block.col_id_to_label[column] in subset
+            ]
+        block = self._block
+        if self._dropna:  # this drops rows with null grouping keys
+            block = block_ops.dropna(block, self._by_col_ids)
+        block = block_ops.value_counts(
+            block,
+            columns,
+            normalize=normalize,
+            sort=sort,
+            ascending=ascending,
+            drop_na=dropna,  # this drops rows with nulls in the value columns
+            grouping_keys=self._by_col_ids,
+        )
+        if self._as_index:
+            return series.Series(block)
+        else:
+            return series.Series(block).to_frame().reset_index(drop=False)
+
     @validations.requires_ordering()
     def rolling(
         self,
diff --git a/bigframes/core/groupby/series_group_by.py b/bigframes/core/groupby/series_group_by.py
index 24b5cba130..7a8bdcb6cf 100644
--- a/bigframes/core/groupby/series_group_by.py
+++ b/bigframes/core/groupby/series_group_by.py
@@ -244,6 +244,30 @@ def agg(self, func=None) -> typing.Union[df.DataFrame, series.Series]:

     aggregate = agg

+    def value_counts(
+        self,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        dropna: bool = True,
+    ) -> Union[df.DataFrame, series.Series]:
+        columns = [self._value_column]
+        block = self._block
+        if self._dropna:  # this drops rows with null grouping keys
+            block = block_ops.dropna(block, self._by_col_ids)
+        block = block_ops.value_counts(
+            block,
+            columns,
+            normalize=normalize,
+            sort=sort,
+            ascending=ascending,
+            drop_na=dropna,  # this drops rows with nulls in the value columns
+            grouping_keys=self._by_col_ids,
+        )
+        # TODO: once as_index=False is supported, return DataFrame instead by resetting index
+        # with .to_frame().reset_index(drop=False)
+        return series.Series(block)
+
     @validations.requires_ordering()
     def cumsum(self, *args, **kwargs) -> series.Series:
         return self._apply_window_op(
diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py
index 9ad201c73d..e022b3f151 100644
--- a/bigframes/core/indexes/base.py
+++ b/bigframes/core/indexes/base.py
@@ -489,7 +489,7 @@ def value_counts(
             self._block.index_columns,
             normalize=normalize,
             ascending=ascending,
-            dropna=dropna,
+            drop_na=dropna,
         )
         import bigframes.series as series
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 858be3de45..6a7f67b985 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -2475,7 +2475,7 @@ def value_counts(
             normalize=normalize,
             sort=sort,
             ascending=ascending,
-            dropna=dropna,
+            drop_na=dropna,
         )
         return bigframes.series.Series(block)
diff --git a/bigframes/series.py b/bigframes/series.py
index 3a1af0bb1d..bfc26adc38 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -1631,7 +1631,7 @@ def value_counts(
             [self._value_column],
             normalize=normalize,
             ascending=ascending,
-            dropna=dropna,
+            drop_na=dropna,
         )
         return Series(block)
diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py
index 5d3f015de8..5c89363e9b 100644
--- a/tests/system/small/test_groupby.py
+++ b/tests/system/small/test_groupby.py
@@ -582,6 +582,101 @@ def test_dataframe_groupby_nonnumeric_with_mean():
     )


+@pytest.mark.parametrize(
+    ("subset", "normalize", "ascending", "dropna", "as_index"),
+    [
+        (None, True, True, True, True),
+        (["int64_too", "int64_col"], False, False, False, False),
+    ],
+)
+def test_dataframe_groupby_value_counts(
+    scalars_df_index,
+    scalars_pandas_df_index,
+    subset,
+    normalize,
+    ascending,
+    dropna,
+    as_index,
+):
+    if pd.__version__.startswith("1."):
+        pytest.skip("pandas 1.x produces different column labels.")
+    col_names = ["float64_col", "int64_col", "bool_col", "int64_too"]
+    bf_result = (
+        scalars_df_index[col_names]
+        .groupby("bool_col", as_index=as_index)
+        .value_counts(
+            subset=subset, normalize=normalize, ascending=ascending, dropna=dropna
+        )
+        .to_pandas()
+    )
+    pd_result = (
+        scalars_pandas_df_index[col_names]
+        .groupby("bool_col", as_index=as_index)
+        .value_counts(
+            subset=subset, normalize=normalize, ascending=ascending, dropna=dropna
+        )
+    )
+
+    if as_index:
+        pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
+    else:
+        pd_result.index = pd_result.index.astype("Int64")
+        pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    ("numeric_only", "min_count"),
+    [
+        (False, 4),
+        (True, 0),
+    ],
+)
+def test_dataframe_groupby_first(
+    scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
+):
+    # min_count seems to not work properly on older pandas
+    pytest.importorskip("pandas", minversion="2.0.0")
+    # bytes, dates not handling min_count properly in pandas
+    
bf_result = ( + scalars_df_index.drop(columns=["bytes_col", "date_col"]) + .groupby(scalars_df_index.int64_col % 2) + .first(numeric_only=numeric_only, min_count=min_count) + ).to_pandas() + pd_result = ( + scalars_pandas_df_index.drop(columns=["bytes_col", "date_col"]) + .groupby(scalars_pandas_df_index.int64_col % 2) + .first(numeric_only=numeric_only, min_count=min_count) + ) + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("numeric_only", "min_count"), + [ + (True, 2), + (False, -1), + ], +) +def test_dataframe_groupby_last( + scalars_df_index, scalars_pandas_df_index, numeric_only, min_count +): + bf_result = ( + scalars_df_index.groupby(scalars_df_index.int64_col % 2).last( + numeric_only=numeric_only, min_count=min_count + ) + ).to_pandas() + pd_result = scalars_pandas_df_index.groupby( + scalars_pandas_df_index.int64_col % 2 + ).last(numeric_only=numeric_only, min_count=min_count) + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + # ============== # Series.groupby # ============== @@ -770,6 +865,41 @@ def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q): ) +@pytest.mark.parametrize( + ("normalize", "ascending", "dropna"), + [ + ( + True, + True, + True, + ), + ( + False, + False, + False, + ), + ], +) +def test_series_groupby_value_counts( + scalars_df_index, + scalars_pandas_df_index, + normalize, + ascending, + dropna, +): + if pd.__version__.startswith("1."): + pytest.skip("pandas 1.x produces different column labels.") + bf_result = ( + scalars_df_index.groupby("bool_col")["string_col"] + .value_counts(normalize=normalize, ascending=ascending, dropna=dropna) + .to_pandas() + ) + pd_result = scalars_pandas_df_index.groupby("bool_col")["string_col"].value_counts( + normalize=normalize, ascending=ascending, dropna=dropna + ) + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + @pytest.mark.parametrize( ("numeric_only", "min_count"), [ @@ -813,56 +943,3 @@ def test_series_groupby_last( numeric_only=numeric_only, min_count=min_count ) pd.testing.assert_series_equal(pd_result, bf_result) - - -@pytest.mark.parametrize( - ("numeric_only", "min_count"), - [ - (False, 4), - (True, 0), - ], -) -def test_dataframe_groupby_first( - scalars_df_index, scalars_pandas_df_index, numeric_only, min_count -): - # min_count seems to not work properly on older pandas - pytest.importorskip("pandas", minversion="2.0.0") - # bytes, dates not handling min_count properly in pandas - bf_result = ( - scalars_df_index.drop(columns=["bytes_col", "date_col"]) - .groupby(scalars_df_index.int64_col % 2) - .first(numeric_only=numeric_only, min_count=min_count) - ).to_pandas() - pd_result = ( - scalars_pandas_df_index.drop(columns=["bytes_col", "date_col"]) - .groupby(scalars_pandas_df_index.int64_col % 2) - .first(numeric_only=numeric_only, min_count=min_count) - ) - pd.testing.assert_frame_equal( - pd_result, - bf_result, - ) - - -@pytest.mark.parametrize( - ("numeric_only", "min_count"), - [ - (True, 2), - (False, -1), - ], -) -def test_dataframe_groupby_last( - scalars_df_index, scalars_pandas_df_index, numeric_only, min_count -): - bf_result = ( - scalars_df_index.groupby(scalars_df_index.int64_col % 2).last( - numeric_only=numeric_only, min_count=min_count - ) - ).to_pandas() - pd_result = scalars_pandas_df_index.groupby( - scalars_pandas_df_index.int64_col % 2 - ).last(numeric_only=numeric_only, min_count=min_count) - pd.testing.assert_frame_equal( - pd_result, - bf_result, - ) diff --git 
a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 777846ff80..f0bc6348f8 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -1330,6 +1330,32 @@ def nunique(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ): + """ + Return a Series or DataFrame containing counts of unique rows. + + Args: + normalize (bool, default False): + Return proportions rather than frequencies. + sort (bool, default True): + Sort by frequencies. + ascending (bool, default False): + Sort in ascending order. + dropna (bool, default True): + Don't include counts of rows that contain NA values. + + Returns: + Series or DataFrame: + Series if the groupby as_index is True, otherwise DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + class DataFrameGroupBy(GroupBy): def agg(self, func, **kwargs): @@ -1480,3 +1506,102 @@ def nunique(self): Number of unique values within a BigQuery DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def value_counts( + self, + subset=None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ): + """ + Return a Series or DataFrame containing counts of unique rows. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... }) + + >>> df + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + [6 rows x 3 columns] + + >>> df.groupby('gender').value_counts() + gender education country + female high FR 1 + US 1 + male low FR 2 + US 1 + medium FR 1 + Name: count, dtype: Int64 + + >>> df.groupby('gender').value_counts(ascending=True) + gender education country + female high FR 1 + US 1 + male low US 1 + medium FR 1 + low FR 2 + Name: count, dtype: Int64 + + >>> df.groupby('gender').value_counts(normalize=True) + gender education country + female high FR 0.5 + US 0.5 + male low FR 0.5 + US 0.25 + medium FR 0.25 + Name: proportion, dtype: Float64 + + >>> df.groupby('gender', as_index=False).value_counts() + gender education country count + 0 female high FR 1 + 1 female high US 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + [5 rows x 4 columns] + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + gender education country proportion + 0 female high FR 0.5 + 1 female high US 0.5 + 2 male low FR 0.5 + 3 male low US 0.25 + 4 male medium FR 0.25 + + [5 rows x 4 columns] + + Args: + subset (list-like, optional): + Columns to use when counting unique combinations. + normalize (bool, default False): + Return proportions rather than frequencies. + sort (bool, default True): + Sort by frequencies. + ascending (bool, default False): + Sort in ascending order. + dropna (bool, default True): + Don't include counts of rows that contain NA values. + + Returns: + Series or DataFrame: + Series if the groupby as_index is True, otherwise DataFrame. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)