19 changes: 12 additions & 7 deletions bigframes/core/block_transforms.py
@@ -355,24 +355,28 @@ def value_counts(
     normalize: bool = False,
     sort: bool = True,
     ascending: bool = False,
-    dropna: bool = True,
+    drop_na: bool = True,
+    grouping_keys: typing.Sequence[str] = (),
 ):
-    block, dummy = block.create_constant(1)
+    if grouping_keys and drop_na:
+        # only need this if grouping_keys is involved, otherwise the drop_na in the aggregation will handle it for us
+        block = dropna(block, columns, how="any")
     block, agg_ids = block.aggregate(
-        by_column_ids=columns,
-        aggregations=[ex.UnaryAggregation(agg_ops.count_op, ex.deref(dummy))],
-        dropna=dropna,
+        by_column_ids=(*grouping_keys, *columns),
+        aggregations=[ex.NullaryAggregation(agg_ops.size_op)],
+        dropna=drop_na and not grouping_keys,
     )
     count_id = agg_ids[0]
     if normalize:
-        unbound_window = windows.unbound()
+        unbound_window = windows.unbound(grouping_keys=tuple(grouping_keys))
         block, total_count_id = block.apply_window_op(
             count_id, agg_ops.sum_op, unbound_window
         )
         block, count_id = block.apply_binary_op(count_id, total_count_id, ops.div_op)
 
     if sort:
-        block = block.order_by(
+        order_parts = [ordering.ascending_over(id) for id in grouping_keys]
+        order_parts.extend(
             [
                 ordering.OrderingExpression(
                     ex.deref(count_id),
@@ -382,6 +386,7 @@ def value_counts(
                 )
             ]
         )
+        block = block.order_by(order_parts)
     return block.select_column(count_id).with_column_labels(
         ["proportion" if normalize else "count"]
     )
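
For context: the semantics this reworked helper implements line up with pandas' own grouped value_counts. Counts are taken per value combination, normalize divides by the group total rather than the grand total (hence the grouped unbound window above), and null handling is split between null grouping keys and null values. A minimal pandas-only sketch of those semantics (illustrative data; not BigFrames code):

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"], "val": [1, 1, 2, 3, None]})

# normalize=True divides each count by its group's total, so proportions
# sum to 1.0 within each group; dropna=True removes the null "val" row first.
print(df.groupby("key")["val"].value_counts(normalize=True, dropna=True))
# key  val
# a    1.0    0.666667
#      2.0    0.333333
# b    3.0    1.000000
# Name: proportion, dtype: float64
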
35 changes: 34 additions & 1 deletion bigframes/core/groupby/dataframe_group_by.py
@@ -16,7 +16,7 @@
 
 import datetime
 import typing
-from typing import Literal, Sequence, Tuple, Union
+from typing import Literal, Optional, Sequence, Tuple, Union
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -372,6 +372,39 @@ def diff(self, periods=1) -> series.Series:
         )
         return self._apply_window_op(agg_ops.DiffOp(periods), window=window)
 
+    def value_counts(
+        self,
+        subset: Optional[Sequence[blocks.Label]] = None,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        dropna: bool = True,
+    ) -> Union[df.DataFrame, series.Series]:
+        if subset is None:
+            columns = self._selected_cols
+        else:
+            columns = [
+                column
+                for column in self._block.value_columns
+                if self._block.col_id_to_label[column] in subset
+            ]
+        block = self._block
+        if self._dropna:  # this drops null grouping columns
+            block = block_ops.dropna(block, self._by_col_ids)
+        block = block_ops.value_counts(
+            block,
+            columns,
+            normalize=normalize,
+            sort=sort,
+            ascending=ascending,
+            drop_na=dropna,  # this drops null value columns
+            grouping_keys=self._by_col_ids,
+        )
+        if self._as_index:
+            return series.Series(block)
+        else:
+            return series.Series(block).to_frame().reset_index(drop=False)
+
     @validations.requires_ordering()
     def rolling(
         self,
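
A usage sketch for the method added in this hunk, assuming a configured BigQuery session; the frame, column names, and data are illustrative:

import bigframes.pandas as bpd

bf_df = bpd.DataFrame({"team": ["x", "x", "y"], "score": [1, 1, 2]})

# as_index=True (the default) returns a Series indexed by (team, score);
# as_index=False instead resets the keys into ordinary columns, per the
# branch above.
counts = bf_df.groupby("team", as_index=True).value_counts(normalize=False)
print(counts.to_pandas())
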
24 changes: 24 additions & 0 deletions bigframes/core/groupby/series_group_by.py
@@ -244,6 +244,30 @@ def agg(self, func=None) -> typing.Union[df.DataFrame, series.Series]:
 
     aggregate = agg
 
+    def value_counts(
+        self,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        dropna: bool = True,
+    ) -> Union[df.DataFrame, series.Series]:
+        columns = [self._value_column]
+        block = self._block
+        if self._dropna:  # this drops null grouping columns
+            block = block_ops.dropna(block, self._by_col_ids)
+        block = block_ops.value_counts(
+            block,
+            columns,
+            normalize=normalize,
+            sort=sort,
+            ascending=ascending,
+            drop_na=dropna,  # this drops null value columns
+            grouping_keys=self._by_col_ids,
+        )
+        # TODO: once as_index=False is supported, return a DataFrame instead by resetting the index with .to_frame().reset_index(drop=False)
+        return series.Series(block)
+
     @validations.requires_ordering()
     def cumsum(self, *args, **kwargs) -> series.Series:
         return self._apply_window_op(
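
The Series variant takes the same keywords (minus subset, since the column is already selected) and, per the TODO above, always returns a Series for now. An illustrative sketch under the same assumptions as the DataFrame example:

import bigframes.pandas as bpd

bf_df = bpd.DataFrame({"team": ["x", "x", "y"], "score": [1, 1, 2]})

# Proportions are computed within each team: ("x", 1) -> 1.0, ("y", 2) -> 1.0.
props = bf_df.groupby("team")["score"].value_counts(normalize=True)
print(props.to_pandas())
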
2 changes: 1 addition & 1 deletion bigframes/core/indexes/base.py
@@ -489,7 +489,7 @@ def value_counts(
             self._block.index_columns,
             normalize=normalize,
             ascending=ascending,
-            dropna=dropna,
+            drop_na=dropna,
         )
         import bigframes.series as series
 
2 changes: 1 addition & 1 deletion bigframes/dataframe.py
@@ -2475,7 +2475,7 @@ def value_counts(
             normalize=normalize,
             sort=sort,
             ascending=ascending,
-            dropna=dropna,
+            drop_na=dropna,
         )
         return bigframes.series.Series(block)
 
2 changes: 1 addition & 1 deletion bigframes/series.py
@@ -1631,7 +1631,7 @@ def value_counts(
             [self._value_column],
             normalize=normalize,
             ascending=ascending,
-            dropna=dropna,
+            drop_na=dropna,
         )
         return Series(block)
 
183 changes: 130 additions & 53 deletions tests/system/small/test_groupby.py
@@ -582,6 +582,101 @@ def test_dataframe_groupby_nonnumeric_with_mean():
     )
 
 
+@pytest.mark.parametrize(
+    ("subset", "normalize", "ascending", "dropna", "as_index"),
+    [
+        (None, True, True, True, True),
+        (["int64_too", "int64_col"], False, False, False, False),
+    ],
+)
+def test_dataframe_groupby_value_counts(
+    scalars_df_index,
+    scalars_pandas_df_index,
+    subset,
+    normalize,
+    ascending,
+    dropna,
+    as_index,
+):
+    if pd.__version__.startswith("1."):
+        pytest.skip("pandas 1.x produces different column labels.")
+    col_names = ["float64_col", "int64_col", "bool_col", "int64_too"]
+    bf_result = (
+        scalars_df_index[col_names]
+        .groupby("bool_col", as_index=as_index)
+        .value_counts(
+            subset=subset, normalize=normalize, ascending=ascending, dropna=dropna
+        )
+        .to_pandas()
+    )
+    pd_result = (
+        scalars_pandas_df_index[col_names]
+        .groupby("bool_col", as_index=as_index)
+        .value_counts(
+            subset=subset, normalize=normalize, ascending=ascending, dropna=dropna
+        )
+    )
+
+    if as_index:
+        pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
+    else:
+        pd_result.index = pd_result.index.astype("Int64")
+        pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    ("numeric_only", "min_count"),
+    [
+        (False, 4),
+        (True, 0),
+    ],
+)
+def test_dataframe_groupby_first(
+    scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
+):
+    # min_count seems to not work properly on older pandas
+    pytest.importorskip("pandas", minversion="2.0.0")
+    # bytes, dates not handling min_count properly in pandas
+    bf_result = (
+        scalars_df_index.drop(columns=["bytes_col", "date_col"])
+        .groupby(scalars_df_index.int64_col % 2)
+        .first(numeric_only=numeric_only, min_count=min_count)
+    ).to_pandas()
+    pd_result = (
+        scalars_pandas_df_index.drop(columns=["bytes_col", "date_col"])
+        .groupby(scalars_pandas_df_index.int64_col % 2)
+        .first(numeric_only=numeric_only, min_count=min_count)
+    )
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
+@pytest.mark.parametrize(
+    ("numeric_only", "min_count"),
+    [
+        (True, 2),
+        (False, -1),
+    ],
+)
+def test_dataframe_groupby_last(
+    scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
+):
+    bf_result = (
+        scalars_df_index.groupby(scalars_df_index.int64_col % 2).last(
+            numeric_only=numeric_only, min_count=min_count
+        )
+    ).to_pandas()
+    pd_result = scalars_pandas_df_index.groupby(
+        scalars_pandas_df_index.int64_col % 2
+    ).last(numeric_only=numeric_only, min_count=min_count)
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
 # ==============
 # Series.groupby
 # ==============
@@ -770,6 +865,41 @@ def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q):
     )
 
 
+@pytest.mark.parametrize(
+    ("normalize", "ascending", "dropna"),
+    [
+        (
+            True,
+            True,
+            True,
+        ),
+        (
+            False,
+            False,
+            False,
+        ),
+    ],
+)
+def test_series_groupby_value_counts(
+    scalars_df_index,
+    scalars_pandas_df_index,
+    normalize,
+    ascending,
+    dropna,
+):
+    if pd.__version__.startswith("1."):
+        pytest.skip("pandas 1.x produces different column labels.")
+    bf_result = (
+        scalars_df_index.groupby("bool_col")["string_col"]
+        .value_counts(normalize=normalize, ascending=ascending, dropna=dropna)
+        .to_pandas()
+    )
+    pd_result = scalars_pandas_df_index.groupby("bool_col")["string_col"].value_counts(
+        normalize=normalize, ascending=ascending, dropna=dropna
+    )
+    pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
+
+
 @pytest.mark.parametrize(
     ("numeric_only", "min_count"),
     [
@@ -813,56 +943,3 @@ def test_series_groupby_last(
         numeric_only=numeric_only, min_count=min_count
     )
     pd.testing.assert_series_equal(pd_result, bf_result)
-
-
-@pytest.mark.parametrize(
-    ("numeric_only", "min_count"),
-    [
-        (False, 4),
-        (True, 0),
-    ],
-)
-def test_dataframe_groupby_first(
-    scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
-):
-    # min_count seems to not work properly on older pandas
-    pytest.importorskip("pandas", minversion="2.0.0")
-    # bytes, dates not handling min_count properly in pandas
-    bf_result = (
-        scalars_df_index.drop(columns=["bytes_col", "date_col"])
-        .groupby(scalars_df_index.int64_col % 2)
-        .first(numeric_only=numeric_only, min_count=min_count)
-    ).to_pandas()
-    pd_result = (
-        scalars_pandas_df_index.drop(columns=["bytes_col", "date_col"])
-        .groupby(scalars_pandas_df_index.int64_col % 2)
-        .first(numeric_only=numeric_only, min_count=min_count)
-    )
-    pd.testing.assert_frame_equal(
-        pd_result,
-        bf_result,
-    )
-
-
-@pytest.mark.parametrize(
-    ("numeric_only", "min_count"),
-    [
-        (True, 2),
-        (False, -1),
-    ],
-)
-def test_dataframe_groupby_last(
-    scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
-):
-    bf_result = (
-        scalars_df_index.groupby(scalars_df_index.int64_col % 2).last(
-            numeric_only=numeric_only, min_count=min_count
-        )
-    ).to_pandas()
-    pd_result = scalars_pandas_df_index.groupby(
-        scalars_pandas_df_index.int64_col % 2
-    ).last(numeric_only=numeric_only, min_count=min_count)
-    pd.testing.assert_frame_equal(
-        pd_result,
-        bf_result,
-    )