
Commit d68b56d
implement it
1 parent: cd38940

File tree: 6 files changed (+328, -9 lines)


bigframes/core/blocks.py

Lines changed: 6 additions & 0 deletions
@@ -1371,10 +1371,16 @@ def aggregate(
     ) -> typing.Tuple[Block, typing.Sequence[str]]:
         """
         Apply aggregations to the block.
+
         Arguments:
             by_column_id: column id of the aggregation key, this is preserved through the transform and used as index.
             aggregations: input_column_id, operation tuples
             dropna: whether null keys should be dropped
+
+        Returns:
+            Tuple[Block, Sequence[str]]:
+                The first element is the grouped block. The second is the
+                column IDs corresponding to each applied aggregation.
         """
         if column_labels is None:
             column_labels = pd.Index(range(len(aggregations)))
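
For context, this newly documented return contract is what the `__iter__` implementations below rely on. A minimal sketch of the consuming pattern, assuming an internal `Block` instance and group-key column IDs are already in hand (`Block.aggregate` is an internal bigframes API, so this is illustrative only):

    # Sketch only: mirrors the call `block.aggregate(by_col_ids, dropna=...)`
    # made in this commit. With no aggregations requested, the second
    # element is empty, which is why __iter__ discards it with `_`.
    keys_block, agg_col_ids = block.aggregate(by_col_ids, dropna=True)
    # keys_block: the grouped Block, indexed by the group keys.
    # agg_col_ids: one column ID per applied aggregation.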

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 61 additions & 1 deletion
@@ -15,8 +15,9 @@
 from __future__ import annotations
 
 import datetime
+import functools
 import typing
-from typing import Literal, Optional, Sequence, Tuple, Union
+from typing import Iterable, Literal, Optional, Sequence, Tuple, Union
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -38,6 +39,8 @@
 import bigframes.core.window_spec as window_specs
 import bigframes.dataframe as df
 import bigframes.dtypes as dtypes
+import bigframes.enums
+import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
 import bigframes.series as series
 
@@ -54,6 +57,7 @@ def __init__(
         selected_cols: typing.Optional[typing.Sequence[str]] = None,
         dropna: bool = True,
         as_index: bool = True,
+        by_key_is_singular: bool = False,
     ):
         # TODO(tbergeron): Support more group-by expression types
         self._block = block
@@ -64,6 +68,9 @@ def __init__(
             )
         }
         self._by_col_ids = by_col_ids
+        self._by_key_is_singular = by_key_is_singular
+        if by_key_is_singular:
+            assert len(by_col_ids) == 1, "singular key should be exactly one group key"
 
         self._dropna = dropna
         self._as_index = as_index
@@ -149,6 +156,59 @@ def head(self, n: int = 5) -> df.DataFrame:
             )
         )
 
+    def __iter__(self) -> Iterable[Tuple[blocks.Label, df.DataFrame]]:
+        original_index_columns = self._block._index_columns
+        original_index_labels = self._block._index_labels
+        by_col_ids = self._by_col_ids
+        block = self._block.reset_index(
+            level=None,
+            # Keep the original index columns so they can be recovered.
+            drop=False,
+            allow_duplicates=True,
+            replacement=bigframes.enums.DefaultIndexKind.NULL,
+        ).set_index(
+            by_col_ids,
+            # Keep by_col_ids in-place so the ordering doesn't change.
+            drop=False,
+            append=False,
+        )
+        block.cached(
+            force=True,
+            # All DataFrames will be filtered by by_col_ids, so
+            # force block.cached() to cluster by the new index by explicitly
+            # setting `session_aware=False`. This will ensure that the filters
+            # are more efficient.
+            session_aware=False,
+        )
+        keys_block, _ = block.aggregate(by_col_ids, dropna=self._dropna)
+        for chunk in keys_block.to_pandas_batches():
+            for by_keys in pd.MultiIndex.from_frame(chunk.index.to_frame()):
+                filtered_df = df.DataFrame(
+                    # To ensure the cache is used, filter first, then reset the
+                    # index before yielding the DataFrame.
+                    block.filter(
+                        functools.reduce(
+                            ops.and_op.as_expr,
+                            (
+                                ops.eq_op.as_expr(by_col, ex.const(by_key))
+                                for by_col, by_key in zip(by_col_ids, by_keys)
+                            ),
+                        ),
+                    ).set_index(
+                        original_index_columns,
+                        # We retained by_col_ids in the set_index call above,
+                        # so it's safe to drop the duplicates now.
+                        drop=True,
+                        append=False,
+                        index_labels=original_index_labels,
+                    )
+                )
+
+                if self._by_key_is_singular:
+                    yield by_keys[0], filtered_df
+                else:
+                    yield by_keys, filtered_df
+
     def size(self) -> typing.Union[df.DataFrame, series.Series]:
         agg_block, _ = self._block.aggregate_size(
             by_column_ids=self._by_col_ids,
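
With `__iter__` in place, a DataFrameGroupBy can be looped over like its pandas counterpart, yielding one (key, DataFrame) pair per group with the original index restored. A usage sketch (the frame and column names are invented for illustration):

    import bigframes.pandas as bpd

    bdf = bpd.DataFrame({"animal": ["dog", "cat", "dog"], "weight": [10.0, 4.0, 12.0]})
    for key, group in bdf.groupby("animal"):
        # `key` is a scalar here because the group key is singular;
        # grouping by a list of columns would yield tuples instead.
        print(key, group.shape)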

bigframes/core/groupby/series_group_by.py

Lines changed: 56 additions & 1 deletion
@@ -15,8 +15,9 @@
 from __future__ import annotations
 
 import datetime
+import functools
 import typing
-from typing import Literal, Sequence, Union
+from typing import Iterable, Literal, Sequence, Tuple, Union
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -37,6 +38,8 @@
 import bigframes.core.window_spec as window_specs
 import bigframes.dataframe as df
 import bigframes.dtypes
+import bigframes.enums
+import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
 import bigframes.series as series
 
@@ -75,6 +78,58 @@ def head(self, n: int = 5) -> series.Series:
             )
         )
 
+    def __iter__(self) -> Iterable[Tuple[blocks.Label, series.Series]]:
+        original_index_columns = self._block._index_columns
+        original_index_labels = self._block._index_labels
+        by_col_ids = self._by_col_ids
+        block = self._block.reset_index(
+            level=None,
+            # Keep the original index columns so they can be recovered.
+            drop=False,
+            allow_duplicates=True,
+            replacement=bigframes.enums.DefaultIndexKind.NULL,
+        ).set_index(
+            by_col_ids,
+            # Keep by_col_ids in-place so the ordering doesn't change.
+            drop=False,
+            append=False,
+        )
+        block.cached(
+            force=True,
+            # All Series will be filtered by by_col_ids, so
+            # force block.cached() to cluster by the new index by explicitly
+            # setting `session_aware=False`. This will ensure that the filters
+            # are more efficient.
+            session_aware=False,
+        )
+        keys_block, _ = block.aggregate(by_col_ids, dropna=self._dropna)
+        for chunk in keys_block.to_pandas_batches():
+            for by_keys in chunk.index:
+                filtered_series = series.Series(
+                    # To ensure the cache is used, filter first, then reset the
+                    # index before yielding the Series.
+                    block.filter(
+                        functools.reduce(
+                            ops.and_op.as_expr,
+                            (
+                                ops.eq_op.as_expr(by_col, ex.const(by_key))
+                                for by_col, by_key in zip(by_col_ids, by_keys)
+                            ),
+                        ),
+                    )
+                    .set_index(
+                        original_index_columns,
+                        # We retained by_col_ids in the set_index call above,
+                        # so it's safe to drop the duplicates now.
+                        drop=True,
+                        append=False,
+                        index_labels=original_index_labels,
+                    )
+                    .select_column(self._value_column),
+                )
+                filtered_series.name = self._value_name
+                yield by_keys, filtered_series
+
     def all(self) -> series.Series:
         return self._aggregate(agg_ops.all_op)
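
The Series variant mirrors this, yielding (key, Series) pairs and restoring the value column's name on each yielded Series via `_value_name`. A usage sketch with invented names:

    import bigframes.pandas as bpd

    bdf = bpd.DataFrame({"animal": ["dog", "cat", "dog"], "weight": [10.0, 4.0, 12.0]})
    for key, weights in bdf.groupby("animal")["weight"]:
        # `weights` is a bigframes Series named "weight" holding one
        # group's values.
        print(key, weights.sum())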

bigframes/dataframe.py

Lines changed: 11 additions & 0 deletions
@@ -3909,11 +3909,17 @@ def _groupby_level(
         as_index: bool = True,
         dropna: bool = True,
     ):
+        if utils.is_list_like(level):
+            by_key_is_singular = False
+        else:
+            by_key_is_singular = True
+
         return groupby.DataFrameGroupBy(
             self._block,
             by_col_ids=self._resolve_levels(level),
             as_index=as_index,
             dropna=dropna,
+            by_key_is_singular=by_key_is_singular,
         )
 
     def _groupby_series(
@@ -3926,10 +3932,14 @@ def _groupby_series(
         as_index: bool = True,
         dropna: bool = True,
     ):
+        # Pandas makes a distinction between groupby with a list of keys
+        # versus groupby with a single item in some methods, like __iter__.
         if not isinstance(by, bigframes.series.Series) and utils.is_list_like(by):
             by = list(by)
+            by_key_is_singular = False
         else:
             by = [typing.cast(typing.Union[blocks.Label, bigframes.series.Series], by)]
+            by_key_is_singular = True
 
         block = self._block
         col_ids: typing.Sequence[str] = []
@@ -3959,6 +3969,7 @@
             by_col_ids=col_ids,
             as_index=as_index,
             dropna=dropna,
+            by_key_is_singular=by_key_is_singular,
         )
 
     def abs(self) -> DataFrame:
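
The `by_key_is_singular` flag preserves a pandas subtlety: iterating a groupby built from a single, non-list key yields scalar keys, while a list of keys, even a one-element list, yields tuples. A sketch of the distinction (names invented):

    import bigframes.pandas as bpd

    bdf = bpd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
    for key, _ in bdf.groupby("a"):    # singular key: scalars such as 1
        print(key)
    for key, _ in bdf.groupby(["a"]):  # list-like key: 1-tuples such as (1,)
        print(key)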
