Commit d6ba77f

refactor
1 parent fc11a1c commit d6ba77f

5 files changed: +219 -110 lines changed

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 9 additions & 55 deletions
@@ -15,7 +15,6 @@
 from __future__ import annotations

 import datetime
-import functools
 import typing
 from typing import Iterable, Literal, Optional, Sequence, Tuple, Union

@@ -30,7 +29,7 @@
 from bigframes.core import log_adapter
 import bigframes.core.block_transforms as block_ops
 import bigframes.core.blocks as blocks
-from bigframes.core.groupby import aggs, series_group_by
+from bigframes.core.groupby import aggs, group_by, series_group_by
 import bigframes.core.ordering as order
 import bigframes.core.utils as utils
 import bigframes.core.validations as validations

@@ -39,8 +38,6 @@
 import bigframes.core.window_spec as window_specs
 import bigframes.dataframe as df
 import bigframes.dtypes as dtypes
-import bigframes.enums
-import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
 import bigframes.series as series

@@ -157,57 +154,14 @@ def head(self, n: int = 5) -> df.DataFrame:
         )

     def __iter__(self) -> Iterable[Tuple[blocks.Label, df.DataFrame]]:
-        original_index_columns = self._block._index_columns
-        original_index_labels = self._block._index_labels
-        by_col_ids = self._by_col_ids
-        block = self._block.reset_index(
-            level=None,
-            # Keep the original index columns so they can be recovered.
-            drop=False,
-            allow_duplicates=True,
-            replacement=bigframes.enums.DefaultIndexKind.NULL,
-        ).set_index(
-            by_col_ids,
-            # Keep by_col_ids in-place so the ordering doesn't change.
-            drop=False,
-            append=False,
-        )
-        block.cached(
-            force=True,
-            # All DataFrames will be filtered by by_col_ids, so
-            # force block.cached() to cluster by the new index by explicitly
-            # setting `session_aware=False`. This will ensure that the filters
-            # are more efficient.
-            session_aware=False,
-        )
-        keys_block, _ = block.aggregate(by_col_ids, dropna=self._dropna)
-        for chunk in keys_block.to_pandas_batches():
-            for by_keys in pd.MultiIndex.from_frame(chunk.index.to_frame()):
-                filtered_df = df.DataFrame(
-                    # To ensure the cache is used, filter first, then reset the
-                    # index before yielding the DataFrame.
-                    block.filter(
-                        functools.reduce(
-                            ops.and_op.as_expr,
-                            (
-                                ops.eq_op.as_expr(by_col, ex.const(by_key))
-                                for by_col, by_key in zip(by_col_ids, by_keys)
-                            ),
-                        ),
-                    ).set_index(
-                        original_index_columns,
-                        # We retained by_col_ids in the set_index call above,
-                        # so it's safe to drop the duplicates now.
-                        drop=True,
-                        append=False,
-                        index_labels=original_index_labels,
-                    )
-                )
-
-                if self._by_key_is_singular:
-                    yield by_keys[0], filtered_df
-                else:
-                    yield by_keys, filtered_df
+        for group_keys, filtered_block in group_by.block_groupby_iter(
+            self._block,
+            by_col_ids=self._by_col_ids,
+            by_key_is_singular=self._by_key_is_singular,
+            dropna=self._dropna,
+        ):
+            filtered_df = df.DataFrame(filtered_block)
+            yield group_keys, filtered_df

     def size(self) -> typing.Union[df.DataFrame, series.Series]:
         agg_block, _ = self._block.aggregate_size(
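
This hunk reduces DataFrameGroupBy.__iter__ to a thin wrapper over the shared helper; the user-facing iteration contract is unchanged. A minimal usage sketch, assuming hypothetical column names and a configured BigQuery session:

    import bigframes.pandas as bpd

    df = bpd.DataFrame({"species": ["a", "a", "b"], "weight": [1.0, 2.0, 3.0]})
    for key, group_df in df.groupby("species"):
        # With a single grouping column, by_key_is_singular is True, so
        # `key` is a scalar label; with multiple columns it is a tuple.
        print(key, group_df.shape)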

bigframes/core/groupby/group_by.py

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import functools
+from typing import Sequence
+
+import pandas as pd
+
+from bigframes.core import blocks
+from bigframes.core import expression as ex
+import bigframes.enums
+import bigframes.operations as ops
+
+
+def block_groupby_iter(
+    block: blocks.Block,
+    *,
+    by_col_ids: Sequence[str],
+    by_key_is_singular: bool,
+    dropna: bool,
+):
+    original_index_columns = block._index_columns
+    original_index_labels = block._index_labels
+    by_col_ids = by_col_ids
+    block = block.reset_index(
+        level=None,
+        # Keep the original index columns so they can be recovered.
+        drop=False,
+        allow_duplicates=True,
+        replacement=bigframes.enums.DefaultIndexKind.NULL,
+    ).set_index(
+        by_col_ids,
+        # Keep by_col_ids in-place so the ordering doesn't change.
+        drop=False,
+        append=False,
+    )
+    block.cached(
+        force=True,
+        # All DataFrames will be filtered by by_col_ids, so
+        # force block.cached() to cluster by the new index by explicitly
+        # setting `session_aware=False`. This will ensure that the filters
+        # are more efficient.
+        session_aware=False,
+    )
+    keys_block, _ = block.aggregate(by_col_ids, dropna=dropna)
+    for chunk in keys_block.to_pandas_batches():
+        # Convert to MultiIndex to make sure we get tuples,
+        # even for singular keys.
+        by_keys_index = chunk.index
+        if not isinstance(by_keys_index, pd.MultiIndex):
+            by_keys_index = pd.MultiIndex.from_frame(by_keys_index.to_frame())
+
+        for by_keys in by_keys_index:
+            filtered_block = (
+                # To ensure the cache is used, filter first, then reset the
+                # index before yielding the DataFrame.
+                block.filter(
+                    functools.reduce(
+                        ops.and_op.as_expr,
+                        (
+                            ops.eq_op.as_expr(by_col, ex.const(by_key))
+                            for by_col, by_key in zip(by_col_ids, by_keys)
+                        ),
+                    ),
+                ).set_index(
+                    original_index_columns,
+                    # We retained by_col_ids in the set_index call above,
+                    # so it's safe to drop the duplicates now.
+                    drop=True,
+                    append=False,
+                    index_labels=original_index_labels,
+                )
+            )
+
+            if by_key_is_singular:
+                yield by_keys[0], filtered_block
+            else:
+                yield by_keys, filtered_block
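
Two details of the helper are worth calling out. First, block.cached(force=True, session_aware=False) materializes the data clustered by the new group-key index before iteration, so each per-group filter scans a favorably clustered result. Second, the per-group predicate is a conjunction built by reducing one equality expression per grouping column. The same reduce-over-AND pattern in plain Python, as a self-contained sketch (the dictionaries are illustrative, not bigframes API):

    import functools
    import operator

    group_key = {"species": "a", "year": 2024}  # hypothetical composite key
    row = {"species": "a", "year": 2024, "weight": 1.5}

    # Mirrors functools.reduce(ops.and_op.as_expr, ...) above, over booleans.
    in_group = functools.reduce(
        operator.and_,
        (row[col] == value for col, value in group_key.items()),
    )
    print(in_group)  # True: the row matches every grouping column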

bigframes/core/groupby/series_group_by.py

Lines changed: 18 additions & 54 deletions
@@ -15,7 +15,6 @@
 from __future__ import annotations

 import datetime
-import functools
 import typing
 from typing import Iterable, Literal, Sequence, Tuple, Union

@@ -29,7 +28,7 @@
 from bigframes.core import log_adapter
 import bigframes.core.block_transforms as block_ops
 import bigframes.core.blocks as blocks
-from bigframes.core.groupby import aggs
+from bigframes.core.groupby import aggs, group_by
 import bigframes.core.ordering as order
 import bigframes.core.utils as utils
 import bigframes.core.validations as validations

@@ -38,8 +37,6 @@
 import bigframes.core.window_spec as window_specs
 import bigframes.dataframe as df
 import bigframes.dtypes
-import bigframes.enums
-import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
 import bigframes.series as series

@@ -55,6 +52,8 @@ def __init__(
         by_col_ids: typing.Sequence[str],
         value_name: blocks.Label = None,
         dropna=True,
+        *,
+        by_key_is_singular: bool = False,
     ):
         # TODO(tbergeron): Support more group-by expression types
         self._block = block

@@ -63,6 +62,10 @@
         self._value_name = value_name
         self._dropna = dropna  # Applies to aggregations but not windowing

+        self._by_key_is_singular = by_key_is_singular
+        if by_key_is_singular:
+            assert len(by_col_ids) == 1, "singular key should be exactly one group key"
+
     @property
     def _session(self) -> session.Session:
         return self._block.session

@@ -79,56 +82,17 @@ def head(self, n: int = 5) -> series.Series:
         )

     def __iter__(self) -> Iterable[Tuple[blocks.Label, series.Series]]:
-        original_index_columns = self._block._index_columns
-        original_index_labels = self._block._index_labels
-        by_col_ids = self._by_col_ids
-        block = self._block.reset_index(
-            level=None,
-            # Keep the original index columns so they can be recovered.
-            drop=False,
-            allow_duplicates=True,
-            replacement=bigframes.enums.DefaultIndexKind.NULL,
-        ).set_index(
-            by_col_ids,
-            # Keep by_col_ids in-place so the ordering doesn't change.
-            drop=False,
-            append=False,
-        )
-        block.cached(
-            force=True,
-            # All DataFrames will be filtered by by_col_ids, so
-            # force block.cached() to cluster by the new index by explicitly
-            # setting `session_aware=False`. This will ensure that the filters
-            # are more efficient.
-            session_aware=False,
-        )
-        keys_block, _ = block.aggregate(by_col_ids, dropna=self._dropna)
-        for chunk in keys_block.to_pandas_batches():
-            for by_keys in chunk.index:
-                filtered_series = series.Series(
-                    # To ensure the cache is used, filter first, then reset the
-                    # index before yielding the DataFrame.
-                    block.filter(
-                        functools.reduce(
-                            ops.and_op.as_expr,
-                            (
-                                ops.eq_op.as_expr(by_col, ex.const(by_key))
-                                for by_col, by_key in zip(by_col_ids, by_keys)
-                            ),
-                        ),
-                    )
-                    .set_index(
-                        original_index_columns,
-                        # We retained by_col_ids in the set_index call above,
-                        # so it's safe to drop the duplicates now.
-                        drop=True,
-                        append=False,
-                        index_labels=original_index_labels,
-                    )
-                    .select_column(self._value_column),
-                )
-                filtered_series.name = self._value_name
-                yield by_keys, filtered_series
+        for group_keys, filtered_block in group_by.block_groupby_iter(
+            self._block,
+            by_col_ids=self._by_col_ids,
+            by_key_is_singular=self._by_key_is_singular,
+            dropna=self._dropna,
+        ):
+            filtered_series = series.Series(
+                filtered_block.select_column(self._value_column)
+            )
+            filtered_series.name = self._value_name
+            yield group_keys, filtered_series

     def all(self) -> series.Series:
         return self._aggregate(agg_ops.all_op)
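
The Series variant now differs from the DataFrame one only in selecting the value column from each yielded block. A usage sketch with hypothetical data (grouping by an index level goes through _groupby_level below, which marks a non-list level as singular):

    import bigframes.pandas as bpd

    s = bpd.Series([1.0, 2.0, 3.0], index=["a", "a", "b"])
    for key, group in s.groupby(level=0):
        # level=0 is a single, non-list level, so `key` is the scalar
        # index label rather than a 1-tuple.
        print(key, group.sum())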

bigframes/series.py

Lines changed: 9 additions & 0 deletions
@@ -1854,12 +1854,18 @@ def _groupby_level(
         level: int | str | typing.Sequence[int] | typing.Sequence[str],
         dropna: bool = True,
     ) -> bigframes.core.groupby.SeriesGroupBy:
+        if utils.is_list_like(level):
+            by_key_is_singular = False
+        else:
+            by_key_is_singular = True
+
         return groupby.SeriesGroupBy(
             self._block,
             self._value_column,
             by_col_ids=self._resolve_levels(level),
             value_name=self.name,
             dropna=dropna,
+            by_key_is_singular=by_key_is_singular,
         )

     def _groupby_values(

@@ -1871,8 +1877,10 @@ def _groupby_values(
     ) -> bigframes.core.groupby.SeriesGroupBy:
         if not isinstance(by, Series) and _is_list_like(by):
            by = list(by)
+           by_key_is_singular = False
         else:
            by = [typing.cast(typing.Union[blocks.Label, Series], by)]
+           by_key_is_singular = True

         block = self._block
         grouping_cols: typing.Sequence[str] = []

@@ -1904,6 +1912,7 @@
             by_col_ids=grouping_cols,
             value_name=self.name,
             dropna=dropna,
+            by_key_is_singular=by_key_is_singular,
        )

     def apply(
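
Both call sites derive by_key_is_singular from whether the grouping argument is list-like, matching pandas, where s.groupby("col") yields scalar keys but s.groupby(["col"]) yields 1-tuples. The decision in isolation, sketched with pandas' public is_list_like (the diff itself uses bigframes' internal utils.is_list_like and _is_list_like helpers):

    from pandas.api.types import is_list_like

    def key_is_singular(by) -> bool:
        # A bare label or scalar level means keys are yielded as scalars;
        # any list-like `by` means tuples. (_groupby_values additionally
        # treats a Series passed as `by` as a single key.)
        return not is_list_like(by)

    assert key_is_singular("species")
    assert not key_is_singular(["species"])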
