refactor: add compile_aggregate

chelsea-lin · chelsea-lin · commit 1fa0b77bb42d · 2025-07-11T20:58:35.000Z
diff --git a/bigframes/core/compile/sqlglot/aggregate_compiler.py b/bigframes/core/compile/sqlglot/aggregate_compiler.py
@@ -0,0 +1,104 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import functools
+import typing
+
+import sqlglot.expressions as sge
+
+from bigframes import dtypes
+from bigframes.core import expression, window_spec
+import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler
+import bigframes.core.compile.sqlglot.sqlglot_ir as ir
+import bigframes.operations as ops
+
+
+def compile_aggregate(
+    aggregate: expression.Aggregation,
+    order_by: tuple[sge.Expression, ...],
+) -> sge.Expression:
+    """Compiles BigFrames aggregation expression into SQLGlot expression."""
+    # TODO: try to remove type: ignore
+    if isinstance(aggregate, expression.NullaryAggregation):
+        return compile_nullary_agg(aggregate.op)
+    if isinstance(aggregate, expression.UnaryAggregation):
+        column = scalar_compiler.compile_scalar_expression(aggregate.arg)
+        if not aggregate.op.order_independent:
+            return compile_ordered_unary_agg(aggregate.op, column, order_by=order_by)  # type: ignore
+        else:
+            return compile_unary_agg(aggregate.op, column)  # type: ignore
+    elif isinstance(aggregate, expression.BinaryAggregation):
+        left = scalar_compiler.compile_scalar_expression(aggregate.left)
+        right = scalar_compiler.compile_scalar_expression(aggregate.right)
+        return compile_binary_agg(aggregate.op, left, right)  # type: ignore
+    else:
+        raise ValueError(f"Unexpected aggregation: {aggregate}")
+
+
+@functools.singledispatch
+def compile_nullary_agg(
+    op: ops.aggregations.WindowOp,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:
+    raise ValueError(f"Can't compile unrecognized operation: {op}")
+
+
+@functools.singledispatch
+def compile_binary_agg(
+    op: ops.aggregations.WindowOp,
+    left: sge.Expression,
+    right: sge.Expression,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:
+    raise ValueError(f"Can't compile unrecognized operation: {op}")
+
+
+@functools.singledispatch
+def compile_unary_agg(
+    op: ops.aggregations.WindowOp,
+    column: sge.Expression,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:
+    raise ValueError(f"Can't compile unrecognized operation: {op}")
+
+
+@functools.singledispatch
+def compile_ordered_unary_agg(
+    op: ops.aggregations.WindowOp,
+    column: sge.Expression,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:
+    raise ValueError(f"Can't compile unrecognized operation: {op}")
+
+
+# TODO: check @numeric_op
+@compile_unary_agg.register
+def _(
+    op: ops.aggregations.SumOp,
+    column: sge.Expression,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:
+    # Will be null if all inputs are null. Pandas defaults to zero sum though.
+    expr = _apply_window_if_present(sge.func("SUM", column), window)
+    return sge.func("IFNULL", expr, ir._literal(0, dtypes.INT_DTYPE))
+
+
+def _apply_window_if_present(
+    value: sge.Expression,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:
+    if window is not None:
+        raise NotImplementedError("Can't apply window to the expression.")
+    return window
diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py
@@ -22,6 +22,7 @@
 
 from bigframes.core import expression, guid, identifiers, nodes, pyarrow_utils, rewrite
 from bigframes.core.compile import configs
+import bigframes.core.compile.sqlglot.aggregate_compiler as aggregate_compiler
 from bigframes.core.compile.sqlglot.expressions import typed_expr
 import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler
 import bigframes.core.compile.sqlglot.sqlglot_ir as ir
@@ -267,6 +268,39 @@ def compile_random_sample(
     ) -> ir.SQLGlotIR:
         return child.sample(node.fraction)
 
+    @_compile_node.register
+    def compile_aggregate(
+        self, node: nodes.AggregateNode, child: ir.SQLGlotIR
+    ) -> ir.SQLGlotIR:
+        ordering_cols = tuple(
+            sge.Ordered(
+                this=scalar_compiler.compile_scalar_expression(
+                    ordering.scalar_expression
+                ),
+                desc=ordering.direction.is_ascending is False,
+                # TODO: _convert_row_ordering_to_table_values for overwrite.
+                nulls_first=ordering.na_last is False,
+            )
+            for ordering in node.order_by
+        )
+        aggregations: tuple[tuple[str, sge.Expression], ...] = tuple(
+            (id.sql, aggregate_compiler.compile_aggregate(agg, order_by=ordering_cols))
+            for agg, id in node.aggregations
+        )
+        by_cols: tuple[sge.Expression, ...] = tuple(
+            scalar_compiler.compile_scalar_expression(by_col)
+            for by_col in node.by_column_ids
+        )
+
+        result = child.aggregate(aggregations, by_cols)
+        # TODO(chelsealin): Support dropna
+        # TODO: Remove dropna field and use filter node instead
+        # if node.dropna:
+        #     for key in node.by_column_ids:
+        #         if node.child.field_by_id[key.id].nullable:
+        #             result = result.filter(operations.notnull_op.as_expr(key))
+        return result
+
 
 def _replace_unsupported_ops(node: nodes.BigFrameNode):
     node = nodes.bottom_up(node, rewrite.rewrite_slice)
diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py
@@ -25,11 +25,9 @@
 import sqlglot.expressions as sge
 
 from bigframes import dtypes
-from bigframes.core import guid, utils
+from bigframes.core import guid, local_data, schema, utils
 from bigframes.core.compile.sqlglot.expressions import typed_expr
 import bigframes.core.compile.sqlglot.sqlglot_types as sgt
-import bigframes.core.local_data as local_data
-import bigframes.core.schema as bf_schema
 
 # shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0.
 try:
@@ -68,7 +66,7 @@ def sql(self) -> str:
     def from_pyarrow(
         cls,
         pa_table: pa.Table,
-        schema: bf_schema.ArraySchema,
+        schema: schema.ArraySchema,
         uid_gen: guid.SequentialUIDGenerator,
     ) -> SQLGlotIR:
         """Builds SQLGlot expression from a pyarrow table.
@@ -364,6 +362,38 @@ def sample(self, fraction: float) -> SQLGlotIR:
         ).where(condition, append=False)
         return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
 
+    def aggregate(
+        self,
+        aggregations: tuple[tuple[str, sge.Expression], ...],
+        by_column_ids: tuple[sge.Expression, ...],
+    ) -> SQLGlotIR:
+        """Applies the aggregation expressions.
+
+        Args:
+            aggregations: output_column_id, aggregation_expr tuples
+            by_column_ids: column ids of the aggregation key, this is preserved through
+              the transform
+            dropna: whether null keys should be dropped
+        """
+        aggregations_expr = [
+            sge.Alias(
+                this=expr,
+                alias=sge.to_identifier(id, quoted=self.quoted),
+            )
+            for id, expr in aggregations
+        ]
+
+        new_expr = _select_to_cte(
+            self.expr,
+            sge.to_identifier(
+                next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted
+            ),
+        )
+        new_expr = new_expr.group_by(*by_column_ids).select(
+            *[*by_column_ids, *aggregations_expr], append=False
+        )
+        return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
+
     def insert(
         self,
         destination: bigquery.TableReference,
diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_aggregate/test_compile_aggregate/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_aggregate/test_compile_aggregate/out.sql
@@ -0,0 +1,25 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `bool_col` AS `bfcol_0`,
+    `int64_too` AS `bfcol_1`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    *,
+    `bfcol_1` AS `bfcol_2`,
+    `bfcol_0` AS `bfcol_3`
+  FROM `bfcte_0`
+), `bfcte_2` AS (
+  SELECT
+    `bfcol_3`,
+    COALESCE(SUM(`bfcol_2`), 0) AS `bfcol_6`
+  FROM `bfcte_1`
+  GROUP BY
+    `bfcol_3`
+)
+SELECT
+  `bfcol_3` AS `bool_col`,
+  `bfcol_6` AS `int64_too`
+FROM `bfcte_2`
+ORDER BY
+  `bfcol_3` ASC NULLS LAST
diff --git a/tests/unit/core/compile/sqlglot/test_compile_aggregate.py b/tests/unit/core/compile/sqlglot/test_compile_aggregate.py
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import bigframes.pandas as bpd
+
+pytest.importorskip("pytest_snapshot")
+
+
+def test_compile_aggregate(scalar_types_df: bpd.DataFrame, snapshot):
+    result = scalar_types_df["int64_too"].groupby(scalar_types_df["bool_col"]).sum()
+    snapshot.assert_match(result.to_frame().sql, "out.sql")