refactor: add ArrayReduceOp to the sqlglot compiler

chelsea-lin · chelsea-lin · commit 1e5f69839ade · 2025-11-13T21:33:06.000Z
diff --git a/bigframes/core/compile/sqlglot/aggregations/ordered_unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/ordered_unary_compiler.py
@@ -27,7 +27,7 @@ def compile(
     op: agg_ops.WindowOp,
     column: typed_expr.TypedExpr,
     *,
-    order_by: tuple[sge.Expression, ...],
+    order_by: tuple[sge.Expression, ...] = (),
 ) -> sge.Expression:
     return ORDERED_UNARY_OP_REGISTRATION[op](op, column, order_by=order_by)
 
diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py
@@ -49,6 +49,19 @@ def _(
     return sge.func("IFNULL", result, sge.true())
 
 
+@UNARY_OP_REGISTRATION.register(agg_ops.AnyOp)
+def _(
+    op: agg_ops.AnyOp,
+    column: typed_expr.TypedExpr,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:
+    expr = column.expr
+    expr = apply_window_if_present(sge.func("LOGICAL_OR", expr), window)
+
+    # BigQuery returns NULL for an empty column, whereas pandas returns False.
+    return sge.func("COALESCE", expr, sge.convert(False))  # NULL -> FALSE
+
+
 @UNARY_OP_REGISTRATION.register(agg_ops.ApproxQuartilesOp)
 def _(
     op: agg_ops.ApproxQuartilesOp,
diff --git a/bigframes/core/compile/sqlglot/expressions/array_ops.py b/bigframes/core/compile/sqlglot/expressions/array_ops.py
@@ -16,7 +16,7 @@
 
 import typing
 
-import sqlglot
+import sqlglot as sg
 import sqlglot.expressions as sge
 
 from bigframes import operations as ops
@@ -38,17 +38,45 @@ def _(expr: TypedExpr, op: ops.ArrayIndexOp) -> sge.Expression:
     )
 
 
+@register_unary_op(ops.ArrayReduceOp, pass_op=True)
+def _(expr: TypedExpr, op: ops.ArrayReduceOp) -> sge.Expression:
+    sub_expr = sg.to_identifier("bf_arr_reduce_uid")  # alias of unnested elements
+    sub_type = dtypes.get_array_inner_type(expr.dtype)  # array's inner dtype
+
+    if op.aggregation.order_independent:  # choose the matching aggregation compiler
+        from bigframes.core.compile.sqlglot.aggregations import unary_compiler
+
+        agg_expr = unary_compiler.compile(op.aggregation, TypedExpr(sub_expr, sub_type))
+    else:
+        from bigframes.core.compile.sqlglot.aggregations import ordered_unary_compiler
+
+        agg_expr = ordered_unary_compiler.compile(  # order_by left at its default
+            op.aggregation, TypedExpr(sub_expr, sub_type)
+        )
+
+    return (
+        sge.select(agg_expr)  # aggregate over the unnested elements
+        .from_(
+            sge.Unnest(
+                expressions=[expr.expr],
+                alias=sge.TableAlias(columns=[sub_expr]),
+            )
+        )
+        .subquery()  # emitted as a parenthesized subquery
+    )
+
+
 @register_unary_op(ops.ArraySliceOp, pass_op=True)
 def _(expr: TypedExpr, op: ops.ArraySliceOp) -> sge.Expression:
-    slice_idx = sqlglot.to_identifier("slice_idx")
+    slice_idx = sg.to_identifier("slice_idx")
 
     conditions: typing.List[sge.Predicate] = [slice_idx >= op.start]
 
     if op.stop is not None:
         conditions.append(slice_idx < op.stop)
 
     # local name for each element in the array
-    el = sqlglot.to_identifier("el")
+    el = sg.to_identifier("el")
 
     selected_elements = (
         sge.select(el)
diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_any/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_any/out.sql
@@ -0,0 +1,12 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `bool_col`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    COALESCE(LOGICAL_OR(`bool_col`), FALSE) AS `bfcol_1`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_1` AS `bool_col`
+FROM `bfcte_1`
diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_any/window_out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_any/window_out.sql
@@ -0,0 +1,17 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `bool_col`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    *,
+    CASE
+      WHEN `bool_col` IS NULL
+      THEN NULL
+      ELSE COALESCE(LOGICAL_OR(`bool_col`) OVER (), FALSE)
+    END AS `bfcol_1`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_1` AS `agg_bool`
+FROM `bfcte_1`
diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py
@@ -88,6 +88,20 @@ def test_all(scalar_types_df: bpd.DataFrame, snapshot):
     snapshot.assert_match(sql_window_partition, "window_partition_out.sql")
 
 
+def test_any(scalar_types_df: bpd.DataFrame, snapshot):
+    col_name = "bool_col"
+    bf_df = scalar_types_df[[col_name]]
+    agg_expr = agg_ops.AnyOp().as_expr(col_name)
+    sql = _apply_unary_agg_ops(bf_df, [agg_expr], [col_name])
+
+    snapshot.assert_match(sql, "out.sql")
+
+    # Window test: compile the same AnyOp expression with an ordered window spec.
+    window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),))
+    sql_window = _apply_unary_window_op(bf_df, agg_expr, window, "agg_bool")
+    snapshot.assert_match(sql_window, "window_out.sql")
+
+
 def test_approx_quartiles(scalar_types_df: bpd.DataFrame, snapshot):
     col_name = "int64_col"
     bf_df = scalar_types_df[[col_name]]
diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_reduce_op/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_array_ops/test_array_reduce_op/out.sql
@@ -0,0 +1,37 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `bool_list_col`,
+    `float_list_col`,
+    `string_list_col`
+  FROM `bigframes-dev`.`sqlglot_test`.`repeated_types`
+), `bfcte_1` AS (
+  SELECT
+    *,
+    (
+      SELECT
+        COALESCE(SUM(bf_arr_reduce_uid), 0)
+      FROM UNNEST(`float_list_col`) AS bf_arr_reduce_uid
+    ) AS `bfcol_3`,
+    (
+      SELECT
+        STDDEV(bf_arr_reduce_uid)
+      FROM UNNEST(`float_list_col`) AS bf_arr_reduce_uid
+    ) AS `bfcol_4`,
+    (
+      SELECT
+        COUNT(bf_arr_reduce_uid)
+      FROM UNNEST(`string_list_col`) AS bf_arr_reduce_uid
+    ) AS `bfcol_5`,
+    (
+      SELECT
+        COALESCE(LOGICAL_OR(bf_arr_reduce_uid), FALSE)
+      FROM UNNEST(`bool_list_col`) AS bf_arr_reduce_uid
+    ) AS `bfcol_6`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_3` AS `sum_float`,
+  `bfcol_4` AS `std_float`,
+  `bfcol_5` AS `count_str`,
+  `bfcol_6` AS `any_bool`
+FROM `bfcte_1`
diff --git a/tests/unit/core/compile/sqlglot/expressions/test_array_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_array_ops.py
@@ -17,6 +17,7 @@
 from bigframes import operations as ops
 from bigframes.core import expression
 from bigframes.operations._op_converters import convert_index, convert_slice
+import bigframes.operations.aggregations as agg_ops
 import bigframes.pandas as bpd
 from bigframes.testing import utils
 
@@ -43,6 +44,20 @@ def test_array_index(repeated_types_df: bpd.DataFrame, snapshot):
     snapshot.assert_match(sql, "out.sql")
 
 
+def test_array_reduce_op(repeated_types_df: bpd.DataFrame, snapshot):
+    ops_map = {  # output column name -> ArrayReduceOp expression on a list column
+        "sum_float": ops.ArrayReduceOp(agg_ops.SumOp()).as_expr("float_list_col"),
+        "std_float": ops.ArrayReduceOp(agg_ops.StdOp()).as_expr("float_list_col"),
+        "count_str": ops.ArrayReduceOp(agg_ops.CountOp()).as_expr("string_list_col"),
+        "any_bool": ops.ArrayReduceOp(agg_ops.AnyOp()).as_expr("bool_list_col"),
+    }
+
+    sql = utils._apply_ops_to_sql(
+        repeated_types_df, list(ops_map.values()), list(ops_map.keys())
+    )
+    snapshot.assert_match(sql, "out.sql")
+
+
 def test_array_slice_with_only_start(repeated_types_df: bpd.DataFrame, snapshot):
     col_name = "string_list_col"
     bf_df = repeated_types_df[[col_name]]