refactor: add agg_ops.std_op for the sqlglot compiler

chelsea-lin · chelsea-lin · commit 012a04bf6e27 · 2025-11-04T00:54:39.000Z
diff --git a/bigframes/core/compile/sqlglot/aggregations/op_registration.py b/bigframes/core/compile/sqlglot/aggregations/op_registration.py
@@ -52,5 +52,5 @@ def arg_checker(*args, **kwargs):
     def __getitem__(self, op: str | agg_ops.WindowOp) -> CompilationFunc:
         key = op if isinstance(op, type) else type(op)
         if str(key) not in self._registered_ops:
-            raise ValueError(f"{key} is already not registered")
+            raise ValueError(f"{key} is not registered")
         return self._registered_ops[str(key)]
diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py
@@ -278,6 +278,22 @@ def _(
     return apply_window_if_present(sge.func("COUNT", sge.convert(1)), window)
 
 
+@UNARY_OP_REGISTRATION.register(agg_ops.StdOp)
+def _(
+    op: agg_ops.StdOp,
+    column: typed_expr.TypedExpr,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:
+    expr = column.expr
+    if column.dtype == dtypes.BOOL_DTYPE:  # BOOL operands are cast to INT64 first
+        expr = sge.Cast(this=expr, to="INT64")
+    # STDDEV is applied first; the optional FLOOR + INT64 cast wraps its result.
+    expr = sge.func("STDDEV", expr)
+    if op.should_floor_result or column.dtype == dtypes.TIMEDELTA_DTYPE:
+        expr = sge.Cast(this=sge.func("FLOOR", expr), to="INT64")
+    return apply_window_if_present(expr, window)  # window defaults to None
+
+
 @UNARY_OP_REGISTRATION.register(agg_ops.ShiftOp)
 def _(
     op: agg_ops.ShiftOp,
diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/out.sql
@@ -0,0 +1,27 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `bool_col` AS `bfcol_0`,
+    `int64_col` AS `bfcol_1`,
+    `duration_col` AS `bfcol_2`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    *,
+    `bfcol_1` AS `bfcol_6`,
+    `bfcol_0` AS `bfcol_7`,
+    `bfcol_2` AS `bfcol_8`
+  FROM `bfcte_0`
+), `bfcte_2` AS (
+  SELECT
+    STDDEV(`bfcol_6`) AS `bfcol_12`,
+    STDDEV(CAST(`bfcol_7` AS INT64)) AS `bfcol_13`,
+    CAST(FLOOR(STDDEV(`bfcol_8`)) AS INT64) AS `bfcol_14`,
+    CAST(FLOOR(STDDEV(`bfcol_6`)) AS INT64) AS `bfcol_15`
+  FROM `bfcte_1`
+)
+SELECT
+  `bfcol_12` AS `int64_col`,
+  `bfcol_13` AS `bool_col`,
+  `bfcol_14` AS `duration_col`,
+  `bfcol_15` AS `int64_col_w_floor`
+FROM `bfcte_2`
diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/window_out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_std/window_out.sql
@@ -0,0 +1,13 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `int64_col` AS `bfcol_0`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    *,
+    CASE WHEN `bfcol_0` IS NULL THEN NULL ELSE STDDEV(`bfcol_0`) OVER () END AS `bfcol_1`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_1` AS `agg_int64`
+FROM `bfcte_1`
diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py
@@ -428,6 +428,40 @@ def test_shift(scalar_types_df: bpd.DataFrame, snapshot):
     snapshot.assert_match(noop_sql, "noop.sql")
 
 
+def test_std(scalar_types_df: bpd.DataFrame, snapshot):
+    col_names = ["int64_col", "bool_col", "duration_col"]
+    bf_df = scalar_types_df[col_names]
+    bf_df["duration_col"] = bpd.to_timedelta(bf_df["duration_col"], unit="us")
+
+    # The `to_timedelta` creates a new mapping for the column id.
+    col_names.insert(0, "rowindex")  # presumably column_ids[0] is the index; verify
+    name2id = {
+        col_name: col_id
+        for col_name, col_id in zip(col_names, bf_df._block.expr.column_ids)
+    }
+    # int64_col is aggregated twice: once plainly, once with should_floor_result.
+    agg_ops_map = {
+        "int64_col": agg_ops.StdOp().as_expr(name2id["int64_col"]),
+        "bool_col": agg_ops.StdOp().as_expr(name2id["bool_col"]),
+        "duration_col": agg_ops.StdOp().as_expr(name2id["duration_col"]),
+        "int64_col_w_floor": agg_ops.StdOp(should_floor_result=True).as_expr(
+            name2id["int64_col"]
+        ),
+    }
+    sql = _apply_unary_agg_ops(
+        bf_df, list(agg_ops_map.values()), list(agg_ops_map.keys())
+    )
+    snapshot.assert_match(sql, "out.sql")
+
+    # Window case: StdOp on int64_col with a window ordered descending by it.
+    col_name = "int64_col"
+    bf_df_int = scalar_types_df[[col_name]]
+    agg_expr = agg_ops.StdOp().as_expr(col_name)
+    window = window_spec.WindowSpec(ordering=(ordering.descending_over(col_name),))
+    sql_window = _apply_unary_window_op(bf_df_int, agg_expr, window, "agg_int64")
+    snapshot.assert_match(sql_window, "window_out.sql")
+
+
 def test_sum(scalar_types_df: bpd.DataFrame, snapshot):
     bf_df = scalar_types_df[["int64_col", "bool_col"]]
     agg_ops_map = {