refactor: add agg_ops.QcutOp to the sqlglot compiler

chelsea-lin · chelsea-lin · commit 1be0f8c5ff4d · 2025-11-13T23:58:19.000Z
diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py
@@ -253,6 +253,55 @@ def _(
     return apply_window_if_present(expr, window)
 
 
+@UNARY_OP_REGISTRATION.register(agg_ops.QcutOp)
+def _(
+    op: agg_ops.QcutOp,
+    column: typed_expr.TypedExpr,
+    window: typing.Optional[window_spec.WindowSpec] = None,
+) -> sge.Expression:
+    """Compile QcutOp into 0-based bucket labels derived from PERCENT_RANK().
+
+    Args:
+        op: The qcut op. ``op.quantiles`` is either an int bucket count or an
+            indexable sequence of quantile edges.
+        column: The input column; this compiler does not reference it.
+        window: Optional window applied to PERCENT_RANK(), without framing
+            clauses.
+
+    Returns:
+        For an int, ``GREATEST(CEIL(rank * quantiles), 1) - 1``. For a sequence
+        of edges, a CASE expression giving the index of the first interval whose
+        upper edge is >= the rank, and NULL when the rank is below the first
+        edge or above the last one.
+    """
+    percent_ranks = apply_window_if_present(
+        sge.func("PERCENT_RANK"), window, include_framing_clauses=False
+    )
+    if isinstance(op.quantiles, int):
+        quantiles_sql = ir._literal(op.quantiles, dtypes.INT_DTYPE)
+        float_bucket = percent_ranks * quantiles_sql
+        # PERCENT_RANK() is within [0, 1], so the ceiling never exceeds
+        # `quantiles`; only the lower bound needs clipping. A rank of 0 (the
+        # first row) would otherwise produce bucket 0 - 1 = -1.
+        ceil_val = sge.func("CEIL", float_bucket)
+        clipped = sge.func("GREATEST", ceil_val, sge.convert(1))
+        return sge.Sub(this=clipped, expression=sge.convert(1))
+    else:
+        case = sge.Case()
+        first_quantile = ir._literal(
+            op.quantiles[0], dtypes.infer_literal_type(op.quantiles[0])
+        )
+        # Ranks strictly below the first edge fall outside every bucket.
+        case = case.when(
+            sge.LT(this=percent_ranks, expression=first_quantile), sge.Null()
+        )
+        for i, upper_edge in enumerate(op.quantiles[1:]):
+            quantile = ir._literal(upper_edge, dtypes.infer_literal_type(upper_edge))
+            bucket = ir._literal(i, dtypes.INT_DTYPE)
+            case = case.when(sge.LTE(this=percent_ranks, expression=quantile), bucket)
+        return case.else_(sge.Null())
+
+
 @UNARY_OP_REGISTRATION.register(agg_ops.QuantileOp)
 def _(
     op: agg_ops.QuantileOp,
diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_qcut/int_quantiles.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_qcut/int_quantiles.sql
@@ -0,0 +1,13 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `int64_col`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    *,
+    GREATEST(CEIL(PERCENT_RANK() OVER () * 4), 1) - 1 AS `bfcol_1`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_1` AS `int_quantiles`
+FROM `bfcte_1`
diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_qcut/list_quantiles.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_qcut/list_quantiles.sql
@@ -0,0 +1,25 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `int64_col`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    *,
+    CASE
+      WHEN PERCENT_RANK() OVER () < 0
+      THEN NULL
+      WHEN PERCENT_RANK() OVER () <= 0.25
+      THEN 0
+      WHEN PERCENT_RANK() OVER () <= 0.5
+      THEN 1
+      WHEN PERCENT_RANK() OVER () <= 0.75
+      THEN 2
+      WHEN PERCENT_RANK() OVER () <= 1
+      THEN 3
+      ELSE NULL
+    END AS `bfcol_1`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_1` AS `list_quantiles`
+FROM `bfcte_1`
diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py
@@ -392,6 +392,25 @@ def test_pop_var(scalar_types_df: bpd.DataFrame, snapshot):
     snapshot.assert_match(sql_window, "window_out.sql")
 
 
+def test_qcut(scalar_types_df: bpd.DataFrame, snapshot):
+    """Snapshot the SQL compiled for QcutOp with int and explicit quantiles."""
+    col_name = "int64_col"
+    bf_df = scalar_types_df[[col_name]]
+    window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),))
+
+    # Each case is compiled as a window op and matched against "<name>.sql".
+    cases = (
+        ("int_quantiles", 4),
+        ("list_quantiles", (0, 0.25, 0.5, 0.75, 1)),
+    )
+    for name, quantiles in cases:
+        agg = agg_exprs.UnaryAggregation(
+            agg_ops.QcutOp(quantiles=quantiles), expression.deref(col_name)
+        )
+        sql = _apply_unary_window_op(bf_df, agg, window, name)
+        snapshot.assert_match(sql, f"{name}.sql")
+
+
 def test_quantile(scalar_types_df: bpd.DataFrame, snapshot):
     col_name = "int64_col"
     bf_df = scalar_types_df[[col_name]]