From 876d2a490faaa7365019ab209f9f41c1f74db367 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 24 Sep 2025 18:48:25 +0000 Subject: [PATCH 1/4] refactor: support agg_ops.DenseRankOp and RankOp for sqlglot compiler --- .../sqlglot/aggregations/unary_compiler.py | 9 +++++ bigframes/operations/aggregations.py | 2 + .../test_dense_rank/out.sql | 19 +++++++++ .../aggregations/test_unary_compiler.py | 39 ++++++++++++++++++- 4 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py index 8ed5510ec2..a4b6412f08 100644 --- a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py @@ -47,6 +47,15 @@ def _( return apply_window_if_present(sge.func("COUNT", column.expr), window) +@UNARY_OP_REGISTRATION.register(agg_ops.DenseRankOp) +def _( + op: agg_ops.DenseRankOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + return apply_window_if_present(sge.func("DENSE_RANK"), window) + + @UNARY_OP_REGISTRATION.register(agg_ops.MaxOp) def _( op: agg_ops.MaxOp, diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 7b6998b90e..f6e8600d42 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -519,6 +519,8 @@ def implicitly_inherits_order(self): @dataclasses.dataclass(frozen=True) class DenseRankOp(UnaryWindowOp): + name: ClassVar[str] = "dense_rank" + @property def skips_nulls(self): return False diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql new file mode 100644 index 0000000000..d658d704de --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql @@ -0,0 +1,19 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + DENSE_RANK() OVER ( + ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST + RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `bfuid_col_1`, + `bfcol_0` AS `int64_col`, + `bfcol_4` AS `agg_int64` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py index a5ffda0e65..19969cc233 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py @@ -17,7 +17,14 @@ import pytest from bigframes.core import agg_expressions as agg_exprs -from bigframes.core import array_value, identifiers, nodes +from bigframes.core import ( + array_value, + expression, + identifiers, + nodes, + ordering, + window_spec, +) from bigframes.operations import aggregations as agg_ops import bigframes.pandas as bpd @@ -38,6 +45,24 @@ def _apply_unary_agg_ops( return sql +def _apply_unary_window_op( + obj: bpd.DataFrame, + op: agg_exprs.UnaryAggregation, + window_spec: window_spec.WindowSpec, + new_name: str, +) -> str: + win_node = nodes.WindowOpNode( + obj._block.expr.node, + expression=op, + window_spec=window_spec, + output_name=identifiers.ColumnId(new_name), + ) + result = array_value.ArrayValue(win_node) + + sql = result.session._executor.to_sql(result, enable_cache=False) + return sql + + def test_count(scalar_types_df: bpd.DataFrame, snapshot): col_name = "int64_col" bf_df = scalar_types_df[[col_name]] @@ -47,6 +72,18 @@ def test_count(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_dense_rank(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "int64_col" + bf_df = scalar_types_df[[col_name]] + agg_expr = agg_exprs.UnaryAggregation( + agg_ops.DenseRankOp(), expression.deref("int64_col") + ) + window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),)) + sql = _apply_unary_window_op(bf_df, agg_expr, window, "agg_int64") + + snapshot.assert_match(sql, "out.sql") + + def test_max(scalar_types_df: bpd.DataFrame, snapshot): col_name = "int64_col" bf_df = scalar_types_df[[col_name]] From 371f0b58a879d3cddcf19b532f8093dc8e719119 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 24 Sep 2025 18:51:26 +0000 Subject: [PATCH 2/4] dense_rank and rank doesn't support window framing clauses --- .../compile/sqlglot/aggregations/windows.py | 19 +++++++++++++++++++ .../test_dense_rank/out.sql | 5 +---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/bigframes/core/compile/sqlglot/aggregations/windows.py b/bigframes/core/compile/sqlglot/aggregations/windows.py index 4d7a3f7406..9a3b52b1a1 100644 --- a/bigframes/core/compile/sqlglot/aggregations/windows.py +++ b/bigframes/core/compile/sqlglot/aggregations/windows.py @@ -64,6 +64,25 @@ def apply_window_if_present( if not window.bounds and not order: return sge.Window(this=value, partition_by=group_by) + # Ranking and navigation functions do not support window framing clauses. + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/analytic-function-concepts#window-frame-clause + no_frame_functions = { + "RANK", + "DENSE_RANK", + "NTILE", + "PERCENT_RANK", + "CUME_DIST", + "ROW_NUMBER", + "LAG", + "LEAD", + "FIRST_VALUE", + "LAST_VALUE", + "NTH_VALUE", + } + func_name = value.sql_name().upper() if isinstance(value, sge.Func) else "" + if not window.bounds and func_name in no_frame_functions: + return sge.Window(this=value, partition_by=group_by, order=order) + kind = ( "ROWS" if isinstance(window.bounds, window_spec.RowsWindowBounds) else "RANGE" ) diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql index d658d704de..0a55f2fb38 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql @@ -6,10 +6,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - DENSE_RANK() OVER ( - ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST - RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING - ) AS `bfcol_4` + DENSE_RANK() OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST) AS `bfcol_4` FROM `bfcte_0` ) SELECT From 801da075c4a556d1d301ee52c4bd1c5ac66ec1e3 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 24 Sep 2025 20:23:04 +0000 Subject: [PATCH 3/4] support rank --- .../sqlglot/aggregations/unary_compiler.py | 9 +++++++++ .../test_unary_compiler/test_rank/out.sql | 16 ++++++++++++++++ .../sqlglot/aggregations/test_unary_compiler.py | 13 ++++++++++++- 3 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_rank/out.sql diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py index a4b6412f08..bf4cdabc25 100644 --- a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py @@ -115,6 +115,15 @@ def _( return apply_window_if_present(sge.func("COUNT", sge.convert(1)), window) +@UNARY_OP_REGISTRATION.register(agg_ops.RankOp) +def _( + op: agg_ops.RankOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + return apply_window_if_present(sge.func("RANK"), window) + + @UNARY_OP_REGISTRATION.register(agg_ops.SumOp) def _( op: agg_ops.SumOp, diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_rank/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_rank/out.sql new file mode 100644 index 0000000000..067d709e41 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_rank/out.sql @@ -0,0 +1,16 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + RANK() OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST) AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `bfuid_col_1`, + `bfcol_0` AS `int64_col`, + `bfcol_4` AS `agg_int64` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py index 19969cc233..9c7175035d 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py @@ -76,7 +76,7 @@ def test_dense_rank(scalar_types_df: bpd.DataFrame, snapshot): col_name = "int64_col" bf_df = scalar_types_df[[col_name]] agg_expr = agg_exprs.UnaryAggregation( - agg_ops.DenseRankOp(), expression.deref("int64_col") + agg_ops.DenseRankOp(), expression.deref(col_name) ) window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),)) sql = _apply_unary_window_op(bf_df, agg_expr, window, "agg_int64") @@ -141,6 +141,17 @@ def test_min(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_rank(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "int64_col" + bf_df = scalar_types_df[[col_name]] + agg_expr = agg_exprs.UnaryAggregation(agg_ops.RankOp(), expression.deref(col_name)) + + window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),)) + sql = _apply_unary_window_op(bf_df, agg_expr, window, "agg_int64") + + snapshot.assert_match(sql, "out.sql") + + def test_sum(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col", "bool_col"]] agg_ops_map = { From 2544ee53c6cbcf392b6673b22b623e42ece4b4ea Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 24 Sep 2025 20:40:12 +0000 Subject: [PATCH 4/4] select some columns --- .../sqlglot/aggregations/unary_compiler.py | 10 ++++++++-- .../compile/sqlglot/aggregations/windows.py | 19 ++----------------- .../test_dense_rank/out.sql | 9 +++------ .../test_unary_compiler/test_rank/out.sql | 9 +++------ .../aggregations/test_unary_compiler.py | 2 +- 5 files changed, 17 insertions(+), 32 deletions(-) diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py index bf4cdabc25..598a89e4eb 100644 --- a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py @@ -53,7 +53,10 @@ def _( column: typed_expr.TypedExpr, window: typing.Optional[window_spec.WindowSpec] = None, ) -> sge.Expression: - return apply_window_if_present(sge.func("DENSE_RANK"), window) + # Ranking functions do not support window framing clauses. + return apply_window_if_present( + sge.func("DENSE_RANK"), window, include_framing_clauses=False + ) @UNARY_OP_REGISTRATION.register(agg_ops.MaxOp) @@ -121,7 +124,10 @@ def _( column: typed_expr.TypedExpr, window: typing.Optional[window_spec.WindowSpec] = None, ) -> sge.Expression: - return apply_window_if_present(sge.func("RANK"), window) + # Ranking functions do not support window framing clauses. + return apply_window_if_present( + sge.func("RANK"), window, include_framing_clauses=False + ) @UNARY_OP_REGISTRATION.register(agg_ops.SumOp) diff --git a/bigframes/core/compile/sqlglot/aggregations/windows.py b/bigframes/core/compile/sqlglot/aggregations/windows.py index 9a3b52b1a1..1bfa72b878 100644 --- a/bigframes/core/compile/sqlglot/aggregations/windows.py +++ b/bigframes/core/compile/sqlglot/aggregations/windows.py @@ -25,6 +25,7 @@ def apply_window_if_present( value: sge.Expression, window: typing.Optional[window_spec.WindowSpec] = None, + include_framing_clauses: bool = True, ) -> sge.Expression: if window is None: return value @@ -64,23 +65,7 @@ def apply_window_if_present( if not window.bounds and not order: return sge.Window(this=value, partition_by=group_by) - # Ranking and navigation functions do not support window framing clauses. - # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/analytic-function-concepts#window-frame-clause - no_frame_functions = { - "RANK", - "DENSE_RANK", - "NTILE", - "PERCENT_RANK", - "CUME_DIST", - "ROW_NUMBER", - "LAG", - "LEAD", - "FIRST_VALUE", - "LAST_VALUE", - "NTH_VALUE", - } - func_name = value.sql_name().upper() if isinstance(value, sge.Func) else "" - if not window.bounds and func_name in no_frame_functions: + if not window.bounds and not include_framing_clauses: return sge.Window(this=value, partition_by=group_by, order=order) kind = ( diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql index 0a55f2fb38..38b6ed9f5c 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_dense_rank/out.sql @@ -1,16 +1,13 @@ WITH `bfcte_0` AS ( SELECT - `int64_col` AS `bfcol_0`, - `rowindex` AS `bfcol_1` + `int64_col` AS `bfcol_0` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT *, - DENSE_RANK() OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST) AS `bfcol_4` + DENSE_RANK() OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST) AS `bfcol_1` FROM `bfcte_0` ) SELECT - `bfcol_1` AS `bfuid_col_1`, - `bfcol_0` AS `int64_col`, - `bfcol_4` AS `agg_int64` + `bfcol_1` AS `agg_int64` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_rank/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_rank/out.sql index 067d709e41..5de2330ef6 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_rank/out.sql +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_rank/out.sql @@ -1,16 +1,13 @@ WITH `bfcte_0` AS ( SELECT - `int64_col` AS `bfcol_0`, - `rowindex` AS `bfcol_1` + `int64_col` AS `bfcol_0` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT *, - RANK() OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST) AS `bfcol_4` + RANK() OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST) AS `bfcol_1` FROM `bfcte_0` ) SELECT - `bfcol_1` AS `bfuid_col_1`, - `bfcol_0` AS `int64_col`, - `bfcol_4` AS `agg_int64` + `bfcol_1` AS `agg_int64` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py index 9c7175035d..bf2523930f 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py @@ -57,7 +57,7 @@ def _apply_unary_window_op( window_spec=window_spec, output_name=identifiers.ColumnId(new_name), ) - result = array_value.ArrayValue(win_node) + result = array_value.ArrayValue(win_node).select_columns([new_name]) sql = result.session._executor.to_sql(result, enable_cache=False) return sql