Skip to content

Commit 118c265

Browse files
authored
refactor: enhance aggregation tests and optimize null-last SQL for sqlglot (#2165)
1 parent 62f7e9f commit 118c265

File tree

35 files changed

+375
-66
lines changed

35 files changed

+375
-66
lines changed

bigframes/core/compile/sqlglot/aggregations/windows.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,9 @@ def apply_window_if_present(
4040
# Unbound grouping window.
4141
order_by = None
4242
elif window.is_range_bounded:
43-
# Note that, when the window is range-bounded, we only need one ordering key.
44-
# There are two reasons:
45-
# 1. Manipulating null positions requires more than one ordering key, which
46-
# is forbidden by SQL window syntax for range rolling.
47-
# 2. Pandas does not allow range rolling on timeseries with nulls.
48-
order_by = get_window_order_by((window.ordering[0],), override_null_order=False)
43+
order_by = get_window_order_by((window.ordering[0],))
4944
else:
50-
order_by = get_window_order_by(window.ordering, override_null_order=True)
45+
order_by = get_window_order_by(window.ordering)
5146

5247
order = sge.Order(expressions=order_by) if order_by else None
5348

@@ -102,7 +97,15 @@ def get_window_order_by(
10297
ordering: typing.Tuple[ordering_spec.OrderingExpression, ...],
10398
override_null_order: bool = False,
10499
) -> typing.Optional[tuple[sge.Ordered, ...]]:
105-
"""Returns the SQL order by clause for a window specification."""
100+
"""Returns the SQL order by clause for a window specification.
101+
Args:
102+
ordering (Tuple[ordering_spec.OrderingExpression, ...]):
103+
A tuple of ordering specification objects.
104+
override_null_order (bool):
105+
If True, overrides BigQuery's default null ordering behavior, which
106+
is sometimes incompatible with ordered aggregations. The generated SQL
107+
will include extra expressions to correctly enforce NULL FIRST/LAST.
108+
"""
106109
if not ordering:
107110
return None
108111

@@ -115,8 +118,6 @@ def get_window_order_by(
115118
nulls_first = not ordering_spec_item.na_last
116119

117120
if override_null_order:
118-
# Bigquery SQL considers NULLS to be "smallest" values, but we need
119-
# to override in these cases.
120121
is_null_expr = sge.Is(this=expr, expression=sge.Null())
121122
if nulls_first and desc:
122123
order_by.append(

bigframes/core/compile/sqlglot/compiler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,8 @@ def compile_random_sample(
299299
def compile_aggregate(
300300
self, node: nodes.AggregateNode, child: ir.SQLGlotIR
301301
) -> ir.SQLGlotIR:
302+
# The BigQuery ordered aggregation cannot support for NULL FIRST/LAST,
303+
# so we need to add extra expressions to enforce the null ordering.
302304
ordering_cols = windows.get_window_order_by(
303305
node.order_by, override_null_order=True
304306
)

tests/unit/core/compile/sqlglot/aggregations/snapshots/test_nullary_compiler/test_row_number_with_window/out.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ WITH `bfcte_0` AS (
55
), `bfcte_1` AS (
66
SELECT
77
*,
8-
ROW_NUMBER() OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST) AS `bfcol_1`
8+
ROW_NUMBER() OVER (ORDER BY `bfcol_0` ASC NULLS LAST) AS `bfcol_1`
99
FROM `bfcte_0`
1010
)
1111
SELECT
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`bool_col` AS `bfcol_0`
4+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
CASE
9+
WHEN `bfcol_0` IS NULL
10+
THEN NULL
11+
ELSE COALESCE(LOGICAL_AND(`bfcol_0`) OVER (), TRUE)
12+
END AS `bfcol_1`
13+
FROM `bfcte_0`
14+
)
15+
SELECT
16+
`bfcol_1` AS `agg_bool`
17+
FROM `bfcte_1`
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`bool_col` AS `bfcol_0`,
4+
`string_col` AS `bfcol_1`
5+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
6+
), `bfcte_1` AS (
7+
SELECT
8+
*,
9+
CASE
10+
WHEN `bfcol_0` IS NULL
11+
THEN NULL
12+
ELSE COALESCE(LOGICAL_AND(`bfcol_0`) OVER (PARTITION BY `bfcol_1`), TRUE)
13+
END AS `bfcol_2`
14+
FROM `bfcte_0`
15+
)
16+
SELECT
17+
`bfcol_2` AS `agg_bool`
18+
FROM `bfcte_1`
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`int64_col` AS `bfcol_0`
4+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
CASE WHEN `bfcol_0` IS NULL THEN NULL ELSE ANY_VALUE(`bfcol_0`) OVER () END AS `bfcol_1`
9+
FROM `bfcte_0`
10+
)
11+
SELECT
12+
`bfcol_1` AS `agg_int64`
13+
FROM `bfcte_1`
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`int64_col` AS `bfcol_0`,
4+
`string_col` AS `bfcol_1`
5+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
6+
), `bfcte_1` AS (
7+
SELECT
8+
*,
9+
CASE
10+
WHEN `bfcol_0` IS NULL
11+
THEN NULL
12+
ELSE ANY_VALUE(`bfcol_0`) OVER (PARTITION BY `bfcol_1`)
13+
END AS `bfcol_2`
14+
FROM `bfcte_0`
15+
)
16+
SELECT
17+
`bfcol_2` AS `agg_int64`
18+
FROM `bfcte_1`
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`int64_col` AS `bfcol_0`
4+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
COUNT(`bfcol_0`) OVER () AS `bfcol_1`
9+
FROM `bfcte_0`
10+
)
11+
SELECT
12+
`bfcol_1` AS `agg_int64`
13+
FROM `bfcte_1`
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`int64_col` AS `bfcol_0`,
4+
`string_col` AS `bfcol_1`
5+
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
6+
), `bfcte_1` AS (
7+
SELECT
8+
*,
9+
COUNT(`bfcol_0`) OVER (PARTITION BY `bfcol_1`) AS `bfcol_2`
10+
FROM `bfcte_0`
11+
)
12+
SELECT
13+
`bfcol_2` AS `agg_int64`
14+
FROM `bfcte_1`

tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_date_series_diff/out.sql

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,7 @@ WITH `bfcte_0` AS (
55
), `bfcte_1` AS (
66
SELECT
77
*,
8-
CAST(DATE_DIFF(
9-
`bfcol_0`,
10-
LAG(`bfcol_0`, 1) OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST),
11-
DAY
12-
) * 86400000000 AS INT64) AS `bfcol_1`
8+
CAST(DATE_DIFF(`bfcol_0`, LAG(`bfcol_0`, 1) OVER (ORDER BY `bfcol_0` ASC NULLS LAST), DAY) * 86400000000 AS INT64) AS `bfcol_1`
139
FROM `bfcte_0`
1410
)
1511
SELECT

0 commit comments

Comments
 (0)