
Commit 616eccf

fix various problems, migrate rank to new api
1 parent 05497c6 commit 616eccf

File tree

5 files changed: +67 −92 lines changed

bigframes/core/agg_expressions.py

Lines changed: 1 addition & 1 deletion
@@ -210,7 +210,7 @@ def transform_children(
         t: Callable[[expression.Expression], expression.Expression],
     ) -> WindowExpression:
         return WindowExpression(
-            self.analytic_expr.transform_children(t),
+            t(self.analytic_expr),  # type: ignore
             self.window.transform_exprs(t),
         )

bigframes/core/block_transforms.py

Lines changed: 55 additions & 81 deletions
@@ -431,16 +431,11 @@ def rank(
 
     columns = columns or tuple(col for col in block.value_columns)
     labels = [block.col_id_to_label[id] for id in columns]
-    # Step 1: Calculate row numbers for each row
-    # Identify null values to be treated according to na_option param
-    rownum_col_ids = []
-    nullity_col_ids = []
+
+    result_exprs = []
     for col in columns:
-        block, nullity_col_id = block.apply_unary_op(
-            col,
-            ops.isnull_op,
-        )
-        nullity_col_ids.append(nullity_col_id)
+        # Step 1: Calculate row numbers for each row
+        # Identify null values to be treated according to na_option param
         window_ordering = (
             ordering.OrderingExpression(
                 ex.deref(col),
@@ -451,87 +446,66 @@ def rank(
             ),
         )
         # Count_op ignores nulls, so if na_option is "top" or "bottom", we instead count the nullity columns, where nulls have been mapped to bools
-        block, rownum_id = block.apply_window_op(
-            col if na_option == "keep" else nullity_col_id,
-            agg_ops.dense_rank_op if method == "dense" else agg_ops.count_op,
-            window_spec=windows.unbound(
-                grouping_keys=grouping_cols, ordering=window_ordering
-            )
+        target_expr = (
+            ex.deref(col) if na_option == "keep" else ops.isnull_op.as_expr(col)
+        )
+        window_op = agg_ops.dense_rank_op if method == "dense" else agg_ops.count_op
+        window_spec = (
+            windows.unbound(grouping_keys=grouping_cols, ordering=window_ordering)
             if method == "dense"
             else windows.rows(
                 end=0, ordering=window_ordering, grouping_keys=grouping_cols
-            ),
-            skip_reproject_unsafe=(col != columns[-1]),
+            )
+        )
+        result_expr: ex.Expression = agg_expressions.WindowExpression(
+            agg_expressions.UnaryAggregation(window_op, target_expr), window_spec
         )
         if pct:
-            block, max_id = block.apply_window_op(
-                rownum_id, agg_ops.max_op, windows.unbound(grouping_keys=grouping_cols)
+            result_expr = ops.div_op.as_expr(
+                result_expr,
+                agg_expressions.WindowExpression(
+                    agg_expressions.UnaryAggregation(agg_ops.max_op, result_expr),
+                    windows.unbound(grouping_keys=grouping_cols),
+                ),
             )
-            block, rownum_id = block.project_expr(ops.div_op.as_expr(rownum_id, max_id))
-
-        rownum_col_ids.append(rownum_id)
-
-    # Step 2: Apply aggregate to groups of like input values.
-    # This step is skipped for method=='first' or 'dense'
-    if method in ["average", "min", "max"]:
-        agg_op = {
-            "average": agg_ops.mean_op,
-            "min": agg_ops.min_op,
-            "max": agg_ops.max_op,
-        }[method]
-        post_agg_rownum_col_ids = []
-        for i in range(len(columns)):
-            block, result_id = block.apply_window_op(
-                rownum_col_ids[i],
-                agg_op,
-                window_spec=windows.unbound(grouping_keys=(columns[i], *grouping_cols)),
-                skip_reproject_unsafe=(i < (len(columns) - 1)),
+        # Step 2: Apply aggregate to groups of like input values.
+        # This step is skipped for method=='first' or 'dense'
+        if method in ["average", "min", "max"]:
+            agg_op = {
+                "average": agg_ops.mean_op,
+                "min": agg_ops.min_op,
+                "max": agg_ops.max_op,
+            }[method]
+            result_expr = agg_expressions.WindowExpression(
+                agg_expressions.UnaryAggregation(agg_op, result_expr),
+                windows.unbound(grouping_keys=(col, *grouping_cols)),
             )
-            post_agg_rownum_col_ids.append(result_id)
-        rownum_col_ids = post_agg_rownum_col_ids
-
-    # Pandas masks all values where any grouping column is null
-    # Note: we use pd.NA instead of float('nan')
-    if grouping_cols:
-        predicate = functools.reduce(
-            ops.and_op.as_expr,
-            [ops.notnull_op.as_expr(column_id) for column_id in grouping_cols],
-        )
-        block = block.project_exprs(
-            [
-                ops.where_op.as_expr(
-                    ex.deref(col),
-                    predicate,
-                    ex.const(None),
-                )
-                for col in rownum_col_ids
-            ],
-            labels=labels,
-        )
-        rownum_col_ids = list(block.value_columns[-len(rownum_col_ids) :])
-
-    # Step 3: post processing: mask null values and cast to float
-    if method in ["min", "max", "first", "dense"]:
-        # Pandas rank always produces Float64, so must cast for aggregation types that produce ints
-        return (
-            block.select_columns(rownum_col_ids)
-            .multi_apply_unary_op(ops.AsTypeOp(pd.Float64Dtype()))
-            .with_column_labels(labels)
-        )
-    if na_option == "keep":
-        # For na_option "keep", null inputs must produce null outputs
-        exprs = []
-        for i in range(len(columns)):
-            exprs.append(
-                ops.where_op.as_expr(
-                    ex.const(pd.NA, dtype=pd.Float64Dtype()),
-                    nullity_col_ids[i],
-                    rownum_col_ids[i],
-                )
+        # Pandas masks all values where any grouping column is null
+        # Note: we use pd.NA instead of float('nan')
+        if grouping_cols:
+            predicate = functools.reduce(
+                ops.and_op.as_expr,
+                [ops.notnull_op.as_expr(column_id) for column_id in grouping_cols],
+            )
+            result_expr = ops.where_op.as_expr(
+                result_expr,
+                predicate,
+                ex.const(None),
             )
-        return block.project_exprs(exprs, labels=labels, drop=True)
 
-    return block.select_columns(rownum_col_ids).with_column_labels(labels)
+        # Step 3: post processing: mask null values and cast to float
+        if method in ["min", "max", "first", "dense"]:
+            # Pandas rank always produces Float64, so must cast for aggregation types that produce ints
+            result_expr = ops.AsTypeOp(pd.Float64Dtype()).as_expr(result_expr)
+        elif na_option == "keep":
+            # For na_option "keep", null inputs must produce null outputs
+            result_expr = ops.where_op.as_expr(
+                ex.const(pd.NA, dtype=pd.Float64Dtype()),
+                ops.isnull_op.as_expr(col),
+                result_expr,
+            )
+        result_exprs.append(result_expr)
+    return block.project_block_exprs(result_exprs, labels=labels, drop=True)
 
 
 def dropna(
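
For reference, the migrated rank() above still targets ordinary pandas rank() semantics; the Step 1-3 comments map onto the method, na_option, and pct options, now composed as window expressions rather than chained block operations. A minimal pandas-only sketch of that target behavior (standard pandas output, not code from this commit):

import pandas as pd

s = pd.Series([3, 1, None, 3, 7], dtype="Float64")

s.rank(method="average")                  # [2.5, 1.0, <NA>, 2.5, 4.0]  ties share the mean rank
s.rank(method="min")                      # [2.0, 1.0, <NA>, 2.0, 4.0]  ties share the lowest rank
s.rank(method="dense")                    # [2.0, 1.0, <NA>, 2.0, 3.0]  no gaps after ties
s.rank(method="first")                    # [2.0, 1.0, <NA>, 3.0, 4.0]  ties broken by row order
s.rank(method="min", na_option="bottom")  # [2.0, 1.0, 5.0, 2.0, 4.0]   nulls ranked last
s.rank(method="min", pct=True)            # [0.5, 0.25, <NA>, 0.5, 1.0] rank / count of valid rows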

bigframes/core/blocks.py

Lines changed: 1 addition & 0 deletions
@@ -1165,6 +1165,7 @@ def project_block_exprs(
         if drop:
             new_array = new_array.drop_columns(self.value_columns)
 
+        new_array.node.validate_tree()
         return Block(
             new_array,
             index_columns=self.index_columns,

bigframes/core/expression_factoring.py

Lines changed: 4 additions & 10 deletions
@@ -135,11 +135,10 @@ def push_into_tree(
         for child_id in expr.expr.column_references
         if child_id in by_id.keys()
     )
-    # be careful about merging multi-parent ids
     # TODO: Also prevent inlining expensive or non-deterministic
+    # We avoid inlining multi-parent ids, as they would be inlined multiple places, potentially increasing work and/or compiled text size
     multi_parent_ids = set(id for id in graph.nodes if len(graph.parents(id)) > 2)
     scalar_ids = set(expr.name for expr in exprs if expr.expr.is_scalar_expr)
-    post_ids = (*root.ids, *target_ids)
 
     def graph_extract_scalar_exprs() -> Sequence[NamedExpression]:
         results: dict[identifiers.ColumnId, expression.Expression] = dict()
@@ -168,11 +167,8 @@ def graph_extract_scalar_exprs() -> Sequence[NamedExpression]:
                 id: by_id[id].expr.bind_refs(results, allow_partial_bindings=True)
             }
             results.update(new_exprs)
-        return tuple(
-            NamedExpression(expr, id)
-            for id, expr in results.items()
-            if id in set([*graph.sinks, *target_ids])
-        )
+        # TODO: We can prune expressions that won't be reused here,
+        return tuple(NamedExpression(expr, id) for id, expr in results.items())
 
     def graph_extract_window_expr() -> Optional[
         Tuple[identifiers.ColumnId, agg_expressions.WindowExpression]
@@ -193,19 +189,17 @@ def graph_extract_window_expr() -> Optional[
     curr_root = nodes.ProjectionNode(
         curr_root, tuple((x.expr, x.name) for x in scalar_exprs)
     )
-    curr_root._validate()
     while result := graph_extract_window_expr():
         id, window_expr = result
         curr_root = nodes.WindowOpNode(
             curr_root, window_expr.analytic_expr, window_expr.window, output_name=id
         )
-        curr_root._validate()
     # TODO: Try to get the ordering right earlier, so can avoid this extra node.
+    post_ids = (*root.ids, *target_ids)
     if tuple(curr_root.ids) != post_ids:
         curr_root = nodes.SelectionNode(
             curr_root, tuple(nodes.AliasedRef.identity(id) for id in post_ids)
         )
-        curr_root._validate()
     return curr_root
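
The comment updated in push_into_tree above concerns duplicated work: an expression referenced by more than one consumer ("multi-parent") should stay a named projection rather than be inlined into each consumer. A toy, self-contained illustration of the concern (illustrative only, not bigframes code; the names below are made up):

# Naive textual inlining duplicates a shared definition in every consumer.
defs = {"c": "expensive(a)", "d": "c + 1", "e": "c * 2"}  # "c" has two parents: d and e

def inline(expr: str, sub: dict[str, str]) -> str:
    # Single-pass textual substitution, enough for this toy example.
    for name, body in sub.items():
        expr = expr.replace(name, f"({body})")
    return expr

print(inline(defs["d"], {"c": defs["c"]}))  # (expensive(a)) + 1
print(inline(defs["e"], {"c": defs["c"]}))  # (expensive(a)) * 2
# "expensive(a)" now appears in two places in the compiled text; keeping "c" as its
# own named column avoids recomputing it and keeps the generated text smaller.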

bigframes/core/nodes.py

Lines changed: 6 additions & 0 deletions
@@ -1199,6 +1199,7 @@ def _validate(self):
         for expression, _ in self.assignments:
             # throws TypeError if invalid
             _ = ex.bind_schema_fields(expression, self.child.field_by_id).output_type
+            assert expression.is_scalar_expr
         # Cannot assign to existing variables - append only!
         assert all(name not in self.child.schema.names for _, name in self.assignments)
 
@@ -1404,6 +1405,11 @@ def _validate(self):
             not self.window_spec.is_row_bounded
         ) or self.expression.op.implicitly_inherits_order
         assert all(ref in self.child.ids for ref in self.expression.column_references)
+        assert self.added_field.dtype is not None
+        for agg_child in self.expression.children:
+            assert agg_child.is_scalar_expr
+        for window_expr in self.window_spec.expressions:
+            assert window_expr.is_scalar_expr
 
     @property
     def non_local(self) -> bool:
