
Commit 0eed1c5

Merge branch 'main' into add-py-314
2 parents 1e95ef5 + 4d5de14

File tree

14 files changed: +664 -272 lines changed

bigframes/core/array_value.py

Lines changed: 83 additions & 9 deletions
@@ -16,7 +16,6 @@
 from dataclasses import dataclass
 import datetime
 import functools
-import itertools
 import typing
 from typing import Iterable, List, Mapping, Optional, Sequence, Tuple
 
@@ -267,21 +266,96 @@ def compute_values(self, assignments: Sequence[ex.Expression]):
         )
 
     def compute_general_expression(self, assignments: Sequence[ex.Expression]):
+        """
+        Applies arbitrary column expressions to the current execution block.
+
+        This method transforms the logical plan by applying a sequence of expressions that
+        preserve the length of the input columns. It supports both scalar operations
+        and window functions. Each expression is assigned a unique internal column identifier.
+
+        Args:
+            assignments (Sequence[ex.Expression]): A sequence of expression objects
+                representing the transformations to apply to the columns.
+
+        Returns:
+            Tuple[ArrayValue, Tuple[str, ...]]: A tuple containing:
+                - An `ArrayValue` wrapping the new root node of the updated logical plan.
+                - A tuple of strings representing the unique column IDs generated for
+                  each expression in the assignments.
+        """
         named_exprs = [
             nodes.ColumnDef(expr, ids.ColumnId.unique()) for expr in assignments
         ]
         # TODO: Push this to rewrite later to go from block expression to planning form
-        # TODO: Jointly fragmentize expressions to more efficiently reuse common sub-expressions
-        fragments = tuple(
-            itertools.chain.from_iterable(
-                expression_factoring.fragmentize_expression(expr)
-                for expr in named_exprs
-            )
-        )
+        new_root = expression_factoring.apply_col_exprs_to_plan(self.node, named_exprs)
+
         target_ids = tuple(named_expr.id for named_expr in named_exprs)
-        new_root = expression_factoring.push_into_tree(self.node, fragments, target_ids)
         return (ArrayValue(new_root), target_ids)
 
+    def compute_general_reduction(
+        self,
+        assignments: Sequence[ex.Expression],
+        by_column_ids: typing.Sequence[str] = (),
+        *,
+        dropna: bool = False,
+    ):
+        """
+        Applies arbitrary aggregation expressions to the block, optionally grouped by keys.
+
+        This method handles reduction operations (e.g., sum, mean, count) that collapse
+        multiple input rows into a single scalar value per group. If grouping keys are
+        provided, the operation is performed per group; otherwise, it is a global reduction.
+
+        Note: Intermediate aggregations (those that are inputs to further aggregations)
+        must be windowizable. Notably excluded are approx quantile, top count ops.
+
+        Args:
+            assignments (Sequence[ex.Expression]): A sequence of aggregation expressions
+                to be calculated.
+            by_column_ids (typing.Sequence[str], optional): A sequence of column IDs
+                to use as grouping keys. Defaults to an empty tuple (global reduction).
+            dropna (bool, optional): If True, rows containing null values in the
+                `by_column_ids` columns will be filtered out before the reduction
+                is applied. Defaults to False.
+
+        Returns:
+            ArrayValue:
+                The new root node representing the aggregation/group-by result.
+        """
+        plan = self.node
+
+        # shortcircuit to keep things simple if all aggs are simple
+        # TODO: Fully unify paths once rewriters are strong enough to simplify complexity from full path
+        def _is_direct_agg(agg_expr):
+            return isinstance(agg_expr, agg_expressions.Aggregation) and all(
+                isinstance(child, (ex.DerefOp, ex.ScalarConstantExpression))
+                for child in agg_expr.children
+            )
+
+        if all(_is_direct_agg(agg) for agg in assignments):
+            agg_defs = tuple((agg, ids.ColumnId.unique()) for agg in assignments)
+            return ArrayValue(
+                nodes.AggregateNode(
+                    child=self.node,
+                    aggregations=agg_defs,  # type: ignore
+                    by_column_ids=tuple(map(ex.deref, by_column_ids)),
+                    dropna=dropna,
+                )
+            )
+
+        if dropna:
+            for col_id in by_column_ids:
+                plan = nodes.FilterNode(plan, ops.notnull_op.as_expr(col_id))
+
+        named_exprs = [
+            nodes.ColumnDef(expr, ids.ColumnId.unique()) for expr in assignments
+        ]
+        # TODO: Push this to rewrite later to go from block expression to planning form
+        new_root = expression_factoring.apply_agg_exprs_to_plan(
+            plan, named_exprs, grouping_keys=[ex.deref(by) for by in by_column_ids]
+        )
+        return ArrayValue(new_root)
 
     def project_to_id(self, expression: ex.Expression):
         array_val, ids = self.compute_values(
             [expression],
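The two methods added above defer whole expression trees into the logical plan instead of materializing intermediate columns. A minimal usage sketch follows; the import paths and the pre-existing `ArrayValue` named `arr` (with columns "x" and "key") are illustrative assumptions, not part of this commit:

    # Sketch only: import paths below are assumptions about the bigframes layout.
    from bigframes.core import agg_expressions
    from bigframes.core import expression as ex
    import bigframes.operations as ops
    import bigframes.operations.aggregations as agg_ops

    # `arr` is assumed to be an existing ArrayValue with columns "x" and "key".

    # Length-preserving expression: x + 1. Returns the new ArrayValue and one
    # generated column ID per input expression.
    arr2, (plus_one_id,) = arr.compute_general_expression(
        [ops.add_op.as_expr("x", ex.const(1))]
    )

    # Composite reduction: mean(x) - min(x) per "key". The aggregations are
    # nested inside a scalar expression, so _is_direct_agg is False and the
    # apply_agg_exprs_to_plan path is taken.
    spread = ops.sub_op.as_expr(
        agg_expressions.UnaryAggregation(agg_ops.mean_op, ex.deref("x")),
        agg_expressions.UnaryAggregation(agg_ops.min_op, ex.deref("x")),
    )
    arr3 = arr.compute_general_reduction([spread], by_column_ids=["key"], dropna=True)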

bigframes/core/block_transforms.py

Lines changed: 45 additions & 80 deletions
@@ -129,12 +129,12 @@ def quantile(
             window_spec=window,
         )
         quantile_cols.append(quantile_col)
-    block, _ = block.aggregate(
-        grouping_column_ids,
+    block = block.aggregate(
         tuple(
             agg_expressions.UnaryAggregation(agg_ops.AnyValueOp(), ex.deref(col))
             for col in quantile_cols
         ),
+        grouping_column_ids,
         column_labels=pd.Index(labels),
         dropna=dropna,
     )
@@ -358,12 +358,12 @@ def value_counts(
     if grouping_keys and drop_na:
         # only need this if grouping_keys is involved, otherwise the drop_na in the aggregation will handle it for us
         block = dropna(block, columns, how="any")
-    block, agg_ids = block.aggregate(
-        by_column_ids=(*grouping_keys, *columns),
+    block = block.aggregate(
         aggregations=[agg_expressions.NullaryAggregation(agg_ops.size_op)],
+        by_column_ids=(*grouping_keys, *columns),
         dropna=drop_na and not grouping_keys,
     )
-    count_id = agg_ids[0]
+    count_id = block.value_columns[0]
     if normalize:
         unbound_window = windows.unbound(grouping_keys=tuple(grouping_keys))
         block, total_count_id = block.apply_window_op(
@@ -621,40 +621,28 @@ def skew(
     original_columns = skew_column_ids
     column_labels = block.select_columns(original_columns).column_labels
 
-    block, delta3_ids = _mean_delta_to_power(
-        block, 3, original_columns, grouping_column_ids
-    )
     # counts, moment3 for each column
     aggregations = []
-    for i, col in enumerate(original_columns):
+    for col in original_columns:
+        delta3_expr = _mean_delta_to_power(3, col)
         count_agg = agg_expressions.UnaryAggregation(
             agg_ops.count_op,
             ex.deref(col),
         )
         moment3_agg = agg_expressions.UnaryAggregation(
             agg_ops.mean_op,
-            ex.deref(delta3_ids[i]),
+            delta3_expr,
         )
         variance_agg = agg_expressions.UnaryAggregation(
             agg_ops.PopVarOp(),
             ex.deref(col),
         )
-        aggregations.extend([count_agg, moment3_agg, variance_agg])
+        skew_expr = _skew_from_moments_and_count(count_agg, moment3_agg, variance_agg)
+        aggregations.append(skew_expr)
 
-    block, agg_ids = block.aggregate(
-        by_column_ids=grouping_column_ids, aggregations=aggregations
+    block = block.aggregate(
+        aggregations, grouping_column_ids, column_labels=column_labels
     )
-
-    skew_ids = []
-    for i, col in enumerate(original_columns):
-        # Corresponds to order of aggregations in preceding loop
-        count_id, moment3_id, var_id = agg_ids[i * 3 : (i * 3) + 3]
-        block, skew_id = _skew_from_moments_and_count(
-            block, count_id, moment3_id, var_id
-        )
-        skew_ids.append(skew_id)
-
-    block = block.select_columns(skew_ids).with_column_labels(column_labels)
     if not grouping_column_ids:
         # When ungrouped, transpose result row into a series
        # perform transpose last, so as to not invalidate cache
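An aside on the math: the skew path now folds `_mean_delta_to_power` and `_skew_from_moments_and_count` into a single deferred aggregation expression (see the helper rewrites further below). The G1 estimator that expression encodes can be checked against pandas with a small plain-Python sketch; this is review-side verification code, not part of the commit, and it assumes pandas is installed:

    import math

    import pandas as pd

    def g1_skew(xs):
        # G1 sample skewness, the same formula the expression tree encodes:
        # (m3 / m2**(3/2)) * sqrt(n * (n - 1)) / (n - 2), with NA below 3 points.
        n = len(xs)
        if n < 3:
            return None
        mean = sum(xs) / n
        m2 = sum((x - mean) ** 2 for x in xs) / n  # population variance (PopVarOp)
        m3 = sum((x - mean) ** 3 for x in xs) / n  # mean of _mean_delta_to_power(3, col)
        return (m3 / m2 ** (3 / 2)) * math.sqrt(n * (n - 1)) / (n - 2)

    xs = [1.0, 2.0, 4.0, 8.0]
    assert abs(g1_skew(xs) - pd.Series(xs).skew()) < 1e-9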
@@ -671,36 +659,23 @@ def kurt(
 ) -> blocks.Block:
     original_columns = skew_column_ids
     column_labels = block.select_columns(original_columns).column_labels
-
-    block, delta4_ids = _mean_delta_to_power(
-        block, 4, original_columns, grouping_column_ids
-    )
     # counts, moment4 for each column
-    aggregations = []
-    for i, col in enumerate(original_columns):
+    kurt_exprs = []
+    for col in original_columns:
+        delta_4_expr = _mean_delta_to_power(4, col)
         count_agg = agg_expressions.UnaryAggregation(agg_ops.count_op, ex.deref(col))
-        moment4_agg = agg_expressions.UnaryAggregation(
-            agg_ops.mean_op, ex.deref(delta4_ids[i])
-        )
+        moment4_agg = agg_expressions.UnaryAggregation(agg_ops.mean_op, delta_4_expr)
         variance_agg = agg_expressions.UnaryAggregation(
             agg_ops.PopVarOp(), ex.deref(col)
         )
-        aggregations.extend([count_agg, moment4_agg, variance_agg])
-
-    block, agg_ids = block.aggregate(
-        by_column_ids=grouping_column_ids, aggregations=aggregations
-    )
 
-    kurt_ids = []
-    for i, col in enumerate(original_columns):
         # Corresponds to order of aggregations in preceding loop
-        count_id, moment4_id, var_id = agg_ids[i * 3 : (i * 3) + 3]
-        block, kurt_id = _kurt_from_moments_and_count(
-            block, count_id, moment4_id, var_id
-        )
-        kurt_ids.append(kurt_id)
+        kurt_expr = _kurt_from_moments_and_count(count_agg, moment4_agg, variance_agg)
+        kurt_exprs.append(kurt_expr)
 
-    block = block.select_columns(kurt_ids).with_column_labels(column_labels)
+    block = block.aggregate(
+        kurt_exprs, grouping_column_ids, column_labels=column_labels
+    )
     if not grouping_column_ids:
         # When ungrouped, transpose result row into a series
         # perform transpose last, so as to not invalidate cache
@@ -711,38 +686,30 @@ def kurt(
 
 
 def _mean_delta_to_power(
-    block: blocks.Block,
     n_power: int,
-    column_ids: typing.Sequence[str],
-    grouping_column_ids: typing.Sequence[str],
-) -> typing.Tuple[blocks.Block, typing.Sequence[str]]:
+    val_id: str,
+) -> ex.Expression:
     """Calculate (x-mean(x))^n. Useful for calculating moment statistics such as skew and kurtosis."""
-    window = windows.unbound(grouping_keys=tuple(grouping_column_ids))
-    block, mean_ids = block.multi_apply_window_op(column_ids, agg_ops.mean_op, window)
-    delta_ids = []
-    for val_id, mean_val_id in zip(column_ids, mean_ids):
-        delta = ops.sub_op.as_expr(val_id, mean_val_id)
-        delta_power = ops.pow_op.as_expr(delta, ex.const(n_power))
-        block, delta_power_id = block.project_expr(delta_power)
-        delta_ids.append(delta_power_id)
-    return block, delta_ids
+    mean_expr = agg_expressions.UnaryAggregation(agg_ops.mean_op, ex.deref(val_id))
+    delta = ops.sub_op.as_expr(val_id, mean_expr)
+    return ops.pow_op.as_expr(delta, ex.const(n_power))
 
 
 def _skew_from_moments_and_count(
-    block: blocks.Block, count_id: str, moment3_id: str, moment2_id: str
-) -> typing.Tuple[blocks.Block, str]:
+    count: ex.Expression, moment3: ex.Expression, moment2: ex.Expression
+) -> ex.Expression:
     # Calculate skew using count, third moment and population variance
     # See G1 estimator:
     # https://en.wikipedia.org/wiki/Skewness#Sample_skewness
     moments_estimator = ops.div_op.as_expr(
-        moment3_id, ops.pow_op.as_expr(moment2_id, ex.const(3 / 2))
+        moment3, ops.pow_op.as_expr(moment2, ex.const(3 / 2))
     )
 
-    countminus1 = ops.sub_op.as_expr(count_id, ex.const(1))
-    countminus2 = ops.sub_op.as_expr(count_id, ex.const(2))
+    countminus1 = ops.sub_op.as_expr(count, ex.const(1))
+    countminus2 = ops.sub_op.as_expr(count, ex.const(2))
     adjustment = ops.div_op.as_expr(
         ops.unsafe_pow_op.as_expr(
-            ops.mul_op.as_expr(count_id, countminus1), ex.const(1 / 2)
+            ops.mul_op.as_expr(count, countminus1), ex.const(1 / 2)
         ),
         countminus2,
     )
@@ -751,14 +718,14 @@ def _skew_from_moments_and_count(
 
     # Need to produce NA if have less than 3 data points
     cleaned_skew = ops.where_op.as_expr(
-        skew, ops.ge_op.as_expr(count_id, ex.const(3)), ex.const(None)
+        skew, ops.ge_op.as_expr(count, ex.const(3)), ex.const(None)
    )
-    return block.project_expr(cleaned_skew)
+    return cleaned_skew
 
 
 def _kurt_from_moments_and_count(
-    block: blocks.Block, count_id: str, moment4_id: str, moment2_id: str
-) -> typing.Tuple[blocks.Block, str]:
+    count: ex.Expression, moment4: ex.Expression, moment2: ex.Expression
+) -> ex.Expression:
     # Kurtosis is often defined as the second standardized moment: moment(4)/moment(2)**2
     # Pandas however uses Fisher’s estimator, implemented below
     # numerator = (count + 1) * (count - 1) * moment4
@@ -767,28 +734,26 @@ def _kurt_from_moments_and_count(
     # kurtosis = (numerator / denominator) - adjustment
 
     numerator = ops.mul_op.as_expr(
-        moment4_id,
+        moment4,
         ops.mul_op.as_expr(
-            ops.sub_op.as_expr(count_id, ex.const(1)),
-            ops.add_op.as_expr(count_id, ex.const(1)),
+            ops.sub_op.as_expr(count, ex.const(1)),
+            ops.add_op.as_expr(count, ex.const(1)),
         ),
     )
 
     # Denominator
-    countminus2 = ops.sub_op.as_expr(count_id, ex.const(2))
-    countminus3 = ops.sub_op.as_expr(count_id, ex.const(3))
+    countminus2 = ops.sub_op.as_expr(count, ex.const(2))
+    countminus3 = ops.sub_op.as_expr(count, ex.const(3))
 
     # Denominator
     denominator = ops.mul_op.as_expr(
-        ops.unsafe_pow_op.as_expr(moment2_id, ex.const(2)),
+        ops.unsafe_pow_op.as_expr(moment2, ex.const(2)),
         ops.mul_op.as_expr(countminus2, countminus3),
     )
 
     # Adjustment
     adj_num = ops.mul_op.as_expr(
-        ops.unsafe_pow_op.as_expr(
-            ops.sub_op.as_expr(count_id, ex.const(1)), ex.const(2)
-        ),
+        ops.unsafe_pow_op.as_expr(ops.sub_op.as_expr(count, ex.const(1)), ex.const(2)),
         ex.const(3),
     )
     adj_denom = ops.mul_op.as_expr(countminus2, countminus3)
@@ -799,9 +764,9 @@ def _kurt_from_moments_and_count(
 
     # Need to produce NA if have less than 4 data points
     cleaned_kurt = ops.where_op.as_expr(
-        kurt, ops.ge_op.as_expr(count_id, ex.const(4)), ex.const(None)
+        kurt, ops.ge_op.as_expr(count, ex.const(4)), ex.const(None)
     )
-    return block.project_expr(cleaned_kurt)
+    return cleaned_kurt
 
 
 def align(
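Likewise, the Fisher kurtosis formula spelled out in the comments and code of `_kurt_from_moments_and_count` can be sanity-checked against pandas in plain Python (again a review-side sketch assuming pandas, not commit code):

    import pandas as pd

    def fisher_kurt(xs):
        # Fisher's adjusted kurtosis, following the commented formula:
        #   numerator   = (n + 1) * (n - 1) * m4
        #   denominator = m2**2 * (n - 2) * (n - 3)
        #   adjustment  = 3 * (n - 1)**2 / ((n - 2) * (n - 3))
        # with NA below 4 data points.
        n = len(xs)
        if n < 4:
            return None
        mean = sum(xs) / n
        m2 = sum((x - mean) ** 2 for x in xs) / n  # population variance
        m4 = sum((x - mean) ** 4 for x in xs) / n  # fourth central moment
        numerator = (n + 1) * (n - 1) * m4
        denominator = m2**2 * (n - 2) * (n - 3)
        adjustment = 3 * (n - 1) ** 2 / ((n - 2) * (n - 3))
        return numerator / denominator - adjustment

    xs = [1.0, 2.0, 4.0, 8.0, 16.0]
    assert abs(fisher_kurt(xs) - pd.Series(xs).kurt()) < 1e-9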
