Skip to content

Commit ee64721

Browse files
perf: Improve apply axis=1 performance
1 parent 36ee4d1 commit ee64721

File tree

3 files changed

+17
-90
lines changed

3 files changed

+17
-90
lines changed

bigframes/core/blocks.py

Lines changed: 16 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
import functools
2828
import itertools
2929
import random
30-
import textwrap
3130
import typing
3231
from typing import (
3332
Iterable,
@@ -54,16 +53,13 @@
5453
from bigframes.core import agg_expressions, local_data
5554
import bigframes.core as core
5655
import bigframes.core.agg_expressions as ex_types
57-
import bigframes.core.compile.googlesql as googlesql
5856
import bigframes.core.expression as ex
5957
import bigframes.core.expression as scalars
6058
import bigframes.core.guid as guid
6159
import bigframes.core.identifiers
6260
import bigframes.core.join_def as join_defs
6361
import bigframes.core.ordering as ordering
6462
import bigframes.core.pyarrow_utils as pyarrow_utils
65-
import bigframes.core.schema as bf_schema
66-
import bigframes.core.sql as sql
6763
import bigframes.core.utils as utils
6864
import bigframes.core.window_spec as windows
6965
import bigframes.dtypes
@@ -2776,14 +2772,6 @@ def _throw_if_null_index(self, opname: str):
27762772
)
27772773

27782774
def _get_rows_as_json_values(self) -> Block:
2779-
# We want to preserve any ordering currently present before turning to
2780-
# direct SQL manipulation. We will restore the ordering when we rebuild
2781-
# expression.
2782-
# TODO(shobs): Replace direct SQL manipulation by structured expression
2783-
# manipulation
2784-
expr, ordering_column_name = self.expr.promote_offsets()
2785-
expr_sql = self.session._executor.to_sql(expr)
2786-
27872775
# Names of the columns to serialize for the row.
27882776
# We will use the repr-eval pattern to serialize a value here and
27892777
# deserialize in the cloud function. Let's make sure that would work.
@@ -2799,10 +2787,6 @@ def _get_rows_as_json_values(self) -> Block:
27992787
)
28002788

28012789
column_names.append(serialized_column_name)
2802-
column_names_csv = sql.csv(map(sql.simple_literal, column_names))
2803-
2804-
# index columns count
2805-
index_columns_count = len(self.index_columns)
28062790

28072791
# column references to form the array of values for the row
28082792
column_types = list(self.index.dtypes) + list(self.dtypes)
@@ -2811,81 +2795,30 @@ def _get_rows_as_json_values(self) -> Block:
28112795
if isinstance(type_, pd.ArrowDtype) and pa.types.is_binary(
28122796
type_.pyarrow_dtype
28132797
):
2814-
column_references.append(sql.to_json_string(col))
2798+
column_references.append(ops.ToJSONString().as_expr(col))
28152799
else:
2816-
column_references.append(sql.cast_as_string(col))
2817-
2818-
column_references_csv = sql.csv(column_references)
2819-
2820-
# types of the columns to serialize for the row
2821-
column_types_csv = sql.csv(
2822-
[sql.simple_literal(str(typ)) for typ in column_types]
2823-
)
2800+
column_references.append(
2801+
ops.AsTypeOp(bigframes.dtypes.STRING_DTYPE).as_expr(col)
2802+
)
28242803

28252804
# row dtype to use for deserializing the row as pandas series
28262805
pandas_row_dtype = bigframes.dtypes.lcd_type(*column_types)
28272806
if pandas_row_dtype is None:
28282807
pandas_row_dtype = "object"
2829-
pandas_row_dtype = sql.simple_literal(str(pandas_row_dtype))
2830-
2831-
# create a json column representing row through SQL manipulation
2832-
row_json_column_name = guid.generate_guid()
2833-
select_columns = (
2834-
[ordering_column_name] + list(self.index_columns) + [row_json_column_name]
2835-
)
2836-
select_columns_csv = sql.csv(
2837-
[googlesql.identifier(col) for col in select_columns]
2838-
)
2839-
json_sql = f"""\
2840-
With T0 AS (
2841-
{textwrap.indent(expr_sql, " ")}
2842-
),
2843-
T1 AS (
2844-
SELECT *,
2845-
TO_JSON_STRING(JSON_OBJECT(
2846-
"names", [{column_names_csv}],
2847-
"types", [{column_types_csv}],
2848-
"values", [{column_references_csv}],
2849-
"indexlength", {index_columns_count},
2850-
"dtype", {pandas_row_dtype}
2851-
)) AS {googlesql.identifier(row_json_column_name)} FROM T0
2852-
)
2853-
SELECT {select_columns_csv} FROM T1
2854-
"""
2855-
# The only way this code is used is through the df.apply(axis=1) code path
2856-
destination, query_job = self.session._loader._query_to_destination(
2857-
json_sql, cluster_candidates=[ordering_column_name]
2858-
)
2859-
if not destination:
2860-
raise ValueError(f"Query job {query_job} did not produce result table")
2861-
2862-
new_schema = (
2863-
self.expr.schema.select([*self.index_columns])
2864-
.append(
2865-
bf_schema.SchemaItem(
2866-
row_json_column_name, bigframes.dtypes.STRING_DTYPE
2867-
)
2868-
)
2869-
.append(
2870-
bf_schema.SchemaItem(ordering_column_name, bigframes.dtypes.INT_DTYPE)
2871-
)
2872-
)
2808+
pandas_row_dtype = str(pandas_row_dtype)
28732809

2874-
dest_table = self.session.bqclient.get_table(destination)
2875-
expr = core.ArrayValue.from_table(
2876-
dest_table,
2877-
schema=new_schema,
2878-
session=self.session,
2879-
offsets_col=ordering_column_name,
2880-
n_rows=dest_table.num_rows,
2881-
).drop_columns([ordering_column_name])
2882-
block = Block(
2883-
expr,
2884-
index_columns=self.index_columns,
2885-
column_labels=[row_json_column_name],
2886-
index_labels=self._index_labels,
2810+
struct_op = ops.StructOp(
2811+
column_names=("names", "types", "values", "indexlength", "dtype")
28872812
)
2888-
return block
2813+
names_val = ex.const(tuple(column_names))
2814+
types_val = ex.const(tuple(map(str, column_types)))
2815+
values_val = ops.ToArrayOp().as_expr(*self.expr.column_ids)
2816+
indexlength_val = ex.const(len(self.index_columns))
2817+
dtype_val = ex.const(str(pandas_row_dtype))
2818+
struct_expr = struct_op.as_expr(
2819+
names_val, types_val, values_val, indexlength_val, dtype_val
2820+
)
2821+
return self.project_expr(struct_expr)[0]
28892822

28902823

28912824
class BlockIndexProperties:

bigframes/operations/json_ops.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,6 @@ class ToJSONString(base_ops.UnaryOp):
107107
name: typing.ClassVar[str] = "to_json_string"
108108

109109
def output_type(self, *input_types):
110-
input_type = input_types[0]
111-
if not dtypes.is_json_like(input_type):
112-
raise TypeError(
113-
"Input type must be a valid JSON object or JSON-formatted string type."
114-
+ f" Received type: {input_type}"
115-
)
116110
return dtypes.STRING_DTYPE
117111

118112

bigframes/operations/struct_ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def output_type(self, *input_types):
4343
@dataclasses.dataclass(frozen=True)
4444
class StructOp(base_ops.NaryOp):
4545
name: typing.ClassVar[str] = "struct"
46-
column_names: tuple[str]
46+
column_names: tuple[str, ...]
4747

4848
def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
4949
num_input_types = len(input_types)

0 commit comments

Comments
 (0)