Merge branch 'main' into polars_semi

TrevorBergeron · web-flow · commit 9d172864a9f6 · 2025-06-20T10:05:40.000-07:00
diff --git a/bigframes/_config/display_options.py b/bigframes/_config/display_options.py
@@ -29,7 +29,7 @@ class DisplayOptions:
     max_columns: int = 20
     max_rows: int = 25
     progress_bar: Optional[str] = "auto"
-    repr_mode: Literal["head", "deferred"] = "head"
+    repr_mode: Literal["head", "deferred", "anywidget"] = "head"
 
     max_info_columns: int = 100
     max_info_rows: Optional[int] = 200000
diff --git a/bigframes/core/compile/googlesql/query.py b/bigframes/core/compile/googlesql/query.py
@@ -83,7 +83,13 @@ def _select_field(self, field) -> SelectExpression:
             return SelectExpression(expression=expr.ColumnExpression(name=field))
 
         else:
-            alias = field[1] if (field[0] != field[1]) else None
+            alias = (
+                expr.AliasExpression(field[1])
+                if isinstance(field[1], str)
+                else field[1]
+                if (field[0] != field[1])
+                else None
+            )
             return SelectExpression(
                 expression=expr.ColumnExpression(name=field[0]), alias=alias
             )
diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py
@@ -125,10 +125,7 @@ def _compile_result_node(self, root: nodes.ResultNode) -> str:
             (name, scalar_compiler.compile_scalar_expression(ref))
             for ref, name in root.output_cols
         )
-        # Skip squashing selections to ensure the right ordering and limit keys
-        sqlglot_ir = self.compile_node(root.child).select(
-            selected_cols, squash_selections=False
-        )
+        sqlglot_ir = self.compile_node(root.child).select(selected_cols)
 
         if root.order_by is not None:
             ordering_cols = tuple(
diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py
@@ -203,7 +203,6 @@ def from_union(
     def select(
         self,
         selected_cols: tuple[tuple[str, sge.Expression], ...],
-        squash_selections: bool = True,
     ) -> SQLGlotIR:
         selections = [
             sge.Alias(
@@ -213,15 +212,6 @@ def select(
             for id, expr in selected_cols
         ]
 
-        # If squashing is enabled, we try to simplify the selections
-        # by checking if the new selections are simply aliases of the
-        # original columns.
-        if squash_selections:
-            new_selections = _squash_selections(self.expr.expressions, selections)
-            if new_selections != []:
-                new_expr = self.expr.select(*new_selections, append=False)
-                return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
-
         new_expr = self._encapsulate_as_cte().select(*selections, append=False)
         return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
 
@@ -361,63 +351,3 @@ def _table(table: bigquery.TableReference) -> sge.Table:
         db=sg.to_identifier(table.dataset_id, quoted=True),
         catalog=sg.to_identifier(table.project, quoted=True),
     )
-
-
-def _squash_selections(
-    old_expr: list[sge.Expression], new_expr: list[sge.Alias]
-) -> list[sge.Alias]:
-    """
-    TODO: Reanble this function to optimize the SQL.
-    Simplifies the select column expressions if existing (old_expr) and
-    new (new_expr) selected columns are both simple aliases of column definitions.
-
-    Example:
-    old_expr: [A AS X, B AS Y]
-    new_expr: [X AS P, Y AS Q]
-    Result:   [A AS P, B AS Q]
-    """
-    old_alias_map: typing.Dict[str, str] = {}
-    for selected in old_expr:
-        column_alias_pair = _get_column_alias_pair(selected)
-        if column_alias_pair is None:
-            return []
-        else:
-            old_alias_map[column_alias_pair[1]] = column_alias_pair[0]
-
-    new_selected_cols: typing.List[sge.Alias] = []
-    for selected in new_expr:
-        column_alias_pair = _get_column_alias_pair(selected)
-        if column_alias_pair is None or column_alias_pair[0] not in old_alias_map:
-            return []
-        else:
-            new_alias_expr = sge.Alias(
-                this=sge.ColumnDef(
-                    this=sge.to_identifier(
-                        old_alias_map[column_alias_pair[0]], quoted=True
-                    )
-                ),
-                alias=sg.to_identifier(column_alias_pair[1], quoted=True),
-            )
-            new_selected_cols.append(new_alias_expr)
-    return new_selected_cols
-
-
-def _get_column_alias_pair(
-    expr: sge.Expression,
-) -> typing.Optional[typing.Tuple[str, str]]:
-    """Checks if an expression is a simple alias of a column definition
-    (e.g., "column_name AS alias_name").
-    If it is, returns a tuple containing the alias name and original column name.
-    Returns `None` otherwise.
-    """
-    if not isinstance(expr, sge.Alias):
-        return None
-    if not isinstance(expr.this, sge.ColumnDef):
-        return None
-
-    column_def_expr: sge.ColumnDef = expr.this
-    if not isinstance(column_def_expr.this, sge.Identifier):
-        return None
-
-    original_identifier: sge.Identifier = column_def_expr.this
-    return (original_identifier.this, expr.alias)
diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py
@@ -251,7 +251,9 @@ def __repr__(self) -> str:
         # metadata, like we do with DataFrame.
         opts = bigframes.options.display
         max_results = opts.max_rows
-        if opts.repr_mode == "deferred":
+        # anywidget mode uses the same display logic as the "deferred" mode
+        # for faster execution
+        if opts.repr_mode in ("deferred", "anywidget"):
             _, dry_run_query_job = self._block._compute_dry_run()
             return formatter.repr_query_job(dry_run_query_job)
 
diff --git a/bigframes/core/rewrite/pruning.py b/bigframes/core/rewrite/pruning.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import dataclasses
 import functools
-from typing import AbstractSet
+import typing
 
 from bigframes.core import identifiers, nodes
 
@@ -143,7 +143,7 @@ def prune_selection_child(
 
 def prune_node(
     node: nodes.BigFrameNode,
-    ids: AbstractSet[identifiers.ColumnId],
+    ids: typing.AbstractSet[identifiers.ColumnId],
 ):
     # This clause is important, ensures idempotency, so can reach fixed point
     if not (set(node.ids) - ids):
@@ -157,7 +157,7 @@ def prune_node(
 
 def prune_aggregate(
     node: nodes.AggregateNode,
-    used_cols: AbstractSet[identifiers.ColumnId],
+    used_cols: typing.AbstractSet[identifiers.ColumnId],
 ) -> nodes.AggregateNode:
     pruned_aggs = (
         tuple(agg for agg in node.aggregations if agg[1] in used_cols)
@@ -169,15 +169,15 @@ def prune_aggregate(
 @functools.singledispatch
 def prune_leaf(
     node: nodes.BigFrameNode,
-    used_cols: AbstractSet[identifiers.ColumnId],
+    used_cols: typing.AbstractSet[identifiers.ColumnId],
 ):
     ...
 
 
 @prune_leaf.register
 def prune_readlocal(
     node: nodes.ReadLocalNode,
-    selection: AbstractSet[identifiers.ColumnId],
+    selection: typing.AbstractSet[identifiers.ColumnId],
 ) -> nodes.ReadLocalNode:
     new_scan_list = node.scan_list.filter_cols(selection)
     return dataclasses.replace(
@@ -190,7 +190,7 @@ def prune_readlocal(
 @prune_leaf.register
 def prune_readtable(
     node: nodes.ReadTableNode,
-    selection: AbstractSet[identifiers.ColumnId],
+    selection: typing.AbstractSet[identifiers.ColumnId],
 ) -> nodes.ReadTableNode:
     new_scan_list = node.scan_list.filter_cols(selection)
     return dataclasses.replace(node, scan_list=new_scan_list)
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -725,7 +725,9 @@ def __repr__(self) -> str:
 
         opts = bigframes.options.display
         max_results = opts.max_rows
-        if opts.repr_mode == "deferred":
+        # anywidget mode uses the same display logic as the "deferred" mode
+        # for faster execution
+        if opts.repr_mode in ("deferred", "anywidget"):
             return formatter.repr_query_job(self._compute_dry_run())
 
         # TODO(swast): pass max_columns and get the true column count back. Maybe
@@ -774,6 +776,23 @@ def _repr_html_(self) -> str:
         if opts.repr_mode == "deferred":
             return formatter.repr_query_job(self._compute_dry_run())
 
+        if opts.repr_mode == "anywidget":
+            import anywidget  # type: ignore
+
+            # create an iterator for the data batches
+            batches = self.to_pandas_batches()
+
+            # get the first page result
+            try:
+                first_page = next(iter(batches))
+            except StopIteration:
+                first_page = pandas.DataFrame(columns=self.columns)
+
+            # Instantiate and return the widget. The widget's frontend will
+            # handle the display of the table and pagination
+            return anywidget.AnyWidget(dataframe=first_page)
+
+        self._cached()
         df = self.copy()
         if bigframes.options.display.blob_display:
             blob_cols = [
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -430,7 +430,9 @@ def __repr__(self) -> str:
         # metadata, like we do with DataFrame.
         opts = bigframes.options.display
         max_results = opts.max_rows
-        if opts.repr_mode == "deferred":
+        # anywidget mode uses the same display logic as the "deferred" mode
+        # for faster execution
+        if opts.repr_mode in ("deferred", "anywidget"):
             return formatter.repr_query_job(self._compute_dry_run())
 
         self._cached()
diff --git a/mypy.ini b/mypy.ini
@@ -41,3 +41,6 @@ ignore_missing_imports = True
 
 [mypy-google.cloud.bigtable]
 ignore_missing_imports = True
+
+[mypy-anywidget]
+ignore_missing_imports = True
diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb
@@ -0,0 +1,149 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d10bfca4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copyright 2025 Google LLC\n",
+    "#\n",
+    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "#     https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "acca43ae",
+   "metadata": {},
+   "source": [
+    "# Demo to Show Anywidget mode"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "ca22f059",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import bigframes.pandas as bpd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "04406a4d",
+   "metadata": {},
+   "source": [
+    "Set the display option to use anywidget"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "1bc5aaf3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bpd.options.display.repr_mode = \"anywidget\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0a354c69",
+   "metadata": {},
+   "source": [
+    "Display the dataframe in anywidget mode"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "f289d250",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job 91997f19-1768-4360-afa7-4a431b3e2d22 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:91997f19-1768-4360-afa7-4a431b3e2d22&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computation deferred. Computation will process 171.4 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df = bpd.read_gbq(\"bigquery-public-data.usa_names.usa_1910_2013\")\n",
+    "print(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3a73e472",
+   "metadata": {},
+   "source": [
+    "Display Series in anywidget mode"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "42bb02ab",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computation deferred. Computation will process 171.4 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_series = df[\"year\"]\n",
+    "print(test_series)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/noxfile.py b/noxfile.py
diff --git a/setup.py b/setup.py
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py