Skip to content

Commit 70d8324

Browse files
committed
report execution started/stopped in read_gbq_query
1 parent e6c3ba9 commit 70d8324

File tree

7 files changed

+558
-173
lines changed

7 files changed

+558
-173
lines changed

bigframes/core/events.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import google.cloud.bigquery.table
2626

2727
import bigframes.formatting_helpers
28+
import bigframes.session.executor
2829

2930

3031
@dataclasses.dataclass(frozen=True)
@@ -83,8 +84,9 @@ class ExecutionRunning(Event):
8384
pass
8485

8586

86-
class ExecutionStopped(Event):
87-
pass
87+
@dataclasses.dataclass(frozen=True)
88+
class ExecutionFinished(Event):
89+
result: Optional[bigframes.session.executor.ExecuteResult] = None
8890

8991

9092
@dataclasses.dataclass(frozen=True)

bigframes/formatting_helpers.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ def repr_query_job(query_job: Optional[bigquery.QueryJob]):
125125

126126
current_display: Optional[display.HTML] = None
127127
current_display_id: Optional[str] = None
128+
previous_message: str = ""
128129

129130

130131
def progress_callback(
@@ -149,24 +150,27 @@ def progress_callback(
149150
):
150151
current_display_id = str(random.random())
151152
current_display = display.HTML("Starting execution.")
152-
display.display(current_display)
153+
display.display(
154+
current_display,
155+
display_id=current_display_id,
156+
)
153157

154158
if isinstance(event, bigframes.core.events.ExecutionRunning):
155159
display.update_display(
156160
display.HTML("Execution happening."),
157161
display_id=current_display_id,
158162
)
159-
elif isinstance(event, bigframes.core.events.ExecutionStopped):
163+
elif isinstance(event, bigframes.core.events.ExecutionFinished):
160164
display.update_display(
161-
display.HTML("Execution done."),
165+
display.HTML(f"{previous_message} Execution done."),
162166
display_id=current_display_id,
163167
)
164168
elif progress_bar == "terminal":
165169
if isinstance(event, bigframes.core.events.ExecutionStarted):
166170
print("Starting execution.")
167171
elif isinstance(event, bigframes.core.events.ExecutionRunning):
168172
print("Execution happening.")
169-
elif isinstance(event, bigframes.core.events.ExecutionStopped):
173+
elif isinstance(event, bigframes.core.events.ExecutionFinished):
170174
print("Execution done.")
171175

172176

bigframes/session/__init__.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,7 @@
7575
import bigframes.functions.function as bff
7676
from bigframes.session import bigquery_session, bq_caching_executor, executor
7777
import bigframes.session._io.bigquery as bf_io_bigquery
78-
import bigframes.session.anonymous_dataset
7978
import bigframes.session.clients
80-
import bigframes.session.loader
81-
import bigframes.session.metrics
8279
import bigframes.session.validation
8380

8481
# Avoid circular imports.

bigframes/session/_io/bigquery/read_gbq_table.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@
2626
import bigframes_vendored.constants as constants
2727
import google.api_core.exceptions
2828
import google.cloud.bigquery as bigquery
29+
import google.cloud.bigquery.table
2930

30-
import bigframes.core.sql
3131
import bigframes.exceptions as bfe
3232
import bigframes.session._io.bigquery
3333

@@ -101,7 +101,7 @@ def get_table_metadata(
101101

102102
def is_time_travel_eligible(
103103
bqclient: bigquery.Client,
104-
table: bigquery.table.Table,
104+
table: google.cloud.bigquery.table.Table,
105105
columns: Optional[Sequence[str]],
106106
snapshot_time: datetime.datetime,
107107
filter_str: Optional[str] = None,
@@ -210,10 +210,8 @@ def is_time_travel_eligible(
210210

211211

212212
def infer_unique_columns(
213-
bqclient: bigquery.Client,
214-
table: bigquery.table.Table,
213+
table: google.cloud.bigquery.table.Table,
215214
index_cols: List[str],
216-
metadata_only: bool = False,
217215
) -> Tuple[str, ...]:
218216
"""Return a set of columns that can provide a unique row key or empty if none can be inferred.
219217
@@ -227,14 +225,34 @@ def infer_unique_columns(
227225
# Essentially, just reordering the primary key to match the index col order
228226
return tuple(index_col for index_col in index_cols if index_col in primary_keys)
229227

230-
if primary_keys or metadata_only or (not index_cols):
231-
# Sometimes not worth scanning data to check uniqueness
228+
if primary_keys:
232229
return primary_keys
230+
231+
return ()
232+
233+
234+
def check_if_index_columns_are_unique(
235+
bqclient: bigquery.Client,
236+
table: google.cloud.bigquery.table.Table,
237+
index_cols: List[str],
238+
) -> Tuple[str, ...]:
239+
import bigframes.core.sql
240+
import bigframes.session._io.bigquery
241+
233242
# TODO(b/337925142): Avoid a "SELECT *" subquery here by ensuring
234243
# table_expression only selects just index_cols.
235244
is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table.reference)
236245
job_config = bigquery.QueryJobConfig()
237-
results = bqclient.query_and_wait(is_unique_sql, job_config=job_config)
246+
results, _ = bigframes.session._io.bigquery.start_query_with_client(
247+
bq_client=bqclient,
248+
sql=is_unique_sql,
249+
job_config=job_config,
250+
timeout=None,
251+
location=None,
252+
project=None,
253+
metrics=None,
254+
query_with_job=False,
255+
)
238256
row = next(iter(results))
239257

240258
if row["total_count"] == row["distinct_count"]:
@@ -243,7 +261,7 @@ def infer_unique_columns(
243261

244262

245263
def _get_primary_keys(
246-
table: bigquery.table.Table,
264+
table: google.cloud.bigquery.table.Table,
247265
) -> List[str]:
248266
"""Get primary keys from table if they are set."""
249267

@@ -261,7 +279,7 @@ def _get_primary_keys(
261279

262280

263281
def _is_table_clustered_or_partitioned(
264-
table: bigquery.table.Table,
282+
table: google.cloud.bigquery.table.Table,
265283
) -> bool:
266284
"""Returns True if the table is clustered or partitioned."""
267285

@@ -284,7 +302,7 @@ def _is_table_clustered_or_partitioned(
284302

285303

286304
def get_index_cols(
287-
table: bigquery.table.Table,
305+
table: google.cloud.bigquery.table.Table,
288306
index_col: Iterable[str]
289307
| str
290308
| Iterable[int]

bigframes/session/bq_caching_executor.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import bigframes.core
3333
from bigframes.core import compile, local_data, rewrite
3434
import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir
35+
import bigframes.core.events
3536
import bigframes.core.guid
3637
import bigframes.core.identifiers
3738
import bigframes.core.nodes as nodes
@@ -187,6 +188,8 @@ def execute(
187188
array_value: bigframes.core.ArrayValue,
188189
execution_spec: ex_spec.ExecutionSpec,
189190
) -> executor.ExecuteResult:
191+
bigframes.core.events.publisher.send(bigframes.core.events.ExecutionStarted())
192+
190193
# TODO: Support export jobs in combination with semi executors
191194
if execution_spec.destination_spec is None:
192195
plan = self.prepare_plan(array_value.node, target="simplify")
@@ -195,6 +198,11 @@ def execute(
195198
plan, ordered=execution_spec.ordered, peek=execution_spec.peek
196199
)
197200
if maybe_result:
201+
bigframes.core.events.publisher.send(
202+
bigframes.core.events.ExecutionFinished(
203+
result=maybe_result,
204+
)
205+
)
198206
return maybe_result
199207

200208
if isinstance(execution_spec.destination_spec, ex_spec.TableOutputSpec):
@@ -203,7 +211,13 @@ def execute(
203211
"Ordering and peeking not supported for gbq export"
204212
)
205213
# separate path for export_gbq, as it has all sorts of annoying logic, such as possibly running as dml
206-
return self._export_gbq(array_value, execution_spec.destination_spec)
214+
result = self._export_gbq(array_value, execution_spec.destination_spec)
215+
bigframes.core.events.publisher.send(
216+
bigframes.core.events.ExecutionFinished(
217+
result=result,
218+
)
219+
)
220+
return result
207221

208222
result = self._execute_plan_gbq(
209223
array_value.node,
@@ -218,6 +232,11 @@ def execute(
218232
if isinstance(execution_spec.destination_spec, ex_spec.GcsOutputSpec):
219233
self._export_result_gcs(result, execution_spec.destination_spec)
220234

235+
bigframes.core.events.publisher.send(
236+
bigframes.core.events.ExecutionFinished(
237+
result=result,
238+
)
239+
)
221240
return result
222241

223242
def _export_result_gcs(

bigframes/session/loader.py

Lines changed: 53 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
from bigframes.core import guid, identifiers, local_data, nodes, ordering, utils
5151
import bigframes.core as core
5252
import bigframes.core.blocks as blocks
53+
import bigframes.core.events
5354
import bigframes.core.schema as schemata
5455
import bigframes.dtypes
5556
import bigframes.formatting_helpers as formatting_helpers
@@ -499,6 +500,7 @@ def read_gbq_table( # type: ignore[overload-overlap]
499500
force_total_order: Optional[bool] = ...,
500501
n_rows: Optional[int] = None,
501502
index_col_in_columns: bool = False,
503+
publish_execution: bool = True,
502504
) -> dataframe.DataFrame:
503505
...
504506

@@ -522,6 +524,7 @@ def read_gbq_table(
522524
force_total_order: Optional[bool] = ...,
523525
n_rows: Optional[int] = None,
524526
index_col_in_columns: bool = False,
527+
publish_execution: bool = True,
525528
) -> pandas.Series:
526529
...
527530

@@ -544,6 +547,7 @@ def read_gbq_table(
544547
force_total_order: Optional[bool] = None,
545548
n_rows: Optional[int] = None,
546549
index_col_in_columns: bool = False,
550+
publish_execution: bool = True,
547551
) -> dataframe.DataFrame | pandas.Series:
548552
"""Read a BigQuery table into a BigQuery DataFrames DataFrame.
549553
@@ -603,8 +607,12 @@ def read_gbq_table(
603607
when the index is selected from the data columns (e.g., in a
604608
``read_csv`` scenario). The column will be used as the
605609
DataFrame's index and removed from the list of value columns.
610+
publish_execution (bool, optional):
611+
If True, sends an execution started and stopped event if this
612+
causes a query. Set to False if using read_gbq_table from
613+
another function that is reporting execution.
606614
"""
607-
import bigframes._tools.strings
615+
import bigframes.core.events
608616
import bigframes.dataframe as dataframe
609617

610618
# ---------------------------------
@@ -768,12 +776,26 @@ def read_gbq_table(
768776
# TODO(b/338065601): Provide a way to assume uniqueness and avoid this
769777
# check.
770778
primary_key = bf_read_gbq_table.infer_unique_columns(
771-
bqclient=self._bqclient,
772779
table=table,
773780
index_cols=index_cols,
774-
# If not in strict ordering mode, don't go through the overhead of scanning index column(s) to determine if unique
775-
metadata_only=not self._scan_index_uniqueness,
776781
)
782+
783+
# If not in strict ordering mode, don't go through the overhead of scanning index column(s) to determine if unique
784+
if not primary_key and self._scan_index_uniqueness and index_cols:
785+
if publish_execution:
786+
bigframes.core.events.publisher.send(
787+
bigframes.core.events.ExecutionStarted(),
788+
)
789+
primary_key = bf_read_gbq_table.check_if_index_columns_are_unique(
790+
self._bqclient,
791+
table=table,
792+
index_cols=index_cols,
793+
)
794+
if publish_execution:
795+
bigframes.core.events.publisher.send(
796+
bigframes.core.events.ExecutionFinished(),
797+
)
798+
777799
schema = schemata.ArraySchema.from_bq_table(table)
778800
if not include_all_columns:
779801
schema = schema.select(index_cols + columns)
@@ -991,6 +1013,12 @@ def read_gbq_query(
9911013
query_job, list(columns), index_cols
9921014
)
9931015

1016+
# We want to make sure we show progress when we actually do execute a
1017+
# query. Since we have got this far, we know it's not a dry run.
1018+
bigframes.core.events.publisher.send(
1019+
bigframes.core.events.ExecutionStarted(),
1020+
)
1021+
9941022
query_job_for_metrics: Optional[bigquery.QueryJob] = None
9951023
destination: Optional[bigquery.TableReference] = None
9961024

@@ -1046,20 +1074,28 @@ def read_gbq_query(
10461074
# makes sense to download the results beyond the first page, even if
10471075
# there is a job and destination table available.
10481076
if query_job_for_metrics is None and rows is not None:
1049-
return bf_read_gbq_query.create_dataframe_from_row_iterator(
1077+
df = bf_read_gbq_query.create_dataframe_from_row_iterator(
10501078
rows,
10511079
session=self._session,
10521080
index_col=index_col,
10531081
columns=columns,
10541082
)
1083+
bigframes.core.events.publisher.send(
1084+
bigframes.core.events.ExecutionFinished(),
1085+
)
1086+
return df
10551087

10561088
# We already checked rows, so if there's no destination table, then
10571089
# there are no results to return.
10581090
if destination is None:
1059-
return bf_read_gbq_query.create_dataframe_from_query_job_stats(
1091+
df = bf_read_gbq_query.create_dataframe_from_query_job_stats(
10601092
query_job_for_metrics,
10611093
session=self._session,
10621094
)
1095+
bigframes.core.events.publisher.send(
1096+
bigframes.core.events.ExecutionFinished(),
1097+
)
1098+
return df
10631099

10641100
# If the query was DDL or DML, return some job metadata. See
10651101
# https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.statement_type
@@ -1070,10 +1106,14 @@ def read_gbq_query(
10701106
query_job_for_metrics is not None
10711107
and not bf_read_gbq_query.should_return_query_results(query_job_for_metrics)
10721108
):
1073-
return bf_read_gbq_query.create_dataframe_from_query_job_stats(
1109+
df = bf_read_gbq_query.create_dataframe_from_query_job_stats(
10741110
query_job_for_metrics,
10751111
session=self._session,
10761112
)
1113+
bigframes.core.events.publisher.send(
1114+
bigframes.core.events.ExecutionFinished(),
1115+
)
1116+
return df
10771117

10781118
# Speed up counts by getting counts from result metadata.
10791119
if rows is not None:
@@ -1083,16 +1123,21 @@ def read_gbq_query(
10831123
else:
10841124
n_rows = None
10851125

1086-
return self.read_gbq_table(
1126+
df = self.read_gbq_table(
10871127
f"{destination.project}.{destination.dataset_id}.{destination.table_id}",
10881128
index_col=index_col,
10891129
columns=columns,
10901130
use_cache=configuration["query"]["useQueryCache"],
10911131
force_total_order=force_total_order,
10921132
n_rows=n_rows,
1133+
publish_execution=False,
10931134
# max_results and filters are omitted because they are already
10941135
# handled by to_query(), above.
10951136
)
1137+
bigframes.core.events.publisher.send(
1138+
bigframes.core.events.ExecutionFinished(),
1139+
)
1140+
return df
10961141

10971142
def _query_to_destination(
10981143
self,

0 commit comments

Comments
 (0)