From 553368ee41194811a0f5c43fcfbf1d028dacf1b4 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 8 Sep 2025 17:21:50 +0000 Subject: [PATCH 1/2] chore: investigate b/440407542 --- bigframes/bigquery/_operations/search.py | 4 +- bigframes/core/blocks.py | 11 ++ bigframes/core/global_session.py | 2 + bigframes/dataframe.py | 10 + bigframes/formatting_helpers.py | 2 - bigframes/perf_inspect.py | 41 ++++ bigframes/session/__init__.py | 4 +- bigframes/session/bq_caching_executor.py | 3 + bigframes/session/clients.py | 4 + bigframes/session/executor.py | 2 + notebooks/perf_inspect.ipynb | 230 +++++++++++++++++++++++ 11 files changed, 307 insertions(+), 6 deletions(-) create mode 100644 bigframes/perf_inspect.py create mode 100644 notebooks/perf_inspect.ipynb diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py index c16c2af1a9..b819cf5b93 100644 --- a/bigframes/bigquery/_operations/search.py +++ b/bigframes/bigquery/_operations/search.py @@ -87,7 +87,8 @@ def create_vector_index( read_gbq_query(sql) - +import bigframes.perf_inspect as perf_inspect +@perf_inspect.runtime_logger def vector_search( base_table: str, column_to_search: str, @@ -246,5 +247,4 @@ def vector_search( df.index.names = index_labels else: df = query._session.read_gbq_query(sql, allow_large_results=allow_large_results) - return df diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index d62173b7d6..0f2782a4f6 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -41,6 +41,7 @@ Union, ) import warnings +import bigframes.perf_inspect as perf_inspect import bigframes_vendored.constants as constants import google.cloud.bigquery as bigquery @@ -940,6 +941,8 @@ def split( ] return [sliced_block.drop_columns(drop_cols) for sliced_block in sliced_blocks] + + @perf_inspect.runtime_logger def _compute_dry_run( self, value_keys: Optional[Iterable[str]] = None, @@ -1629,6 +1632,7 @@ def slice( # Using cache to optimize for Jupyter Notebook's behavior where both '__repr__' # and '__repr_html__' are called in a single display action, reducing redundant # queries. + @perf_inspect.runtime_logger @functools.cache def retrieve_repr_request_results( self, max_results: int @@ -1646,6 +1650,8 @@ def retrieve_repr_request_results( array_value=self.expr, config=executors.CacheConfig(optimize_for="head", if_cached="reuse-strict"), ) + import time + start_time = time.monotonic() head_result = self.session._executor.execute( self.expr.slice(start=None, stop=max_results, step=None), execution_spec.ExecutionSpec( @@ -1653,6 +1659,8 @@ def retrieve_repr_request_results( ordered=True, ), ) + print("Time taken to execute head: {:.2f} seconds".format(time.monotonic() - start_time)) + start_time = time.monotonic() row_count = self.session._executor.execute( self.expr.row_count(), execution_spec.ExecutionSpec( @@ -1660,8 +1668,11 @@ def retrieve_repr_request_results( ordered=False, ), ).to_py_scalar() + print("Time taken to execute row_count: {:.2f} seconds".format(time.monotonic() - start_time)) + start_time = time.monotonic() head_df = head_result.to_pandas() + print("Time taken to execute to_pandas: {:.2f} seconds".format(time.monotonic() - start_time)) return self._copy_index_to_pandas(head_df), row_count, head_result.query_job def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: diff --git a/bigframes/core/global_session.py b/bigframes/core/global_session.py index 4698e4c4c5..b5d7aea4cd 100644 --- a/bigframes/core/global_session.py +++ b/bigframes/core/global_session.py @@ -83,6 +83,8 @@ def close_session() -> None: bigframes._config.options.bigquery._session_started = False +import bigframes.perf_inspect as perf_inspect +@perf_inspect.runtime_logger def get_global_session(): """Gets the global session. diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c65bbdd2c8..1729da18d2 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -54,6 +54,8 @@ import pyarrow import tabulate +import bigframes.perf_inspect as perf_inspect + import bigframes._config.display_options as display_options import bigframes.constants import bigframes.core @@ -116,6 +118,8 @@ class DataFrame(vendored_pandas_frame.DataFrame): # Must be above 5000 for pandas to delegate to bigframes for binops __pandas_priority__ = 15000 + import bigframes.perf_inspect as perf_inspect + @perf_inspect.runtime_logger def __init__( self, data=None, @@ -725,6 +729,7 @@ def __setattr__(self, key: str, value): else: object.__setattr__(self, key, value) + @perf_inspect.runtime_logger def __repr__(self) -> str: """Converts a DataFrame to a string. Calls to_pandas. @@ -777,6 +782,7 @@ def __repr__(self) -> str: lines.append(f"[{row_count} rows x {column_count} columns]") return "\n".join(lines) + @perf_inspect.runtime_logger def _repr_html_(self) -> str: """ Returns an html string primarily for use by notebooks for displaying @@ -1703,6 +1709,9 @@ def to_pandas( ) -> pandas.Series: ... + + import bigframes.perf_inspect as perf_inspect + @perf_inspect.runtime_logger def to_pandas( self, max_download_size: Optional[int] = None, @@ -1887,6 +1896,7 @@ def to_pandas_batches( allow_large_results=allow_large_results, ) + @perf_inspect.runtime_logger def _compute_dry_run(self) -> bigquery.QueryJob: _, query_job = self._block._compute_dry_run() return query_job diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 48afb4fdbd..72503a4f16 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -90,7 +90,6 @@ def repr_query_job_html(query_job: Optional[bigquery.QueryJob]): table_html += "" return widgets.HTML(table_html) - def repr_query_job(query_job: Optional[bigquery.QueryJob]): """Return query job as a formatted string. Args: @@ -118,7 +117,6 @@ def repr_query_job(query_job: Optional[bigquery.QueryJob]): res += f"""{key}: {job_val}""" return res - def wait_for_query_job( query_job: bigquery.QueryJob, max_results: Optional[int] = None, diff --git a/bigframes/perf_inspect.py b/bigframes/perf_inspect.py new file mode 100644 index 0000000000..5d32a1047d --- /dev/null +++ b/bigframes/perf_inspect.py @@ -0,0 +1,41 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import time + +global_counter = 0 + +def runtime_logger(func): + """Decorator to log the runtime of a function.""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + global global_counter + global_counter += 1 + prefix = "--" * global_counter + + start_time = time.monotonic() + + print(f"|{prefix}{func.__qualname__} started at {start_time:.2f} seconds") + result = func(*args, **kwargs) + end_time = time.monotonic() + print( + f"|{prefix}{func.__qualname__} ended at {end_time:.2f} seconds. " + f"Runtime: {end_time - start_time:.2f} seconds" + ) + global_counter -= 1 + return result + + return wrapper \ No newline at end of file diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index df67e64e9e..a9d70bf313 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -130,6 +130,8 @@ class Session( An object providing client library objects. """ + import bigframes.perf_inspect as perf_inspect + @perf_inspect.runtime_logger def __init__( self, context: Optional[bigquery_options.BigQueryOptions] = None, @@ -143,7 +145,6 @@ def __init__( if context is None: context = bigquery_options.BigQueryOptions() - if context.location is None: self._location = "US" msg = bfe.format_message( @@ -184,7 +185,6 @@ def __init__( client_endpoints_override=context.client_endpoints_override, requests_transport_adapters=context.requests_transport_adapters, ) - # TODO(shobs): Remove this logic after https://github.com/ibis-project/ibis/issues/8494 # has been fixed. The ibis client changes the default query job config # so we are going to remember the current config and restore it after diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index b7412346bd..873d06ae51 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -51,6 +51,7 @@ import bigframes.session.metrics import bigframes.session.planner import bigframes.session.temporary_storage +import bigframes.perf_inspect as perf_inspect # Max complexity that should be executed as a single query QUERY_COMPLEXITY_LIMIT = 1e7 @@ -182,6 +183,8 @@ def to_sql( compiled = compile.compile_sql(compile.CompileRequest(node, sort_rows=ordered)) return compiled.sql + + @perf_inspect.runtime_logger def execute( self, array_value: bigframes.core.ArrayValue, diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index d680b94b8a..3a58d202ae 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -50,6 +50,8 @@ _BIGQUERYSTORAGE_REGIONAL_ENDPOINT = "bigquerystorage.{location}.rep.googleapis.com" +import bigframes.perf_inspect as perf_inspect +@perf_inspect.runtime_logger def _get_default_credentials_with_project(): return pydata_google_auth.default(scopes=_SCOPES, use_local_webserver=False) @@ -72,6 +74,8 @@ def _get_application_names(): class ClientsProvider: """Provides client instances necessary to perform cloud operations.""" + import bigframes.perf_inspect as perf_inspect + @perf_inspect.runtime_logger def __init__( self, project: Optional[str] = None, diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index d0cfe5f4f7..bd5bbf8bd5 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -30,6 +30,7 @@ import bigframes.core.schema import bigframes.session._io.pandas as io_pandas import bigframes.session.execution_spec as ex_spec +import bigframes.perf_inspect as perf_inspect _ROW_LIMIT_EXCEEDED_TEMPLATE = ( "Execution has downloaded {result_rows} rows so far, which exceeds the " @@ -150,6 +151,7 @@ def to_sql( raise NotImplementedError("to_sql not implemented for this executor") @abc.abstractmethod + @perf_inspect.runtime_logger def execute( self, array_value: bigframes.core.ArrayValue, diff --git a/notebooks/perf_inspect.ipynb b/notebooks/perf_inspect.ipynb new file mode 100644 index 0000000000..b14f47e56a --- /dev/null +++ b/notebooks/perf_inspect.ipynb @@ -0,0 +1,230 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b782db87", + "metadata": {}, + "source": [ + "# BigFrames Performance Analytical Report" + ] + }, + { + "cell_type": "markdown", + "id": "9a853901", + "metadata": {}, + "source": [ + "## Why BigFrames is slow (b/440407542)" + ] + }, + { + "cell_type": "markdown", + "id": "e685b19c", + "metadata": {}, + "source": [ + "- Add `@perf_inspect.runtime_logger` to the call methods, such as" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c8d1d068", + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "import bigframes.bigquery as bbq\n", + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d0d32114", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|--DataFrame.__init__ started at 2118257.94 seconds\n", + "|----get_global_session started at 2118257.94 seconds\n", + "|------Session.__init__ started at 2118257.94 seconds\n", + "|--------ClientsProvider.__init__ started at 2118257.94 seconds\n", + "|----------_get_default_credentials_with_project started at 2118257.94 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/chelsealin/src/bigframes1/bigframes/session/__init__.py:2287: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " return Session(context)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|----------_get_default_credentials_with_project ended at 2118259.89 seconds. Runtime: 1.95 seconds\n", + "|--------ClientsProvider.__init__ ended at 2118259.94 seconds. Runtime: 2.00 seconds\n", + "|------Session.__init__ ended at 2118260.40 seconds. Runtime: 2.47 seconds\n", + "|----get_global_session ended at 2118260.40 seconds. Runtime: 2.47 seconds\n", + "|----DataFrame.__init__ started at 2118260.42 seconds\n", + "|----DataFrame.__init__ ended at 2118260.42 seconds. Runtime: 0.00 seconds\n", + "|--DataFrame.__init__ ended at 2118260.42 seconds. Runtime: 2.49 seconds\n", + "Runtime: 2.49 seconds\n" + ] + } + ], + "source": [ + "start_time = time.monotonic()\n", + "search_query = bpd.DataFrame({\"query_id\": [\"dog\", \"cat\"],\n", + " \"embedding\": [[1.0, 2.0], [3.0, 5.2]]})\n", + "print(\"Runtime: {:.2f} seconds\".format(time.monotonic() - start_time))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1f6b8a02", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|--vector_search started at 2118260.43 seconds\n", + "|----DataFrame.__init__ started at 2118260.44 seconds\n", + "|----DataFrame.__init__ ended at 2118260.44 seconds. Runtime: 0.00 seconds\n", + "|----DataFrame.__init__ started at 2118261.18 seconds\n", + "|----DataFrame.__init__ ended at 2118261.18 seconds. Runtime: 0.00 seconds\n", + "|----DataFrame.__init__ started at 2118261.18 seconds\n", + "|----DataFrame.__init__ ended at 2118261.18 seconds. Runtime: 0.00 seconds\n", + "|--vector_search ended at 2118261.18 seconds. Runtime: 0.74 seconds\n", + "|--DataFrame.__init__ started at 2118261.18 seconds\n", + "|--DataFrame.__init__ ended at 2118261.18 seconds. Runtime: 0.00 seconds\n", + "Runtime: 0.74 seconds\n" + ] + } + ], + "source": [ + "start_time = time.monotonic()\n", + "df = bbq.vector_search(\n", + " base_table=\"bigframes-dev.bigframes_tests_sys.base_table\",\n", + " column_to_search=\"my_embedding\",\n", + " query=search_query,\n", + " top_k=2\n", + ").sort_values(\"id\")\n", + "print(\"Runtime: {:.2f} seconds\".format(time.monotonic() - start_time))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3c105ddd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|--DataFrame.to_pandas started at 2118261.19 seconds\n", + "|----BigQueryCachingExecutor.execute started at 2118261.19 seconds\n", + "|----BigQueryCachingExecutor.execute ended at 2118261.45 seconds. Runtime: 0.26 seconds\n", + "|--DataFrame.to_pandas ended at 2118261.46 seconds. Runtime: 0.27 seconds\n", + " query_id embedding id my_embedding distance\n", + "0 dog [1. 2.] 1 [1. 2.] 0.0\n", + "1 cat [3. 5.2] 2 [2. 4.] 1.56205\n", + "0 dog [1. 2.] 4 [1. 3.2] 1.2\n", + "1 cat [3. 5.2] 5 [5. 5.4] 2.009975\n", + "Runtime: 0.29 seconds\n" + ] + } + ], + "source": [ + "start_time = time.monotonic()\n", + "print(df.to_pandas())\n", + "print(\"Runtime: {:.2f} seconds\".format(time.monotonic() - start_time))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f13bd234", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|--DataFrame.__repr__ started at 2118261.49 seconds\n", + "|----Block.retrieve_repr_request_results started at 2118261.49 seconds\n", + "|------BigQueryCachingExecutor.execute started at 2118261.49 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 9325d063-b110-423a-aa40-e8ad1ee57466 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|------BigQueryCachingExecutor.execute ended at 2118267.25 seconds. Runtime: 5.75 seconds\n", + "|------BigQueryCachingExecutor.execute started at 2118267.25 seconds\n", + "|------BigQueryCachingExecutor.execute ended at 2118267.64 seconds. Runtime: 0.39 seconds\n", + "Time taken to execute head: 0.39 seconds\n", + "|------BigQueryCachingExecutor.execute started at 2118267.64 seconds\n", + "|------BigQueryCachingExecutor.execute ended at 2118267.64 seconds. Runtime: 0.00 seconds\n", + "Time taken to execute row_count: 0.00 seconds\n", + "Time taken to execute to_pandas: 0.00 seconds\n", + "|----Block.retrieve_repr_request_results ended at 2118267.65 seconds. Runtime: 6.15 seconds\n", + "|--DataFrame.__repr__ ended at 2118267.65 seconds. Runtime: 6.16 seconds\n", + " query_id embedding id my_embedding distance\n", + "0 dog [1. 2.] 1 [1. 2.] 0.0\n", + "1 cat [3. 5.2] 2 [2. 4.] 1.56205\n", + "0 dog [1. 2.] 4 [1. 3.2] 1.2\n", + "1 cat [3. 5.2] 5 [5. 5.4] 2.009975\n", + "\n", + "[4 rows x 5 columns]\n", + "Runtime: 6.16 seconds\n" + ] + } + ], + "source": [ + "start_time = time.monotonic()\n", + "print(df)\n", + "print(\"Runtime: {:.2f} seconds\".format(time.monotonic() - start_time))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 76e2f9bebe310ffd0f72dcdc7540c6bfb99f95eb Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 9 Sep 2025 22:05:22 +0000 Subject: [PATCH 2/2] add more runtime logger for materalizing dataframe (df.__repr__) --- bigframes/core/blocks.py | 5 +- .../compile/ibis_compiler/ibis_compiler.py | 2 + bigframes/core/tree_properties.py | 1 + bigframes/session/_io/bigquery/__init__.py | 2 + bigframes/session/anonymous_dataset.py | 2 +- bigframes/session/bigquery_session.py | 2 + bigframes/session/bq_caching_executor.py | 21 ++ notebooks/perf_inspect.ipynb | 212 ++++++++++++------ 8 files changed, 182 insertions(+), 65 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 0f2782a4f6..4490acb151 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -41,6 +41,7 @@ Union, ) import warnings +import time import bigframes.perf_inspect as perf_inspect import bigframes_vendored.constants as constants @@ -1646,11 +1647,12 @@ def retrieve_repr_request_results( # head caches full underlying expression, so row_count will be free after executor = self.session._executor + start_time = time.monotonic() executor.cached( array_value=self.expr, config=executors.CacheConfig(optimize_for="head", if_cached="reuse-strict"), ) - import time + print("Time taken to cache table: {:.2f} seconds".format(time.monotonic() - start_time)) start_time = time.monotonic() head_result = self.session._executor.execute( self.expr.slice(start=None, stop=max_results, step=None), @@ -2707,6 +2709,7 @@ def to_placeholder_table( self._view_ref = self.session._create_temp_view(sql) return self._view_ref + @perf_inspect.runtime_logger def cached(self, *, force: bool = False, session_aware: bool = False) -> None: """Write the block to a session table.""" # use a heuristic for whether something needs to be cached diff --git a/bigframes/core/compile/ibis_compiler/ibis_compiler.py b/bigframes/core/compile/ibis_compiler/ibis_compiler.py index ff0441ea22..1590b884d8 100644 --- a/bigframes/core/compile/ibis_compiler/ibis_compiler.py +++ b/bigframes/core/compile/ibis_compiler/ibis_compiler.py @@ -32,11 +32,13 @@ import bigframes.core.nodes as nodes import bigframes.core.ordering as bf_ordering import bigframes.core.rewrite as rewrites +import bigframes.perf_inspect as perf_inspect if typing.TYPE_CHECKING: import bigframes.core +@perf_inspect.runtime_logger def compile_sql(request: configs.CompileRequest) -> configs.CompileResult: output_names = tuple((expression.DerefOp(id), id.sql) for id in request.node.ids) result_node = nodes.ResultNode( diff --git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py index baf4b12566..4c7fee9120 100644 --- a/bigframes/core/tree_properties.py +++ b/bigframes/core/tree_properties.py @@ -16,6 +16,7 @@ import functools import itertools from typing import Callable, Dict, Optional, Sequence +import bigframes.perf_inspect as perf_inspect import bigframes.core.nodes as nodes diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 83f63e8b9a..90a1a3f36c 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -35,6 +35,7 @@ import bigframes.core.sql import bigframes.formatting_helpers as formatting_helpers import bigframes.session.metrics +import bigframes.perf_inspect as perf_inspect CHECK_DRIVE_PERMISSIONS = "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions." @@ -116,6 +117,7 @@ def table_ref_to_sql(table: bigquery.TableReference) -> str: return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`" +@perf_inspect.runtime_logger def create_temp_table( bqclient: bigquery.Client, table_ref: bigquery.TableReference, diff --git a/bigframes/session/anonymous_dataset.py b/bigframes/session/anonymous_dataset.py index ec624d4eb4..ec61f6d0b2 100644 --- a/bigframes/session/anonymous_dataset.py +++ b/bigframes/session/anonymous_dataset.py @@ -84,7 +84,7 @@ def create_temp_table( self.allocate_temp_table(), expiration, schema=schema, - cluster_columns=list(cluster_cols), + # cluster_columns=list(cluster_cols), kms_key=self._kms_key, ) return bigquery.TableReference.from_string(table) diff --git a/bigframes/session/bigquery_session.py b/bigframes/session/bigquery_session.py index 883087df07..8e42eb6335 100644 --- a/bigframes/session/bigquery_session.py +++ b/bigframes/session/bigquery_session.py @@ -24,6 +24,7 @@ from bigframes.core.compile import googlesql from bigframes.session import temporary_storage +import bigframes.perf_inspect as perf_inspect KEEPALIVE_QUERY_TIMEOUT_SECONDS = 5.0 @@ -49,6 +50,7 @@ def __init__(self, bqclient: bigquery.Client, location: str): def location(self): return self._location + @perf_inspect.runtime_logger def create_temp_table( self, schema: Sequence[bigquery.SchemaField], cluster_cols: Sequence[str] = [] ) -> bigquery.TableReference: diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 873d06ae51..7409e6fa87 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -25,6 +25,7 @@ import google.cloud.bigquery.job as bq_job import google.cloud.bigquery.table as bq_table import google.cloud.bigquery_storage_v1 +import bigframes.perf_inspect as perf_inspect import bigframes from bigframes import exceptions as bfe @@ -80,6 +81,7 @@ def __init__(self): def mapping(self) -> Mapping[nodes.BigFrameNode, nodes.BigFrameNode]: return self._cached_executions + @perf_inspect.runtime_logger def cache_results_table( self, original_root: nodes.BigFrameNode, @@ -340,6 +342,7 @@ def dry_run( query_job = self.bqclient.query(sql, job_config=job_config) return query_job + @perf_inspect.runtime_logger def cached( self, array_value: bigframes.core.ArrayValue, *, config: executor.CacheConfig ) -> None: @@ -373,6 +376,7 @@ def cached( ) # Helpers + @perf_inspect.runtime_logger def _run_execute_query( self, sql: str, @@ -392,6 +396,7 @@ def _run_execute_query( job_config.labels["bigframes-mode"] = "unordered" try: + print(sql) # Trick the type checker into thinking we got a literal. if query_with_job: return bq_io.start_query_with_client( @@ -477,6 +482,7 @@ def _cache_with_cluster_cols( execution_spec=execution_spec, ) + @perf_inspect.runtime_logger def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue): """Executes the query and uses the resulting table to rewrite future executions.""" execution_spec = ex_spec.ExecutionSpec( @@ -586,6 +592,7 @@ def _upload_local_data(self, local_table: local_data.ManagedArrowTable): ) self.cache.cache_remote_replacement(local_table, uploaded) + @perf_inspect.runtime_logger def _execute_plan_gbq( self, plan: nodes.BigFrameNode, @@ -619,6 +626,9 @@ def _execute_plan_gbq( else: cluster_cols = cache_spec.cluster_cols + + import time + start_time = time.monotonic() compiled = compile.compile_sql( compile.CompileRequest( plan, @@ -627,38 +637,49 @@ def _execute_plan_gbq( materialize_all_order_keys=(cache_spec is not None), ) ) + print("Runtime to compile SQL: {:.2f} seconds".format(time.monotonic() - start_time)) + # might have more columns than og schema, for hidden ordering columns compiled_schema = compiled.sql_schema destination_table: Optional[bigquery.TableReference] = None + start_time = time.monotonic() job_config = bigquery.QueryJobConfig() if create_table: destination_table = self.storage_manager.create_temp_table( compiled_schema, cluster_cols ) job_config.destination = destination_table + # print("Runtime to create temp table: {:.2f} seconds".format(time.monotonic() - start_time)) + + start_time = time.monotonic() iterator, query_job = self._run_execute_query( sql=compiled.sql, job_config=job_config, query_with_job=(destination_table is not None), ) + # print("Runtime to run execute query: {:.2f} seconds".format(time.monotonic() - start_time)) + start_time = time.monotonic() table_info: Optional[bigquery.Table] = None if query_job and query_job.destination: table_info = self.bqclient.get_table(query_job.destination) size_bytes = table_info.num_bytes else: size_bytes = None + print("Runtime to get table size: {:.2f} seconds".format(time.monotonic() - start_time)) # we could actually cache even when caching is not explicitly requested, but being conservative for now if cache_spec is not None: assert table_info is not None assert compiled.row_order is not None + start_time = time.monotonic() self.cache.cache_results_table( og_plan, table_info, compiled.row_order, num_rows=table_info.num_rows ) + # print("Runtime to cache results table: {:.2f} seconds".format(time.monotonic() - start_time)) if size_bytes is not None and size_bytes >= MAX_SMALL_RESULT_BYTES: msg = bfe.format_message( diff --git a/notebooks/perf_inspect.ipynb b/notebooks/perf_inspect.ipynb index b14f47e56a..b766f7e912 100644 --- a/notebooks/perf_inspect.ipynb +++ b/notebooks/perf_inspect.ipynb @@ -24,6 +24,119 @@ "- Add `@perf_inspect.runtime_logger` to the call methods, such as" ] }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b4c10000", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|--DataFrame.__init__ started at 2216348.59 seconds\n", + "|----get_global_session started at 2216348.59 seconds\n", + "|------Session.__init__ started at 2216348.59 seconds\n", + "|--------ClientsProvider.__init__ started at 2216348.59 seconds\n", + "|----------_get_default_credentials_with_project started at 2216348.59 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/chelsealin/src/bigframes1/bigframes/session/__init__.py:2278: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " return Session(context)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|----------_get_default_credentials_with_project ended at 2216349.97 seconds. Runtime: 1.38 seconds\n", + "|--------ClientsProvider.__init__ ended at 2216350.03 seconds. Runtime: 1.44 seconds\n", + "|------Session.__init__ ended at 2216350.26 seconds. Runtime: 1.67 seconds\n", + "|----get_global_session ended at 2216350.26 seconds. Runtime: 1.67 seconds\n", + "|----DataFrame.__init__ started at 2216350.27 seconds\n", + "|----DataFrame.__init__ ended at 2216350.27 seconds. Runtime: 0.00 seconds\n", + "|--DataFrame.__init__ ended at 2216350.27 seconds. Runtime: 1.69 seconds\n", + "|--vector_search started at 2216350.27 seconds\n", + "|----DataFrame.__init__ started at 2216350.27 seconds\n", + "|----DataFrame.__init__ ended at 2216350.27 seconds. Runtime: 0.00 seconds\n", + "|----compile_sql started at 2216350.28 seconds\n", + "|----compile_sql ended at 2216350.29 seconds. Runtime: 0.01 seconds\n", + "|----DataFrame.__init__ started at 2216350.55 seconds\n", + "|----DataFrame.__init__ ended at 2216350.55 seconds. Runtime: 0.00 seconds\n", + "|----DataFrame.__init__ started at 2216350.55 seconds\n", + "|----DataFrame.__init__ ended at 2216350.56 seconds. Runtime: 0.00 seconds\n", + "|--vector_search ended at 2216350.56 seconds. Runtime: 0.28 seconds\n", + "|--DataFrame.__init__ started at 2216350.56 seconds\n", + "|--DataFrame.__init__ ended at 2216350.56 seconds. Runtime: 0.00 seconds\n", + "\n", + "--- Materializing dataframe ---\n", + "\n", + "|--DataFrame.to_pandas started at 2216350.56 seconds\n", + "|----BigQueryCachingExecutor.execute started at 2216350.56 seconds\n", + "|------BigQueryCachingExecutor._execute_plan_gbq started at 2216350.56 seconds\n", + "|--------compile_sql started at 2216350.56 seconds\n", + "|--------compile_sql ended at 2216350.57 seconds. Runtime: 0.01 seconds\n", + "Runtime to compile SQL: 0.01 seconds\n", + "|--------BigQueryCachingExecutor._run_execute_query started at 2216350.57 seconds\n", + "SELECT\n", + "`bigframes_unnamed_index` AS `bigframes_unnamed_index`,\n", + "`query_id` AS `query_id`,\n", + "`embedding` AS `embedding`,\n", + "`id` AS `id`,\n", + "`my_embedding` AS `my_embedding`,\n", + "`distance` AS `distance`\n", + "FROM\n", + "(SELECT\n", + " *\n", + "FROM (\n", + " SELECT\n", + " *\n", + " FROM UNNEST(ARRAY, `id` INT64, `my_embedding` ARRAY, `distance` FLOAT64, `bfuid_col_3` INT64>>[STRUCT(0, 'dog', [1.0, 2.0], 1, [1.0, 2.0], 0.0, 0), STRUCT(0, 'dog', [1.0, 2.0], 4, [1.0, 3.2], 1.2000000000000002, 1), STRUCT(1, 'cat', [3.0, 5.2], 2, [2.0, 4.0], 1.562049935181331, 2), STRUCT(1, 'cat', [3.0, 5.2], 5, [5.0, 5.4], 2.009975124224178, 3)]) AS `bigframes_unnamed_index`\n", + ") AS `t0`)\n", + "ORDER BY `id` ASC NULLS LAST ,`bigframes_unnamed_index` ASC NULLS LAST ,`bfuid_col_3` ASC NULLS LAST\n", + "|--------BigQueryCachingExecutor._run_execute_query ended at 2216350.80 seconds. Runtime: 0.23 seconds\n", + "Runtime to get table size: 0.00 seconds\n", + "|------BigQueryCachingExecutor._execute_plan_gbq ended at 2216350.80 seconds. Runtime: 0.25 seconds\n", + "|----BigQueryCachingExecutor.execute ended at 2216350.80 seconds. Runtime: 0.25 seconds\n", + "|--DataFrame.to_pandas ended at 2216350.81 seconds. Runtime: 0.25 seconds\n", + " query_id embedding id my_embedding distance\n", + "0 dog [1. 2.] 1 [1. 2.] 0.0\n", + "1 cat [3. 5.2] 2 [2. 4.] 1.56205\n", + "0 dog [1. 2.] 4 [1. 3.2] 1.2\n", + "1 cat [3. 5.2] 5 [5. 5.4] 2.009975\n", + "Runtime: 0.26 seconds\n" + ] + } + ], + "source": [ + "import bigframes.pandas as bpd\n", + "import bigframes.bigquery as bbq\n", + "import bigframes\n", + "import time\n", + "\n", + "# bigframes.options.bigquery.ordering_mode = \"partial\"\n", + "\n", + "search_query = bpd.DataFrame({\"query_id\": [\"dog\", \"cat\"],\n", + " \"embedding\": [[1.0, 2.0], [3.0, 5.2]]})\n", + "\n", + "df = bbq.vector_search(\n", + " base_table=\"bigframes-dev.bigframes_tests_sys.base_table\",\n", + " column_to_search=\"my_embedding\",\n", + " query=search_query,\n", + " top_k=2\n", + ").sort_values(\"id\")\n", + "\n", + "print(\"\\n--- Materializing dataframe ---\\n\")\n", + "start_time = time.monotonic()\n", + "print(df.to_pandas())\n", + "# print(df)\n", + "print(\"Runtime: {:.2f} seconds\".format(time.monotonic() - start_time))" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -46,18 +159,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "|--DataFrame.__init__ started at 2118257.94 seconds\n", - "|----get_global_session started at 2118257.94 seconds\n", - "|------Session.__init__ started at 2118257.94 seconds\n", - "|--------ClientsProvider.__init__ started at 2118257.94 seconds\n", - "|----------_get_default_credentials_with_project started at 2118257.94 seconds\n" + "|--DataFrame.__init__ started at 2124418.65 seconds\n", + "|----get_global_session started at 2124418.66 seconds\n", + "|------Session.__init__ started at 2124418.66 seconds\n", + "|--------ClientsProvider.__init__ started at 2124418.66 seconds\n", + "|----------_get_default_credentials_with_project started at 2124418.66 seconds\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/chelsealin/src/bigframes1/bigframes/session/__init__.py:2287: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + "/usr/local/google/home/chelsealin/src/bigframes1/bigframes/session/__init__.py:2278: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", " return Session(context)\n" ] }, @@ -65,14 +178,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "|----------_get_default_credentials_with_project ended at 2118259.89 seconds. Runtime: 1.95 seconds\n", - "|--------ClientsProvider.__init__ ended at 2118259.94 seconds. Runtime: 2.00 seconds\n", - "|------Session.__init__ ended at 2118260.40 seconds. Runtime: 2.47 seconds\n", - "|----get_global_session ended at 2118260.40 seconds. Runtime: 2.47 seconds\n", - "|----DataFrame.__init__ started at 2118260.42 seconds\n", - "|----DataFrame.__init__ ended at 2118260.42 seconds. Runtime: 0.00 seconds\n", - "|--DataFrame.__init__ ended at 2118260.42 seconds. Runtime: 2.49 seconds\n", - "Runtime: 2.49 seconds\n" + "|----------_get_default_credentials_with_project ended at 2124420.07 seconds. Runtime: 1.41 seconds\n", + "|--------ClientsProvider.__init__ ended at 2124420.12 seconds. Runtime: 1.46 seconds\n", + "|------Session.__init__ ended at 2124420.37 seconds. Runtime: 1.72 seconds\n", + "|----get_global_session ended at 2124420.37 seconds. Runtime: 1.72 seconds\n", + "|----DataFrame.__init__ started at 2124420.38 seconds\n", + "|----DataFrame.__init__ ended at 2124420.38 seconds. Runtime: 0.00 seconds\n", + "|--DataFrame.__init__ ended at 2124420.38 seconds. Runtime: 1.73 seconds\n", + "Runtime: 1.73 seconds\n" ] } ], @@ -93,17 +206,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "|--vector_search started at 2118260.43 seconds\n", - "|----DataFrame.__init__ started at 2118260.44 seconds\n", - "|----DataFrame.__init__ ended at 2118260.44 seconds. Runtime: 0.00 seconds\n", - "|----DataFrame.__init__ started at 2118261.18 seconds\n", - "|----DataFrame.__init__ ended at 2118261.18 seconds. Runtime: 0.00 seconds\n", - "|----DataFrame.__init__ started at 2118261.18 seconds\n", - "|----DataFrame.__init__ ended at 2118261.18 seconds. Runtime: 0.00 seconds\n", - "|--vector_search ended at 2118261.18 seconds. Runtime: 0.74 seconds\n", - "|--DataFrame.__init__ started at 2118261.18 seconds\n", - "|--DataFrame.__init__ ended at 2118261.18 seconds. Runtime: 0.00 seconds\n", - "Runtime: 0.74 seconds\n" + "|--vector_search started at 2124420.40 seconds\n", + "|----DataFrame.__init__ started at 2124420.40 seconds\n", + "|----DataFrame.__init__ ended at 2124420.40 seconds. Runtime: 0.00 seconds\n", + "|----DataFrame.__init__ started at 2124420.79 seconds\n", + "|----DataFrame.__init__ ended at 2124420.79 seconds. Runtime: 0.00 seconds\n", + "|----DataFrame.__init__ started at 2124420.79 seconds\n", + "|----DataFrame.__init__ ended at 2124420.79 seconds. Runtime: 0.00 seconds\n", + "|--vector_search ended at 2124420.79 seconds. Runtime: 0.39 seconds\n", + "|--DataFrame.__init__ started at 2124420.79 seconds\n", + "|--DataFrame.__init__ ended at 2124420.79 seconds. Runtime: 0.00 seconds\n", + "Runtime: 0.39 seconds\n" ] } ], @@ -128,16 +241,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "|--DataFrame.to_pandas started at 2118261.19 seconds\n", - "|----BigQueryCachingExecutor.execute started at 2118261.19 seconds\n", - "|----BigQueryCachingExecutor.execute ended at 2118261.45 seconds. Runtime: 0.26 seconds\n", - "|--DataFrame.to_pandas ended at 2118261.46 seconds. Runtime: 0.27 seconds\n", + "|--DataFrame.to_pandas started at 2124420.80 seconds\n", + "|----BigQueryCachingExecutor.execute started at 2124420.80 seconds\n", + "|----BigQueryCachingExecutor.execute ended at 2124421.00 seconds. Runtime: 0.20 seconds\n", + "|--DataFrame.to_pandas ended at 2124421.01 seconds. Runtime: 0.21 seconds\n", " query_id embedding id my_embedding distance\n", "0 dog [1. 2.] 1 [1. 2.] 0.0\n", "1 cat [3. 5.2] 2 [2. 4.] 1.56205\n", "0 dog [1. 2.] 4 [1. 3.2] 1.2\n", "1 cat [3. 5.2] 5 [5. 5.4] 2.009975\n", - "Runtime: 0.29 seconds\n" + "Runtime: 0.21 seconds\n" ] } ], @@ -149,7 +262,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "f13bd234", "metadata": {}, "outputs": [ @@ -157,37 +270,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "|--DataFrame.__repr__ started at 2118261.49 seconds\n", - "|----Block.retrieve_repr_request_results started at 2118261.49 seconds\n", - "|------BigQueryCachingExecutor.execute started at 2118261.49 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 9325d063-b110-423a-aa40-e8ad1ee57466 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "|------BigQueryCachingExecutor.execute ended at 2118267.25 seconds. Runtime: 5.75 seconds\n", - "|------BigQueryCachingExecutor.execute started at 2118267.25 seconds\n", - "|------BigQueryCachingExecutor.execute ended at 2118267.64 seconds. Runtime: 0.39 seconds\n", - "Time taken to execute head: 0.39 seconds\n", - "|------BigQueryCachingExecutor.execute started at 2118267.64 seconds\n", - "|------BigQueryCachingExecutor.execute ended at 2118267.64 seconds. Runtime: 0.00 seconds\n", - "Time taken to execute row_count: 0.00 seconds\n", - "Time taken to execute to_pandas: 0.00 seconds\n", - "|----Block.retrieve_repr_request_results ended at 2118267.65 seconds. Runtime: 6.15 seconds\n", - "|--DataFrame.__repr__ ended at 2118267.65 seconds. Runtime: 6.16 seconds\n", + "|--DataFrame.__repr__ started at 2124496.45 seconds\n", + "|----Block.retrieve_repr_request_results started at 2124496.45 seconds\n", + "|----Block.retrieve_repr_request_results ended at 2124496.45 seconds. Runtime: 0.00 seconds\n", + "|--DataFrame.__repr__ ended at 2124496.47 seconds. Runtime: 0.02 seconds\n", " query_id embedding id my_embedding distance\n", "0 dog [1. 2.] 1 [1. 2.] 0.0\n", "1 cat [3. 5.2] 2 [2. 4.] 1.56205\n", @@ -195,7 +281,7 @@ "1 cat [3. 5.2] 5 [5. 5.4] 2.009975\n", "\n", "[4 rows x 5 columns]\n", - "Runtime: 6.16 seconds\n" + "Runtime: 0.02 seconds\n" ] } ],