From 77c9061e66a54640251a16d65b85cc1deb5fdc2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 26 Jun 2025 13:12:55 -0500 Subject: [PATCH 1/5] chore: add benchmarks for read_gbq_colab --- tests/benchmark/.gitignore | 5 ++ .../read_gbq_colab/aggregate_output.py | 69 +++++++++++++++++++ tests/benchmark/read_gbq_colab/config.jsonl | 10 +++ tests/benchmark/read_gbq_colab/dry_run.py | 48 +++++++++++++ .../benchmark/read_gbq_colab/filter_output.py | 60 ++++++++++++++++ tests/benchmark/read_gbq_colab/first_page.py | 53 ++++++++++++++ .../first_page.py_percentile_99.error | 0 tests/benchmark/read_gbq_colab/last_page.py | 54 +++++++++++++++ tests/benchmark/read_gbq_colab/sort_output.py | 64 +++++++++++++++++ tests/benchmark/utils.py | 3 + 10 files changed, 366 insertions(+) create mode 100644 tests/benchmark/.gitignore create mode 100644 tests/benchmark/read_gbq_colab/aggregate_output.py create mode 100644 tests/benchmark/read_gbq_colab/config.jsonl create mode 100644 tests/benchmark/read_gbq_colab/dry_run.py create mode 100644 tests/benchmark/read_gbq_colab/filter_output.py create mode 100644 tests/benchmark/read_gbq_colab/first_page.py create mode 100644 tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error create mode 100644 tests/benchmark/read_gbq_colab/last_page.py create mode 100644 tests/benchmark/read_gbq_colab/sort_output.py diff --git a/tests/benchmark/.gitignore b/tests/benchmark/.gitignore new file mode 100644 index 0000000000..14b434a131 --- /dev/null +++ b/tests/benchmark/.gitignore @@ -0,0 +1,5 @@ +*.bytesprocessed +*.bq_exec_time_seconds +*.local_exec_time_seconds +*.query_char_count +*.slotmillis diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py new file mode 100644 index 0000000000..1302ea1850 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/aggregate_output.py @@ -0,0 +1,69 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def aggregate_output( + *, project_id, dataset_id, table_id, session: bigframes.session.Session +): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Simulate getting the first page, since we'll always do that first in the UI. + df.shape + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + + group_column = "col_int64_1" + if group_column not in df.columns: + group_column = "col_bool_0" + + # Simulate the user aggregating by a column and visualizing those results + df_aggregated = ( + df.assign(rounded=df[group_column].astype("Int64").round(-9)) + .groupby("rounded") + .sum() + ) + + df_aggregated.shape + next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE))) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + aggregate_output, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/config.jsonl b/tests/benchmark/read_gbq_colab/config.jsonl new file mode 100644 index 0000000000..53f2779151 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/config.jsonl @@ -0,0 +1,10 @@ +{"benchmark_suffix": "percentile_09", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_09", "ordered": false} +{"benchmark_suffix": "percentile_19", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_19", "ordered": false} +{"benchmark_suffix": "percentile_29", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_29", "ordered": false} +{"benchmark_suffix": "percentile_39", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_39", "ordered": false} +{"benchmark_suffix": "percentile_49", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_49", "ordered": false} +{"benchmark_suffix": "percentile_59", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_59", "ordered": false} +{"benchmark_suffix": "percentile_69", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_69", "ordered": false} +{"benchmark_suffix": "percentile_79", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_79", "ordered": false} +{"benchmark_suffix": "percentile_89", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_89", "ordered": false} +{"benchmark_suffix": "percentile_99", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_99", "ordered": false} diff --git a/tests/benchmark/read_gbq_colab/dry_run.py b/tests/benchmark/read_gbq_colab/dry_run.py new file mode 100644 index 0000000000..c2de1b7cc4 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/dry_run.py @@ -0,0 +1,48 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + + +def dry_run(*, project_id, dataset_id, table_id, session: bigframes.session.Session): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}", + dry_run=True, + ) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + dry_run, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py new file mode 100644 index 0000000000..7945d9f0c6 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -0,0 +1,60 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def filter_output( + *, project_id, dataset_id, table_id, session: bigframes.session.Session +): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Simulate getting the first page, since we'll always do that first in the UI. + df.shape + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + + # Simulate the user filtering by a column and visualizing those results + df_filtered = df[df["col_bool_0"]] + df_filtered.shape + next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE))) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + filter_output, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py new file mode 100644 index 0000000000..2df9990d22 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -0,0 +1,53 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def first_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Get number of rows (to calculate number of pages) and the first page. + df.shape + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + first_page, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error b/tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py new file mode 100644 index 0000000000..ad785a29e8 --- /dev/null +++ b/tests/benchmark/read_gbq_colab/last_page.py @@ -0,0 +1,54 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def last_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Get number of rows (to calculate number of pages) and then all pages. + df.shape + for _ in df.to_pandas_batches(page_size=PAGE_SIZE): + pass + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + last_page, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py new file mode 100644 index 0000000000..997de5683d --- /dev/null +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -0,0 +1,64 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils + +import bigframes.session + +PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE + + +def sort_output( + *, project_id, dataset_id, table_id, session: bigframes.session.Session +): + # TODO(tswast): Support alternative query if table_id is a local DataFrame, + # e.g. "{local_inline}" or "{local_large}" + df = session._read_gbq_colab( + f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" + ) + + # Simulate getting the first page, since we'll always do that first in the UI. + df.shape + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + + # Simulate the user sorting by a column and visualizing those results + sort_column = "col_int64_1" + if sort_column not in df.columns: + sort_column = "col_bool_0" + + df_sorted = df.sort_values(sort_column) + df_sorted.shape + next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE))) + + +if __name__ == "__main__": + ( + project_id, + dataset_id, + table_id, + session, + suffix, + ) = utils.get_configuration(include_table_id=True) + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + sort_output, + current_path, + suffix, + project_id=project_id, + dataset_id=dataset_id, + table_id=table_id, + session=session, + ) diff --git a/tests/benchmark/utils.py b/tests/benchmark/utils.py index 887d54dba2..48357ddde7 100644 --- a/tests/benchmark/utils.py +++ b/tests/benchmark/utils.py @@ -17,6 +17,8 @@ import bigframes +READ_GBQ_COLAB_PAGE_SIZE = 100 + def get_configuration(include_table_id=False): parser = argparse.ArgumentParser() @@ -94,6 +96,7 @@ def _str_to_bool(value): def _initialize_session(ordered: bool): + # TODO(tswast): add a flag to enable the polars semi-executor. context = bigframes.BigQueryOptions( location="US", ordering_mode="strict" if ordered else "partial" ) From b14171c4f8bdde2dcf2287bce23bfa0815087a11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 26 Jun 2025 13:15:54 -0500 Subject: [PATCH 2/5] correct project id --- tests/benchmark/read_gbq_colab/config.jsonl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/benchmark/read_gbq_colab/config.jsonl b/tests/benchmark/read_gbq_colab/config.jsonl index 53f2779151..6f1ddf4a5f 100644 --- a/tests/benchmark/read_gbq_colab/config.jsonl +++ b/tests/benchmark/read_gbq_colab/config.jsonl @@ -1,10 +1,10 @@ -{"benchmark_suffix": "percentile_09", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_09", "ordered": false} -{"benchmark_suffix": "percentile_19", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_19", "ordered": false} -{"benchmark_suffix": "percentile_29", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_29", "ordered": false} -{"benchmark_suffix": "percentile_39", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_39", "ordered": false} -{"benchmark_suffix": "percentile_49", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_49", "ordered": false} -{"benchmark_suffix": "percentile_59", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_59", "ordered": false} -{"benchmark_suffix": "percentile_69", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_69", "ordered": false} -{"benchmark_suffix": "percentile_79", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_79", "ordered": false} -{"benchmark_suffix": "percentile_89", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_89", "ordered": false} -{"benchmark_suffix": "percentile_99", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_99", "ordered": false} +{"benchmark_suffix": "percentile_09", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_09", "ordered": false} +{"benchmark_suffix": "percentile_19", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_19", "ordered": false} +{"benchmark_suffix": "percentile_29", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_29", "ordered": false} +{"benchmark_suffix": "percentile_39", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_39", "ordered": false} +{"benchmark_suffix": "percentile_49", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_49", "ordered": false} +{"benchmark_suffix": "percentile_59", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_59", "ordered": false} +{"benchmark_suffix": "percentile_69", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_69", "ordered": false} +{"benchmark_suffix": "percentile_79", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_79", "ordered": false} +{"benchmark_suffix": "percentile_89", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_89", "ordered": false} +{"benchmark_suffix": "percentile_99", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_99", "ordered": false} From 615a76a07056094352569ac707fc8e580a887037 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 26 Jun 2025 13:20:40 -0500 Subject: [PATCH 3/5] exclude error too --- tests/benchmark/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/benchmark/.gitignore b/tests/benchmark/.gitignore index 14b434a131..f1bf042bf7 100644 --- a/tests/benchmark/.gitignore +++ b/tests/benchmark/.gitignore @@ -1,5 +1,6 @@ *.bytesprocessed *.bq_exec_time_seconds +*.error *.local_exec_time_seconds *.query_char_count *.slotmillis From 1c963c3f4150565404e5a40a1b587aa3412638a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 26 Jun 2025 13:21:24 -0500 Subject: [PATCH 4/5] Delete tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error --- tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error diff --git a/tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error b/tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error deleted file mode 100644 index e69de29bb2..0000000000 From 8a36a3212c6c74d8388219119c4503b4388d83f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 27 Jun 2025 15:14:21 -0500 Subject: [PATCH 5/5] explain column selection for groupby --- tests/benchmark/read_gbq_colab/aggregate_output.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py index 1302ea1850..b612e2998c 100644 --- a/tests/benchmark/read_gbq_colab/aggregate_output.py +++ b/tests/benchmark/read_gbq_colab/aggregate_output.py @@ -33,6 +33,9 @@ def aggregate_output( df.shape next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + # To simulate very small rows that can only fit a boolean, + # some tables don't have an integer column. If an integer column is available, + # we prefer to group by that to get a more realistic number of groups. group_column = "col_int64_1" if group_column not in df.columns: group_column = "col_bool_0"