Add read_gbq_colab benchmark scripts under tests/benchmark.

New files: tests/benchmark/.gitignore (six ignore patterns), six scripts in
tests/benchmark/read_gbq_colab/ (aggregate_output, dry_run, filter_output,
first_page, last_page, sort_output) that each issue a SELECT * query through
session._read_gbq_colab, and config.jsonl listing ten percentile_NN tables.
tests/benchmark/utils.py gains READ_GBQ_COLAB_PAGE_SIZE = 100 and a TODO.

diff --git a/tests/benchmark/.gitignore b/tests/benchmark/.gitignore
new file mode 100644
index 0000000000..f1bf042bf7
--- /dev/null
+++ b/tests/benchmark/.gitignore
@@ -0,0 +1,6 @@
+*.bytesprocessed
+*.bq_exec_time_seconds
+*.error
+*.local_exec_time_seconds
+*.query_char_count
+*.slotmillis
diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py
new file mode 100644
index 0000000000..b612e2998c
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/aggregate_output.py
@@ -0,0 +1,72 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def aggregate_output(
+    *, project_id, dataset_id, table_id, session: bigframes.session.Session
+):
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Simulate getting the first page, since we'll always do that first in the UI.
+    df.shape
+    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+
+    # To simulate very small rows that can only fit a boolean,
+    # some tables don't have an integer column. If an integer column is available,
+    # we prefer to group by that to get a more realistic number of groups.
+    group_column = "col_int64_1"
+    if group_column not in df.columns:
+        group_column = "col_bool_0"
+
+    # Simulate the user aggregating by a column and visualizing those results
+    df_aggregated = (
+        df.assign(rounded=df[group_column].astype("Int64").round(-9))
+        .groupby("rounded")
+        .sum()
+    )
+
+    df_aggregated.shape
+    next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)))
+
+
+if __name__ == "__main__":
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)
+    current_path = pathlib.Path(__file__).absolute()
+
+    utils.get_execution_time(
+        aggregate_output,
+        current_path,
+        suffix,
+        project_id=project_id,
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/config.jsonl b/tests/benchmark/read_gbq_colab/config.jsonl
new file mode 100644
index 0000000000..6f1ddf4a5f
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/config.jsonl
@@ -0,0 +1,10 @@
+{"benchmark_suffix": "percentile_09", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_09", "ordered": false}
+{"benchmark_suffix": "percentile_19", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_19", "ordered": false}
+{"benchmark_suffix": "percentile_29", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_29", "ordered": false}
+{"benchmark_suffix": "percentile_39", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_39", "ordered": false}
+{"benchmark_suffix": "percentile_49", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_49", "ordered": false}
+{"benchmark_suffix": "percentile_59", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_59", "ordered": false}
+{"benchmark_suffix": "percentile_69", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_69", "ordered": false}
+{"benchmark_suffix": "percentile_79", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_79", "ordered": false}
+{"benchmark_suffix": "percentile_89", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_89", "ordered": false}
+{"benchmark_suffix": "percentile_99", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_99", "ordered": false}
diff --git a/tests/benchmark/read_gbq_colab/dry_run.py b/tests/benchmark/read_gbq_colab/dry_run.py
new file mode 100644
index 0000000000..c2de1b7cc4
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/dry_run.py
@@ -0,0 +1,48 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+
+def dry_run(*, project_id, dataset_id, table_id, session: bigframes.session.Session):
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}",
+        dry_run=True,
+    )
+
+
+if __name__ == "__main__":
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)
+    current_path = pathlib.Path(__file__).absolute()
+
+    utils.get_execution_time(
+        dry_run,
+        current_path,
+        suffix,
+        project_id=project_id,
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py
new file mode 100644
index 0000000000..7945d9f0c6
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/filter_output.py
@@ -0,0 +1,60 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def filter_output(
+    *, project_id, dataset_id, table_id, session: bigframes.session.Session
+):
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Simulate getting the first page, since we'll always do that first in the UI.
+    df.shape
+    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+
+    # Simulate the user filtering by a column and visualizing those results
+    df_filtered = df[df["col_bool_0"]]
+    df_filtered.shape
+    next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
+
+
+if __name__ == "__main__":
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)
+    current_path = pathlib.Path(__file__).absolute()
+
+    utils.get_execution_time(
+        filter_output,
+        current_path,
+        suffix,
+        project_id=project_id,
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py
new file mode 100644
index 0000000000..2df9990d22
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/first_page.py
@@ -0,0 +1,53 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def first_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session):
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Get number of rows (to calculate number of pages) and the first page.
+    df.shape
+    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+
+
+if __name__ == "__main__":
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)
+    current_path = pathlib.Path(__file__).absolute()
+
+    utils.get_execution_time(
+        first_page,
+        current_path,
+        suffix,
+        project_id=project_id,
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py
new file mode 100644
index 0000000000..ad785a29e8
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/last_page.py
@@ -0,0 +1,54 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def last_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session):
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Get number of rows (to calculate number of pages) and then all pages.
+    df.shape
+    for _ in df.to_pandas_batches(page_size=PAGE_SIZE):
+        pass
+
+
+if __name__ == "__main__":
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)
+    current_path = pathlib.Path(__file__).absolute()
+
+    utils.get_execution_time(
+        last_page,
+        current_path,
+        suffix,
+        project_id=project_id,
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py
new file mode 100644
index 0000000000..997de5683d
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/sort_output.py
@@ -0,0 +1,64 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def sort_output(
+    *, project_id, dataset_id, table_id, session: bigframes.session.Session
+):
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Simulate getting the first page, since we'll always do that first in the UI.
+    df.shape
+    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+
+    # Simulate the user sorting by a column and visualizing those results
+    sort_column = "col_int64_1"
+    if sort_column not in df.columns:
+        sort_column = "col_bool_0"
+
+    df_sorted = df.sort_values(sort_column)
+    df_sorted.shape
+    next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE)))
+
+
+if __name__ == "__main__":
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)
+    current_path = pathlib.Path(__file__).absolute()
+
+    utils.get_execution_time(
+        sort_output,
+        current_path,
+        suffix,
+        project_id=project_id,
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/utils.py b/tests/benchmark/utils.py
index 887d54dba2..48357ddde7 100644
--- a/tests/benchmark/utils.py
+++ b/tests/benchmark/utils.py
@@ -17,6 +17,8 @@
 
 import bigframes
 
+READ_GBQ_COLAB_PAGE_SIZE = 100
+
 
 def get_configuration(include_table_id=False):
     parser = argparse.ArgumentParser()
@@ -94,6 +96,7 @@ def _str_to_bool(value):
 
 
 def _initialize_session(ordered: bool):
+    # TODO(tswast): add a flag to enable the polars semi-executor.
     context = bigframes.BigQueryOptions(
         location="US", ordering_mode="strict" if ordered else "partial"
     )