Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions tests/benchmark/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
*.bytesprocessed
*.bq_exec_time_seconds
*.error
*.local_exec_time_seconds
*.query_char_count
*.slotmillis
72 changes: 72 additions & 0 deletions tests/benchmark/read_gbq_colab/aggregate_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib

import benchmark.utils as utils

import bigframes.session

# Number of rows requested per page when calling to_pandas_batches below;
# the value is shared by the read_gbq_colab benchmarks via benchmark.utils.
PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE


def aggregate_output(
    *, project_id, dataset_id, table_id, session: bigframes.session.Session
):
    """Benchmark paging a table, then paging a grouped sum of that table.

    Args:
        project_id: Project used to qualify the table in the query.
        dataset_id: Dataset used to qualify the table in the query.
        table_id: Table to select from.
        session: Session whose ``_read_gbq_colab`` runs the query.
    """
    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
    # e.g. "{local_inline}" or "{local_large}"
    query = f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
    df = session._read_gbq_colab(query)

    # The UI always asks for the shape and the first page before anything
    # else, so reproduce that before the aggregation step.
    df.shape
    first_batches = df.to_pandas_batches(page_size=PAGE_SIZE)
    next(iter(first_batches))

    # We need some column to group by. Tables that simulate very small rows
    # can only fit a boolean and have no integer column; when an integer
    # column exists, grouping by it gives a more realistic number of groups.
    has_int_column = "col_int64_1" in df.columns
    group_column = "col_int64_1" if has_int_column else "col_bool_0"

    # Simulate the user aggregating by a column and visualizing those results
    rounded = df[group_column].astype("Int64").round(-9)
    df_aggregated = df.assign(rounded=rounded).groupby("rounded").sum()

    df_aggregated.shape
    aggregated_batches = df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)
    next(iter(aggregated_batches))


if __name__ == "__main__":
    # The table to read, the session and the benchmark suffix all come from
    # the shared benchmark configuration helper.
    project_id, dataset_id, table_id, session, suffix = utils.get_configuration(
        include_table_id=True
    )
    current_path = pathlib.Path(__file__).absolute()

    # Run the benchmark body through the shared runner, passing along the
    # keyword arguments that aggregate_output declares.
    utils.get_execution_time(
        aggregate_output,
        current_path,
        suffix,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
        session=session,
    )
10 changes: 10 additions & 0 deletions tests/benchmark/read_gbq_colab/config.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{"benchmark_suffix": "percentile_09", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_09", "ordered": false}
{"benchmark_suffix": "percentile_19", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_19", "ordered": false}
{"benchmark_suffix": "percentile_29", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_29", "ordered": false}
{"benchmark_suffix": "percentile_39", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_39", "ordered": false}
{"benchmark_suffix": "percentile_49", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_49", "ordered": false}
{"benchmark_suffix": "percentile_59", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_59", "ordered": false}
{"benchmark_suffix": "percentile_69", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_69", "ordered": false}
{"benchmark_suffix": "percentile_79", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_79", "ordered": false}
{"benchmark_suffix": "percentile_89", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_89", "ordered": false}
{"benchmark_suffix": "percentile_99", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_99", "ordered": false}
48 changes: 48 additions & 0 deletions tests/benchmark/read_gbq_colab/dry_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib

import benchmark.utils as utils

import bigframes.session


def dry_run(*, project_id, dataset_id, table_id, session: bigframes.session.Session):
    """Benchmark a dry run of ``SELECT *`` over the given table.

    Args:
        project_id: Project used to qualify the table in the query.
        dataset_id: Dataset used to qualify the table in the query.
        table_id: Table to select from.
        session: Session whose ``_read_gbq_colab`` is called with
            ``dry_run=True``.
    """
    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
    # e.g. "{local_inline}" or "{local_large}"
    query = f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
    session._read_gbq_colab(query, dry_run=True)


if __name__ == "__main__":
    # The table to read, the session and the benchmark suffix all come from
    # the shared benchmark configuration helper.
    project_id, dataset_id, table_id, session, suffix = utils.get_configuration(
        include_table_id=True
    )
    current_path = pathlib.Path(__file__).absolute()

    # Run the benchmark body through the shared runner, passing along the
    # keyword arguments that dry_run declares.
    utils.get_execution_time(
        dry_run,
        current_path,
        suffix,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
        session=session,
    )
60 changes: 60 additions & 0 deletions tests/benchmark/read_gbq_colab/filter_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib

import benchmark.utils as utils

import bigframes.session

# Number of rows requested per page when calling to_pandas_batches below;
# the value is shared by the read_gbq_colab benchmarks via benchmark.utils.
PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE


def filter_output(
    *, project_id, dataset_id, table_id, session: bigframes.session.Session
):
    """Benchmark paging a table, then paging a boolean-filtered view of it.

    Args:
        project_id: Project used to qualify the table in the query.
        dataset_id: Dataset used to qualify the table in the query.
        table_id: Table to select from; it must have a ``col_bool_0`` column.
        session: Session whose ``_read_gbq_colab`` runs the query.
    """
    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
    # e.g. "{local_inline}" or "{local_large}"
    df = session._read_gbq_colab(
        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
    )

    # Simulate getting the first page, since we'll always do that first in the UI.
    df.shape
    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))

    # Simulate the user filtering by a column and visualizing those results
    df_filtered = df[df["col_bool_0"]]

    # NOTE(review): per review feedback, each .shape call may execute the
    # query separately from the page download that follows it, so this step
    # can be expensive -- confirm whether that is the intended measurement.
    df_filtered.shape

    # The filter can legitimately match no rows, in which case there may be no
    # page to fetch. Supply a default so an empty result does not abort the
    # benchmark with StopIteration.
    next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)), None)


if __name__ == "__main__":
    # The table to read, the session and the benchmark suffix all come from
    # the shared benchmark configuration helper.
    project_id, dataset_id, table_id, session, suffix = utils.get_configuration(
        include_table_id=True
    )
    current_path = pathlib.Path(__file__).absolute()

    # Run the benchmark body through the shared runner, passing along the
    # keyword arguments that filter_output declares.
    utils.get_execution_time(
        filter_output,
        current_path,
        suffix,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
        session=session,
    )
53 changes: 53 additions & 0 deletions tests/benchmark/read_gbq_colab/first_page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib

import benchmark.utils as utils

import bigframes.session

# Number of rows requested per page when calling to_pandas_batches below;
# the value is shared by the read_gbq_colab benchmarks via benchmark.utils.
PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE


def first_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session):
    """Benchmark reading a table's shape and downloading only its first page.

    Args:
        project_id: Project used to qualify the table in the query.
        dataset_id: Dataset used to qualify the table in the query.
        table_id: Table to select from.
        session: Session whose ``_read_gbq_colab`` runs the query.
    """
    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
    # e.g. "{local_inline}" or "{local_large}"
    query = f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
    df = session._read_gbq_colab(query)

    # The row count is needed to work out how many pages there are.
    df.shape

    # Pull a single page and stop.
    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
    next(iter(batches))


if __name__ == "__main__":
    # The table to read, the session and the benchmark suffix all come from
    # the shared benchmark configuration helper.
    project_id, dataset_id, table_id, session, suffix = utils.get_configuration(
        include_table_id=True
    )
    current_path = pathlib.Path(__file__).absolute()

    # Run the benchmark body through the shared runner, passing along the
    # keyword arguments that first_page declares.
    utils.get_execution_time(
        first_page,
        current_path,
        suffix,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
        session=session,
    )
54 changes: 54 additions & 0 deletions tests/benchmark/read_gbq_colab/last_page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib

import benchmark.utils as utils

import bigframes.session

# Number of rows requested per page when calling to_pandas_batches below;
# the value is shared by the read_gbq_colab benchmarks via benchmark.utils.
PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE


def last_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session):
    """Benchmark reading a table's shape and then iterating every page of it.

    Args:
        project_id: Project used to qualify the table in the query.
        dataset_id: Dataset used to qualify the table in the query.
        table_id: Table to select from.
        session: Session whose ``_read_gbq_colab`` runs the query.
    """
    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
    # e.g. "{local_inline}" or "{local_large}"
    query = f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
    df = session._read_gbq_colab(query)

    # The row count is needed to work out how many pages there are.
    df.shape

    # Walk through all pages; the contents are discarded because only the
    # cost of reaching the last page matters here.
    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
    for _page in batches:
        continue


if __name__ == "__main__":
    # The table to read, the session and the benchmark suffix all come from
    # the shared benchmark configuration helper.
    project_id, dataset_id, table_id, session, suffix = utils.get_configuration(
        include_table_id=True
    )
    current_path = pathlib.Path(__file__).absolute()

    # Run the benchmark body through the shared runner, passing along the
    # keyword arguments that last_page declares.
    utils.get_execution_time(
        last_page,
        current_path,
        suffix,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
        session=session,
    )
64 changes: 64 additions & 0 deletions tests/benchmark/read_gbq_colab/sort_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib

import benchmark.utils as utils

import bigframes.session

# Number of rows requested per page when calling to_pandas_batches below;
# the value is shared by the read_gbq_colab benchmarks via benchmark.utils.
PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE


def sort_output(
    *, project_id, dataset_id, table_id, session: bigframes.session.Session
):
    """Benchmark paging a table, then paging the same table sorted by a column.

    Args:
        project_id: Project used to qualify the table in the query.
        dataset_id: Dataset used to qualify the table in the query.
        table_id: Table to select from.
        session: Session whose ``_read_gbq_colab`` runs the query.
    """
    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
    # e.g. "{local_inline}" or "{local_large}"
    query = f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
    df = session._read_gbq_colab(query)

    # The UI always asks for the shape and the first page before anything
    # else, so reproduce that before the sort step.
    df.shape
    first_batches = df.to_pandas_batches(page_size=PAGE_SIZE)
    next(iter(first_batches))

    # Simulate the user sorting by a column and visualizing those results.
    # Prefer the integer column, falling back to the boolean column for
    # tables that do not have one.
    has_int_column = "col_int64_1" in df.columns
    sort_column = "col_int64_1" if has_int_column else "col_bool_0"

    df_sorted = df.sort_values(sort_column)
    df_sorted.shape
    sorted_batches = df_sorted.to_pandas_batches(page_size=PAGE_SIZE)
    next(iter(sorted_batches))


if __name__ == "__main__":
    # The table to read, the session and the benchmark suffix all come from
    # the shared benchmark configuration helper.
    project_id, dataset_id, table_id, session, suffix = utils.get_configuration(
        include_table_id=True
    )
    current_path = pathlib.Path(__file__).absolute()

    # Run the benchmark body through the shared runner, passing along the
    # keyword arguments that sort_output declares.
    utils.get_execution_time(
        sort_output,
        current_path,
        suffix,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
        session=session,
    )
3 changes: 3 additions & 0 deletions tests/benchmark/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

import bigframes

READ_GBQ_COLAB_PAGE_SIZE = 100


def get_configuration(include_table_id=False):
parser = argparse.ArgumentParser()
Expand Down Expand Up @@ -94,6 +96,7 @@ def _str_to_bool(value):


def _initialize_session(ordered: bool):
# TODO(tswast): add a flag to enable the polars semi-executor.
context = bigframes.BigQueryOptions(
location="US", ordering_mode="strict" if ordered else "partial"
)
Expand Down