From 77c9061e66a54640251a16d65b85cc1deb5fdc2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?= <swast@google.com>
Date: Thu, 26 Jun 2025 13:12:55 -0500
Subject: [PATCH 1/5] chore: add benchmarks for read_gbq_colab

---
 tests/benchmark/.gitignore                    |  5 ++
 .../read_gbq_colab/aggregate_output.py        | 69 +++++++++++++++++++
 tests/benchmark/read_gbq_colab/config.jsonl   | 10 +++
 tests/benchmark/read_gbq_colab/dry_run.py     | 48 +++++++++++++
 .../benchmark/read_gbq_colab/filter_output.py | 60 ++++++++++++++++
 tests/benchmark/read_gbq_colab/first_page.py  | 53 ++++++++++++++
 .../first_page.py_percentile_99.error         |  0
 tests/benchmark/read_gbq_colab/last_page.py   | 54 +++++++++++++++
 tests/benchmark/read_gbq_colab/sort_output.py | 64 +++++++++++++++++
 tests/benchmark/utils.py                      |  3 +
 10 files changed, 366 insertions(+)
 create mode 100644 tests/benchmark/.gitignore
 create mode 100644 tests/benchmark/read_gbq_colab/aggregate_output.py
 create mode 100644 tests/benchmark/read_gbq_colab/config.jsonl
 create mode 100644 tests/benchmark/read_gbq_colab/dry_run.py
 create mode 100644 tests/benchmark/read_gbq_colab/filter_output.py
 create mode 100644 tests/benchmark/read_gbq_colab/first_page.py
 create mode 100644 tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error
 create mode 100644 tests/benchmark/read_gbq_colab/last_page.py
 create mode 100644 tests/benchmark/read_gbq_colab/sort_output.py

diff --git a/tests/benchmark/.gitignore b/tests/benchmark/.gitignore
new file mode 100644
index 0000000000..14b434a131
--- /dev/null
+++ b/tests/benchmark/.gitignore
@@ -0,0 +1,5 @@
+*.bytesprocessed
+*.bq_exec_time_seconds
+*.local_exec_time_seconds
+*.query_char_count
+*.slotmillis
diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py
new file mode 100644
index 0000000000..1302ea1850
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/aggregate_output.py
@@ -0,0 +1,69 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def aggregate_output(
+    *, project_id, dataset_id, table_id, session: bigframes.session.Session
+):  # Benchmark body: first page of the table, then first page of a grouped sum.
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Simulate getting the first page, since we'll always do that first in the UI.
+    df.shape
+    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+
+    group_column = "col_int64_1"
+    if group_column not in df.columns:
+        group_column = "col_bool_0"
+
+    # Simulate the user aggregating by a column and visualizing those results
+    df_aggregated = (
+        df.assign(rounded=df[group_column].astype("Int64").round(-9))
+        .groupby("rounded")
+        .sum()  # presumably sums all non-key columns; confirm dtypes allow it
+    )
+
+    df_aggregated.shape  # result discarded; evaluated for the work it triggers
+    next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)))
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)  # presumably CLI-provided
+    current_path = pathlib.Path(__file__).absolute()  # this script's own path
+
+    utils.get_execution_time(  # presumably runs and times the callable below
+        aggregate_output,
+        current_path,
+        suffix,
+        project_id=project_id,
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/config.jsonl b/tests/benchmark/read_gbq_colab/config.jsonl
new file mode 100644
index 0000000000..53f2779151
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/config.jsonl
@@ -0,0 +1,10 @@
+{"benchmark_suffix": "percentile_09", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_09", "ordered": false}
+{"benchmark_suffix": "percentile_19", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_19", "ordered": false}
+{"benchmark_suffix": "percentile_29", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_29", "ordered": false}
+{"benchmark_suffix": "percentile_39", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_39", "ordered": false}
+{"benchmark_suffix": "percentile_49", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_49", "ordered": false}
+{"benchmark_suffix": "percentile_59", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_59", "ordered": false}
+{"benchmark_suffix": "percentile_69", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_69", "ordered": false}
+{"benchmark_suffix": "percentile_79", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_79", "ordered": false}
+{"benchmark_suffix": "percentile_89", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_89", "ordered": false}
+{"benchmark_suffix": "percentile_99", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_99", "ordered": false}
diff --git a/tests/benchmark/read_gbq_colab/dry_run.py b/tests/benchmark/read_gbq_colab/dry_run.py
new file mode 100644
index 0000000000..c2de1b7cc4
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/dry_run.py
@@ -0,0 +1,48 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+
+def dry_run(*, project_id, dataset_id, table_id, session: bigframes.session.Session):
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}",
+        dry_run=True,  # benchmark the dry-run path only; result is discarded
+    )
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)  # presumably CLI-provided
+    current_path = pathlib.Path(__file__).absolute()  # this script's own path
+
+    utils.get_execution_time(  # presumably runs and times the callable below
+        dry_run,
+        current_path,
+        suffix,
+        project_id=project_id,
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py
new file mode 100644
index 0000000000..7945d9f0c6
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/filter_output.py
@@ -0,0 +1,60 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def filter_output(
+    *, project_id, dataset_id, table_id, session: bigframes.session.Session
+):  # Benchmark body: first page of the table, then first page of a filtered view.
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Simulate getting the first page, since we'll always do that first in the UI.
+    df.shape  # result discarded; evaluated for the work it triggers
+    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))  # first batch only
+
+    # Simulate the user filtering by a column and visualizing those results
+    df_filtered = df[df["col_bool_0"]]  # keep rows where col_bool_0 is true
+    df_filtered.shape  # same discarded request, now on the filtered frame
+    next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)  # presumably CLI-provided
+    current_path = pathlib.Path(__file__).absolute()  # this script's own path
+
+    utils.get_execution_time(  # presumably runs and times the callable below
+        filter_output,
+        current_path,
+        suffix,
+        project_id=project_id,
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py
new file mode 100644
index 0000000000..2df9990d22
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/first_page.py
@@ -0,0 +1,53 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def first_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session):
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Get number of rows (to calculate number of pages) and the first page.
+    df.shape  # result discarded; evaluated for the work it triggers
+    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))  # first batch only
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)  # presumably CLI-provided
+    current_path = pathlib.Path(__file__).absolute()  # this script's own path
+
+    utils.get_execution_time(  # presumably runs and times the callable below
+        first_page,
+        current_path,
+        suffix,
+        project_id=project_id,
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error b/tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py
new file mode 100644
index 0000000000..ad785a29e8
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/last_page.py
@@ -0,0 +1,54 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def last_page(*, project_id, dataset_id, table_id, session: bigframes.session.Session):
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Get number of rows (to calculate number of pages) and then all pages.
+    df.shape  # result discarded; evaluated for the work it triggers
+    for _ in df.to_pandas_batches(page_size=PAGE_SIZE):
+        pass  # drain every batch; the contents are not inspected
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)  # presumably CLI-provided
+    current_path = pathlib.Path(__file__).absolute()  # this script's own path
+
+    utils.get_execution_time(  # presumably runs and times the callable below
+        last_page,
+        current_path,
+        suffix,
+        project_id=project_id,
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py
new file mode 100644
index 0000000000..997de5683d
--- /dev/null
+++ b/tests/benchmark/read_gbq_colab/sort_output.py
@@ -0,0 +1,64 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+
+import bigframes.session
+
+PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
+
+
+def sort_output(
+    *, project_id, dataset_id, table_id, session: bigframes.session.Session
+):  # Benchmark body: first page of the table, then first page of a sorted view.
+    # TODO(tswast): Support alternative query if table_id is a local DataFrame,
+    # e.g. "{local_inline}" or "{local_large}"
+    df = session._read_gbq_colab(
+        f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
+    )
+
+    # Simulate getting the first page, since we'll always do that first in the UI.
+    df.shape  # result discarded; evaluated for the work it triggers
+    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))  # first batch only
+
+    # Simulate the user sorting by a column and visualizing those results
+    sort_column = "col_int64_1"
+    if sort_column not in df.columns:
+        sort_column = "col_bool_0"  # fallback when col_int64_1 is absent
+
+    df_sorted = df.sort_values(sort_column)
+    df_sorted.shape  # same discarded request, now on the sorted frame
+    next(iter(df_sorted.to_pandas_batches(page_size=PAGE_SIZE)))
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    (
+        project_id,
+        dataset_id,
+        table_id,
+        session,
+        suffix,
+    ) = utils.get_configuration(include_table_id=True)  # presumably CLI-provided
+    current_path = pathlib.Path(__file__).absolute()  # this script's own path
+
+    utils.get_execution_time(  # presumably runs and times the callable below
+        sort_output,
+        current_path,
+        suffix,
+        project_id=project_id,
+        dataset_id=dataset_id,
+        table_id=table_id,
+        session=session,
+    )
diff --git a/tests/benchmark/utils.py b/tests/benchmark/utils.py
index 887d54dba2..48357ddde7 100644
--- a/tests/benchmark/utils.py
+++ b/tests/benchmark/utils.py
@@ -17,6 +17,8 @@
 
 import bigframes
 
+READ_GBQ_COLAB_PAGE_SIZE = 100  # page_size used by the read_gbq_colab benchmarks
+
 
 def get_configuration(include_table_id=False):
     parser = argparse.ArgumentParser()
@@ -94,6 +96,7 @@ def _str_to_bool(value):
 
 
 def _initialize_session(ordered: bool):
+    # TODO(tswast): add a flag to enable the polars semi-executor.
     context = bigframes.BigQueryOptions(
         location="US", ordering_mode="strict" if ordered else "partial"
     )

From b14171c4f8bdde2dcf2287bce23bfa0815087a11 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?= <swast@google.com>
Date: Thu, 26 Jun 2025 13:15:54 -0500
Subject: [PATCH 2/5] chore: correct project id in read_gbq_colab benchmark config

---
 tests/benchmark/read_gbq_colab/config.jsonl | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/benchmark/read_gbq_colab/config.jsonl b/tests/benchmark/read_gbq_colab/config.jsonl
index 53f2779151..6f1ddf4a5f 100644
--- a/tests/benchmark/read_gbq_colab/config.jsonl
+++ b/tests/benchmark/read_gbq_colab/config.jsonl
@@ -1,10 +1,10 @@
-{"benchmark_suffix": "percentile_09", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_09", "ordered": false}
-{"benchmark_suffix": "percentile_19", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_19", "ordered": false}
-{"benchmark_suffix": "percentile_29", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_29", "ordered": false}
-{"benchmark_suffix": "percentile_39", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_39", "ordered": false}
-{"benchmark_suffix": "percentile_49", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_49", "ordered": false}
-{"benchmark_suffix": "percentile_59", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_59", "ordered": false}
-{"benchmark_suffix": "percentile_69", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_69", "ordered": false}
-{"benchmark_suffix": "percentile_79", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_79", "ordered": false}
-{"benchmark_suffix": "percentile_89", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_89", "ordered": false}
-{"benchmark_suffix": "percentile_99", "project_id": "swast-scratch", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_99", "ordered": false}
+{"benchmark_suffix": "percentile_09", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_09", "ordered": false}
+{"benchmark_suffix": "percentile_19", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_19", "ordered": false}
+{"benchmark_suffix": "percentile_29", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_29", "ordered": false}
+{"benchmark_suffix": "percentile_39", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_39", "ordered": false}
+{"benchmark_suffix": "percentile_49", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_49", "ordered": false}
+{"benchmark_suffix": "percentile_59", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_59", "ordered": false}
+{"benchmark_suffix": "percentile_69", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_69", "ordered": false}
+{"benchmark_suffix": "percentile_79", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_79", "ordered": false}
+{"benchmark_suffix": "percentile_89", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_89", "ordered": false}
+{"benchmark_suffix": "percentile_99", "project_id": "bigframes-dev-perf", "dataset_id": "read_gbq_colab_benchmark", "table_id": "percentile_99", "ordered": false}

From 615a76a07056094352569ac707fc8e580a887037 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?= <swast@google.com>
Date: Thu, 26 Jun 2025 13:20:40 -0500
Subject: [PATCH 3/5] chore: ignore *.error benchmark output files too

---
 tests/benchmark/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/benchmark/.gitignore b/tests/benchmark/.gitignore
index 14b434a131..f1bf042bf7 100644
--- a/tests/benchmark/.gitignore
+++ b/tests/benchmark/.gitignore
@@ -1,5 +1,6 @@
 *.bytesprocessed
 *.bq_exec_time_seconds
+*.error
 *.local_exec_time_seconds
 *.query_char_count
 *.slotmillis

From 1c963c3f4150565404e5a40a1b587aa3412638a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= <swast@google.com>
Date: Thu, 26 Jun 2025 13:21:24 -0500
Subject: [PATCH 4/5] Delete
 tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error

---
 tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error

diff --git a/tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error b/tests/benchmark/read_gbq_colab/first_page.py_percentile_99.error
deleted file mode 100644
index e69de29bb2..0000000000

From 8a36a3212c6c74d8388219119c4503b4388d83f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= <swast@google.com>
Date: Fri, 27 Jun 2025 15:14:21 -0500
Subject: [PATCH 5/5] explain column selection for groupby

---
 tests/benchmark/read_gbq_colab/aggregate_output.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py
index 1302ea1850..b612e2998c 100644
--- a/tests/benchmark/read_gbq_colab/aggregate_output.py
+++ b/tests/benchmark/read_gbq_colab/aggregate_output.py
@@ -33,6 +33,9 @@ def aggregate_output(
     df.shape
     next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
 
+    # To simulate very small rows that can only fit a boolean,
+    # some tables don't have an integer column. If an integer column is available,
+    # we prefer to group by that to get a more realistic number of groups.
     group_column = "col_int64_1"
     if group_column not in df.columns:
         group_column = "col_bool_0"