From 34043613e1c59880f82e112746419e71d457b983 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?=
Date: Mon, 30 Jun 2025 12:12:36 -0500
Subject: [PATCH 1/2] chore: fix `read_gbq_colab` benchmark

* Correct the table size to match actual percentiles.
* Only do sum() on numeric columns.
---
 .../create_read_gbq_colab_benchmark_tables.py | 17 +++--------------
 .../read_gbq_colab/aggregate_output.py        |  2 +-
 2 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/scripts/create_read_gbq_colab_benchmark_tables.py b/scripts/create_read_gbq_colab_benchmark_tables.py
index 703c946360..63419bc660 100644
--- a/scripts/create_read_gbq_colab_benchmark_tables.py
+++ b/scripts/create_read_gbq_colab_benchmark_tables.py
@@ -42,18 +42,6 @@
         17486432.0,
         1919625975.0,
     ],
-    "num_materialized_or_scanned_rows": [
-        0.0,
-        6.0,
-        100.0,
-        4955.0,
-        23108.0,
-        139504.0,
-        616341.0,
-        3855698.0,
-        83725698.0,
-        5991998082.0,
-    ],
     "avg_row_bytes": [
         0.00014346299635435792,
         0.005370969708923197,
@@ -524,10 +512,11 @@ def main():
     for i in range(num_percentiles):
         percentile = TABLE_STATS["percentile"][i]
         avg_row_bytes_raw = TABLE_STATS["avg_row_bytes"][i]
-        num_rows_raw = TABLE_STATS["num_materialized_or_scanned_rows"][i]
+        table_bytes_raw = TABLE_STATS["materialized_or_scanned_bytes"][i]
 
+        target_table_bytes = max(1, int(math.ceil(table_bytes_raw)))
         target_row_bytes = max(1, int(math.ceil(avg_row_bytes_raw)))
-        num_rows = max(1, int(math.ceil(num_rows_raw)))
+        num_rows = max(1, int(math.ceil(target_table_bytes / target_row_bytes)))
 
         table_name = f"percentile_{percentile:02d}"
         print(f"\n--- Processing Table: {table_name} ---")
diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py
index b612e2998c..dda4bf95a4 100644
--- a/tests/benchmark/read_gbq_colab/aggregate_output.py
+++ b/tests/benchmark/read_gbq_colab/aggregate_output.py
@@ -44,7 +44,7 @@ def aggregate_output(
     df_aggregated = (
         df.assign(rounded=df[group_column].astype("Int64").round(-9))
         .groupby("rounded")
-        .sum()
+        .sum(numeric_only=True)
     )
     df_aggregated.shape
 

From 3e72aecb4293f9c087b9dfe8af37eb74d9e721b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?=
Date: Mon, 30 Jun 2025 12:18:23 -0500
Subject: [PATCH 2/2] fix for filter bench

---
 tests/benchmark/read_gbq_colab/filter_output.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py
index 7945d9f0c6..5e872bb727 100644
--- a/tests/benchmark/read_gbq_colab/filter_output.py
+++ b/tests/benchmark/read_gbq_colab/filter_output.py
@@ -14,6 +14,7 @@
 import pathlib
 
 import benchmark.utils as utils
+import pytest
 
 import bigframes.session
 
@@ -35,8 +36,15 @@ def filter_output(
 
     # Simulate the user filtering by a column and visualizing those results
     df_filtered = df[df["col_bool_0"]]
-    df_filtered.shape
-    next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
+    rows, _ = df_filtered.shape
+
+    # It's possible we don't have any pages at all, since we filtered out all
+    # matching rows.
+    if rows == 0:
+        with pytest.raises(StopIteration):
+            next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
+    else:
+        next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
 
 
 if __name__ == "__main__":
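
Note on the sizing change in PATCH 1/2: the patch replaces the stored
per-percentile row counts with a row count derived from the byte statistics.
Below is a minimal standalone sketch of that arithmetic, using placeholder
inputs: the byte value is copied from one of the patch's context lines, but
the 123.45 average row width is invented here purely for illustration (the
real values come from TABLE_STATS in
scripts/create_read_gbq_colab_benchmark_tables.py).

    import math

    # Placeholder stats for one percentile bucket; the script reads these
    # from TABLE_STATS["materialized_or_scanned_bytes"] and
    # TABLE_STATS["avg_row_bytes"].
    table_bytes_raw = 17486432.0  # target table size in bytes
    avg_row_bytes_raw = 123.45    # average row width in bytes (invented)

    # Round each target up and clamp to at least 1 so the smallest
    # percentiles still yield a non-empty table with non-zero-width rows.
    target_table_bytes = max(1, int(math.ceil(table_bytes_raw)))
    target_row_bytes = max(1, int(math.ceil(avg_row_bytes_raw)))

    # Deriving the row count from the byte targets keeps the two statistics
    # consistent; the old independently-measured row-count list could
    # disagree with the byte totals for the same percentile.
    num_rows = max(1, int(math.ceil(target_table_bytes / target_row_bytes)))
    print(num_rows)  # 141020 for these placeholder inputs

Because num_rows = ceil(target_table_bytes / target_row_bytes), the generated
table is always at least target_table_bytes and overshoots by less than one
row's width, which is what makes the benchmark tables land on the intended
percentile sizes.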