From 34043613e1c59880f82e112746419e71d457b983 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?=
Date: Mon, 30 Jun 2025 12:12:36 -0500
Subject: [PATCH 1/2] chore: fix `read_gbq_colab` benchmark

* Correct the table size to match actual percentiles.
* Only do sum() on numeric columns.
---
 .../create_read_gbq_colab_benchmark_tables.py | 17 +++--------------
 .../read_gbq_colab/aggregate_output.py        |  2 +-
 2 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/scripts/create_read_gbq_colab_benchmark_tables.py b/scripts/create_read_gbq_colab_benchmark_tables.py
index 703c946360..63419bc660 100644
--- a/scripts/create_read_gbq_colab_benchmark_tables.py
+++ b/scripts/create_read_gbq_colab_benchmark_tables.py
@@ -42,18 +42,6 @@
         17486432.0,
         1919625975.0,
     ],
-    "num_materialized_or_scanned_rows": [
-        0.0,
-        6.0,
-        100.0,
-        4955.0,
-        23108.0,
-        139504.0,
-        616341.0,
-        3855698.0,
-        83725698.0,
-        5991998082.0,
-    ],
     "avg_row_bytes": [
         0.00014346299635435792,
         0.005370969708923197,
@@ -524,10 +512,11 @@ def main():
     for i in range(num_percentiles):
         percentile = TABLE_STATS["percentile"][i]
         avg_row_bytes_raw = TABLE_STATS["avg_row_bytes"][i]
-        num_rows_raw = TABLE_STATS["num_materialized_or_scanned_rows"][i]
+        table_bytes_raw = TABLE_STATS["materialized_or_scanned_bytes"][i]
 
+        target_table_bytes = max(1, int(math.ceil(table_bytes_raw)))
         target_row_bytes = max(1, int(math.ceil(avg_row_bytes_raw)))
-        num_rows = max(1, int(math.ceil(num_rows_raw)))
+        num_rows = max(1, int(math.ceil(target_table_bytes / target_row_bytes)))
 
         table_name = f"percentile_{percentile:02d}"
         print(f"\n--- Processing Table: {table_name} ---")
diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py
index b612e2998c..dda4bf95a4 100644
--- a/tests/benchmark/read_gbq_colab/aggregate_output.py
+++ b/tests/benchmark/read_gbq_colab/aggregate_output.py
@@ -44,7 +44,7 @@ def aggregate_output(
     df_aggregated = (
         df.assign(rounded=df[group_column].astype("Int64").round(-9))
         .groupby("rounded")
-        .sum()
+        .sum(numeric_only=True)
     )
     df_aggregated.shape
 

From 3e72aecb4293f9c087b9dfe8af37eb74d9e721b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?=
Date: Mon, 30 Jun 2025 12:18:23 -0500
Subject: [PATCH 2/2] fix for filter bench

---
 tests/benchmark/read_gbq_colab/filter_output.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py
index 7945d9f0c6..5e872bb727 100644
--- a/tests/benchmark/read_gbq_colab/filter_output.py
+++ b/tests/benchmark/read_gbq_colab/filter_output.py
@@ -14,6 +14,7 @@
 import pathlib
 
 import benchmark.utils as utils
+import pytest
 
 import bigframes.session
 
@@ -35,8 +36,15 @@ def filter_output(
 
     # Simulate the user filtering by a column and visualizing those results
     df_filtered = df[df["col_bool_0"]]
-    df_filtered.shape
-    next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
+    rows, _ = df_filtered.shape
+
+    # It's possible we don't have any pages at all, since we filtered out all
+    # matching rows.
+    if rows == 0:
+        with pytest.raises(StopIteration):
+            next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
+    else:
+        next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE)))
 
 
 if __name__ == "__main__":
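
Note on the sizing change in PATCH 1/2: the patch replaces the stored
per-percentile row counts with a row count derived from the byte statistics.
Below is a minimal standalone sketch of that arithmetic, using placeholder
inputs: the byte value is copied from one of the patch's context lines, but
the 123.45 average row width is invented here purely for illustration (the
real values come from TABLE_STATS in
scripts/create_read_gbq_colab_benchmark_tables.py).

    import math

    # Placeholder stats for one percentile bucket; the script reads these
    # from TABLE_STATS["materialized_or_scanned_bytes"] and
    # TABLE_STATS["avg_row_bytes"].
    table_bytes_raw = 17486432.0  # target table size in bytes
    avg_row_bytes_raw = 123.45    # average row width in bytes (invented)

    # Round each target up and clamp to at least 1 so the smallest
    # percentiles still yield a non-empty table with non-zero-width rows.
    target_table_bytes = max(1, int(math.ceil(table_bytes_raw)))
    target_row_bytes = max(1, int(math.ceil(avg_row_bytes_raw)))

    # Deriving the row count from the byte targets keeps the two statistics
    # consistent; the old independently-measured row-count list could
    # disagree with the byte totals for the same percentile.
    num_rows = max(1, int(math.ceil(target_table_bytes / target_row_bytes)))
    print(num_rows)  # 141020 for these placeholder inputs

Because num_rows = ceil(target_table_bytes / target_row_bytes), the generated
table is always at least target_table_bytes and overshoots by less than one
row's width, which is what makes the benchmark tables land on the intended
percentile sizes.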