diff --git a/scripts/create_read_gbq_colab_benchmark_tables.py b/scripts/create_read_gbq_colab_benchmark_tables.py index 703c946360..63419bc660 100644 --- a/scripts/create_read_gbq_colab_benchmark_tables.py +++ b/scripts/create_read_gbq_colab_benchmark_tables.py @@ -42,18 +42,6 @@ 17486432.0, 1919625975.0, ], - "num_materialized_or_scanned_rows": [ - 0.0, - 6.0, - 100.0, - 4955.0, - 23108.0, - 139504.0, - 616341.0, - 3855698.0, - 83725698.0, - 5991998082.0, - ], "avg_row_bytes": [ 0.00014346299635435792, 0.005370969708923197, @@ -524,10 +512,11 @@ def main(): for i in range(num_percentiles): percentile = TABLE_STATS["percentile"][i] avg_row_bytes_raw = TABLE_STATS["avg_row_bytes"][i] - num_rows_raw = TABLE_STATS["num_materialized_or_scanned_rows"][i] + table_bytes_raw = TABLE_STATS["materialized_or_scanned_bytes"][i] + target_table_bytes = max(1, int(math.ceil(table_bytes_raw))) target_row_bytes = max(1, int(math.ceil(avg_row_bytes_raw))) - num_rows = max(1, int(math.ceil(num_rows_raw))) + num_rows = max(1, int(math.ceil(target_table_bytes / target_row_bytes))) table_name = f"percentile_{percentile:02d}" print(f"\n--- Processing Table: {table_name} ---") diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py index b612e2998c..dda4bf95a4 100644 --- a/tests/benchmark/read_gbq_colab/aggregate_output.py +++ b/tests/benchmark/read_gbq_colab/aggregate_output.py @@ -44,7 +44,7 @@ def aggregate_output( df_aggregated = ( df.assign(rounded=df[group_column].astype("Int64").round(-9)) .groupby("rounded") - .sum() + .sum(numeric_only=True) ) df_aggregated.shape diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index 7945d9f0c6..5e872bb727 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -14,6 +14,7 @@ import pathlib import benchmark.utils as utils +import pytest import bigframes.session @@ 
-35,8 +36,15 @@ def filter_output( # Simulate the user filtering by a column and visualizing those results df_filtered = df[df["col_bool_0"]] - df_filtered.shape - next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE))) + rows, _ = df_filtered.shape + + # It's possible we don't have any pages at all, since the filter may have + # matched no rows. + if rows == 0: + with pytest.raises(StopIteration): + next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE))) + else: + next(iter(df_filtered.to_pandas_batches(page_size=PAGE_SIZE))) if __name__ == "__main__":