From eb9426fc89d9b441634801709ff6e763eebd53e8 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 21 Oct 2025 20:08:52 +0000 Subject: [PATCH 1/2] feat: include local data bytes in the dry run report when available --- bigframes/core/blocks.py | 2 +- bigframes/session/dry_runs.py | 35 ++++++++++++++++++++++++++++-- tests/system/small/test_session.py | 16 ++++++++++++++ 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 166841dfbd..1900b7208a 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -967,7 +967,7 @@ def _compute_dry_run( } dry_run_stats = dry_runs.get_query_stats_with_dtypes( - query_job, column_dtypes, self.index.dtypes + query_job, column_dtypes, self.index.dtypes, self.expr.node ) return dry_run_stats, query_job diff --git a/bigframes/session/dry_runs.py b/bigframes/session/dry_runs.py index 51e8e72c9a..38ba56cd61 100644 --- a/bigframes/session/dry_runs.py +++ b/bigframes/session/dry_runs.py @@ -20,6 +20,7 @@ import pandas from bigframes import dtypes +from bigframes.core import bigframe_node, nodes def get_table_stats(table: bigquery.Table) -> pandas.Series: @@ -86,13 +87,26 @@ def get_query_stats_with_dtypes( query_job: bigquery.QueryJob, column_dtypes: Dict[str, dtypes.Dtype], index_dtypes: Sequence[dtypes.Dtype], + expr_root: bigframe_node.BigFrameNode | None = None, ) -> pandas.Series: + """ + Returns important stats from the query job as a Pandas Series. The dtypes information is added too. + + Args: + expr_root (Optional): + The root of the expression tree that may contain local data, whose size is added to the + total bytes count if available. + + """ index = ["columnCount", "columnDtypes", "indexLevel", "indexDtypes"] values = [len(column_dtypes), column_dtypes, len(index_dtypes), index_dtypes] s = pandas.Series(values, index=index) - return pandas.concat([s, get_query_stats(query_job)]) + result = pandas.concat([s, get_query_stats(query_job)]) + if expr_root is not None: + result["totalBytesProcessed"] += get_local_bytes(expr_root) + return result def get_query_stats( @@ -145,4 +159,21 @@ def get_query_stats( else None ) - return pandas.Series(values, index=index) + result = pandas.Series(values, index=index) + result["totalBytesProcessed"] = int(result["totalBytesProcessed"]) + + return result + + +def get_local_bytes(root: bigframe_node.BigFrameNode) -> int: + def get_total_bytes( + root: bigframe_node.BigFrameNode, child_results: tuple[int, ...] + ) -> int: + child_bytes = sum(child_results) + + if isinstance(root, nodes.ReadLocalNode): + return child_bytes + root.local_data_source.data.get_total_buffer_size() + + return child_bytes + + return root.reduce_up(get_total_bytes) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 001e02c2fa..d3e646dc92 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -2173,6 +2173,22 @@ def test_read_gbq_query_dry_run(scalars_table_id, session): _assert_query_dry_run_stats_are_valid(result) +def test_block_dry_run_includes_local_data(session): + df1 = bigframes.dataframe.DataFrame({"col_1": [1, 2, 3]}, session=session) + df2 = bigframes.dataframe.DataFrame({"col_2": [1, 2, 3]}, session=session) + + result = df1.merge(df2, how="cross").to_pandas(dry_run=True) + + assert isinstance(result, pd.Series) + _assert_query_dry_run_stats_are_valid(result) + assert result["totalBytesProcessed"] > 0 + assert ( + df1.to_pandas(dry_run=True)["totalBytesProcessed"] + + df2.to_pandas(dry_run=True)["totalBytesProcessed"] + == result["totalBytesProcessed"] + ) + + def _assert_query_dry_run_stats_are_valid(result: pd.Series): expected_index = pd.Index( [ From 1769df0f10c2c267b0d94629ce86fdee630cab3a Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 21 Oct 2025 20:23:16 +0000 Subject: [PATCH 2/2] fix test --- bigframes/session/dry_runs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bigframes/session/dry_runs.py b/bigframes/session/dry_runs.py index 38ba56cd61..bd54bb65d7 100644 --- a/bigframes/session/dry_runs.py +++ b/bigframes/session/dry_runs.py @@ -160,7 +160,10 @@ def get_query_stats( ) result = pandas.Series(values, index=index) - result["totalBytesProcessed"] = int(result["totalBytesProcessed"]) + if result["totalBytesProcessed"] is None: + result["totalBytesProcessed"] = 0 + else: + result["totalBytesProcessed"] = int(result["totalBytesProcessed"]) return result