From d4c5b77d14862010155d5578e51506ed97a11e2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Tue, 1 Jul 2025 13:09:34 -0500 Subject: [PATCH 1/2] chore: round earlier in TPC-H q15 to try and reduce non-determinism due to aggregating twice --- third_party/bigframes_vendored/tpch/queries/q15.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/third_party/bigframes_vendored/tpch/queries/q15.py b/third_party/bigframes_vendored/tpch/queries/q15.py index 1cba0ca4bc..eaff311c41 100644 --- a/third_party/bigframes_vendored/tpch/queries/q15.py +++ b/third_party/bigframes_vendored/tpch/queries/q15.py @@ -31,6 +31,11 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): .agg(TOTAL_REVENUE=bpd.NamedAgg(column="REVENUE", aggfunc="sum")) .rename(columns={"L_SUPPKEY": "SUPPLIER_NO"}) ) + # Round earlier to prevent non-determinism in the later join due to + # differences in distributed floating point operation sort order. + grouped_revenue = grouped_revenue.assign( + TOTAL_REVENUE=grouped_revenue["TOTAL_REVENUE"].round(2) + ) joined_data = bpd.merge( supplier, grouped_revenue, left_on="S_SUPPKEY", right_on="SUPPLIER_NO" @@ -44,9 +49,7 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): joined_data["TOTAL_REVENUE"] == joined_data["MAX_REVENUE"] ] - max_revenue_suppliers["TOTAL_REVENUE"] = max_revenue_suppliers[ - "TOTAL_REVENUE" - ].round(2) + max_revenue_suppliers["TOTAL_REVENUE"] = max_revenue_suppliers["TOTAL_REVENUE"] q_final = max_revenue_suppliers[ ["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE", "TOTAL_REVENUE"] ].sort_values("S_SUPPKEY") From ebb11d8fc03d13c15792d7d6aa959c0673393835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 7 Jul 2025 14:01:00 -0500 Subject: [PATCH 2/2] remove unnecessary code --- third_party/bigframes_vendored/tpch/queries/q15.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/third_party/bigframes_vendored/tpch/queries/q15.py b/third_party/bigframes_vendored/tpch/queries/q15.py index eaff311c41..0e3460189d 100644 --- a/third_party/bigframes_vendored/tpch/queries/q15.py +++ b/third_party/bigframes_vendored/tpch/queries/q15.py @@ -48,8 +48,6 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): max_revenue_suppliers = joined_data[ joined_data["TOTAL_REVENUE"] == joined_data["MAX_REVENUE"] ] - - max_revenue_suppliers["TOTAL_REVENUE"] = max_revenue_suppliers["TOTAL_REVENUE"] q_final = max_revenue_suppliers[ ["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE", "TOTAL_REVENUE"] ].sort_values("S_SUPPKEY")