|
| 1 | +# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/pandas/q7.py |
| 2 | + |
| 3 | +from datetime import date |
| 4 | +import typing |
| 5 | + |
| 6 | +import bigframes |
| 7 | +import bigframes.dataframe |
| 8 | +import bigframes.pandas as bpd |
| 9 | + |
| 10 | + |
def q(dataset_id: str, session: bigframes.Session):
    """Run TPC-H query 7 ("volume shipping") with BigQuery DataFrames.

    Computes, for each (supplier nation, customer nation, year) pair between
    FRANCE and GERMANY, the total discounted revenue of line items shipped
    in 1995-1996, and writes the sorted result back to BigQuery.

    Args:
        dataset_id: Dataset (in the ``bigframes-dev-perf`` project) holding
            the TPC-H tables NATION, CUSTOMER, LINEITEM, ORDERS, SUPPLIER.
        session: BigFrames session used to read the tables and run the query.
    """

    def _read(table: str) -> bigframes.dataframe.DataFrame:
        # One place for the table-path format and the NULL default index
        # (avoids materializing a sequential index over these large tables).
        return session.read_gbq(
            f"bigframes-dev-perf.{dataset_id}.{table}",
            index_col=bigframes.enums.DefaultIndexKind.NULL,
        )

    nation = _read("NATION")
    customer = _read("CUSTOMER")
    lineitem = _read("LINEITEM")
    orders = _read("ORDERS")
    supplier = _read("SUPPLIER")

    var1 = "FRANCE"
    var2 = "GERMANY"
    var3 = date(1995, 1, 1)
    var4 = date(1996, 12, 31)

    n1 = nation[(nation["N_NAME"] == var1)]
    n2 = nation[(nation["N_NAME"] == var2)]

    def _trade_flow(
        cust_nation: bigframes.dataframe.DataFrame,
        supp_nation: bigframes.dataframe.DataFrame,
    ) -> bigframes.dataframe.DataFrame:
        # Join chain customer -> orders -> lineitem -> supplier, restricting
        # the customer side to ``cust_nation`` and the supplier side to
        # ``supp_nation``.  Shared by both directions of the FRANCE/GERMANY
        # trade, which previously existed as two copy-pasted pipelines.
        jn = customer.merge(cust_nation, left_on="C_NATIONKEY", right_on="N_NATIONKEY")
        jn = jn.merge(orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY")
        jn = jn.rename(columns={"N_NAME": "CUST_NATION"})
        jn = jn.merge(lineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY")
        jn = jn.merge(supplier, left_on="L_SUPPKEY", right_on="S_SUPPKEY")
        jn = jn.merge(supp_nation, left_on="S_NATIONKEY", right_on="N_NATIONKEY")
        return jn.rename(columns={"N_NAME": "SUPP_NATION"})

    # Both trade directions: FRANCE customers / GERMANY suppliers and
    # GERMANY customers / FRANCE suppliers (same order as the original
    # df1 / df2 concatenation).
    df1 = _trade_flow(n1, n2)
    df2 = _trade_flow(n2, n1)
    total = bpd.concat([df1, df2])

    # TODO(huanc): TEMPORARY CODE to force a fresh start. Currently,
    # combining everything into a single query seems to trigger a bug
    # causing incorrect results. This workaround involves writing to and
    # then reading from BigQuery. Remove this once b/355714291 is
    # resolved.
    dest = total.to_gbq()
    total = bpd.read_gbq(dest)

    # Ship-date window and discounted revenue per line item.
    total = total[(total["L_SHIPDATE"] >= var3) & (total["L_SHIPDATE"] <= var4)]
    total["VOLUME"] = total["L_EXTENDEDPRICE"] * (1.0 - total["L_DISCOUNT"])
    total["L_YEAR"] = total["L_SHIPDATE"].dt.year

    gb = typing.cast(bpd.DataFrame, total).groupby(
        ["SUPP_NATION", "CUST_NATION", "L_YEAR"], as_index=False
    )
    agg = gb.agg(REVENUE=bpd.NamedAgg(column="VOLUME", aggfunc="sum"))

    result_df = typing.cast(bpd.DataFrame, agg).sort_values(
        ["SUPP_NATION", "CUST_NATION", "L_YEAR"]
    )
    # Materialize the final ordered result to a BigQuery table.
    result_df.to_gbq()