Skip to content

Commit 2f6cd9f

Browse files
authored
chore: add tpch q4-8 (#926)
* chore: add tpch q4-8 * remove 10t test
1 parent e0afb7a commit 2f6cd9f

File tree

11 files changed

+405
-2
lines changed

11 files changed

+405
-2
lines changed

tests/benchmark/tpch/config.jsonl

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,3 @@
66
{"benchmark_suffix": "100g_unordered", "dataset_id": "tpch_0100g", "ordered": false}
77
{"benchmark_suffix": "1t_ordered", "dataset_id": "tpch_0001t", "ordered": true}
88
{"benchmark_suffix": "1t_unordered", "dataset_id": "tpch_0001t", "ordered": false}
9-
{"benchmark_suffix": "10t_ordered", "dataset_id": "tpch_0010t", "ordered": true}
10-
{"benchmark_suffix": "10t_unordered", "dataset_id": "tpch_0010t", "ordered": false}

tests/benchmark/tpch/q4.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import pathlib
15+
16+
import benchmark.utils as utils
17+
import bigframes_vendored.tpch.queries.q4 as vendored_tpch_q4
18+
19+
if __name__ == "__main__":
    # Script entry point for the TPC-H q4 benchmark: fetch the benchmark
    # configuration, then hand the vendored query to the timing helper.
    tpch_dataset, bq_session, run_suffix = utils.get_tpch_configuration()

    # Absolute path of this script; passed through to the timing helper
    # (presumably used to label the measurement -- confirm in benchmark.utils).
    script_path = pathlib.Path(__file__).absolute()

    utils.get_execution_time(
        vendored_tpch_q4.q,
        script_path,
        run_suffix,
        tpch_dataset,
        bq_session,
    )

tests/benchmark/tpch/q5.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import pathlib

import benchmark.utils as utils

# This script benchmarks TPC-H query 5, so it must import the q5 module.
# (It previously imported q1 under the q5 alias, which silently timed the
# wrong query.)
import bigframes_vendored.tpch.queries.q5 as vendored_tpch_q5

if __name__ == "__main__":
    # Script entry point for the TPC-H q5 benchmark: fetch the benchmark
    # configuration, then hand the vendored query to the timing helper.
    dataset_id, session, suffix = utils.get_tpch_configuration()

    # Absolute path of this script; passed through to the timing helper
    # (presumably used to label the measurement -- confirm in benchmark.utils).
    current_path = pathlib.Path(__file__).absolute()

    utils.get_execution_time(
        vendored_tpch_q5.q, current_path, suffix, dataset_id, session
    )

tests/benchmark/tpch/q6.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import pathlib
15+
16+
import benchmark.utils as utils
17+
import bigframes_vendored.tpch.queries.q6 as vendored_tpch_q6
18+
19+
if __name__ == "__main__":
    # Script entry point for the TPC-H q6 benchmark: fetch the benchmark
    # configuration, then hand the vendored query to the timing helper.
    tpch_dataset, bq_session, run_suffix = utils.get_tpch_configuration()

    # Absolute path of this script; passed through to the timing helper
    # (presumably used to label the measurement -- confirm in benchmark.utils).
    script_path = pathlib.Path(__file__).absolute()

    utils.get_execution_time(
        vendored_tpch_q6.q,
        script_path,
        run_suffix,
        tpch_dataset,
        bq_session,
    )

tests/benchmark/tpch/q7.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import pathlib
15+
16+
import benchmark.utils as utils
17+
import bigframes_vendored.tpch.queries.q7 as vendored_tpch_q7
18+
19+
if __name__ == "__main__":
    # Script entry point for the TPC-H q7 benchmark: fetch the benchmark
    # configuration, then hand the vendored query to the timing helper.
    tpch_dataset, bq_session, run_suffix = utils.get_tpch_configuration()

    # Absolute path of this script; passed through to the timing helper
    # (presumably used to label the measurement -- confirm in benchmark.utils).
    script_path = pathlib.Path(__file__).absolute()

    utils.get_execution_time(
        vendored_tpch_q7.q,
        script_path,
        run_suffix,
        tpch_dataset,
        bq_session,
    )

tests/benchmark/tpch/q8.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import pathlib
15+
16+
import benchmark.utils as utils
17+
import bigframes_vendored.tpch.queries.q8 as vendored_tpch_q8
18+
19+
if __name__ == "__main__":
    # Script entry point for the TPC-H q8 benchmark: fetch the benchmark
    # configuration, then hand the vendored query to the timing helper.
    tpch_dataset, bq_session, run_suffix = utils.get_tpch_configuration()

    # Absolute path of this script; passed through to the timing helper
    # (presumably used to label the measurement -- confirm in benchmark.utils).
    script_path = pathlib.Path(__file__).absolute()

    utils.get_execution_time(
        vendored_tpch_q8.q,
        script_path,
        run_suffix,
        tpch_dataset,
        bq_session,
    )
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/pandas/q4.py
2+
3+
4+
from datetime import date
5+
import typing
6+
7+
import bigframes
8+
import bigframes.pandas as bpd
9+
10+
11+
def q(dataset_id: str, session: bigframes.Session):
    """Run the TPC-H q4 workload against ``dataset_id`` and write the result.

    Counts, per ``O_ORDERPRIORITY``, the distinct orders dated in
    [1993-07-01, 1993-10-01) that have at least one line item whose commit
    date is earlier than its receipt date. The result is sorted by priority
    and written out with ``to_gbq()`` (no destination is given).

    Args:
        dataset_id: Dataset under the ``bigframes-dev-perf`` project that holds
            the ``LINEITEM`` and ``ORDERS`` tables.
        session: BigQuery DataFrames session used to read the tables.
    """
    null_index = bigframes.enums.DefaultIndexKind.NULL
    lineitem_df = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.LINEITEM",
        index_col=null_index,
    )
    orders_df = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.ORDERS",
        index_col=null_index,
    )

    window_start = date(1993, 7, 1)
    window_end = date(1993, 10, 1)

    joined = lineitem_df.merge(
        orders_df, left_on="L_ORDERKEY", right_on="O_ORDERKEY"
    )

    # Keep orders placed inside the half-open date window ...
    in_window = (joined["O_ORDERDATE"] >= window_start) & (
        joined["O_ORDERDATE"] < window_end
    )
    joined = joined[in_window]
    # ... and only line items committed before they were received.
    committed_before_receipt = joined["L_COMMITDATE"] < joined["L_RECEIPTDATE"]
    joined = joined[committed_before_receipt]

    dedup_keys = ["O_ORDERPRIORITY", "L_ORDERKEY"]
    if not session._strictly_ordered:
        # NOTE(review): presumably drop_duplicates needs a defined row order
        # when the session is not strictly ordered -- confirm.
        joined = joined.sort_values(by=dedup_keys)

    # One row per (priority, order) so each order is counted once.
    joined = joined.drop_duplicates(subset=dedup_keys)

    counts = joined.groupby("O_ORDERPRIORITY", as_index=False).agg(
        ORDER_COUNT=bpd.NamedAgg(column="L_ORDERKEY", aggfunc="count")
    )

    ordered = typing.cast(bpd.DataFrame, counts).sort_values(["O_ORDERPRIORITY"])
    ordered.to_gbq()
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/pandas/q5.py
2+
3+
from datetime import date
4+
5+
import bigframes
6+
7+
8+
def q(dataset_id: str, session: bigframes.Session):
    """Run the TPC-H q5 workload against ``dataset_id`` and write the result.

    Computes, per nation name, the summed revenue
    ``L_EXTENDEDPRICE * (1 - L_DISCOUNT)`` for region ``"ASIA"`` and orders
    dated in [1994-01-01, 1995-01-01), restricted to line items whose supplier
    is in the same nation as the customer. The result is sorted by revenue,
    descending, and written out with ``to_gbq()`` (no destination is given).

    Args:
        dataset_id: Dataset under the ``bigframes-dev-perf`` project that holds
            the ``REGION``, ``NATION``, ``CUSTOMER``, ``LINEITEM``, ``ORDERS``
            and ``SUPPLIER`` tables.
        session: BigQuery DataFrames session used to read the tables.
    """
    region = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.REGION",
        index_col=bigframes.enums.DefaultIndexKind.NULL,
    )
    nation = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.NATION",
        index_col=bigframes.enums.DefaultIndexKind.NULL,
    )
    customer = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.CUSTOMER",
        index_col=bigframes.enums.DefaultIndexKind.NULL,
    )
    lineitem = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.LINEITEM",
        index_col=bigframes.enums.DefaultIndexKind.NULL,
    )
    # The table is named ORDERS (it was previously misspelled "ORDERES",
    # which does not exist in the TPC-H schema).
    orders = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.ORDERS",
        index_col=bigframes.enums.DefaultIndexKind.NULL,
    )
    supplier = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.SUPPLIER",
        index_col=bigframes.enums.DefaultIndexKind.NULL,
    )

    var1 = "ASIA"
    var2 = date(1994, 1, 1)
    var3 = date(1995, 1, 1)

    # Join chain: region -> nation -> customer -> orders -> lineitem.
    jn1 = region.merge(nation, left_on="R_REGIONKEY", right_on="N_REGIONKEY")
    jn2 = jn1.merge(customer, left_on="N_NATIONKEY", right_on="C_NATIONKEY")
    jn3 = jn2.merge(orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY")
    jn4 = jn3.merge(lineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY")
    # Joining on the nation key as well keeps only suppliers located in the
    # customer's own nation.
    jn5 = jn4.merge(
        supplier,
        left_on=["L_SUPPKEY", "N_NATIONKEY"],
        right_on=["S_SUPPKEY", "S_NATIONKEY"],
    )

    jn5 = jn5[jn5["R_NAME"] == var1]
    # Half-open order-date window: [var2, var3).
    jn5 = jn5[(jn5["O_ORDERDATE"] >= var2) & (jn5["O_ORDERDATE"] < var3)]
    jn5["REVENUE"] = jn5["L_EXTENDEDPRICE"] * (1.0 - jn5["L_DISCOUNT"])

    gb = jn5.groupby("N_NAME", as_index=False)["REVENUE"].sum()
    result_df = gb.sort_values("REVENUE", ascending=False)

    result_df.to_gbq()
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/pandas/q6.py
2+
3+
from datetime import date
4+
5+
import bigframes
6+
7+
8+
def q(dataset_id: str, session: bigframes.Session):
    """Run the TPC-H q6 workload against ``dataset_id`` and write the result.

    Sums ``L_EXTENDEDPRICE * L_DISCOUNT`` over line items shipped in
    [1994-01-01, 1995-01-01) with a discount in [0.05, 0.07] and a quantity
    below 24. The single ``REVENUE`` value is converted to a frame and
    written out with ``to_gbq()`` (no destination is given).

    Args:
        dataset_id: Dataset under the ``bigframes-dev-perf`` project that holds
            the ``LINEITEM`` table.
        session: BigQuery DataFrames session used to read the table.
    """
    lineitem_df = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.LINEITEM",
        index_col=bigframes.enums.DefaultIndexKind.NULL,
    )

    ship_from = date(1994, 1, 1)
    ship_before = date(1995, 1, 1)
    discount_min = 0.05
    discount_max = 0.07
    quantity_cap = 24

    # Apply the three filters one after another, as separate selections.
    shipped_in_window = (lineitem_df["L_SHIPDATE"] >= ship_from) & (
        lineitem_df["L_SHIPDATE"] < ship_before
    )
    rows = lineitem_df[shipped_in_window]

    discount_in_range = (rows["L_DISCOUNT"] >= discount_min) & (
        rows["L_DISCOUNT"] <= discount_max
    )
    rows = rows[discount_in_range]

    rows = rows[rows["L_QUANTITY"] < quantity_cap]

    discounted_amount = rows["L_EXTENDEDPRICE"] * rows["L_DISCOUNT"]
    revenue = discounted_amount.agg(["sum"]).rename("REVENUE")
    revenue_frame = revenue.to_frame()

    revenue_frame.to_gbq()
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/pandas/q7.py
2+
3+
from datetime import date
4+
import typing
5+
6+
import bigframes
7+
import bigframes.dataframe
8+
import bigframes.pandas as bpd
9+
10+
11+
def q(dataset_id: str, session: bigframes.Session):
    """Run the TPC-H q7 workload against ``dataset_id`` and write the result.

    Builds the line items traded between ``"FRANCE"`` and ``"GERMANY"`` in
    both directions (customer in one nation, supplier in the other), keeps
    those shipped between 1995-01-01 and 1996-12-31 inclusive, and sums
    ``L_EXTENDEDPRICE * (1 - L_DISCOUNT)`` per supplier nation, customer
    nation and ship year. The sorted result is written out with ``to_gbq()``
    (no destination is given).

    Args:
        dataset_id: Dataset under the ``bigframes-dev-perf`` project that holds
            the ``NATION``, ``CUSTOMER``, ``LINEITEM``, ``ORDERS`` and
            ``SUPPLIER`` tables.
        session: BigQuery DataFrames session used to read the tables.
    """
    null_index = bigframes.enums.DefaultIndexKind.NULL
    nation = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.NATION",
        index_col=null_index,
    )
    customer = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.CUSTOMER",
        index_col=null_index,
    )
    lineitem = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.LINEITEM",
        index_col=null_index,
    )
    orders = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.ORDERS",
        index_col=null_index,
    )
    supplier = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.SUPPLIER",
        index_col=null_index,
    )

    first_nation = "FRANCE"
    second_nation = "GERMANY"
    ship_from = date(1995, 1, 1)
    ship_until = date(1996, 12, 31)

    first_rows = nation[(nation["N_NAME"] == first_nation)]
    second_rows = nation[(nation["N_NAME"] == second_nation)]

    def _trade(cust_nation_rows, supp_nation_rows):
        # Line items bought by customers of one nation from suppliers of the
        # other, with the two nation names exposed as CUST_NATION/SUPP_NATION.
        joined = customer.merge(
            cust_nation_rows, left_on="C_NATIONKEY", right_on="N_NATIONKEY"
        )
        joined = joined.merge(orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY")
        joined = joined.rename(columns={"N_NAME": "CUST_NATION"})
        joined = joined.merge(lineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY")
        joined = joined.merge(supplier, left_on="L_SUPPKEY", right_on="S_SUPPKEY")
        joined = joined.merge(
            supp_nation_rows, left_on="S_NATIONKEY", right_on="N_NATIONKEY"
        )
        return joined.rename(columns={"N_NAME": "SUPP_NATION"})

    # Both trade directions, stacked into one frame.
    first_to_second = _trade(first_rows, second_rows)
    second_to_first = _trade(second_rows, first_rows)
    total = bpd.concat([first_to_second, second_to_first])

    # TODO(huanc): TEMPORARY CODE to force a fresh start. Currently,
    # combining everything into a single query seems to trigger a bug
    # causing incorrect results. This workaround involves writing to and
    # then reading from BigQuery. Remove this once b/355714291 is
    # resolved.
    # NOTE(review): the read-back goes through the module-level
    # bigframes.pandas API rather than `session` -- confirm that is intended.
    dest = total.to_gbq()
    total = bpd.read_gbq(dest)

    # The ship-date window is inclusive at both ends.
    shipped_in_window = (total["L_SHIPDATE"] >= ship_from) & (
        total["L_SHIPDATE"] <= ship_until
    )
    total = total[shipped_in_window]
    total["VOLUME"] = total["L_EXTENDEDPRICE"] * (1.0 - total["L_DISCOUNT"])
    total["L_YEAR"] = total["L_SHIPDATE"].dt.year

    group_keys = ["SUPP_NATION", "CUST_NATION", "L_YEAR"]
    grouped = typing.cast(bpd.DataFrame, total).groupby(group_keys, as_index=False)
    revenue = grouped.agg(REVENUE=bpd.NamedAgg(column="VOLUME", aggfunc="sum"))

    ordered = typing.cast(bpd.DataFrame, revenue).sort_values(group_keys)
    ordered.to_gbq()

0 commit comments

Comments
 (0)