diff --git a/benchmarks/Dockerfile b/benchmarks/Dockerfile deleted file mode 100644 index 704c863d20..0000000000 --- a/benchmarks/Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -FROM apache/datafusion-comet:0.7.0-spark3.5.5-scala2.12-java11 - -RUN apt update \ - && apt install -y git python3 python3-pip \ - && apt clean - -RUN cd /opt \ - && git clone https://github.com/apache/datafusion-benchmarks.git diff --git a/benchmarks/README.md b/benchmarks/README.md index 7e2dfc9f2b..f1e8d39db6 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -17,88 +17,177 @@ specific language governing permissions and limitations under the License. --> -# Running Comet Benchmarks in Microk8s +# Comet Benchmark Suite + +Unified benchmark infrastructure for Apache DataFusion Comet. Supports +TPC-H/TPC-DS and shuffle benchmarks across multiple engines (Spark, Comet, +Gluten) with composable configuration and optional memory profiling. + +## Quick Start + +```bash +# Run TPC-H with Comet on a standalone cluster +python benchmarks/run.py \ + --engine comet --profile standalone-tpch --restart-cluster \ + -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \ + --output . --iterations 1 + +# Preview the spark-submit command without executing +python benchmarks/run.py \ + --engine comet --profile standalone-tpch --dry-run \ + -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \ + --output . --iterations 1 +``` + +## Directory Layout + +``` +benchmarks/ +├── run.py # Entry point — builds and runs spark-submit +├── conf/ +│ ├── engines/ # Per-engine configs (comet, spark, gluten, ...) +│ └── profiles/ # Per-environment configs (local, standalone, docker) +├── runner/ +│ ├── cli.py # Python CLI passed to spark-submit (subcommands: tpc, shuffle, micro) +│ ├── config.py # Config file loader and merger +│ ├── spark_session.py # SparkSession builder +│ └── profiling.py # Level 1 JVM metrics via Spark REST API +├── suites/ +│ ├── tpc.py # TPC-H / TPC-DS benchmark suite +│ ├── shuffle.py # Shuffle benchmark suite (hash, round-robin) +│ └── micro.py # Microbenchmark suite (string expressions, ...) +├── analysis/ +│ ├── compare.py # Generate comparison charts from result JSON +│ └── memory_report.py # Generate memory reports from profiling CSV +├── infra/ +│ ├── docker/ # Dockerfile, docker-compose, metrics collector +├── create-iceberg-tpch.py # Utility: convert TPC-H Parquet to Iceberg tables +└── drop-caches.sh # Utility: drop OS page caches before benchmarks +``` -This guide explains how to run benchmarks derived from TPC-H and TPC-DS in Apache DataFusion Comet deployed in a -local Microk8s cluster. +## How It Works -## Use Microk8s locally +`run.py` is the single entry point. 
It: -Install Micro8s following the instructions at https://microk8s.io/docs/getting-started and then perform these -additional steps, ensuring that any existing kube config is backed up first. +1. Reads a **profile** config (cluster shape, memory, master URL) +2. Reads an **engine** config (plugin JARs, shuffle manager, engine-specific settings) +3. Applies any `--conf key=value` CLI overrides (highest precedence) +4. Builds and executes the `spark-submit` command -```shell -mkdir -p ~/.kube -microk8s config > ~/.kube/config +The merge order is: **profile < engine < CLI overrides**, so engine configs +can override profile defaults (e.g., an engine can set `offHeap.enabled=false` +even though the profile enables it). -microk8s enable dns -microk8s enable registry +### Wrapper arguments (before `--`) -microk8s kubectl create serviceaccount spark -``` +| Flag | Description | +| ------------------- | ----------------------------------------------- | +| `--engine NAME` | Engine config from `conf/engines/NAME.conf` | +| `--profile NAME` | Profile config from `conf/profiles/NAME.conf` | +| `--conf key=value` | Extra Spark/runner config override (repeatable) | +| `--restart-cluster` | Stop/start Spark standalone master + worker | +| `--dry-run` | Print spark-submit command without executing | + +### Suite arguments (after `--`) + +Everything after `--` is passed to `runner/cli.py`. See per-suite docs: + +- [TPC-H / TPC-DS](suites/TPC.md) +- [Shuffle](suites/SHUFFLE.md) +- [Microbenchmarks](suites/MICRO.md) + +## Available Engines + +| Engine | Config file | Description | +| ---------------------- | ----------------------------------- | --------------------------------- | +| `spark` | `engines/spark.conf` | Vanilla Spark (no accelerator) | +| `comet` | `engines/comet.conf` | DataFusion Comet with native scan | +| `comet-iceberg` | `engines/comet-iceberg.conf` | Comet + native Iceberg scanning | +| `gluten` | `engines/gluten.conf` | Gluten (Velox backend) — Java 8 | +| `spark-shuffle` | `engines/spark-shuffle.conf` | Spark baseline for shuffle tests | +| `comet-jvm-shuffle` | `engines/comet-jvm-shuffle.conf` | Comet with JVM shuffle mode | +| `comet-native-shuffle` | `engines/comet-native-shuffle.conf` | Comet with native shuffle | + +## Available Profiles -## Build Comet Docker Image +| Profile | Config file | Description | +| ------------------ | -------------------------------- | ------------------------------ | +| `local` | `profiles/local.conf` | `local[*]` mode, no cluster | +| `standalone-tpch` | `profiles/standalone-tpch.conf` | 1 executor, 8 cores, S3A | +| `standalone-tpcds` | `profiles/standalone-tpcds.conf` | 2 executors, 16 cores, S3A | +| `docker` | `profiles/docker.conf` | For docker-compose deployments | -Run the following command from the root of this repository to build the Comet Docker image, or use a published -Docker image from https://github.com/orgs/apache/packages?repo_name=datafusion-comet +## Environment Variables -```shell -docker build -t apache/datafusion-comet -f kube/Dockerfile . 
+The config files use `${VAR}` references that are expanded from the +environment at load time: + +| Variable | Used by | Description | +| -------------- | -------------------- | --------------------------------- | +| `SPARK_HOME` | `run.py` | Path to Spark installation | +| `SPARK_MASTER` | standalone profiles | Spark master URL | +| `COMET_JAR` | comet engines | Path to Comet JAR | +| `GLUTEN_JAR` | gluten engine | Path to Gluten JAR | +| `ICEBERG_JAR` | comet-iceberg engine | Path to Iceberg Spark runtime JAR | + +## Profiling + +Add `--profile` (the flag, not the config) to any suite command to enable +Level 1 JVM metrics collection via the Spark REST API: + +```bash +python benchmarks/run.py --engine comet --profile standalone-tpch \ + -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \ + --output . --iterations 1 --profile --profile-interval 1.0 ``` -## Build Comet Benchmark Docker Image +This writes a `{name}-{benchmark}-metrics.csv` alongside the result JSON. + +For container-level memory profiling, use the constrained docker-compose +overlay — see [Docker infrastructure](infra/docker/). + +## Generating Charts -Build the benchmark Docker image and push to the Microk8s Docker registry. +```bash +# Compare two result JSON files +python -m benchmarks.analysis.compare \ + comet-tpch-*.json spark-tpch-*.json \ + --labels Comet Spark --benchmark tpch \ + --title "TPC-H SF100" --output-dir ./charts -```shell -docker build -t apache/datafusion-comet-tpcbench . -docker tag apache/datafusion-comet-tpcbench localhost:32000/apache/datafusion-comet-tpcbench:latest -docker push localhost:32000/apache/datafusion-comet-tpcbench:latest +# Generate memory reports +python -m benchmarks.analysis.memory_report \ + --spark-csv comet-tpch-metrics.csv \ + --container-csv container-metrics.csv \ + --output-dir ./charts ``` -## Run benchmarks - -```shell -export SPARK_MASTER=k8s://https://127.0.0.1:16443 -export COMET_DOCKER_IMAGE=localhost:32000/apache/datafusion-comet-tpcbench:latest -# Location of Comet JAR within the Docker image -export COMET_JAR=/opt/spark/jars/comet-spark-spark3.4_2.12-0.5.0-SNAPSHOT.jar - -$SPARK_HOME/bin/spark-submit \ - --master $SPARK_MASTER \ - --deploy-mode cluster \ - --name comet-tpcbench \ - --driver-memory 8G \ - --conf spark.driver.memory=8G \ - --conf spark.executor.instances=1 \ - --conf spark.executor.memory=32G \ - --conf spark.executor.cores=8 \ - --conf spark.cores.max=8 \ - --conf spark.task.cpus=1 \ - --conf spark.executor.memoryOverhead=3G \ - --jars local://$COMET_JAR \ - --conf spark.executor.extraClassPath=$COMET_JAR \ - --conf spark.driver.extraClassPath=$COMET_JAR \ - --conf spark.plugins=org.apache.spark.CometPlugin \ - --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \ - --conf spark.comet.enabled=true \ - --conf spark.comet.exec.enabled=true \ - --conf spark.comet.exec.all.enabled=true \ - --conf spark.comet.cast.allowIncompatible=true \ - --conf spark.comet.exec.shuffle.enabled=true \ - --conf spark.comet.exec.shuffle.mode=auto \ - --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ - --conf spark.kubernetes.namespace=default \ - --conf spark.kubernetes.driver.pod.name=tpcbench \ - --conf spark.kubernetes.container.image=$COMET_DOCKER_IMAGE \ - --conf spark.kubernetes.driver.volumes.hostPath.tpcdata.mount.path=/mnt/bigdata/tpcds/sf100/ \ - --conf spark.kubernetes.driver.volumes.hostPath.tpcdata.options.path=/mnt/bigdata/tpcds/sf100/ \ - --conf 
spark.kubernetes.executor.volumes.hostPath.tpcdata.mount.path=/mnt/bigdata/tpcds/sf100/ \ - --conf spark.kubernetes.executor.volumes.hostPath.tpcdata.options.path=/mnt/bigdata/tpcds/sf100/ \ - --conf spark.kubernetes.authenticate.caCertFile=/var/snap/microk8s/current/certs/ca.crt \ - local:///opt/datafusion-benchmarks/runners/datafusion-comet/tpcbench.py \ - --benchmark tpcds \ - --data /mnt/bigdata/tpcds/sf100/ \ - --queries /opt/datafusion-benchmarks/tpcds/queries-spark \ - --iterations 1 +## Running in Docker + +See [infra/docker/](infra/docker/) for docker-compose setup with optional +memory-constrained overlays and cgroup metrics collection. + +The Docker image includes both Java 8 and Java 17 runtimes. Java 17 is the +default (`JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64`), which is required +by Comet. Gluten requires Java 8, so override `JAVA_HOME` for all containers +when running Gluten benchmarks: + +```bash +# Start the cluster with Java 8 for Gluten +docker compose -f benchmarks/infra/docker/docker-compose.yml up -d + +# Run Gluten benchmark (override JAVA_HOME on all containers) +docker compose -f benchmarks/infra/docker/docker-compose.yml run --rm \ + -e JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ + -e GLUTEN_JAR=/jars/gluten.jar \ + bench bash -c 'python3 /opt/benchmarks/run.py \ + --engine gluten --profile docker \ + -- tpc --name gluten --benchmark tpch --data /data \ + --queries /queries --output /results --iterations 1' ``` + +> **Note:** The Spark worker must also run Java 8 for Gluten. Use a +> docker-compose override file to set `JAVA_HOME` on `spark-master` and +> `spark-worker` services before starting the cluster, or restart the +> cluster between engine switches. + diff --git a/benchmarks/analysis/__init__.py b/benchmarks/analysis/__init__.py new file mode 100644 index 0000000000..0ccbeeeafb --- /dev/null +++ b/benchmarks/analysis/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/benchmarks/analysis/compare.py b/benchmarks/analysis/compare.py new file mode 100644 index 0000000000..b9a24acc57 --- /dev/null +++ b/benchmarks/analysis/compare.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Compare benchmark results and generate charts. + +Reads the JSON output produced by ``suites/tpc.py`` (integer query keys +serialised as strings by ``json.dumps``). + +Usage:: + + python -m benchmarks.analysis.compare \\ + comet-tpch-*.json spark-tpch-*.json \\ + --labels comet spark --benchmark tpch --title "SF100" \\ + --output-dir ./charts +""" + +import argparse +import json +import os +import sys +from typing import Any, Dict, List, Sequence, Tuple + +import matplotlib.pyplot as plt +import numpy as np + + +QUERY_COUNTS = {"tpch": 22, "tpcds": 99} + + +def _query_range(benchmark: str) -> range: + n = QUERY_COUNTS.get(benchmark) + if n is None: + raise ValueError(f"Unknown benchmark: {benchmark}") + return range(1, n + 1) + + +def _median(timings: List[float]) -> float: + return float(np.median(np.array(timings))) + + +# --------------------------------------------------------------------------- +# Chart generators +# --------------------------------------------------------------------------- + +def generate_summary_chart( + results: Sequence[Dict[str, Any]], + labels: Sequence[str], + benchmark: str, + title: str, + output_dir: str = ".", +) -> str: + """Total wall-clock bar chart. Returns the output path.""" + num_queries = QUERY_COUNTS[benchmark] + timings = [0.0] * len(results) + for query in _query_range(benchmark): + for i, r in enumerate(results): + timings[i] += _median(r[str(query)]) + + fig, ax = plt.subplots(figsize=(10, 6)) + ax.set_title(title) + ax.set_ylabel( + f"Time in seconds to run all {num_queries} {benchmark} queries " + f"(lower is better)" + ) + times = [round(x, 0) for x in timings] + bars = ax.bar(labels, times, color="skyblue", width=0.8) + for bar in bars: + yval = bar.get_height() + ax.text( + bar.get_x() + bar.get_width() / 2.0, yval, f"{yval}", + va="bottom", ha="center", + ) + path = os.path.join(output_dir, f"{benchmark}_allqueries.png") + plt.savefig(path, format="png") + plt.close(fig) + return path + + +def generate_comparison_chart( + results: Sequence[Dict[str, Any]], + labels: Sequence[str], + benchmark: str, + title: str, + output_dir: str = ".", +) -> str: + """Per-query grouped bar chart. 
Returns the output path.""" + queries: List[str] = [] + benches: List[List[float]] = [[] for _ in results] + for query in _query_range(benchmark): + queries.append(f"q{query}") + for i, r in enumerate(results): + benches[i].append(_median(r[str(query)])) + + bar_width = 0.3 + index = np.arange(len(queries)) * 1.5 + fig_w = 15 if benchmark == "tpch" else 35 + fig, ax = plt.subplots(figsize=(fig_w, 6)) + + for i, label in enumerate(labels): + ax.bar(index + i * bar_width, benches[i], bar_width, label=label) + + ax.set_title(title) + ax.set_xlabel("Queries") + ax.set_ylabel("Query Time (seconds)") + ax.set_xticks(index + bar_width / 2) + ax.set_xticklabels(queries) + ax.legend() + + path = os.path.join(output_dir, f"{benchmark}_queries_compare.png") + plt.savefig(path, format="png") + plt.close(fig) + return path + + +def _speedup_data( + baseline: Dict, comparison: Dict, benchmark: str, absolute: bool, +) -> Tuple[List[str], List[float]]: + """Compute per-query speedup (relative % or absolute seconds).""" + rows: List[Tuple[str, float]] = [] + for query in _query_range(benchmark): + a = _median(baseline[str(query)]) + b = _median(comparison[str(query)]) + if absolute: + rows.append((f"q{query}", round(a - b, 1))) + else: + if a > b: + speedup = a / b - 1 + else: + speedup = -(1 / (a / b) - 1) + rows.append((f"q{query}", round(speedup * 100, 0))) + rows.sort(key=lambda x: -x[1]) + qs, vals = zip(*rows) + return list(qs), list(vals) + + +def generate_speedup_chart( + baseline: Dict[str, Any], + comparison: Dict[str, Any], + label1: str, + label2: str, + benchmark: str, + title: str, + absolute: bool = False, + output_dir: str = ".", +) -> str: + """Relative (%) or absolute (seconds) speedup chart. Returns path.""" + queries, speedups = _speedup_data(baseline, comparison, benchmark, absolute) + + fig_w = 10 if benchmark == "tpch" else 35 + fig_h = 6 if benchmark == "tpch" else 10 + fig, ax = plt.subplots(figsize=(fig_w, fig_h)) + bars = ax.bar(queries, speedups, color="skyblue") + + for bar, val in zip(bars, speedups): + yval = bar.get_height() + fmt = f"{val:.1f}" if absolute else f"{val:.0f}%" + va = "bottom" if yval >= 0 else "top" + y = min(800, yval + 5) if yval >= 0 else yval + ax.text( + bar.get_x() + bar.get_width() / 2.0, y, fmt, + va=va, ha="center", fontsize=8, color="blue", rotation=90, + ) + + kind = "seconds" if absolute else "percentage" + suffix = "abs" if absolute else "rel" + ylabel = "Speedup (in seconds)" if absolute else "Speedup Percentage (100% speedup = 2x faster)" + ax.set_title(f"{label2} speedup over {label1} ({title})") + ax.set_ylabel(ylabel) + ax.set_xlabel("Query") + ax.axhline(0, color="black", linewidth=0.8) + ax.yaxis.grid(True) + + if not absolute: + min_val = (min(speedups) // 100) * 100 + max_val = ((max(speedups) // 100) + 1) * 100 + 50 + if benchmark == "tpch": + ax.set_ylim(min_val, max_val) + else: + ax.set_ylim(-250, 300) + else: + ax.set_ylim(min(speedups) * 2 - 20, max(speedups) * 1.5) + + path = os.path.join(output_dir, f"{benchmark}_queries_speedup_{suffix}.png") + plt.savefig(path, format="png") + plt.close(fig) + return path + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def compare( + files: Sequence[str], + labels: Sequence[str], + benchmark: str, + title: str, + output_dir: str = ".", +) -> List[str]: + """Run all applicable charts. 
Returns list of output file paths.""" + os.makedirs(output_dir, exist_ok=True) + results = [] + for filename in files: + with open(filename) as f: + results.append(json.load(f)) + + paths = [ + generate_summary_chart(results, labels, benchmark, title, output_dir), + generate_comparison_chart(results, labels, benchmark, title, output_dir), + ] + + if len(files) == 2: + paths.append( + generate_speedup_chart( + results[0], results[1], labels[0], labels[1], + benchmark, title, absolute=True, output_dir=output_dir, + ) + ) + paths.append( + generate_speedup_chart( + results[0], results[1], labels[0], labels[1], + benchmark, title, absolute=False, output_dir=output_dir, + ) + ) + + for p in paths: + print(f"Wrote {p}") + return paths + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def main(argv=None): + parser = argparse.ArgumentParser( + description="Compare benchmark results and generate charts", + ) + parser.add_argument("filenames", nargs="+", help="JSON result files") + parser.add_argument("--labels", nargs="+", required=True, help="Labels for each file") + parser.add_argument("--benchmark", required=True, help="tpch or tpcds") + parser.add_argument("--title", required=True, help="Chart title") + parser.add_argument("--output-dir", default=".", help="Directory for chart PNGs") + args = parser.parse_args(argv) + + if len(args.filenames) != len(args.labels): + parser.error("Number of filenames must match number of labels") + + compare(args.filenames, args.labels, args.benchmark, args.title, args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/analysis/memory_report.py b/benchmarks/analysis/memory_report.py new file mode 100644 index 0000000000..e77ff10dc0 --- /dev/null +++ b/benchmarks/analysis/memory_report.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Parse profiling output and generate memory utilisation reports. + +Supports two data sources: + +1. **Spark REST API metrics** — CSV written by ``runner/profiling.py`` + (``SparkMetricsProfiler``). Columns include ``elapsed_secs``, + ``executor_id``, ``memoryUsed``, ``maxMemory``, and various peak metrics. + +2. **Container cgroup metrics** — CSV written by + ``infra/docker/collect-metrics.sh``. Columns: + ``timestamp_ms, memory_usage_bytes, memory_limit_bytes, rss_bytes, + cache_bytes, swap_bytes``. 
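+
+Example container CSV (header plus one illustrative sample row; the values
+are invented for illustration)::
+
+    timestamp_ms,memory_usage_bytes,memory_limit_bytes,rss_bytes,cache_bytes,swap_bytes
+    1700000000000,4294967296,6442450944,3221225472,805306368,0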
+ +Usage:: + + python -m benchmarks.analysis.memory_report \\ + --spark-csv results/comet-tpch-metrics.csv \\ + --container-csv results/container-metrics.csv \\ + --output-dir ./charts +""" + +import argparse +import csv +import os +import sys +from typing import Dict, List, Optional + +import matplotlib.pyplot as plt + + +# --------------------------------------------------------------------------- +# Spark REST API metrics +# --------------------------------------------------------------------------- + +def parse_spark_csv(path: str) -> Dict[str, List[Dict]]: + """Parse a SparkMetricsProfiler CSV into per-executor time series. + + Returns ``{executor_id: [{elapsed_secs, memoryUsed, maxMemory, ...}]}`` + """ + executors: Dict[str, List[Dict]] = {} + with open(path, newline="") as f: + reader = csv.DictReader(f) + for row in reader: + eid = row.get("executor_id", "unknown") + parsed = {} + for k, v in row.items(): + try: + parsed[k] = float(v) + except (ValueError, TypeError): + parsed[k] = v + executors.setdefault(eid, []).append(parsed) + return executors + + +def generate_spark_memory_chart( + spark_csv: str, + output_dir: str = ".", +) -> List[str]: + """Generate per-executor memory usage over time. Returns output paths.""" + executors = parse_spark_csv(spark_csv) + paths = [] + + for eid, samples in executors.items(): + elapsed = [s.get("elapsed_secs", 0) for s in samples] + used = [s.get("memoryUsed", 0) / (1024 ** 2) for s in samples] # MB + max_mem = [s.get("maxMemory", 0) / (1024 ** 2) for s in samples] + + fig, ax = plt.subplots(figsize=(12, 5)) + ax.plot(elapsed, used, label="memoryUsed", linewidth=1.5) + if any(m > 0 for m in max_mem): + ax.plot(elapsed, max_mem, label="maxMemory", linestyle="--", alpha=0.6) + ax.set_xlabel("Elapsed (seconds)") + ax.set_ylabel("Memory (MB)") + ax.set_title(f"Executor {eid} — JVM Memory Usage") + ax.legend() + ax.grid(True, alpha=0.3) + + fname = f"spark_memory_executor_{eid}.png" + path = os.path.join(output_dir, fname) + plt.savefig(path, format="png") + plt.close(fig) + paths.append(path) + + # Peak memory bar chart across executors + if executors: + fig, ax = plt.subplots(figsize=(max(6, len(executors) * 1.5), 5)) + eids = list(executors.keys()) + peaks = [] + for eid in eids: + peak = max( + (s.get("peak_JVMHeapMemory", 0) + s.get("peak_JVMOffHeapMemory", 0)) + for s in executors[eid] + ) / (1024 ** 2) + peaks.append(peak) + + bars = ax.bar(eids, peaks, color="coral") + for bar, val in zip(bars, peaks): + ax.text( + bar.get_x() + bar.get_width() / 2.0, val, + f"{val:.0f}", va="bottom", ha="center", fontsize=9, + ) + ax.set_xlabel("Executor") + ax.set_ylabel("Peak JVM Memory (MB)") + ax.set_title("Peak JVM Memory by Executor") + ax.grid(True, axis="y", alpha=0.3) + + path = os.path.join(output_dir, "spark_memory_peak.png") + plt.savefig(path, format="png") + plt.close(fig) + paths.append(path) + + for p in paths: + print(f"Wrote {p}") + return paths + + +# --------------------------------------------------------------------------- +# Container cgroup metrics +# --------------------------------------------------------------------------- + +def parse_container_csv(path: str) -> List[Dict[str, float]]: + """Parse a collect-metrics.sh CSV into a list of samples.""" + samples = [] + with open(path, newline="") as f: + reader = csv.DictReader(f) + for row in reader: + parsed = {} + for k, v in row.items(): + try: + parsed[k] = float(v) + except (ValueError, TypeError): + parsed[k] = v + samples.append(parsed) + return samples + + +def 
generate_container_memory_chart( + container_csv: str, + output_dir: str = ".", +) -> List[str]: + """Generate container memory usage over time. Returns output paths.""" + samples = parse_container_csv(container_csv) + if not samples: + print("No container metrics samples found") + return [] + + t0 = samples[0].get("timestamp_ms", 0) + elapsed = [(s.get("timestamp_ms", 0) - t0) / 1000.0 for s in samples] + usage_mb = [s.get("memory_usage_bytes", 0) / (1024 ** 2) for s in samples] + rss_mb = [s.get("rss_bytes", 0) / (1024 ** 2) for s in samples] + cache_mb = [s.get("cache_bytes", 0) / (1024 ** 2) for s in samples] + limit_mb = [s.get("memory_limit_bytes", 0) / (1024 ** 2) for s in samples] + + fig, ax = plt.subplots(figsize=(12, 5)) + ax.plot(elapsed, usage_mb, label="total usage", linewidth=1.5) + ax.plot(elapsed, rss_mb, label="RSS", linewidth=1.2) + ax.plot(elapsed, cache_mb, label="cache", linewidth=1.0, alpha=0.7) + if any(m > 0 for m in limit_mb): + ax.axhline( + limit_mb[0], color="red", linestyle="--", linewidth=1.0, + label=f"limit ({limit_mb[0]:.0f} MB)", + ) + ax.set_xlabel("Elapsed (seconds)") + ax.set_ylabel("Memory (MB)") + ax.set_title("Container Memory Usage (cgroup)") + ax.legend() + ax.grid(True, alpha=0.3) + + paths = [] + path = os.path.join(output_dir, "container_memory.png") + plt.savefig(path, format="png") + plt.close(fig) + paths.append(path) + + # Summary stats + peak_usage = max(usage_mb) + peak_rss = max(rss_mb) + limit = limit_mb[0] if limit_mb else 0 + print(f"Container memory summary:") + print(f" Peak usage: {peak_usage:.0f} MB") + print(f" Peak RSS: {peak_rss:.0f} MB") + if limit > 0: + print(f" Limit: {limit:.0f} MB") + print(f" Peak % used: {peak_usage / limit * 100:.1f}%") + + for p in paths: + print(f"Wrote {p}") + return paths + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def main(argv=None): + parser = argparse.ArgumentParser( + description="Generate memory utilisation reports from profiling data", + ) + parser.add_argument( + "--spark-csv", help="Path to SparkMetricsProfiler CSV", + ) + parser.add_argument( + "--container-csv", help="Path to collect-metrics.sh CSV", + ) + parser.add_argument( + "--output-dir", default=".", help="Directory for chart PNGs", + ) + args = parser.parse_args(argv) + + if not args.spark_csv and not args.container_csv: + parser.error("At least one of --spark-csv or --container-csv is required") + + os.makedirs(args.output_dir, exist_ok=True) + + if args.spark_csv: + generate_spark_memory_chart(args.spark_csv, args.output_dir) + if args.container_csv: + generate_container_memory_chart(args.container_csv, args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/conf/engines/comet-iceberg.conf b/benchmarks/conf/engines/comet-iceberg.conf new file mode 100644 index 0000000000..bfdfd4ccf6 --- /dev/null +++ b/benchmarks/conf/engines/comet-iceberg.conf @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# DataFusion Comet with native Iceberg scanning. +# Catalog configs (spark.sql.catalog.*, spark.sql.defaultCatalog) should be +# passed via --conf CLI overrides since the catalog name is user-specific. +runner.name=comet-iceberg +runner.jars=${COMET_JAR},${ICEBERG_JAR} + +spark.driver.extraClassPath=${COMET_JAR}:${ICEBERG_JAR} +spark.executor.extraClassPath=${COMET_JAR}:${ICEBERG_JAR} +spark.plugins=org.apache.spark.CometPlugin +spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager +spark.comet.exec.replaceSortMergeJoin=true +spark.comet.expression.Cast.allowIncompatible=true +spark.comet.enabled=true +spark.comet.exec.enabled=true +spark.comet.scan.icebergNative.enabled=true +spark.comet.explainFallback.enabled=true diff --git a/benchmarks/conf/engines/comet-jvm-shuffle.conf b/benchmarks/conf/engines/comet-jvm-shuffle.conf new file mode 100644 index 0000000000..12b3d23a18 --- /dev/null +++ b/benchmarks/conf/engines/comet-jvm-shuffle.conf @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Comet with JVM shuffle mode — for shuffle benchmarks. +runner.name=comet-jvm-shuffle +runner.jars=${COMET_JAR} + +spark.driver.extraClassPath=${COMET_JAR} +spark.executor.extraClassPath=${COMET_JAR} +spark.memory.offHeap.enabled=true +spark.memory.offHeap.size=16g +spark.comet.enabled=true +spark.comet.operator.DataWritingCommandExec.allowIncompatible=true +spark.comet.parquet.write.enabled=true +spark.comet.logFallbackReasons.enabled=true +spark.comet.explainFallback.enabled=true +spark.comet.shuffle.mode=jvm +spark.comet.exec.shuffle.mode=jvm +spark.comet.exec.replaceSortMergeJoin=true +spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager +spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions +spark.comet.cast.allowIncompatible=true diff --git a/benchmarks/conf/engines/comet-native-shuffle.conf b/benchmarks/conf/engines/comet-native-shuffle.conf new file mode 100644 index 0000000000..0df2eac0c6 --- /dev/null +++ b/benchmarks/conf/engines/comet-native-shuffle.conf @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Comet with native shuffle mode — for shuffle benchmarks. +runner.name=comet-native-shuffle +runner.jars=${COMET_JAR} + +spark.driver.extraClassPath=${COMET_JAR} +spark.executor.extraClassPath=${COMET_JAR} +spark.memory.offHeap.enabled=true +spark.memory.offHeap.size=16g +spark.comet.enabled=true +spark.comet.operator.DataWritingCommandExec.allowIncompatible=true +spark.comet.parquet.write.enabled=true +spark.comet.logFallbackReasons.enabled=true +spark.comet.explainFallback.enabled=true +spark.comet.exec.shuffle.mode=native +spark.comet.exec.replaceSortMergeJoin=true +spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager +spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions +spark.comet.cast.allowIncompatible=true diff --git a/benchmarks/conf/engines/comet.conf b/benchmarks/conf/engines/comet.conf new file mode 100644 index 0000000000..257fd7dd56 --- /dev/null +++ b/benchmarks/conf/engines/comet.conf @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# DataFusion Comet accelerator. +runner.name=comet +runner.jars=${COMET_JAR} + +spark.driver.extraClassPath=${COMET_JAR} +spark.executor.extraClassPath=${COMET_JAR} +spark.plugins=org.apache.spark.CometPlugin +spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager +spark.comet.scan.impl=native_datafusion +spark.comet.exec.replaceSortMergeJoin=true +spark.comet.expression.Cast.allowIncompatible=true diff --git a/benchmarks/conf/engines/gluten.conf b/benchmarks/conf/engines/gluten.conf new file mode 100644 index 0000000000..91599c5bde --- /dev/null +++ b/benchmarks/conf/engines/gluten.conf @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Gluten accelerator. +runner.name=gluten +runner.jars=${GLUTEN_JAR} +runner.env.TZ=UTC + +spark.driver.extraClassPath=${GLUTEN_JAR} +spark.executor.extraClassPath=${GLUTEN_JAR} +spark.plugins=org.apache.gluten.GlutenPlugin +spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager +spark.gluten.sql.columnar.forceShuffledHashJoin=true +spark.sql.session.timeZone=UTC diff --git a/benchmarks/conf/engines/spark-shuffle.conf b/benchmarks/conf/engines/spark-shuffle.conf new file mode 100644 index 0000000000..2b087a129b --- /dev/null +++ b/benchmarks/conf/engines/spark-shuffle.conf @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Vanilla Spark baseline for shuffle benchmarks — Comet explicitly disabled. +runner.name=spark-shuffle + +spark.comet.enabled=false +spark.comet.exec.shuffle.enabled=false diff --git a/benchmarks/conf/engines/spark.conf b/benchmarks/conf/engines/spark.conf new file mode 100644 index 0000000000..e1831c4ae5 --- /dev/null +++ b/benchmarks/conf/engines/spark.conf @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Vanilla Spark — no accelerator plugin. +runner.name=spark diff --git a/benchmarks/conf/profiles/docker.conf b/benchmarks/conf/profiles/docker.conf new file mode 100644 index 0000000000..9b2bec6841 --- /dev/null +++ b/benchmarks/conf/profiles/docker.conf @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Profile for running inside docker-compose (see infra/docker/). +# Data is mounted at /data, queries at /queries, results at /results. +runner.master=${SPARK_MASTER} + +spark.driver.memory=8G +spark.executor.instances=1 +spark.executor.cores=8 +spark.cores.max=8 +spark.executor.memory=16g +spark.memory.offHeap.enabled=true +spark.memory.offHeap.size=16g +spark.eventLog.enabled=true diff --git a/benchmarks/conf/profiles/local.conf b/benchmarks/conf/profiles/local.conf new file mode 100644 index 0000000000..75bb8454b3 --- /dev/null +++ b/benchmarks/conf/profiles/local.conf @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Local mode — runs on local[*] with no master URL. +runner.master=local[*] + +spark.driver.memory=8G +spark.executor.memory=16g +spark.memory.offHeap.enabled=true +spark.memory.offHeap.size=16g diff --git a/benchmarks/conf/profiles/standalone-tpcds.conf b/benchmarks/conf/profiles/standalone-tpcds.conf new file mode 100644 index 0000000000..c892a7e77f --- /dev/null +++ b/benchmarks/conf/profiles/standalone-tpcds.conf @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Standalone cluster profile for TPC-DS: 2 executors, 16 cores total. 
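+#
+# Example invocation (assumes SPARK_MASTER is exported and that TPCDS_DATA
+# and TPCDS_QUERIES point at the data and query directories):
+#
+#   python benchmarks/run.py --engine comet --profile standalone-tpcds \
+#     -- tpc --benchmark tpcds --data $TPCDS_DATA --queries $TPCDS_QUERIES \
+#     --output . --iterations 1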
+runner.master=${SPARK_MASTER} + +spark.driver.memory=8G +spark.executor.instances=2 +spark.executor.cores=8 +spark.cores.max=16 +spark.executor.memory=16g +spark.memory.offHeap.enabled=true +spark.memory.offHeap.size=16g +spark.eventLog.enabled=true +spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem +spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain diff --git a/benchmarks/conf/profiles/standalone-tpch.conf b/benchmarks/conf/profiles/standalone-tpch.conf new file mode 100644 index 0000000000..024a7364f3 --- /dev/null +++ b/benchmarks/conf/profiles/standalone-tpch.conf @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Standalone cluster profile for TPC-H: 1 executor, 8 cores. +runner.master=${SPARK_MASTER} + +spark.driver.memory=8G +spark.executor.instances=1 +spark.executor.cores=8 +spark.cores.max=8 +spark.executor.memory=16g +spark.memory.offHeap.enabled=true +spark.memory.offHeap.size=16g +spark.eventLog.enabled=true +spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem +spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain diff --git a/dev/benchmarks/create-iceberg-tpch.py b/benchmarks/create-iceberg-tpch.py similarity index 100% rename from dev/benchmarks/create-iceberg-tpch.py rename to benchmarks/create-iceberg-tpch.py diff --git a/dev/benchmarks/drop-caches.sh b/benchmarks/drop-caches.sh similarity index 100% rename from dev/benchmarks/drop-caches.sh rename to benchmarks/drop-caches.sh diff --git a/benchmarks/pyspark/generate_data.py b/benchmarks/generate_shuffle_data.py old mode 100755 new mode 100644 similarity index 100% rename from benchmarks/pyspark/generate_data.py rename to benchmarks/generate_shuffle_data.py diff --git a/benchmarks/infra/docker/Dockerfile b/benchmarks/infra/docker/Dockerfile new file mode 100644 index 0000000000..861411819a --- /dev/null +++ b/benchmarks/infra/docker/Dockerfile @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Unified benchmark image for running TPC and shuffle benchmarks across +# engines (Spark, Comet, Gluten). +# +# Build: +# docker build -t comet-bench -f benchmarks/infra/docker/Dockerfile . +# +# The build context should be the repository root so that benchmarks/ is +# available. + +ARG SPARK_IMAGE=apache/spark:3.5.2-python3 +FROM ${SPARK_IMAGE} + +USER root + +RUN apt-get update \ + && apt-get install -y --no-install-recommends python3 python3-pip procps \ + openjdk-8-jre-headless openjdk-17-jre-headless \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Default to Java 17 (Comet). Override with JAVA_HOME for other engines. +ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 + +# Copy the unified benchmark runner into the image. +COPY benchmarks/conf /opt/benchmarks/conf +COPY benchmarks/runner /opt/benchmarks/runner +COPY benchmarks/suites /opt/benchmarks/suites +COPY benchmarks/queries /opt/benchmarks/queries +COPY benchmarks/run.py /opt/benchmarks/run.py + +# Copy the metrics collector script. +COPY benchmarks/infra/docker/collect-metrics.sh /opt/benchmarks/collect-metrics.sh +RUN chmod +x /opt/benchmarks/collect-metrics.sh + +# Engine JARs are bind-mounted or copied in at runtime via --jars. +# Data and query paths are also bind-mounted. + +ENV PYTHONPATH="/opt:${PYTHONPATH}" + +WORKDIR /opt/benchmarks + +USER ${spark_uid} diff --git a/benchmarks/infra/docker/collect-metrics.sh b/benchmarks/infra/docker/collect-metrics.sh new file mode 100755 index 0000000000..fd9c1d848f --- /dev/null +++ b/benchmarks/infra/docker/collect-metrics.sh @@ -0,0 +1,103 @@ +#!/bin/sh +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Container-level memory metrics collector. +# +# Polls cgroup memory stats at a fixed interval and writes a CSV with +# columns: timestamp, memory_usage_bytes, memory_limit_bytes, rss_bytes, +# cache_bytes, swap_bytes. +# +# Works with both cgroup v1 and v2. +# +# Usage: +# collect-metrics.sh [INTERVAL_SECS] [OUTPUT_CSV] +# +# Defaults: interval=1, output=/results/container-metrics.csv + +set -e + +INTERVAL="${1:-1}" +OUTPUT="${2:-/results/container-metrics.csv}" + +# Detect cgroup version +if [ -f /sys/fs/cgroup/memory/memory.usage_in_bytes ]; then + CGROUP_VERSION=1 +elif [ -f /sys/fs/cgroup/memory.current ]; then + CGROUP_VERSION=2 +else + echo "Warning: cannot detect cgroup memory files; polling disabled" >&2 + # Still write a header so downstream tools don't break on a missing file. + echo "timestamp_ms,memory_usage_bytes,memory_limit_bytes,rss_bytes,cache_bytes,swap_bytes" > "$OUTPUT" + # Sleep forever so the container stays up (compose expects it to keep running). + exec sleep infinity +fi + +# ---- helpers ---- + +read_file() { + # Return the contents of a file, or "0" if it doesn't exist. 
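+    # A missing path is expected here: cgroup v1 and v2 expose different
+    # files, and swap accounting may be disabled on the host.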
+    if [ -f "$1" ]; then cat "$1"; else echo "0"; fi
+}
+
+read_stat() {
+    # Extract a named field from memory.stat (cgroup v1 format: "key value").
+    # A grep | awk pipeline exits with awk's status even when grep matches
+    # nothing, so default to 0 explicitly for absent keys (field names
+    # differ between cgroup v1 and v2).
+    val=$(grep "^$1 " "$2" 2>/dev/null | awk '{print $2}')
+    echo "${val:-0}"
+}
+
+poll_v1() {
+    local usage limit rss cache swap
+    usage=$(read_file /sys/fs/cgroup/memory/memory.usage_in_bytes)
+    limit=$(read_file /sys/fs/cgroup/memory/memory.limit_in_bytes)
+    local stat=/sys/fs/cgroup/memory/memory.stat
+    rss=$(read_stat total_rss "$stat")
+    cache=$(read_stat total_cache "$stat")
+    swap=$(read_file /sys/fs/cgroup/memory/memory.memsw.usage_in_bytes)
+    # swap file reports memory+swap; subtract memory to get swap only
+    if [ "$swap" != "0" ]; then
+        swap=$((swap - usage))
+        [ "$swap" -lt 0 ] && swap=0
+    fi
+    echo "$usage,$limit,$rss,$cache,$swap"
+}
+
+poll_v2() {
+    local usage limit rss cache swap
+    usage=$(read_file /sys/fs/cgroup/memory.current)
+    limit=$(read_file /sys/fs/cgroup/memory.max)
+    [ "$limit" = "max" ] && limit=0
+    local stat=/sys/fs/cgroup/memory.stat
+    rss=$(read_stat anon "$stat")
+    cache=$(read_stat file "$stat")
+    swap=$(read_file /sys/fs/cgroup/memory.swap.current)
+    echo "$usage,$limit,$rss,$cache,$swap"
+}
+
+# ---- main loop ----
+
+echo "timestamp_ms,memory_usage_bytes,memory_limit_bytes,rss_bytes,cache_bytes,swap_bytes" > "$OUTPUT"
+echo "Collecting container memory metrics every ${INTERVAL}s -> ${OUTPUT} (cgroup v${CGROUP_VERSION})" >&2
+
+while true; do
+    ts=$(date +%s%3N 2>/dev/null || python3 -c 'import time; print(int(time.time()*1000))')
+    if [ "$CGROUP_VERSION" = "1" ]; then
+        vals=$(poll_v1)
+    else
+        vals=$(poll_v2)
+    fi
+    echo "${ts},${vals}" >> "$OUTPUT"
+    sleep "$INTERVAL"
+done
diff --git a/benchmarks/infra/docker/docker-compose.constrained.yml b/benchmarks/infra/docker/docker-compose.constrained.yml
new file mode 100644
index 0000000000..eff730d0e3
--- /dev/null
+++ b/benchmarks/infra/docker/docker-compose.constrained.yml
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Constrained memory overlay. 
+# +# Apply on top of docker-compose.yml to enforce hard memory limits and +# enable the metrics-collector sidecar: +# +# docker compose -f docker-compose.yml -f docker-compose.constrained.yml up -d +# +# Environment variables: +# WORKER_MEM_LIMIT - Hard memory limit for the worker (default: 6g) +# BENCH_MEM_LIMIT - Hard memory limit for the bench runner (default: 10g) +# METRICS_INTERVAL - Collection interval in seconds (default: 1) + +services: + spark-worker: + mem_limit: ${WORKER_MEM_LIMIT:-6g} + memswap_limit: ${WORKER_MEM_LIMIT:-6g} # same as mem_limit → no swap + + bench: + mem_limit: ${BENCH_MEM_LIMIT:-10g} + memswap_limit: ${BENCH_MEM_LIMIT:-10g} + + metrics-collector: + image: ${BENCH_IMAGE:-comet-bench} + container_name: metrics-collector + pid: "service:spark-worker" # share PID namespace with worker + command: + - /opt/benchmarks/collect-metrics.sh + - "${METRICS_INTERVAL:-1}" + - /results/container-metrics.csv + volumes: + - ${RESULTS_DIR:-/tmp/bench-results}:/results + depends_on: + - spark-worker diff --git a/benchmarks/infra/docker/docker-compose.yml b/benchmarks/infra/docker/docker-compose.yml new file mode 100644 index 0000000000..36261e3ded --- /dev/null +++ b/benchmarks/infra/docker/docker-compose.yml @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Spark standalone cluster for benchmarks. 
+# +# Usage: +# docker compose -f benchmarks/infra/docker/docker-compose.yml up -d +# +# Override with constrained memory limits: +# docker compose -f benchmarks/infra/docker/docker-compose.yml \ +# -f benchmarks/infra/docker/docker-compose.constrained.yml up -d +# +# Environment variables (set in .env or export before running): +# BENCH_IMAGE - Docker image to use (default: comet-bench) +# DATA_DIR - Host path to TPC data (default: /tmp/tpc-data) +# QUERIES_DIR - Host path to query SQL files (default: /tmp/tpc-queries) +# RESULTS_DIR - Host path for results output (default: /tmp/bench-results) +# ENGINE_JARS_DIR - Host path containing engine JARs (default: /tmp/engine-jars) + +services: + spark-master: + image: ${BENCH_IMAGE:-comet-bench} + container_name: spark-master + hostname: spark-master + command: /opt/spark/sbin/start-master.sh --host spark-master + ports: + - "7077:7077" + - "8080:8080" + volumes: + - ${DATA_DIR:-/tmp/tpc-data}:/data:ro + - ${QUERIES_DIR:-/tmp/tpc-queries}:/queries:ro + - ${RESULTS_DIR:-/tmp/bench-results}:/results + - ${ENGINE_JARS_DIR:-/tmp/engine-jars}:/jars:ro + environment: + - SPARK_MASTER_HOST=spark-master + - SPARK_NO_DAEMONIZE=true + + spark-worker: + image: ${BENCH_IMAGE:-comet-bench} + container_name: spark-worker + hostname: spark-worker + depends_on: + - spark-master + command: /opt/spark/sbin/start-worker.sh spark://spark-master:7077 + ports: + - "8081:8081" + volumes: + - ${DATA_DIR:-/tmp/tpc-data}:/data:ro + - ${QUERIES_DIR:-/tmp/tpc-queries}:/queries:ro + - ${RESULTS_DIR:-/tmp/bench-results}:/results + - ${ENGINE_JARS_DIR:-/tmp/engine-jars}:/jars:ro + environment: + - SPARK_WORKER_CORES=${WORKER_CORES:-8} + - SPARK_WORKER_MEMORY=${WORKER_MEMORY:-16g} + - SPARK_NO_DAEMONIZE=true + + bench: + image: ${BENCH_IMAGE:-comet-bench} + container_name: bench-runner + depends_on: + - spark-master + - spark-worker + # Override 'command' to run a specific benchmark, e.g.: + # docker compose run bench python /opt/benchmarks/run.py \ + # --engine comet --profile docker -- tpc ... + command: ["echo", "Use 'docker compose run bench python /opt/benchmarks/run.py ...' to run benchmarks"] + volumes: + - ${DATA_DIR:-/tmp/tpc-data}:/data:ro + - ${QUERIES_DIR:-/tmp/tpc-queries}:/queries:ro + - ${RESULTS_DIR:-/tmp/bench-results}:/results + - ${ENGINE_JARS_DIR:-/tmp/engine-jars}:/jars:ro + environment: + - SPARK_HOME=/opt/spark + - SPARK_MASTER=spark://spark-master:7077 + - COMET_JAR=/jars/comet.jar + - PYTHONPATH=/opt diff --git a/benchmarks/pyspark/README.md b/benchmarks/pyspark/README.md deleted file mode 100644 index 3fc55123f0..0000000000 --- a/benchmarks/pyspark/README.md +++ /dev/null @@ -1,178 +0,0 @@ - - -# PySpark Benchmarks - -A suite of PySpark benchmarks for comparing performance between Spark, Comet JVM, and Comet Native implementations. 
- -## Available Benchmarks - -Run `python run_benchmark.py --list-benchmarks` to see all available benchmarks: - -- **shuffle-hash** - Shuffle all columns using hash partitioning on group_key -- **shuffle-roundrobin** - Shuffle all columns using round-robin partitioning - -## Prerequisites - -- Apache Spark cluster (standalone, YARN, or Kubernetes) -- PySpark installed -- Comet JAR built - -## Build Comet JAR - -```bash -cd /path/to/datafusion-comet -make release -``` - -## Step 1: Generate Test Data - -Generate test data with realistic 50-column schema (nested structs, arrays, maps): - -```bash -spark-submit \ - --master spark://master:7077 \ - --executor-memory 16g \ - generate_data.py \ - --output /tmp/shuffle-benchmark-data \ - --rows 10000000 \ - --partitions 200 -``` - -### Data Generation Options - -| Option | Default | Description | -| -------------------- | ---------- | ---------------------------- | -| `--output`, `-o` | (required) | Output path for parquet data | -| `--rows`, `-r` | 10000000 | Number of rows | -| `--partitions`, `-p` | 200 | Number of output partitions | - -## Step 2: Run Benchmarks - -### List Available Benchmarks - -```bash -python run_benchmark.py --list-benchmarks -``` - -### Run Individual Benchmarks - -You can run specific benchmarks by name: - -```bash -# Hash partitioning shuffle - Spark baseline -spark-submit --master spark://master:7077 \ - run_benchmark.py --data /tmp/shuffle-benchmark-data --mode spark --benchmark shuffle-hash - -# Round-robin shuffle - Spark baseline -spark-submit --master spark://master:7077 \ - run_benchmark.py --data /tmp/shuffle-benchmark-data --mode spark --benchmark shuffle-roundrobin - -# Hash partitioning - Comet JVM shuffle -spark-submit --master spark://master:7077 \ - --jars /path/to/comet.jar \ - --conf spark.comet.enabled=true \ - --conf spark.comet.exec.shuffle.enabled=true \ - --conf spark.comet.shuffle.mode=jvm \ - --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ - run_benchmark.py --data /tmp/shuffle-benchmark-data --mode jvm --benchmark shuffle-hash - -# Round-robin - Comet Native shuffle -spark-submit --master spark://master:7077 \ - --jars /path/to/comet.jar \ - --conf spark.comet.enabled=true \ - --conf spark.comet.exec.shuffle.enabled=true \ - --conf spark.comet.exec.shuffle.mode=native \ - --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ - run_benchmark.py --data /tmp/shuffle-benchmark-data --mode native --benchmark shuffle-roundrobin -``` - -### Run All Benchmarks - -Use the provided script to run all benchmarks across all modes: - -```bash -SPARK_MASTER=spark://master:7077 \ -EXECUTOR_MEMORY=16g \ -./run_all_benchmarks.sh /tmp/shuffle-benchmark-data -``` - -## Checking Results - -Open the Spark UI (default: http://localhost:4040) during each benchmark run to compare shuffle write sizes in the Stages tab. - -## Adding New Benchmarks - -The benchmark framework makes it easy to add new benchmarks: - -1. 
**Create a benchmark class** in `benchmarks/` directory (or add to existing file): - -```python -from benchmarks.base import Benchmark - -class MyBenchmark(Benchmark): - @classmethod - def name(cls) -> str: - return "my-benchmark" - - @classmethod - def description(cls) -> str: - return "Description of what this benchmark does" - - def run(self) -> Dict[str, Any]: - # Read data - df = self.spark.read.parquet(self.data_path) - - # Run your benchmark operation - def benchmark_operation(): - result = df.filter(...).groupBy(...).agg(...) - result.write.mode("overwrite").parquet("/tmp/output") - - # Time it - duration_ms = self._time_operation(benchmark_operation) - - return { - 'duration_ms': duration_ms, - # Add any other metrics you want to track - } -``` - -2. **Register the benchmark** in `benchmarks/__init__.py`: - -```python -from .my_module import MyBenchmark - -_BENCHMARK_REGISTRY = { - # ... existing benchmarks - MyBenchmark.name(): MyBenchmark, -} -``` - -3. **Run your new benchmark**: - -```bash -python run_benchmark.py --data /path/to/data --mode spark --benchmark my-benchmark -``` - -The base `Benchmark` class provides: - -- Automatic timing via `_time_operation()` -- Standard output formatting via `execute_timed()` -- Access to SparkSession, data path, and mode -- Spark configuration printing diff --git a/benchmarks/pyspark/benchmarks/__init__.py b/benchmarks/pyspark/benchmarks/__init__.py deleted file mode 100644 index 7d913a7d6d..0000000000 --- a/benchmarks/pyspark/benchmarks/__init__.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python3 -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Benchmark registry for PySpark benchmarks. - -This module provides a central registry for discovering and running benchmarks. -""" - -from typing import Dict, Type, List - -from .base import Benchmark -from .shuffle import ShuffleHashBenchmark, ShuffleRoundRobinBenchmark - - -# Registry of all available benchmarks -_BENCHMARK_REGISTRY: Dict[str, Type[Benchmark]] = { - ShuffleHashBenchmark.name(): ShuffleHashBenchmark, - ShuffleRoundRobinBenchmark.name(): ShuffleRoundRobinBenchmark, -} - - -def get_benchmark(name: str) -> Type[Benchmark]: - """ - Get a benchmark class by name. - - Args: - name: Benchmark name - - Returns: - Benchmark class - - Raises: - KeyError: If benchmark name is not found - """ - if name not in _BENCHMARK_REGISTRY: - available = ", ".join(sorted(_BENCHMARK_REGISTRY.keys())) - raise KeyError( - f"Unknown benchmark: {name}. Available benchmarks: {available}" - ) - return _BENCHMARK_REGISTRY[name] - - -def list_benchmarks() -> List[tuple[str, str]]: - """ - List all available benchmarks. 
- - Returns: - List of (name, description) tuples - """ - benchmarks = [] - for name in sorted(_BENCHMARK_REGISTRY.keys()): - benchmark_cls = _BENCHMARK_REGISTRY[name] - benchmarks.append((name, benchmark_cls.description())) - return benchmarks - - -__all__ = [ - 'Benchmark', - 'get_benchmark', - 'list_benchmarks', - 'ShuffleHashBenchmark', - 'ShuffleRoundRobinBenchmark', -] diff --git a/benchmarks/pyspark/benchmarks/base.py b/benchmarks/pyspark/benchmarks/base.py deleted file mode 100644 index 7e8e8db5a9..0000000000 --- a/benchmarks/pyspark/benchmarks/base.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python3 -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Base benchmark class providing common functionality for all benchmarks. -""" - -import time -from abc import ABC, abstractmethod -from typing import Dict, Any - -from pyspark.sql import SparkSession - - -class Benchmark(ABC): - """Base class for all PySpark benchmarks.""" - - def __init__(self, spark: SparkSession, data_path: str, mode: str): - """ - Initialize benchmark. - - Args: - spark: SparkSession instance - data_path: Path to input data - mode: Execution mode (spark, jvm, native) - """ - self.spark = spark - self.data_path = data_path - self.mode = mode - - @classmethod - @abstractmethod - def name(cls) -> str: - """Return the benchmark name (used for CLI).""" - pass - - @classmethod - @abstractmethod - def description(cls) -> str: - """Return a short description of the benchmark.""" - pass - - @abstractmethod - def run(self) -> Dict[str, Any]: - """ - Run the benchmark and return results. - - Returns: - Dictionary containing benchmark results (must include 'duration_ms') - """ - pass - - def execute_timed(self) -> Dict[str, Any]: - """ - Execute the benchmark with timing and standard output. 
- - Returns: - Dictionary containing benchmark results - """ - print(f"\n{'=' * 80}") - print(f"Benchmark: {self.name()}") - print(f"Mode: {self.mode.upper()}") - print(f"{'=' * 80}") - print(f"Data path: {self.data_path}") - - # Print relevant Spark configuration - self._print_spark_config() - - # Clear cache before running - self.spark.catalog.clearCache() - - # Run the benchmark - print(f"\nRunning benchmark...") - results = self.run() - - # Print results - print(f"\nDuration: {results['duration_ms']:,} ms") - if 'row_count' in results: - print(f"Rows processed: {results['row_count']:,}") - - # Print any additional metrics - for key, value in results.items(): - if key not in ['duration_ms', 'row_count']: - print(f"{key}: {value}") - - print(f"{'=' * 80}\n") - - return results - - def _print_spark_config(self): - """Print relevant Spark configuration.""" - conf = self.spark.sparkContext.getConf() - print(f"Shuffle manager: {conf.get('spark.shuffle.manager', 'default')}") - print(f"Comet enabled: {conf.get('spark.comet.enabled', 'false')}") - print(f"Comet shuffle enabled: {conf.get('spark.comet.exec.shuffle.enabled', 'false')}") - print(f"Comet shuffle mode: {conf.get('spark.comet.shuffle.mode', 'not set')}") - print(f"Spark UI: {self.spark.sparkContext.uiWebUrl}") - - def _time_operation(self, operation_fn): - """ - Time an operation and return duration in milliseconds. - - Args: - operation_fn: Function to time (takes no arguments) - - Returns: - Duration in milliseconds - """ - start_time = time.time() - operation_fn() - duration_ms = int((time.time() - start_time) * 1000) - return duration_ms diff --git a/benchmarks/pyspark/benchmarks/shuffle.py b/benchmarks/pyspark/benchmarks/shuffle.py deleted file mode 100644 index 0facd2340d..0000000000 --- a/benchmarks/pyspark/benchmarks/shuffle.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python3 -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Shuffle benchmarks for comparing shuffle file sizes and performance. - -These benchmarks test different partitioning strategies (hash, round-robin) -across Spark, Comet JVM, and Comet Native shuffle implementations. -""" - -from typing import Dict, Any -from pyspark.sql import DataFrame - -from .base import Benchmark - - -class ShuffleBenchmark(Benchmark): - """Base class for shuffle benchmarks with common repartitioning logic.""" - - def __init__(self, spark, data_path: str, mode: str, num_partitions: int = 200): - """ - Initialize shuffle benchmark. 
- - Args: - spark: SparkSession instance - data_path: Path to input parquet data - mode: Execution mode (spark, jvm, native) - num_partitions: Number of partitions to shuffle to - """ - super().__init__(spark, data_path, mode) - self.num_partitions = num_partitions - - def _read_and_count(self) -> tuple[DataFrame, int]: - """Read input data and count rows.""" - df = self.spark.read.parquet(self.data_path) - row_count = df.count() - return df, row_count - - def _repartition(self, df: DataFrame) -> DataFrame: - """ - Repartition dataframe using the strategy defined by subclass. - - Args: - df: Input dataframe - - Returns: - Repartitioned dataframe - """ - raise NotImplementedError("Subclasses must implement _repartition") - - def _write_output(self, df: DataFrame, output_path: str): - """Write repartitioned data to parquet.""" - df.write.mode("overwrite").parquet(output_path) - - def run(self) -> Dict[str, Any]: - """ - Run the shuffle benchmark. - - Returns: - Dictionary with duration_ms and row_count - """ - # Read input data - df, row_count = self._read_and_count() - print(f"Number of rows: {row_count:,}") - - # Define the benchmark operation - def benchmark_operation(): - # Repartition using the specific strategy - repartitioned = self._repartition(df) - - # Write to parquet to force materialization - output_path = f"/tmp/shuffle-benchmark-output-{self.mode}-{self.name()}" - self._write_output(repartitioned, output_path) - print(f"Wrote repartitioned data to: {output_path}") - - # Time the operation - duration_ms = self._time_operation(benchmark_operation) - - return { - 'duration_ms': duration_ms, - 'row_count': row_count, - 'num_partitions': self.num_partitions, - } - - -class ShuffleHashBenchmark(ShuffleBenchmark): - """Shuffle benchmark using hash partitioning on a key column.""" - - @classmethod - def name(cls) -> str: - return "shuffle-hash" - - @classmethod - def description(cls) -> str: - return "Shuffle all columns using hash partitioning on group_key" - - def _repartition(self, df: DataFrame) -> DataFrame: - """Repartition using hash partitioning on group_key.""" - return df.repartition(self.num_partitions, "group_key") - - -class ShuffleRoundRobinBenchmark(ShuffleBenchmark): - """Shuffle benchmark using round-robin partitioning.""" - - @classmethod - def name(cls) -> str: - return "shuffle-roundrobin" - - @classmethod - def description(cls) -> str: - return "Shuffle all columns using round-robin partitioning" - - def _repartition(self, df: DataFrame) -> DataFrame: - """Repartition using round-robin (no partition columns specified).""" - return df.repartition(self.num_partitions) diff --git a/benchmarks/pyspark/run_all_benchmarks.sh b/benchmarks/pyspark/run_all_benchmarks.sh deleted file mode 100755 index 81eb044884..0000000000 --- a/benchmarks/pyspark/run_all_benchmarks.sh +++ /dev/null @@ -1,120 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -# Run all shuffle benchmarks (Spark, Comet JVM, Comet Native) -# Check the Spark UI during each run to compare shuffle sizes - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -DATA_PATH="${1:-/tmp/shuffle-benchmark-data}" -COMET_JAR="${COMET_JAR:-$SCRIPT_DIR/../../spark/target/comet-spark-spark3.5_2.12-0.14.0-SNAPSHOT.jar}" -SPARK_MASTER="${SPARK_MASTER:-local[*]}" -EXECUTOR_MEMORY="${EXECUTOR_MEMORY:-16g}" -EVENT_LOG_DIR="${EVENT_LOG_DIR:-/tmp/spark-events}" - -# Create event log directory -mkdir -p "$EVENT_LOG_DIR" - -echo "========================================" -echo "Shuffle Size Comparison Benchmark" -echo "========================================" -echo "Data path: $DATA_PATH" -echo "Comet JAR: $COMET_JAR" -echo "Spark master: $SPARK_MASTER" -echo "Executor memory: $EXECUTOR_MEMORY" -echo "Event log dir: $EVENT_LOG_DIR" -echo "========================================" - -# Run Spark baseline (no Comet) -echo "" -echo ">>> Running SPARK shuffle benchmark..." -$SPARK_HOME/bin/spark-submit \ - --master "$SPARK_MASTER" \ - --executor-memory "$EXECUTOR_MEMORY" \ - --conf spark.eventLog.enabled=true \ - --conf spark.eventLog.dir="$EVENT_LOG_DIR" \ - --conf spark.comet.enabled=false \ - --conf spark.comet.exec.shuffle.enabled=false \ - "$SCRIPT_DIR/run_benchmark.py" \ - --data "$DATA_PATH" \ - --mode spark - -# Run Comet JVM shuffle -echo "" -echo ">>> Running COMET JVM shuffle benchmark..." -$SPARK_HOME/bin/spark-submit \ - --master "$SPARK_MASTER" \ - --executor-memory "$EXECUTOR_MEMORY" \ - --jars "$COMET_JAR" \ - --driver-class-path "$COMET_JAR" \ - --conf spark.executor.extraClassPath="$COMET_JAR" \ - --conf spark.eventLog.enabled=true \ - --conf spark.eventLog.dir="$EVENT_LOG_DIR" \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=16g \ - --conf spark.comet.enabled=true \ - --conf spark.comet.operator.DataWritingCommandExec.allowIncompatible=true \ - --conf spark.comet.parquet.write.enabled=true \ - --conf spark.comet.logFallbackReasons.enabled=true \ - --conf spark.comet.explainFallback.enabled=true \ - --conf spark.comet.shuffle.mode=jvm \ - --conf spark.comet.exec.shuffle.mode=jvm \ - --conf spark.comet.exec.replaceSortMergeJoin=true \ - --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ - --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \ - --conf spark.comet.cast.allowIncompatible=true \ - "$SCRIPT_DIR/run_benchmark.py" \ - --data "$DATA_PATH" \ - --mode jvm - -# Run Comet Native shuffle -echo "" -echo ">>> Running COMET NATIVE shuffle benchmark..." 
-$SPARK_HOME/bin/spark-submit \ - --master "$SPARK_MASTER" \ - --executor-memory "$EXECUTOR_MEMORY" \ - --jars "$COMET_JAR" \ - --driver-class-path "$COMET_JAR" \ - --conf spark.executor.extraClassPath="$COMET_JAR" \ - --conf spark.eventLog.enabled=true \ - --conf spark.eventLog.dir="$EVENT_LOG_DIR" \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=16g \ - --conf spark.comet.enabled=true \ - --conf spark.comet.operator.DataWritingCommandExec.allowIncompatible=true \ - --conf spark.comet.parquet.write.enabled=true \ - --conf spark.comet.logFallbackReasons.enabled=true \ - --conf spark.comet.explainFallback.enabled=true \ - --conf spark.comet.exec.shuffle.mode=native \ - --conf spark.comet.exec.replaceSortMergeJoin=true \ - --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ - --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \ - --conf spark.comet.cast.allowIncompatible=true \ - "$SCRIPT_DIR/run_benchmark.py" \ - --data "$DATA_PATH" \ - --mode native - -echo "" -echo "========================================" -echo "BENCHMARK COMPLETE" -echo "========================================" -echo "Event logs written to: $EVENT_LOG_DIR" -echo "" diff --git a/benchmarks/pyspark/run_benchmark.py b/benchmarks/pyspark/run_benchmark.py deleted file mode 100755 index 6713f0ff21..0000000000 --- a/benchmarks/pyspark/run_benchmark.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python3 -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Run PySpark benchmarks. - -Run benchmarks by name with appropriate spark-submit configs for different modes -(spark, jvm, native). Check the Spark UI to compare results between modes. 
-""" - -import argparse -import sys - -from pyspark.sql import SparkSession - -from benchmarks import get_benchmark, list_benchmarks - - -def main(): - parser = argparse.ArgumentParser( - description="Run PySpark benchmarks", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - # Run hash partitioning shuffle benchmark in Spark mode - python run_benchmark.py --data /path/to/data --mode spark --benchmark shuffle-hash - - # Run round-robin shuffle benchmark in Comet native mode - python run_benchmark.py --data /path/to/data --mode native --benchmark shuffle-roundrobin - - # List all available benchmarks - python run_benchmark.py --list-benchmarks - """ - ) - parser.add_argument( - "--data", "-d", - help="Path to input parquet data" - ) - parser.add_argument( - "--mode", "-m", - choices=["spark", "jvm", "native"], - help="Shuffle mode being tested" - ) - parser.add_argument( - "--benchmark", "-b", - default="shuffle-hash", - help="Benchmark to run (default: shuffle-hash)" - ) - parser.add_argument( - "--list-benchmarks", - action="store_true", - help="List all available benchmarks and exit" - ) - - args = parser.parse_args() - - # Handle --list-benchmarks - if args.list_benchmarks: - print("Available benchmarks:\n") - for name, description in list_benchmarks(): - print(f" {name:25s} - {description}") - return 0 - - # Validate required arguments - if not args.data: - parser.error("--data is required when running a benchmark") - if not args.mode: - parser.error("--mode is required when running a benchmark") - - # Get the benchmark class - try: - benchmark_cls = get_benchmark(args.benchmark) - except KeyError as e: - print(f"Error: {e}", file=sys.stderr) - print("\nUse --list-benchmarks to see available benchmarks", file=sys.stderr) - return 1 - - # Create Spark session - spark = SparkSession.builder \ - .appName(f"{benchmark_cls.name()}-{args.mode.upper()}") \ - .getOrCreate() - - try: - # Create and run the benchmark - benchmark = benchmark_cls(spark, args.data, args.mode) - results = benchmark.execute_timed() - - print("\nCheck Spark UI for shuffle sizes and detailed metrics") - return 0 - - finally: - spark.stop() - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/benchmarks/queries/tpcds/q1.sql b/benchmarks/queries/tpcds/q1.sql new file mode 100644 index 0000000000..00328875ab --- /dev/null +++ b/benchmarks/queries/tpcds/q1.sql @@ -0,0 +1,26 @@ +-- SQLBench-DS query 1 derived from TPC-DS query 1 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+with customer_total_return as +(select sr_customer_sk as ctr_customer_sk +,sr_store_sk as ctr_store_sk +,sum(SR_RETURN_AMT_INC_TAX) as ctr_total_return +from store_returns +,date_dim +where sr_returned_date_sk = d_date_sk +and d_year =1999 +group by sr_customer_sk +,sr_store_sk) + select c_customer_id +from customer_total_return ctr1 +,store +,customer +where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2 +from customer_total_return ctr2 +where ctr1.ctr_store_sk = ctr2.ctr_store_sk) +and s_store_sk = ctr1.ctr_store_sk +and s_state = 'TN' +and ctr1.ctr_customer_sk = c_customer_sk +order by c_customer_id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q10.sql b/benchmarks/queries/tpcds/q10.sql new file mode 100644 index 0000000000..3a47920e04 --- /dev/null +++ b/benchmarks/queries/tpcds/q10.sql @@ -0,0 +1,60 @@ +-- SQLBench-DS query 10 derived from TPC-DS query 10 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select + cd_gender, + cd_marital_status, + cd_education_status, + count(*) cnt1, + cd_purchase_estimate, + count(*) cnt2, + cd_credit_rating, + count(*) cnt3, + cd_dep_count, + count(*) cnt4, + cd_dep_employed_count, + count(*) cnt5, + cd_dep_college_count, + count(*) cnt6 + from + customer c,customer_address ca,customer_demographics + where + c.c_current_addr_sk = ca.ca_address_sk and + ca_county in ('Clinton County','Platte County','Franklin County','Louisa County','Harmon County') and + cd_demo_sk = c.c_current_cdemo_sk and + exists (select * + from store_sales,date_dim + where c.c_customer_sk = ss_customer_sk and + ss_sold_date_sk = d_date_sk and + d_year = 2002 and + d_moy between 3 and 3+3) and + (exists (select * + from web_sales,date_dim + where c.c_customer_sk = ws_bill_customer_sk and + ws_sold_date_sk = d_date_sk and + d_year = 2002 and + d_moy between 3 ANd 3+3) or + exists (select * + from catalog_sales,date_dim + where c.c_customer_sk = cs_ship_customer_sk and + cs_sold_date_sk = d_date_sk and + d_year = 2002 and + d_moy between 3 and 3+3)) + group by cd_gender, + cd_marital_status, + cd_education_status, + cd_purchase_estimate, + cd_credit_rating, + cd_dep_count, + cd_dep_employed_count, + cd_dep_college_count + order by cd_gender, + cd_marital_status, + cd_education_status, + cd_purchase_estimate, + cd_credit_rating, + cd_dep_count, + cd_dep_employed_count, + cd_dep_college_count + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q11.sql b/benchmarks/queries/tpcds/q11.sql new file mode 100644 index 0000000000..7ffd3094f9 --- /dev/null +++ b/benchmarks/queries/tpcds/q11.sql @@ -0,0 +1,82 @@ +-- SQLBench-DS query 11 derived from TPC-DS query 11 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
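+-- Finds customers whose web sales grew faster than their store sales from 1999 to 2000.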
+with year_total as ( + select c_customer_id customer_id + ,c_first_name customer_first_name + ,c_last_name customer_last_name + ,c_preferred_cust_flag customer_preferred_cust_flag + ,c_birth_country customer_birth_country + ,c_login customer_login + ,c_email_address customer_email_address + ,d_year dyear + ,sum(ss_ext_list_price-ss_ext_discount_amt) year_total + ,'s' sale_type + from customer + ,store_sales + ,date_dim + where c_customer_sk = ss_customer_sk + and ss_sold_date_sk = d_date_sk + group by c_customer_id + ,c_first_name + ,c_last_name + ,c_preferred_cust_flag + ,c_birth_country + ,c_login + ,c_email_address + ,d_year + union all + select c_customer_id customer_id + ,c_first_name customer_first_name + ,c_last_name customer_last_name + ,c_preferred_cust_flag customer_preferred_cust_flag + ,c_birth_country customer_birth_country + ,c_login customer_login + ,c_email_address customer_email_address + ,d_year dyear + ,sum(ws_ext_list_price-ws_ext_discount_amt) year_total + ,'w' sale_type + from customer + ,web_sales + ,date_dim + where c_customer_sk = ws_bill_customer_sk + and ws_sold_date_sk = d_date_sk + group by c_customer_id + ,c_first_name + ,c_last_name + ,c_preferred_cust_flag + ,c_birth_country + ,c_login + ,c_email_address + ,d_year + ) + select + t_s_secyear.customer_id + ,t_s_secyear.customer_first_name + ,t_s_secyear.customer_last_name + ,t_s_secyear.customer_email_address + from year_total t_s_firstyear + ,year_total t_s_secyear + ,year_total t_w_firstyear + ,year_total t_w_secyear + where t_s_secyear.customer_id = t_s_firstyear.customer_id + and t_s_firstyear.customer_id = t_w_secyear.customer_id + and t_s_firstyear.customer_id = t_w_firstyear.customer_id + and t_s_firstyear.sale_type = 's' + and t_w_firstyear.sale_type = 'w' + and t_s_secyear.sale_type = 's' + and t_w_secyear.sale_type = 'w' + and t_s_firstyear.dyear = 1999 + and t_s_secyear.dyear = 1999+1 + and t_w_firstyear.dyear = 1999 + and t_w_secyear.dyear = 1999+1 + and t_s_firstyear.year_total > 0 + and t_w_firstyear.year_total > 0 + and case when t_w_firstyear.year_total > 0 then t_w_secyear.year_total / t_w_firstyear.year_total else 0.0 end + > case when t_s_firstyear.year_total > 0 then t_s_secyear.year_total / t_s_firstyear.year_total else 0.0 end + order by t_s_secyear.customer_id + ,t_s_secyear.customer_first_name + ,t_s_secyear.customer_last_name + ,t_s_secyear.customer_email_address + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q12.sql b/benchmarks/queries/tpcds/q12.sql new file mode 100644 index 0000000000..eb267ca64b --- /dev/null +++ b/benchmarks/queries/tpcds/q12.sql @@ -0,0 +1,35 @@ +-- SQLBench-DS query 12 derived from TPC-DS query 12 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
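+-- Reports web sales revenue per item for the Jewelry, Books, and Women categories over a 30-day window, including each item's share of its class revenue.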
+select i_item_id + ,i_item_desc + ,i_category + ,i_class + ,i_current_price + ,sum(ws_ext_sales_price) as itemrevenue + ,sum(ws_ext_sales_price)*100/sum(sum(ws_ext_sales_price)) over + (partition by i_class) as revenueratio +from + web_sales + ,item + ,date_dim +where + ws_item_sk = i_item_sk + and i_category in ('Jewelry', 'Books', 'Women') + and ws_sold_date_sk = d_date_sk + and d_date between cast('2002-03-22' as date) + and (cast('2002-03-22' as date) + INTERVAL '30 DAYS') +group by + i_item_id + ,i_item_desc + ,i_category + ,i_class + ,i_current_price +order by + i_category + ,i_class + ,i_item_id + ,i_item_desc + ,revenueratio + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q13.sql b/benchmarks/queries/tpcds/q13.sql new file mode 100644 index 0000000000..31b1171b9e --- /dev/null +++ b/benchmarks/queries/tpcds/q13.sql @@ -0,0 +1,53 @@ +-- SQLBench-DS query 13 derived from TPC-DS query 13 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select avg(ss_quantity) + ,avg(ss_ext_sales_price) + ,avg(ss_ext_wholesale_cost) + ,sum(ss_ext_wholesale_cost) + from store_sales + ,store + ,customer_demographics + ,household_demographics + ,customer_address + ,date_dim + where s_store_sk = ss_store_sk + and ss_sold_date_sk = d_date_sk and d_year = 2001 + and((ss_hdemo_sk=hd_demo_sk + and cd_demo_sk = ss_cdemo_sk + and cd_marital_status = 'U' + and cd_education_status = '4 yr Degree' + and ss_sales_price between 100.00 and 150.00 + and hd_dep_count = 3 + )or + (ss_hdemo_sk=hd_demo_sk + and cd_demo_sk = ss_cdemo_sk + and cd_marital_status = 'S' + and cd_education_status = 'Unknown' + and ss_sales_price between 50.00 and 100.00 + and hd_dep_count = 1 + ) or + (ss_hdemo_sk=hd_demo_sk + and cd_demo_sk = ss_cdemo_sk + and cd_marital_status = 'D' + and cd_education_status = '2 yr Degree' + and ss_sales_price between 150.00 and 200.00 + and hd_dep_count = 1 + )) + and((ss_addr_sk = ca_address_sk + and ca_country = 'United States' + and ca_state in ('CO', 'MI', 'MN') + and ss_net_profit between 100 and 200 + ) or + (ss_addr_sk = ca_address_sk + and ca_country = 'United States' + and ca_state in ('NC', 'NY', 'TX') + and ss_net_profit between 150 and 300 + ) or + (ss_addr_sk = ca_address_sk + and ca_country = 'United States' + and ca_state in ('CA', 'NE', 'TN') + and ss_net_profit between 50 and 250 + )) +; + diff --git a/benchmarks/queries/tpcds/q14.sql b/benchmarks/queries/tpcds/q14.sql new file mode 100644 index 0000000000..119791f59d --- /dev/null +++ b/benchmarks/queries/tpcds/q14.sql @@ -0,0 +1,211 @@ +-- SQLBench-DS query 14 derived from TPC-DS query 14 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
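+-- Two statements: (1) November 2001 sales by channel for items sold through all three channels, keeping groups above the overall average; (2) store sales for those items in the same December week of 2000 vs 1999.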
+with cross_items as + (select i_item_sk ss_item_sk + from item, + (select iss.i_brand_id brand_id + ,iss.i_class_id class_id + ,iss.i_category_id category_id + from store_sales + ,item iss + ,date_dim d1 + where ss_item_sk = iss.i_item_sk + and ss_sold_date_sk = d1.d_date_sk + and d1.d_year between 1999 AND 1999 + 2 + intersect + select ics.i_brand_id + ,ics.i_class_id + ,ics.i_category_id + from catalog_sales + ,item ics + ,date_dim d2 + where cs_item_sk = ics.i_item_sk + and cs_sold_date_sk = d2.d_date_sk + and d2.d_year between 1999 AND 1999 + 2 + intersect + select iws.i_brand_id + ,iws.i_class_id + ,iws.i_category_id + from web_sales + ,item iws + ,date_dim d3 + where ws_item_sk = iws.i_item_sk + and ws_sold_date_sk = d3.d_date_sk + and d3.d_year between 1999 AND 1999 + 2) + where i_brand_id = brand_id + and i_class_id = class_id + and i_category_id = category_id +), + avg_sales as + (select avg(quantity*list_price) average_sales + from (select ss_quantity quantity + ,ss_list_price list_price + from store_sales + ,date_dim + where ss_sold_date_sk = d_date_sk + and d_year between 1999 and 1999 + 2 + union all + select cs_quantity quantity + ,cs_list_price list_price + from catalog_sales + ,date_dim + where cs_sold_date_sk = d_date_sk + and d_year between 1999 and 1999 + 2 + union all + select ws_quantity quantity + ,ws_list_price list_price + from web_sales + ,date_dim + where ws_sold_date_sk = d_date_sk + and d_year between 1999 and 1999 + 2) x) + select channel, i_brand_id,i_class_id,i_category_id,sum(sales), sum(number_sales) + from( + select 'store' channel, i_brand_id,i_class_id + ,i_category_id,sum(ss_quantity*ss_list_price) sales + , count(*) number_sales + from store_sales + ,item + ,date_dim + where ss_item_sk in (select ss_item_sk from cross_items) + and ss_item_sk = i_item_sk + and ss_sold_date_sk = d_date_sk + and d_year = 1999+2 + and d_moy = 11 + group by i_brand_id,i_class_id,i_category_id + having sum(ss_quantity*ss_list_price) > (select average_sales from avg_sales) + union all + select 'catalog' channel, i_brand_id,i_class_id,i_category_id, sum(cs_quantity*cs_list_price) sales, count(*) number_sales + from catalog_sales + ,item + ,date_dim + where cs_item_sk in (select ss_item_sk from cross_items) + and cs_item_sk = i_item_sk + and cs_sold_date_sk = d_date_sk + and d_year = 1999+2 + and d_moy = 11 + group by i_brand_id,i_class_id,i_category_id + having sum(cs_quantity*cs_list_price) > (select average_sales from avg_sales) + union all + select 'web' channel, i_brand_id,i_class_id,i_category_id, sum(ws_quantity*ws_list_price) sales , count(*) number_sales + from web_sales + ,item + ,date_dim + where ws_item_sk in (select ss_item_sk from cross_items) + and ws_item_sk = i_item_sk + and ws_sold_date_sk = d_date_sk + and d_year = 1999+2 + and d_moy = 11 + group by i_brand_id,i_class_id,i_category_id + having sum(ws_quantity*ws_list_price) > (select average_sales from avg_sales) + ) y + group by rollup (channel, i_brand_id,i_class_id,i_category_id) + order by channel,i_brand_id,i_class_id,i_category_id + LIMIT 100; +with cross_items as + (select i_item_sk ss_item_sk + from item, + (select iss.i_brand_id brand_id + ,iss.i_class_id class_id + ,iss.i_category_id category_id + from store_sales + ,item iss + ,date_dim d1 + where ss_item_sk = iss.i_item_sk + and ss_sold_date_sk = d1.d_date_sk + and d1.d_year between 1999 AND 1999 + 2 + intersect + select ics.i_brand_id + ,ics.i_class_id + ,ics.i_category_id + from catalog_sales + ,item ics + ,date_dim d2 + where cs_item_sk = 
ics.i_item_sk + and cs_sold_date_sk = d2.d_date_sk + and d2.d_year between 1999 AND 1999 + 2 + intersect + select iws.i_brand_id + ,iws.i_class_id + ,iws.i_category_id + from web_sales + ,item iws + ,date_dim d3 + where ws_item_sk = iws.i_item_sk + and ws_sold_date_sk = d3.d_date_sk + and d3.d_year between 1999 AND 1999 + 2) x + where i_brand_id = brand_id + and i_class_id = class_id + and i_category_id = category_id +), + avg_sales as +(select avg(quantity*list_price) average_sales + from (select ss_quantity quantity + ,ss_list_price list_price + from store_sales + ,date_dim + where ss_sold_date_sk = d_date_sk + and d_year between 1999 and 1999 + 2 + union all + select cs_quantity quantity + ,cs_list_price list_price + from catalog_sales + ,date_dim + where cs_sold_date_sk = d_date_sk + and d_year between 1999 and 1999 + 2 + union all + select ws_quantity quantity + ,ws_list_price list_price + from web_sales + ,date_dim + where ws_sold_date_sk = d_date_sk + and d_year between 1999 and 1999 + 2) x) + select this_year.channel ty_channel + ,this_year.i_brand_id ty_brand + ,this_year.i_class_id ty_class + ,this_year.i_category_id ty_category + ,this_year.sales ty_sales + ,this_year.number_sales ty_number_sales + ,last_year.channel ly_channel + ,last_year.i_brand_id ly_brand + ,last_year.i_class_id ly_class + ,last_year.i_category_id ly_category + ,last_year.sales ly_sales + ,last_year.number_sales ly_number_sales + from + (select 'store' channel, i_brand_id,i_class_id,i_category_id + ,sum(ss_quantity*ss_list_price) sales, count(*) number_sales + from store_sales + ,item + ,date_dim + where ss_item_sk in (select ss_item_sk from cross_items) + and ss_item_sk = i_item_sk + and ss_sold_date_sk = d_date_sk + and d_week_seq = (select d_week_seq + from date_dim + where d_year = 1999 + 1 + and d_moy = 12 + and d_dom = 14) + group by i_brand_id,i_class_id,i_category_id + having sum(ss_quantity*ss_list_price) > (select average_sales from avg_sales)) this_year, + (select 'store' channel, i_brand_id,i_class_id + ,i_category_id, sum(ss_quantity*ss_list_price) sales, count(*) number_sales + from store_sales + ,item + ,date_dim + where ss_item_sk in (select ss_item_sk from cross_items) + and ss_item_sk = i_item_sk + and ss_sold_date_sk = d_date_sk + and d_week_seq = (select d_week_seq + from date_dim + where d_year = 1999 + and d_moy = 12 + and d_dom = 14) + group by i_brand_id,i_class_id,i_category_id + having sum(ss_quantity*ss_list_price) > (select average_sales from avg_sales)) last_year + where this_year.i_brand_id= last_year.i_brand_id + and this_year.i_class_id = last_year.i_class_id + and this_year.i_category_id = last_year.i_category_id + order by this_year.channel, this_year.i_brand_id, this_year.i_class_id, this_year.i_category_id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q15.sql b/benchmarks/queries/tpcds/q15.sql new file mode 100644 index 0000000000..bb1812a07c --- /dev/null +++ b/benchmarks/queries/tpcds/q15.sql @@ -0,0 +1,21 @@ +-- SQLBench-DS query 15 derived from TPC-DS query 15 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
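+-- Reports Q2 2002 catalog sales totals by customer zip code for selected zips, selected states, or sales over 500.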
+select ca_zip + ,sum(cs_sales_price) + from catalog_sales + ,customer + ,customer_address + ,date_dim + where cs_bill_customer_sk = c_customer_sk + and c_current_addr_sk = ca_address_sk + and ( substr(ca_zip,1,5) in ('85669', '86197','88274','83405','86475', + '85392', '85460', '80348', '81792') + or ca_state in ('CA','WA','GA') + or cs_sales_price > 500) + and cs_sold_date_sk = d_date_sk + and d_qoy = 2 and d_year = 2002 + group by ca_zip + order by ca_zip + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q16.sql b/benchmarks/queries/tpcds/q16.sql new file mode 100644 index 0000000000..2e0f9a9922 --- /dev/null +++ b/benchmarks/queries/tpcds/q16.sql @@ -0,0 +1,32 @@ +-- SQLBench-DS query 16 derived from TPC-DS query 16 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select + count(distinct cs_order_number) as `order count` + ,sum(cs_ext_ship_cost) as `total shipping cost` + ,sum(cs_net_profit) as `total net profit` +from + catalog_sales cs1 + ,date_dim + ,customer_address + ,call_center +where + d_date between '1999-5-01' and + (cast('1999-5-01' as date) + INTERVAL '60 DAYS') +and cs1.cs_ship_date_sk = d_date_sk +and cs1.cs_ship_addr_sk = ca_address_sk +and ca_state = 'ID' +and cs1.cs_call_center_sk = cc_call_center_sk +and cc_county in ('Williamson County','Williamson County','Williamson County','Williamson County', + 'Williamson County' +) +and exists (select * + from catalog_sales cs2 + where cs1.cs_order_number = cs2.cs_order_number + and cs1.cs_warehouse_sk <> cs2.cs_warehouse_sk) +and not exists(select * + from catalog_returns cr1 + where cs1.cs_order_number = cr1.cr_order_number) +order by count(distinct cs_order_number) + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q17.sql b/benchmarks/queries/tpcds/q17.sql new file mode 100644 index 0000000000..9f9e97d76e --- /dev/null +++ b/benchmarks/queries/tpcds/q17.sql @@ -0,0 +1,46 @@ +-- SQLBench-DS query 17 derived from TPC-DS query 17 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
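+-- Computes count/mean/stddev of quantities for items bought in store in 1999Q1, returned, and re-purchased through the catalog within 1999Q1-Q3, by item and store state.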
+select i_item_id + ,i_item_desc + ,s_state + ,count(ss_quantity) as store_sales_quantitycount + ,avg(ss_quantity) as store_sales_quantityave + ,stddev_samp(ss_quantity) as store_sales_quantitystdev + ,stddev_samp(ss_quantity)/avg(ss_quantity) as store_sales_quantitycov + ,count(sr_return_quantity) as store_returns_quantitycount + ,avg(sr_return_quantity) as store_returns_quantityave + ,stddev_samp(sr_return_quantity) as store_returns_quantitystdev + ,stddev_samp(sr_return_quantity)/avg(sr_return_quantity) as store_returns_quantitycov + ,count(cs_quantity) as catalog_sales_quantitycount ,avg(cs_quantity) as catalog_sales_quantityave + ,stddev_samp(cs_quantity) as catalog_sales_quantitystdev + ,stddev_samp(cs_quantity)/avg(cs_quantity) as catalog_sales_quantitycov + from store_sales + ,store_returns + ,catalog_sales + ,date_dim d1 + ,date_dim d2 + ,date_dim d3 + ,store + ,item + where d1.d_quarter_name = '1999Q1' + and d1.d_date_sk = ss_sold_date_sk + and i_item_sk = ss_item_sk + and s_store_sk = ss_store_sk + and ss_customer_sk = sr_customer_sk + and ss_item_sk = sr_item_sk + and ss_ticket_number = sr_ticket_number + and sr_returned_date_sk = d2.d_date_sk + and d2.d_quarter_name in ('1999Q1','1999Q2','1999Q3') + and sr_customer_sk = cs_bill_customer_sk + and sr_item_sk = cs_item_sk + and cs_sold_date_sk = d3.d_date_sk + and d3.d_quarter_name in ('1999Q1','1999Q2','1999Q3') + group by i_item_id + ,i_item_desc + ,s_state + order by i_item_id + ,i_item_desc + ,s_state + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q18.sql b/benchmarks/queries/tpcds/q18.sql new file mode 100644 index 0000000000..50cc6c63f2 --- /dev/null +++ b/benchmarks/queries/tpcds/q18.sql @@ -0,0 +1,35 @@ +-- SQLBench-DS query 18 derived from TPC-DS query 18 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select i_item_id, + ca_country, + ca_state, + ca_county, + avg( cast(cs_quantity as decimal(12,2))) agg1, + avg( cast(cs_list_price as decimal(12,2))) agg2, + avg( cast(cs_coupon_amt as decimal(12,2))) agg3, + avg( cast(cs_sales_price as decimal(12,2))) agg4, + avg( cast(cs_net_profit as decimal(12,2))) agg5, + avg( cast(c_birth_year as decimal(12,2))) agg6, + avg( cast(cd1.cd_dep_count as decimal(12,2))) agg7 + from catalog_sales, customer_demographics cd1, + customer_demographics cd2, customer, customer_address, date_dim, item + where cs_sold_date_sk = d_date_sk and + cs_item_sk = i_item_sk and + cs_bill_cdemo_sk = cd1.cd_demo_sk and + cs_bill_customer_sk = c_customer_sk and + cd1.cd_gender = 'M' and + cd1.cd_education_status = 'Primary' and + c_current_cdemo_sk = cd2.cd_demo_sk and + c_current_addr_sk = ca_address_sk and + c_birth_month in (1,2,9,5,11,3) and + d_year = 1998 and + ca_state in ('MS','NE','IA' + ,'MI','GA','NY','CO') + group by rollup (i_item_id, ca_country, ca_state, ca_county) + order by ca_country, + ca_state, + ca_county, + i_item_id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q19.sql b/benchmarks/queries/tpcds/q19.sql new file mode 100644 index 0000000000..bf54b3b802 --- /dev/null +++ b/benchmarks/queries/tpcds/q19.sql @@ -0,0 +1,26 @@ +-- SQLBench-DS query 19 derived from TPC-DS query 19 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
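+-- Reports November 1999 store revenue by brand and manufacturer (manager 8), where the customer's zip code differs from the store's.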
+select i_brand_id brand_id, i_brand brand, i_manufact_id, i_manufact, + sum(ss_ext_sales_price) ext_price + from date_dim, store_sales, item,customer,customer_address,store + where d_date_sk = ss_sold_date_sk + and ss_item_sk = i_item_sk + and i_manager_id=8 + and d_moy=11 + and d_year=1999 + and ss_customer_sk = c_customer_sk + and c_current_addr_sk = ca_address_sk + and substr(ca_zip,1,5) <> substr(s_zip,1,5) + and ss_store_sk = s_store_sk + group by i_brand + ,i_brand_id + ,i_manufact_id + ,i_manufact + order by ext_price desc + ,i_brand + ,i_brand_id + ,i_manufact_id + ,i_manufact + LIMIT 100 ; + diff --git a/benchmarks/queries/tpcds/q2.sql b/benchmarks/queries/tpcds/q2.sql new file mode 100644 index 0000000000..838717836b --- /dev/null +++ b/benchmarks/queries/tpcds/q2.sql @@ -0,0 +1,61 @@ +-- SQLBench-DS query 2 derived from TPC-DS query 2 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with wscs as + (select sold_date_sk + ,sales_price + from (select ws_sold_date_sk sold_date_sk + ,ws_ext_sales_price sales_price + from web_sales + union all + select cs_sold_date_sk sold_date_sk + ,cs_ext_sales_price sales_price + from catalog_sales)), + wswscs as + (select d_week_seq, + sum(case when (d_day_name='Sunday') then sales_price else null end) sun_sales, + sum(case when (d_day_name='Monday') then sales_price else null end) mon_sales, + sum(case when (d_day_name='Tuesday') then sales_price else null end) tue_sales, + sum(case when (d_day_name='Wednesday') then sales_price else null end) wed_sales, + sum(case when (d_day_name='Thursday') then sales_price else null end) thu_sales, + sum(case when (d_day_name='Friday') then sales_price else null end) fri_sales, + sum(case when (d_day_name='Saturday') then sales_price else null end) sat_sales + from wscs + ,date_dim + where d_date_sk = sold_date_sk + group by d_week_seq) + select d_week_seq1 + ,round(sun_sales1/sun_sales2,2) + ,round(mon_sales1/mon_sales2,2) + ,round(tue_sales1/tue_sales2,2) + ,round(wed_sales1/wed_sales2,2) + ,round(thu_sales1/thu_sales2,2) + ,round(fri_sales1/fri_sales2,2) + ,round(sat_sales1/sat_sales2,2) + from + (select wswscs.d_week_seq d_week_seq1 + ,sun_sales sun_sales1 + ,mon_sales mon_sales1 + ,tue_sales tue_sales1 + ,wed_sales wed_sales1 + ,thu_sales thu_sales1 + ,fri_sales fri_sales1 + ,sat_sales sat_sales1 + from wswscs,date_dim + where date_dim.d_week_seq = wswscs.d_week_seq and + d_year = 2000) y, + (select wswscs.d_week_seq d_week_seq2 + ,sun_sales sun_sales2 + ,mon_sales mon_sales2 + ,tue_sales tue_sales2 + ,wed_sales wed_sales2 + ,thu_sales thu_sales2 + ,fri_sales fri_sales2 + ,sat_sales sat_sales2 + from wswscs + ,date_dim + where date_dim.d_week_seq = wswscs.d_week_seq and + d_year = 2000+1) z + where d_week_seq1=d_week_seq2-53 + order by d_week_seq1; + diff --git a/benchmarks/queries/tpcds/q20.sql b/benchmarks/queries/tpcds/q20.sql new file mode 100644 index 0000000000..ea4747317d --- /dev/null +++ b/benchmarks/queries/tpcds/q20.sql @@ -0,0 +1,31 @@ +-- SQLBench-DS query 20 derived from TPC-DS query 20 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
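+-- Reports catalog sales revenue per item for the Children, Sports, and Music categories over a 30-day window, including each item's share of its class revenue.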
+select i_item_id + ,i_item_desc + ,i_category + ,i_class + ,i_current_price + ,sum(cs_ext_sales_price) as itemrevenue + ,sum(cs_ext_sales_price)*100/sum(sum(cs_ext_sales_price)) over + (partition by i_class) as revenueratio + from catalog_sales + ,item + ,date_dim + where cs_item_sk = i_item_sk + and i_category in ('Children', 'Sports', 'Music') + and cs_sold_date_sk = d_date_sk + and d_date between cast('2002-04-01' as date) + and (cast('2002-04-01' as date) + INTERVAL '30 DAYS') + group by i_item_id + ,i_item_desc + ,i_category + ,i_class + ,i_current_price + order by i_category + ,i_class + ,i_item_id + ,i_item_desc + ,revenueratio + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q21.sql b/benchmarks/queries/tpcds/q21.sql new file mode 100644 index 0000000000..d768fa1428 --- /dev/null +++ b/benchmarks/queries/tpcds/q21.sql @@ -0,0 +1,31 @@ +-- SQLBench-DS query 21 derived from TPC-DS query 21 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select * + from(select w_warehouse_name + ,i_item_id + ,sum(case when (cast(d_date as date) < cast ('2000-05-19' as date)) + then inv_quantity_on_hand + else 0 end) as inv_before + ,sum(case when (cast(d_date as date) >= cast ('2000-05-19' as date)) + then inv_quantity_on_hand + else 0 end) as inv_after + from inventory + ,warehouse + ,item + ,date_dim + where i_current_price between 0.99 and 1.49 + and i_item_sk = inv_item_sk + and inv_warehouse_sk = w_warehouse_sk + and inv_date_sk = d_date_sk + and d_date between (cast ('2000-05-19' as date) - INTERVAL '30 DAYS') + and (cast ('2000-05-19' as date) + INTERVAL '30 DAYS') + group by w_warehouse_name, i_item_id) x + where (case when inv_before > 0 + then inv_after / inv_before + else null + end) between 2.0/3.0 and 3.0/2.0 + order by w_warehouse_name + ,i_item_id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q22.sql b/benchmarks/queries/tpcds/q22.sql new file mode 100644 index 0000000000..c7e1c78181 --- /dev/null +++ b/benchmarks/queries/tpcds/q22.sql @@ -0,0 +1,21 @@ +-- SQLBench-DS query 22 derived from TPC-DS query 22 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select i_product_name + ,i_brand + ,i_class + ,i_category + ,avg(inv_quantity_on_hand) qoh + from inventory + ,date_dim + ,item + where inv_date_sk=d_date_sk + and inv_item_sk=i_item_sk + and d_month_seq between 1201 and 1201 + 11 + group by rollup(i_product_name + ,i_brand + ,i_class + ,i_category) +order by qoh, i_product_name, i_brand, i_class, i_category + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q23.sql b/benchmarks/queries/tpcds/q23.sql new file mode 100644 index 0000000000..0dc7f73859 --- /dev/null +++ b/benchmarks/queries/tpcds/q23.sql @@ -0,0 +1,108 @@ +-- SQLBench-DS query 23 derived from TPC-DS query 23 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
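+-- Two statements: March 2000 catalog and web sales of frequently-sold store items to the top store customers, (1) as a grand total and (2) broken out by customer name.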
+with frequent_ss_items as + (select substr(i_item_desc,1,30) itemdesc,i_item_sk item_sk,d_date solddate,count(*) cnt + from store_sales + ,date_dim + ,item + where ss_sold_date_sk = d_date_sk + and ss_item_sk = i_item_sk + and d_year in (2000,2000+1,2000+2,2000+3) + group by substr(i_item_desc,1,30),i_item_sk,d_date + having count(*) >4), + max_store_sales as + (select max(csales) tpcds_cmax + from (select c_customer_sk,sum(ss_quantity*ss_sales_price) csales + from store_sales + ,customer + ,date_dim + where ss_customer_sk = c_customer_sk + and ss_sold_date_sk = d_date_sk + and d_year in (2000,2000+1,2000+2,2000+3) + group by c_customer_sk)), + best_ss_customer as + (select c_customer_sk,sum(ss_quantity*ss_sales_price) ssales + from store_sales + ,customer + where ss_customer_sk = c_customer_sk + group by c_customer_sk + having sum(ss_quantity*ss_sales_price) > (95/100.0) * (select + * +from + max_store_sales)) + select sum(sales) + from (select cs_quantity*cs_list_price sales + from catalog_sales + ,date_dim + where d_year = 2000 + and d_moy = 3 + and cs_sold_date_sk = d_date_sk + and cs_item_sk in (select item_sk from frequent_ss_items) + and cs_bill_customer_sk in (select c_customer_sk from best_ss_customer) + union all + select ws_quantity*ws_list_price sales + from web_sales + ,date_dim + where d_year = 2000 + and d_moy = 3 + and ws_sold_date_sk = d_date_sk + and ws_item_sk in (select item_sk from frequent_ss_items) + and ws_bill_customer_sk in (select c_customer_sk from best_ss_customer)) + LIMIT 100; +with frequent_ss_items as + (select substr(i_item_desc,1,30) itemdesc,i_item_sk item_sk,d_date solddate,count(*) cnt + from store_sales + ,date_dim + ,item + where ss_sold_date_sk = d_date_sk + and ss_item_sk = i_item_sk + and d_year in (2000,2000 + 1,2000 + 2,2000 + 3) + group by substr(i_item_desc,1,30),i_item_sk,d_date + having count(*) >4), + max_store_sales as + (select max(csales) tpcds_cmax + from (select c_customer_sk,sum(ss_quantity*ss_sales_price) csales + from store_sales + ,customer + ,date_dim + where ss_customer_sk = c_customer_sk + and ss_sold_date_sk = d_date_sk + and d_year in (2000,2000+1,2000+2,2000+3) + group by c_customer_sk)), + best_ss_customer as + (select c_customer_sk,sum(ss_quantity*ss_sales_price) ssales + from store_sales + ,customer + where ss_customer_sk = c_customer_sk + group by c_customer_sk + having sum(ss_quantity*ss_sales_price) > (95/100.0) * (select + * + from max_store_sales)) + select c_last_name,c_first_name,sales + from (select c_last_name,c_first_name,sum(cs_quantity*cs_list_price) sales + from catalog_sales + ,customer + ,date_dim + where d_year = 2000 + and d_moy = 3 + and cs_sold_date_sk = d_date_sk + and cs_item_sk in (select item_sk from frequent_ss_items) + and cs_bill_customer_sk in (select c_customer_sk from best_ss_customer) + and cs_bill_customer_sk = c_customer_sk + group by c_last_name,c_first_name + union all + select c_last_name,c_first_name,sum(ws_quantity*ws_list_price) sales + from web_sales + ,customer + ,date_dim + where d_year = 2000 + and d_moy = 3 + and ws_sold_date_sk = d_date_sk + and ws_item_sk in (select item_sk from frequent_ss_items) + and ws_bill_customer_sk in (select c_customer_sk from best_ss_customer) + and ws_bill_customer_sk = c_customer_sk + group by c_last_name,c_first_name) + order by c_last_name,c_first_name,sales + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q24.sql b/benchmarks/queries/tpcds/q24.sql new file mode 100644 index 0000000000..5d6d2f5053 --- /dev/null +++ 
b/benchmarks/queries/tpcds/q24.sql @@ -0,0 +1,108 @@ +-- SQLBench-DS query 24 derived from TPC-DS query 24 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with ssales as +(select c_last_name + ,c_first_name + ,s_store_name + ,ca_state + ,s_state + ,i_color + ,i_current_price + ,i_manager_id + ,i_units + ,i_size + ,sum(ss_net_profit) netpaid +from store_sales + ,store_returns + ,store + ,item + ,customer + ,customer_address +where ss_ticket_number = sr_ticket_number + and ss_item_sk = sr_item_sk + and ss_customer_sk = c_customer_sk + and ss_item_sk = i_item_sk + and ss_store_sk = s_store_sk + and c_current_addr_sk = ca_address_sk + and c_birth_country <> upper(ca_country) + and s_zip = ca_zip +and s_market_id=10 +group by c_last_name + ,c_first_name + ,s_store_name + ,ca_state + ,s_state + ,i_color + ,i_current_price + ,i_manager_id + ,i_units + ,i_size) +select c_last_name + ,c_first_name + ,s_store_name + ,sum(netpaid) paid +from ssales +where i_color = 'orchid' +group by c_last_name + ,c_first_name + ,s_store_name +having sum(netpaid) > (select 0.05*avg(netpaid) + from ssales) +order by c_last_name + ,c_first_name + ,s_store_name +; +with ssales as +(select c_last_name + ,c_first_name + ,s_store_name + ,ca_state + ,s_state + ,i_color + ,i_current_price + ,i_manager_id + ,i_units + ,i_size + ,sum(ss_net_profit) netpaid +from store_sales + ,store_returns + ,store + ,item + ,customer + ,customer_address +where ss_ticket_number = sr_ticket_number + and ss_item_sk = sr_item_sk + and ss_customer_sk = c_customer_sk + and ss_item_sk = i_item_sk + and ss_store_sk = s_store_sk + and c_current_addr_sk = ca_address_sk + and c_birth_country <> upper(ca_country) + and s_zip = ca_zip + and s_market_id = 10 +group by c_last_name + ,c_first_name + ,s_store_name + ,ca_state + ,s_state + ,i_color + ,i_current_price + ,i_manager_id + ,i_units + ,i_size) +select c_last_name + ,c_first_name + ,s_store_name + ,sum(netpaid) paid +from ssales +where i_color = 'green' +group by c_last_name + ,c_first_name + ,s_store_name +having sum(netpaid) > (select 0.05*avg(netpaid) + from ssales) +order by c_last_name + ,c_first_name + ,s_store_name +; + diff --git a/benchmarks/queries/tpcds/q25.sql b/benchmarks/queries/tpcds/q25.sql new file mode 100644 index 0000000000..b0af0e61dd --- /dev/null +++ b/benchmarks/queries/tpcds/q25.sql @@ -0,0 +1,49 @@ +-- SQLBench-DS query 25 derived from TPC-DS query 25 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
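+-- Reports minimum store profit, return loss, and catalog profit for items bought in April 2002, returned, and re-purchased through the catalog between April and October 2002, by item and store.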
+select + i_item_id + ,i_item_desc + ,s_store_id + ,s_store_name + ,min(ss_net_profit) as store_sales_profit + ,min(sr_net_loss) as store_returns_loss + ,min(cs_net_profit) as catalog_sales_profit + from + store_sales + ,store_returns + ,catalog_sales + ,date_dim d1 + ,date_dim d2 + ,date_dim d3 + ,store + ,item + where + d1.d_moy = 4 + and d1.d_year = 2002 + and d1.d_date_sk = ss_sold_date_sk + and i_item_sk = ss_item_sk + and s_store_sk = ss_store_sk + and ss_customer_sk = sr_customer_sk + and ss_item_sk = sr_item_sk + and ss_ticket_number = sr_ticket_number + and sr_returned_date_sk = d2.d_date_sk + and d2.d_moy between 4 and 10 + and d2.d_year = 2002 + and sr_customer_sk = cs_bill_customer_sk + and sr_item_sk = cs_item_sk + and cs_sold_date_sk = d3.d_date_sk + and d3.d_moy between 4 and 10 + and d3.d_year = 2002 + group by + i_item_id + ,i_item_desc + ,s_store_id + ,s_store_name + order by + i_item_id + ,i_item_desc + ,s_store_id + ,s_store_name + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q26.sql b/benchmarks/queries/tpcds/q26.sql new file mode 100644 index 0000000000..55ccc8b511 --- /dev/null +++ b/benchmarks/queries/tpcds/q26.sql @@ -0,0 +1,22 @@ +-- SQLBench-DS query 26 derived from TPC-DS query 26 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select i_item_id, + avg(cs_quantity) agg1, + avg(cs_list_price) agg2, + avg(cs_coupon_amt) agg3, + avg(cs_sales_price) agg4 + from catalog_sales, customer_demographics, date_dim, item, promotion + where cs_sold_date_sk = d_date_sk and + cs_item_sk = i_item_sk and + cs_bill_cdemo_sk = cd_demo_sk and + cs_promo_sk = p_promo_sk and + cd_gender = 'F' and + cd_marital_status = 'M' and + cd_education_status = '4 yr Degree' and + (p_channel_email = 'N' or p_channel_event = 'N') and + d_year = 2000 + group by i_item_id + order by i_item_id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q27.sql b/benchmarks/queries/tpcds/q27.sql new file mode 100644 index 0000000000..6d28e4e663 --- /dev/null +++ b/benchmarks/queries/tpcds/q27.sql @@ -0,0 +1,24 @@ +-- SQLBench-DS query 27 derived from TPC-DS query 27 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select i_item_id, + s_state, grouping(s_state) g_state, + avg(ss_quantity) agg1, + avg(ss_list_price) agg2, + avg(ss_coupon_amt) agg3, + avg(ss_sales_price) agg4 + from store_sales, customer_demographics, date_dim, store, item + where ss_sold_date_sk = d_date_sk and + ss_item_sk = i_item_sk and + ss_store_sk = s_store_sk and + ss_cdemo_sk = cd_demo_sk and + cd_gender = 'M' and + cd_marital_status = 'U' and + cd_education_status = 'Secondary' and + d_year = 2000 and + s_state in ('TN','TN', 'TN', 'TN', 'TN', 'TN') + group by rollup (i_item_id, s_state) + order by i_item_id + ,s_state + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q28.sql b/benchmarks/queries/tpcds/q28.sql new file mode 100644 index 0000000000..6efa7d7d77 --- /dev/null +++ b/benchmarks/queries/tpcds/q28.sql @@ -0,0 +1,54 @@ +-- SQLBench-DS query 28 derived from TPC-DS query 28 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
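+-- Average, count, and distinct count of ss_list_price over six ss_quantity buckets (0-5 through 26-30), each bucket with its own list-price/coupon-amount/wholesale-cost range filter.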
+select * +from (select avg(ss_list_price) B1_LP + ,count(ss_list_price) B1_CNT + ,count(distinct ss_list_price) B1_CNTD + from store_sales + where ss_quantity between 0 and 5 + and (ss_list_price between 28 and 28+10 + or ss_coupon_amt between 12573 and 12573+1000 + or ss_wholesale_cost between 33 and 33+20)) B1, + (select avg(ss_list_price) B2_LP + ,count(ss_list_price) B2_CNT + ,count(distinct ss_list_price) B2_CNTD + from store_sales + where ss_quantity between 6 and 10 + and (ss_list_price between 143 and 143+10 + or ss_coupon_amt between 5562 and 5562+1000 + or ss_wholesale_cost between 45 and 45+20)) B2, + (select avg(ss_list_price) B3_LP + ,count(ss_list_price) B3_CNT + ,count(distinct ss_list_price) B3_CNTD + from store_sales + where ss_quantity between 11 and 15 + and (ss_list_price between 159 and 159+10 + or ss_coupon_amt between 2807 and 2807+1000 + or ss_wholesale_cost between 24 and 24+20)) B3, + (select avg(ss_list_price) B4_LP + ,count(ss_list_price) B4_CNT + ,count(distinct ss_list_price) B4_CNTD + from store_sales + where ss_quantity between 16 and 20 + and (ss_list_price between 24 and 24+10 + or ss_coupon_amt between 3706 and 3706+1000 + or ss_wholesale_cost between 46 and 46+20)) B4, + (select avg(ss_list_price) B5_LP + ,count(ss_list_price) B5_CNT + ,count(distinct ss_list_price) B5_CNTD + from store_sales + where ss_quantity between 21 and 25 + and (ss_list_price between 76 and 76+10 + or ss_coupon_amt between 2096 and 2096+1000 + or ss_wholesale_cost between 50 and 50+20)) B5, + (select avg(ss_list_price) B6_LP + ,count(ss_list_price) B6_CNT + ,count(distinct ss_list_price) B6_CNTD + from store_sales + where ss_quantity between 26 and 30 + and (ss_list_price between 169 and 169+10 + or ss_coupon_amt between 10672 and 10672+1000 + or ss_wholesale_cost between 58 and 58+20)) B6 + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q29.sql b/benchmarks/queries/tpcds/q29.sql new file mode 100644 index 0000000000..8d463f3771 --- /dev/null +++ b/benchmarks/queries/tpcds/q29.sql @@ -0,0 +1,48 @@ +-- SQLBench-DS query 29 derived from TPC-DS query 29 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
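+-- Sample standard deviation of store-sales, store-returns, and catalog-sales quantities per item and store: sold in April 1999, returned April-July 1999, and re-purchased via catalog during 1999-2001.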
+select + i_item_id + ,i_item_desc + ,s_store_id + ,s_store_name + ,stddev_samp(ss_quantity) as store_sales_quantity + ,stddev_samp(sr_return_quantity) as store_returns_quantity + ,stddev_samp(cs_quantity) as catalog_sales_quantity + from + store_sales + ,store_returns + ,catalog_sales + ,date_dim d1 + ,date_dim d2 + ,date_dim d3 + ,store + ,item + where + d1.d_moy = 4 + and d1.d_year = 1999 + and d1.d_date_sk = ss_sold_date_sk + and i_item_sk = ss_item_sk + and s_store_sk = ss_store_sk + and ss_customer_sk = sr_customer_sk + and ss_item_sk = sr_item_sk + and ss_ticket_number = sr_ticket_number + and sr_returned_date_sk = d2.d_date_sk + and d2.d_moy between 4 and 4 + 3 + and d2.d_year = 1999 + and sr_customer_sk = cs_bill_customer_sk + and sr_item_sk = cs_item_sk + and cs_sold_date_sk = d3.d_date_sk + and d3.d_year in (1999,1999+1,1999+2) + group by + i_item_id + ,i_item_desc + ,s_store_id + ,s_store_name + order by + i_item_id + ,i_item_desc + ,s_store_id + ,s_store_name + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q3.sql b/benchmarks/queries/tpcds/q3.sql new file mode 100644 index 0000000000..d6a55cb8cf --- /dev/null +++ b/benchmarks/queries/tpcds/q3.sql @@ -0,0 +1,22 @@ +-- SQLBench-DS query 3 derived from TPC-DS query 3 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select dt.d_year + ,item.i_brand_id brand_id + ,item.i_brand brand + ,sum(ss_net_profit) sum_agg + from date_dim dt + ,store_sales + ,item + where dt.d_date_sk = store_sales.ss_sold_date_sk + and store_sales.ss_item_sk = item.i_item_sk + and item.i_manufact_id = 445 + and dt.d_moy=12 + group by dt.d_year + ,item.i_brand + ,item.i_brand_id + order by dt.d_year + ,sum_agg desc + ,brand_id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q30.sql b/benchmarks/queries/tpcds/q30.sql new file mode 100644 index 0000000000..7004078a50 --- /dev/null +++ b/benchmarks/queries/tpcds/q30.sql @@ -0,0 +1,32 @@ +-- SQLBench-DS query 30 derived from TPC-DS query 30 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
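+-- Customers (current address in KS) whose total year-2000 web-return amount exceeds 1.2 times the average total for their return-address state.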
+with customer_total_return as + (select wr_returning_customer_sk as ctr_customer_sk + ,ca_state as ctr_state, + sum(wr_return_amt) as ctr_total_return + from web_returns + ,date_dim + ,customer_address + where wr_returned_date_sk = d_date_sk + and d_year =2000 + and wr_returning_addr_sk = ca_address_sk + group by wr_returning_customer_sk + ,ca_state) + select c_customer_id,c_salutation,c_first_name,c_last_name,c_preferred_cust_flag + ,c_birth_day,c_birth_month,c_birth_year,c_birth_country,c_login,c_email_address + ,c_last_review_date_sk,ctr_total_return + from customer_total_return ctr1 + ,customer_address + ,customer + where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2 + from customer_total_return ctr2 + where ctr1.ctr_state = ctr2.ctr_state) + and ca_address_sk = c_current_addr_sk + and ca_state = 'KS' + and ctr1.ctr_customer_sk = c_customer_sk + order by c_customer_id,c_salutation,c_first_name,c_last_name,c_preferred_cust_flag + ,c_birth_day,c_birth_month,c_birth_year,c_birth_country,c_login,c_email_address + ,c_last_review_date_sk,ctr_total_return + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q31.sql b/benchmarks/queries/tpcds/q31.sql new file mode 100644 index 0000000000..89aba18998 --- /dev/null +++ b/benchmarks/queries/tpcds/q31.sql @@ -0,0 +1,53 @@ +-- SQLBench-DS query 31 derived from TPC-DS query 31 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with ss as + (select ca_county,d_qoy, d_year,sum(ss_ext_sales_price) as store_sales + from store_sales,date_dim,customer_address + where ss_sold_date_sk = d_date_sk + and ss_addr_sk=ca_address_sk + group by ca_county,d_qoy, d_year), + ws as + (select ca_county,d_qoy, d_year,sum(ws_ext_sales_price) as web_sales + from web_sales,date_dim,customer_address + where ws_sold_date_sk = d_date_sk + and ws_bill_addr_sk=ca_address_sk + group by ca_county,d_qoy, d_year) + select + ss1.ca_county + ,ss1.d_year + ,ws2.web_sales/ws1.web_sales web_q1_q2_increase + ,ss2.store_sales/ss1.store_sales store_q1_q2_increase + ,ws3.web_sales/ws2.web_sales web_q2_q3_increase + ,ss3.store_sales/ss2.store_sales store_q2_q3_increase + from + ss ss1 + ,ss ss2 + ,ss ss3 + ,ws ws1 + ,ws ws2 + ,ws ws3 + where + ss1.d_qoy = 1 + and ss1.d_year = 1999 + and ss1.ca_county = ss2.ca_county + and ss2.d_qoy = 2 + and ss2.d_year = 1999 + and ss2.ca_county = ss3.ca_county + and ss3.d_qoy = 3 + and ss3.d_year = 1999 + and ss1.ca_county = ws1.ca_county + and ws1.d_qoy = 1 + and ws1.d_year = 1999 + and ws1.ca_county = ws2.ca_county + and ws2.d_qoy = 2 + and ws2.d_year = 1999 + and ws1.ca_county = ws3.ca_county + and ws3.d_qoy = 3 + and ws3.d_year =1999 + and case when ws1.web_sales > 0 then ws2.web_sales/ws1.web_sales else null end + > case when ss1.store_sales > 0 then ss2.store_sales/ss1.store_sales else null end + and case when ws2.web_sales > 0 then ws3.web_sales/ws2.web_sales else null end + > case when ss2.store_sales > 0 then ss3.store_sales/ss2.store_sales else null end + order by ss1.ca_county; + diff --git a/benchmarks/queries/tpcds/q32.sql b/benchmarks/queries/tpcds/q32.sql new file mode 100644 index 0000000000..419dcd0b05 --- /dev/null +++ b/benchmarks/queries/tpcds/q32.sql @@ -0,0 +1,29 @@ +-- SQLBench-DS query 32 derived from TPC-DS query 32 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
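+-- "Excess discount amount": total catalog discount for manufacturer 283 items over a 90-day window, counting only sales whose discount exceeds 1.3 times the item's average discount in the same window.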
+select sum(cs_ext_discount_amt) as `excess discount amount` +from + catalog_sales + ,item + ,date_dim +where +i_manufact_id = 283 +and i_item_sk = cs_item_sk +and d_date between '1999-02-22' and + (cast('1999-02-22' as date) + INTERVAL '90 DAYS') +and d_date_sk = cs_sold_date_sk +and cs_ext_discount_amt + > ( + select + 1.3 * avg(cs_ext_discount_amt) + from + catalog_sales + ,date_dim + where + cs_item_sk = i_item_sk + and d_date between '1999-02-22' and + (cast('1999-02-22' as date) + INTERVAL '90 DAYS') + and d_date_sk = cs_sold_date_sk + ) + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q33.sql b/benchmarks/queries/tpcds/q33.sql new file mode 100644 index 0000000000..1aabc472b7 --- /dev/null +++ b/benchmarks/queries/tpcds/q33.sql @@ -0,0 +1,76 @@ +-- SQLBench-DS query 33 derived from TPC-DS query 33 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with ss as ( + select + i_manufact_id,sum(ss_ext_sales_price) total_sales + from + store_sales, + date_dim, + customer_address, + item + where + i_manufact_id in (select + i_manufact_id +from + item +where i_category in ('Books')) + and ss_item_sk = i_item_sk + and ss_sold_date_sk = d_date_sk + and d_year = 1999 + and d_moy = 4 + and ss_addr_sk = ca_address_sk + and ca_gmt_offset = -5 + group by i_manufact_id), + cs as ( + select + i_manufact_id,sum(cs_ext_sales_price) total_sales + from + catalog_sales, + date_dim, + customer_address, + item + where + i_manufact_id in (select + i_manufact_id +from + item +where i_category in ('Books')) + and cs_item_sk = i_item_sk + and cs_sold_date_sk = d_date_sk + and d_year = 1999 + and d_moy = 4 + and cs_bill_addr_sk = ca_address_sk + and ca_gmt_offset = -5 + group by i_manufact_id), + ws as ( + select + i_manufact_id,sum(ws_ext_sales_price) total_sales + from + web_sales, + date_dim, + customer_address, + item + where + i_manufact_id in (select + i_manufact_id +from + item +where i_category in ('Books')) + and ws_item_sk = i_item_sk + and ws_sold_date_sk = d_date_sk + and d_year = 1999 + and d_moy = 4 + and ws_bill_addr_sk = ca_address_sk + and ca_gmt_offset = -5 + group by i_manufact_id) + select i_manufact_id ,sum(total_sales) total_sales + from (select * from ss + union all + select * from cs + union all + select * from ws) tmp1 + group by i_manufact_id + order by total_sales + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q34.sql b/benchmarks/queries/tpcds/q34.sql new file mode 100644 index 0000000000..f61caa51a3 --- /dev/null +++ b/benchmarks/queries/tpcds/q34.sql @@ -0,0 +1,32 @@ +-- SQLBench-DS query 34 derived from TPC-DS query 34 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
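+-- Store tickets containing 15-20 items, limited to month-start/month-end days in 2000-2002 at Williamson County stores, for households with selected buy potential and a dependents-per-vehicle ratio above 1.2.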
+select c_last_name + ,c_first_name + ,c_salutation + ,c_preferred_cust_flag + ,ss_ticket_number + ,cnt from + (select ss_ticket_number + ,ss_customer_sk + ,count(*) cnt + from store_sales,date_dim,store,household_demographics + where store_sales.ss_sold_date_sk = date_dim.d_date_sk + and store_sales.ss_store_sk = store.s_store_sk + and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + and (date_dim.d_dom between 1 and 3 or date_dim.d_dom between 25 and 28) + and (household_demographics.hd_buy_potential = '501-1000' or + household_demographics.hd_buy_potential = 'Unknown') + and household_demographics.hd_vehicle_count > 0 + and (case when household_demographics.hd_vehicle_count > 0 + then household_demographics.hd_dep_count/ household_demographics.hd_vehicle_count + else null + end) > 1.2 + and date_dim.d_year in (2000,2000+1,2000+2) + and store.s_county in ('Williamson County','Williamson County','Williamson County','Williamson County', + 'Williamson County','Williamson County','Williamson County','Williamson County') + group by ss_ticket_number,ss_customer_sk) dn,customer + where ss_customer_sk = c_customer_sk + and cnt between 15 and 20 + order by c_last_name,c_first_name,c_salutation,c_preferred_cust_flag desc, ss_ticket_number; + diff --git a/benchmarks/queries/tpcds/q35.sql b/benchmarks/queries/tpcds/q35.sql new file mode 100644 index 0000000000..ba0ccf3667 --- /dev/null +++ b/benchmarks/queries/tpcds/q35.sql @@ -0,0 +1,59 @@ +-- SQLBench-DS query 35 derived from TPC-DS query 35 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select + ca_state, + cd_gender, + cd_marital_status, + cd_dep_count, + count(*) cnt1, + max(cd_dep_count), + stddev_samp(cd_dep_count), + stddev_samp(cd_dep_count), + cd_dep_employed_count, + count(*) cnt2, + max(cd_dep_employed_count), + stddev_samp(cd_dep_employed_count), + stddev_samp(cd_dep_employed_count), + cd_dep_college_count, + count(*) cnt3, + max(cd_dep_college_count), + stddev_samp(cd_dep_college_count), + stddev_samp(cd_dep_college_count) + from + customer c,customer_address ca,customer_demographics + where + c.c_current_addr_sk = ca.ca_address_sk and + cd_demo_sk = c.c_current_cdemo_sk and + exists (select * + from store_sales,date_dim + where c.c_customer_sk = ss_customer_sk and + ss_sold_date_sk = d_date_sk and + d_year = 2000 and + d_qoy < 4) and + (exists (select * + from web_sales,date_dim + where c.c_customer_sk = ws_bill_customer_sk and + ws_sold_date_sk = d_date_sk and + d_year = 2000 and + d_qoy < 4) or + exists (select * + from catalog_sales,date_dim + where c.c_customer_sk = cs_ship_customer_sk and + cs_sold_date_sk = d_date_sk and + d_year = 2000 and + d_qoy < 4)) + group by ca_state, + cd_gender, + cd_marital_status, + cd_dep_count, + cd_dep_employed_count, + cd_dep_college_count + order by ca_state, + cd_gender, + cd_marital_status, + cd_dep_count, + cd_dep_employed_count, + cd_dep_college_count + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q36.sql b/benchmarks/queries/tpcds/q36.sql new file mode 100644 index 0000000000..889fff5d14 --- /dev/null +++ b/benchmarks/queries/tpcds/q36.sql @@ -0,0 +1,31 @@ +-- SQLBench-DS query 36 derived from TPC-DS query 36 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
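+-- Gross margin (net profit / extended sales price) over a ROLLUP of item category and class for 2001 TN stores, ranked within each grouping level.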
+select + sum(ss_net_profit)/sum(ss_ext_sales_price) as gross_margin + ,i_category + ,i_class + ,grouping(i_category)+grouping(i_class) as lochierarchy + ,rank() over ( + partition by grouping(i_category)+grouping(i_class), + case when grouping(i_class) = 0 then i_category end + order by sum(ss_net_profit)/sum(ss_ext_sales_price) asc) as rank_within_parent + from + store_sales + ,date_dim d1 + ,item + ,store + where + d1.d_year = 2001 + and d1.d_date_sk = ss_sold_date_sk + and i_item_sk = ss_item_sk + and s_store_sk = ss_store_sk + and s_state in ('TN','TN','TN','TN', + 'TN','TN','TN','TN') + group by rollup(i_category,i_class) + order by + lochierarchy desc + ,case when lochierarchy = 0 then i_category end + ,rank_within_parent + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q37.sql b/benchmarks/queries/tpcds/q37.sql new file mode 100644 index 0000000000..bdd12dc82e --- /dev/null +++ b/benchmarks/queries/tpcds/q37.sql @@ -0,0 +1,18 @@ +-- SQLBench-DS query 37 derived from TPC-DS query 37 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select i_item_id + ,i_item_desc + ,i_current_price + from item, inventory, date_dim, catalog_sales + where i_current_price between 26 and 26 + 30 + and inv_item_sk = i_item_sk + and d_date_sk=inv_date_sk + and d_date between cast('2001-06-09' as date) and (cast('2001-06-09' as date) + INTERVAL '60 DAYS') + and i_manufact_id in (744,884,722,693) + and inv_quantity_on_hand between 100 and 500 + and cs_item_sk = i_item_sk + group by i_item_id,i_item_desc,i_current_price + order by i_item_id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q38.sql b/benchmarks/queries/tpcds/q38.sql new file mode 100644 index 0000000000..03e4e07635 --- /dev/null +++ b/benchmarks/queries/tpcds/q38.sql @@ -0,0 +1,24 @@ +-- SQLBench-DS query 38 derived from TPC-DS query 38 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select count(*) from ( + select distinct c_last_name, c_first_name, d_date + from store_sales, date_dim, customer + where store_sales.ss_sold_date_sk = date_dim.d_date_sk + and store_sales.ss_customer_sk = customer.c_customer_sk + and d_month_seq between 1190 and 1190 + 11 + intersect + select distinct c_last_name, c_first_name, d_date + from catalog_sales, date_dim, customer + where catalog_sales.cs_sold_date_sk = date_dim.d_date_sk + and catalog_sales.cs_bill_customer_sk = customer.c_customer_sk + and d_month_seq between 1190 and 1190 + 11 + intersect + select distinct c_last_name, c_first_name, d_date + from web_sales, date_dim, customer + where web_sales.ws_sold_date_sk = date_dim.d_date_sk + and web_sales.ws_bill_customer_sk = customer.c_customer_sk + and d_month_seq between 1190 and 1190 + 11 +) hot_cust + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q39.sql b/benchmarks/queries/tpcds/q39.sql new file mode 100644 index 0000000000..f49c223eba --- /dev/null +++ b/benchmarks/queries/tpcds/q39.sql @@ -0,0 +1,55 @@ +-- SQLBench-DS query 39 derived from TPC-DS query 39 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
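+-- Two statements over the same CTE: monthly inventory coefficient of variation (stddev/mean) per warehouse and item for 2001, self-joined to pair January with February; the second statement adds the filter inv1.cov > 1.5.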
+with inv as +(select w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy + ,stdev,mean, case mean when 0 then null else stdev/mean end cov + from(select w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy + ,stddev_samp(inv_quantity_on_hand) stdev,avg(inv_quantity_on_hand) mean + from inventory + ,item + ,warehouse + ,date_dim + where inv_item_sk = i_item_sk + and inv_warehouse_sk = w_warehouse_sk + and inv_date_sk = d_date_sk + and d_year =2001 + group by w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy) foo + where case mean when 0 then 0 else stdev/mean end > 1) +select inv1.w_warehouse_sk,inv1.i_item_sk,inv1.d_moy,inv1.mean, inv1.cov + ,inv2.w_warehouse_sk,inv2.i_item_sk,inv2.d_moy,inv2.mean, inv2.cov +from inv inv1,inv inv2 +where inv1.i_item_sk = inv2.i_item_sk + and inv1.w_warehouse_sk = inv2.w_warehouse_sk + and inv1.d_moy=1 + and inv2.d_moy=1+1 +order by inv1.w_warehouse_sk,inv1.i_item_sk,inv1.d_moy,inv1.mean,inv1.cov + ,inv2.d_moy,inv2.mean, inv2.cov +; +with inv as +(select w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy + ,stdev,mean, case mean when 0 then null else stdev/mean end cov + from(select w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy + ,stddev_samp(inv_quantity_on_hand) stdev,avg(inv_quantity_on_hand) mean + from inventory + ,item + ,warehouse + ,date_dim + where inv_item_sk = i_item_sk + and inv_warehouse_sk = w_warehouse_sk + and inv_date_sk = d_date_sk + and d_year =2001 + group by w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy) foo + where case mean when 0 then 0 else stdev/mean end > 1) +select inv1.w_warehouse_sk,inv1.i_item_sk,inv1.d_moy,inv1.mean, inv1.cov + ,inv2.w_warehouse_sk,inv2.i_item_sk,inv2.d_moy,inv2.mean, inv2.cov +from inv inv1,inv inv2 +where inv1.i_item_sk = inv2.i_item_sk + and inv1.w_warehouse_sk = inv2.w_warehouse_sk + and inv1.d_moy=1 + and inv2.d_moy=1+1 + and inv1.cov > 1.5 +order by inv1.w_warehouse_sk,inv1.i_item_sk,inv1.d_moy,inv1.mean,inv1.cov + ,inv2.d_moy,inv2.mean, inv2.cov +; + diff --git a/benchmarks/queries/tpcds/q4.sql b/benchmarks/queries/tpcds/q4.sql new file mode 100644 index 0000000000..08643201a5 --- /dev/null +++ b/benchmarks/queries/tpcds/q4.sql @@ -0,0 +1,117 @@ +-- SQLBench-DS query 4 derived from TPC-DS query 4 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
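+-- Customers whose 2001-to-2002 growth in catalog-channel spending exceeds both their store and web growth, computed from a three-way UNION ALL year_total CTE self-joined six times.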
+with year_total as ( + select c_customer_id customer_id + ,c_first_name customer_first_name + ,c_last_name customer_last_name + ,c_preferred_cust_flag customer_preferred_cust_flag + ,c_birth_country customer_birth_country + ,c_login customer_login + ,c_email_address customer_email_address + ,d_year dyear + ,sum(((ss_ext_list_price-ss_ext_wholesale_cost-ss_ext_discount_amt)+ss_ext_sales_price)/2) year_total + ,'s' sale_type + from customer + ,store_sales + ,date_dim + where c_customer_sk = ss_customer_sk + and ss_sold_date_sk = d_date_sk + group by c_customer_id + ,c_first_name + ,c_last_name + ,c_preferred_cust_flag + ,c_birth_country + ,c_login + ,c_email_address + ,d_year + union all + select c_customer_id customer_id + ,c_first_name customer_first_name + ,c_last_name customer_last_name + ,c_preferred_cust_flag customer_preferred_cust_flag + ,c_birth_country customer_birth_country + ,c_login customer_login + ,c_email_address customer_email_address + ,d_year dyear + ,sum((((cs_ext_list_price-cs_ext_wholesale_cost-cs_ext_discount_amt)+cs_ext_sales_price)/2) ) year_total + ,'c' sale_type + from customer + ,catalog_sales + ,date_dim + where c_customer_sk = cs_bill_customer_sk + and cs_sold_date_sk = d_date_sk + group by c_customer_id + ,c_first_name + ,c_last_name + ,c_preferred_cust_flag + ,c_birth_country + ,c_login + ,c_email_address + ,d_year +union all + select c_customer_id customer_id + ,c_first_name customer_first_name + ,c_last_name customer_last_name + ,c_preferred_cust_flag customer_preferred_cust_flag + ,c_birth_country customer_birth_country + ,c_login customer_login + ,c_email_address customer_email_address + ,d_year dyear + ,sum((((ws_ext_list_price-ws_ext_wholesale_cost-ws_ext_discount_amt)+ws_ext_sales_price)/2) ) year_total + ,'w' sale_type + from customer + ,web_sales + ,date_dim + where c_customer_sk = ws_bill_customer_sk + and ws_sold_date_sk = d_date_sk + group by c_customer_id + ,c_first_name + ,c_last_name + ,c_preferred_cust_flag + ,c_birth_country + ,c_login + ,c_email_address + ,d_year + ) + select + t_s_secyear.customer_id + ,t_s_secyear.customer_first_name + ,t_s_secyear.customer_last_name + ,t_s_secyear.customer_email_address + from year_total t_s_firstyear + ,year_total t_s_secyear + ,year_total t_c_firstyear + ,year_total t_c_secyear + ,year_total t_w_firstyear + ,year_total t_w_secyear + where t_s_secyear.customer_id = t_s_firstyear.customer_id + and t_s_firstyear.customer_id = t_c_secyear.customer_id + and t_s_firstyear.customer_id = t_c_firstyear.customer_id + and t_s_firstyear.customer_id = t_w_firstyear.customer_id + and t_s_firstyear.customer_id = t_w_secyear.customer_id + and t_s_firstyear.sale_type = 's' + and t_c_firstyear.sale_type = 'c' + and t_w_firstyear.sale_type = 'w' + and t_s_secyear.sale_type = 's' + and t_c_secyear.sale_type = 'c' + and t_w_secyear.sale_type = 'w' + and t_s_firstyear.dyear = 2001 + and t_s_secyear.dyear = 2001+1 + and t_c_firstyear.dyear = 2001 + and t_c_secyear.dyear = 2001+1 + and t_w_firstyear.dyear = 2001 + and t_w_secyear.dyear = 2001+1 + and t_s_firstyear.year_total > 0 + and t_c_firstyear.year_total > 0 + and t_w_firstyear.year_total > 0 + and case when t_c_firstyear.year_total > 0 then t_c_secyear.year_total / t_c_firstyear.year_total else null end + > case when t_s_firstyear.year_total > 0 then t_s_secyear.year_total / t_s_firstyear.year_total else null end + and case when t_c_firstyear.year_total > 0 then t_c_secyear.year_total / t_c_firstyear.year_total else null end + > case when t_w_firstyear.year_total > 0 
then t_w_secyear.year_total / t_w_firstyear.year_total else null end + order by t_s_secyear.customer_id + ,t_s_secyear.customer_first_name + ,t_s_secyear.customer_last_name + ,t_s_secyear.customer_email_address + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q40.sql b/benchmarks/queries/tpcds/q40.sql new file mode 100644 index 0000000000..7f54a9bbdf --- /dev/null +++ b/benchmarks/queries/tpcds/q40.sql @@ -0,0 +1,29 @@ +-- SQLBench-DS query 40 derived from TPC-DS query 40 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select + w_state + ,i_item_id + ,sum(case when (cast(d_date as date) < cast ('2002-05-18' as date)) + then cs_sales_price - coalesce(cr_refunded_cash,0) else 0 end) as sales_before + ,sum(case when (cast(d_date as date) >= cast ('2002-05-18' as date)) + then cs_sales_price - coalesce(cr_refunded_cash,0) else 0 end) as sales_after + from + catalog_sales left outer join catalog_returns on + (cs_order_number = cr_order_number + and cs_item_sk = cr_item_sk) + ,warehouse + ,item + ,date_dim + where + i_current_price between 0.99 and 1.49 + and i_item_sk = cs_item_sk + and cs_warehouse_sk = w_warehouse_sk + and cs_sold_date_sk = d_date_sk + and d_date between (cast ('2002-05-18' as date) - INTERVAL '30 DAYS') + and (cast ('2002-05-18' as date) + INTERVAL '30 DAYS') + group by + w_state,i_item_id + order by w_state,i_item_id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q41.sql b/benchmarks/queries/tpcds/q41.sql new file mode 100644 index 0000000000..d561cdba50 --- /dev/null +++ b/benchmarks/queries/tpcds/q41.sql @@ -0,0 +1,53 @@ +-- SQLBench-DS query 41 derived from TPC-DS query 41 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
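+-- Distinct product names within a fixed manufacturer ID range where the same manufacturer has at least one item matching one of eight category/color/units/size combinations.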
+select distinct(i_product_name) + from item i1 + where i_manufact_id between 668 and 668+40 + and (select count(*) as item_cnt + from item + where (i_manufact = i1.i_manufact and + ((i_category = 'Women' and + (i_color = 'cream' or i_color = 'ghost') and + (i_units = 'Ton' or i_units = 'Gross') and + (i_size = 'economy' or i_size = 'small') + ) or + (i_category = 'Women' and + (i_color = 'midnight' or i_color = 'burlywood') and + (i_units = 'Tsp' or i_units = 'Bundle') and + (i_size = 'medium' or i_size = 'extra large') + ) or + (i_category = 'Men' and + (i_color = 'lavender' or i_color = 'azure') and + (i_units = 'Each' or i_units = 'Lb') and + (i_size = 'large' or i_size = 'N/A') + ) or + (i_category = 'Men' and + (i_color = 'chocolate' or i_color = 'steel') and + (i_units = 'N/A' or i_units = 'Dozen') and + (i_size = 'economy' or i_size = 'small') + ))) or + (i_manufact = i1.i_manufact and + ((i_category = 'Women' and + (i_color = 'floral' or i_color = 'royal') and + (i_units = 'Unknown' or i_units = 'Tbl') and + (i_size = 'economy' or i_size = 'small') + ) or + (i_category = 'Women' and + (i_color = 'navy' or i_color = 'forest') and + (i_units = 'Bunch' or i_units = 'Dram') and + (i_size = 'medium' or i_size = 'extra large') + ) or + (i_category = 'Men' and + (i_color = 'cyan' or i_color = 'indian') and + (i_units = 'Carton' or i_units = 'Cup') and + (i_size = 'large' or i_size = 'N/A') + ) or + (i_category = 'Men' and + (i_color = 'coral' or i_color = 'pale') and + (i_units = 'Pallet' or i_units = 'Gram') and + (i_size = 'economy' or i_size = 'small') + )))) > 0 + order by i_product_name + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q42.sql b/benchmarks/queries/tpcds/q42.sql new file mode 100644 index 0000000000..ac91e7cc2b --- /dev/null +++ b/benchmarks/queries/tpcds/q42.sql @@ -0,0 +1,23 @@ +-- SQLBench-DS query 42 derived from TPC-DS query 42 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select dt.d_year + ,item.i_category_id + ,item.i_category + ,sum(ss_ext_sales_price) + from date_dim dt + ,store_sales + ,item + where dt.d_date_sk = store_sales.ss_sold_date_sk + and store_sales.ss_item_sk = item.i_item_sk + and item.i_manager_id = 1 + and dt.d_moy=11 + and dt.d_year=1998 + group by dt.d_year + ,item.i_category_id + ,item.i_category + order by sum(ss_ext_sales_price) desc,dt.d_year + ,item.i_category_id + ,item.i_category + LIMIT 100 ; + diff --git a/benchmarks/queries/tpcds/q43.sql b/benchmarks/queries/tpcds/q43.sql new file mode 100644 index 0000000000..ca09e8e77d --- /dev/null +++ b/benchmarks/queries/tpcds/q43.sql @@ -0,0 +1,20 @@ +-- SQLBench-DS query 43 derived from TPC-DS query 43 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
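+-- Year-2000 store sales pivoted into one column per weekday (sun_sales through sat_sales) for stores at GMT offset -5.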
+select s_store_name, s_store_id, + sum(case when (d_day_name='Sunday') then ss_sales_price else null end) sun_sales, + sum(case when (d_day_name='Monday') then ss_sales_price else null end) mon_sales, + sum(case when (d_day_name='Tuesday') then ss_sales_price else null end) tue_sales, + sum(case when (d_day_name='Wednesday') then ss_sales_price else null end) wed_sales, + sum(case when (d_day_name='Thursday') then ss_sales_price else null end) thu_sales, + sum(case when (d_day_name='Friday') then ss_sales_price else null end) fri_sales, + sum(case when (d_day_name='Saturday') then ss_sales_price else null end) sat_sales + from date_dim, store_sales, store + where d_date_sk = ss_sold_date_sk and + s_store_sk = ss_store_sk and + s_gmt_offset = -5 and + d_year = 2000 + group by s_store_name, s_store_id + order by s_store_name, s_store_id,sun_sales,mon_sales,tue_sales,wed_sales,thu_sales,fri_sales,sat_sales + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q44.sql b/benchmarks/queries/tpcds/q44.sql new file mode 100644 index 0000000000..8c635cef49 --- /dev/null +++ b/benchmarks/queries/tpcds/q44.sql @@ -0,0 +1,36 @@ +-- SQLBench-DS query 44 derived from TPC-DS query 44 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select asceding.rnk, i1.i_product_name best_performing, i2.i_product_name worst_performing +from(select * + from (select item_sk,rank() over (order by rank_col asc) rnk + from (select ss_item_sk item_sk,avg(ss_net_profit) rank_col + from store_sales ss1 + where ss_store_sk = 6 + group by ss_item_sk + having avg(ss_net_profit) > 0.9*(select avg(ss_net_profit) rank_col + from store_sales + where ss_store_sk = 6 + and ss_hdemo_sk is null + group by ss_store_sk))V1)V11 + where rnk < 11) asceding, + (select * + from (select item_sk,rank() over (order by rank_col desc) rnk + from (select ss_item_sk item_sk,avg(ss_net_profit) rank_col + from store_sales ss1 + where ss_store_sk = 6 + group by ss_item_sk + having avg(ss_net_profit) > 0.9*(select avg(ss_net_profit) rank_col + from store_sales + where ss_store_sk = 6 + and ss_hdemo_sk is null + group by ss_store_sk))V2)V21 + where rnk < 11) descending, +item i1, +item i2 +where asceding.rnk = descending.rnk + and i1.i_item_sk=asceding.item_sk + and i2.i_item_sk=descending.item_sk +order by asceding.rnk + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q45.sql b/benchmarks/queries/tpcds/q45.sql new file mode 100644 index 0000000000..682cc9b54d --- /dev/null +++ b/benchmarks/queries/tpcds/q45.sql @@ -0,0 +1,21 @@ +-- SQLBench-DS query 45 derived from TPC-DS query 45 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
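+-- Q2-2000 web sales summed by customer ZIP and city, restricted to selected five-digit ZIP prefixes or to purchases of a fixed set of items.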
+select ca_zip, ca_city, sum(ws_sales_price) + from web_sales, customer, customer_address, date_dim, item + where ws_bill_customer_sk = c_customer_sk + and c_current_addr_sk = ca_address_sk + and ws_item_sk = i_item_sk + and ( substr(ca_zip,1,5) in ('85669', '86197','88274','83405','86475', '85392', '85460', '80348', '81792') + or + i_item_id in (select i_item_id + from item + where i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 23, 29) + ) + ) + and ws_sold_date_sk = d_date_sk + and d_qoy = 2 and d_year = 2000 + group by ca_zip, ca_city + order by ca_zip, ca_city + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q46.sql b/benchmarks/queries/tpcds/q46.sql new file mode 100644 index 0000000000..81ae1d5815 --- /dev/null +++ b/benchmarks/queries/tpcds/q46.sql @@ -0,0 +1,36 @@ +-- SQLBench-DS query 46 derived from TPC-DS query 46 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select c_last_name + ,c_first_name + ,ca_city + ,bought_city + ,ss_ticket_number + ,amt,profit + from + (select ss_ticket_number + ,ss_customer_sk + ,ca_city bought_city + ,sum(ss_coupon_amt) amt + ,sum(ss_net_profit) profit + from store_sales,date_dim,store,household_demographics,customer_address + where store_sales.ss_sold_date_sk = date_dim.d_date_sk + and store_sales.ss_store_sk = store.s_store_sk + and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + and store_sales.ss_addr_sk = customer_address.ca_address_sk + and (household_demographics.hd_dep_count = 3 or + household_demographics.hd_vehicle_count= 1) + and date_dim.d_dow in (6,0) + and date_dim.d_year in (1999,1999+1,1999+2) + and store.s_city in ('Midway','Fairview','Fairview','Midway','Fairview') + group by ss_ticket_number,ss_customer_sk,ss_addr_sk,ca_city) dn,customer,customer_address current_addr + where ss_customer_sk = c_customer_sk + and customer.c_current_addr_sk = current_addr.ca_address_sk + and current_addr.ca_city <> bought_city + order by c_last_name + ,c_first_name + ,ca_city + ,bought_city + ,ss_ticket_number + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q47.sql b/benchmarks/queries/tpcds/q47.sql new file mode 100644 index 0000000000..f741fe44cd --- /dev/null +++ b/benchmarks/queries/tpcds/q47.sql @@ -0,0 +1,52 @@ +-- SQLBench-DS query 47 derived from TPC-DS query 47 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
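+-- Item/brand/store months in 2001 whose sales deviate more than 10% from that year's monthly average, with the previous and next months' sales retrieved through a rank-based self-join.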
+with v1 as( + select i_category, i_brand, + s_store_name, s_company_name, + d_year, d_moy, + sum(ss_sales_price) sum_sales, + avg(sum(ss_sales_price)) over + (partition by i_category, i_brand, + s_store_name, s_company_name, d_year) + avg_monthly_sales, + rank() over + (partition by i_category, i_brand, + s_store_name, s_company_name + order by d_year, d_moy) rn + from item, store_sales, date_dim, store + where ss_item_sk = i_item_sk and + ss_sold_date_sk = d_date_sk and + ss_store_sk = s_store_sk and + ( + d_year = 2001 or + ( d_year = 2001-1 and d_moy =12) or + ( d_year = 2001+1 and d_moy =1) + ) + group by i_category, i_brand, + s_store_name, s_company_name, + d_year, d_moy), + v2 as( + select v1.i_category, v1.i_brand, v1.s_store_name, v1.s_company_name + ,v1.d_year + ,v1.avg_monthly_sales + ,v1.sum_sales, v1_lag.sum_sales psum, v1_lead.sum_sales nsum + from v1, v1 v1_lag, v1 v1_lead + where v1.i_category = v1_lag.i_category and + v1.i_category = v1_lead.i_category and + v1.i_brand = v1_lag.i_brand and + v1.i_brand = v1_lead.i_brand and + v1.s_store_name = v1_lag.s_store_name and + v1.s_store_name = v1_lead.s_store_name and + v1.s_company_name = v1_lag.s_company_name and + v1.s_company_name = v1_lead.s_company_name and + v1.rn = v1_lag.rn + 1 and + v1.rn = v1_lead.rn - 1) + select * + from v2 + where d_year = 2001 and + avg_monthly_sales > 0 and + case when avg_monthly_sales > 0 then abs(sum_sales - avg_monthly_sales) / avg_monthly_sales else null end > 0.1 + order by sum_sales - avg_monthly_sales, nsum + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q48.sql b/benchmarks/queries/tpcds/q48.sql new file mode 100644 index 0000000000..fb83279b13 --- /dev/null +++ b/benchmarks/queries/tpcds/q48.sql @@ -0,0 +1,68 @@ +-- SQLBench-DS query 48 derived from TPC-DS query 48 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select sum (ss_quantity) + from store_sales, store, customer_demographics, customer_address, date_dim + where s_store_sk = ss_store_sk + and ss_sold_date_sk = d_date_sk and d_year = 2001 + and + ( + ( + cd_demo_sk = ss_cdemo_sk + and + cd_marital_status = 'W' + and + cd_education_status = '2 yr Degree' + and + ss_sales_price between 100.00 and 150.00 + ) + or + ( + cd_demo_sk = ss_cdemo_sk + and + cd_marital_status = 'S' + and + cd_education_status = 'Advanced Degree' + and + ss_sales_price between 50.00 and 100.00 + ) + or + ( + cd_demo_sk = ss_cdemo_sk + and + cd_marital_status = 'D' + and + cd_education_status = 'Primary' + and + ss_sales_price between 150.00 and 200.00 + ) + ) + and + ( + ( + ss_addr_sk = ca_address_sk + and + ca_country = 'United States' + and + ca_state in ('IL', 'KY', 'OR') + and ss_net_profit between 0 and 2000 + ) + or + (ss_addr_sk = ca_address_sk + and + ca_country = 'United States' + and + ca_state in ('VA', 'FL', 'AL') + and ss_net_profit between 150 and 3000 + ) + or + (ss_addr_sk = ca_address_sk + and + ca_country = 'United States' + and + ca_state in ('OK', 'IA', 'TX') + and ss_net_profit between 50 and 25000 + ) + ) +; + diff --git a/benchmarks/queries/tpcds/q49.sql b/benchmarks/queries/tpcds/q49.sql new file mode 100644 index 0000000000..c97286528b --- /dev/null +++ b/benchmarks/queries/tpcds/q49.sql @@ -0,0 +1,130 @@ +-- SQLBench-DS query 49 derived from TPC-DS query 49 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. 
+-- This query was generated at scale factor 1. +select channel, item, return_ratio, return_rank, currency_rank from + (select + 'web' as channel + ,web.item + ,web.return_ratio + ,web.return_rank + ,web.currency_rank + from ( + select + item + ,return_ratio + ,currency_ratio + ,rank() over (order by return_ratio) as return_rank + ,rank() over (order by currency_ratio) as currency_rank + from + ( select ws.ws_item_sk as item + ,(cast(sum(coalesce(wr.wr_return_quantity,0)) as decimal(15,4))/ + cast(sum(coalesce(ws.ws_quantity,0)) as decimal(15,4) )) as return_ratio + ,(cast(sum(coalesce(wr.wr_return_amt,0)) as decimal(15,4))/ + cast(sum(coalesce(ws.ws_net_paid,0)) as decimal(15,4) )) as currency_ratio + from + web_sales ws left outer join web_returns wr + on (ws.ws_order_number = wr.wr_order_number and + ws.ws_item_sk = wr.wr_item_sk) + ,date_dim + where + wr.wr_return_amt > 10000 + and ws.ws_net_profit > 1 + and ws.ws_net_paid > 0 + and ws.ws_quantity > 0 + and ws_sold_date_sk = d_date_sk + and d_year = 2000 + and d_moy = 12 + group by ws.ws_item_sk + ) in_web + ) web + where + ( + web.return_rank <= 10 + or + web.currency_rank <= 10 + ) + union + select + 'catalog' as channel + ,catalog.item + ,catalog.return_ratio + ,catalog.return_rank + ,catalog.currency_rank + from ( + select + item + ,return_ratio + ,currency_ratio + ,rank() over (order by return_ratio) as return_rank + ,rank() over (order by currency_ratio) as currency_rank + from + ( select + cs.cs_item_sk as item + ,(cast(sum(coalesce(cr.cr_return_quantity,0)) as decimal(15,4))/ + cast(sum(coalesce(cs.cs_quantity,0)) as decimal(15,4) )) as return_ratio + ,(cast(sum(coalesce(cr.cr_return_amount,0)) as decimal(15,4))/ + cast(sum(coalesce(cs.cs_net_paid,0)) as decimal(15,4) )) as currency_ratio + from + catalog_sales cs left outer join catalog_returns cr + on (cs.cs_order_number = cr.cr_order_number and + cs.cs_item_sk = cr.cr_item_sk) + ,date_dim + where + cr.cr_return_amount > 10000 + and cs.cs_net_profit > 1 + and cs.cs_net_paid > 0 + and cs.cs_quantity > 0 + and cs_sold_date_sk = d_date_sk + and d_year = 2000 + and d_moy = 12 + group by cs.cs_item_sk + ) in_cat + ) catalog + where + ( + catalog.return_rank <= 10 + or + catalog.currency_rank <=10 + ) + union + select + 'store' as channel + ,store.item + ,store.return_ratio + ,store.return_rank + ,store.currency_rank + from ( + select + item + ,return_ratio + ,currency_ratio + ,rank() over (order by return_ratio) as return_rank + ,rank() over (order by currency_ratio) as currency_rank + from + ( select sts.ss_item_sk as item + ,(cast(sum(coalesce(sr.sr_return_quantity,0)) as decimal(15,4))/cast(sum(coalesce(sts.ss_quantity,0)) as decimal(15,4) )) as return_ratio + ,(cast(sum(coalesce(sr.sr_return_amt,0)) as decimal(15,4))/cast(sum(coalesce(sts.ss_net_paid,0)) as decimal(15,4) )) as currency_ratio + from + store_sales sts left outer join store_returns sr + on (sts.ss_ticket_number = sr.sr_ticket_number and sts.ss_item_sk = sr.sr_item_sk) + ,date_dim + where + sr.sr_return_amt > 10000 + and sts.ss_net_profit > 1 + and sts.ss_net_paid > 0 + and sts.ss_quantity > 0 + and ss_sold_date_sk = d_date_sk + and d_year = 2000 + and d_moy = 12 + group by sts.ss_item_sk + ) in_store + ) store + where ( + store.return_rank <= 10 + or + store.currency_rank <= 10 + ) + ) + order by 1,4,5,2 + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q5.sql b/benchmarks/queries/tpcds/q5.sql new file mode 100644 index 0000000000..4f2721634c --- /dev/null +++ b/benchmarks/queries/tpcds/q5.sql @@ -0,0 +1,129 
@@ +-- SQLBench-DS query 5 derived from TPC-DS query 5 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with ssr as + (select s_store_id, + sum(sales_price) as sales, + sum(profit) as profit, + sum(return_amt) as returns, + sum(net_loss) as profit_loss + from + ( select ss_store_sk as store_sk, + ss_sold_date_sk as date_sk, + ss_ext_sales_price as sales_price, + ss_net_profit as profit, + cast(0 as decimal(7,2)) as return_amt, + cast(0 as decimal(7,2)) as net_loss + from store_sales + union all + select sr_store_sk as store_sk, + sr_returned_date_sk as date_sk, + cast(0 as decimal(7,2)) as sales_price, + cast(0 as decimal(7,2)) as profit, + sr_return_amt as return_amt, + sr_net_loss as net_loss + from store_returns + ) salesreturns, + date_dim, + store + where date_sk = d_date_sk + and d_date between cast('2001-08-04' as date) + and (cast('2001-08-04' as date) + INTERVAL '14 DAYS') + and store_sk = s_store_sk + group by s_store_id) + , + csr as + (select cp_catalog_page_id, + sum(sales_price) as sales, + sum(profit) as profit, + sum(return_amt) as returns, + sum(net_loss) as profit_loss + from + ( select cs_catalog_page_sk as page_sk, + cs_sold_date_sk as date_sk, + cs_ext_sales_price as sales_price, + cs_net_profit as profit, + cast(0 as decimal(7,2)) as return_amt, + cast(0 as decimal(7,2)) as net_loss + from catalog_sales + union all + select cr_catalog_page_sk as page_sk, + cr_returned_date_sk as date_sk, + cast(0 as decimal(7,2)) as sales_price, + cast(0 as decimal(7,2)) as profit, + cr_return_amount as return_amt, + cr_net_loss as net_loss + from catalog_returns + ) salesreturns, + date_dim, + catalog_page + where date_sk = d_date_sk + and d_date between cast('2001-08-04' as date) + and (cast('2001-08-04' as date) + INTERVAL '14 DAYS') + and page_sk = cp_catalog_page_sk + group by cp_catalog_page_id) + , + wsr as + (select web_site_id, + sum(sales_price) as sales, + sum(profit) as profit, + sum(return_amt) as returns, + sum(net_loss) as profit_loss + from + ( select ws_web_site_sk as wsr_web_site_sk, + ws_sold_date_sk as date_sk, + ws_ext_sales_price as sales_price, + ws_net_profit as profit, + cast(0 as decimal(7,2)) as return_amt, + cast(0 as decimal(7,2)) as net_loss + from web_sales + union all + select ws_web_site_sk as wsr_web_site_sk, + wr_returned_date_sk as date_sk, + cast(0 as decimal(7,2)) as sales_price, + cast(0 as decimal(7,2)) as profit, + wr_return_amt as return_amt, + wr_net_loss as net_loss + from web_returns left outer join web_sales on + ( wr_item_sk = ws_item_sk + and wr_order_number = ws_order_number) + ) salesreturns, + date_dim, + web_site + where date_sk = d_date_sk + and d_date between cast('2001-08-04' as date) + and (cast('2001-08-04' as date) + INTERVAL '14 DAYS') + and wsr_web_site_sk = web_site_sk + group by web_site_id) + select channel + , id + , sum(sales) as sales + , sum(returns) as returns + , sum(profit) as profit + from + (select 'store channel' as channel + , 'store' || s_store_id as id + , sales + , returns + , (profit - profit_loss) as profit + from ssr + union all + select 'catalog channel' as channel + , 'catalog_page' || cp_catalog_page_id as id + , sales + , returns + , (profit - profit_loss) as profit + from csr + union all + select 'web channel' as channel + , 'web_site' || web_site_id as id + , sales + , returns + , (profit - profit_loss) as profit + from wsr + ) x + group by rollup (channel, id) + order by 
channel + ,id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q50.sql b/benchmarks/queries/tpcds/q50.sql new file mode 100644 index 0000000000..d3dd26a156 --- /dev/null +++ b/benchmarks/queries/tpcds/q50.sql @@ -0,0 +1,60 @@ +-- SQLBench-DS query 50 derived from TPC-DS query 50 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select + s_store_name + ,s_company_id + ,s_street_number + ,s_street_name + ,s_street_type + ,s_suite_number + ,s_city + ,s_county + ,s_state + ,s_zip + ,sum(case when (sr_returned_date_sk - ss_sold_date_sk <= 30 ) then 1 else 0 end) as `30 days` + ,sum(case when (sr_returned_date_sk - ss_sold_date_sk > 30) and + (sr_returned_date_sk - ss_sold_date_sk <= 60) then 1 else 0 end ) as `31-60 days` + ,sum(case when (sr_returned_date_sk - ss_sold_date_sk > 60) and + (sr_returned_date_sk - ss_sold_date_sk <= 90) then 1 else 0 end) as `61-90 days` + ,sum(case when (sr_returned_date_sk - ss_sold_date_sk > 90) and + (sr_returned_date_sk - ss_sold_date_sk <= 120) then 1 else 0 end) as `91-120 days` + ,sum(case when (sr_returned_date_sk - ss_sold_date_sk > 120) then 1 else 0 end) as `>120 days` +from + store_sales + ,store_returns + ,store + ,date_dim d1 + ,date_dim d2 +where + d2.d_year = 2002 +and d2.d_moy = 8 +and ss_ticket_number = sr_ticket_number +and ss_item_sk = sr_item_sk +and ss_sold_date_sk = d1.d_date_sk +and sr_returned_date_sk = d2.d_date_sk +and ss_customer_sk = sr_customer_sk +and ss_store_sk = s_store_sk +group by + s_store_name + ,s_company_id + ,s_street_number + ,s_street_name + ,s_street_type + ,s_suite_number + ,s_city + ,s_county + ,s_state + ,s_zip +order by s_store_name + ,s_company_id + ,s_street_number + ,s_street_name + ,s_street_type + ,s_suite_number + ,s_city + ,s_county + ,s_state + ,s_zip + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q51.sql b/benchmarks/queries/tpcds/q51.sql new file mode 100644 index 0000000000..5aeb3087b4 --- /dev/null +++ b/benchmarks/queries/tpcds/q51.sql @@ -0,0 +1,46 @@ +-- SQLBench-DS query 51 derived from TPC-DS query 51 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
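+-- Running cumulative web vs. store sales per item over a 12-month sequence (FULL OUTER JOIN of the two channels), returning item/date pairs where the web cumulative total exceeds the store total.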
+WITH web_v1 as ( +select + ws_item_sk item_sk, d_date, + sum(sum(ws_sales_price)) + over (partition by ws_item_sk order by d_date rows between unbounded preceding and current row) cume_sales +from web_sales + ,date_dim +where ws_sold_date_sk=d_date_sk + and d_month_seq between 1215 and 1215+11 + and ws_item_sk is not NULL +group by ws_item_sk, d_date), +store_v1 as ( +select + ss_item_sk item_sk, d_date, + sum(sum(ss_sales_price)) + over (partition by ss_item_sk order by d_date rows between unbounded preceding and current row) cume_sales +from store_sales + ,date_dim +where ss_sold_date_sk=d_date_sk + and d_month_seq between 1215 and 1215+11 + and ss_item_sk is not NULL +group by ss_item_sk, d_date) + select * +from (select item_sk + ,d_date + ,web_sales + ,store_sales + ,max(web_sales) + over (partition by item_sk order by d_date rows between unbounded preceding and current row) web_cumulative + ,max(store_sales) + over (partition by item_sk order by d_date rows between unbounded preceding and current row) store_cumulative + from (select case when web.item_sk is not null then web.item_sk else store.item_sk end item_sk + ,case when web.d_date is not null then web.d_date else store.d_date end d_date + ,web.cume_sales web_sales + ,store.cume_sales store_sales + from web_v1 web full outer join store_v1 store on (web.item_sk = store.item_sk + and web.d_date = store.d_date) + )x )y +where web_cumulative > store_cumulative +order by item_sk + ,d_date + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q52.sql b/benchmarks/queries/tpcds/q52.sql new file mode 100644 index 0000000000..b4d032baec --- /dev/null +++ b/benchmarks/queries/tpcds/q52.sql @@ -0,0 +1,23 @@ +-- SQLBench-DS query 52 derived from TPC-DS query 52 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select dt.d_year + ,item.i_brand_id brand_id + ,item.i_brand brand + ,sum(ss_ext_sales_price) ext_price + from date_dim dt + ,store_sales + ,item + where dt.d_date_sk = store_sales.ss_sold_date_sk + and store_sales.ss_item_sk = item.i_item_sk + and item.i_manager_id = 1 + and dt.d_moy=11 + and dt.d_year=2000 + group by dt.d_year + ,item.i_brand + ,item.i_brand_id + order by dt.d_year + ,ext_price desc + ,brand_id + LIMIT 100 ; + diff --git a/benchmarks/queries/tpcds/q53.sql b/benchmarks/queries/tpcds/q53.sql new file mode 100644 index 0000000000..4c87797741 --- /dev/null +++ b/benchmarks/queries/tpcds/q53.sql @@ -0,0 +1,29 @@ +-- SQLBench-DS query 53 derived from TPC-DS query 53 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
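+-- Manufacturers whose quarterly store sales deviate more than 10% from their average quarterly sales across a 12-month sequence, for selected category/class/brand combinations.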
+select * from +(select i_manufact_id, +sum(ss_sales_price) sum_sales, +avg(sum(ss_sales_price)) over (partition by i_manufact_id) avg_quarterly_sales +from item, store_sales, date_dim, store +where ss_item_sk = i_item_sk and +ss_sold_date_sk = d_date_sk and +ss_store_sk = s_store_sk and +d_month_seq in (1197,1197+1,1197+2,1197+3,1197+4,1197+5,1197+6,1197+7,1197+8,1197+9,1197+10,1197+11) and +((i_category in ('Books','Children','Electronics') and +i_class in ('personal','portable','reference','self-help') and +i_brand in ('scholaramalgamalg #14','scholaramalgamalg #7', + 'exportiunivamalg #9','scholaramalgamalg #9')) +or(i_category in ('Women','Music','Men') and +i_class in ('accessories','classical','fragrances','pants') and +i_brand in ('amalgimporto #1','edu packscholar #1','exportiimporto #1', + 'importoamalg #1'))) +group by i_manufact_id, d_qoy ) tmp1 +where case when avg_quarterly_sales > 0 + then abs (sum_sales - avg_quarterly_sales)/ avg_quarterly_sales + else null end > 0.1 +order by avg_quarterly_sales, + sum_sales, + i_manufact_id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q54.sql b/benchmarks/queries/tpcds/q54.sql new file mode 100644 index 0000000000..4b382e1abe --- /dev/null +++ b/benchmarks/queries/tpcds/q54.sql @@ -0,0 +1,57 @@ +-- SQLBench-DS query 54 derived from TPC-DS query 54 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with my_customers as ( + select distinct c_customer_sk + , c_current_addr_sk + from + ( select cs_sold_date_sk sold_date_sk, + cs_bill_customer_sk customer_sk, + cs_item_sk item_sk + from catalog_sales + union all + select ws_sold_date_sk sold_date_sk, + ws_bill_customer_sk customer_sk, + ws_item_sk item_sk + from web_sales + ) cs_or_ws_sales, + item, + date_dim, + customer + where sold_date_sk = d_date_sk + and item_sk = i_item_sk + and i_category = 'Men' + and i_class = 'shirts' + and c_customer_sk = cs_or_ws_sales.customer_sk + and d_moy = 4 + and d_year = 1998 + ) + , my_revenue as ( + select c_customer_sk, + sum(ss_ext_sales_price) as revenue + from my_customers, + store_sales, + customer_address, + store, + date_dim + where c_current_addr_sk = ca_address_sk + and ca_county = s_county + and ca_state = s_state + and ss_sold_date_sk = d_date_sk + and c_customer_sk = ss_customer_sk + and d_month_seq between (select distinct d_month_seq+1 + from date_dim where d_year = 1998 and d_moy = 4) + and (select distinct d_month_seq+3 + from date_dim where d_year = 1998 and d_moy = 4) + group by c_customer_sk + ) + , segments as + (select cast((revenue/50) as int) as segment + from my_revenue + ) + select segment, count(*) as num_customers, segment*50 as segment_base + from segments + group by segment + order by segment, num_customers + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q55.sql b/benchmarks/queries/tpcds/q55.sql new file mode 100644 index 0000000000..5dabcab05f --- /dev/null +++ b/benchmarks/queries/tpcds/q55.sql @@ -0,0 +1,15 @@ +-- SQLBench-DS query 55 derived from TPC-DS query 55 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
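+-- Extended store-sales price by brand for manager 20 in December 1998.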
+select i_brand_id brand_id, i_brand brand, + sum(ss_ext_sales_price) ext_price + from date_dim, store_sales, item + where d_date_sk = ss_sold_date_sk + and ss_item_sk = i_item_sk + and i_manager_id=20 + and d_moy=12 + and d_year=1998 + group by i_brand, i_brand_id + order by ext_price desc, i_brand_id + LIMIT 100 ; + diff --git a/benchmarks/queries/tpcds/q56.sql b/benchmarks/queries/tpcds/q56.sql new file mode 100644 index 0000000000..d877d0b8b9 --- /dev/null +++ b/benchmarks/queries/tpcds/q56.sql @@ -0,0 +1,70 @@ +-- SQLBench-DS query 56 derived from TPC-DS query 56 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with ss as ( + select i_item_id,sum(ss_ext_sales_price) total_sales + from + store_sales, + date_dim, + customer_address, + item + where i_item_id in (select + i_item_id +from item +where i_color in ('powder','goldenrod','bisque')) + and ss_item_sk = i_item_sk + and ss_sold_date_sk = d_date_sk + and d_year = 1998 + and d_moy = 5 + and ss_addr_sk = ca_address_sk + and ca_gmt_offset = -5 + group by i_item_id), + cs as ( + select i_item_id,sum(cs_ext_sales_price) total_sales + from + catalog_sales, + date_dim, + customer_address, + item + where + i_item_id in (select + i_item_id +from item +where i_color in ('powder','goldenrod','bisque')) + and cs_item_sk = i_item_sk + and cs_sold_date_sk = d_date_sk + and d_year = 1998 + and d_moy = 5 + and cs_bill_addr_sk = ca_address_sk + and ca_gmt_offset = -5 + group by i_item_id), + ws as ( + select i_item_id,sum(ws_ext_sales_price) total_sales + from + web_sales, + date_dim, + customer_address, + item + where + i_item_id in (select + i_item_id +from item +where i_color in ('powder','goldenrod','bisque')) + and ws_item_sk = i_item_sk + and ws_sold_date_sk = d_date_sk + and d_year = 1998 + and d_moy = 5 + and ws_bill_addr_sk = ca_address_sk + and ca_gmt_offset = -5 + group by i_item_id) + select i_item_id ,sum(total_sales) total_sales + from (select * from ss + union all + select * from cs + union all + select * from ws) tmp1 + group by i_item_id + order by total_sales, + i_item_id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q57.sql b/benchmarks/queries/tpcds/q57.sql new file mode 100644 index 0000000000..088ddc9eeb --- /dev/null +++ b/benchmarks/queries/tpcds/q57.sql @@ -0,0 +1,49 @@ +-- SQLBench-DS query 57 derived from TPC-DS query 57 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
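+-- Catalog-channel counterpart of q47: item/brand/call-center months in 2000 deviating more than 10% from the yearly monthly average, with previous/next month sales via a rank-based self-join.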
+with v1 as( + select i_category, i_brand, + cc_name, + d_year, d_moy, + sum(cs_sales_price) sum_sales, + avg(sum(cs_sales_price)) over + (partition by i_category, i_brand, + cc_name, d_year) + avg_monthly_sales, + rank() over + (partition by i_category, i_brand, + cc_name + order by d_year, d_moy) rn + from item, catalog_sales, date_dim, call_center + where cs_item_sk = i_item_sk and + cs_sold_date_sk = d_date_sk and + cc_call_center_sk= cs_call_center_sk and + ( + d_year = 2000 or + ( d_year = 2000-1 and d_moy =12) or + ( d_year = 2000+1 and d_moy =1) + ) + group by i_category, i_brand, + cc_name , d_year, d_moy), + v2 as( + select v1.cc_name + ,v1.d_year, v1.d_moy + ,v1.avg_monthly_sales + ,v1.sum_sales, v1_lag.sum_sales psum, v1_lead.sum_sales nsum + from v1, v1 v1_lag, v1 v1_lead + where v1.i_category = v1_lag.i_category and + v1.i_category = v1_lead.i_category and + v1.i_brand = v1_lag.i_brand and + v1.i_brand = v1_lead.i_brand and + v1. cc_name = v1_lag. cc_name and + v1. cc_name = v1_lead. cc_name and + v1.rn = v1_lag.rn + 1 and + v1.rn = v1_lead.rn - 1) + select * + from v2 + where d_year = 2000 and + avg_monthly_sales > 0 and + case when avg_monthly_sales > 0 then abs(sum_sales - avg_monthly_sales) / avg_monthly_sales else null end > 0.1 + order by sum_sales - avg_monthly_sales, psum + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q58.sql b/benchmarks/queries/tpcds/q58.sql new file mode 100644 index 0000000000..05801ea4b3 --- /dev/null +++ b/benchmarks/queries/tpcds/q58.sql @@ -0,0 +1,66 @@ +-- SQLBench-DS query 58 derived from TPC-DS query 58 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with ss_items as + (select i_item_id item_id + ,sum(ss_ext_sales_price) ss_item_rev + from store_sales + ,item + ,date_dim + where ss_item_sk = i_item_sk + and d_date in (select d_date + from date_dim + where d_week_seq = (select d_week_seq + from date_dim + where d_date = '2000-02-12')) + and ss_sold_date_sk = d_date_sk + group by i_item_id), + cs_items as + (select i_item_id item_id + ,sum(cs_ext_sales_price) cs_item_rev + from catalog_sales + ,item + ,date_dim + where cs_item_sk = i_item_sk + and d_date in (select d_date + from date_dim + where d_week_seq = (select d_week_seq + from date_dim + where d_date = '2000-02-12')) + and cs_sold_date_sk = d_date_sk + group by i_item_id), + ws_items as + (select i_item_id item_id + ,sum(ws_ext_sales_price) ws_item_rev + from web_sales + ,item + ,date_dim + where ws_item_sk = i_item_sk + and d_date in (select d_date + from date_dim + where d_week_seq =(select d_week_seq + from date_dim + where d_date = '2000-02-12')) + and ws_sold_date_sk = d_date_sk + group by i_item_id) + select ss_items.item_id + ,ss_item_rev + ,ss_item_rev/((ss_item_rev+cs_item_rev+ws_item_rev)/3) * 100 ss_dev + ,cs_item_rev + ,cs_item_rev/((ss_item_rev+cs_item_rev+ws_item_rev)/3) * 100 cs_dev + ,ws_item_rev + ,ws_item_rev/((ss_item_rev+cs_item_rev+ws_item_rev)/3) * 100 ws_dev + ,(ss_item_rev+cs_item_rev+ws_item_rev)/3 average + from ss_items,cs_items,ws_items + where ss_items.item_id=cs_items.item_id + and ss_items.item_id=ws_items.item_id + and ss_item_rev between 0.9 * cs_item_rev and 1.1 * cs_item_rev + and ss_item_rev between 0.9 * ws_item_rev and 1.1 * ws_item_rev + and cs_item_rev between 0.9 * ss_item_rev and 1.1 * ss_item_rev + and cs_item_rev between 0.9 * ws_item_rev and 1.1 * ws_item_rev + and ws_item_rev between 0.9 * ss_item_rev and 1.1 
* ss_item_rev + and ws_item_rev between 0.9 * cs_item_rev and 1.1 * cs_item_rev + order by item_id + ,ss_item_rev + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q59.sql b/benchmarks/queries/tpcds/q59.sql new file mode 100644 index 0000000000..e10c0dbf61 --- /dev/null +++ b/benchmarks/queries/tpcds/q59.sql @@ -0,0 +1,45 @@ +-- SQLBench-DS query 59 derived from TPC-DS query 59 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with wss as + (select d_week_seq, + ss_store_sk, + sum(case when (d_day_name='Sunday') then ss_sales_price else null end) sun_sales, + sum(case when (d_day_name='Monday') then ss_sales_price else null end) mon_sales, + sum(case when (d_day_name='Tuesday') then ss_sales_price else null end) tue_sales, + sum(case when (d_day_name='Wednesday') then ss_sales_price else null end) wed_sales, + sum(case when (d_day_name='Thursday') then ss_sales_price else null end) thu_sales, + sum(case when (d_day_name='Friday') then ss_sales_price else null end) fri_sales, + sum(case when (d_day_name='Saturday') then ss_sales_price else null end) sat_sales + from store_sales,date_dim + where d_date_sk = ss_sold_date_sk + group by d_week_seq,ss_store_sk + ) + select s_store_name1,s_store_id1,d_week_seq1 + ,sun_sales1/sun_sales2,mon_sales1/mon_sales2 + ,tue_sales1/tue_sales2,wed_sales1/wed_sales2,thu_sales1/thu_sales2 + ,fri_sales1/fri_sales2,sat_sales1/sat_sales2 + from + (select s_store_name s_store_name1,wss.d_week_seq d_week_seq1 + ,s_store_id s_store_id1,sun_sales sun_sales1 + ,mon_sales mon_sales1,tue_sales tue_sales1 + ,wed_sales wed_sales1,thu_sales thu_sales1 + ,fri_sales fri_sales1,sat_sales sat_sales1 + from wss,store,date_dim d + where d.d_week_seq = wss.d_week_seq and + ss_store_sk = s_store_sk and + d_month_seq between 1206 and 1206 + 11) y, + (select s_store_name s_store_name2,wss.d_week_seq d_week_seq2 + ,s_store_id s_store_id2,sun_sales sun_sales2 + ,mon_sales mon_sales2,tue_sales tue_sales2 + ,wed_sales wed_sales2,thu_sales thu_sales2 + ,fri_sales fri_sales2,sat_sales sat_sales2 + from wss,store,date_dim d + where d.d_week_seq = wss.d_week_seq and + ss_store_sk = s_store_sk and + d_month_seq between 1206+ 12 and 1206 + 23) x + where s_store_id1=s_store_id2 + and d_week_seq1=d_week_seq2-52 + order by s_store_name1,s_store_id1,d_week_seq1 + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q6.sql b/benchmarks/queries/tpcds/q6.sql new file mode 100644 index 0000000000..098db850c8 --- /dev/null +++ b/benchmarks/queries/tpcds/q6.sql @@ -0,0 +1,27 @@ +-- SQLBench-DS query 6 derived from TPC-DS query 6 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+select a.ca_state state, count(*) cnt + from customer_address a + ,customer c + ,store_sales s + ,date_dim d + ,item i + where a.ca_address_sk = c.c_current_addr_sk + and c.c_customer_sk = s.ss_customer_sk + and s.ss_sold_date_sk = d.d_date_sk + and s.ss_item_sk = i.i_item_sk + and d.d_month_seq = + (select distinct (d_month_seq) + from date_dim + where d_year = 1998 + and d_moy = 3 ) + and i.i_current_price > 1.2 * + (select avg(j.i_current_price) + from item j + where j.i_category = i.i_category) + group by a.ca_state + having count(*) >= 10 + order by cnt, a.ca_state + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q60.sql b/benchmarks/queries/tpcds/q60.sql new file mode 100644 index 0000000000..1e088c1605 --- /dev/null +++ b/benchmarks/queries/tpcds/q60.sql @@ -0,0 +1,79 @@ +-- SQLBench-DS query 60 derived from TPC-DS query 60 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with ss as ( + select + i_item_id,sum(ss_ext_sales_price) total_sales + from + store_sales, + date_dim, + customer_address, + item + where + i_item_id in (select + i_item_id +from + item +where i_category in ('Shoes')) + and ss_item_sk = i_item_sk + and ss_sold_date_sk = d_date_sk + and d_year = 2001 + and d_moy = 10 + and ss_addr_sk = ca_address_sk + and ca_gmt_offset = -6 + group by i_item_id), + cs as ( + select + i_item_id,sum(cs_ext_sales_price) total_sales + from + catalog_sales, + date_dim, + customer_address, + item + where + i_item_id in (select + i_item_id +from + item +where i_category in ('Shoes')) + and cs_item_sk = i_item_sk + and cs_sold_date_sk = d_date_sk + and d_year = 2001 + and d_moy = 10 + and cs_bill_addr_sk = ca_address_sk + and ca_gmt_offset = -6 + group by i_item_id), + ws as ( + select + i_item_id,sum(ws_ext_sales_price) total_sales + from + web_sales, + date_dim, + customer_address, + item + where + i_item_id in (select + i_item_id +from + item +where i_category in ('Shoes')) + and ws_item_sk = i_item_sk + and ws_sold_date_sk = d_date_sk + and d_year = 2001 + and d_moy = 10 + and ws_bill_addr_sk = ca_address_sk + and ca_gmt_offset = -6 + group by i_item_id) + select + i_item_id +,sum(total_sales) total_sales + from (select * from ss + union all + select * from cs + union all + select * from ws) tmp1 + group by i_item_id + order by i_item_id + ,total_sales + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q61.sql b/benchmarks/queries/tpcds/q61.sql new file mode 100644 index 0000000000..6d6c2a5fcb --- /dev/null +++ b/benchmarks/queries/tpcds/q61.sql @@ -0,0 +1,45 @@ +-- SQLBench-DS query 61 derived from TPC-DS query 61 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+select promotions,total,cast(promotions as decimal(15,4))/cast(total as decimal(15,4))*100 +from + (select sum(ss_ext_sales_price) promotions + from store_sales + ,store + ,promotion + ,date_dim + ,customer + ,customer_address + ,item + where ss_sold_date_sk = d_date_sk + and ss_store_sk = s_store_sk + and ss_promo_sk = p_promo_sk + and ss_customer_sk= c_customer_sk + and ca_address_sk = c_current_addr_sk + and ss_item_sk = i_item_sk + and ca_gmt_offset = -6 + and i_category = 'Sports' + and (p_channel_dmail = 'Y' or p_channel_email = 'Y' or p_channel_tv = 'Y') + and s_gmt_offset = -6 + and d_year = 2002 + and d_moy = 11) promotional_sales, + (select sum(ss_ext_sales_price) total + from store_sales + ,store + ,date_dim + ,customer + ,customer_address + ,item + where ss_sold_date_sk = d_date_sk + and ss_store_sk = s_store_sk + and ss_customer_sk= c_customer_sk + and ca_address_sk = c_current_addr_sk + and ss_item_sk = i_item_sk + and ca_gmt_offset = -6 + and i_category = 'Sports' + and s_gmt_offset = -6 + and d_year = 2002 + and d_moy = 11) all_sales +order by promotions, total + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q62.sql b/benchmarks/queries/tpcds/q62.sql new file mode 100644 index 0000000000..d0138e057b --- /dev/null +++ b/benchmarks/queries/tpcds/q62.sql @@ -0,0 +1,36 @@ +-- SQLBench-DS query 62 derived from TPC-DS query 62 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select + substr(w_warehouse_name,1,20) + ,sm_type + ,web_name + ,sum(case when (ws_ship_date_sk - ws_sold_date_sk <= 30 ) then 1 else 0 end) as `30 days` + ,sum(case when (ws_ship_date_sk - ws_sold_date_sk > 30) and + (ws_ship_date_sk - ws_sold_date_sk <= 60) then 1 else 0 end ) as `31-60 days` + ,sum(case when (ws_ship_date_sk - ws_sold_date_sk > 60) and + (ws_ship_date_sk - ws_sold_date_sk <= 90) then 1 else 0 end) as `61-90 days` + ,sum(case when (ws_ship_date_sk - ws_sold_date_sk > 90) and + (ws_ship_date_sk - ws_sold_date_sk <= 120) then 1 else 0 end) as `91-120 days` + ,sum(case when (ws_ship_date_sk - ws_sold_date_sk > 120) then 1 else 0 end) as `>120 days` +from + web_sales + ,warehouse + ,ship_mode + ,web_site + ,date_dim +where + d_month_seq between 1217 and 1217 + 11 +and ws_ship_date_sk = d_date_sk +and ws_warehouse_sk = w_warehouse_sk +and ws_ship_mode_sk = sm_ship_mode_sk +and ws_web_site_sk = web_site_sk +group by + substr(w_warehouse_name,1,20) + ,sm_type + ,web_name +order by substr(w_warehouse_name,1,20) + ,sm_type + ,web_name + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q63.sql b/benchmarks/queries/tpcds/q63.sql new file mode 100644 index 0000000000..3d85a2e38b --- /dev/null +++ b/benchmarks/queries/tpcds/q63.sql @@ -0,0 +1,30 @@ +-- SQLBench-DS query 63 derived from TPC-DS query 63 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+select * +from (select i_manager_id + ,sum(ss_sales_price) sum_sales + ,avg(sum(ss_sales_price)) over (partition by i_manager_id) avg_monthly_sales + from item + ,store_sales + ,date_dim + ,store + where ss_item_sk = i_item_sk + and ss_sold_date_sk = d_date_sk + and ss_store_sk = s_store_sk + and d_month_seq in (1181,1181+1,1181+2,1181+3,1181+4,1181+5,1181+6,1181+7,1181+8,1181+9,1181+10,1181+11) + and (( i_category in ('Books','Children','Electronics') + and i_class in ('personal','portable','reference','self-help') + and i_brand in ('scholaramalgamalg #14','scholaramalgamalg #7', + 'exportiunivamalg #9','scholaramalgamalg #9')) + or( i_category in ('Women','Music','Men') + and i_class in ('accessories','classical','fragrances','pants') + and i_brand in ('amalgimporto #1','edu packscholar #1','exportiimporto #1', + 'importoamalg #1'))) +group by i_manager_id, d_moy) tmp1 +where case when avg_monthly_sales > 0 then abs (sum_sales - avg_monthly_sales) / avg_monthly_sales else null end > 0.1 +order by i_manager_id + ,avg_monthly_sales + ,sum_sales + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q64.sql b/benchmarks/queries/tpcds/q64.sql new file mode 100644 index 0000000000..0350cdc7d0 --- /dev/null +++ b/benchmarks/queries/tpcds/q64.sql @@ -0,0 +1,122 @@ +-- SQLBench-DS query 64 derived from TPC-DS query 64 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with cs_ui as + (select cs_item_sk + ,sum(cs_ext_list_price) as sale,sum(cr_refunded_cash+cr_reversed_charge+cr_store_credit) as refund + from catalog_sales + ,catalog_returns + where cs_item_sk = cr_item_sk + and cs_order_number = cr_order_number + group by cs_item_sk + having sum(cs_ext_list_price)>2*sum(cr_refunded_cash+cr_reversed_charge+cr_store_credit)), +cross_sales as + (select i_product_name product_name + ,i_item_sk item_sk + ,s_store_name store_name + ,s_zip store_zip + ,ad1.ca_street_number b_street_number + ,ad1.ca_street_name b_street_name + ,ad1.ca_city b_city + ,ad1.ca_zip b_zip + ,ad2.ca_street_number c_street_number + ,ad2.ca_street_name c_street_name + ,ad2.ca_city c_city + ,ad2.ca_zip c_zip + ,d1.d_year as syear + ,d2.d_year as fsyear + ,d3.d_year s2year + ,count(*) cnt + ,sum(ss_wholesale_cost) s1 + ,sum(ss_list_price) s2 + ,sum(ss_coupon_amt) s3 + FROM store_sales + ,store_returns + ,cs_ui + ,date_dim d1 + ,date_dim d2 + ,date_dim d3 + ,store + ,customer + ,customer_demographics cd1 + ,customer_demographics cd2 + ,promotion + ,household_demographics hd1 + ,household_demographics hd2 + ,customer_address ad1 + ,customer_address ad2 + ,income_band ib1 + ,income_band ib2 + ,item + WHERE ss_store_sk = s_store_sk AND + ss_sold_date_sk = d1.d_date_sk AND + ss_customer_sk = c_customer_sk AND + ss_cdemo_sk= cd1.cd_demo_sk AND + ss_hdemo_sk = hd1.hd_demo_sk AND + ss_addr_sk = ad1.ca_address_sk and + ss_item_sk = i_item_sk and + ss_item_sk = sr_item_sk and + ss_ticket_number = sr_ticket_number and + ss_item_sk = cs_ui.cs_item_sk and + c_current_cdemo_sk = cd2.cd_demo_sk AND + c_current_hdemo_sk = hd2.hd_demo_sk AND + c_current_addr_sk = ad2.ca_address_sk and + c_first_sales_date_sk = d2.d_date_sk and + c_first_shipto_date_sk = d3.d_date_sk and + ss_promo_sk = p_promo_sk and + hd1.hd_income_band_sk = ib1.ib_income_band_sk and + hd2.hd_income_band_sk = ib2.ib_income_band_sk and + cd1.cd_marital_status <> cd2.cd_marital_status and + i_color in ('light','cyan','burnished','green','almond','smoke') and + 
i_current_price between 22 and 22 + 10 and + i_current_price between 22 + 1 and 22 + 15 +group by i_product_name + ,i_item_sk + ,s_store_name + ,s_zip + ,ad1.ca_street_number + ,ad1.ca_street_name + ,ad1.ca_city + ,ad1.ca_zip + ,ad2.ca_street_number + ,ad2.ca_street_name + ,ad2.ca_city + ,ad2.ca_zip + ,d1.d_year + ,d2.d_year + ,d3.d_year +) +select cs1.product_name + ,cs1.store_name + ,cs1.store_zip + ,cs1.b_street_number + ,cs1.b_street_name + ,cs1.b_city + ,cs1.b_zip + ,cs1.c_street_number + ,cs1.c_street_name + ,cs1.c_city + ,cs1.c_zip + ,cs1.syear + ,cs1.cnt + ,cs1.s1 as s11 + ,cs1.s2 as s21 + ,cs1.s3 as s31 + ,cs2.s1 as s12 + ,cs2.s2 as s22 + ,cs2.s3 as s32 + ,cs2.syear + ,cs2.cnt +from cross_sales cs1,cross_sales cs2 +where cs1.item_sk=cs2.item_sk and + cs1.syear = 2001 and + cs2.syear = 2001 + 1 and + cs2.cnt <= cs1.cnt and + cs1.store_name = cs2.store_name and + cs1.store_zip = cs2.store_zip +order by cs1.product_name + ,cs1.store_name + ,cs2.cnt + ,cs1.s1 + ,cs2.s1; + diff --git a/benchmarks/queries/tpcds/q65.sql b/benchmarks/queries/tpcds/q65.sql new file mode 100644 index 0000000000..0c13a0debf --- /dev/null +++ b/benchmarks/queries/tpcds/q65.sql @@ -0,0 +1,30 @@ +-- SQLBench-DS query 65 derived from TPC-DS query 65 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select + s_store_name, + i_item_desc, + sc.revenue, + i_current_price, + i_wholesale_cost, + i_brand + from store, item, + (select ss_store_sk, avg(revenue) as ave + from + (select ss_store_sk, ss_item_sk, + sum(ss_sales_price) as revenue + from store_sales, date_dim + where ss_sold_date_sk = d_date_sk and d_month_seq between 1186 and 1186+11 + group by ss_store_sk, ss_item_sk) sa + group by ss_store_sk) sb, + (select ss_store_sk, ss_item_sk, sum(ss_sales_price) as revenue + from store_sales, date_dim + where ss_sold_date_sk = d_date_sk and d_month_seq between 1186 and 1186+11 + group by ss_store_sk, ss_item_sk) sc + where sb.ss_store_sk = sc.ss_store_sk and + sc.revenue <= 0.1 * sb.ave and + s_store_sk = sc.ss_store_sk and + i_item_sk = sc.ss_item_sk + order by s_store_name, i_item_desc + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q66.sql b/benchmarks/queries/tpcds/q66.sql new file mode 100644 index 0000000000..ba066a561d --- /dev/null +++ b/benchmarks/queries/tpcds/q66.sql @@ -0,0 +1,221 @@ +-- SQLBench-DS query 66 derived from TPC-DS query 66 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+select + w_warehouse_name + ,w_warehouse_sq_ft + ,w_city + ,w_county + ,w_state + ,w_country + ,ship_carriers + ,year + ,sum(jan_sales) as jan_sales + ,sum(feb_sales) as feb_sales + ,sum(mar_sales) as mar_sales + ,sum(apr_sales) as apr_sales + ,sum(may_sales) as may_sales + ,sum(jun_sales) as jun_sales + ,sum(jul_sales) as jul_sales + ,sum(aug_sales) as aug_sales + ,sum(sep_sales) as sep_sales + ,sum(oct_sales) as oct_sales + ,sum(nov_sales) as nov_sales + ,sum(dec_sales) as dec_sales + ,sum(jan_sales/w_warehouse_sq_ft) as jan_sales_per_sq_foot + ,sum(feb_sales/w_warehouse_sq_ft) as feb_sales_per_sq_foot + ,sum(mar_sales/w_warehouse_sq_ft) as mar_sales_per_sq_foot + ,sum(apr_sales/w_warehouse_sq_ft) as apr_sales_per_sq_foot + ,sum(may_sales/w_warehouse_sq_ft) as may_sales_per_sq_foot + ,sum(jun_sales/w_warehouse_sq_ft) as jun_sales_per_sq_foot + ,sum(jul_sales/w_warehouse_sq_ft) as jul_sales_per_sq_foot + ,sum(aug_sales/w_warehouse_sq_ft) as aug_sales_per_sq_foot + ,sum(sep_sales/w_warehouse_sq_ft) as sep_sales_per_sq_foot + ,sum(oct_sales/w_warehouse_sq_ft) as oct_sales_per_sq_foot + ,sum(nov_sales/w_warehouse_sq_ft) as nov_sales_per_sq_foot + ,sum(dec_sales/w_warehouse_sq_ft) as dec_sales_per_sq_foot + ,sum(jan_net) as jan_net + ,sum(feb_net) as feb_net + ,sum(mar_net) as mar_net + ,sum(apr_net) as apr_net + ,sum(may_net) as may_net + ,sum(jun_net) as jun_net + ,sum(jul_net) as jul_net + ,sum(aug_net) as aug_net + ,sum(sep_net) as sep_net + ,sum(oct_net) as oct_net + ,sum(nov_net) as nov_net + ,sum(dec_net) as dec_net + from ( + select + w_warehouse_name + ,w_warehouse_sq_ft + ,w_city + ,w_county + ,w_state + ,w_country + ,'FEDEX' || ',' || 'GERMA' as ship_carriers + ,d_year as year + ,sum(case when d_moy = 1 + then ws_ext_list_price* ws_quantity else 0 end) as jan_sales + ,sum(case when d_moy = 2 + then ws_ext_list_price* ws_quantity else 0 end) as feb_sales + ,sum(case when d_moy = 3 + then ws_ext_list_price* ws_quantity else 0 end) as mar_sales + ,sum(case when d_moy = 4 + then ws_ext_list_price* ws_quantity else 0 end) as apr_sales + ,sum(case when d_moy = 5 + then ws_ext_list_price* ws_quantity else 0 end) as may_sales + ,sum(case when d_moy = 6 + then ws_ext_list_price* ws_quantity else 0 end) as jun_sales + ,sum(case when d_moy = 7 + then ws_ext_list_price* ws_quantity else 0 end) as jul_sales + ,sum(case when d_moy = 8 + then ws_ext_list_price* ws_quantity else 0 end) as aug_sales + ,sum(case when d_moy = 9 + then ws_ext_list_price* ws_quantity else 0 end) as sep_sales + ,sum(case when d_moy = 10 + then ws_ext_list_price* ws_quantity else 0 end) as oct_sales + ,sum(case when d_moy = 11 + then ws_ext_list_price* ws_quantity else 0 end) as nov_sales + ,sum(case when d_moy = 12 + then ws_ext_list_price* ws_quantity else 0 end) as dec_sales + ,sum(case when d_moy = 1 + then ws_net_profit * ws_quantity else 0 end) as jan_net + ,sum(case when d_moy = 2 + then ws_net_profit * ws_quantity else 0 end) as feb_net + ,sum(case when d_moy = 3 + then ws_net_profit * ws_quantity else 0 end) as mar_net + ,sum(case when d_moy = 4 + then ws_net_profit * ws_quantity else 0 end) as apr_net + ,sum(case when d_moy = 5 + then ws_net_profit * ws_quantity else 0 end) as may_net + ,sum(case when d_moy = 6 + then ws_net_profit * ws_quantity else 0 end) as jun_net + ,sum(case when d_moy = 7 + then ws_net_profit * ws_quantity else 0 end) as jul_net + ,sum(case when d_moy = 8 + then ws_net_profit * ws_quantity else 0 end) as aug_net + ,sum(case when d_moy = 9 + then ws_net_profit * ws_quantity else 0 end) as 
sep_net + ,sum(case when d_moy = 10 + then ws_net_profit * ws_quantity else 0 end) as oct_net + ,sum(case when d_moy = 11 + then ws_net_profit * ws_quantity else 0 end) as nov_net + ,sum(case when d_moy = 12 + then ws_net_profit * ws_quantity else 0 end) as dec_net + from + web_sales + ,warehouse + ,date_dim + ,time_dim + ,ship_mode + where + ws_warehouse_sk = w_warehouse_sk + and ws_sold_date_sk = d_date_sk + and ws_sold_time_sk = t_time_sk + and ws_ship_mode_sk = sm_ship_mode_sk + and d_year = 2001 + and t_time between 19072 and 19072+28800 + and sm_carrier in ('FEDEX','GERMA') + group by + w_warehouse_name + ,w_warehouse_sq_ft + ,w_city + ,w_county + ,w_state + ,w_country + ,d_year + union all + select + w_warehouse_name + ,w_warehouse_sq_ft + ,w_city + ,w_county + ,w_state + ,w_country + ,'FEDEX' || ',' || 'GERMA' as ship_carriers + ,d_year as year + ,sum(case when d_moy = 1 + then cs_sales_price* cs_quantity else 0 end) as jan_sales + ,sum(case when d_moy = 2 + then cs_sales_price* cs_quantity else 0 end) as feb_sales + ,sum(case when d_moy = 3 + then cs_sales_price* cs_quantity else 0 end) as mar_sales + ,sum(case when d_moy = 4 + then cs_sales_price* cs_quantity else 0 end) as apr_sales + ,sum(case when d_moy = 5 + then cs_sales_price* cs_quantity else 0 end) as may_sales + ,sum(case when d_moy = 6 + then cs_sales_price* cs_quantity else 0 end) as jun_sales + ,sum(case when d_moy = 7 + then cs_sales_price* cs_quantity else 0 end) as jul_sales + ,sum(case when d_moy = 8 + then cs_sales_price* cs_quantity else 0 end) as aug_sales + ,sum(case when d_moy = 9 + then cs_sales_price* cs_quantity else 0 end) as sep_sales + ,sum(case when d_moy = 10 + then cs_sales_price* cs_quantity else 0 end) as oct_sales + ,sum(case when d_moy = 11 + then cs_sales_price* cs_quantity else 0 end) as nov_sales + ,sum(case when d_moy = 12 + then cs_sales_price* cs_quantity else 0 end) as dec_sales + ,sum(case when d_moy = 1 + then cs_net_paid * cs_quantity else 0 end) as jan_net + ,sum(case when d_moy = 2 + then cs_net_paid * cs_quantity else 0 end) as feb_net + ,sum(case when d_moy = 3 + then cs_net_paid * cs_quantity else 0 end) as mar_net + ,sum(case when d_moy = 4 + then cs_net_paid * cs_quantity else 0 end) as apr_net + ,sum(case when d_moy = 5 + then cs_net_paid * cs_quantity else 0 end) as may_net + ,sum(case when d_moy = 6 + then cs_net_paid * cs_quantity else 0 end) as jun_net + ,sum(case when d_moy = 7 + then cs_net_paid * cs_quantity else 0 end) as jul_net + ,sum(case when d_moy = 8 + then cs_net_paid * cs_quantity else 0 end) as aug_net + ,sum(case when d_moy = 9 + then cs_net_paid * cs_quantity else 0 end) as sep_net + ,sum(case when d_moy = 10 + then cs_net_paid * cs_quantity else 0 end) as oct_net + ,sum(case when d_moy = 11 + then cs_net_paid * cs_quantity else 0 end) as nov_net + ,sum(case when d_moy = 12 + then cs_net_paid * cs_quantity else 0 end) as dec_net + from + catalog_sales + ,warehouse + ,date_dim + ,time_dim + ,ship_mode + where + cs_warehouse_sk = w_warehouse_sk + and cs_sold_date_sk = d_date_sk + and cs_sold_time_sk = t_time_sk + and cs_ship_mode_sk = sm_ship_mode_sk + and d_year = 2001 + and t_time between 19072 AND 19072+28800 + and sm_carrier in ('FEDEX','GERMA') + group by + w_warehouse_name + ,w_warehouse_sq_ft + ,w_city + ,w_county + ,w_state + ,w_country + ,d_year + ) x + group by + w_warehouse_name + ,w_warehouse_sq_ft + ,w_city + ,w_county + ,w_state + ,w_country + ,ship_carriers + ,year + order by w_warehouse_name + LIMIT 100; + diff --git 
a/benchmarks/queries/tpcds/q67.sql b/benchmarks/queries/tpcds/q67.sql new file mode 100644 index 0000000000..7d684e6745 --- /dev/null +++ b/benchmarks/queries/tpcds/q67.sql @@ -0,0 +1,45 @@ +-- SQLBench-DS query 67 derived from TPC-DS query 67 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select * +from (select i_category + ,i_class + ,i_brand + ,i_product_name + ,d_year + ,d_qoy + ,d_moy + ,s_store_id + ,sumsales + ,rank() over (partition by i_category order by sumsales desc) rk + from (select i_category + ,i_class + ,i_brand + ,i_product_name + ,d_year + ,d_qoy + ,d_moy + ,s_store_id + ,sum(coalesce(ss_sales_price*ss_quantity,0)) sumsales + from store_sales + ,date_dim + ,store + ,item + where ss_sold_date_sk=d_date_sk + and ss_item_sk=i_item_sk + and ss_store_sk = s_store_sk + and d_month_seq between 1194 and 1194+11 + group by rollup(i_category, i_class, i_brand, i_product_name, d_year, d_qoy, d_moy,s_store_id))dw1) dw2 +where rk <= 100 +order by i_category + ,i_class + ,i_brand + ,i_product_name + ,d_year + ,d_qoy + ,d_moy + ,s_store_id + ,sumsales + ,rk + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q68.sql b/benchmarks/queries/tpcds/q68.sql new file mode 100644 index 0000000000..242e0dbd93 --- /dev/null +++ b/benchmarks/queries/tpcds/q68.sql @@ -0,0 +1,43 @@ +-- SQLBench-DS query 68 derived from TPC-DS query 68 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select c_last_name + ,c_first_name + ,ca_city + ,bought_city + ,ss_ticket_number + ,extended_price + ,extended_tax + ,list_price + from (select ss_ticket_number + ,ss_customer_sk + ,ca_city bought_city + ,sum(ss_ext_sales_price) extended_price + ,sum(ss_ext_list_price) list_price + ,sum(ss_ext_tax) extended_tax + from store_sales + ,date_dim + ,store + ,household_demographics + ,customer_address + where store_sales.ss_sold_date_sk = date_dim.d_date_sk + and store_sales.ss_store_sk = store.s_store_sk + and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + and store_sales.ss_addr_sk = customer_address.ca_address_sk + and date_dim.d_dom between 1 and 2 + and (household_demographics.hd_dep_count = 8 or + household_demographics.hd_vehicle_count= 3) + and date_dim.d_year in (2000,2000+1,2000+2) + and store.s_city in ('Midway','Fairview') + group by ss_ticket_number + ,ss_customer_sk + ,ss_addr_sk,ca_city) dn + ,customer + ,customer_address current_addr + where ss_customer_sk = c_customer_sk + and customer.c_current_addr_sk = current_addr.ca_address_sk + and current_addr.ca_city <> bought_city + order by c_last_name + ,ss_ticket_number + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q69.sql b/benchmarks/queries/tpcds/q69.sql new file mode 100644 index 0000000000..4d4030cf59 --- /dev/null +++ b/benchmarks/queries/tpcds/q69.sql @@ -0,0 +1,48 @@ +-- SQLBench-DS query 69 derived from TPC-DS query 69 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+select + cd_gender, + cd_marital_status, + cd_education_status, + count(*) cnt1, + cd_purchase_estimate, + count(*) cnt2, + cd_credit_rating, + count(*) cnt3 + from + customer c,customer_address ca,customer_demographics + where + c.c_current_addr_sk = ca.ca_address_sk and + ca_state in ('IN','VA','MS') and + cd_demo_sk = c.c_current_cdemo_sk and + exists (select * + from store_sales,date_dim + where c.c_customer_sk = ss_customer_sk and + ss_sold_date_sk = d_date_sk and + d_year = 2002 and + d_moy between 2 and 2+2) and + (not exists (select * + from web_sales,date_dim + where c.c_customer_sk = ws_bill_customer_sk and + ws_sold_date_sk = d_date_sk and + d_year = 2002 and + d_moy between 2 and 2+2) and + not exists (select * + from catalog_sales,date_dim + where c.c_customer_sk = cs_ship_customer_sk and + cs_sold_date_sk = d_date_sk and + d_year = 2002 and + d_moy between 2 and 2+2)) + group by cd_gender, + cd_marital_status, + cd_education_status, + cd_purchase_estimate, + cd_credit_rating + order by cd_gender, + cd_marital_status, + cd_education_status, + cd_purchase_estimate, + cd_credit_rating + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q7.sql b/benchmarks/queries/tpcds/q7.sql new file mode 100644 index 0000000000..bb58851616 --- /dev/null +++ b/benchmarks/queries/tpcds/q7.sql @@ -0,0 +1,22 @@ +-- SQLBench-DS query 7 derived from TPC-DS query 7 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select i_item_id, + avg(ss_quantity) agg1, + avg(ss_list_price) agg2, + avg(ss_coupon_amt) agg3, + avg(ss_sales_price) agg4 + from store_sales, customer_demographics, date_dim, item, promotion + where ss_sold_date_sk = d_date_sk and + ss_item_sk = i_item_sk and + ss_cdemo_sk = cd_demo_sk and + ss_promo_sk = p_promo_sk and + cd_gender = 'M' and + cd_marital_status = 'M' and + cd_education_status = '4 yr Degree' and + (p_channel_email = 'N' or p_channel_event = 'N') and + d_year = 2001 + group by i_item_id + order by i_item_id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q70.sql b/benchmarks/queries/tpcds/q70.sql new file mode 100644 index 0000000000..a8b5f1c99f --- /dev/null +++ b/benchmarks/queries/tpcds/q70.sql @@ -0,0 +1,39 @@ +-- SQLBench-DS query 70 derived from TPC-DS query 70 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+select + sum(ss_net_profit) as total_sum + ,s_state + ,s_county + ,grouping(s_state)+grouping(s_county) as lochierarchy + ,rank() over ( + partition by grouping(s_state)+grouping(s_county), + case when grouping(s_county) = 0 then s_state end + order by sum(ss_net_profit) desc) as rank_within_parent + from + store_sales + ,date_dim d1 + ,store + where + d1.d_month_seq between 1180 and 1180+11 + and d1.d_date_sk = ss_sold_date_sk + and s_store_sk = ss_store_sk + and s_state in + ( select s_state + from (select s_state as s_state, + rank() over ( partition by s_state order by sum(ss_net_profit) desc) as ranking + from store_sales, store, date_dim + where d_month_seq between 1180 and 1180+11 + and d_date_sk = ss_sold_date_sk + and s_store_sk = ss_store_sk + group by s_state + ) tmp1 + where ranking <= 5 + ) + group by rollup(s_state,s_county) + order by + lochierarchy desc + ,case when lochierarchy = 0 then s_state end + ,rank_within_parent + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q71.sql b/benchmarks/queries/tpcds/q71.sql new file mode 100644 index 0000000000..90d00806b0 --- /dev/null +++ b/benchmarks/queries/tpcds/q71.sql @@ -0,0 +1,41 @@ +-- SQLBench-DS query 71 derived from TPC-DS query 71 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select i_brand_id brand_id, i_brand brand,t_hour,t_minute, + sum(ext_price) ext_price + from item, (select ws_ext_sales_price as ext_price, + ws_sold_date_sk as sold_date_sk, + ws_item_sk as sold_item_sk, + ws_sold_time_sk as time_sk + from web_sales,date_dim + where d_date_sk = ws_sold_date_sk + and d_moy=11 + and d_year=2001 + union all + select cs_ext_sales_price as ext_price, + cs_sold_date_sk as sold_date_sk, + cs_item_sk as sold_item_sk, + cs_sold_time_sk as time_sk + from catalog_sales,date_dim + where d_date_sk = cs_sold_date_sk + and d_moy=11 + and d_year=2001 + union all + select ss_ext_sales_price as ext_price, + ss_sold_date_sk as sold_date_sk, + ss_item_sk as sold_item_sk, + ss_sold_time_sk as time_sk + from store_sales,date_dim + where d_date_sk = ss_sold_date_sk + and d_moy=11 + and d_year=2001 + ) tmp,time_dim + where + sold_item_sk = i_item_sk + and i_manager_id=1 + and time_sk = t_time_sk + and (t_meal_time = 'breakfast' or t_meal_time = 'dinner') + group by i_brand, i_brand_id,t_hour,t_minute + order by ext_price desc, i_brand_id + ; + diff --git a/benchmarks/queries/tpcds/q72.sql b/benchmarks/queries/tpcds/q72.sql new file mode 100644 index 0000000000..0e31057a03 --- /dev/null +++ b/benchmarks/queries/tpcds/q72.sql @@ -0,0 +1,30 @@ +-- SQLBench-DS query 72 derived from TPC-DS query 72 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+select i_item_desc
+ ,w_warehouse_name
+ ,d1.d_week_seq
+ ,sum(case when p_promo_sk is null then 1 else 0 end) no_promo
+ ,sum(case when p_promo_sk is not null then 1 else 0 end) promo
+ ,count(*) total_cnt
+from catalog_sales
+join inventory on (cs_item_sk = inv_item_sk)
+join warehouse on (w_warehouse_sk=inv_warehouse_sk)
+join item on (i_item_sk = cs_item_sk)
+join customer_demographics on (cs_bill_cdemo_sk = cd_demo_sk)
+join household_demographics on (cs_bill_hdemo_sk = hd_demo_sk)
+join date_dim d1 on (cs_sold_date_sk = d1.d_date_sk)
+join date_dim d2 on (inv_date_sk = d2.d_date_sk)
+join date_dim d3 on (cs_ship_date_sk = d3.d_date_sk)
+left outer join promotion on (cs_promo_sk=p_promo_sk)
+left outer join catalog_returns on (cr_item_sk = cs_item_sk and cr_order_number = cs_order_number)
+where d1.d_week_seq = d2.d_week_seq
+ and inv_quantity_on_hand < cs_quantity
+ and d3.d_date > d1.d_date + 5
+ and hd_buy_potential = '501-1000'
+ and d1.d_year = 1999
+ and cd_marital_status = 'S'
+group by i_item_desc,w_warehouse_name,d1.d_week_seq
+order by total_cnt desc, i_item_desc, w_warehouse_name, d_week_seq
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q72_optimized.sql b/benchmarks/queries/tpcds/q72_optimized.sql new file mode 100644 index 0000000000..a98a70e8f0 --- /dev/null +++ b/benchmarks/queries/tpcds/q72_optimized.sql @@ -0,0 +1,32 @@
+-- SQLBench-DS query 72 derived from TPC-DS query 72 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+
+-- This is a modified version of q72 that changes the join order to be sensible (the original q72
+-- intentionally has a terrible join order for testing database vendors' join-reordering rules)
+
+select i_item_desc
+ ,w_warehouse_name
+ ,d1.d_week_seq
+ ,sum(case when p_promo_sk is null then 1 else 0 end) no_promo
+ ,sum(case when p_promo_sk is not null then 1 else 0 end) promo
+ ,count(*) total_cnt
+from catalog_sales
+ join date_dim d1 on (cs_sold_date_sk = d1.d_date_sk)
+ join customer_demographics on (cs_bill_cdemo_sk = cd_demo_sk)
+ join household_demographics on (cs_bill_hdemo_sk = hd_demo_sk)
+ join item on (i_item_sk = cs_item_sk)
+ join inventory on (cs_item_sk = inv_item_sk)
+ join warehouse on (w_warehouse_sk=inv_warehouse_sk)
+ join date_dim d2 on (inv_date_sk = d2.d_date_sk)
+ join date_dim d3 on (cs_ship_date_sk = d3.d_date_sk)
+ left outer join promotion on (cs_promo_sk=p_promo_sk)
+ left outer join catalog_returns on (cr_item_sk = cs_item_sk and cr_order_number = cs_order_number)
+where d1.d_week_seq = d2.d_week_seq
+ and inv_quantity_on_hand < cs_quantity
+ and d3.d_date > d1.d_date + 5
+ and hd_buy_potential = '501-1000'
+ and d1.d_year = 1999
+ and cd_marital_status = 'S'
+group by i_item_desc,w_warehouse_name,d1.d_week_seq
+order by total_cnt desc, i_item_desc, w_warehouse_name, d_week_seq
+LIMIT 100;
diff --git a/benchmarks/queries/tpcds/q73.sql b/benchmarks/queries/tpcds/q73.sql new file mode 100644 index 0000000000..e7879d09ff --- /dev/null +++ b/benchmarks/queries/tpcds/q73.sql @@ -0,0 +1,29 @@
+-- SQLBench-DS query 73 derived from TPC-DS query 73 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select c_last_name + ,c_first_name + ,c_salutation + ,c_preferred_cust_flag + ,ss_ticket_number + ,cnt from + (select ss_ticket_number + ,ss_customer_sk + ,count(*) cnt + from store_sales,date_dim,store,household_demographics + where store_sales.ss_sold_date_sk = date_dim.d_date_sk + and store_sales.ss_store_sk = store.s_store_sk + and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + and date_dim.d_dom between 1 and 2 + and (household_demographics.hd_buy_potential = '1001-5000' or + household_demographics.hd_buy_potential = '5001-10000') + and household_demographics.hd_vehicle_count > 0 + and case when household_demographics.hd_vehicle_count > 0 then + household_demographics.hd_dep_count/ household_demographics.hd_vehicle_count else null end > 1 + and date_dim.d_year in (1999,1999+1,1999+2) + and store.s_county in ('Williamson County','Williamson County','Williamson County','Williamson County') + group by ss_ticket_number,ss_customer_sk) dj,customer + where ss_customer_sk = c_customer_sk + and cnt between 1 and 5 + order by cnt desc, c_last_name asc; + diff --git a/benchmarks/queries/tpcds/q74.sql b/benchmarks/queries/tpcds/q74.sql new file mode 100644 index 0000000000..b9829d9d5e --- /dev/null +++ b/benchmarks/queries/tpcds/q74.sql @@ -0,0 +1,62 @@ +-- SQLBench-DS query 74 derived from TPC-DS query 74 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with year_total as ( + select c_customer_id customer_id + ,c_first_name customer_first_name + ,c_last_name customer_last_name + ,d_year as year + ,stddev_samp(ss_net_paid) year_total + ,'s' sale_type + from customer + ,store_sales + ,date_dim + where c_customer_sk = ss_customer_sk + and ss_sold_date_sk = d_date_sk + and d_year in (2001,2001+1) + group by c_customer_id + ,c_first_name + ,c_last_name + ,d_year + union all + select c_customer_id customer_id + ,c_first_name customer_first_name + ,c_last_name customer_last_name + ,d_year as year + ,stddev_samp(ws_net_paid) year_total + ,'w' sale_type + from customer + ,web_sales + ,date_dim + where c_customer_sk = ws_bill_customer_sk + and ws_sold_date_sk = d_date_sk + and d_year in (2001,2001+1) + group by c_customer_id + ,c_first_name + ,c_last_name + ,d_year + ) + select + t_s_secyear.customer_id, t_s_secyear.customer_first_name, t_s_secyear.customer_last_name + from year_total t_s_firstyear + ,year_total t_s_secyear + ,year_total t_w_firstyear + ,year_total t_w_secyear + where t_s_secyear.customer_id = t_s_firstyear.customer_id + and t_s_firstyear.customer_id = t_w_secyear.customer_id + and t_s_firstyear.customer_id = t_w_firstyear.customer_id + and t_s_firstyear.sale_type = 's' + and t_w_firstyear.sale_type = 'w' + and t_s_secyear.sale_type = 's' + and t_w_secyear.sale_type = 'w' + and t_s_firstyear.year = 2001 + and t_s_secyear.year = 2001+1 + and t_w_firstyear.year = 2001 + and t_w_secyear.year = 2001+1 + and t_s_firstyear.year_total > 0 + and t_w_firstyear.year_total > 0 + and case when t_w_firstyear.year_total > 0 then t_w_secyear.year_total / t_w_firstyear.year_total else null end + > case when t_s_firstyear.year_total > 0 then t_s_secyear.year_total / t_s_firstyear.year_total else null end + order by 3,2,1 + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q75.sql b/benchmarks/queries/tpcds/q75.sql new file mode 100644 index 0000000000..cec9da56a5 --- /dev/null +++ b/benchmarks/queries/tpcds/q75.sql @@ -0,0 +1,71 @@ +-- SQLBench-DS query 75 derived 
from TPC-DS query 75 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +WITH all_sales AS ( + SELECT d_year + ,i_brand_id + ,i_class_id + ,i_category_id + ,i_manufact_id + ,SUM(sales_cnt) AS sales_cnt + ,SUM(sales_amt) AS sales_amt + FROM (SELECT d_year + ,i_brand_id + ,i_class_id + ,i_category_id + ,i_manufact_id + ,cs_quantity - COALESCE(cr_return_quantity,0) AS sales_cnt + ,cs_ext_sales_price - COALESCE(cr_return_amount,0.0) AS sales_amt + FROM catalog_sales JOIN item ON i_item_sk=cs_item_sk + JOIN date_dim ON d_date_sk=cs_sold_date_sk + LEFT JOIN catalog_returns ON (cs_order_number=cr_order_number + AND cs_item_sk=cr_item_sk) + WHERE i_category='Shoes' + UNION + SELECT d_year + ,i_brand_id + ,i_class_id + ,i_category_id + ,i_manufact_id + ,ss_quantity - COALESCE(sr_return_quantity,0) AS sales_cnt + ,ss_ext_sales_price - COALESCE(sr_return_amt,0.0) AS sales_amt + FROM store_sales JOIN item ON i_item_sk=ss_item_sk + JOIN date_dim ON d_date_sk=ss_sold_date_sk + LEFT JOIN store_returns ON (ss_ticket_number=sr_ticket_number + AND ss_item_sk=sr_item_sk) + WHERE i_category='Shoes' + UNION + SELECT d_year + ,i_brand_id + ,i_class_id + ,i_category_id + ,i_manufact_id + ,ws_quantity - COALESCE(wr_return_quantity,0) AS sales_cnt + ,ws_ext_sales_price - COALESCE(wr_return_amt,0.0) AS sales_amt + FROM web_sales JOIN item ON i_item_sk=ws_item_sk + JOIN date_dim ON d_date_sk=ws_sold_date_sk + LEFT JOIN web_returns ON (ws_order_number=wr_order_number + AND ws_item_sk=wr_item_sk) + WHERE i_category='Shoes') sales_detail + GROUP BY d_year, i_brand_id, i_class_id, i_category_id, i_manufact_id) + SELECT prev_yr.d_year AS prev_year + ,curr_yr.d_year AS year + ,curr_yr.i_brand_id + ,curr_yr.i_class_id + ,curr_yr.i_category_id + ,curr_yr.i_manufact_id + ,prev_yr.sales_cnt AS prev_yr_cnt + ,curr_yr.sales_cnt AS curr_yr_cnt + ,curr_yr.sales_cnt-prev_yr.sales_cnt AS sales_cnt_diff + ,curr_yr.sales_amt-prev_yr.sales_amt AS sales_amt_diff + FROM all_sales curr_yr, all_sales prev_yr + WHERE curr_yr.i_brand_id=prev_yr.i_brand_id + AND curr_yr.i_class_id=prev_yr.i_class_id + AND curr_yr.i_category_id=prev_yr.i_category_id + AND curr_yr.i_manufact_id=prev_yr.i_manufact_id + AND curr_yr.d_year=2000 + AND prev_yr.d_year=2000-1 + AND CAST(curr_yr.sales_cnt AS DECIMAL(17,2))/CAST(prev_yr.sales_cnt AS DECIMAL(17,2))<0.9 + ORDER BY sales_cnt_diff,sales_amt_diff + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q76.sql b/benchmarks/queries/tpcds/q76.sql new file mode 100644 index 0000000000..931a1334f6 --- /dev/null +++ b/benchmarks/queries/tpcds/q76.sql @@ -0,0 +1,25 @@ +-- SQLBench-DS query 76 derived from TPC-DS query 76 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+select channel, col_name, d_year, d_qoy, i_category, COUNT(*) sales_cnt, SUM(ext_sales_price) sales_amt FROM ( + SELECT 'store' as channel, 'ss_customer_sk' col_name, d_year, d_qoy, i_category, ss_ext_sales_price ext_sales_price + FROM store_sales, item, date_dim + WHERE ss_customer_sk IS NULL + AND ss_sold_date_sk=d_date_sk + AND ss_item_sk=i_item_sk + UNION ALL + SELECT 'web' as channel, 'ws_ship_hdemo_sk' col_name, d_year, d_qoy, i_category, ws_ext_sales_price ext_sales_price + FROM web_sales, item, date_dim + WHERE ws_ship_hdemo_sk IS NULL + AND ws_sold_date_sk=d_date_sk + AND ws_item_sk=i_item_sk + UNION ALL + SELECT 'catalog' as channel, 'cs_bill_customer_sk' col_name, d_year, d_qoy, i_category, cs_ext_sales_price ext_sales_price + FROM catalog_sales, item, date_dim + WHERE cs_bill_customer_sk IS NULL + AND cs_sold_date_sk=d_date_sk + AND cs_item_sk=i_item_sk) foo +GROUP BY channel, col_name, d_year, d_qoy, i_category +ORDER BY channel, col_name, d_year, d_qoy, i_category + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q77.sql b/benchmarks/queries/tpcds/q77.sql new file mode 100644 index 0000000000..d04bc14bc9 --- /dev/null +++ b/benchmarks/queries/tpcds/q77.sql @@ -0,0 +1,109 @@ +-- SQLBench-DS query 77 derived from TPC-DS query 77 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with ss as + (select s_store_sk, + sum(ss_ext_sales_price) as sales, + sum(ss_net_profit) as profit + from store_sales, + date_dim, + store + where ss_sold_date_sk = d_date_sk + and d_date between cast('2001-08-11' as date) + and (cast('2001-08-11' as date) + INTERVAL '30 DAYS') + and ss_store_sk = s_store_sk + group by s_store_sk) + , + sr as + (select s_store_sk, + sum(sr_return_amt) as returns, + sum(sr_net_loss) as profit_loss + from store_returns, + date_dim, + store + where sr_returned_date_sk = d_date_sk + and d_date between cast('2001-08-11' as date) + and (cast('2001-08-11' as date) + INTERVAL '30 DAYS') + and sr_store_sk = s_store_sk + group by s_store_sk), + cs as + (select cs_call_center_sk, + sum(cs_ext_sales_price) as sales, + sum(cs_net_profit) as profit + from catalog_sales, + date_dim + where cs_sold_date_sk = d_date_sk + and d_date between cast('2001-08-11' as date) + and (cast('2001-08-11' as date) + INTERVAL '30 DAYS') + group by cs_call_center_sk + ), + cr as + (select cr_call_center_sk, + sum(cr_return_amount) as returns, + sum(cr_net_loss) as profit_loss + from catalog_returns, + date_dim + where cr_returned_date_sk = d_date_sk + and d_date between cast('2001-08-11' as date) + and (cast('2001-08-11' as date) + INTERVAL '30 DAYS') + group by cr_call_center_sk + ), + ws as + ( select wp_web_page_sk, + sum(ws_ext_sales_price) as sales, + sum(ws_net_profit) as profit + from web_sales, + date_dim, + web_page + where ws_sold_date_sk = d_date_sk + and d_date between cast('2001-08-11' as date) + and (cast('2001-08-11' as date) + INTERVAL '30 DAYS') + and ws_web_page_sk = wp_web_page_sk + group by wp_web_page_sk), + wr as + (select wp_web_page_sk, + sum(wr_return_amt) as returns, + sum(wr_net_loss) as profit_loss + from web_returns, + date_dim, + web_page + where wr_returned_date_sk = d_date_sk + and d_date between cast('2001-08-11' as date) + and (cast('2001-08-11' as date) + INTERVAL '30 DAYS') + and wr_web_page_sk = wp_web_page_sk + group by wp_web_page_sk) + select channel + , id + , sum(sales) as sales + , sum(returns) as returns + , sum(profit) as profit + 
from + (select 'store channel' as channel + , ss.s_store_sk as id + , sales + , coalesce(returns, 0) as returns + , (profit - coalesce(profit_loss,0)) as profit + from ss left join sr + on ss.s_store_sk = sr.s_store_sk + union all + select 'catalog channel' as channel + , cs_call_center_sk as id + , sales + , returns + , (profit - profit_loss) as profit + from cs + , cr + union all + select 'web channel' as channel + , ws.wp_web_page_sk as id + , sales + , coalesce(returns, 0) returns + , (profit - coalesce(profit_loss,0)) as profit + from ws left join wr + on ws.wp_web_page_sk = wr.wp_web_page_sk + ) x + group by rollup (channel, id) + order by channel + ,id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q78.sql b/benchmarks/queries/tpcds/q78.sql new file mode 100644 index 0000000000..927ef63561 --- /dev/null +++ b/benchmarks/queries/tpcds/q78.sql @@ -0,0 +1,59 @@ +-- SQLBench-DS query 78 derived from TPC-DS query 78 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with ws as + (select d_year AS ws_sold_year, ws_item_sk, + ws_bill_customer_sk ws_customer_sk, + sum(ws_quantity) ws_qty, + sum(ws_wholesale_cost) ws_wc, + sum(ws_sales_price) ws_sp + from web_sales + left join web_returns on wr_order_number=ws_order_number and ws_item_sk=wr_item_sk + join date_dim on ws_sold_date_sk = d_date_sk + where wr_order_number is null + group by d_year, ws_item_sk, ws_bill_customer_sk + ), +cs as + (select d_year AS cs_sold_year, cs_item_sk, + cs_bill_customer_sk cs_customer_sk, + sum(cs_quantity) cs_qty, + sum(cs_wholesale_cost) cs_wc, + sum(cs_sales_price) cs_sp + from catalog_sales + left join catalog_returns on cr_order_number=cs_order_number and cs_item_sk=cr_item_sk + join date_dim on cs_sold_date_sk = d_date_sk + where cr_order_number is null + group by d_year, cs_item_sk, cs_bill_customer_sk + ), +ss as + (select d_year AS ss_sold_year, ss_item_sk, + ss_customer_sk, + sum(ss_quantity) ss_qty, + sum(ss_wholesale_cost) ss_wc, + sum(ss_sales_price) ss_sp + from store_sales + left join store_returns on sr_ticket_number=ss_ticket_number and ss_item_sk=sr_item_sk + join date_dim on ss_sold_date_sk = d_date_sk + where sr_ticket_number is null + group by d_year, ss_item_sk, ss_customer_sk + ) + select +ss_customer_sk, +round(ss_qty/(coalesce(ws_qty,0)+coalesce(cs_qty,0)),2) ratio, +ss_qty store_qty, ss_wc store_wholesale_cost, ss_sp store_sales_price, +coalesce(ws_qty,0)+coalesce(cs_qty,0) other_chan_qty, +coalesce(ws_wc,0)+coalesce(cs_wc,0) other_chan_wholesale_cost, +coalesce(ws_sp,0)+coalesce(cs_sp,0) other_chan_sales_price +from ss +left join ws on (ws_sold_year=ss_sold_year and ws_item_sk=ss_item_sk and ws_customer_sk=ss_customer_sk) +left join cs on (cs_sold_year=ss_sold_year and cs_item_sk=ss_item_sk and cs_customer_sk=ss_customer_sk) +where (coalesce(ws_qty,0)>0 or coalesce(cs_qty, 0)>0) and ss_sold_year=2001 +order by + ss_customer_sk, + ss_qty desc, ss_wc desc, ss_sp desc, + other_chan_qty, + other_chan_wholesale_cost, + other_chan_sales_price, + ratio + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q79.sql b/benchmarks/queries/tpcds/q79.sql new file mode 100644 index 0000000000..568444b152 --- /dev/null +++ b/benchmarks/queries/tpcds/q79.sql @@ -0,0 +1,24 @@ +-- SQLBench-DS query 79 derived from TPC-DS query 79 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. 
+-- This query was generated at scale factor 1. +select + c_last_name,c_first_name,substr(s_city,1,30),ss_ticket_number,amt,profit + from + (select ss_ticket_number + ,ss_customer_sk + ,store.s_city + ,sum(ss_coupon_amt) amt + ,sum(ss_net_profit) profit + from store_sales,date_dim,store,household_demographics + where store_sales.ss_sold_date_sk = date_dim.d_date_sk + and store_sales.ss_store_sk = store.s_store_sk + and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk + and (household_demographics.hd_dep_count = 0 or household_demographics.hd_vehicle_count > 4) + and date_dim.d_dow = 1 + and date_dim.d_year in (1999,1999+1,1999+2) + and store.s_number_employees between 200 and 295 + group by ss_ticket_number,ss_customer_sk,ss_addr_sk,store.s_city) ms,customer + where ss_customer_sk = c_customer_sk + order by c_last_name,c_first_name,substr(s_city,1,30), profit + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q8.sql b/benchmarks/queries/tpcds/q8.sql new file mode 100644 index 0000000000..0a994b4d21 --- /dev/null +++ b/benchmarks/queries/tpcds/q8.sql @@ -0,0 +1,109 @@ +-- SQLBench-DS query 8 derived from TPC-DS query 8 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select s_store_name + ,sum(ss_net_profit) + from store_sales + ,date_dim + ,store, + (select ca_zip + from ( + SELECT substr(ca_zip,1,5) ca_zip + FROM customer_address + WHERE substr(ca_zip,1,5) IN ( + '19100','41548','51640','49699','88329','55986', + '85119','19510','61020','95452','26235', + '51102','16733','42819','27823','90192', + '31905','28865','62197','23750','81398', + '95288','45114','82060','12313','25218', + '64386','46400','77230','69271','43672', + '36521','34217','13017','27936','42766', + '59233','26060','27477','39981','93402', + '74270','13932','51731','71642','17710', + '85156','21679','70840','67191','39214', + '35273','27293','17128','15458','31615', + '60706','67657','54092','32775','14683', + '32206','62543','43053','11297','58216', + '49410','14710','24501','79057','77038', + '91286','32334','46298','18326','67213', + '65382','40315','56115','80162','55956', + '81583','73588','32513','62880','12201', + '11592','17014','83832','61796','57872', + '78829','69912','48524','22016','26905', + '48511','92168','63051','25748','89786', + '98827','86404','53029','37524','14039', + '50078','34487','70142','18697','40129', + '60642','42810','62667','57183','46414', + '58463','71211','46364','34851','54884', + '25382','25239','74126','21568','84204', + '13607','82518','32982','36953','86001', + '79278','21745','64444','35199','83181', + '73255','86177','98043','90392','13882', + '47084','17859','89526','42072','20233', + '52745','75000','22044','77013','24182', + '52554','56138','43440','86100','48791', + '21883','17096','15965','31196','74903', + '19810','35763','92020','55176','54433', + '68063','71919','44384','16612','32109', + '28207','14762','89933','10930','27616', + '56809','14244','22733','33177','29784', + '74968','37887','11299','34692','85843', + '83663','95421','19323','17406','69264', + '28341','50150','79121','73974','92917', + '21229','32254','97408','46011','37169', + '18146','27296','62927','68812','47734', + '86572','12620','80252','50173','27261', + '29534','23488','42184','23695','45868', + '12910','23429','29052','63228','30731', + '15747','25827','22332','62349','56661', + '44652','51862','57007','22773','40361', + '65238','19327','17282','44708','35484', + 
'34064','11148','92729','22995','18833', + '77528','48917','17256','93166','68576', + '71096','56499','35096','80551','82424', + '17700','32748','78969','46820','57725', + '46179','54677','98097','62869','83959', + '66728','19716','48326','27420','53458', + '69056','84216','36688','63957','41469', + '66843','18024','81950','21911','58387', + '58103','19813','34581','55347','17171', + '35914','75043','75088','80541','26802', + '28849','22356','57721','77084','46385', + '59255','29308','65885','70673','13306', + '68788','87335','40987','31654','67560', + '92309','78116','65961','45018','16548', + '67092','21818','33716','49449','86150', + '12156','27574','43201','50977','52839', + '33234','86611','71494','17823','57172', + '59869','34086','51052','11320','39717', + '79604','24672','70555','38378','91135', + '15567','21606','74994','77168','38607', + '27384','68328','88944','40203','37893', + '42726','83549','48739','55652','27543', + '23109','98908','28831','45011','47525', + '43870','79404','35780','42136','49317', + '14574','99586','21107','14302','83882', + '81272','92552','14916','87533','86518', + '17862','30741','96288','57886','30304', + '24201','79457','36728','49833','35182', + '20108','39858','10804','47042','20439', + '54708','59027','82499','75311','26548', + '53406','92060','41152','60446','33129', + '43979','16903','60319','35550','33887', + '25463','40343','20726','44429') + intersect + select ca_zip + from (SELECT substr(ca_zip,1,5) ca_zip,count(*) cnt + FROM customer_address, customer + WHERE ca_address_sk = c_current_addr_sk and + c_preferred_cust_flag='Y' + group by ca_zip + having count(*) > 10)A1)A2) V1 + where ss_store_sk = s_store_sk + and ss_sold_date_sk = d_date_sk + and d_qoy = 1 and d_year = 2000 + and (substr(s_zip,1,2) = substr(V1.ca_zip,1,2)) + group by s_store_name + order by s_store_name + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q80.sql b/benchmarks/queries/tpcds/q80.sql new file mode 100644 index 0000000000..29b2f87464 --- /dev/null +++ b/benchmarks/queries/tpcds/q80.sql @@ -0,0 +1,97 @@ +-- SQLBench-DS query 80 derived from TPC-DS query 80 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+with ssr as + (select s_store_id as store_id, + sum(ss_ext_sales_price) as sales, + sum(coalesce(sr_return_amt, 0)) as returns, + sum(ss_net_profit - coalesce(sr_net_loss, 0)) as profit + from store_sales left outer join store_returns on + (ss_item_sk = sr_item_sk and ss_ticket_number = sr_ticket_number), + date_dim, + store, + item, + promotion + where ss_sold_date_sk = d_date_sk + and d_date between cast('2002-08-04' as date) + and (cast('2002-08-04' as date) + INTERVAL '30 DAYS') + and ss_store_sk = s_store_sk + and ss_item_sk = i_item_sk + and i_current_price > 50 + and ss_promo_sk = p_promo_sk + and p_channel_tv = 'N' + group by s_store_id) + , + csr as + (select cp_catalog_page_id as catalog_page_id, + sum(cs_ext_sales_price) as sales, + sum(coalesce(cr_return_amount, 0)) as returns, + sum(cs_net_profit - coalesce(cr_net_loss, 0)) as profit + from catalog_sales left outer join catalog_returns on + (cs_item_sk = cr_item_sk and cs_order_number = cr_order_number), + date_dim, + catalog_page, + item, + promotion + where cs_sold_date_sk = d_date_sk + and d_date between cast('2002-08-04' as date) + and (cast('2002-08-04' as date) + INTERVAL '30 DAYS') + and cs_catalog_page_sk = cp_catalog_page_sk + and cs_item_sk = i_item_sk + and i_current_price > 50 + and cs_promo_sk = p_promo_sk + and p_channel_tv = 'N' +group by cp_catalog_page_id) + , + wsr as + (select web_site_id, + sum(ws_ext_sales_price) as sales, + sum(coalesce(wr_return_amt, 0)) as returns, + sum(ws_net_profit - coalesce(wr_net_loss, 0)) as profit + from web_sales left outer join web_returns on + (ws_item_sk = wr_item_sk and ws_order_number = wr_order_number), + date_dim, + web_site, + item, + promotion + where ws_sold_date_sk = d_date_sk + and d_date between cast('2002-08-04' as date) + and (cast('2002-08-04' as date) + INTERVAL '30 DAYS') + and ws_web_site_sk = web_site_sk + and ws_item_sk = i_item_sk + and i_current_price > 50 + and ws_promo_sk = p_promo_sk + and p_channel_tv = 'N' +group by web_site_id) + select channel + , id + , sum(sales) as sales + , sum(returns) as returns + , sum(profit) as profit + from + (select 'store channel' as channel + , 'store' || store_id as id + , sales + , returns + , profit + from ssr + union all + select 'catalog channel' as channel + , 'catalog_page' || catalog_page_id as id + , sales + , returns + , profit + from csr + union all + select 'web channel' as channel + , 'web_site' || web_site_id as id + , sales + , returns + , profit + from wsr + ) x + group by rollup (channel, id) + order by channel + ,id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q81.sql b/benchmarks/queries/tpcds/q81.sql new file mode 100644 index 0000000000..8dd4c43067 --- /dev/null +++ b/benchmarks/queries/tpcds/q81.sql @@ -0,0 +1,32 @@ +-- SQLBench-DS query 81 derived from TPC-DS query 81 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+with customer_total_return as + (select cr_returning_customer_sk as ctr_customer_sk + ,ca_state as ctr_state, + sum(cr_return_amt_inc_tax) as ctr_total_return + from catalog_returns + ,date_dim + ,customer_address + where cr_returned_date_sk = d_date_sk + and d_year =1998 + and cr_returning_addr_sk = ca_address_sk + group by cr_returning_customer_sk + ,ca_state ) + select c_customer_id,c_salutation,c_first_name,c_last_name,ca_street_number,ca_street_name + ,ca_street_type,ca_suite_number,ca_city,ca_county,ca_state,ca_zip,ca_country,ca_gmt_offset + ,ca_location_type,ctr_total_return + from customer_total_return ctr1 + ,customer_address + ,customer + where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2 + from customer_total_return ctr2 + where ctr1.ctr_state = ctr2.ctr_state) + and ca_address_sk = c_current_addr_sk + and ca_state = 'TX' + and ctr1.ctr_customer_sk = c_customer_sk + order by c_customer_id,c_salutation,c_first_name,c_last_name,ca_street_number,ca_street_name + ,ca_street_type,ca_suite_number,ca_city,ca_county,ca_state,ca_zip,ca_country,ca_gmt_offset + ,ca_location_type,ctr_total_return + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q82.sql b/benchmarks/queries/tpcds/q82.sql new file mode 100644 index 0000000000..faea7a2f67 --- /dev/null +++ b/benchmarks/queries/tpcds/q82.sql @@ -0,0 +1,18 @@ +-- SQLBench-DS query 82 derived from TPC-DS query 82 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select i_item_id + ,i_item_desc + ,i_current_price + from item, inventory, date_dim, store_sales + where i_current_price between 69 and 69+30 + and inv_item_sk = i_item_sk + and d_date_sk=inv_date_sk + and d_date between cast('1998-06-06' as date) and (cast('1998-06-06' as date) + INTERVAL '60 DAYS') + and i_manufact_id in (105,513,180,137) + and inv_quantity_on_hand between 100 and 500 + and ss_item_sk = i_item_sk + group by i_item_id,i_item_desc,i_current_price + order by i_item_id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q83.sql b/benchmarks/queries/tpcds/q83.sql new file mode 100644 index 0000000000..b2512ed83e --- /dev/null +++ b/benchmarks/queries/tpcds/q83.sql @@ -0,0 +1,68 @@ +-- SQLBench-DS query 83 derived from TPC-DS query 83 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+with sr_items as + (select i_item_id item_id, + sum(sr_return_quantity) sr_item_qty + from store_returns, + item, + date_dim + where sr_item_sk = i_item_sk + and d_date in + (select d_date + from date_dim + where d_week_seq in + (select d_week_seq + from date_dim + where d_date in ('2000-04-29','2000-09-09','2000-11-02'))) + and sr_returned_date_sk = d_date_sk + group by i_item_id), + cr_items as + (select i_item_id item_id, + sum(cr_return_quantity) cr_item_qty + from catalog_returns, + item, + date_dim + where cr_item_sk = i_item_sk + and d_date in + (select d_date + from date_dim + where d_week_seq in + (select d_week_seq + from date_dim + where d_date in ('2000-04-29','2000-09-09','2000-11-02'))) + and cr_returned_date_sk = d_date_sk + group by i_item_id), + wr_items as + (select i_item_id item_id, + sum(wr_return_quantity) wr_item_qty + from web_returns, + item, + date_dim + where wr_item_sk = i_item_sk + and d_date in + (select d_date + from date_dim + where d_week_seq in + (select d_week_seq + from date_dim + where d_date in ('2000-04-29','2000-09-09','2000-11-02'))) + and wr_returned_date_sk = d_date_sk + group by i_item_id) + select sr_items.item_id + ,sr_item_qty + ,sr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 sr_dev + ,cr_item_qty + ,cr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 cr_dev + ,wr_item_qty + ,wr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 wr_dev + ,(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 average + from sr_items + ,cr_items + ,wr_items + where sr_items.item_id=cr_items.item_id + and sr_items.item_id=wr_items.item_id + order by sr_items.item_id + ,sr_item_qty + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q84.sql b/benchmarks/queries/tpcds/q84.sql new file mode 100644 index 0000000000..a07249b463 --- /dev/null +++ b/benchmarks/queries/tpcds/q84.sql @@ -0,0 +1,22 @@ +-- SQLBench-DS query 84 derived from TPC-DS query 84 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select c_customer_id as customer_id + , coalesce(c_last_name,'') || ', ' || coalesce(c_first_name,'') as customername + from customer + ,customer_address + ,customer_demographics + ,household_demographics + ,income_band + ,store_returns + where ca_city = 'White Oak' + and c_current_addr_sk = ca_address_sk + and ib_lower_bound >= 45626 + and ib_upper_bound <= 45626 + 50000 + and ib_income_band_sk = hd_income_band_sk + and cd_demo_sk = c_current_cdemo_sk + and hd_demo_sk = c_current_hdemo_sk + and sr_cdemo_sk = cd_demo_sk + order by c_customer_id + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q85.sql b/benchmarks/queries/tpcds/q85.sql new file mode 100644 index 0000000000..c529acfe9e --- /dev/null +++ b/benchmarks/queries/tpcds/q85.sql @@ -0,0 +1,85 @@ +-- SQLBench-DS query 85 derived from TPC-DS query 85 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+select substr(r_reason_desc,1,20) + ,avg(ws_quantity) + ,avg(wr_refunded_cash) + ,avg(wr_fee) + from web_sales, web_returns, web_page, customer_demographics cd1, + customer_demographics cd2, customer_address, date_dim, reason + where ws_web_page_sk = wp_web_page_sk + and ws_item_sk = wr_item_sk + and ws_order_number = wr_order_number + and ws_sold_date_sk = d_date_sk and d_year = 2001 + and cd1.cd_demo_sk = wr_refunded_cdemo_sk + and cd2.cd_demo_sk = wr_returning_cdemo_sk + and ca_address_sk = wr_refunded_addr_sk + and r_reason_sk = wr_reason_sk + and + ( + ( + cd1.cd_marital_status = 'D' + and + cd1.cd_marital_status = cd2.cd_marital_status + and + cd1.cd_education_status = 'Primary' + and + cd1.cd_education_status = cd2.cd_education_status + and + ws_sales_price between 100.00 and 150.00 + ) + or + ( + cd1.cd_marital_status = 'U' + and + cd1.cd_marital_status = cd2.cd_marital_status + and + cd1.cd_education_status = 'Unknown' + and + cd1.cd_education_status = cd2.cd_education_status + and + ws_sales_price between 50.00 and 100.00 + ) + or + ( + cd1.cd_marital_status = 'M' + and + cd1.cd_marital_status = cd2.cd_marital_status + and + cd1.cd_education_status = 'Advanced Degree' + and + cd1.cd_education_status = cd2.cd_education_status + and + ws_sales_price between 150.00 and 200.00 + ) + ) + and + ( + ( + ca_country = 'United States' + and + ca_state in ('SC', 'IN', 'VA') + and ws_net_profit between 100 and 200 + ) + or + ( + ca_country = 'United States' + and + ca_state in ('WA', 'KS', 'KY') + and ws_net_profit between 150 and 300 + ) + or + ( + ca_country = 'United States' + and + ca_state in ('SD', 'WI', 'NE') + and ws_net_profit between 50 and 250 + ) + ) +group by r_reason_desc +order by substr(r_reason_desc,1,20) + ,avg(ws_quantity) + ,avg(wr_refunded_cash) + ,avg(wr_fee) + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q86.sql b/benchmarks/queries/tpcds/q86.sql new file mode 100644 index 0000000000..ed7f4f85d0 --- /dev/null +++ b/benchmarks/queries/tpcds/q86.sql @@ -0,0 +1,27 @@ +-- SQLBench-DS query 86 derived from TPC-DS query 86 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select + sum(ws_net_paid) as total_sum + ,i_category + ,i_class + ,grouping(i_category)+grouping(i_class) as lochierarchy + ,rank() over ( + partition by grouping(i_category)+grouping(i_class), + case when grouping(i_class) = 0 then i_category end + order by sum(ws_net_paid) desc) as rank_within_parent + from + web_sales + ,date_dim d1 + ,item + where + d1.d_month_seq between 1205 and 1205+11 + and d1.d_date_sk = ws_sold_date_sk + and i_item_sk = ws_item_sk + group by rollup(i_category,i_class) + order by + lochierarchy desc, + case when lochierarchy = 0 then i_category end, + rank_within_parent + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q87.sql b/benchmarks/queries/tpcds/q87.sql new file mode 100644 index 0000000000..13e2d8e2e7 --- /dev/null +++ b/benchmarks/queries/tpcds/q87.sql @@ -0,0 +1,24 @@ +-- SQLBench-DS query 87 derived from TPC-DS query 87 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+select count(*) +from ((select distinct c_last_name, c_first_name, d_date + from store_sales, date_dim, customer + where store_sales.ss_sold_date_sk = date_dim.d_date_sk + and store_sales.ss_customer_sk = customer.c_customer_sk + and d_month_seq between 1189 and 1189+11) + except + (select distinct c_last_name, c_first_name, d_date + from catalog_sales, date_dim, customer + where catalog_sales.cs_sold_date_sk = date_dim.d_date_sk + and catalog_sales.cs_bill_customer_sk = customer.c_customer_sk + and d_month_seq between 1189 and 1189+11) + except + (select distinct c_last_name, c_first_name, d_date + from web_sales, date_dim, customer + where web_sales.ws_sold_date_sk = date_dim.d_date_sk + and web_sales.ws_bill_customer_sk = customer.c_customer_sk + and d_month_seq between 1189 and 1189+11) +) cool_cust +; + diff --git a/benchmarks/queries/tpcds/q88.sql b/benchmarks/queries/tpcds/q88.sql new file mode 100644 index 0000000000..8d47334a4e --- /dev/null +++ b/benchmarks/queries/tpcds/q88.sql @@ -0,0 +1,95 @@ +-- SQLBench-DS query 88 derived from TPC-DS query 88 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select * +from + (select count(*) h8_30_to_9 + from store_sales, household_demographics , time_dim, store + where ss_sold_time_sk = time_dim.t_time_sk + and ss_hdemo_sk = household_demographics.hd_demo_sk + and ss_store_sk = s_store_sk + and time_dim.t_hour = 8 + and time_dim.t_minute >= 30 + and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or + (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or + (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2)) + and store.s_store_name = 'ese') s1, + (select count(*) h9_to_9_30 + from store_sales, household_demographics , time_dim, store + where ss_sold_time_sk = time_dim.t_time_sk + and ss_hdemo_sk = household_demographics.hd_demo_sk + and ss_store_sk = s_store_sk + and time_dim.t_hour = 9 + and time_dim.t_minute < 30 + and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or + (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or + (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2)) + and store.s_store_name = 'ese') s2, + (select count(*) h9_30_to_10 + from store_sales, household_demographics , time_dim, store + where ss_sold_time_sk = time_dim.t_time_sk + and ss_hdemo_sk = household_demographics.hd_demo_sk + and ss_store_sk = s_store_sk + and time_dim.t_hour = 9 + and time_dim.t_minute >= 30 + and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or + (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or + (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2)) + and store.s_store_name = 'ese') s3, + (select count(*) h10_to_10_30 + from store_sales, household_demographics , time_dim, store + where ss_sold_time_sk = time_dim.t_time_sk + and ss_hdemo_sk = household_demographics.hd_demo_sk + and ss_store_sk = s_store_sk + and time_dim.t_hour = 10 + and time_dim.t_minute < 30 + and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or + (household_demographics.hd_dep_count = 1 and 
household_demographics.hd_vehicle_count<=1+2) or + (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2)) + and store.s_store_name = 'ese') s4, + (select count(*) h10_30_to_11 + from store_sales, household_demographics , time_dim, store + where ss_sold_time_sk = time_dim.t_time_sk + and ss_hdemo_sk = household_demographics.hd_demo_sk + and ss_store_sk = s_store_sk + and time_dim.t_hour = 10 + and time_dim.t_minute >= 30 + and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or + (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or + (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2)) + and store.s_store_name = 'ese') s5, + (select count(*) h11_to_11_30 + from store_sales, household_demographics , time_dim, store + where ss_sold_time_sk = time_dim.t_time_sk + and ss_hdemo_sk = household_demographics.hd_demo_sk + and ss_store_sk = s_store_sk + and time_dim.t_hour = 11 + and time_dim.t_minute < 30 + and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or + (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or + (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2)) + and store.s_store_name = 'ese') s6, + (select count(*) h11_30_to_12 + from store_sales, household_demographics , time_dim, store + where ss_sold_time_sk = time_dim.t_time_sk + and ss_hdemo_sk = household_demographics.hd_demo_sk + and ss_store_sk = s_store_sk + and time_dim.t_hour = 11 + and time_dim.t_minute >= 30 + and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or + (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or + (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2)) + and store.s_store_name = 'ese') s7, + (select count(*) h12_to_12_30 + from store_sales, household_demographics , time_dim, store + where ss_sold_time_sk = time_dim.t_time_sk + and ss_hdemo_sk = household_demographics.hd_demo_sk + and ss_store_sk = s_store_sk + and time_dim.t_hour = 12 + and time_dim.t_minute < 30 + and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or + (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or + (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2)) + and store.s_store_name = 'ese') s8 +; + diff --git a/benchmarks/queries/tpcds/q89.sql b/benchmarks/queries/tpcds/q89.sql new file mode 100644 index 0000000000..ac02b6fe33 --- /dev/null +++ b/benchmarks/queries/tpcds/q89.sql @@ -0,0 +1,29 @@ +-- SQLBench-DS query 89 derived from TPC-DS query 89 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+select * +from( +select i_category, i_class, i_brand, + s_store_name, s_company_name, + d_moy, + sum(ss_sales_price) sum_sales, + avg(sum(ss_sales_price)) over + (partition by i_category, i_brand, s_store_name, s_company_name) + avg_monthly_sales +from item, store_sales, date_dim, store +where ss_item_sk = i_item_sk and + ss_sold_date_sk = d_date_sk and + ss_store_sk = s_store_sk and + d_year in (2001) and + ((i_category in ('Children','Jewelry','Home') and + i_class in ('infants','birdal','flatware') + ) + or (i_category in ('Electronics','Music','Books') and + i_class in ('audio','classical','science') + )) +group by i_category, i_class, i_brand, + s_store_name, s_company_name, d_moy) tmp1 +where case when (avg_monthly_sales <> 0) then (abs(sum_sales - avg_monthly_sales) / avg_monthly_sales) else null end > 0.1 +order by sum_sales - avg_monthly_sales, s_store_name + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q9.sql b/benchmarks/queries/tpcds/q9.sql new file mode 100644 index 0000000000..cf723ccf29 --- /dev/null +++ b/benchmarks/queries/tpcds/q9.sql @@ -0,0 +1,52 @@ +-- SQLBench-DS query 9 derived from TPC-DS query 9 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select case when (select count(*) + from store_sales + where ss_quantity between 1 and 20) > 31002 + then (select avg(ss_ext_discount_amt) + from store_sales + where ss_quantity between 1 and 20) + else (select avg(ss_net_profit) + from store_sales + where ss_quantity between 1 and 20) end bucket1 , + case when (select count(*) + from store_sales + where ss_quantity between 21 and 40) > 588 + then (select avg(ss_ext_discount_amt) + from store_sales + where ss_quantity between 21 and 40) + else (select avg(ss_net_profit) + from store_sales + where ss_quantity between 21 and 40) end bucket2, + case when (select count(*) + from store_sales + where ss_quantity between 41 and 60) > 2456 + then (select avg(ss_ext_discount_amt) + from store_sales + where ss_quantity between 41 and 60) + else (select avg(ss_net_profit) + from store_sales + where ss_quantity between 41 and 60) end bucket3, + case when (select count(*) + from store_sales + where ss_quantity between 61 and 80) > 21645 + then (select avg(ss_ext_discount_amt) + from store_sales + where ss_quantity between 61 and 80) + else (select avg(ss_net_profit) + from store_sales + where ss_quantity between 61 and 80) end bucket4, + case when (select count(*) + from store_sales + where ss_quantity between 81 and 100) > 20553 + then (select avg(ss_ext_discount_amt) + from store_sales + where ss_quantity between 81 and 100) + else (select avg(ss_net_profit) + from store_sales + where ss_quantity between 81 and 100) end bucket5 +from reason +where r_reason_sk = 1 +; + diff --git a/benchmarks/queries/tpcds/q90.sql b/benchmarks/queries/tpcds/q90.sql new file mode 100644 index 0000000000..dedf5fd066 --- /dev/null +++ b/benchmarks/queries/tpcds/q90.sql @@ -0,0 +1,23 @@ +-- SQLBench-DS query 90 derived from TPC-DS query 90 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+select cast(amc as decimal(15,4))/cast(pmc as decimal(15,4)) am_pm_ratio + from ( select count(*) amc + from web_sales, household_demographics , time_dim, web_page + where ws_sold_time_sk = time_dim.t_time_sk + and ws_ship_hdemo_sk = household_demographics.hd_demo_sk + and ws_web_page_sk = web_page.wp_web_page_sk + and time_dim.t_hour between 9 and 9+1 + and household_demographics.hd_dep_count = 2 + and web_page.wp_char_count between 5000 and 5200) at, + ( select count(*) pmc + from web_sales, household_demographics , time_dim, web_page + where ws_sold_time_sk = time_dim.t_time_sk + and ws_ship_hdemo_sk = household_demographics.hd_demo_sk + and ws_web_page_sk = web_page.wp_web_page_sk + and time_dim.t_hour between 15 and 15+1 + and household_demographics.hd_dep_count = 2 + and web_page.wp_char_count between 5000 and 5200) pt + order by am_pm_ratio + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q91.sql b/benchmarks/queries/tpcds/q91.sql new file mode 100644 index 0000000000..894d41bb2b --- /dev/null +++ b/benchmarks/queries/tpcds/q91.sql @@ -0,0 +1,32 @@ +-- SQLBench-DS query 91 derived from TPC-DS query 91 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select + cc_call_center_id Call_Center, + cc_name Call_Center_Name, + cc_manager Manager, + sum(cr_net_loss) Returns_Loss +from + call_center, + catalog_returns, + date_dim, + customer, + customer_address, + customer_demographics, + household_demographics +where + cr_call_center_sk = cc_call_center_sk +and cr_returned_date_sk = d_date_sk +and cr_returning_customer_sk= c_customer_sk +and cd_demo_sk = c_current_cdemo_sk +and hd_demo_sk = c_current_hdemo_sk +and ca_address_sk = c_current_addr_sk +and d_year = 2002 +and d_moy = 11 +and ( (cd_marital_status = 'M' and cd_education_status = 'Unknown') + or(cd_marital_status = 'W' and cd_education_status = 'Advanced Degree')) +and hd_buy_potential like 'Unknown%' +and ca_gmt_offset = -6 +group by cc_call_center_id,cc_name,cc_manager,cd_marital_status,cd_education_status +order by sum(cr_net_loss) desc; + diff --git a/benchmarks/queries/tpcds/q92.sql b/benchmarks/queries/tpcds/q92.sql new file mode 100644 index 0000000000..171a968627 --- /dev/null +++ b/benchmarks/queries/tpcds/q92.sql @@ -0,0 +1,31 @@ +-- SQLBench-DS query 92 derived from TPC-DS query 92 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select + sum(ws_ext_discount_amt) as `Excess Discount Amount` +from + web_sales + ,item + ,date_dim +where +i_manufact_id = 914 +and i_item_sk = ws_item_sk +and d_date between '2001-01-25' and + (cast('2001-01-25' as date) + INTERVAL '90 DAYS') +and d_date_sk = ws_sold_date_sk +and ws_ext_discount_amt + > ( + SELECT + 1.3 * avg(ws_ext_discount_amt) + FROM + web_sales + ,date_dim + WHERE + ws_item_sk = i_item_sk + and d_date between '2001-01-25' and + (cast('2001-01-25' as date) + INTERVAL '90 DAYS') + and d_date_sk = ws_sold_date_sk + ) +order by sum(ws_ext_discount_amt) + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q93.sql b/benchmarks/queries/tpcds/q93.sql new file mode 100644 index 0000000000..31ec9e7d4e --- /dev/null +++ b/benchmarks/queries/tpcds/q93.sql @@ -0,0 +1,19 @@ +-- SQLBench-DS query 93 derived from TPC-DS query 93 under the terms of the TPC Fair Use Policy. 
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select ss_customer_sk + ,sum(act_sales) sumsales + from (select ss_item_sk + ,ss_ticket_number + ,ss_customer_sk + ,case when sr_return_quantity is not null then (ss_quantity-sr_return_quantity)*ss_sales_price + else (ss_quantity*ss_sales_price) end act_sales + from store_sales left outer join store_returns on (sr_item_sk = ss_item_sk + and sr_ticket_number = ss_ticket_number) + ,reason + where sr_reason_sk = r_reason_sk + and r_reason_desc = 'Did not get it on time') t + group by ss_customer_sk + order by sumsales, ss_customer_sk + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q94.sql b/benchmarks/queries/tpcds/q94.sql new file mode 100644 index 0000000000..cf04e14e0d --- /dev/null +++ b/benchmarks/queries/tpcds/q94.sql @@ -0,0 +1,30 @@ +-- SQLBench-DS query 94 derived from TPC-DS query 94 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select + count(distinct ws_order_number) as `order count` + ,sum(ws_ext_ship_cost) as `total shipping cost` + ,sum(ws_net_profit) as `total net profit` +from + web_sales ws1 + ,date_dim + ,customer_address + ,web_site +where + d_date between '1999-4-01' and + (cast('1999-4-01' as date) + INTERVAL '60 DAYS') +and ws1.ws_ship_date_sk = d_date_sk +and ws1.ws_ship_addr_sk = ca_address_sk +and ca_state = 'WI' +and ws1.ws_web_site_sk = web_site_sk +and web_company_name = 'pri' +and exists (select * + from web_sales ws2 + where ws1.ws_order_number = ws2.ws_order_number + and ws1.ws_warehouse_sk <> ws2.ws_warehouse_sk) +and not exists(select * + from web_returns wr1 + where ws1.ws_order_number = wr1.wr_order_number) +order by count(distinct ws_order_number) + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q95.sql b/benchmarks/queries/tpcds/q95.sql new file mode 100644 index 0000000000..2db3e50327 --- /dev/null +++ b/benchmarks/queries/tpcds/q95.sql @@ -0,0 +1,33 @@ +-- SQLBench-DS query 95 derived from TPC-DS query 95 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+with ws_wh as +(select ws1.ws_order_number,ws1.ws_warehouse_sk wh1,ws2.ws_warehouse_sk wh2 + from web_sales ws1,web_sales ws2 + where ws1.ws_order_number = ws2.ws_order_number + and ws1.ws_warehouse_sk <> ws2.ws_warehouse_sk) + select + count(distinct ws_order_number) as `order count` + ,sum(ws_ext_ship_cost) as `total shipping cost` + ,sum(ws_net_profit) as `total net profit` +from + web_sales ws1 + ,date_dim + ,customer_address + ,web_site +where + d_date between '2002-5-01' and + (cast('2002-5-01' as date) + INTERVAL '60 DAYS') +and ws1.ws_ship_date_sk = d_date_sk +and ws1.ws_ship_addr_sk = ca_address_sk +and ca_state = 'MA' +and ws1.ws_web_site_sk = web_site_sk +and web_company_name = 'pri' +and ws1.ws_order_number in (select ws_order_number + from ws_wh) +and ws1.ws_order_number in (select wr_order_number + from web_returns,ws_wh + where wr_order_number = ws_wh.ws_order_number) +order by count(distinct ws_order_number) + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q96.sql b/benchmarks/queries/tpcds/q96.sql new file mode 100644 index 0000000000..63c6fdbf97 --- /dev/null +++ b/benchmarks/queries/tpcds/q96.sql @@ -0,0 +1,17 @@ +-- SQLBench-DS query 96 derived from TPC-DS query 96 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select count(*) +from store_sales + ,household_demographics + ,time_dim, store +where ss_sold_time_sk = time_dim.t_time_sk + and ss_hdemo_sk = household_demographics.hd_demo_sk + and ss_store_sk = s_store_sk + and time_dim.t_hour = 8 + and time_dim.t_minute >= 30 + and household_demographics.hd_dep_count = 5 + and store.s_store_name = 'ese' +order by count(*) + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q97.sql b/benchmarks/queries/tpcds/q97.sql new file mode 100644 index 0000000000..5741cc9c56 --- /dev/null +++ b/benchmarks/queries/tpcds/q97.sql @@ -0,0 +1,26 @@ +-- SQLBench-DS query 97 derived from TPC-DS query 97 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +with ssci as ( +select ss_customer_sk customer_sk + ,ss_item_sk item_sk +from store_sales,date_dim +where ss_sold_date_sk = d_date_sk + and d_month_seq between 1211 and 1211 + 11 +group by ss_customer_sk + ,ss_item_sk), +csci as( + select cs_bill_customer_sk customer_sk + ,cs_item_sk item_sk +from catalog_sales,date_dim +where cs_sold_date_sk = d_date_sk + and d_month_seq between 1211 and 1211 + 11 +group by cs_bill_customer_sk + ,cs_item_sk) + select sum(case when ssci.customer_sk is not null and csci.customer_sk is null then 1 else 0 end) store_only + ,sum(case when ssci.customer_sk is null and csci.customer_sk is not null then 1 else 0 end) catalog_only + ,sum(case when ssci.customer_sk is not null and csci.customer_sk is not null then 1 else 0 end) store_and_catalog +from ssci full outer join csci on (ssci.customer_sk=csci.customer_sk + and ssci.item_sk = csci.item_sk) + LIMIT 100; + diff --git a/benchmarks/queries/tpcds/q98.sql b/benchmarks/queries/tpcds/q98.sql new file mode 100644 index 0000000000..86bf08b2a1 --- /dev/null +++ b/benchmarks/queries/tpcds/q98.sql @@ -0,0 +1,34 @@ +-- SQLBench-DS query 98 derived from TPC-DS query 98 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. 
+select i_item_id + ,i_item_desc + ,i_category + ,i_class + ,i_current_price + ,sum(ss_ext_sales_price) as itemrevenue + ,sum(ss_ext_sales_price)*100/sum(sum(ss_ext_sales_price)) over + (partition by i_class) as revenueratio +from + store_sales + ,item + ,date_dim +where + ss_item_sk = i_item_sk + and i_category in ('Shoes', 'Music', 'Men') + and ss_sold_date_sk = d_date_sk + and d_date between cast('2000-01-05' as date) + and (cast('2000-01-05' as date) + INTERVAL '30 DAYS') +group by + i_item_id + ,i_item_desc + ,i_category + ,i_class + ,i_current_price +order by + i_category + ,i_class + ,i_item_id + ,i_item_desc + ,revenueratio; + diff --git a/benchmarks/queries/tpcds/q99.sql b/benchmarks/queries/tpcds/q99.sql new file mode 100644 index 0000000000..8bd1f6406d --- /dev/null +++ b/benchmarks/queries/tpcds/q99.sql @@ -0,0 +1,36 @@ +-- SQLBench-DS query 99 derived from TPC-DS query 99 under the terms of the TPC Fair Use Policy. +-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council. +-- This query was generated at scale factor 1. +select + substr(w_warehouse_name,1,20) + ,sm_type + ,cc_name + ,sum(case when (cs_ship_date_sk - cs_sold_date_sk <= 30 ) then 1 else 0 end) as `30 days` + ,sum(case when (cs_ship_date_sk - cs_sold_date_sk > 30) and + (cs_ship_date_sk - cs_sold_date_sk <= 60) then 1 else 0 end ) as `31-60 days` + ,sum(case when (cs_ship_date_sk - cs_sold_date_sk > 60) and + (cs_ship_date_sk - cs_sold_date_sk <= 90) then 1 else 0 end) as `61-90 days` + ,sum(case when (cs_ship_date_sk - cs_sold_date_sk > 90) and + (cs_ship_date_sk - cs_sold_date_sk <= 120) then 1 else 0 end) as `91-120 days` + ,sum(case when (cs_ship_date_sk - cs_sold_date_sk > 120) then 1 else 0 end) as `>120 days` +from + catalog_sales + ,warehouse + ,ship_mode + ,call_center + ,date_dim +where + d_month_seq between 1188 and 1188 + 11 +and cs_ship_date_sk = d_date_sk +and cs_warehouse_sk = w_warehouse_sk +and cs_ship_mode_sk = sm_ship_mode_sk +and cs_call_center_sk = cc_call_center_sk +group by + substr(w_warehouse_name,1,20) + ,sm_type + ,cc_name +order by substr(w_warehouse_name,1,20) + ,sm_type + ,cc_name + LIMIT 100; + diff --git a/benchmarks/queries/tpch/q1.sql b/benchmarks/queries/tpch/q1.sql new file mode 100644 index 0000000000..0dc4c3e598 --- /dev/null +++ b/benchmarks/queries/tpch/q1.sql @@ -0,0 +1,23 @@ +-- SQLBench-H query 1 derived from TPC-H query 1 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order +from + lineitem +where + l_shipdate <= date '1998-12-01' - interval '68 days' +group by + l_returnflag, + l_linestatus +order by + l_returnflag, + l_linestatus; diff --git a/benchmarks/queries/tpch/q10.sql b/benchmarks/queries/tpch/q10.sql new file mode 100644 index 0000000000..576338f044 --- /dev/null +++ b/benchmarks/queries/tpch/q10.sql @@ -0,0 +1,33 @@ +-- SQLBench-H query 10 derived from TPC-H query 10 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + c_custkey, + c_name, + sum(l_extendedprice * (1 - l_discount)) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment +from + customer, + orders, + lineitem, + nation +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date '1993-07-01' + and o_orderdate < date '1993-07-01' + interval '3' month + and l_returnflag = 'R' + and c_nationkey = n_nationkey +group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +order by + revenue desc limit 20; diff --git a/benchmarks/queries/tpch/q11.sql b/benchmarks/queries/tpch/q11.sql new file mode 100644 index 0000000000..f4ead457b7 --- /dev/null +++ b/benchmarks/queries/tpch/q11.sql @@ -0,0 +1,29 @@ +-- SQLBench-H query 11 derived from TPC-H query 11 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value +from + partsupp, + supplier, + nation +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'ALGERIA' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001000000 + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'ALGERIA' + ) +order by + value desc; diff --git a/benchmarks/queries/tpch/q12.sql b/benchmarks/queries/tpch/q12.sql new file mode 100644 index 0000000000..4ab4ea6e3b --- /dev/null +++ b/benchmarks/queries/tpch/q12.sql @@ -0,0 +1,30 @@ +-- SQLBench-H query 12 derived from TPC-H query 12 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count +from + orders, + lineitem +where + o_orderkey = l_orderkey + and l_shipmode in ('FOB', 'SHIP') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1995-01-01' + and l_receiptdate < date '1995-01-01' + interval '1' year +group by + l_shipmode +order by + l_shipmode; diff --git a/benchmarks/queries/tpch/q13.sql b/benchmarks/queries/tpch/q13.sql new file mode 100644 index 0000000000..301e35d193 --- /dev/null +++ b/benchmarks/queries/tpch/q13.sql @@ -0,0 +1,22 @@ +-- SQLBench-H query 13 derived from TPC-H query 13 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + c_count, + count(*) as custdist +from + ( + select + c_custkey, + count(o_orderkey) + from + customer left outer join orders on + c_custkey = o_custkey + and o_comment not like '%express%requests%' + group by + c_custkey + ) as c_orders (c_custkey, c_count) +group by + c_count +order by + custdist desc, + c_count desc; diff --git a/benchmarks/queries/tpch/q14.sql b/benchmarks/queries/tpch/q14.sql new file mode 100644 index 0000000000..6040ac734c --- /dev/null +++ b/benchmarks/queries/tpch/q14.sql @@ -0,0 +1,15 @@ +-- SQLBench-H query 14 derived from TPC-H query 14 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + 100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue +from + lineitem, + part +where + l_partkey = p_partkey + and l_shipdate >= date '1995-02-01' + and l_shipdate < date '1995-02-01' + interval '1' month; diff --git a/benchmarks/queries/tpch/q15.sql b/benchmarks/queries/tpch/q15.sql new file mode 100644 index 0000000000..0fe03a79c0 --- /dev/null +++ b/benchmarks/queries/tpch/q15.sql @@ -0,0 +1,33 @@ +-- SQLBench-H query 15 derived from TPC-H query 15 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +create view revenue0 (supplier_no, total_revenue) as + select + l_suppkey, + sum(l_extendedprice * (1 - l_discount)) + from + lineitem + where + l_shipdate >= date '1996-08-01' + and l_shipdate < date '1996-08-01' + interval '3' month + group by + l_suppkey; +select + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +from + supplier, + revenue0 +where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue0 + ) +order by + s_suppkey; +drop view revenue0; diff --git a/benchmarks/queries/tpch/q16.sql b/benchmarks/queries/tpch/q16.sql new file mode 100644 index 0000000000..7fdf36522a --- /dev/null +++ b/benchmarks/queries/tpch/q16.sql @@ -0,0 +1,32 @@ +-- SQLBench-H query 16 derived from TPC-H query 16 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt +from + partsupp, + part +where + p_partkey = ps_partkey + and p_brand <> 'Brand#14' + and p_type not like 'SMALL PLATED%' + and p_size in (14, 6, 5, 31, 49, 15, 41, 47) + and ps_suppkey not in ( + select + s_suppkey + from + supplier + where + s_comment like '%Customer%Complaints%' + ) +group by + p_brand, + p_type, + p_size +order by + supplier_cnt desc, + p_brand, + p_type, + p_size; diff --git a/benchmarks/queries/tpch/q17.sql b/benchmarks/queries/tpch/q17.sql new file mode 100644 index 0000000000..ffa0f15c8a --- /dev/null +++ b/benchmarks/queries/tpch/q17.sql @@ -0,0 +1,19 @@ +-- SQLBench-H query 17 derived from TPC-H query 17 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + sum(l_extendedprice) / 7.0 as avg_yearly +from + lineitem, + part +where + p_partkey = l_partkey + and p_brand = 'Brand#42' + and p_container = 'LG BAG' + and l_quantity < ( + select + 0.2 * avg(l_quantity) + from + lineitem + where + l_partkey = p_partkey + ); diff --git a/benchmarks/queries/tpch/q18.sql b/benchmarks/queries/tpch/q18.sql new file mode 100644 index 0000000000..f4ab1945e7 --- /dev/null +++ b/benchmarks/queries/tpch/q18.sql @@ -0,0 +1,34 @@ +-- SQLBench-H query 18 derived from TPC-H query 18 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice, + sum(l_quantity) +from + customer, + orders, + lineitem +where + o_orderkey in ( + select + l_orderkey + from + lineitem + group by + l_orderkey having + sum(l_quantity) > 313 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey +group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +order by + o_totalprice desc, + o_orderdate limit 100; diff --git a/benchmarks/queries/tpch/q19.sql b/benchmarks/queries/tpch/q19.sql new file mode 100644 index 0000000000..ad5fb7d929 --- /dev/null +++ b/benchmarks/queries/tpch/q19.sql @@ -0,0 +1,37 @@ +-- SQLBench-H query 19 derived from TPC-H query 19 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + sum(l_extendedprice* (1 - l_discount)) as revenue +from + lineitem, + part +where + ( + p_partkey = l_partkey + and p_brand = 'Brand#21' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= 8 and l_quantity <= 8 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#13' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 20 and l_quantity <= 20 + 10 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#52' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 30 and l_quantity <= 30 + 10 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ); diff --git a/benchmarks/queries/tpch/q2.sql b/benchmarks/queries/tpch/q2.sql new file mode 100644 index 0000000000..2936532889 --- /dev/null +++ b/benchmarks/queries/tpch/q2.sql @@ -0,0 +1,45 @@ +-- SQLBench-H query 2 derived from TPC-H query 2 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +from + part, + supplier, + partsupp, + nation, + region +where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 48 + and p_type like '%TIN' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + and ps_supplycost = ( + select + min(ps_supplycost) + from + partsupp, + supplier, + nation, + region + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + ) +order by + s_acctbal desc, + n_name, + s_name, + p_partkey limit 100; diff --git a/benchmarks/queries/tpch/q20.sql b/benchmarks/queries/tpch/q20.sql new file mode 100644 index 0000000000..3136ca302c --- /dev/null +++ b/benchmarks/queries/tpch/q20.sql @@ -0,0 +1,39 @@ +-- SQLBench-H query 20 derived from TPC-H query 20 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + s_name, + s_address +from + supplier, + nation +where + s_suppkey in ( + select + ps_suppkey + from + partsupp + where + ps_partkey in ( + select + p_partkey + from + part + where + p_name like 'blanched%' + ) + and ps_availqty > ( + select + 0.5 * sum(l_quantity) + from + lineitem + where + l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date '1993-01-01' + and l_shipdate < date '1993-01-01' + interval '1' year + ) + ) + and s_nationkey = n_nationkey + and n_name = 'KENYA' +order by + s_name; diff --git a/benchmarks/queries/tpch/q21.sql b/benchmarks/queries/tpch/q21.sql new file mode 100644 index 0000000000..01704697c4 --- /dev/null +++ b/benchmarks/queries/tpch/q21.sql @@ -0,0 +1,41 @@ +-- SQLBench-H query 21 derived from TPC-H query 21 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + s_name, + count(*) as numwait +from + supplier, + lineitem l1, + orders, + nation +where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + lineitem l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + lineitem l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 'ARGENTINA' +group by + s_name +order by + numwait desc, + s_name limit 100; diff --git a/benchmarks/queries/tpch/q22.sql b/benchmarks/queries/tpch/q22.sql new file mode 100644 index 0000000000..8d528ef6da --- /dev/null +++ b/benchmarks/queries/tpch/q22.sql @@ -0,0 +1,39 @@ +-- SQLBench-H query 22 derived from TPC-H query 22 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal +from + ( + select + substring(c_phone from 1 for 2) as cntrycode, + c_acctbal + from + customer + where + substring(c_phone from 1 for 2) in + ('24', '34', '16', '30', '33', '14', '13') + and c_acctbal > ( + select + avg(c_acctbal) + from + customer + where + c_acctbal > 0.00 + and substring(c_phone from 1 for 2) in + ('24', '34', '16', '30', '33', '14', '13') + ) + and not exists ( + select + * + from + orders + where + o_custkey = c_custkey + ) + ) as custsale +group by + cntrycode +order by + cntrycode; diff --git a/benchmarks/queries/tpch/q3.sql b/benchmarks/queries/tpch/q3.sql new file mode 100644 index 0000000000..b60be7ff69 --- /dev/null +++ b/benchmarks/queries/tpch/q3.sql @@ -0,0 +1,24 @@ +-- SQLBench-H query 3 derived from TPC-H query 3 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority +from + customer, + orders, + lineitem +where + c_mktsegment = 'BUILDING' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < date '1995-03-15' + and l_shipdate > date '1995-03-15' +group by + l_orderkey, + o_orderdate, + o_shippriority +order by + revenue desc, + o_orderdate limit 10; diff --git a/benchmarks/queries/tpch/q4.sql b/benchmarks/queries/tpch/q4.sql new file mode 100644 index 0000000000..05fae97af9 --- /dev/null +++ b/benchmarks/queries/tpch/q4.sql @@ -0,0 +1,23 @@ +-- SQLBench-H query 4 derived from TPC-H query 4 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + o_orderpriority, + count(*) as order_count +from + orders +where + o_orderdate >= date '1995-04-01' + and o_orderdate < date '1995-04-01' + interval '3' month + and exists ( + select + * + from + lineitem + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) +group by + o_orderpriority +order by + o_orderpriority; diff --git a/benchmarks/queries/tpch/q5.sql b/benchmarks/queries/tpch/q5.sql new file mode 100644 index 0000000000..4b97ef0e48 --- /dev/null +++ b/benchmarks/queries/tpch/q5.sql @@ -0,0 +1,26 @@ +-- SQLBench-H query 5 derived from TPC-H query 5 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + n_name, + sum(l_extendedprice * (1 - l_discount)) as revenue +from + customer, + orders, + lineitem, + supplier, + nation, + region +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'AFRICA' + and o_orderdate >= date '1994-01-01' + and o_orderdate < date '1994-01-01' + interval '1' year +group by + n_name +order by + revenue desc; diff --git a/benchmarks/queries/tpch/q6.sql b/benchmarks/queries/tpch/q6.sql new file mode 100644 index 0000000000..f5b4bae70e --- /dev/null +++ b/benchmarks/queries/tpch/q6.sql @@ -0,0 +1,11 @@ +-- SQLBench-H query 6 derived from TPC-H query 6 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + sum(l_extendedprice * l_discount) as revenue +from + lineitem +where + l_shipdate >= date '1994-01-01' + and l_shipdate < date '1994-01-01' + interval '1' year + and l_discount between 0.04 - 0.01 and 0.04 + 0.01 + and l_quantity < 24; diff --git a/benchmarks/queries/tpch/q7.sql b/benchmarks/queries/tpch/q7.sql new file mode 100644 index 0000000000..f3919be2db --- /dev/null +++ b/benchmarks/queries/tpch/q7.sql @@ -0,0 +1,41 @@ +-- SQLBench-H query 7 derived from TPC-H query 7 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue +from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + extract(year from l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = 'GERMANY' and n2.n_name = 'IRAQ') + or (n1.n_name = 'IRAQ' and n2.n_name = 'GERMANY') + ) + and l_shipdate between date '1995-01-01' and date '1996-12-31' + ) as shipping +group by + supp_nation, + cust_nation, + l_year +order by + supp_nation, + cust_nation, + l_year; diff --git a/benchmarks/queries/tpch/q8.sql b/benchmarks/queries/tpch/q8.sql new file mode 100644 index 0000000000..7c53e145e4 --- /dev/null +++ b/benchmarks/queries/tpch/q8.sql @@ -0,0 +1,39 @@ +-- SQLBench-H query 8 derived from TPC-H query 8 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + o_year, + sum(case + when nation = 'IRAQ' then volume + else 0 + end) / sum(volume) as mkt_share +from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = 'MIDDLE EAST' + and s_nationkey = n2.n_nationkey + and o_orderdate between date '1995-01-01' and date '1996-12-31' + and p_type = 'LARGE PLATED STEEL' + ) as all_nations +group by + o_year +order by + o_year; diff --git a/benchmarks/queries/tpch/q9.sql b/benchmarks/queries/tpch/q9.sql new file mode 100644 index 0000000000..2455695618 --- /dev/null +++ b/benchmarks/queries/tpch/q9.sql @@ -0,0 +1,34 @@ +-- SQLBench-H query 9 derived from TPC-H query 9 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + nation, + o_year, + sum(amount) as sum_profit +from + ( + select + n_name as nation, + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + part, + supplier, + lineitem, + partsupp, + orders, + nation + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%moccasin%' + ) as profit +group by + nation, + o_year +order by + nation, + o_year desc; diff --git a/benchmarks/run.py b/benchmarks/run.py new file mode 100755 index 0000000000..0632764edc --- /dev/null +++ b/benchmarks/run.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Unified benchmark runner wrapper. + +Reads .conf files, merges them with precedence (profile < engine < CLI), +then builds and executes the spark-submit command. + +Usage:: + + # = comet-tpch.sh + python benchmarks/run.py --engine comet --profile standalone-tpch \\ + --restart-cluster \\ + -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \\ + --output . --iterations 1 + + + + # = comet-tpch-iceberg.sh (dynamic catalog via --conf) + python benchmarks/run.py --engine comet-iceberg --profile standalone-tpch \\ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \\ + --conf spark.sql.catalog.local.type=hadoop \\ + --conf spark.sql.catalog.local.warehouse=$ICEBERG_WAREHOUSE \\ + --conf spark.sql.defaultCatalog=local \\ + --restart-cluster \\ + -- tpc --benchmark tpch --catalog local --database tpch \\ + --queries $TPCH_QUERIES --output . --iterations 1 + + # shuffle benchmark + python benchmarks/run.py --engine comet-jvm-shuffle --profile local \\ + -- shuffle --benchmark shuffle-hash --data /tmp/data --mode jvm \\ + --output . --iterations 3 +""" + +import argparse +import os +import subprocess +import sys + +# Allow importing from the repo root so ``from benchmarks.runner.config ...`` +# works when this script is run directly. +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_REPO_ROOT = os.path.dirname(_SCRIPT_DIR) +if _REPO_ROOT not in sys.path: + sys.path.insert(0, _REPO_ROOT) + +from benchmarks.runner.config import merge_configs, split_config + + +def _parse_args(): + """Parse wrapper-level arguments, splitting on ``--``.""" + parser = argparse.ArgumentParser( + description="Unified benchmark runner — builds and executes spark-submit", + usage=( + "%(prog)s --engine NAME [--profile NAME] " + "[--conf key=value ...] [--restart-cluster] " + "[--dry-run] -- SUITE_ARGS..." + ), + ) + parser.add_argument("--engine", required=True, help="Engine config name") + parser.add_argument("--profile", default=None, help="Profile config name") + parser.add_argument( + "--conf", action="append", default=[], + help="Extra key=value config override (repeatable)", + ) + parser.add_argument( + "--restart-cluster", action="store_true", + help="Stop and restart Spark standalone master + worker", + ) + parser.add_argument( + "--dry-run", action="store_true", + help="Print the spark-submit command without executing it", + ) + + # Split on "--": everything before goes to this parser, everything after + # is passed through to the benchmark suite CLI. 
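+    # For example (hypothetical invocation, continuing the comment above):
+    #   run.py --engine comet --profile local -- tpc --benchmark tpch
+    #   wrapper_args = ["--engine", "comet", "--profile", "local"]
+    #   suite_args   = ["tpc", "--benchmark", "tpch"]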
+ argv = sys.argv[1:] + if "--" in argv: + sep = argv.index("--") + wrapper_args = argv[:sep] + suite_args = argv[sep + 1:] + else: + wrapper_args = argv + suite_args = [] + + args = parser.parse_args(wrapper_args) + args.suite_args = suite_args + return args + + +def _resolve_conf_path(conf_dir, kind, name): + """Return the path to a .conf file, or exit with an error.""" + path = os.path.join(conf_dir, kind, f"{name}.conf") + if not os.path.isfile(path): + available = sorted( + f.removesuffix(".conf") + for f in os.listdir(os.path.join(conf_dir, kind)) + if f.endswith(".conf") + ) + print( + f"Error: {kind} config '{name}' not found at {path}\n" + f"Available: {', '.join(available)}", + file=sys.stderr, + ) + sys.exit(1) + return path + + +def _restart_cluster(): + """Stop and start Spark standalone master + worker.""" + spark_home = os.environ.get("SPARK_HOME") + if not spark_home: + print("Error: SPARK_HOME must be set for --restart-cluster", file=sys.stderr) + sys.exit(1) + spark_master = os.environ.get("SPARK_MASTER") + if not spark_master: + print("Error: SPARK_MASTER must be set for --restart-cluster", file=sys.stderr) + sys.exit(1) + + sbin = os.path.join(spark_home, "sbin") + print("Restarting Spark standalone cluster...") + subprocess.run([os.path.join(sbin, "stop-master.sh")], stderr=subprocess.DEVNULL, check=False) + subprocess.run([os.path.join(sbin, "stop-worker.sh")], stderr=subprocess.DEVNULL, check=False) + subprocess.check_call([os.path.join(sbin, "start-master.sh")]) + subprocess.check_call([os.path.join(sbin, "start-worker.sh"), spark_master]) + + +def main(): + args = _parse_args() + conf_dir = os.path.join(_SCRIPT_DIR, "conf") + + # Resolve config file paths + engine_path = _resolve_conf_path(conf_dir, "engines", args.engine) + profile_path = ( + _resolve_conf_path(conf_dir, "profiles", args.profile) + if args.profile else None + ) + + # Merge configs: profile < engine < CLI overrides + merged = merge_configs( + profile_path=profile_path, + engine_path=engine_path, + cli_overrides=args.conf, + ) + spark_conf, runner_conf = split_config(merged) + + # Export runner.env.* as environment variables + for key, value in runner_conf.items(): + if key.startswith("env."): + env_var = key[len("env."):] + os.environ[env_var] = value + print(f"Exported {env_var}={value}") + + # Restart cluster if requested + if args.restart_cluster: + _restart_cluster() + + # Build spark-submit command + spark_home = os.environ.get("SPARK_HOME", "") + if not spark_home: + print("Error: SPARK_HOME must be set", file=sys.stderr) + sys.exit(1) + + cmd = [os.path.join(spark_home, "bin", "spark-submit")] + + # Master + master = runner_conf.get("master") + if master: + cmd += ["--master", master] + + # JARs + jars = runner_conf.get("jars") + if jars: + cmd += ["--jars", jars] + cmd += ["--driver-class-path", jars.replace(",", ":")] + + # Spark configs + for key, value in spark_conf.items(): + cmd += ["--conf", f"{key}={value}"] + + # Python script (the CLI entry point) + cmd.append(os.path.join(_SCRIPT_DIR, "runner", "cli.py")) + + # Inject --name from runner.name if not already in suite args. + # Insert after the first positional arg (the subcommand) so that + # argparse routes it to the correct subparser. 
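+    # For example, with engine "comet" (illustrative values):
+    #   ["tpc", "--benchmark", "tpch"]
+    #     -> ["tpc", "--name", "comet", "--benchmark", "tpch"]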
+ runner_name = runner_conf.get("name", args.engine) + suite_args = list(args.suite_args) + if "--name" not in suite_args and suite_args: + suite_args.insert(1, "--name") + suite_args.insert(2, runner_name) + + cmd += suite_args + + # Print and execute + print() + print("spark-submit command:") + print(f" {' '.join(cmd)}") + print() + + if args.dry_run: + return + + os.execvp(cmd[0], cmd) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/runner/__init__.py b/benchmarks/runner/__init__.py new file mode 100644 index 0000000000..0ccbeeeafb --- /dev/null +++ b/benchmarks/runner/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/benchmarks/runner/cli.py b/benchmarks/runner/cli.py new file mode 100644 index 0000000000..5394cf9599 --- /dev/null +++ b/benchmarks/runner/cli.py @@ -0,0 +1,315 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Single CLI entry point for the unified benchmark runner. + +Designed to be the Python script passed to ``spark-submit``. Subcommands +correspond to benchmark suites (currently: ``tpc``, ``shuffle``, ``micro``). + +Usage (via spark-submit):: + + spark-submit ... benchmarks/runner/cli.py tpc --benchmark tpch --data /path ... + spark-submit ... benchmarks/runner/cli.py shuffle --benchmark shuffle-hash --data /path ... + spark-submit ... benchmarks/runner/cli.py micro --benchmark string-expressions --output ... +""" + +import argparse +import json +import os +import sys + +from benchmarks.runner.spark_session import create_session +from benchmarks.suites import tpc +from benchmarks.suites import shuffle +from benchmarks.suites import micro + + +# --------------------------------------------------------------------------- +# Profiling helpers +# --------------------------------------------------------------------------- + +def _maybe_start_profiler(spark, args): + """Start profiler if ``--profile`` was passed. 
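+
+    The profiler samples executor metrics from the Spark REST API in a
+    background daemon thread (see ``runner/profiling.py``), so starting it
+    does not block the benchmark queries themselves.
+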
Returns profiler or None.""" + if not getattr(args, "profile", False): + return None + from benchmarks.runner.profiling import SparkMetricsProfiler + + interval = getattr(args, "profile_interval", 2.0) + profiler = SparkMetricsProfiler(spark, interval_secs=interval) + profiler.start() + return profiler + + +def _maybe_stop_profiler(profiler, output_dir, name, benchmark): + """Stop profiler and write CSV if active.""" + if profiler is None: + return + profiler.stop() + csv_path = os.path.join(output_dir, f"{name}-{benchmark}-metrics.csv") + profiler.write_csv(csv_path) + + +def _add_profiling_args(parser): + """Add common profiling flags to a subparser.""" + parser.add_argument( + "--profile", action="store_true", + help="Enable Level 1 JVM metrics profiling via Spark REST API", + ) + parser.add_argument( + "--profile-interval", type=float, default=2.0, + help="Profiling poll interval in seconds (default: 2.0)", + ) + + +# --------------------------------------------------------------------------- +# TPC subcommand +# --------------------------------------------------------------------------- + +def _add_tpc_subparser(subparsers): + """Register the ``tpc`` subcommand with the same args as tpcbench.py.""" + p = subparsers.add_parser( + "tpc", + help="Run TPC-H or TPC-DS benchmarks", + description="TPC-H/TPC-DS benchmark runner for files or Iceberg tables", + ) + p.add_argument("--benchmark", required=True, help="tpch or tpcds") + + source = p.add_mutually_exclusive_group(required=True) + source.add_argument("--data", help="Path to data files") + source.add_argument("--catalog", help="Iceberg catalog name") + + p.add_argument( + "--format", default="parquet", + help="Input file format: parquet, csv, json (only with --data)", + ) + p.add_argument( + "--options", type=json.loads, default={}, + help='Spark reader options as JSON, e.g. 
\'{"header": "true"}\'', + ) + p.add_argument( + "--database", default="tpch", + help="Database containing TPC tables (only with --catalog)", + ) + p.add_argument("--queries", required=True, help="Path to query SQL files") + p.add_argument("--iterations", type=int, default=1, help="Number of iterations") + p.add_argument("--output", required=True, help="Directory for results JSON") + p.add_argument("--name", required=True, help="Prefix for result file") + p.add_argument("--query", type=int, help="Run a single query (1-based)") + p.add_argument("--write", help="Path to save query results as Parquet") + _add_profiling_args(p) + + +def _run_tpc(args): + """Execute the TPC suite.""" + spark = create_session( + app_name=f"{args.name} benchmark derived from {args.benchmark}", + spark_conf={}, # configs already set by spark-submit + ) + + profiler = _maybe_start_profiler(spark, args) + + using_iceberg = tpc.register_tables( + spark, + benchmark=args.benchmark, + data_path=args.data, + catalog=args.catalog, + database=args.database, + file_format=args.format, + reader_options=args.options, + ) + + timings = tpc.run_queries( + spark, + benchmark=args.benchmark, + query_path=args.queries, + iterations=args.iterations, + query_num=args.query, + write_path=args.write, + ) + + results = tpc.build_results( + spark, + benchmark=args.benchmark, + query_path=args.queries, + data_path=args.data, + catalog=args.catalog, + database=args.database, + using_iceberg=using_iceberg, + name=args.name, + timings=timings, + ) + + tpc.write_results(results, args.output, args.name, args.benchmark) + _maybe_stop_profiler(profiler, args.output, args.name, args.benchmark) + spark.stop() + + +# --------------------------------------------------------------------------- +# Shuffle subcommand +# --------------------------------------------------------------------------- + +def _add_shuffle_subparser(subparsers): + """Register the ``shuffle`` subcommand.""" + p = subparsers.add_parser( + "shuffle", + help="Run shuffle benchmarks (hash, round-robin)", + description=( + "Shuffle benchmark runner. Tests different partitioning strategies " + "across Spark, Comet JVM, and Comet Native shuffle implementations." 
+ ), + ) + p.add_argument( + "--benchmark", required=True, + choices=list(shuffle.BENCHMARKS), + help="Shuffle benchmark to run", + ) + p.add_argument("--data", required=True, help="Path to input parquet data") + p.add_argument( + "--mode", required=True, + choices=["spark", "jvm", "native"], + help="Shuffle mode being tested", + ) + p.add_argument( + "--partitions", type=int, default=200, + help="Number of shuffle partitions (default: 200)", + ) + p.add_argument("--iterations", type=int, default=1, help="Number of iterations") + p.add_argument("--output", required=True, help="Directory for results JSON") + p.add_argument("--name", required=True, help="Prefix for result file") + _add_profiling_args(p) + + +def _run_shuffle(args): + """Execute the shuffle suite.""" + spark = create_session( + app_name=f"{args.name}-{args.benchmark}-{args.mode.upper()}", + spark_conf={}, # configs already set by spark-submit + ) + + profiler = _maybe_start_profiler(spark, args) + + timings = shuffle.run_shuffle( + spark, + benchmark=args.benchmark, + data_path=args.data, + mode=args.mode, + num_partitions=args.partitions, + iterations=args.iterations, + ) + + results = shuffle.build_results( + spark, + benchmark=args.benchmark, + data_path=args.data, + mode=args.mode, + name=args.name, + timings=timings, + ) + + shuffle.write_results(results, args.output, args.name, args.benchmark) + _maybe_stop_profiler(profiler, args.output, args.name, args.benchmark) + spark.stop() + + +# --------------------------------------------------------------------------- +# Micro subcommand +# --------------------------------------------------------------------------- + +def _add_micro_subparser(subparsers): + """Register the ``micro`` subcommand.""" + p = subparsers.add_parser( + "micro", + help="Run expression-level microbenchmarks", + description=( + "Microbenchmark runner. Generates a small dataset and times " + "individual SQL expressions." 
+ ), + ) + p.add_argument( + "--benchmark", required=True, + choices=list(micro.BENCHMARKS), + help="Microbenchmark to run", + ) + p.add_argument( + "--rows", type=int, default=1024, + help="Number of rows for data generation (default: 1024)", + ) + p.add_argument("--iterations", type=int, default=3, help="Number of iterations") + p.add_argument("--expression", help="Run a single expression by name") + p.add_argument("--output", required=True, help="Directory for results JSON") + p.add_argument("--name", required=True, help="Prefix for result file") + _add_profiling_args(p) + + +def _run_micro(args): + """Execute the micro suite.""" + spark = create_session( + app_name=f"{args.name}-{args.benchmark}", + spark_conf={}, # configs already set by spark-submit + ) + + profiler = _maybe_start_profiler(spark, args) + + timings = micro.run_micro( + spark, + benchmark=args.benchmark, + num_rows=args.rows, + iterations=args.iterations, + expression=args.expression, + ) + + results = micro.build_results( + spark, + benchmark=args.benchmark, + name=args.name, + timings=timings, + ) + + micro.write_results(results, args.output, args.name, args.benchmark) + _maybe_stop_profiler(profiler, args.output, args.name, args.benchmark) + spark.stop() + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main(argv=None): + parser = argparse.ArgumentParser( + prog="benchmark-runner", + description="Unified benchmark runner for Apache DataFusion Comet", + ) + subparsers = parser.add_subparsers(dest="suite", required=True) + _add_tpc_subparser(subparsers) + _add_shuffle_subparser(subparsers) + _add_micro_subparser(subparsers) + + args = parser.parse_args(argv) + + if args.suite == "tpc": + _run_tpc(args) + elif args.suite == "shuffle": + _run_shuffle(args) + elif args.suite == "micro": + _run_micro(args) + else: + parser.error(f"Unknown suite: {args.suite}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/runner/config.py b/benchmarks/runner/config.py new file mode 100644 index 0000000000..ff03d622d3 --- /dev/null +++ b/benchmarks/runner/config.py @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Config loader for the unified benchmark runner. + +Reads key=value .conf files, merges them with precedence +(profile < engine < CLI overrides), and splits into spark vs runner configs. + +The ``runner.*`` namespace controls the shell wrapper (JAR paths, env vars, +result name) without colliding with Spark config keys. 
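+
+Any key outside the ``runner.`` namespace is treated as a Spark config and is
+forwarded to spark-submit as a ``--conf key=value`` pair.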
Examples: + runner.jars=${COMET_JAR} + runner.env.TZ=UTC + runner.name=comet +""" + +import os +import re +from typing import Dict, List, Tuple + + +def load_conf_file(path: str) -> Dict[str, str]: + """Read a key=value .conf file. + + - Blank lines and lines starting with ``#`` are skipped. + - ``${VAR}`` references are expanded from the environment. + - Values may optionally be quoted with single or double quotes. + """ + conf: Dict[str, str] = {} + with open(path) as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + key, _, value = line.partition("=") + key = key.strip() + value = value.strip() + if not key or not _: + continue + # Strip optional quotes + if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"): + value = value[1:-1] + # Expand ${VAR} references from environment + value = re.sub( + r"\$\{(\w+)\}", + lambda m: os.environ.get(m.group(1), m.group(0)), + value, + ) + conf[key] = value + return conf + + +def merge_configs( + profile_path: str = None, + engine_path: str = None, + cli_overrides: List[str] = None, +) -> Dict[str, str]: + """Merge configs with precedence: profile < engine < CLI overrides.""" + merged: Dict[str, str] = {} + if profile_path: + merged.update(load_conf_file(profile_path)) + if engine_path: + merged.update(load_conf_file(engine_path)) + for override in cli_overrides or []: + key, _, value = override.partition("=") + key = key.strip() + value = value.strip() + if key and _: + # Expand ${VAR} in CLI overrides too + value = re.sub( + r"\$\{(\w+)\}", + lambda m: os.environ.get(m.group(1), m.group(0)), + value, + ) + merged[key] = value + return merged + + +def split_config(merged: Dict[str, str]) -> Tuple[Dict[str, str], Dict[str, str]]: + """Separate ``runner.*`` keys from ``spark.*`` (and other) keys. + + Returns (spark_conf, runner_conf) where runner_conf has the + ``runner.`` prefix stripped. + """ + spark_conf: Dict[str, str] = {} + runner_conf: Dict[str, str] = {} + for key, value in merged.items(): + if key.startswith("runner."): + runner_conf[key[len("runner."):]] = value + else: + spark_conf[key] = value + return spark_conf, runner_conf diff --git a/benchmarks/runner/profiling.py b/benchmarks/runner/profiling.py new file mode 100644 index 0000000000..98aa56d493 --- /dev/null +++ b/benchmarks/runner/profiling.py @@ -0,0 +1,179 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Level 1 profiling hooks: JVM metrics via the Spark REST API. + +Polls ``/api/v1/applications/{appId}/executors`` at a configurable interval +and records executor memory metrics as a time-series CSV alongside the +benchmark results. + +Usage:: + + profiler = SparkMetricsProfiler(spark, interval_secs=2) + profiler.start() + # ... run benchmark ... 
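+    # (any Spark action run here is captured, e.g. df.collect())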
+ profiler.stop() + profiler.write_csv("/path/to/output/metrics.csv") +""" + +import csv +import threading +import time +from typing import Any, Dict, List, Optional + +from pyspark.sql import SparkSession + +try: + from urllib.request import urlopen + import json as _json + + def _fetch_json(url: str) -> Any: + with urlopen(url, timeout=5) as resp: + return _json.loads(resp.read().decode()) +except ImportError: + _fetch_json = None # type: ignore[assignment] + + +# Metrics we extract per executor from the REST API response +_EXECUTOR_METRICS = [ + "memoryUsed", + "maxMemory", + "totalOnHeapStorageMemory", + "usedOnHeapStorageMemory", + "totalOffHeapStorageMemory", + "usedOffHeapStorageMemory", +] + +# Metrics nested under peakMemoryMetrics (if available) +_PEAK_MEMORY_METRICS = [ + "JVMHeapMemory", + "JVMOffHeapMemory", + "OnHeapExecutionMemory", + "OffHeapExecutionMemory", + "OnHeapStorageMemory", + "OffHeapStorageMemory", + "OnHeapUnifiedMemory", + "OffHeapUnifiedMemory", + "ProcessTreeJVMRSSMemory", +] + + +class SparkMetricsProfiler: + """Periodically polls executor metrics from the Spark REST API.""" + + def __init__( + self, + spark: SparkSession, + interval_secs: float = 2.0, + ): + self._spark = spark + self._interval = interval_secs + self._samples: List[Dict[str, Any]] = [] + self._stop_event = threading.Event() + self._thread: Optional[threading.Thread] = None + self._start_time: float = 0.0 + + @property + def samples(self) -> List[Dict[str, Any]]: + """Return collected samples (each is a flat dict).""" + return list(self._samples) + + def _ui_url(self) -> Optional[str]: + """Return the Spark UI base URL, or None if unavailable.""" + url = self._spark.sparkContext.uiWebUrl + if url: + return url.rstrip("/") + return None + + def _app_id(self) -> str: + return self._spark.sparkContext.applicationId + + def _poll_once(self) -> None: + """Fetch executor metrics and append a timestamped sample.""" + base = self._ui_url() + if base is None or _fetch_json is None: + return + + url = f"{base}/api/v1/applications/{self._app_id()}/executors" + try: + executors = _fetch_json(url) + except Exception: + return + + elapsed = time.time() - self._start_time + for exc in executors: + row: Dict[str, Any] = { + "elapsed_secs": round(elapsed, 2), + "executor_id": exc.get("id", ""), + "is_active": exc.get("isActive", True), + } + for key in _EXECUTOR_METRICS: + row[key] = exc.get(key, 0) + + peak = exc.get("peakMemoryMetrics", {}) + for key in _PEAK_MEMORY_METRICS: + row[f"peak_{key}"] = peak.get(key, 0) + + self._samples.append(row) + + def _run(self) -> None: + """Background polling loop.""" + while not self._stop_event.is_set(): + self._poll_once() + self._stop_event.wait(self._interval) + + def start(self) -> None: + """Start background polling thread.""" + if self._thread is not None: + return + self._start_time = time.time() + self._stop_event.clear() + self._thread = threading.Thread( + target=self._run, name="spark-metrics-profiler", daemon=True + ) + self._thread.start() + print( + f"Profiler started (interval={self._interval}s, " + f"ui={self._ui_url()})" + ) + + def stop(self) -> None: + """Stop the polling thread and collect a final sample.""" + if self._thread is None: + return + self._stop_event.set() + self._thread.join(timeout=self._interval + 2) + self._thread = None + # One last poll to capture final state + self._poll_once() + print(f"Profiler stopped ({len(self._samples)} samples collected)") + + def write_csv(self, path: str) -> str: + """Write collected samples to a CSV 
file. Returns the path.""" + if not self._samples: + print("Profiler: no samples to write") + return path + + fieldnames = list(self._samples[0].keys()) + with open(path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + for row in self._samples: + writer.writerow(row) + print(f"Profiler: wrote {len(self._samples)} samples to {path}") + return path diff --git a/benchmarks/runner/spark_session.py b/benchmarks/runner/spark_session.py new file mode 100644 index 0000000000..f2e21a464a --- /dev/null +++ b/benchmarks/runner/spark_session.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Thin wrapper around SparkSession.builder.""" + +from typing import Dict + +from pyspark.sql import SparkSession + + +def create_session(app_name: str, spark_conf: Dict[str, str]) -> SparkSession: + """Create (or retrieve) a SparkSession with the given config. + + When launched via spark-submit the configs are already set; this just + picks up the existing session. + """ + builder = SparkSession.builder.appName(app_name) + for key, value in spark_conf.items(): + builder = builder.config(key, value) + return builder.getOrCreate() diff --git a/benchmarks/suites/MICRO.md b/benchmarks/suites/MICRO.md new file mode 100644 index 0000000000..41c5fa2bc0 --- /dev/null +++ b/benchmarks/suites/MICRO.md @@ -0,0 +1,108 @@ + + +# Microbenchmark Suite + +Runs expression-level microbenchmarks that generate a small in-memory dataset +and time individual SQL expressions. Currently supports the string expression +benchmark (ported from `CometStringExpressionBenchmark.scala`). + +## Arguments + +| Argument | Required | Default | Description | +| -------------------- | -------- | ------- | ---------------------------------------------- | +| `--benchmark` | yes | | `string-expressions` | +| `--rows` | no | `1024` | Number of rows for data generation | +| `--iterations` | no | `3` | Number of timed iterations per expression | +| `--expression` | no | | Run a single expression by name | +| `--output` | yes | | Directory for results JSON | +| `--name` | auto | | Result file prefix (auto-injected by `run.py`) | +| `--profile` | no | | Enable JVM metrics profiling | +| `--profile-interval` | no | `2.0` | Profiling poll interval in seconds | + +## Examples + +### String expressions with Comet + +```bash +python benchmarks/run.py --engine comet --profile local \ + -- micro --benchmark string-expressions --output . --iterations 3 +``` + +### String expressions with vanilla Spark (baseline) + +```bash +python benchmarks/run.py --engine spark --profile local \ + -- micro --benchmark string-expressions --output . 
--iterations 3 +``` + +### String expressions with Gluten + +```bash +python benchmarks/run.py --engine gluten --profile local \ + -- micro --benchmark string-expressions --output . --iterations 3 +``` + +### Run a single expression + +```bash +python benchmarks/run.py --engine comet --profile local \ + -- micro --benchmark string-expressions --output . --expression ascii +``` + +### Compare results across engines + +```bash +# Run each engine +for engine in comet spark gluten; do + python benchmarks/run.py --engine $engine --profile local \ + -- micro --benchmark string-expressions --output . --iterations 3 +done + +# Generate comparison chart +python -m benchmarks.analysis.compare \ + comet-string-expressions-*.json spark-string-expressions-*.json \ + --labels Comet Spark --benchmark string-expressions +``` + +## Output Format + +Results are written as JSON with the filename `{name}-{benchmark}-{timestamp_millis}.json`: + +```json +{ + "engine": "datafusion-comet", + "benchmark": "string-expressions", + "spark_conf": { ... }, + "ascii": [0.12, 0.10, 0.08], + "bit_length": [0.05, 0.04, 0.04], + "lower": [0.15, 0.11, 0.07], + ... +} +``` + +Expression names are top-level keys, each mapping to a list of elapsed seconds +per iteration. This format is directly compatible with `analysis/compare.py`. + +## Available Expressions (string-expressions) + +ascii, bit_length, btrim, chr, concat, concat_ws, contains, endswith, initcap, +instr, length, like, lower, lpad, ltrim, octet_length, regexp_replace, repeat, +replace, reverse, rlike, rpad, rtrim, space, startswith, substring, translate, +trim, upper. diff --git a/benchmarks/suites/SHUFFLE.md b/benchmarks/suites/SHUFFLE.md new file mode 100644 index 0000000000..222fa4cc82 --- /dev/null +++ b/benchmarks/suites/SHUFFLE.md @@ -0,0 +1,132 @@ + + +# Shuffle Benchmark Suite + +Compares shuffle file sizes and performance across Spark, Comet JVM, and +Comet Native shuffle implementations using hash or round-robin partitioning. + +## Arguments + +| Argument | Required | Default | Description | +| -------------------- | -------- | ------- | -------------------------------------- | +| `--benchmark` | yes | | `shuffle-hash` or `shuffle-roundrobin` | +| `--data` | yes | | Path to input Parquet data | +| `--mode` | yes | | `spark`, `jvm`, or `native` | +| `--partitions` | no | `200` | Number of shuffle partitions | +| `--iterations` | no | `1` | Number of iterations | +| `--output` | yes | | Directory for results JSON | +| `--name` | auto | | Result file prefix (auto-injected) | +| `--profile` | no | | Enable JVM metrics profiling | +| `--profile-interval` | no | `2.0` | Profiling poll interval in seconds | + +## Generating Test Data + +Generate a Parquet dataset with a wide schema (100 columns including deeply +nested structs, arrays, and maps): + +```bash +$SPARK_HOME/bin/spark-submit \ + --master $SPARK_MASTER \ + --executor-memory 16g \ + benchmarks/generate_shuffle_data.py \ + --output /tmp/shuffle-benchmark-data \ + --rows 10000000 \ + --partitions 200 +``` + +> **Note**: The data generation script is a standalone PySpark job. It can be +> run with any Spark installation — no engine JARs required. + +## Examples + +### Hash shuffle — Spark baseline + +```bash +python benchmarks/run.py --engine spark-shuffle --profile local \ + -- shuffle --benchmark shuffle-hash --data /tmp/shuffle-data \ + --mode spark --output . 
--iterations 3 +``` + +### Hash shuffle — Comet JVM + +```bash +python benchmarks/run.py --engine comet-jvm-shuffle --profile local \ + -- shuffle --benchmark shuffle-hash --data /tmp/shuffle-data \ + --mode jvm --output . --iterations 3 +``` + +### Hash shuffle — Comet Native + +```bash +python benchmarks/run.py --engine comet-native-shuffle --profile local \ + -- shuffle --benchmark shuffle-hash --data /tmp/shuffle-data \ + --mode native --output . --iterations 3 +``` + +### Round-robin shuffle + +```bash +python benchmarks/run.py --engine comet-native-shuffle --profile local \ + -- shuffle --benchmark shuffle-roundrobin --data /tmp/shuffle-data \ + --mode native --output . --iterations 3 +``` + +### Run all three modes back-to-back + +```bash +for engine_mode in "spark-shuffle spark" "comet-jvm-shuffle jvm" "comet-native-shuffle native"; do + set -- $engine_mode + python benchmarks/run.py --engine "$1" --profile local \ + -- shuffle --benchmark shuffle-hash --data /tmp/shuffle-data \ + --mode "$2" --output . --iterations 3 +done +``` + +### With profiling + +```bash +python benchmarks/run.py --engine comet-native-shuffle --profile local \ + -- shuffle --benchmark shuffle-hash --data /tmp/shuffle-data \ + --mode native --output . --iterations 3 --profile --profile-interval 1.0 +``` + +## Output Format + +Results are written as JSON with the filename `{name}-{benchmark}-{timestamp_millis}.json`: + +```json +{ + "engine": "datafusion-comet", + "benchmark": "shuffle-hash", + "mode": "native", + "data_path": "/tmp/shuffle-data", + "spark_conf": { ... }, + "shuffle-hash": [ + {"duration_ms": 12345, "row_count": 10000000, "num_partitions": 200}, + {"duration_ms": 11234, "row_count": 10000000, "num_partitions": 200}, + {"duration_ms": 10987, "row_count": 10000000, "num_partitions": 200} + ] +} +``` + +## Checking Results + +Open the Spark UI (default: http://localhost:4040) during each benchmark run +to compare shuffle write sizes in the Stages tab. diff --git a/benchmarks/suites/TPC.md b/benchmarks/suites/TPC.md new file mode 100644 index 0000000000..7e7ff299b7 --- /dev/null +++ b/benchmarks/suites/TPC.md @@ -0,0 +1,139 @@ + + +# TPC-H / TPC-DS Benchmark Suite + +Runs TPC-H (22 queries) or TPC-DS (99 queries) benchmarks against Parquet +files or Iceberg tables. + +## Arguments + +| Argument | Required | Default | Description | +| -------------------- | -------- | --------- | ------------------------------------------------------- | +| `--benchmark` | yes | | `tpch` or `tpcds` | +| `--data` | \* | | Path to Parquet data files | +| `--catalog` | \* | | Iceberg catalog name (mutually exclusive with `--data`) | +| `--database` | no | `tpch` | Database name (only with `--catalog`) | +| `--format` | no | `parquet` | File format: parquet, csv, json (only with `--data`) | +| `--options` | no | `{}` | Spark reader options as JSON string | +| `--queries` | yes | | Path to directory containing `q1.sql` ... `qN.sql` | +| `--iterations` | no | `1` | Number of times to run all queries | +| `--output` | yes | | Directory for results JSON | +| `--name` | auto | | Result file prefix (auto-injected from engine config) | +| `--query` | no | | Run a single query number (1-based) | +| `--write` | no | | Write query results as Parquet to this path | +| `--profile` | no | | Enable JVM metrics profiling | +| `--profile-interval` | no | `2.0` | Profiling poll interval in seconds | + +`*` Either `--data` or `--catalog` is required, but not both. 
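+
+For example, `--format` and `--options` together let the suite read
+non-Parquet sources; a hypothetical CSV run might look like:
+
+```bash
+python benchmarks/run.py --engine comet --profile local \
+    -- tpc --benchmark tpch --data $TPCH_CSV_DATA --format csv \
+    --options '{"header": "true"}' --queries $TPCH_QUERIES \
+    --output . --iterations 1
+```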
+ +## Examples + +### TPC-H with Comet (standalone cluster) + +```bash +export SPARK_HOME=/opt/spark +export SPARK_MASTER=spark://hostname:7077 +export COMET_JAR=/path/to/comet.jar +export TPCH_DATA=/mnt/bigdata/tpch/sf100 +export TPCH_QUERIES=/mnt/bigdata/tpch/queries + +python benchmarks/run.py \ + --engine comet --profile standalone-tpch --restart-cluster \ + -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \ + --output . --iterations 1 +``` + +### TPC-H with vanilla Spark (baseline) + +```bash +python benchmarks/run.py \ + --engine spark --profile standalone-tpch --restart-cluster \ + -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \ + --output . --iterations 1 +``` + +### TPC-H with Iceberg tables + +First, create Iceberg tables from Parquet data: + +```bash +export ICEBERG_JAR=/path/to/iceberg-spark-runtime-3.5_2.12-1.8.1.jar +export ICEBERG_WAREHOUSE=/mnt/bigdata/iceberg-warehouse + +$SPARK_HOME/bin/spark-submit \ + --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1 \ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.local.type=hadoop \ + --conf spark.sql.catalog.local.warehouse=$ICEBERG_WAREHOUSE \ + benchmarks/create-iceberg-tpch.py \ + --parquet-path $TPCH_DATA --catalog local --database tpch +``` + +Then run the benchmark with Comet's native Iceberg scanning: + +```bash +python benchmarks/run.py \ + --engine comet-iceberg --profile standalone-tpch \ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.local.type=hadoop \ + --conf spark.sql.catalog.local.warehouse=$ICEBERG_WAREHOUSE \ + --conf spark.sql.defaultCatalog=local \ + --restart-cluster \ + -- tpc --benchmark tpch --catalog local --database tpch \ + --queries $TPCH_QUERIES --output . --iterations 1 +``` + +### Run a single query + +```bash +python benchmarks/run.py --engine comet --profile local \ + -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \ + --output . --query 1 +``` + +## Output Format + +Results are written as JSON with the filename `{name}-{benchmark}-{timestamp_millis}.json`: + +```json +{ + "engine": "datafusion-comet", + "benchmark": "tpch", + "query_path": "/path/to/queries", + "spark_conf": { ... }, + "data_path": "/path/to/data", + "1": [12.34], + "2": [5.67], + ... +} +``` + +Query keys are integers (serialised as strings by `json.dumps`). Each value +is a list of elapsed seconds per iteration. This format is compatible with +`analysis/compare.py` for chart generation. + +## Comparing Results + +```bash +python -m benchmarks.analysis.compare \ + comet-tpch-*.json spark-tpch-*.json \ + --labels Comet Spark --benchmark tpch \ + --title "TPC-H SF100" --output-dir ./charts +``` diff --git a/benchmarks/suites/__init__.py b/benchmarks/suites/__init__.py new file mode 100644 index 0000000000..0ccbeeeafb --- /dev/null +++ b/benchmarks/suites/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/benchmarks/suites/micro.py b/benchmarks/suites/micro.py new file mode 100644 index 0000000000..58c7a22283 --- /dev/null +++ b/benchmarks/suites/micro.py @@ -0,0 +1,172 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Microbenchmark suite. + +Ports expression-level benchmarks (e.g. CometStringExpressionBenchmark) to the +unified runner. Each benchmark generates a small dataset, runs SQL expressions +in a tight loop, and records per-iteration wall-clock times. +""" + +import json +import os +import shutil +import tempfile +import time +from datetime import datetime +from typing import Dict, List, Optional + +from pyspark.sql import SparkSession + + +# --------------------------------------------------------------------------- +# String expression benchmark +# --------------------------------------------------------------------------- + +STRING_EXPRESSIONS: List[tuple] = [ + ("ascii", "select ascii(c1) from parquetV1Table"), + ("bit_length", "select bit_length(c1) from parquetV1Table"), + ("btrim", "select btrim(c1) from parquetV1Table"), + ("chr", "select chr(c1) from parquetV1Table"), + ("concat", "select concat(c1, c1) from parquetV1Table"), + ("concat_ws", "select concat_ws(' ', c1, c1) from parquetV1Table"), + ("contains", "select contains(c1, '123') from parquetV1Table"), + ("endswith", "select endswith(c1, '9') from parquetV1Table"), + ("initcap", "select initCap(c1) from parquetV1Table"), + ("instr", "select instr(c1, '123') from parquetV1Table"), + ("length", "select length(c1) from parquetV1Table"), + ("like", "select c1 like '%123%' from parquetV1Table"), + ("lower", "select lower(c1) from parquetV1Table"), + ("lpad", "select lpad(c1, 150, 'x') from parquetV1Table"), + ("ltrim", "select ltrim(c1) from parquetV1Table"), + ("octet_length", "select octet_length(c1) from parquetV1Table"), + ("regexp_replace", "select regexp_replace(c1, '[0-9]', 'X') from parquetV1Table"), + ("repeat", "select repeat(c1, 3) from parquetV1Table"), + ("replace", "select replace(c1, '123', 'ab') from parquetV1Table"), + ("reverse", "select reverse(c1) from parquetV1Table"), + ("rlike", "select c1 rlike '[0-9]+' from parquetV1Table"), + ("rpad", "select rpad(c1, 150, 'x') from parquetV1Table"), + ("rtrim", "select rtrim(c1) from parquetV1Table"), + ("space", "select space(2) from parquetV1Table"), + ("startswith", "select startswith(c1, '1') 
from parquetV1Table"), + ("substring", "select substring(c1, 1, 100) from parquetV1Table"), + ("translate", "select translate(c1, '123456', 'aBcDeF') from parquetV1Table"), + ("trim", "select trim(c1) from parquetV1Table"), + ("upper", "select upper(c1) from parquetV1Table"), +] + +BENCHMARKS = { + "string-expressions": "String expression microbenchmarks (29 expressions)", +} + + +def prepare_string_table( + spark: SparkSession, num_rows: int, temp_dir: str +) -> None: + """Generate a string column table and register it as ``parquetV1Table``.""" + path = os.path.join(temp_dir, "string_data") + spark.range(num_rows).selectExpr( + "REPEAT(CAST(id AS STRING), 10) AS c1" + ).write.mode("overwrite").option("compression", "snappy").parquet(path) + spark.read.parquet(path).createOrReplaceTempView("parquetV1Table") + print(f"Generated {num_rows} rows in {path}") + + +def run_micro( + spark: SparkSession, + benchmark: str, + num_rows: int = 1024, + iterations: int = 3, + expression: Optional[str] = None, +) -> Dict[str, List[float]]: + """Run a microbenchmark and return ``{expr_name: [elapsed_secs, ...]}``.""" + if benchmark != "string-expressions": + raise ValueError( + f"Unknown micro benchmark: {benchmark}. " + f"Available: {', '.join(BENCHMARKS)}" + ) + + temp_dir = tempfile.mkdtemp(prefix="comet-micro-") + try: + prepare_string_table(spark, num_rows, temp_dir) + + expressions = STRING_EXPRESSIONS + if expression is not None: + expressions = [(n, sql) for n, sql in expressions if n == expression] + if not expressions: + valid = [n for n, _ in STRING_EXPRESSIONS] + raise ValueError( + f"Unknown expression: {expression}. Valid: {', '.join(valid)}" + ) + + timings: Dict[str, List[float]] = {} + + for expr_name, sql in expressions: + print(f"\n{'=' * 60}") + print(f"Expression: {expr_name}") + print(f"{'=' * 60}") + + for iteration in range(iterations): + spark.sparkContext.setJobDescription( + f"{benchmark} {expr_name} iter{iteration + 1}" + ) + start = time.time() + spark.sql(sql).foreach(lambda _: None) + elapsed = time.time() - start + print(f" Iteration {iteration + 1}: {elapsed:.4f}s") + timings.setdefault(expr_name, []).append(elapsed) + + return timings + finally: + shutil.rmtree(temp_dir, ignore_errors=True) + + +def build_results( + spark: SparkSession, + benchmark: str, + name: str, + timings: Dict[str, List[float]], +) -> Dict: + """Assemble the result dict for micro benchmarks.""" + conf_dict = {k: v for k, v in spark.sparkContext.getConf().getAll()} + + results: Dict = { + "engine": "datafusion-comet", + "benchmark": benchmark, + "spark_conf": conf_dict, + } + for expr_name, elapsed_list in timings.items(): + results[expr_name] = elapsed_list + + return results + + +def write_results( + results: Dict, + output_dir: str, + name: str, + benchmark: str, +) -> str: + """Write JSON results file. Returns the path written.""" + result_str = json.dumps(results, indent=4) + current_time_millis = int(datetime.now().timestamp() * 1000) + results_path = f"{output_dir}/{name}-{benchmark}-{current_time_millis}.json" + print(f"\nWriting results to {results_path}") + with open(results_path, "w") as f: + f.write(result_str) + return results_path diff --git a/benchmarks/suites/shuffle.py b/benchmarks/suites/shuffle.py new file mode 100644 index 0000000000..adabecf2be --- /dev/null +++ b/benchmarks/suites/shuffle.py @@ -0,0 +1,156 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Shuffle benchmark suite. + +Tests different partitioning strategies (hash, round-robin) across Spark, +Comet JVM, and Comet Native shuffle implementations. +""" + +import json +import time +from datetime import datetime +from typing import Any, Dict, List, Optional + +from pyspark.sql import DataFrame, SparkSession + + +BENCHMARKS = { + "shuffle-hash": "Shuffle all columns using hash partitioning on group_key", + "shuffle-roundrobin": "Shuffle all columns using round-robin partitioning", +} + + +def _repartition( + df: DataFrame, benchmark: str, num_partitions: int +) -> DataFrame: + """Apply the partitioning strategy for the given benchmark.""" + if benchmark == "shuffle-hash": + return df.repartition(num_partitions, "group_key") + elif benchmark == "shuffle-roundrobin": + return df.repartition(num_partitions) + else: + raise ValueError( + f"Unknown shuffle benchmark: {benchmark}. " + f"Available: {', '.join(BENCHMARKS)}" + ) + + +def run_shuffle( + spark: SparkSession, + benchmark: str, + data_path: str, + mode: str, + num_partitions: int = 200, + iterations: int = 1, +) -> Dict[str, List[Dict[str, Any]]]: + """Run a shuffle benchmark and return per-iteration results. + + Returns ``{benchmark_name: [{duration_ms, row_count, num_partitions}, ...]}`` + so the structure parallels TPC output (query -> list of timings). + """ + if benchmark not in BENCHMARKS: + raise ValueError( + f"Unknown shuffle benchmark: {benchmark}. 
" + f"Available: {', '.join(BENCHMARKS)}" + ) + + results: List[Dict[str, Any]] = [] + + # Read input data once + df = spark.read.parquet(data_path) + row_count = df.count() + + for iteration in range(iterations): + print(f"\n{'=' * 60}") + print(f"Shuffle benchmark: {benchmark} | Mode: {mode.upper()}") + print(f"Iteration {iteration + 1} of {iterations}") + print(f"{'=' * 60}") + print(f"Data path: {data_path}") + print(f"Rows: {row_count:,} | Partitions: {num_partitions}") + + # Print relevant Spark configuration + conf = spark.sparkContext.getConf() + print(f"Shuffle manager: {conf.get('spark.shuffle.manager', 'default')}") + print(f"Comet enabled: {conf.get('spark.comet.enabled', 'false')}") + print( + f"Comet shuffle enabled: " + f"{conf.get('spark.comet.exec.shuffle.enabled', 'false')}" + ) + print( + f"Comet shuffle mode: " + f"{conf.get('spark.comet.exec.shuffle.mode', 'not set')}" + ) + + spark.catalog.clearCache() + spark.sparkContext.setJobDescription(f"{benchmark} iter{iteration + 1}") + + start_time = time.time() + + repartitioned = _repartition(df, benchmark, num_partitions) + output_path = f"/tmp/shuffle-benchmark-output-{mode}-{benchmark}" + repartitioned.write.mode("overwrite").parquet(output_path) + print(f"Wrote repartitioned data to: {output_path}") + + duration_ms = int((time.time() - start_time) * 1000) + print(f"Duration: {duration_ms:,} ms") + + results.append({ + "duration_ms": duration_ms, + "row_count": row_count, + "num_partitions": num_partitions, + }) + + return {benchmark: results} + + +def build_results( + spark: SparkSession, + benchmark: str, + data_path: str, + mode: str, + name: str, + timings: Dict[str, List[Dict[str, Any]]], +) -> Dict: + """Assemble the result dict for shuffle benchmarks.""" + conf_dict = {k: v for k, v in spark.sparkContext.getConf().getAll()} + + return { + "engine": "datafusion-comet", + "benchmark": benchmark, + "mode": mode, + "data_path": data_path, + "spark_conf": conf_dict, + **timings, + } + + +def write_results( + results: Dict, + output_dir: str, + name: str, + benchmark: str, +) -> str: + """Write JSON results file. Returns the path written.""" + result_str = json.dumps(results, indent=4) + current_time_millis = int(datetime.now().timestamp() * 1000) + results_path = f"{output_dir}/{name}-{benchmark}-{current_time_millis}.json" + print(f"\nWriting results to {results_path}") + with open(results_path, "w") as f: + f.write(result_str) + return results_path diff --git a/benchmarks/suites/tpc.py b/benchmarks/suites/tpc.py new file mode 100644 index 0000000000..19c020a135 --- /dev/null +++ b/benchmarks/suites/tpc.py @@ -0,0 +1,215 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""TPC-H / TPC-DS benchmark suite.""" + +import json +import time +from datetime import datetime +from typing import Dict, List, Optional + +from pyspark.sql import SparkSession + + +# Table definitions per benchmark +TPCH_TABLES = [ + "customer", "lineitem", "nation", "orders", + "part", "partsupp", "region", "supplier", +] + +TPCDS_TABLES = [ + "call_center", "catalog_page", "catalog_returns", "catalog_sales", + "customer", "customer_address", "customer_demographics", "date_dim", + "time_dim", "household_demographics", "income_band", "inventory", + "item", "promotion", "reason", "ship_mode", "store", "store_returns", + "store_sales", "warehouse", "web_page", "web_returns", "web_sales", + "web_site", +] + +BENCHMARK_META = { + "tpch": {"num_queries": 22, "tables": TPCH_TABLES}, + "tpcds": {"num_queries": 99, "tables": TPCDS_TABLES}, +} + + +def dedup_columns(df): + """Rename duplicate column aliases: a, a, b, b -> a, a_1, b, b_1.""" + counts: Dict[str, int] = {} + new_cols: List[str] = [] + for c in df.columns: + if c not in counts: + counts[c] = 0 + new_cols.append(c) + else: + counts[c] += 1 + new_cols.append(f"{c}_{counts[c]}") + return df.toDF(*new_cols) + + +def register_tables( + spark: SparkSession, + benchmark: str, + data_path: Optional[str], + catalog: Optional[str], + database: str, + file_format: str, + reader_options: Optional[Dict[str, str]], +) -> bool: + """Register TPC tables as temp views. + + Returns True when using Iceberg catalog, False for file-based tables. + """ + if benchmark not in BENCHMARK_META: + raise ValueError(f"Invalid benchmark: {benchmark}") + tables = BENCHMARK_META[benchmark]["tables"] + using_iceberg = catalog is not None + + for table in tables: + if using_iceberg: + source = f"{catalog}.{database}.{table}" + print(f"Registering table {table} from {source}") + df = spark.table(source) + else: + source = f"{data_path}/{table}.{file_format}" + print(f"Registering table {table} from {source}") + df = spark.read.format(file_format).options(**(reader_options or {})).load(source) + df.createOrReplaceTempView(table) + + return using_iceberg + + +def run_queries( + spark: SparkSession, + benchmark: str, + query_path: str, + iterations: int, + query_num: Optional[int] = None, + write_path: Optional[str] = None, +) -> Dict[int, List[float]]: + """Execute TPC queries and return {query_num: [elapsed_secs_per_iter]}.""" + meta = BENCHMARK_META[benchmark] + num_queries = meta["num_queries"] + results: Dict[int, List[float]] = {} + + for iteration in range(iterations): + print(f"\n{'=' * 60}") + print(f"Starting iteration {iteration + 1} of {iterations}") + print(f"{'=' * 60}") + iter_start_time = time.time() + + if query_num is not None: + if query_num < 1 or query_num > num_queries: + raise ValueError( + f"Query number {query_num} out of range. 
" + f"Valid: 1-{num_queries} for {benchmark}" + ) + queries_to_run = [query_num] + else: + queries_to_run = range(1, num_queries + 1) + + for query in queries_to_run: + spark.sparkContext.setJobDescription(f"{benchmark} q{query}") + path = f"{query_path}/q{query}.sql" + print(f"\nRunning query {query} from {path}") + + with open(path, "r") as f: + text = f.read() + queries_sql = text.split(";") + + start_time = time.time() + for sql in queries_sql: + sql = sql.strip().replace("create view", "create temp view") + if len(sql) > 0: + print(f"Executing: {sql[:100]}...") + df = spark.sql(sql) + df.explain("formatted") + + if write_path is not None: + if len(df.columns) > 0: + output_path = f"{write_path}/q{query}" + deduped = dedup_columns(df) + deduped.orderBy(*deduped.columns).coalesce(1).write.mode( + "overwrite" + ).parquet(output_path) + print(f"Results written to {output_path}") + else: + rows = df.collect() + print(f"Query {query} returned {len(rows)} rows") + + end_time = time.time() + elapsed = end_time - start_time + print(f"Query {query} took {elapsed:.2f} seconds") + + results.setdefault(query, []).append(elapsed) + + iter_end_time = time.time() + print( + f"\nIteration {iteration + 1} took " + f"{iter_end_time - iter_start_time:.2f} seconds" + ) + + return results + + +def build_results( + spark: SparkSession, + benchmark: str, + query_path: str, + data_path: Optional[str], + catalog: Optional[str], + database: str, + using_iceberg: bool, + name: str, + timings: Dict[int, List[float]], +) -> Dict: + """Assemble the result dict with the same schema as tpcbench.py.""" + conf_dict = {k: v for k, v in spark.sparkContext.getConf().getAll()} + + results: Dict = { + "engine": "datafusion-comet", + "benchmark": benchmark, + "query_path": query_path, + "spark_conf": conf_dict, + } + if using_iceberg: + results["catalog"] = catalog + results["database"] = database + else: + results["data_path"] = data_path + + # Integer query keys — json.dumps serialises them as strings, matching + # the format that generate-comparison.py expects (str(query)). + for query, elapsed_list in timings.items(): + results[query] = elapsed_list + + return results + + +def write_results( + results: Dict, + output_dir: str, + name: str, + benchmark: str, +) -> str: + """Write JSON results file. Returns the path written.""" + result_str = json.dumps(results, indent=4) + current_time_millis = int(datetime.now().timestamp() * 1000) + results_path = f"{output_dir}/{name}-{benchmark}-{current_time_millis}.json" + print(f"\nWriting results to {results_path}") + with open(results_path, "w") as f: + f.write(result_str) + return results_path diff --git a/dev/benchmarks/.gitignore b/dev/benchmarks/.gitignore deleted file mode 100644 index 477aaef0c3..0000000000 --- a/dev/benchmarks/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.json -*.png \ No newline at end of file diff --git a/dev/benchmarks/README.md b/dev/benchmarks/README.md deleted file mode 100644 index b3ea674199..0000000000 --- a/dev/benchmarks/README.md +++ /dev/null @@ -1,151 +0,0 @@ - - -# Comet Benchmarking Scripts - -This directory contains scripts used for generating benchmark results that are published in this repository and in -the Comet documentation. - -For full instructions on running these benchmarks on an EC2 instance, see the [Comet Benchmarking on EC2 Guide]. 
- -[Comet Benchmarking on EC2 Guide]: https://datafusion.apache.org/comet/contributor-guide/benchmarking_aws_ec2.html - -## Example usage - -Set Spark environment variables: - -```shell -export SPARK_HOME=/opt/spark-3.5.3-bin-hadoop3/ -export SPARK_MASTER=spark://yourhostname:7077 -``` - -Set path to queries and data: - -```shell -export TPCH_QUERIES=/mnt/bigdata/tpch/queries/ -export TPCH_DATA=/mnt/bigdata/tpch/sf100/ -``` - -Run Spark benchmark: - -```shell -export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 -sudo ./drop-caches.sh -./spark-tpch.sh -``` - -Run Comet benchmark: - -```shell -export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 -export COMET_JAR=/opt/comet/comet-spark-spark3.5_2.12-0.10.0.jar -sudo ./drop-caches.sh -./comet-tpch.sh -``` - -Run Gluten benchmark: - -```shell -export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 -export GLUTEN_JAR=/opt/gluten/gluten-velox-bundle-spark3.5_2.12-linux_amd64-1.4.0.jar -sudo ./drop-caches.sh -./gluten-tpch.sh -``` - -Generating charts: - -```shell -python3 generate-comparison.py --benchmark tpch --labels "Spark 3.5.3" "Comet 0.9.0" "Gluten 1.4.0" --title "TPC-H @ 100 GB (single executor, 8 cores, local Parquet files)" spark-tpch-1752338506381.json comet-tpch-1752337818039.json gluten-tpch-1752337474344.json -``` - -## Iceberg Benchmarking - -Comet includes native Iceberg support via iceberg-rust integration. This enables benchmarking TPC-H queries -against Iceberg tables with native scan acceleration. - -### Prerequisites - -Download the Iceberg Spark runtime JAR (required for running the benchmark): - -```shell -wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.8.1/iceberg-spark-runtime-3.5_2.12-1.8.1.jar -export ICEBERG_JAR=/path/to/iceberg-spark-runtime-3.5_2.12-1.8.1.jar -``` - -Note: Table creation uses `--packages` which auto-downloads the dependency. - -### Create Iceberg TPC-H tables - -Convert existing Parquet TPC-H data to Iceberg format: - -```shell -export ICEBERG_WAREHOUSE=/mnt/bigdata/iceberg-warehouse -export ICEBERG_CATALOG=${ICEBERG_CATALOG:-local} - -$SPARK_HOME/bin/spark-submit \ - --master $SPARK_MASTER \ - --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1 \ - --conf spark.driver.memory=8G \ - --conf spark.executor.instances=1 \ - --conf spark.executor.cores=8 \ - --conf spark.cores.max=8 \ - --conf spark.executor.memory=16g \ - --conf spark.sql.catalog.${ICEBERG_CATALOG}=org.apache.iceberg.spark.SparkCatalog \ - --conf spark.sql.catalog.${ICEBERG_CATALOG}.type=hadoop \ - --conf spark.sql.catalog.${ICEBERG_CATALOG}.warehouse=$ICEBERG_WAREHOUSE \ - create-iceberg-tpch.py \ - --parquet-path $TPCH_DATA \ - --catalog $ICEBERG_CATALOG \ - --database tpch -``` - -### Run Iceberg benchmark - -```shell -export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64 -export COMET_JAR=/opt/comet/comet-spark-spark3.5_2.12-0.10.0.jar -export ICEBERG_JAR=/path/to/iceberg-spark-runtime-3.5_2.12-1.8.1.jar -export ICEBERG_WAREHOUSE=/mnt/bigdata/iceberg-warehouse -export TPCH_QUERIES=/mnt/bigdata/tpch/queries/ -sudo ./drop-caches.sh -./comet-tpch-iceberg.sh -``` - -The benchmark uses `spark.comet.scan.icebergNative.enabled=true` to enable Comet's native iceberg-rust -integration. Verify native scanning is active by checking for `CometIcebergNativeScanExec` in the -physical plan output. 
- -### Iceberg-specific options - -| Environment Variable | Default | Description | -| -------------------- | ---------- | ----------------------------------- | -| `ICEBERG_CATALOG` | `local` | Iceberg catalog name | -| `ICEBERG_DATABASE` | `tpch` | Database containing TPC-H tables | -| `ICEBERG_WAREHOUSE` | (required) | Path to Iceberg warehouse directory | - -### Comparing Parquet vs Iceberg performance - -Run both benchmarks and compare: - -```shell -python3 generate-comparison.py --benchmark tpch \ - --labels "Comet (Parquet)" "Comet (Iceberg)" \ - --title "TPC-H @ 100 GB: Parquet vs Iceberg" \ - comet-tpch-*.json comet-iceberg-tpch-*.json -``` diff --git a/dev/benchmarks/blaze-tpcds.sh b/dev/benchmarks/blaze-tpcds.sh deleted file mode 100755 index 90a4a48468..0000000000 --- a/dev/benchmarks/blaze-tpcds.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -$SPARK_HOME/sbin/stop-master.sh -$SPARK_HOME/sbin/stop-worker.sh - -$SPARK_HOME/sbin/start-master.sh -$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER - -$SPARK_HOME/bin/spark-submit \ - --master $SPARK_MASTER \ - --jars $BLAZE_JAR \ - --driver-class-path $BLAZE_JAR \ - --conf spark.driver.memory=8G \ - --conf spark.executor.instances=2 \ - --conf spark.executor.cores=8 \ - --conf spark.cores.max=16 \ - --conf spark.executor.memory=16g \ - --conf spark.executor.memoryOverhead=16g \ - --conf spark.memory.offHeap.enabled=false \ - --conf spark.eventLog.enabled=true \ - --conf spark.driver.extraClassPath=$BLAZE_JAR \ - --conf spark.executor.extraClassPath=$BLAZE_JAR \ - --conf spark.sql.extensions=org.apache.spark.sql.blaze.BlazeSparkSessionExtension \ - --conf spark.shuffle.manager=org.apache.spark.sql.execution.blaze.shuffle.BlazeShuffleManager \ - --conf spark.blaze.enable=true \ - --conf spark.blaze.forceShuffledHashJoin=true \ - --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ - --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \ - tpcbench.py \ - --name blaze \ - --benchmark tpcds \ - --data $TPCDS_DATA \ - --queries $TPCDS_QUERIES \ - --output . \ - --iterations 1 diff --git a/dev/benchmarks/blaze-tpch.sh b/dev/benchmarks/blaze-tpch.sh deleted file mode 100755 index 2c6878737d..0000000000 --- a/dev/benchmarks/blaze-tpch.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -$SPARK_HOME/sbin/stop-master.sh -$SPARK_HOME/sbin/stop-worker.sh - -$SPARK_HOME/sbin/start-master.sh -$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER - -$SPARK_HOME/bin/spark-submit \ - --master $SPARK_MASTER \ - --jars $BLAZE_JAR \ - --driver-class-path $BLAZE_JAR \ - --conf spark.driver.memory=8G \ - --conf spark.executor.instances=1 \ - --conf spark.executor.cores=8 \ - --conf spark.cores.max=8 \ - --conf spark.executor.memory=16g \ - --conf spark.executor.memoryOverhead=16g \ - --conf spark.memory.offHeap.enabled=false \ - --conf spark.eventLog.enabled=true \ - --conf spark.driver.extraClassPath=$BLAZE_JAR \ - --conf spark.executor.extraClassPath=$BLAZE_JAR \ - --conf spark.sql.extensions=org.apache.spark.sql.blaze.BlazeSparkSessionExtension \ - --conf spark.shuffle.manager=org.apache.spark.sql.execution.blaze.shuffle.BlazeShuffleManager \ - --conf spark.blaze.enable=true \ - --conf spark.blaze.forceShuffledHashJoin=true \ - --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ - --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \ - tpcbench.py \ - --name blaze \ - --benchmark tpch \ - --data $TPCH_DATA \ - --queries $TPCH_QUERIES \ - --output . \ - --iterations 1 diff --git a/dev/benchmarks/comet-tpcds.sh b/dev/benchmarks/comet-tpcds.sh deleted file mode 100755 index b55b27188c..0000000000 --- a/dev/benchmarks/comet-tpcds.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -$SPARK_HOME/sbin/stop-master.sh -$SPARK_HOME/sbin/stop-worker.sh - -$SPARK_HOME/sbin/start-master.sh -$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER - -$SPARK_HOME/bin/spark-submit \ - --master $SPARK_MASTER \ - --jars $COMET_JAR \ - --driver-class-path $COMET_JAR \ - --conf spark.driver.memory=8G \ - --conf spark.executor.instances=2 \ - --conf spark.executor.cores=8 \ - --conf spark.cores.max=16 \ - --conf spark.executor.memory=16g \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=16g \ - --conf spark.eventLog.enabled=true \ - --conf spark.driver.extraClassPath=$COMET_JAR \ - --conf spark.executor.extraClassPath=$COMET_JAR \ - --conf spark.plugins=org.apache.spark.CometPlugin \ - --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ - --conf spark.comet.scan.impl=native_datafusion \ - --conf spark.comet.expression.Cast.allowIncompatible=true \ - --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ - --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \ - tpcbench.py \ - --name comet \ - --benchmark tpcds \ - --data $TPCDS_DATA \ - --queries $TPCDS_QUERIES \ - --output . \ - --iterations 1 diff --git a/dev/benchmarks/comet-tpch-iceberg.sh b/dev/benchmarks/comet-tpch-iceberg.sh deleted file mode 100755 index 7907125c82..0000000000 --- a/dev/benchmarks/comet-tpch-iceberg.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -# TPC-H benchmark using Iceberg tables with Comet's native iceberg-rust integration. 
-#
-# Required environment variables:
-#   SPARK_HOME        - Path to Spark installation
-#   SPARK_MASTER      - Spark master URL (e.g., spark://localhost:7077)
-#   COMET_JAR         - Path to Comet JAR
-#   ICEBERG_JAR       - Path to Iceberg Spark runtime JAR
-#   ICEBERG_WAREHOUSE - Path to Iceberg warehouse directory
-#   TPCH_QUERIES      - Path to TPC-H query files
-#
-# Optional:
-#   ICEBERG_CATALOG   - Catalog name (default: local)
-#   ICEBERG_DATABASE  - Database name (default: tpch)
-#
-# Setup (run once to create Iceberg tables from Parquet):
-#   $SPARK_HOME/bin/spark-submit \
-#     --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1 \
-#     --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \
-#     --conf spark.sql.catalog.local.type=hadoop \
-#     --conf spark.sql.catalog.local.warehouse=$ICEBERG_WAREHOUSE \
-#     create-iceberg-tpch.py \
-#     --parquet-path $TPCH_DATA \
-#     --catalog local \
-#     --database tpch
-
-set -e
-
-# Defaults
-ICEBERG_CATALOG=${ICEBERG_CATALOG:-local}
-ICEBERG_DATABASE=${ICEBERG_DATABASE:-tpch}
-
-# Validate required variables
-if [ -z "$SPARK_HOME" ]; then
-    echo "Error: SPARK_HOME is not set"
-    exit 1
-fi
-if [ -z "$SPARK_MASTER" ]; then
-    echo "Error: SPARK_MASTER is not set"
-    exit 1
-fi
-if [ -z "$COMET_JAR" ]; then
-    echo "Error: COMET_JAR is not set"
-    exit 1
-fi
-if [ -z "$ICEBERG_JAR" ]; then
-    echo "Error: ICEBERG_JAR is not set"
-    echo "Download from: https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.8.1/"
-    exit 1
-fi
-if [ -z "$ICEBERG_WAREHOUSE" ]; then
-    echo "Error: ICEBERG_WAREHOUSE is not set"
-    exit 1
-fi
-if [ -z "$TPCH_QUERIES" ]; then
-    echo "Error: TPCH_QUERIES is not set"
-    exit 1
-fi
-
-$SPARK_HOME/sbin/stop-master.sh 2>/dev/null || true
-$SPARK_HOME/sbin/stop-worker.sh 2>/dev/null || true
-
-$SPARK_HOME/sbin/start-master.sh
-$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
-
-$SPARK_HOME/bin/spark-submit \
-    --master $SPARK_MASTER \
-    --jars $COMET_JAR,$ICEBERG_JAR \
-    --driver-class-path $COMET_JAR:$ICEBERG_JAR \
-    --conf spark.driver.memory=8G \
-    --conf spark.executor.instances=1 \
-    --conf spark.executor.cores=8 \
-    --conf spark.cores.max=8 \
-    --conf spark.executor.memory=16g \
-    --conf spark.memory.offHeap.enabled=true \
-    --conf spark.memory.offHeap.size=16g \
-    --conf spark.eventLog.enabled=true \
-    --conf spark.driver.extraClassPath=$COMET_JAR:$ICEBERG_JAR \
-    --conf spark.executor.extraClassPath=$COMET_JAR:$ICEBERG_JAR \
-    --conf spark.plugins=org.apache.spark.CometPlugin \
-    --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
-    --conf spark.comet.exec.replaceSortMergeJoin=true \
-    --conf spark.comet.expression.Cast.allowIncompatible=true \
-    --conf spark.comet.enabled=true \
-    --conf spark.comet.exec.enabled=true \
-    --conf spark.comet.scan.icebergNative.enabled=true \
-    --conf spark.comet.explainFallback.enabled=true \
-    --conf spark.sql.catalog.${ICEBERG_CATALOG}=org.apache.iceberg.spark.SparkCatalog \
-    --conf spark.sql.catalog.${ICEBERG_CATALOG}.type=hadoop \
-    --conf spark.sql.catalog.${ICEBERG_CATALOG}.warehouse=$ICEBERG_WAREHOUSE \
-    --conf spark.sql.defaultCatalog=${ICEBERG_CATALOG} \
-    tpcbench.py \
-    --name comet-iceberg \
-    --benchmark tpch \
-    --catalog $ICEBERG_CATALOG \
-    --database $ICEBERG_DATABASE \
-    --queries $TPCH_QUERIES \
-    --output .
\ - --iterations 1 diff --git a/dev/benchmarks/comet-tpch.sh b/dev/benchmarks/comet-tpch.sh deleted file mode 100755 index a748a02319..0000000000 --- a/dev/benchmarks/comet-tpch.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -$SPARK_HOME/sbin/stop-master.sh -$SPARK_HOME/sbin/stop-worker.sh - -$SPARK_HOME/sbin/start-master.sh -$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER - -$SPARK_HOME/bin/spark-submit \ - --master $SPARK_MASTER \ - --jars $COMET_JAR \ - --driver-class-path $COMET_JAR \ - --conf spark.driver.memory=8G \ - --conf spark.executor.instances=1 \ - --conf spark.executor.cores=8 \ - --conf spark.cores.max=8 \ - --conf spark.executor.memory=16g \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=16g \ - --conf spark.eventLog.enabled=true \ - --conf spark.driver.extraClassPath=$COMET_JAR \ - --conf spark.executor.extraClassPath=$COMET_JAR \ - --conf spark.plugins=org.apache.spark.CometPlugin \ - --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ - --conf spark.comet.scan.impl=native_datafusion \ - --conf spark.comet.exec.replaceSortMergeJoin=true \ - --conf spark.comet.expression.Cast.allowIncompatible=true \ - --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ - --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \ - tpcbench.py \ - --name comet \ - --benchmark tpch \ - --data $TPCH_DATA \ - --queries $TPCH_QUERIES \ - --output . \ - --iterations 1 \ - --format parquet diff --git a/dev/benchmarks/generate-comparison.py b/dev/benchmarks/generate-comparison.py deleted file mode 100644 index eb57cc1e45..0000000000 --- a/dev/benchmarks/generate-comparison.py +++ /dev/null @@ -1,229 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import argparse -import json -import matplotlib.pyplot as plt -import numpy as np - -def geomean(data): - return np.prod(data) ** (1 / len(data)) - -def generate_query_rel_speedup_chart(baseline, comparison, label1: str, label2: str, benchmark: str, title: str): - results = [] - for query in range(1, query_count(benchmark)+1): - if query == 999: - continue - a = np.median(np.array(baseline[str(query)])) - b = np.median(np.array(comparison[str(query)])) - if a > b: - speedup = a/b-1 - else: - speedup = -(1/(a/b)-1) - results.append(("q" + str(query), round(speedup*100, 0))) - - results = sorted(results, key=lambda x: -x[1]) - - queries, speedups = zip(*results) - - # Create figure and axis - if benchmark == "tpch": - fig, ax = plt.subplots(figsize=(10, 6)) - else: - fig, ax = plt.subplots(figsize=(35, 10)) - - # Create bar chart - bars = ax.bar(queries, speedups, color='skyblue') - - # Add text annotations - for bar, speedup in zip(bars, speedups): - yval = bar.get_height() - if yval >= 0: - ax.text(bar.get_x() + bar.get_width() / 2.0, min(800, yval+5), f'{yval:.0f}%', va='bottom', ha='center', fontsize=8, - color='blue', rotation=90) - else: - ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval:.0f}%', va='top', ha='center', fontsize=8, - color='blue', rotation=90) - - # Add title and labels - ax.set_title(label2 + " speedup over " + label1 + " (" + title + ")") - ax.set_ylabel('Speedup Percentage (100% speedup = 2x faster)') - ax.set_xlabel('Query') - - # Customize the y-axis to handle both positive and negative values better - ax.axhline(0, color='black', linewidth=0.8) - min_value = (min(speedups) // 100) * 100 - max_value = ((max(speedups) // 100) + 1) * 100 + 50 - if benchmark == "tpch": - ax.set_ylim(min_value, max_value) - else: - # TODO improve this - ax.set_ylim(-250, 300) - - # Show grid for better readability - ax.yaxis.grid(True) - - # Save the plot as an image file - plt.savefig(f'{benchmark}_queries_speedup_rel.png', format='png') - -def generate_query_abs_speedup_chart(baseline, comparison, label1: str, label2: str, benchmark: str, title: str): - results = [] - for query in range(1, query_count(benchmark)+1): - if query == 999: - continue - a = np.median(np.array(baseline[str(query)])) - b = np.median(np.array(comparison[str(query)])) - speedup = a-b - results.append(("q" + str(query), round(speedup, 1))) - - results = sorted(results, key=lambda x: -x[1]) - - queries, speedups = zip(*results) - - # Create figure and axis - if benchmark == "tpch": - fig, ax = plt.subplots(figsize=(10, 6)) - else: - fig, ax = plt.subplots(figsize=(35, 10)) - - # Create bar chart - bars = ax.bar(queries, speedups, color='skyblue') - - # Add text annotations - for bar, speedup in zip(bars, speedups): - yval = bar.get_height() - if yval >= 0: - ax.text(bar.get_x() + bar.get_width() / 2.0, min(800, yval+5), f'{yval:.1f}', va='bottom', ha='center', fontsize=8, - color='blue', rotation=90) - else: - ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval:.1f}', va='top', ha='center', fontsize=8, - color='blue', rotation=90) - - # Add title and labels - ax.set_title(label2 + " speedup over " + label1 + " (" + title + ")") - ax.set_ylabel('Speedup (in seconds)') - ax.set_xlabel('Query') - - # Customize the y-axis to handle both positive and negative values better - ax.axhline(0, color='black', linewidth=0.8) - min_value = min(speedups) * 2 - 20 - max_value = max(speedups) * 1.5 - ax.set_ylim(min_value, max_value) - - # Show grid for better readability - ax.yaxis.grid(True) - - # Save the 
plot as an image file
-    plt.savefig(f'{benchmark}_queries_speedup_abs.png', format='png')
-
-def generate_query_comparison_chart(results, labels, benchmark: str, title: str):
-    queries = []
-    benches = []
-    for _ in results:
-        benches.append([])
-    for query in range(1, query_count(benchmark)+1):
-        if query == 999:
-            continue
-        queries.append("q" + str(query))
-        for i in range(0, len(results)):
-            benches[i].append(np.median(np.array(results[i][str(query)])))
-
-    # Define the width of the bars
-    bar_width = 0.3
-
-    # Define the positions of the bars on the x-axis
-    index = np.arange(len(queries)) * 1.5
-
-    # Create a bar chart
-    if benchmark == "tpch":
-        fig, ax = plt.subplots(figsize=(15, 6))
-    else:
-        fig, ax = plt.subplots(figsize=(35, 6))
-
-    for i in range(0, len(results)):
-        bar = ax.bar(index + i * bar_width, benches[i], bar_width, label=labels[i])
-
-    # Add labels, title, and legend
-    ax.set_title(title)
-    ax.set_xlabel('Queries')
-    ax.set_ylabel('Query Time (seconds)')
-    ax.set_xticks(index + bar_width / 2)
-    ax.set_xticklabels(queries)
-    ax.legend()
-
-    # Save the plot as an image file
-    plt.savefig(f'{benchmark}_queries_compare.png', format='png')
-
-def generate_summary(results, labels, benchmark: str, title: str):
-    timings = []
-    for _ in results:
-        timings.append(0)
-
-    num_queries = query_count(benchmark)
-    for query in range(1, num_queries + 1):
-        if query == 999:
-            continue
-        for i in range(0, len(results)):
-            timings[i] += np.median(np.array(results[i][str(query)]))
-
-    # Create figure and axis
-    fig, ax = plt.subplots()
-    fig.set_size_inches(10, 6)
-
-    # Add title and labels
-    ax.set_title(title)
-    ax.set_ylabel(f'Time in seconds to run all {num_queries} {benchmark} queries (lower is better)')
-
-    times = [round(x, 0) for x in timings]
-
-    # Create bar chart
-    bars = ax.bar(labels, times, color='skyblue', width=0.8)
-
-    # Add text annotations
-    for bar in bars:
-        yval = bar.get_height()
-        ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom')  # va: vertical alignment
-
-    plt.savefig(f'{benchmark}_allqueries.png', format='png')
-
-def query_count(benchmark: str):
-    if benchmark == "tpch":
-        return 22
-    elif benchmark == "tpcds":
-        return 99
-    else:
-        raise ValueError("invalid benchmark name")
-
-def main(files, labels, benchmark: str, title: str):
-    results = []
-    for filename in files:
-        with open(filename) as f:
-            results.append(json.load(f))
-    generate_summary(results, labels, benchmark, title)
-    generate_query_comparison_chart(results, labels, benchmark, title)
-    if len(files) == 2:
-        generate_query_abs_speedup_chart(results[0], results[1], labels[0], labels[1], benchmark, title)
-        generate_query_rel_speedup_chart(results[0], results[1], labels[0], labels[1], benchmark, title)
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Generate comparison')
-    parser.add_argument('filenames', nargs='+', type=str, help='JSON result files')
-    parser.add_argument('--labels', nargs='+', type=str, help='Labels')
-    parser.add_argument('--benchmark', type=str, help='Benchmark name (tpch or tpcds)')
-    parser.add_argument('--title', type=str, help='Chart title')
-    args = parser.parse_args()
-    main(args.filenames, args.labels, args.benchmark, args.title)
diff --git a/dev/benchmarks/gluten-tpcds.sh b/dev/benchmarks/gluten-tpcds.sh
deleted file mode 100755
index 7c475c79c0..0000000000
--- a/dev/benchmarks/gluten-tpcds.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more
contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -export TZ=UTC - -$SPARK_HOME/sbin/stop-master.sh -$SPARK_HOME/sbin/stop-worker.sh - -$SPARK_HOME/sbin/start-master.sh -$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER - -$SPARK_HOME/bin/spark-submit \ - --master $SPARK_MASTER \ - --conf spark.driver.memory=8G \ - --conf spark.executor.instances=2 \ - --conf spark.executor.memory=16G \ - --conf spark.executor.cores=8 \ - --conf spark.cores.max=16 \ - --conf spark.eventLog.enabled=true \ - --jars $GLUTEN_JAR \ - --conf spark.plugins=org.apache.gluten.GlutenPlugin \ - --conf spark.driver.extraClassPath=${GLUTEN_JAR} \ - --conf spark.executor.extraClassPath=${GLUTEN_JAR} \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=16g \ - --conf spark.gluten.sql.columnar.forceShuffledHashJoin=true \ - --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \ - --conf spark.sql.session.timeZone=UTC \ - --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ - --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \ - tpcbench.py \ - --name gluten \ - --benchmark tpcds \ - --data $TPCDS_DATA \ - --queries $TPCDS_QUERIES \ - --output . \ - --iterations 1 diff --git a/dev/benchmarks/gluten-tpch.sh b/dev/benchmarks/gluten-tpch.sh deleted file mode 100755 index 46c3ed7527..0000000000 --- a/dev/benchmarks/gluten-tpch.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -export TZ=UTC - -$SPARK_HOME/sbin/stop-master.sh -$SPARK_HOME/sbin/stop-worker.sh - -$SPARK_HOME/sbin/start-master.sh -$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER - -$SPARK_HOME/bin/spark-submit \ - --master $SPARK_MASTER \ - --conf spark.driver.memory=8G \ - --conf spark.executor.instances=1 \ - --conf spark.executor.memory=16G \ - --conf spark.executor.cores=8 \ - --conf spark.cores.max=8 \ - --conf spark.eventLog.enabled=true \ - --jars $GLUTEN_JAR \ - --conf spark.plugins=org.apache.gluten.GlutenPlugin \ - --conf spark.driver.extraClassPath=${GLUTEN_JAR} \ - --conf spark.executor.extraClassPath=${GLUTEN_JAR} \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=16g \ - --conf spark.gluten.sql.columnar.forceShuffledHashJoin=true \ - --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \ - --conf spark.sql.session.timeZone=UTC \ - --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ - --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \ - tpcbench.py \ - --name gluten \ - --benchmark tpch \ - --data $TPCH_DATA \ - --queries $TPCH_QUERIES \ - --output . \ - --iterations 1 diff --git a/dev/benchmarks/spark-tpcds.sh b/dev/benchmarks/spark-tpcds.sh deleted file mode 100755 index dad079ba23..0000000000 --- a/dev/benchmarks/spark-tpcds.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -$SPARK_HOME/sbin/stop-master.sh -$SPARK_HOME/sbin/stop-worker.sh - -$SPARK_HOME/sbin/start-master.sh -$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER - -$SPARK_HOME/bin/spark-submit \ - --master $SPARK_MASTER \ - --conf spark.driver.memory=8G \ - --conf spark.executor.instances=2 \ - --conf spark.executor.cores=8 \ - --conf spark.cores.max=16 \ - --conf spark.executor.memory=16g \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=16g \ - --conf spark.eventLog.enabled=true \ - --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ - --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \ - tpcbench.py \ - --name spark \ - --benchmark tpcds \ - --data $TPCDS_DATA \ - --queries $TPCDS_QUERIES \ - --output . \ - --iterations 1 diff --git a/dev/benchmarks/spark-tpch.sh b/dev/benchmarks/spark-tpch.sh deleted file mode 100755 index ae359f049f..0000000000 --- a/dev/benchmarks/spark-tpch.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -$SPARK_HOME/sbin/stop-master.sh -$SPARK_HOME/sbin/stop-worker.sh - -$SPARK_HOME/sbin/start-master.sh -$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER - -$SPARK_HOME/bin/spark-submit \ - --master $SPARK_MASTER \ - --conf spark.driver.memory=8G \ - --conf spark.executor.instances=1 \ - --conf spark.executor.cores=8 \ - --conf spark.cores.max=8 \ - --conf spark.executor.memory=16g \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=16g \ - --conf spark.eventLog.enabled=true \ - --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ - --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \ - tpcbench.py \ - --name spark \ - --benchmark tpch \ - --data $TPCH_DATA \ - --queries $TPCH_QUERIES \ - --output . \ - --iterations 1 \ - --format parquet diff --git a/dev/benchmarks/tpcbench.py b/dev/benchmarks/tpcbench.py deleted file mode 100644 index 400ccd175a..0000000000 --- a/dev/benchmarks/tpcbench.py +++ /dev/null @@ -1,257 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -TPC-H / TPC-DS benchmark runner. 
-
-Supports two data sources:
- - Files: use --data with --format (parquet, csv, json) and optional --options
- - Iceberg tables: use --catalog and --database to specify the catalog location
-"""
-
-import argparse
-from datetime import datetime
-import json
-from pyspark.sql import SparkSession
-import time
-from typing import Dict
-
-
-def dedup_columns(df):
-    """Rename duplicate column aliases: a, a, b, b -> a, a_1, b, b_1"""
-    counts = {}
-    new_cols = []
-    for c in df.columns:
-        if c not in counts:
-            counts[c] = 0
-            new_cols.append(c)
-        else:
-            counts[c] += 1
-            new_cols.append(f"{c}_{counts[c]}")
-    return df.toDF(*new_cols)
-
-
-def main(
-    benchmark: str,
-    data_path: str,
-    catalog: str,
-    database: str,
-    query_path: str,
-    iterations: int,
-    output: str,
-    name: str,
-    format: str,
-    query_num: int = None,
-    write_path: str = None,
-    options: Dict[str, str] = None
-):
-    if options is None:
-        options = {}
-
-    spark = SparkSession.builder \
-        .appName(f"{name} benchmark derived from {benchmark}") \
-        .getOrCreate()
-
-    # Define tables for each benchmark
-    if benchmark == "tpch":
-        num_queries = 22
-        table_names = [
-            "customer", "lineitem", "nation", "orders",
-            "part", "partsupp", "region", "supplier"
-        ]
-    elif benchmark == "tpcds":
-        num_queries = 99
-        table_names = [
-            "call_center", "catalog_page", "catalog_returns", "catalog_sales",
-            "customer", "customer_address", "customer_demographics", "date_dim",
-            "time_dim", "household_demographics", "income_band", "inventory",
-            "item", "promotion", "reason", "ship_mode", "store", "store_returns",
-            "store_sales", "warehouse", "web_page", "web_returns", "web_sales",
-            "web_site"
-        ]
-    else:
-        raise ValueError(f"Invalid benchmark: {benchmark}")
-
-    # Register tables from either files or Iceberg catalog
-    using_iceberg = catalog is not None
-    for table in table_names:
-        if using_iceberg:
-            source = f"{catalog}.{database}.{table}"
-            print(f"Registering table {table} from {source}")
-            df = spark.table(source)
-        else:
-            source = f"{data_path}/{table}.{format}"
-            print(f"Registering table {table} from {source}")
-            df = spark.read.format(format).options(**options).load(source)
-        df.createOrReplaceTempView(table)
-
-    conf_dict = {k: v for k, v in spark.sparkContext.getConf().getAll()}
-
-    # Record the engine actually under test (spark, comet, gluten, blaze, ...)
-    results = {
-        'engine': name,
-        'benchmark': benchmark,
-        'query_path': query_path,
-        'spark_conf': conf_dict,
-    }
-    if using_iceberg:
-        results['catalog'] = catalog
-        results['database'] = database
-    else:
-        results['data_path'] = data_path
-
-    for iteration in range(iterations):
-        print(f"\n{'='*60}")
-        print(f"Starting iteration {iteration + 1} of {iterations}")
-        print(f"{'='*60}")
-        iter_start_time = time.time()
-
-        # Determine which queries to run
-        if query_num is not None:
-            if query_num < 1 or query_num > num_queries:
-                raise ValueError(
-                    f"Query number {query_num} out of range. 
" - f"Valid: 1-{num_queries} for {benchmark}" - ) - queries_to_run = [query_num] - else: - queries_to_run = range(1, num_queries + 1) - - for query in queries_to_run: - spark.sparkContext.setJobDescription(f"{benchmark} q{query}") - - path = f"{query_path}/q{query}.sql" - print(f"\nRunning query {query} from {path}") - - with open(path, "r") as f: - text = f.read() - queries = text.split(";") - - start_time = time.time() - for sql in queries: - sql = sql.strip().replace("create view", "create temp view") - if len(sql) > 0: - print(f"Executing: {sql[:100]}...") - df = spark.sql(sql) - df.explain("formatted") - - if write_path is not None: - if len(df.columns) > 0: - output_path = f"{write_path}/q{query}" - deduped = dedup_columns(df) - deduped.orderBy(*deduped.columns).coalesce(1).write.mode("overwrite").parquet(output_path) - print(f"Results written to {output_path}") - else: - rows = df.collect() - print(f"Query {query} returned {len(rows)} rows") - - end_time = time.time() - elapsed = end_time - start_time - print(f"Query {query} took {elapsed:.2f} seconds") - - query_timings = results.setdefault(query, []) - query_timings.append(elapsed) - - iter_end_time = time.time() - print(f"\nIteration {iteration + 1} took {iter_end_time - iter_start_time:.2f} seconds") - - # Write results - result_str = json.dumps(results, indent=4) - current_time_millis = int(datetime.now().timestamp() * 1000) - results_path = f"{output}/{name}-{benchmark}-{current_time_millis}.json" - print(f"\nWriting results to {results_path}") - with open(results_path, "w") as f: - f.write(result_str) - - spark.stop() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="TPC-H/TPC-DS benchmark runner for files or Iceberg tables" - ) - parser.add_argument( - "--benchmark", required=True, - help="Benchmark to run (tpch or tpcds)" - ) - - # Data source - mutually exclusive: either file path or Iceberg catalog - source_group = parser.add_mutually_exclusive_group(required=True) - source_group.add_argument( - "--data", - help="Path to data files" - ) - source_group.add_argument( - "--catalog", - help="Iceberg catalog name" - ) - - # Options for file-based reading - parser.add_argument( - "--format", default="parquet", - help="Input file format: parquet, csv, json (only used with --data)" - ) - parser.add_argument( - "--options", type=json.loads, default={}, - help='Spark reader options as JSON string, e.g., \'{"header": "true"}\' (only used with --data)' - ) - - # Options for Iceberg - parser.add_argument( - "--database", default="tpch", - help="Database containing TPC tables (only used with --catalog)" - ) - - parser.add_argument( - "--queries", required=True, - help="Path to query SQL files" - ) - parser.add_argument( - "--iterations", type=int, default=1, - help="Number of iterations" - ) - parser.add_argument( - "--output", required=True, - help="Path to write results JSON" - ) - parser.add_argument( - "--name", required=True, - help="Prefix for result file" - ) - parser.add_argument( - "--query", type=int, - help="Specific query number (1-based). If omitted, run all." 
- ) - parser.add_argument( - "--write", - help="Path to save query results as Parquet" - ) - args = parser.parse_args() - - main( - args.benchmark, - args.data, - args.catalog, - args.database, - args.queries, - args.iterations, - args.output, - args.name, - args.format, - args.query, - args.write, - args.options - ) diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 4ea10c1dff..ee80a51eeb 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -28,3 +28,5 @@ spark/src/test/resources/tpch-extended/q*.sql spark/src/test/resources/test-data/*.csv spark/src/test/resources/test-data/*.ndjson spark/inspections/CometTPC*results.txt +benchmarks/queries/tpch/q*.sql +benchmarks/queries/tpcds/q*.sql diff --git a/pom.xml b/pom.xml index 1b33fc4757..6ece20552d 100644 --- a/pom.xml +++ b/pom.xml @@ -1058,6 +1058,7 @@ under the License. dev/deploy-file **/test/resources/** **/benchmarks/*.txt + benchmarks/queries/**/*.sql **/inspections/*.txt tpcds-kit/** tpcds-sf-1/**