diff --git a/benchmarks/Dockerfile b/benchmarks/Dockerfile
deleted file mode 100644
index 704c863d20..0000000000
--- a/benchmarks/Dockerfile
+++ /dev/null
@@ -1,23 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-FROM apache/datafusion-comet:0.7.0-spark3.5.5-scala2.12-java11
-
-RUN apt update \
- && apt install -y git python3 python3-pip \
- && apt clean
-
-RUN cd /opt \
- && git clone https://github.com/apache/datafusion-benchmarks.git
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 7e2dfc9f2b..f1e8d39db6 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -17,88 +17,177 @@ specific language governing permissions and limitations
under the License.
-->
-# Running Comet Benchmarks in Microk8s
+# Comet Benchmark Suite
+
+Unified benchmark infrastructure for Apache DataFusion Comet. Supports the
+TPC-H/TPC-DS, shuffle, and microbenchmark suites across multiple engines
+(Spark, Comet, Gluten) with composable configuration and optional memory
+profiling.
+
+## Quick Start
+
+```bash
+# Run TPC-H with Comet on a standalone cluster
+python benchmarks/run.py \
+ --engine comet --profile standalone-tpch --restart-cluster \
+ -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \
+ --output . --iterations 1
+
+# Preview the spark-submit command without executing
+python benchmarks/run.py \
+ --engine comet --profile standalone-tpch --dry-run \
+ -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \
+ --output . --iterations 1
+```
+
+## Directory Layout
+
+```
+benchmarks/
+├── run.py # Entry point — builds and runs spark-submit
+├── conf/
+│ ├── engines/ # Per-engine configs (comet, spark, gluten, ...)
+│ └── profiles/ # Per-environment configs (local, standalone, docker)
+├── runner/
+│ ├── cli.py # Python CLI passed to spark-submit (subcommands: tpc, shuffle, micro)
+│ ├── config.py # Config file loader and merger
+│ ├── spark_session.py # SparkSession builder
+│ └── profiling.py # Level 1 JVM metrics via Spark REST API
+├── suites/
+│ ├── tpc.py # TPC-H / TPC-DS benchmark suite
+│ ├── shuffle.py # Shuffle benchmark suite (hash, round-robin)
+│ └── micro.py # Microbenchmark suite (string expressions, ...)
+├── analysis/
+│ ├── compare.py # Generate comparison charts from result JSON
+│ └── memory_report.py # Generate memory reports from profiling CSV
+├── infra/
+│   └── docker/          # Dockerfile, docker-compose, metrics collector
+├── create-iceberg-tpch.py # Utility: convert TPC-H Parquet to Iceberg tables
+└── drop-caches.sh # Utility: drop OS page caches before benchmarks
+```
-This guide explains how to run benchmarks derived from TPC-H and TPC-DS in Apache DataFusion Comet deployed in a
-local Microk8s cluster.
+## How It Works
-## Use Microk8s locally
+`run.py` is the single entry point. It:
-Install Micro8s following the instructions at https://microk8s.io/docs/getting-started and then perform these
-additional steps, ensuring that any existing kube config is backed up first.
+1. Reads a **profile** config (cluster shape, memory, master URL)
+2. Reads an **engine** config (plugin JARs, shuffle manager, engine-specific settings)
+3. Applies any `--conf key=value` CLI overrides (highest precedence)
+4. Builds and executes the `spark-submit` command
-```shell
-mkdir -p ~/.kube
-microk8s config > ~/.kube/config
+The merge order is: **profile < engine < CLI overrides**, so engine configs
+can override profile defaults (e.g., an engine can set `offHeap.enabled=false`
+even though the profile enables it).
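+
+A minimal sketch of that precedence, assuming the configs are loaded into
+plain dicts (the actual logic lives in `runner/config.py`):
+
+```python
+# Later dicts win: profile < engine < CLI overrides.
+profile = {"spark.memory.offHeap.enabled": "true"}
+engine = {"spark.memory.offHeap.enabled": "false"}
+cli_overrides = {"spark.executor.memory": "32g"}
+
+merged = {**profile, **engine, **cli_overrides}
+assert merged["spark.memory.offHeap.enabled"] == "false"
+```
+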
-microk8s enable dns
-microk8s enable registry
+### Wrapper arguments (before `--`)
-microk8s kubectl create serviceaccount spark
-```
+| Flag | Description |
+| ------------------- | ----------------------------------------------- |
+| `--engine NAME` | Engine config from `conf/engines/NAME.conf` |
+| `--profile NAME` | Profile config from `conf/profiles/NAME.conf` |
+| `--conf key=value` | Extra Spark/runner config override (repeatable) |
+| `--restart-cluster` | Stop/start Spark standalone master + worker |
+| `--dry-run` | Print spark-submit command without executing |
+
+### Suite arguments (after `--`)
+
+Everything after `--` is passed to `runner/cli.py`. See per-suite docs:
+
+- [TPC-H / TPC-DS](suites/TPC.md)
+- [Shuffle](suites/SHUFFLE.md)
+- [Microbenchmarks](suites/MICRO.md)
+
+## Available Engines
+
+| Engine | Config file | Description |
+| ---------------------- | ----------------------------------- | --------------------------------- |
+| `spark` | `engines/spark.conf` | Vanilla Spark (no accelerator) |
+| `comet` | `engines/comet.conf` | DataFusion Comet with native scan |
+| `comet-iceberg` | `engines/comet-iceberg.conf` | Comet + native Iceberg scanning |
+| `gluten` | `engines/gluten.conf` | Gluten (Velox backend) — Java 8 |
+| `spark-shuffle` | `engines/spark-shuffle.conf` | Spark baseline for shuffle tests |
+| `comet-jvm-shuffle` | `engines/comet-jvm-shuffle.conf` | Comet with JVM shuffle mode |
+| `comet-native-shuffle` | `engines/comet-native-shuffle.conf` | Comet with native shuffle |
+
+## Available Profiles
-## Build Comet Docker Image
+| Profile | Config file | Description |
+| ------------------ | -------------------------------- | ------------------------------ |
+| `local` | `profiles/local.conf` | `local[*]` mode, no cluster |
+| `standalone-tpch` | `profiles/standalone-tpch.conf` | 1 executor, 8 cores, S3A |
+| `standalone-tpcds` | `profiles/standalone-tpcds.conf` | 2 executors, 16 cores, S3A |
+| `docker` | `profiles/docker.conf` | For docker-compose deployments |
-Run the following command from the root of this repository to build the Comet Docker image, or use a published
-Docker image from https://github.com/orgs/apache/packages?repo_name=datafusion-comet
+## Environment Variables
-```shell
-docker build -t apache/datafusion-comet -f kube/Dockerfile .
+The config files use `${VAR}` references that are expanded from the
+environment at load time:
+
+| Variable | Used by | Description |
+| -------------- | -------------------- | --------------------------------- |
+| `SPARK_HOME` | `run.py` | Path to Spark installation |
+| `SPARK_MASTER` | standalone profiles | Spark master URL |
+| `COMET_JAR` | comet engines | Path to Comet JAR |
+| `GLUTEN_JAR` | gluten engine | Path to Gluten JAR |
+| `ICEBERG_JAR` | comet-iceberg engine | Path to Iceberg Spark runtime JAR |
+
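+A sketch of how a `${VAR}` reference might be expanded at load time
+(`os.path.expandvars` is an illustrative choice, not necessarily what
+`runner/config.py` uses):
+
+```python
+import os
+
+os.environ.setdefault("COMET_JAR", "/jars/comet.jar")  # example value
+line = "runner.jars=${COMET_JAR}"
+key, _, value = line.partition("=")
+print(key, os.path.expandvars(value))  # runner.jars /jars/comet.jar
+```
+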
+## Profiling
+
+Add the suite-level `--profile` flag (not to be confused with the wrapper's
+`--profile NAME` option) to any suite command to enable Level 1 JVM metrics
+collection via the Spark REST API:
+
+```bash
+python benchmarks/run.py --engine comet --profile standalone-tpch \
+ -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \
+ --output . --iterations 1 --profile --profile-interval 1.0
```
-## Build Comet Benchmark Docker Image
+This writes a `{name}-{benchmark}-metrics.csv` alongside the result JSON.
+
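+Under the hood the profiler polls Spark's monitoring REST API. A minimal
+sketch of taking one sample (the endpoints are standard Spark; the URL and
+loop are illustrative, not the actual `runner/profiling.py` code):
+
+```python
+import requests
+
+# One executor-memory snapshot from the Spark monitoring REST API.
+base = "http://localhost:4040/api/v1"
+app_id = requests.get(f"{base}/applications").json()[0]["id"]
+for ex in requests.get(f"{base}/applications/{app_id}/executors").json():
+    print(ex["id"], ex["memoryUsed"], ex["maxMemory"])
+```
+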
+For container-level memory profiling, use the constrained docker-compose
+overlay — see [Docker infrastructure](infra/docker/).
+
+## Generating Charts
-Build the benchmark Docker image and push to the Microk8s Docker registry.
+```bash
+# Compare two result JSON files
+python -m benchmarks.analysis.compare \
+ comet-tpch-*.json spark-tpch-*.json \
+ --labels Comet Spark --benchmark tpch \
+ --title "TPC-H SF100" --output-dir ./charts
-```shell
-docker build -t apache/datafusion-comet-tpcbench .
-docker tag apache/datafusion-comet-tpcbench localhost:32000/apache/datafusion-comet-tpcbench:latest
-docker push localhost:32000/apache/datafusion-comet-tpcbench:latest
+# Generate memory reports
+python -m benchmarks.analysis.memory_report \
+ --spark-csv comet-tpch-metrics.csv \
+ --container-csv container-metrics.csv \
+ --output-dir ./charts
```
-## Run benchmarks
-
-```shell
-export SPARK_MASTER=k8s://https://127.0.0.1:16443
-export COMET_DOCKER_IMAGE=localhost:32000/apache/datafusion-comet-tpcbench:latest
-# Location of Comet JAR within the Docker image
-export COMET_JAR=/opt/spark/jars/comet-spark-spark3.4_2.12-0.5.0-SNAPSHOT.jar
-
-$SPARK_HOME/bin/spark-submit \
- --master $SPARK_MASTER \
- --deploy-mode cluster \
- --name comet-tpcbench \
- --driver-memory 8G \
- --conf spark.driver.memory=8G \
- --conf spark.executor.instances=1 \
- --conf spark.executor.memory=32G \
- --conf spark.executor.cores=8 \
- --conf spark.cores.max=8 \
- --conf spark.task.cpus=1 \
- --conf spark.executor.memoryOverhead=3G \
- --jars local://$COMET_JAR \
- --conf spark.executor.extraClassPath=$COMET_JAR \
- --conf spark.driver.extraClassPath=$COMET_JAR \
- --conf spark.plugins=org.apache.spark.CometPlugin \
- --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
- --conf spark.comet.enabled=true \
- --conf spark.comet.exec.enabled=true \
- --conf spark.comet.exec.all.enabled=true \
- --conf spark.comet.cast.allowIncompatible=true \
- --conf spark.comet.exec.shuffle.enabled=true \
- --conf spark.comet.exec.shuffle.mode=auto \
- --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
- --conf spark.kubernetes.namespace=default \
- --conf spark.kubernetes.driver.pod.name=tpcbench \
- --conf spark.kubernetes.container.image=$COMET_DOCKER_IMAGE \
- --conf spark.kubernetes.driver.volumes.hostPath.tpcdata.mount.path=/mnt/bigdata/tpcds/sf100/ \
- --conf spark.kubernetes.driver.volumes.hostPath.tpcdata.options.path=/mnt/bigdata/tpcds/sf100/ \
- --conf spark.kubernetes.executor.volumes.hostPath.tpcdata.mount.path=/mnt/bigdata/tpcds/sf100/ \
- --conf spark.kubernetes.executor.volumes.hostPath.tpcdata.options.path=/mnt/bigdata/tpcds/sf100/ \
- --conf spark.kubernetes.authenticate.caCertFile=/var/snap/microk8s/current/certs/ca.crt \
- local:///opt/datafusion-benchmarks/runners/datafusion-comet/tpcbench.py \
- --benchmark tpcds \
- --data /mnt/bigdata/tpcds/sf100/ \
- --queries /opt/datafusion-benchmarks/tpcds/queries-spark \
- --iterations 1
+## Running in Docker
+
+See [infra/docker/](infra/docker/) for docker-compose setup with optional
+memory-constrained overlays and cgroup metrics collection.
+
+The Docker image includes both Java 8 and Java 17 runtimes. Java 17 is the
+default (`JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64`), which is required
+by Comet. Gluten requires Java 8, so override `JAVA_HOME` for all containers
+when running Gluten benchmarks:
+
+```bash
+# Start the cluster (the worker itself must also run Java 8 for Gluten;
+# see the note below)
+docker compose -f benchmarks/infra/docker/docker-compose.yml up -d
+
+# Run Gluten benchmark (override JAVA_HOME on all containers)
+docker compose -f benchmarks/infra/docker/docker-compose.yml run --rm \
+ -e JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \
+ -e GLUTEN_JAR=/jars/gluten.jar \
+ bench bash -c 'python3 /opt/benchmarks/run.py \
+ --engine gluten --profile docker \
+ -- tpc --name gluten --benchmark tpch --data /data \
+ --queries /queries --output /results --iterations 1'
```
+
+> **Note:** The Spark worker must also run Java 8 for Gluten. Use a
+> docker-compose override file to set `JAVA_HOME` on `spark-master` and
+> `spark-worker` services before starting the cluster, or restart the
+> cluster between engine switches.
+
diff --git a/benchmarks/analysis/__init__.py b/benchmarks/analysis/__init__.py
new file mode 100644
index 0000000000..0ccbeeeafb
--- /dev/null
+++ b/benchmarks/analysis/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/benchmarks/analysis/compare.py b/benchmarks/analysis/compare.py
new file mode 100644
index 0000000000..b9a24acc57
--- /dev/null
+++ b/benchmarks/analysis/compare.py
@@ -0,0 +1,269 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Compare benchmark results and generate charts.
+
+Reads the JSON output produced by ``suites/tpc.py`` (integer query keys
+serialised as strings by ``json.dumps``).
+
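+Each value is the list of per-iteration timings in seconds, e.g.
+``{"1": [12.3, 11.9], "2": [4.1, 4.0]}``.
+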
+Usage::
+
+ python -m benchmarks.analysis.compare \\
+ comet-tpch-*.json spark-tpch-*.json \\
+ --labels comet spark --benchmark tpch --title "SF100" \\
+ --output-dir ./charts
+"""
+
+import argparse
+import json
+import os
+from typing import Any, Dict, List, Sequence, Tuple
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+QUERY_COUNTS = {"tpch": 22, "tpcds": 99}
+
+
+def _query_range(benchmark: str) -> range:
+ n = QUERY_COUNTS.get(benchmark)
+ if n is None:
+ raise ValueError(f"Unknown benchmark: {benchmark}")
+ return range(1, n + 1)
+
+
+def _median(timings: List[float]) -> float:
+ return float(np.median(np.array(timings)))
+
+
+# ---------------------------------------------------------------------------
+# Chart generators
+# ---------------------------------------------------------------------------
+
+def generate_summary_chart(
+ results: Sequence[Dict[str, Any]],
+ labels: Sequence[str],
+ benchmark: str,
+ title: str,
+ output_dir: str = ".",
+) -> str:
+ """Total wall-clock bar chart. Returns the output path."""
+    num_queries = len(_query_range(benchmark))  # also validates the name
+ timings = [0.0] * len(results)
+ for query in _query_range(benchmark):
+ for i, r in enumerate(results):
+ timings[i] += _median(r[str(query)])
+
+ fig, ax = plt.subplots(figsize=(10, 6))
+ ax.set_title(title)
+ ax.set_ylabel(
+ f"Time in seconds to run all {num_queries} {benchmark} queries "
+ f"(lower is better)"
+ )
+    times = [int(round(x)) for x in timings]
+ bars = ax.bar(labels, times, color="skyblue", width=0.8)
+ for bar in bars:
+ yval = bar.get_height()
+ ax.text(
+ bar.get_x() + bar.get_width() / 2.0, yval, f"{yval}",
+ va="bottom", ha="center",
+ )
+ path = os.path.join(output_dir, f"{benchmark}_allqueries.png")
+ plt.savefig(path, format="png")
+ plt.close(fig)
+ return path
+
+
+def generate_comparison_chart(
+ results: Sequence[Dict[str, Any]],
+ labels: Sequence[str],
+ benchmark: str,
+ title: str,
+ output_dir: str = ".",
+) -> str:
+ """Per-query grouped bar chart. Returns the output path."""
+ queries: List[str] = []
+ benches: List[List[float]] = [[] for _ in results]
+ for query in _query_range(benchmark):
+ queries.append(f"q{query}")
+ for i, r in enumerate(results):
+ benches[i].append(_median(r[str(query)]))
+
+ bar_width = 0.3
+ index = np.arange(len(queries)) * 1.5
+ fig_w = 15 if benchmark == "tpch" else 35
+ fig, ax = plt.subplots(figsize=(fig_w, 6))
+
+ for i, label in enumerate(labels):
+ ax.bar(index + i * bar_width, benches[i], bar_width, label=label)
+
+ ax.set_title(title)
+ ax.set_xlabel("Queries")
+ ax.set_ylabel("Query Time (seconds)")
+    ax.set_xticks(index + bar_width * (len(labels) - 1) / 2)
+ ax.set_xticklabels(queries)
+ ax.legend()
+
+ path = os.path.join(output_dir, f"{benchmark}_queries_compare.png")
+ plt.savefig(path, format="png")
+ plt.close(fig)
+ return path
+
+
+def _speedup_data(
+ baseline: Dict, comparison: Dict, benchmark: str, absolute: bool,
+) -> Tuple[List[str], List[float]]:
+ """Compute per-query speedup (relative % or absolute seconds)."""
+ rows: List[Tuple[str, float]] = []
+ for query in _query_range(benchmark):
+ a = _median(baseline[str(query)])
+ b = _median(comparison[str(query)])
+ if absolute:
+ rows.append((f"q{query}", round(a - b, 1)))
+ else:
+ if a > b:
+ speedup = a / b - 1
+ else:
+ speedup = -(1 / (a / b) - 1)
+ rows.append((f"q{query}", round(speedup * 100, 0)))
+ rows.sort(key=lambda x: -x[1])
+ qs, vals = zip(*rows)
+ return list(qs), list(vals)
+
+
+def generate_speedup_chart(
+ baseline: Dict[str, Any],
+ comparison: Dict[str, Any],
+ label1: str,
+ label2: str,
+ benchmark: str,
+ title: str,
+ absolute: bool = False,
+ output_dir: str = ".",
+) -> str:
+ """Relative (%) or absolute (seconds) speedup chart. Returns path."""
+ queries, speedups = _speedup_data(baseline, comparison, benchmark, absolute)
+
+ fig_w = 10 if benchmark == "tpch" else 35
+ fig_h = 6 if benchmark == "tpch" else 10
+ fig, ax = plt.subplots(figsize=(fig_w, fig_h))
+ bars = ax.bar(queries, speedups, color="skyblue")
+
+ for bar, val in zip(bars, speedups):
+ yval = bar.get_height()
+ fmt = f"{val:.1f}" if absolute else f"{val:.0f}%"
+ va = "bottom" if yval >= 0 else "top"
+ y = min(800, yval + 5) if yval >= 0 else yval
+ ax.text(
+ bar.get_x() + bar.get_width() / 2.0, y, fmt,
+ va=va, ha="center", fontsize=8, color="blue", rotation=90,
+ )
+
+ kind = "seconds" if absolute else "percentage"
+ suffix = "abs" if absolute else "rel"
+ ylabel = "Speedup (in seconds)" if absolute else "Speedup Percentage (100% speedup = 2x faster)"
+ ax.set_title(f"{label2} speedup over {label1} ({title})")
+ ax.set_ylabel(ylabel)
+ ax.set_xlabel("Query")
+ ax.axhline(0, color="black", linewidth=0.8)
+ ax.yaxis.grid(True)
+
+ if not absolute:
+ min_val = (min(speedups) // 100) * 100
+ max_val = ((max(speedups) // 100) + 1) * 100 + 50
+ if benchmark == "tpch":
+ ax.set_ylim(min_val, max_val)
+ else:
+ ax.set_ylim(-250, 300)
+ else:
+ ax.set_ylim(min(speedups) * 2 - 20, max(speedups) * 1.5)
+
+ path = os.path.join(output_dir, f"{benchmark}_queries_speedup_{suffix}.png")
+ plt.savefig(path, format="png")
+ plt.close(fig)
+ return path
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def compare(
+ files: Sequence[str],
+ labels: Sequence[str],
+ benchmark: str,
+ title: str,
+ output_dir: str = ".",
+) -> List[str]:
+ """Run all applicable charts. Returns list of output file paths."""
+ os.makedirs(output_dir, exist_ok=True)
+ results = []
+ for filename in files:
+ with open(filename) as f:
+ results.append(json.load(f))
+
+ paths = [
+ generate_summary_chart(results, labels, benchmark, title, output_dir),
+ generate_comparison_chart(results, labels, benchmark, title, output_dir),
+ ]
+
+ if len(files) == 2:
+ paths.append(
+ generate_speedup_chart(
+ results[0], results[1], labels[0], labels[1],
+ benchmark, title, absolute=True, output_dir=output_dir,
+ )
+ )
+ paths.append(
+ generate_speedup_chart(
+ results[0], results[1], labels[0], labels[1],
+ benchmark, title, absolute=False, output_dir=output_dir,
+ )
+ )
+
+ for p in paths:
+ print(f"Wrote {p}")
+ return paths
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def main(argv=None):
+ parser = argparse.ArgumentParser(
+ description="Compare benchmark results and generate charts",
+ )
+ parser.add_argument("filenames", nargs="+", help="JSON result files")
+ parser.add_argument("--labels", nargs="+", required=True, help="Labels for each file")
+ parser.add_argument("--benchmark", required=True, help="tpch or tpcds")
+ parser.add_argument("--title", required=True, help="Chart title")
+ parser.add_argument("--output-dir", default=".", help="Directory for chart PNGs")
+ args = parser.parse_args(argv)
+
+ if len(args.filenames) != len(args.labels):
+ parser.error("Number of filenames must match number of labels")
+
+ compare(args.filenames, args.labels, args.benchmark, args.title, args.output_dir)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/analysis/memory_report.py b/benchmarks/analysis/memory_report.py
new file mode 100644
index 0000000000..e77ff10dc0
--- /dev/null
+++ b/benchmarks/analysis/memory_report.py
@@ -0,0 +1,241 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Parse profiling output and generate memory utilisation reports.
+
+Supports two data sources:
+
+1. **Spark REST API metrics** — CSV written by ``runner/profiling.py``
+ (``SparkMetricsProfiler``). Columns include ``elapsed_secs``,
+ ``executor_id``, ``memoryUsed``, ``maxMemory``, and various peak metrics.
+
+2. **Container cgroup metrics** — CSV written by
+ ``infra/docker/collect-metrics.sh``. Columns:
+ ``timestamp_ms, memory_usage_bytes, memory_limit_bytes, rss_bytes,
+ cache_bytes, swap_bytes``.
+
+Usage::
+
+ python -m benchmarks.analysis.memory_report \\
+ --spark-csv results/comet-tpch-metrics.csv \\
+ --container-csv results/container-metrics.csv \\
+ --output-dir ./charts
+"""
+
+import argparse
+import csv
+import os
+from typing import Dict, List
+
+import matplotlib.pyplot as plt
+
+
+# ---------------------------------------------------------------------------
+# Spark REST API metrics
+# ---------------------------------------------------------------------------
+
+def parse_spark_csv(path: str) -> Dict[str, List[Dict]]:
+ """Parse a SparkMetricsProfiler CSV into per-executor time series.
+
+ Returns ``{executor_id: [{elapsed_secs, memoryUsed, maxMemory, ...}]}``
+ """
+ executors: Dict[str, List[Dict]] = {}
+ with open(path, newline="") as f:
+ reader = csv.DictReader(f)
+ for row in reader:
+ eid = row.get("executor_id", "unknown")
+ parsed = {}
+ for k, v in row.items():
+ try:
+ parsed[k] = float(v)
+ except (ValueError, TypeError):
+ parsed[k] = v
+ executors.setdefault(eid, []).append(parsed)
+ return executors
+
+
+def generate_spark_memory_chart(
+ spark_csv: str,
+ output_dir: str = ".",
+) -> List[str]:
+ """Generate per-executor memory usage over time. Returns output paths."""
+ executors = parse_spark_csv(spark_csv)
+ paths = []
+
+ for eid, samples in executors.items():
+ elapsed = [s.get("elapsed_secs", 0) for s in samples]
+ used = [s.get("memoryUsed", 0) / (1024 ** 2) for s in samples] # MB
+ max_mem = [s.get("maxMemory", 0) / (1024 ** 2) for s in samples]
+
+ fig, ax = plt.subplots(figsize=(12, 5))
+ ax.plot(elapsed, used, label="memoryUsed", linewidth=1.5)
+ if any(m > 0 for m in max_mem):
+ ax.plot(elapsed, max_mem, label="maxMemory", linestyle="--", alpha=0.6)
+ ax.set_xlabel("Elapsed (seconds)")
+ ax.set_ylabel("Memory (MB)")
+ ax.set_title(f"Executor {eid} — JVM Memory Usage")
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ fname = f"spark_memory_executor_{eid}.png"
+ path = os.path.join(output_dir, fname)
+ plt.savefig(path, format="png")
+ plt.close(fig)
+ paths.append(path)
+
+ # Peak memory bar chart across executors
+ if executors:
+ fig, ax = plt.subplots(figsize=(max(6, len(executors) * 1.5), 5))
+ eids = list(executors.keys())
+ peaks = []
+ for eid in eids:
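+            # peak_JVMHeapMemory / peak_JVMOffHeapMemory come from the
+            # profiler CSV (presumably flattened REST peakMemoryMetrics);
+            # their sum approximates the executor's total JVM footprint.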
+ peak = max(
+ (s.get("peak_JVMHeapMemory", 0) + s.get("peak_JVMOffHeapMemory", 0))
+ for s in executors[eid]
+ ) / (1024 ** 2)
+ peaks.append(peak)
+
+ bars = ax.bar(eids, peaks, color="coral")
+ for bar, val in zip(bars, peaks):
+ ax.text(
+ bar.get_x() + bar.get_width() / 2.0, val,
+ f"{val:.0f}", va="bottom", ha="center", fontsize=9,
+ )
+ ax.set_xlabel("Executor")
+ ax.set_ylabel("Peak JVM Memory (MB)")
+ ax.set_title("Peak JVM Memory by Executor")
+ ax.grid(True, axis="y", alpha=0.3)
+
+ path = os.path.join(output_dir, "spark_memory_peak.png")
+ plt.savefig(path, format="png")
+ plt.close(fig)
+ paths.append(path)
+
+ for p in paths:
+ print(f"Wrote {p}")
+ return paths
+
+
+# ---------------------------------------------------------------------------
+# Container cgroup metrics
+# ---------------------------------------------------------------------------
+
+def parse_container_csv(path: str) -> List[Dict[str, float]]:
+ """Parse a collect-metrics.sh CSV into a list of samples."""
+ samples = []
+ with open(path, newline="") as f:
+ reader = csv.DictReader(f)
+ for row in reader:
+ parsed = {}
+ for k, v in row.items():
+ try:
+ parsed[k] = float(v)
+ except (ValueError, TypeError):
+ parsed[k] = v
+ samples.append(parsed)
+ return samples
+
+
+def generate_container_memory_chart(
+ container_csv: str,
+ output_dir: str = ".",
+) -> List[str]:
+ """Generate container memory usage over time. Returns output paths."""
+ samples = parse_container_csv(container_csv)
+ if not samples:
+ print("No container metrics samples found")
+ return []
+
+ t0 = samples[0].get("timestamp_ms", 0)
+ elapsed = [(s.get("timestamp_ms", 0) - t0) / 1000.0 for s in samples]
+ usage_mb = [s.get("memory_usage_bytes", 0) / (1024 ** 2) for s in samples]
+ rss_mb = [s.get("rss_bytes", 0) / (1024 ** 2) for s in samples]
+ cache_mb = [s.get("cache_bytes", 0) / (1024 ** 2) for s in samples]
+ limit_mb = [s.get("memory_limit_bytes", 0) / (1024 ** 2) for s in samples]
+
+ fig, ax = plt.subplots(figsize=(12, 5))
+ ax.plot(elapsed, usage_mb, label="total usage", linewidth=1.5)
+ ax.plot(elapsed, rss_mb, label="RSS", linewidth=1.2)
+ ax.plot(elapsed, cache_mb, label="cache", linewidth=1.0, alpha=0.7)
+ if any(m > 0 for m in limit_mb):
+ ax.axhline(
+ limit_mb[0], color="red", linestyle="--", linewidth=1.0,
+ label=f"limit ({limit_mb[0]:.0f} MB)",
+ )
+ ax.set_xlabel("Elapsed (seconds)")
+ ax.set_ylabel("Memory (MB)")
+ ax.set_title("Container Memory Usage (cgroup)")
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ paths = []
+ path = os.path.join(output_dir, "container_memory.png")
+ plt.savefig(path, format="png")
+ plt.close(fig)
+ paths.append(path)
+
+ # Summary stats
+ peak_usage = max(usage_mb)
+ peak_rss = max(rss_mb)
+ limit = limit_mb[0] if limit_mb else 0
+ print(f"Container memory summary:")
+ print(f" Peak usage: {peak_usage:.0f} MB")
+ print(f" Peak RSS: {peak_rss:.0f} MB")
+ if limit > 0:
+ print(f" Limit: {limit:.0f} MB")
+ print(f" Peak % used: {peak_usage / limit * 100:.1f}%")
+
+ for p in paths:
+ print(f"Wrote {p}")
+ return paths
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def main(argv=None):
+ parser = argparse.ArgumentParser(
+ description="Generate memory utilisation reports from profiling data",
+ )
+ parser.add_argument(
+ "--spark-csv", help="Path to SparkMetricsProfiler CSV",
+ )
+ parser.add_argument(
+ "--container-csv", help="Path to collect-metrics.sh CSV",
+ )
+ parser.add_argument(
+ "--output-dir", default=".", help="Directory for chart PNGs",
+ )
+ args = parser.parse_args(argv)
+
+ if not args.spark_csv and not args.container_csv:
+ parser.error("At least one of --spark-csv or --container-csv is required")
+
+ os.makedirs(args.output_dir, exist_ok=True)
+
+ if args.spark_csv:
+ generate_spark_memory_chart(args.spark_csv, args.output_dir)
+ if args.container_csv:
+ generate_container_memory_chart(args.container_csv, args.output_dir)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/conf/engines/comet-iceberg.conf b/benchmarks/conf/engines/comet-iceberg.conf
new file mode 100644
index 0000000000..bfdfd4ccf6
--- /dev/null
+++ b/benchmarks/conf/engines/comet-iceberg.conf
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# DataFusion Comet with native Iceberg scanning.
+# Catalog configs (spark.sql.catalog.*, spark.sql.defaultCatalog) should be
+# passed via --conf CLI overrides since the catalog name is user-specific.
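+#
+# Example overrides (catalog name "demo" and warehouse path are hypothetical):
+#   --conf spark.sql.catalog.demo=org.apache.iceberg.spark.SparkCatalog
+#   --conf spark.sql.catalog.demo.type=hadoop
+#   --conf spark.sql.catalog.demo.warehouse=s3a://bucket/warehouse
+#   --conf spark.sql.defaultCatalog=demo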
+runner.name=comet-iceberg
+runner.jars=${COMET_JAR},${ICEBERG_JAR}
+
+spark.driver.extraClassPath=${COMET_JAR}:${ICEBERG_JAR}
+spark.executor.extraClassPath=${COMET_JAR}:${ICEBERG_JAR}
+spark.plugins=org.apache.spark.CometPlugin
+spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager
+spark.comet.exec.replaceSortMergeJoin=true
+spark.comet.expression.Cast.allowIncompatible=true
+spark.comet.enabled=true
+spark.comet.exec.enabled=true
+spark.comet.scan.icebergNative.enabled=true
+spark.comet.explainFallback.enabled=true
diff --git a/benchmarks/conf/engines/comet-jvm-shuffle.conf b/benchmarks/conf/engines/comet-jvm-shuffle.conf
new file mode 100644
index 0000000000..12b3d23a18
--- /dev/null
+++ b/benchmarks/conf/engines/comet-jvm-shuffle.conf
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Comet with JVM shuffle mode — for shuffle benchmarks.
+runner.name=comet-jvm-shuffle
+runner.jars=${COMET_JAR}
+
+spark.driver.extraClassPath=${COMET_JAR}
+spark.executor.extraClassPath=${COMET_JAR}
+spark.memory.offHeap.enabled=true
+spark.memory.offHeap.size=16g
+spark.comet.enabled=true
+spark.comet.operator.DataWritingCommandExec.allowIncompatible=true
+spark.comet.parquet.write.enabled=true
+spark.comet.logFallbackReasons.enabled=true
+spark.comet.explainFallback.enabled=true
+spark.comet.shuffle.mode=jvm
+spark.comet.exec.shuffle.mode=jvm
+spark.comet.exec.replaceSortMergeJoin=true
+spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager
+spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions
+spark.comet.cast.allowIncompatible=true
diff --git a/benchmarks/conf/engines/comet-native-shuffle.conf b/benchmarks/conf/engines/comet-native-shuffle.conf
new file mode 100644
index 0000000000..0df2eac0c6
--- /dev/null
+++ b/benchmarks/conf/engines/comet-native-shuffle.conf
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Comet with native shuffle mode — for shuffle benchmarks.
+runner.name=comet-native-shuffle
+runner.jars=${COMET_JAR}
+
+spark.driver.extraClassPath=${COMET_JAR}
+spark.executor.extraClassPath=${COMET_JAR}
+spark.memory.offHeap.enabled=true
+spark.memory.offHeap.size=16g
+spark.comet.enabled=true
+spark.comet.operator.DataWritingCommandExec.allowIncompatible=true
+spark.comet.parquet.write.enabled=true
+spark.comet.logFallbackReasons.enabled=true
+spark.comet.explainFallback.enabled=true
+spark.comet.exec.shuffle.mode=native
+spark.comet.exec.replaceSortMergeJoin=true
+spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager
+spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions
+spark.comet.cast.allowIncompatible=true
diff --git a/benchmarks/conf/engines/comet.conf b/benchmarks/conf/engines/comet.conf
new file mode 100644
index 0000000000..257fd7dd56
--- /dev/null
+++ b/benchmarks/conf/engines/comet.conf
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# DataFusion Comet accelerator.
+runner.name=comet
+runner.jars=${COMET_JAR}
+
+spark.driver.extraClassPath=${COMET_JAR}
+spark.executor.extraClassPath=${COMET_JAR}
+spark.plugins=org.apache.spark.CometPlugin
+spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager
+spark.comet.scan.impl=native_datafusion
+spark.comet.exec.replaceSortMergeJoin=true
+spark.comet.expression.Cast.allowIncompatible=true
diff --git a/benchmarks/conf/engines/gluten.conf b/benchmarks/conf/engines/gluten.conf
new file mode 100644
index 0000000000..91599c5bde
--- /dev/null
+++ b/benchmarks/conf/engines/gluten.conf
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Gluten accelerator.
+runner.name=gluten
+runner.jars=${GLUTEN_JAR}
+runner.env.TZ=UTC
+
+spark.driver.extraClassPath=${GLUTEN_JAR}
+spark.executor.extraClassPath=${GLUTEN_JAR}
+spark.plugins=org.apache.gluten.GlutenPlugin
+spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager
+spark.gluten.sql.columnar.forceShuffledHashJoin=true
+spark.sql.session.timeZone=UTC
diff --git a/benchmarks/conf/engines/spark-shuffle.conf b/benchmarks/conf/engines/spark-shuffle.conf
new file mode 100644
index 0000000000..2b087a129b
--- /dev/null
+++ b/benchmarks/conf/engines/spark-shuffle.conf
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Vanilla Spark baseline for shuffle benchmarks — Comet explicitly disabled.
+runner.name=spark-shuffle
+
+spark.comet.enabled=false
+spark.comet.exec.shuffle.enabled=false
diff --git a/benchmarks/conf/engines/spark.conf b/benchmarks/conf/engines/spark.conf
new file mode 100644
index 0000000000..e1831c4ae5
--- /dev/null
+++ b/benchmarks/conf/engines/spark.conf
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Vanilla Spark — no accelerator plugin.
+runner.name=spark
diff --git a/benchmarks/conf/profiles/docker.conf b/benchmarks/conf/profiles/docker.conf
new file mode 100644
index 0000000000..9b2bec6841
--- /dev/null
+++ b/benchmarks/conf/profiles/docker.conf
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Profile for running inside docker-compose (see infra/docker/).
+# Data is mounted at /data, queries at /queries, results at /results.
+runner.master=${SPARK_MASTER}
+
+spark.driver.memory=8G
+spark.executor.instances=1
+spark.executor.cores=8
+spark.cores.max=8
+spark.executor.memory=16g
+spark.memory.offHeap.enabled=true
+spark.memory.offHeap.size=16g
+spark.eventLog.enabled=true
diff --git a/benchmarks/conf/profiles/local.conf b/benchmarks/conf/profiles/local.conf
new file mode 100644
index 0000000000..75bb8454b3
--- /dev/null
+++ b/benchmarks/conf/profiles/local.conf
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Local mode — runs on local[*] with no master URL.
+runner.master=local[*]
+
+spark.driver.memory=8G
+spark.executor.memory=16g
+spark.memory.offHeap.enabled=true
+spark.memory.offHeap.size=16g
diff --git a/benchmarks/conf/profiles/standalone-tpcds.conf b/benchmarks/conf/profiles/standalone-tpcds.conf
new file mode 100644
index 0000000000..c892a7e77f
--- /dev/null
+++ b/benchmarks/conf/profiles/standalone-tpcds.conf
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Standalone cluster profile for TPC-DS: 2 executors, 16 cores total.
+runner.master=${SPARK_MASTER}
+
+spark.driver.memory=8G
+spark.executor.instances=2
+spark.executor.cores=8
+spark.cores.max=16
+spark.executor.memory=16g
+spark.memory.offHeap.enabled=true
+spark.memory.offHeap.size=16g
+spark.eventLog.enabled=true
+spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
+spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain
diff --git a/benchmarks/conf/profiles/standalone-tpch.conf b/benchmarks/conf/profiles/standalone-tpch.conf
new file mode 100644
index 0000000000..024a7364f3
--- /dev/null
+++ b/benchmarks/conf/profiles/standalone-tpch.conf
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Standalone cluster profile for TPC-H: 1 executor, 8 cores.
+runner.master=${SPARK_MASTER}
+
+spark.driver.memory=8G
+spark.executor.instances=1
+spark.executor.cores=8
+spark.cores.max=8
+spark.executor.memory=16g
+spark.memory.offHeap.enabled=true
+spark.memory.offHeap.size=16g
+spark.eventLog.enabled=true
+spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
+spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain
diff --git a/dev/benchmarks/create-iceberg-tpch.py b/benchmarks/create-iceberg-tpch.py
similarity index 100%
rename from dev/benchmarks/create-iceberg-tpch.py
rename to benchmarks/create-iceberg-tpch.py
diff --git a/dev/benchmarks/drop-caches.sh b/benchmarks/drop-caches.sh
similarity index 100%
rename from dev/benchmarks/drop-caches.sh
rename to benchmarks/drop-caches.sh
diff --git a/benchmarks/pyspark/generate_data.py b/benchmarks/generate_shuffle_data.py
old mode 100755
new mode 100644
similarity index 100%
rename from benchmarks/pyspark/generate_data.py
rename to benchmarks/generate_shuffle_data.py
diff --git a/benchmarks/infra/docker/Dockerfile b/benchmarks/infra/docker/Dockerfile
new file mode 100644
index 0000000000..861411819a
--- /dev/null
+++ b/benchmarks/infra/docker/Dockerfile
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Unified benchmark image for running TPC and shuffle benchmarks across
+# engines (Spark, Comet, Gluten).
+#
+# Build:
+# docker build -t comet-bench -f benchmarks/infra/docker/Dockerfile .
+#
+# The build context should be the repository root so that benchmarks/ is
+# available.
+
+ARG SPARK_IMAGE=apache/spark:3.5.2-python3
+FROM ${SPARK_IMAGE}
+
+USER root
+
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends python3 python3-pip procps \
+ openjdk-8-jre-headless openjdk-17-jre-headless \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+# Default to Java 17 (Comet). Override with JAVA_HOME for other engines.
+ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
+
+# Copy the unified benchmark runner into the image.
+COPY benchmarks/conf /opt/benchmarks/conf
+COPY benchmarks/runner /opt/benchmarks/runner
+COPY benchmarks/suites /opt/benchmarks/suites
+COPY benchmarks/queries /opt/benchmarks/queries
+COPY benchmarks/run.py /opt/benchmarks/run.py
+
+# Copy the metrics collector script.
+COPY benchmarks/infra/docker/collect-metrics.sh /opt/benchmarks/collect-metrics.sh
+RUN chmod +x /opt/benchmarks/collect-metrics.sh
+
+# Engine JARs are bind-mounted or copied in at runtime via --jars.
+# Data and query paths are also bind-mounted.
+
+ENV PYTHONPATH="/opt:${PYTHONPATH}"
+
+WORKDIR /opt/benchmarks
+
+# Redeclare spark_uid from the base image (apache/spark defaults to uid 185);
+# ARG values do not survive FROM without redeclaration.
+ARG spark_uid=185
+USER ${spark_uid}
diff --git a/benchmarks/infra/docker/collect-metrics.sh b/benchmarks/infra/docker/collect-metrics.sh
new file mode 100755
index 0000000000..fd9c1d848f
--- /dev/null
+++ b/benchmarks/infra/docker/collect-metrics.sh
@@ -0,0 +1,103 @@
+#!/bin/sh
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Container-level memory metrics collector.
+#
+# Polls cgroup memory stats at a fixed interval and writes a CSV with
+# columns: timestamp_ms, memory_usage_bytes, memory_limit_bytes, rss_bytes,
+# cache_bytes, swap_bytes.
+#
+# Works with both cgroup v1 and v2.
+#
+# Usage:
+# collect-metrics.sh [INTERVAL_SECS] [OUTPUT_CSV]
+#
+# Defaults: interval=1, output=/results/container-metrics.csv
+
+set -e
+
+INTERVAL="${1:-1}"
+OUTPUT="${2:-/results/container-metrics.csv}"
+
+# Detect cgroup version
+if [ -f /sys/fs/cgroup/memory/memory.usage_in_bytes ]; then
+ CGROUP_VERSION=1
+elif [ -f /sys/fs/cgroup/memory.current ]; then
+ CGROUP_VERSION=2
+else
+ echo "Warning: cannot detect cgroup memory files; polling disabled" >&2
+ # Still write a header so downstream tools don't break on a missing file.
+ echo "timestamp_ms,memory_usage_bytes,memory_limit_bytes,rss_bytes,cache_bytes,swap_bytes" > "$OUTPUT"
+ # Sleep forever so the container stays up (compose expects it to keep running).
+ exec sleep infinity
+fi
+
+# ---- helpers ----
+
+read_file() {
+ # Return the contents of a file, or "0" if it doesn't exist.
+ if [ -f "$1" ]; then cat "$1"; else echo "0"; fi
+}
+
+read_stat() {
+    # Extract a named field from memory.stat ("key value" format, v1 and v2).
+    # Note: awk exits 0 even when grep matches nothing, so `|| echo 0` on the
+    # pipeline would never fire; default the empty capture to 0 instead.
+    val=$(grep "^$1 " "$2" 2>/dev/null | awk '{print $2}')
+    echo "${val:-0}"
+}
+
+poll_v1() {
+ local usage limit rss cache swap
+ usage=$(read_file /sys/fs/cgroup/memory/memory.usage_in_bytes)
+ limit=$(read_file /sys/fs/cgroup/memory/memory.limit_in_bytes)
+ local stat=/sys/fs/cgroup/memory/memory.stat
+ rss=$(read_stat total_rss "$stat")
+ cache=$(read_stat total_cache "$stat")
+ swap=$(read_file /sys/fs/cgroup/memory/memory.memsw.usage_in_bytes)
+ # swap file reports memory+swap; subtract memory to get swap only
+ if [ "$swap" != "0" ]; then
+ swap=$((swap - usage))
+ [ "$swap" -lt 0 ] && swap=0
+ fi
+ echo "$usage,$limit,$rss,$cache,$swap"
+}
+
+poll_v2() {
+ local usage limit rss cache swap
+ usage=$(read_file /sys/fs/cgroup/memory.current)
+ limit=$(read_file /sys/fs/cgroup/memory.max)
+ [ "$limit" = "max" ] && limit=0
+ local stat=/sys/fs/cgroup/memory.stat
+ rss=$(read_stat anon "$stat")
+ cache=$(read_stat file "$stat")
+ swap=$(read_file /sys/fs/cgroup/memory.swap.current)
+ echo "$usage,$limit,$rss,$cache,$swap"
+}
+
+# ---- main loop ----
+
+echo "timestamp_ms,memory_usage_bytes,memory_limit_bytes,rss_bytes,cache_bytes,swap_bytes" > "$OUTPUT"
+echo "Collecting container memory metrics every ${INTERVAL}s -> ${OUTPUT} (cgroup v${CGROUP_VERSION})" >&2
+
+while true; do
+ ts=$(date +%s%3N 2>/dev/null || python3 -c 'import time; print(int(time.time()*1000))')
+ if [ "$CGROUP_VERSION" = "1" ]; then
+ vals=$(poll_v1)
+ else
+ vals=$(poll_v2)
+ fi
+ echo "${ts},${vals}" >> "$OUTPUT"
+ sleep "$INTERVAL"
+done
diff --git a/benchmarks/infra/docker/docker-compose.constrained.yml b/benchmarks/infra/docker/docker-compose.constrained.yml
new file mode 100644
index 0000000000..eff730d0e3
--- /dev/null
+++ b/benchmarks/infra/docker/docker-compose.constrained.yml
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Constrained memory overlay.
+#
+# Apply on top of docker-compose.yml to enforce hard memory limits and
+# enable the metrics-collector sidecar:
+#
+# docker compose -f docker-compose.yml -f docker-compose.constrained.yml up -d
+#
+# Environment variables:
+# WORKER_MEM_LIMIT - Hard memory limit for the worker (default: 6g)
+# BENCH_MEM_LIMIT - Hard memory limit for the bench runner (default: 10g)
+# METRICS_INTERVAL - Collection interval in seconds (default: 1)
+
+services:
+ spark-worker:
+ mem_limit: ${WORKER_MEM_LIMIT:-6g}
+ memswap_limit: ${WORKER_MEM_LIMIT:-6g} # same as mem_limit → no swap
+
+ bench:
+ mem_limit: ${BENCH_MEM_LIMIT:-10g}
+ memswap_limit: ${BENCH_MEM_LIMIT:-10g}
+
+ metrics-collector:
+ image: ${BENCH_IMAGE:-comet-bench}
+ container_name: metrics-collector
+ pid: "service:spark-worker" # share PID namespace with worker
+ command:
+ - /opt/benchmarks/collect-metrics.sh
+ - "${METRICS_INTERVAL:-1}"
+ - /results/container-metrics.csv
+ volumes:
+ - ${RESULTS_DIR:-/tmp/bench-results}:/results
+ depends_on:
+ - spark-worker
diff --git a/benchmarks/infra/docker/docker-compose.yml b/benchmarks/infra/docker/docker-compose.yml
new file mode 100644
index 0000000000..36261e3ded
--- /dev/null
+++ b/benchmarks/infra/docker/docker-compose.yml
@@ -0,0 +1,88 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Spark standalone cluster for benchmarks.
+#
+# Usage:
+# docker compose -f benchmarks/infra/docker/docker-compose.yml up -d
+#
+# Override with constrained memory limits:
+# docker compose -f benchmarks/infra/docker/docker-compose.yml \
+# -f benchmarks/infra/docker/docker-compose.constrained.yml up -d
+#
+# Environment variables (set in .env or export before running):
+# BENCH_IMAGE - Docker image to use (default: comet-bench)
+# DATA_DIR - Host path to TPC data (default: /tmp/tpc-data)
+# QUERIES_DIR - Host path to query SQL files (default: /tmp/tpc-queries)
+# RESULTS_DIR - Host path for results output (default: /tmp/bench-results)
+# ENGINE_JARS_DIR - Host path containing engine JARs (default: /tmp/engine-jars)
+
+services:
+ spark-master:
+ image: ${BENCH_IMAGE:-comet-bench}
+ container_name: spark-master
+ hostname: spark-master
+ command: /opt/spark/sbin/start-master.sh --host spark-master
+ ports:
+ - "7077:7077"
+ - "8080:8080"
+ volumes:
+ - ${DATA_DIR:-/tmp/tpc-data}:/data:ro
+ - ${QUERIES_DIR:-/tmp/tpc-queries}:/queries:ro
+ - ${RESULTS_DIR:-/tmp/bench-results}:/results
+ - ${ENGINE_JARS_DIR:-/tmp/engine-jars}:/jars:ro
+ environment:
+ - SPARK_MASTER_HOST=spark-master
+ - SPARK_NO_DAEMONIZE=true
+
+ spark-worker:
+ image: ${BENCH_IMAGE:-comet-bench}
+ container_name: spark-worker
+ hostname: spark-worker
+ depends_on:
+ - spark-master
+ command: /opt/spark/sbin/start-worker.sh spark://spark-master:7077
+ ports:
+ - "8081:8081"
+ volumes:
+ - ${DATA_DIR:-/tmp/tpc-data}:/data:ro
+ - ${QUERIES_DIR:-/tmp/tpc-queries}:/queries:ro
+ - ${RESULTS_DIR:-/tmp/bench-results}:/results
+ - ${ENGINE_JARS_DIR:-/tmp/engine-jars}:/jars:ro
+ environment:
+ - SPARK_WORKER_CORES=${WORKER_CORES:-8}
+ - SPARK_WORKER_MEMORY=${WORKER_MEMORY:-16g}
+ - SPARK_NO_DAEMONIZE=true
+
+ bench:
+ image: ${BENCH_IMAGE:-comet-bench}
+ container_name: bench-runner
+ depends_on:
+ - spark-master
+ - spark-worker
+ # Override 'command' to run a specific benchmark, e.g.:
+ # docker compose run bench python /opt/benchmarks/run.py \
+ # --engine comet --profile docker -- tpc ...
+ command: ["echo", "Use 'docker compose run bench python /opt/benchmarks/run.py ...' to run benchmarks"]
+ volumes:
+ - ${DATA_DIR:-/tmp/tpc-data}:/data:ro
+ - ${QUERIES_DIR:-/tmp/tpc-queries}:/queries:ro
+ - ${RESULTS_DIR:-/tmp/bench-results}:/results
+ - ${ENGINE_JARS_DIR:-/tmp/engine-jars}:/jars:ro
+ environment:
+ - SPARK_HOME=/opt/spark
+ - SPARK_MASTER=spark://spark-master:7077
+ - COMET_JAR=/jars/comet.jar
+ - PYTHONPATH=/opt
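+
+# Example one-off run — flags are illustrative and assume the /data, /queries,
+# and /results mounts defined above:
+#   docker compose -f benchmarks/infra/docker/docker-compose.yml run --rm bench \
+#     python /opt/benchmarks/run.py --engine comet --profile docker -- \
+#     tpc --benchmark tpch --data /data --queries /queries --output /results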
diff --git a/benchmarks/pyspark/README.md b/benchmarks/pyspark/README.md
deleted file mode 100644
index 3fc55123f0..0000000000
--- a/benchmarks/pyspark/README.md
+++ /dev/null
@@ -1,178 +0,0 @@
-
-
-# PySpark Benchmarks
-
-A suite of PySpark benchmarks for comparing performance between Spark, Comet JVM, and Comet Native implementations.
-
-## Available Benchmarks
-
-Run `python run_benchmark.py --list-benchmarks` to see all available benchmarks:
-
-- **shuffle-hash** - Shuffle all columns using hash partitioning on group_key
-- **shuffle-roundrobin** - Shuffle all columns using round-robin partitioning
-
-## Prerequisites
-
-- Apache Spark cluster (standalone, YARN, or Kubernetes)
-- PySpark installed
-- Comet JAR built
-
-## Build Comet JAR
-
-```bash
-cd /path/to/datafusion-comet
-make release
-```
-
-## Step 1: Generate Test Data
-
-Generate test data with realistic 50-column schema (nested structs, arrays, maps):
-
-```bash
-spark-submit \
- --master spark://master:7077 \
- --executor-memory 16g \
- generate_data.py \
- --output /tmp/shuffle-benchmark-data \
- --rows 10000000 \
- --partitions 200
-```
-
-### Data Generation Options
-
-| Option | Default | Description |
-| -------------------- | ---------- | ---------------------------- |
-| `--output`, `-o` | (required) | Output path for parquet data |
-| `--rows`, `-r` | 10000000 | Number of rows |
-| `--partitions`, `-p` | 200 | Number of output partitions |
-
-## Step 2: Run Benchmarks
-
-### List Available Benchmarks
-
-```bash
-python run_benchmark.py --list-benchmarks
-```
-
-### Run Individual Benchmarks
-
-You can run specific benchmarks by name:
-
-```bash
-# Hash partitioning shuffle - Spark baseline
-spark-submit --master spark://master:7077 \
- run_benchmark.py --data /tmp/shuffle-benchmark-data --mode spark --benchmark shuffle-hash
-
-# Round-robin shuffle - Spark baseline
-spark-submit --master spark://master:7077 \
- run_benchmark.py --data /tmp/shuffle-benchmark-data --mode spark --benchmark shuffle-roundrobin
-
-# Hash partitioning - Comet JVM shuffle
-spark-submit --master spark://master:7077 \
- --jars /path/to/comet.jar \
- --conf spark.comet.enabled=true \
- --conf spark.comet.exec.shuffle.enabled=true \
- --conf spark.comet.shuffle.mode=jvm \
- --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
- run_benchmark.py --data /tmp/shuffle-benchmark-data --mode jvm --benchmark shuffle-hash
-
-# Round-robin - Comet Native shuffle
-spark-submit --master spark://master:7077 \
- --jars /path/to/comet.jar \
- --conf spark.comet.enabled=true \
- --conf spark.comet.exec.shuffle.enabled=true \
- --conf spark.comet.exec.shuffle.mode=native \
- --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
- run_benchmark.py --data /tmp/shuffle-benchmark-data --mode native --benchmark shuffle-roundrobin
-```
-
-### Run All Benchmarks
-
-Use the provided script to run all benchmarks across all modes:
-
-```bash
-SPARK_MASTER=spark://master:7077 \
-EXECUTOR_MEMORY=16g \
-./run_all_benchmarks.sh /tmp/shuffle-benchmark-data
-```
-
-## Checking Results
-
-Open the Spark UI (default: http://localhost:4040) during each benchmark run to compare shuffle write sizes in the Stages tab.
-
-## Adding New Benchmarks
-
-The benchmark framework makes it easy to add new benchmarks:
-
-1. **Create a benchmark class** in `benchmarks/` directory (or add to existing file):
-
-```python
-from benchmarks.base import Benchmark
-
-class MyBenchmark(Benchmark):
- @classmethod
- def name(cls) -> str:
- return "my-benchmark"
-
- @classmethod
- def description(cls) -> str:
- return "Description of what this benchmark does"
-
- def run(self) -> Dict[str, Any]:
- # Read data
- df = self.spark.read.parquet(self.data_path)
-
- # Run your benchmark operation
- def benchmark_operation():
- result = df.filter(...).groupBy(...).agg(...)
- result.write.mode("overwrite").parquet("/tmp/output")
-
- # Time it
- duration_ms = self._time_operation(benchmark_operation)
-
- return {
- 'duration_ms': duration_ms,
- # Add any other metrics you want to track
- }
-```
-
-2. **Register the benchmark** in `benchmarks/__init__.py`:
-
-```python
-from .my_module import MyBenchmark
-
-_BENCHMARK_REGISTRY = {
- # ... existing benchmarks
- MyBenchmark.name(): MyBenchmark,
-}
-```
-
-3. **Run your new benchmark**:
-
-```bash
-python run_benchmark.py --data /path/to/data --mode spark --benchmark my-benchmark
-```
-
-The base `Benchmark` class provides:
-
-- Automatic timing via `_time_operation()`
-- Standard output formatting via `execute_timed()`
-- Access to SparkSession, data path, and mode
-- Spark configuration printing
diff --git a/benchmarks/pyspark/benchmarks/__init__.py b/benchmarks/pyspark/benchmarks/__init__.py
deleted file mode 100644
index 7d913a7d6d..0000000000
--- a/benchmarks/pyspark/benchmarks/__init__.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/usr/bin/env python3
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Benchmark registry for PySpark benchmarks.
-
-This module provides a central registry for discovering and running benchmarks.
-"""
-
-from typing import Dict, Type, List
-
-from .base import Benchmark
-from .shuffle import ShuffleHashBenchmark, ShuffleRoundRobinBenchmark
-
-
-# Registry of all available benchmarks
-_BENCHMARK_REGISTRY: Dict[str, Type[Benchmark]] = {
- ShuffleHashBenchmark.name(): ShuffleHashBenchmark,
- ShuffleRoundRobinBenchmark.name(): ShuffleRoundRobinBenchmark,
-}
-
-
-def get_benchmark(name: str) -> Type[Benchmark]:
- """
- Get a benchmark class by name.
-
- Args:
- name: Benchmark name
-
- Returns:
- Benchmark class
-
- Raises:
- KeyError: If benchmark name is not found
- """
- if name not in _BENCHMARK_REGISTRY:
- available = ", ".join(sorted(_BENCHMARK_REGISTRY.keys()))
- raise KeyError(
- f"Unknown benchmark: {name}. Available benchmarks: {available}"
- )
- return _BENCHMARK_REGISTRY[name]
-
-
-def list_benchmarks() -> List[tuple[str, str]]:
- """
- List all available benchmarks.
-
- Returns:
- List of (name, description) tuples
- """
- benchmarks = []
- for name in sorted(_BENCHMARK_REGISTRY.keys()):
- benchmark_cls = _BENCHMARK_REGISTRY[name]
- benchmarks.append((name, benchmark_cls.description()))
- return benchmarks
-
-
-__all__ = [
- 'Benchmark',
- 'get_benchmark',
- 'list_benchmarks',
- 'ShuffleHashBenchmark',
- 'ShuffleRoundRobinBenchmark',
-]
diff --git a/benchmarks/pyspark/benchmarks/base.py b/benchmarks/pyspark/benchmarks/base.py
deleted file mode 100644
index 7e8e8db5a9..0000000000
--- a/benchmarks/pyspark/benchmarks/base.py
+++ /dev/null
@@ -1,127 +0,0 @@
-#!/usr/bin/env python3
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Base benchmark class providing common functionality for all benchmarks.
-"""
-
-import time
-from abc import ABC, abstractmethod
-from typing import Dict, Any
-
-from pyspark.sql import SparkSession
-
-
-class Benchmark(ABC):
- """Base class for all PySpark benchmarks."""
-
- def __init__(self, spark: SparkSession, data_path: str, mode: str):
- """
- Initialize benchmark.
-
- Args:
- spark: SparkSession instance
- data_path: Path to input data
- mode: Execution mode (spark, jvm, native)
- """
- self.spark = spark
- self.data_path = data_path
- self.mode = mode
-
- @classmethod
- @abstractmethod
- def name(cls) -> str:
- """Return the benchmark name (used for CLI)."""
- pass
-
- @classmethod
- @abstractmethod
- def description(cls) -> str:
- """Return a short description of the benchmark."""
- pass
-
- @abstractmethod
- def run(self) -> Dict[str, Any]:
- """
- Run the benchmark and return results.
-
- Returns:
- Dictionary containing benchmark results (must include 'duration_ms')
- """
- pass
-
- def execute_timed(self) -> Dict[str, Any]:
- """
- Execute the benchmark with timing and standard output.
-
- Returns:
- Dictionary containing benchmark results
- """
- print(f"\n{'=' * 80}")
- print(f"Benchmark: {self.name()}")
- print(f"Mode: {self.mode.upper()}")
- print(f"{'=' * 80}")
- print(f"Data path: {self.data_path}")
-
- # Print relevant Spark configuration
- self._print_spark_config()
-
- # Clear cache before running
- self.spark.catalog.clearCache()
-
- # Run the benchmark
- print(f"\nRunning benchmark...")
- results = self.run()
-
- # Print results
- print(f"\nDuration: {results['duration_ms']:,} ms")
- if 'row_count' in results:
- print(f"Rows processed: {results['row_count']:,}")
-
- # Print any additional metrics
- for key, value in results.items():
- if key not in ['duration_ms', 'row_count']:
- print(f"{key}: {value}")
-
- print(f"{'=' * 80}\n")
-
- return results
-
- def _print_spark_config(self):
- """Print relevant Spark configuration."""
- conf = self.spark.sparkContext.getConf()
- print(f"Shuffle manager: {conf.get('spark.shuffle.manager', 'default')}")
- print(f"Comet enabled: {conf.get('spark.comet.enabled', 'false')}")
- print(f"Comet shuffle enabled: {conf.get('spark.comet.exec.shuffle.enabled', 'false')}")
- print(f"Comet shuffle mode: {conf.get('spark.comet.shuffle.mode', 'not set')}")
- print(f"Spark UI: {self.spark.sparkContext.uiWebUrl}")
-
- def _time_operation(self, operation_fn):
- """
- Time an operation and return duration in milliseconds.
-
- Args:
- operation_fn: Function to time (takes no arguments)
-
- Returns:
- Duration in milliseconds
- """
- start_time = time.time()
- operation_fn()
- duration_ms = int((time.time() - start_time) * 1000)
- return duration_ms
diff --git a/benchmarks/pyspark/benchmarks/shuffle.py b/benchmarks/pyspark/benchmarks/shuffle.py
deleted file mode 100644
index 0facd2340d..0000000000
--- a/benchmarks/pyspark/benchmarks/shuffle.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python3
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Shuffle benchmarks for comparing shuffle file sizes and performance.
-
-These benchmarks test different partitioning strategies (hash, round-robin)
-across Spark, Comet JVM, and Comet Native shuffle implementations.
-"""
-
-from typing import Dict, Any
-from pyspark.sql import DataFrame
-
-from .base import Benchmark
-
-
-class ShuffleBenchmark(Benchmark):
- """Base class for shuffle benchmarks with common repartitioning logic."""
-
- def __init__(self, spark, data_path: str, mode: str, num_partitions: int = 200):
- """
- Initialize shuffle benchmark.
-
- Args:
- spark: SparkSession instance
- data_path: Path to input parquet data
- mode: Execution mode (spark, jvm, native)
- num_partitions: Number of partitions to shuffle to
- """
- super().__init__(spark, data_path, mode)
- self.num_partitions = num_partitions
-
- def _read_and_count(self) -> tuple[DataFrame, int]:
- """Read input data and count rows."""
- df = self.spark.read.parquet(self.data_path)
- row_count = df.count()
- return df, row_count
-
- def _repartition(self, df: DataFrame) -> DataFrame:
- """
- Repartition dataframe using the strategy defined by subclass.
-
- Args:
- df: Input dataframe
-
- Returns:
- Repartitioned dataframe
- """
- raise NotImplementedError("Subclasses must implement _repartition")
-
- def _write_output(self, df: DataFrame, output_path: str):
- """Write repartitioned data to parquet."""
- df.write.mode("overwrite").parquet(output_path)
-
- def run(self) -> Dict[str, Any]:
- """
- Run the shuffle benchmark.
-
- Returns:
- Dictionary with duration_ms and row_count
- """
- # Read input data
- df, row_count = self._read_and_count()
- print(f"Number of rows: {row_count:,}")
-
- # Define the benchmark operation
- def benchmark_operation():
- # Repartition using the specific strategy
- repartitioned = self._repartition(df)
-
- # Write to parquet to force materialization
- output_path = f"/tmp/shuffle-benchmark-output-{self.mode}-{self.name()}"
- self._write_output(repartitioned, output_path)
- print(f"Wrote repartitioned data to: {output_path}")
-
- # Time the operation
- duration_ms = self._time_operation(benchmark_operation)
-
- return {
- 'duration_ms': duration_ms,
- 'row_count': row_count,
- 'num_partitions': self.num_partitions,
- }
-
-
-class ShuffleHashBenchmark(ShuffleBenchmark):
- """Shuffle benchmark using hash partitioning on a key column."""
-
- @classmethod
- def name(cls) -> str:
- return "shuffle-hash"
-
- @classmethod
- def description(cls) -> str:
- return "Shuffle all columns using hash partitioning on group_key"
-
- def _repartition(self, df: DataFrame) -> DataFrame:
- """Repartition using hash partitioning on group_key."""
- return df.repartition(self.num_partitions, "group_key")
-
-
-class ShuffleRoundRobinBenchmark(ShuffleBenchmark):
- """Shuffle benchmark using round-robin partitioning."""
-
- @classmethod
- def name(cls) -> str:
- return "shuffle-roundrobin"
-
- @classmethod
- def description(cls) -> str:
- return "Shuffle all columns using round-robin partitioning"
-
- def _repartition(self, df: DataFrame) -> DataFrame:
- """Repartition using round-robin (no partition columns specified)."""
- return df.repartition(self.num_partitions)
diff --git a/benchmarks/pyspark/run_all_benchmarks.sh b/benchmarks/pyspark/run_all_benchmarks.sh
deleted file mode 100755
index 81eb044884..0000000000
--- a/benchmarks/pyspark/run_all_benchmarks.sh
+++ /dev/null
@@ -1,120 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-# Run all shuffle benchmarks (Spark, Comet JVM, Comet Native)
-# Check the Spark UI during each run to compare shuffle sizes
-
-set -e
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-DATA_PATH="${1:-/tmp/shuffle-benchmark-data}"
-COMET_JAR="${COMET_JAR:-$SCRIPT_DIR/../../spark/target/comet-spark-spark3.5_2.12-0.14.0-SNAPSHOT.jar}"
-SPARK_MASTER="${SPARK_MASTER:-local[*]}"
-EXECUTOR_MEMORY="${EXECUTOR_MEMORY:-16g}"
-EVENT_LOG_DIR="${EVENT_LOG_DIR:-/tmp/spark-events}"
-
-# Create event log directory
-mkdir -p "$EVENT_LOG_DIR"
-
-echo "========================================"
-echo "Shuffle Size Comparison Benchmark"
-echo "========================================"
-echo "Data path: $DATA_PATH"
-echo "Comet JAR: $COMET_JAR"
-echo "Spark master: $SPARK_MASTER"
-echo "Executor memory: $EXECUTOR_MEMORY"
-echo "Event log dir: $EVENT_LOG_DIR"
-echo "========================================"
-
-# Run Spark baseline (no Comet)
-echo ""
-echo ">>> Running SPARK shuffle benchmark..."
-$SPARK_HOME/bin/spark-submit \
- --master "$SPARK_MASTER" \
- --executor-memory "$EXECUTOR_MEMORY" \
- --conf spark.eventLog.enabled=true \
- --conf spark.eventLog.dir="$EVENT_LOG_DIR" \
- --conf spark.comet.enabled=false \
- --conf spark.comet.exec.shuffle.enabled=false \
- "$SCRIPT_DIR/run_benchmark.py" \
- --data "$DATA_PATH" \
- --mode spark
-
-# Run Comet JVM shuffle
-echo ""
-echo ">>> Running COMET JVM shuffle benchmark..."
-$SPARK_HOME/bin/spark-submit \
- --master "$SPARK_MASTER" \
- --executor-memory "$EXECUTOR_MEMORY" \
- --jars "$COMET_JAR" \
- --driver-class-path "$COMET_JAR" \
- --conf spark.executor.extraClassPath="$COMET_JAR" \
- --conf spark.eventLog.enabled=true \
- --conf spark.eventLog.dir="$EVENT_LOG_DIR" \
- --conf spark.memory.offHeap.enabled=true \
- --conf spark.memory.offHeap.size=16g \
- --conf spark.comet.enabled=true \
- --conf spark.comet.operator.DataWritingCommandExec.allowIncompatible=true \
- --conf spark.comet.parquet.write.enabled=true \
- --conf spark.comet.logFallbackReasons.enabled=true \
- --conf spark.comet.explainFallback.enabled=true \
- --conf spark.comet.shuffle.mode=jvm \
- --conf spark.comet.exec.shuffle.mode=jvm \
- --conf spark.comet.exec.replaceSortMergeJoin=true \
- --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
- --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
- --conf spark.comet.cast.allowIncompatible=true \
- "$SCRIPT_DIR/run_benchmark.py" \
- --data "$DATA_PATH" \
- --mode jvm
-
-# Run Comet Native shuffle
-echo ""
-echo ">>> Running COMET NATIVE shuffle benchmark..."
-$SPARK_HOME/bin/spark-submit \
- --master "$SPARK_MASTER" \
- --executor-memory "$EXECUTOR_MEMORY" \
- --jars "$COMET_JAR" \
- --driver-class-path "$COMET_JAR" \
- --conf spark.executor.extraClassPath="$COMET_JAR" \
- --conf spark.eventLog.enabled=true \
- --conf spark.eventLog.dir="$EVENT_LOG_DIR" \
- --conf spark.memory.offHeap.enabled=true \
- --conf spark.memory.offHeap.size=16g \
- --conf spark.comet.enabled=true \
- --conf spark.comet.operator.DataWritingCommandExec.allowIncompatible=true \
- --conf spark.comet.parquet.write.enabled=true \
- --conf spark.comet.logFallbackReasons.enabled=true \
- --conf spark.comet.explainFallback.enabled=true \
- --conf spark.comet.exec.shuffle.mode=native \
- --conf spark.comet.exec.replaceSortMergeJoin=true \
- --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
- --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \
- --conf spark.comet.cast.allowIncompatible=true \
- "$SCRIPT_DIR/run_benchmark.py" \
- --data "$DATA_PATH" \
- --mode native
-
-echo ""
-echo "========================================"
-echo "BENCHMARK COMPLETE"
-echo "========================================"
-echo "Event logs written to: $EVENT_LOG_DIR"
-echo ""
diff --git a/benchmarks/pyspark/run_benchmark.py b/benchmarks/pyspark/run_benchmark.py
deleted file mode 100755
index 6713f0ff21..0000000000
--- a/benchmarks/pyspark/run_benchmark.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/usr/bin/env python3
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Run PySpark benchmarks.
-
-Run benchmarks by name with appropriate spark-submit configs for different modes
-(spark, jvm, native). Check the Spark UI to compare results between modes.
-"""
-
-import argparse
-import sys
-
-from pyspark.sql import SparkSession
-
-from benchmarks import get_benchmark, list_benchmarks
-
-
-def main():
- parser = argparse.ArgumentParser(
- description="Run PySpark benchmarks",
- formatter_class=argparse.RawDescriptionHelpFormatter,
- epilog="""
-Examples:
- # Run hash partitioning shuffle benchmark in Spark mode
- python run_benchmark.py --data /path/to/data --mode spark --benchmark shuffle-hash
-
- # Run round-robin shuffle benchmark in Comet native mode
- python run_benchmark.py --data /path/to/data --mode native --benchmark shuffle-roundrobin
-
- # List all available benchmarks
- python run_benchmark.py --list-benchmarks
- """
- )
- parser.add_argument(
- "--data", "-d",
- help="Path to input parquet data"
- )
- parser.add_argument(
- "--mode", "-m",
- choices=["spark", "jvm", "native"],
- help="Shuffle mode being tested"
- )
- parser.add_argument(
- "--benchmark", "-b",
- default="shuffle-hash",
- help="Benchmark to run (default: shuffle-hash)"
- )
- parser.add_argument(
- "--list-benchmarks",
- action="store_true",
- help="List all available benchmarks and exit"
- )
-
- args = parser.parse_args()
-
- # Handle --list-benchmarks
- if args.list_benchmarks:
- print("Available benchmarks:\n")
- for name, description in list_benchmarks():
- print(f" {name:25s} - {description}")
- return 0
-
- # Validate required arguments
- if not args.data:
- parser.error("--data is required when running a benchmark")
- if not args.mode:
- parser.error("--mode is required when running a benchmark")
-
- # Get the benchmark class
- try:
- benchmark_cls = get_benchmark(args.benchmark)
- except KeyError as e:
- print(f"Error: {e}", file=sys.stderr)
- print("\nUse --list-benchmarks to see available benchmarks", file=sys.stderr)
- return 1
-
- # Create Spark session
- spark = SparkSession.builder \
- .appName(f"{benchmark_cls.name()}-{args.mode.upper()}") \
- .getOrCreate()
-
- try:
- # Create and run the benchmark
- benchmark = benchmark_cls(spark, args.data, args.mode)
- results = benchmark.execute_timed()
-
- print("\nCheck Spark UI for shuffle sizes and detailed metrics")
- return 0
-
- finally:
- spark.stop()
-
-
-if __name__ == "__main__":
- sys.exit(main())
diff --git a/benchmarks/queries/tpcds/q1.sql b/benchmarks/queries/tpcds/q1.sql
new file mode 100644
index 0000000000..00328875ab
--- /dev/null
+++ b/benchmarks/queries/tpcds/q1.sql
@@ -0,0 +1,26 @@
+-- SQLBench-DS query 1 derived from TPC-DS query 1 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with customer_total_return as
+(select sr_customer_sk as ctr_customer_sk
+,sr_store_sk as ctr_store_sk
+,sum(SR_RETURN_AMT_INC_TAX) as ctr_total_return
+from store_returns
+,date_dim
+where sr_returned_date_sk = d_date_sk
+and d_year =1999
+group by sr_customer_sk
+,sr_store_sk)
+ select c_customer_id
+from customer_total_return ctr1
+,store
+,customer
+where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2
+from customer_total_return ctr2
+where ctr1.ctr_store_sk = ctr2.ctr_store_sk)
+and s_store_sk = ctr1.ctr_store_sk
+and s_state = 'TN'
+and ctr1.ctr_customer_sk = c_customer_sk
+order by c_customer_id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q10.sql b/benchmarks/queries/tpcds/q10.sql
new file mode 100644
index 0000000000..3a47920e04
--- /dev/null
+++ b/benchmarks/queries/tpcds/q10.sql
@@ -0,0 +1,60 @@
+-- SQLBench-DS query 10 derived from TPC-DS query 10 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ cd_gender,
+ cd_marital_status,
+ cd_education_status,
+ count(*) cnt1,
+ cd_purchase_estimate,
+ count(*) cnt2,
+ cd_credit_rating,
+ count(*) cnt3,
+ cd_dep_count,
+ count(*) cnt4,
+ cd_dep_employed_count,
+ count(*) cnt5,
+ cd_dep_college_count,
+ count(*) cnt6
+ from
+ customer c,customer_address ca,customer_demographics
+ where
+ c.c_current_addr_sk = ca.ca_address_sk and
+ ca_county in ('Clinton County','Platte County','Franklin County','Louisa County','Harmon County') and
+ cd_demo_sk = c.c_current_cdemo_sk and
+ exists (select *
+ from store_sales,date_dim
+ where c.c_customer_sk = ss_customer_sk and
+ ss_sold_date_sk = d_date_sk and
+ d_year = 2002 and
+ d_moy between 3 and 3+3) and
+ (exists (select *
+ from web_sales,date_dim
+ where c.c_customer_sk = ws_bill_customer_sk and
+ ws_sold_date_sk = d_date_sk and
+ d_year = 2002 and
+                 d_moy between 3 and 3+3) or
+ exists (select *
+ from catalog_sales,date_dim
+ where c.c_customer_sk = cs_ship_customer_sk and
+ cs_sold_date_sk = d_date_sk and
+ d_year = 2002 and
+ d_moy between 3 and 3+3))
+ group by cd_gender,
+ cd_marital_status,
+ cd_education_status,
+ cd_purchase_estimate,
+ cd_credit_rating,
+ cd_dep_count,
+ cd_dep_employed_count,
+ cd_dep_college_count
+ order by cd_gender,
+ cd_marital_status,
+ cd_education_status,
+ cd_purchase_estimate,
+ cd_credit_rating,
+ cd_dep_count,
+ cd_dep_employed_count,
+ cd_dep_college_count
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q11.sql b/benchmarks/queries/tpcds/q11.sql
new file mode 100644
index 0000000000..7ffd3094f9
--- /dev/null
+++ b/benchmarks/queries/tpcds/q11.sql
@@ -0,0 +1,82 @@
+-- SQLBench-DS query 11 derived from TPC-DS query 11 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with year_total as (
+ select c_customer_id customer_id
+ ,c_first_name customer_first_name
+ ,c_last_name customer_last_name
+ ,c_preferred_cust_flag customer_preferred_cust_flag
+ ,c_birth_country customer_birth_country
+ ,c_login customer_login
+ ,c_email_address customer_email_address
+ ,d_year dyear
+ ,sum(ss_ext_list_price-ss_ext_discount_amt) year_total
+ ,'s' sale_type
+ from customer
+ ,store_sales
+ ,date_dim
+ where c_customer_sk = ss_customer_sk
+ and ss_sold_date_sk = d_date_sk
+ group by c_customer_id
+ ,c_first_name
+ ,c_last_name
+ ,c_preferred_cust_flag
+ ,c_birth_country
+ ,c_login
+ ,c_email_address
+ ,d_year
+ union all
+ select c_customer_id customer_id
+ ,c_first_name customer_first_name
+ ,c_last_name customer_last_name
+ ,c_preferred_cust_flag customer_preferred_cust_flag
+ ,c_birth_country customer_birth_country
+ ,c_login customer_login
+ ,c_email_address customer_email_address
+ ,d_year dyear
+ ,sum(ws_ext_list_price-ws_ext_discount_amt) year_total
+ ,'w' sale_type
+ from customer
+ ,web_sales
+ ,date_dim
+ where c_customer_sk = ws_bill_customer_sk
+ and ws_sold_date_sk = d_date_sk
+ group by c_customer_id
+ ,c_first_name
+ ,c_last_name
+ ,c_preferred_cust_flag
+ ,c_birth_country
+ ,c_login
+ ,c_email_address
+ ,d_year
+ )
+ select
+ t_s_secyear.customer_id
+ ,t_s_secyear.customer_first_name
+ ,t_s_secyear.customer_last_name
+ ,t_s_secyear.customer_email_address
+ from year_total t_s_firstyear
+ ,year_total t_s_secyear
+ ,year_total t_w_firstyear
+ ,year_total t_w_secyear
+ where t_s_secyear.customer_id = t_s_firstyear.customer_id
+ and t_s_firstyear.customer_id = t_w_secyear.customer_id
+ and t_s_firstyear.customer_id = t_w_firstyear.customer_id
+ and t_s_firstyear.sale_type = 's'
+ and t_w_firstyear.sale_type = 'w'
+ and t_s_secyear.sale_type = 's'
+ and t_w_secyear.sale_type = 'w'
+ and t_s_firstyear.dyear = 1999
+ and t_s_secyear.dyear = 1999+1
+ and t_w_firstyear.dyear = 1999
+ and t_w_secyear.dyear = 1999+1
+ and t_s_firstyear.year_total > 0
+ and t_w_firstyear.year_total > 0
+ and case when t_w_firstyear.year_total > 0 then t_w_secyear.year_total / t_w_firstyear.year_total else 0.0 end
+ > case when t_s_firstyear.year_total > 0 then t_s_secyear.year_total / t_s_firstyear.year_total else 0.0 end
+ order by t_s_secyear.customer_id
+ ,t_s_secyear.customer_first_name
+ ,t_s_secyear.customer_last_name
+ ,t_s_secyear.customer_email_address
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q12.sql b/benchmarks/queries/tpcds/q12.sql
new file mode 100644
index 0000000000..eb267ca64b
--- /dev/null
+++ b/benchmarks/queries/tpcds/q12.sql
@@ -0,0 +1,35 @@
+-- SQLBench-DS query 12 derived from TPC-DS query 12 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_item_id
+ ,i_item_desc
+ ,i_category
+ ,i_class
+ ,i_current_price
+ ,sum(ws_ext_sales_price) as itemrevenue
+ ,sum(ws_ext_sales_price)*100/sum(sum(ws_ext_sales_price)) over
+ (partition by i_class) as revenueratio
+from
+ web_sales
+ ,item
+ ,date_dim
+where
+ ws_item_sk = i_item_sk
+ and i_category in ('Jewelry', 'Books', 'Women')
+ and ws_sold_date_sk = d_date_sk
+ and d_date between cast('2002-03-22' as date)
+ and (cast('2002-03-22' as date) + INTERVAL '30 DAYS')
+group by
+ i_item_id
+ ,i_item_desc
+ ,i_category
+ ,i_class
+ ,i_current_price
+order by
+ i_category
+ ,i_class
+ ,i_item_id
+ ,i_item_desc
+ ,revenueratio
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q13.sql b/benchmarks/queries/tpcds/q13.sql
new file mode 100644
index 0000000000..31b1171b9e
--- /dev/null
+++ b/benchmarks/queries/tpcds/q13.sql
@@ -0,0 +1,53 @@
+-- SQLBench-DS query 13 derived from TPC-DS query 13 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select avg(ss_quantity)
+ ,avg(ss_ext_sales_price)
+ ,avg(ss_ext_wholesale_cost)
+ ,sum(ss_ext_wholesale_cost)
+ from store_sales
+ ,store
+ ,customer_demographics
+ ,household_demographics
+ ,customer_address
+ ,date_dim
+ where s_store_sk = ss_store_sk
+ and ss_sold_date_sk = d_date_sk and d_year = 2001
+ and((ss_hdemo_sk=hd_demo_sk
+ and cd_demo_sk = ss_cdemo_sk
+ and cd_marital_status = 'U'
+ and cd_education_status = '4 yr Degree'
+ and ss_sales_price between 100.00 and 150.00
+ and hd_dep_count = 3
+ )or
+ (ss_hdemo_sk=hd_demo_sk
+ and cd_demo_sk = ss_cdemo_sk
+ and cd_marital_status = 'S'
+ and cd_education_status = 'Unknown'
+ and ss_sales_price between 50.00 and 100.00
+ and hd_dep_count = 1
+ ) or
+ (ss_hdemo_sk=hd_demo_sk
+ and cd_demo_sk = ss_cdemo_sk
+ and cd_marital_status = 'D'
+ and cd_education_status = '2 yr Degree'
+ and ss_sales_price between 150.00 and 200.00
+ and hd_dep_count = 1
+ ))
+ and((ss_addr_sk = ca_address_sk
+ and ca_country = 'United States'
+ and ca_state in ('CO', 'MI', 'MN')
+ and ss_net_profit between 100 and 200
+ ) or
+ (ss_addr_sk = ca_address_sk
+ and ca_country = 'United States'
+ and ca_state in ('NC', 'NY', 'TX')
+ and ss_net_profit between 150 and 300
+ ) or
+ (ss_addr_sk = ca_address_sk
+ and ca_country = 'United States'
+ and ca_state in ('CA', 'NE', 'TN')
+ and ss_net_profit between 50 and 250
+ ))
+;
+
diff --git a/benchmarks/queries/tpcds/q14.sql b/benchmarks/queries/tpcds/q14.sql
new file mode 100644
index 0000000000..119791f59d
--- /dev/null
+++ b/benchmarks/queries/tpcds/q14.sql
@@ -0,0 +1,211 @@
+-- SQLBench-DS query 14 derived from TPC-DS query 14 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with cross_items as
+ (select i_item_sk ss_item_sk
+ from item,
+ (select iss.i_brand_id brand_id
+ ,iss.i_class_id class_id
+ ,iss.i_category_id category_id
+ from store_sales
+ ,item iss
+ ,date_dim d1
+ where ss_item_sk = iss.i_item_sk
+ and ss_sold_date_sk = d1.d_date_sk
+ and d1.d_year between 1999 AND 1999 + 2
+ intersect
+ select ics.i_brand_id
+ ,ics.i_class_id
+ ,ics.i_category_id
+ from catalog_sales
+ ,item ics
+ ,date_dim d2
+ where cs_item_sk = ics.i_item_sk
+ and cs_sold_date_sk = d2.d_date_sk
+ and d2.d_year between 1999 AND 1999 + 2
+ intersect
+ select iws.i_brand_id
+ ,iws.i_class_id
+ ,iws.i_category_id
+ from web_sales
+ ,item iws
+ ,date_dim d3
+ where ws_item_sk = iws.i_item_sk
+ and ws_sold_date_sk = d3.d_date_sk
+ and d3.d_year between 1999 AND 1999 + 2)
+ where i_brand_id = brand_id
+ and i_class_id = class_id
+ and i_category_id = category_id
+),
+ avg_sales as
+ (select avg(quantity*list_price) average_sales
+ from (select ss_quantity quantity
+ ,ss_list_price list_price
+ from store_sales
+ ,date_dim
+ where ss_sold_date_sk = d_date_sk
+ and d_year between 1999 and 1999 + 2
+ union all
+ select cs_quantity quantity
+ ,cs_list_price list_price
+ from catalog_sales
+ ,date_dim
+ where cs_sold_date_sk = d_date_sk
+ and d_year between 1999 and 1999 + 2
+ union all
+ select ws_quantity quantity
+ ,ws_list_price list_price
+ from web_sales
+ ,date_dim
+ where ws_sold_date_sk = d_date_sk
+ and d_year between 1999 and 1999 + 2) x)
+ select channel, i_brand_id,i_class_id,i_category_id,sum(sales), sum(number_sales)
+ from(
+ select 'store' channel, i_brand_id,i_class_id
+ ,i_category_id,sum(ss_quantity*ss_list_price) sales
+ , count(*) number_sales
+ from store_sales
+ ,item
+ ,date_dim
+ where ss_item_sk in (select ss_item_sk from cross_items)
+ and ss_item_sk = i_item_sk
+ and ss_sold_date_sk = d_date_sk
+ and d_year = 1999+2
+ and d_moy = 11
+ group by i_brand_id,i_class_id,i_category_id
+ having sum(ss_quantity*ss_list_price) > (select average_sales from avg_sales)
+ union all
+ select 'catalog' channel, i_brand_id,i_class_id,i_category_id, sum(cs_quantity*cs_list_price) sales, count(*) number_sales
+ from catalog_sales
+ ,item
+ ,date_dim
+ where cs_item_sk in (select ss_item_sk from cross_items)
+ and cs_item_sk = i_item_sk
+ and cs_sold_date_sk = d_date_sk
+ and d_year = 1999+2
+ and d_moy = 11
+ group by i_brand_id,i_class_id,i_category_id
+ having sum(cs_quantity*cs_list_price) > (select average_sales from avg_sales)
+ union all
+ select 'web' channel, i_brand_id,i_class_id,i_category_id, sum(ws_quantity*ws_list_price) sales , count(*) number_sales
+ from web_sales
+ ,item
+ ,date_dim
+ where ws_item_sk in (select ss_item_sk from cross_items)
+ and ws_item_sk = i_item_sk
+ and ws_sold_date_sk = d_date_sk
+ and d_year = 1999+2
+ and d_moy = 11
+ group by i_brand_id,i_class_id,i_category_id
+ having sum(ws_quantity*ws_list_price) > (select average_sales from avg_sales)
+ ) y
+ group by rollup (channel, i_brand_id,i_class_id,i_category_id)
+ order by channel,i_brand_id,i_class_id,i_category_id
+ LIMIT 100;
+with cross_items as
+ (select i_item_sk ss_item_sk
+ from item,
+ (select iss.i_brand_id brand_id
+ ,iss.i_class_id class_id
+ ,iss.i_category_id category_id
+ from store_sales
+ ,item iss
+ ,date_dim d1
+ where ss_item_sk = iss.i_item_sk
+ and ss_sold_date_sk = d1.d_date_sk
+ and d1.d_year between 1999 AND 1999 + 2
+ intersect
+ select ics.i_brand_id
+ ,ics.i_class_id
+ ,ics.i_category_id
+ from catalog_sales
+ ,item ics
+ ,date_dim d2
+ where cs_item_sk = ics.i_item_sk
+ and cs_sold_date_sk = d2.d_date_sk
+ and d2.d_year between 1999 AND 1999 + 2
+ intersect
+ select iws.i_brand_id
+ ,iws.i_class_id
+ ,iws.i_category_id
+ from web_sales
+ ,item iws
+ ,date_dim d3
+ where ws_item_sk = iws.i_item_sk
+ and ws_sold_date_sk = d3.d_date_sk
+ and d3.d_year between 1999 AND 1999 + 2) x
+ where i_brand_id = brand_id
+ and i_class_id = class_id
+ and i_category_id = category_id
+),
+ avg_sales as
+(select avg(quantity*list_price) average_sales
+ from (select ss_quantity quantity
+ ,ss_list_price list_price
+ from store_sales
+ ,date_dim
+ where ss_sold_date_sk = d_date_sk
+ and d_year between 1999 and 1999 + 2
+ union all
+ select cs_quantity quantity
+ ,cs_list_price list_price
+ from catalog_sales
+ ,date_dim
+ where cs_sold_date_sk = d_date_sk
+ and d_year between 1999 and 1999 + 2
+ union all
+ select ws_quantity quantity
+ ,ws_list_price list_price
+ from web_sales
+ ,date_dim
+ where ws_sold_date_sk = d_date_sk
+ and d_year between 1999 and 1999 + 2) x)
+ select this_year.channel ty_channel
+ ,this_year.i_brand_id ty_brand
+ ,this_year.i_class_id ty_class
+ ,this_year.i_category_id ty_category
+ ,this_year.sales ty_sales
+ ,this_year.number_sales ty_number_sales
+ ,last_year.channel ly_channel
+ ,last_year.i_brand_id ly_brand
+ ,last_year.i_class_id ly_class
+ ,last_year.i_category_id ly_category
+ ,last_year.sales ly_sales
+ ,last_year.number_sales ly_number_sales
+ from
+ (select 'store' channel, i_brand_id,i_class_id,i_category_id
+ ,sum(ss_quantity*ss_list_price) sales, count(*) number_sales
+ from store_sales
+ ,item
+ ,date_dim
+ where ss_item_sk in (select ss_item_sk from cross_items)
+ and ss_item_sk = i_item_sk
+ and ss_sold_date_sk = d_date_sk
+ and d_week_seq = (select d_week_seq
+ from date_dim
+ where d_year = 1999 + 1
+ and d_moy = 12
+ and d_dom = 14)
+ group by i_brand_id,i_class_id,i_category_id
+ having sum(ss_quantity*ss_list_price) > (select average_sales from avg_sales)) this_year,
+ (select 'store' channel, i_brand_id,i_class_id
+ ,i_category_id, sum(ss_quantity*ss_list_price) sales, count(*) number_sales
+ from store_sales
+ ,item
+ ,date_dim
+ where ss_item_sk in (select ss_item_sk from cross_items)
+ and ss_item_sk = i_item_sk
+ and ss_sold_date_sk = d_date_sk
+ and d_week_seq = (select d_week_seq
+ from date_dim
+ where d_year = 1999
+ and d_moy = 12
+ and d_dom = 14)
+ group by i_brand_id,i_class_id,i_category_id
+ having sum(ss_quantity*ss_list_price) > (select average_sales from avg_sales)) last_year
+ where this_year.i_brand_id= last_year.i_brand_id
+ and this_year.i_class_id = last_year.i_class_id
+ and this_year.i_category_id = last_year.i_category_id
+ order by this_year.channel, this_year.i_brand_id, this_year.i_class_id, this_year.i_category_id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q15.sql b/benchmarks/queries/tpcds/q15.sql
new file mode 100644
index 0000000000..bb1812a07c
--- /dev/null
+++ b/benchmarks/queries/tpcds/q15.sql
@@ -0,0 +1,21 @@
+-- SQLBench-DS query 15 derived from TPC-DS query 15 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select ca_zip
+ ,sum(cs_sales_price)
+ from catalog_sales
+ ,customer
+ ,customer_address
+ ,date_dim
+ where cs_bill_customer_sk = c_customer_sk
+ and c_current_addr_sk = ca_address_sk
+ and ( substr(ca_zip,1,5) in ('85669', '86197','88274','83405','86475',
+ '85392', '85460', '80348', '81792')
+ or ca_state in ('CA','WA','GA')
+ or cs_sales_price > 500)
+ and cs_sold_date_sk = d_date_sk
+ and d_qoy = 2 and d_year = 2002
+ group by ca_zip
+ order by ca_zip
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q16.sql b/benchmarks/queries/tpcds/q16.sql
new file mode 100644
index 0000000000..2e0f9a9922
--- /dev/null
+++ b/benchmarks/queries/tpcds/q16.sql
@@ -0,0 +1,32 @@
+-- SQLBench-DS query 16 derived from TPC-DS query 16 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ count(distinct cs_order_number) as `order count`
+ ,sum(cs_ext_ship_cost) as `total shipping cost`
+ ,sum(cs_net_profit) as `total net profit`
+from
+ catalog_sales cs1
+ ,date_dim
+ ,customer_address
+ ,call_center
+where
+ d_date between '1999-5-01' and
+ (cast('1999-5-01' as date) + INTERVAL '60 DAYS')
+and cs1.cs_ship_date_sk = d_date_sk
+and cs1.cs_ship_addr_sk = ca_address_sk
+and ca_state = 'ID'
+and cs1.cs_call_center_sk = cc_call_center_sk
+and cc_county in ('Williamson County','Williamson County','Williamson County','Williamson County',
+ 'Williamson County'
+)
+and exists (select *
+ from catalog_sales cs2
+ where cs1.cs_order_number = cs2.cs_order_number
+ and cs1.cs_warehouse_sk <> cs2.cs_warehouse_sk)
+and not exists(select *
+ from catalog_returns cr1
+ where cs1.cs_order_number = cr1.cr_order_number)
+order by count(distinct cs_order_number)
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q17.sql b/benchmarks/queries/tpcds/q17.sql
new file mode 100644
index 0000000000..9f9e97d76e
--- /dev/null
+++ b/benchmarks/queries/tpcds/q17.sql
@@ -0,0 +1,46 @@
+-- SQLBench-DS query 17 derived from TPC-DS query 17 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_item_id
+ ,i_item_desc
+ ,s_state
+ ,count(ss_quantity) as store_sales_quantitycount
+ ,avg(ss_quantity) as store_sales_quantityave
+ ,stddev_samp(ss_quantity) as store_sales_quantitystdev
+ ,stddev_samp(ss_quantity)/avg(ss_quantity) as store_sales_quantitycov
+ ,count(sr_return_quantity) as store_returns_quantitycount
+ ,avg(sr_return_quantity) as store_returns_quantityave
+ ,stddev_samp(sr_return_quantity) as store_returns_quantitystdev
+ ,stddev_samp(sr_return_quantity)/avg(sr_return_quantity) as store_returns_quantitycov
+ ,count(cs_quantity) as catalog_sales_quantitycount ,avg(cs_quantity) as catalog_sales_quantityave
+ ,stddev_samp(cs_quantity) as catalog_sales_quantitystdev
+ ,stddev_samp(cs_quantity)/avg(cs_quantity) as catalog_sales_quantitycov
+ from store_sales
+ ,store_returns
+ ,catalog_sales
+ ,date_dim d1
+ ,date_dim d2
+ ,date_dim d3
+ ,store
+ ,item
+ where d1.d_quarter_name = '1999Q1'
+ and d1.d_date_sk = ss_sold_date_sk
+ and i_item_sk = ss_item_sk
+ and s_store_sk = ss_store_sk
+ and ss_customer_sk = sr_customer_sk
+ and ss_item_sk = sr_item_sk
+ and ss_ticket_number = sr_ticket_number
+ and sr_returned_date_sk = d2.d_date_sk
+ and d2.d_quarter_name in ('1999Q1','1999Q2','1999Q3')
+ and sr_customer_sk = cs_bill_customer_sk
+ and sr_item_sk = cs_item_sk
+ and cs_sold_date_sk = d3.d_date_sk
+ and d3.d_quarter_name in ('1999Q1','1999Q2','1999Q3')
+ group by i_item_id
+ ,i_item_desc
+ ,s_state
+ order by i_item_id
+ ,i_item_desc
+ ,s_state
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q18.sql b/benchmarks/queries/tpcds/q18.sql
new file mode 100644
index 0000000000..50cc6c63f2
--- /dev/null
+++ b/benchmarks/queries/tpcds/q18.sql
@@ -0,0 +1,35 @@
+-- SQLBench-DS query 18 derived from TPC-DS query 18 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_item_id,
+ ca_country,
+ ca_state,
+ ca_county,
+ avg( cast(cs_quantity as decimal(12,2))) agg1,
+ avg( cast(cs_list_price as decimal(12,2))) agg2,
+ avg( cast(cs_coupon_amt as decimal(12,2))) agg3,
+ avg( cast(cs_sales_price as decimal(12,2))) agg4,
+ avg( cast(cs_net_profit as decimal(12,2))) agg5,
+ avg( cast(c_birth_year as decimal(12,2))) agg6,
+ avg( cast(cd1.cd_dep_count as decimal(12,2))) agg7
+ from catalog_sales, customer_demographics cd1,
+ customer_demographics cd2, customer, customer_address, date_dim, item
+ where cs_sold_date_sk = d_date_sk and
+ cs_item_sk = i_item_sk and
+ cs_bill_cdemo_sk = cd1.cd_demo_sk and
+ cs_bill_customer_sk = c_customer_sk and
+ cd1.cd_gender = 'M' and
+ cd1.cd_education_status = 'Primary' and
+ c_current_cdemo_sk = cd2.cd_demo_sk and
+ c_current_addr_sk = ca_address_sk and
+ c_birth_month in (1,2,9,5,11,3) and
+ d_year = 1998 and
+ ca_state in ('MS','NE','IA'
+ ,'MI','GA','NY','CO')
+ group by rollup (i_item_id, ca_country, ca_state, ca_county)
+ order by ca_country,
+ ca_state,
+ ca_county,
+ i_item_id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q19.sql b/benchmarks/queries/tpcds/q19.sql
new file mode 100644
index 0000000000..bf54b3b802
--- /dev/null
+++ b/benchmarks/queries/tpcds/q19.sql
@@ -0,0 +1,26 @@
+-- SQLBench-DS query 19 derived from TPC-DS query 19 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_brand_id brand_id, i_brand brand, i_manufact_id, i_manufact,
+ sum(ss_ext_sales_price) ext_price
+ from date_dim, store_sales, item,customer,customer_address,store
+ where d_date_sk = ss_sold_date_sk
+ and ss_item_sk = i_item_sk
+ and i_manager_id=8
+ and d_moy=11
+ and d_year=1999
+ and ss_customer_sk = c_customer_sk
+ and c_current_addr_sk = ca_address_sk
+ and substr(ca_zip,1,5) <> substr(s_zip,1,5)
+ and ss_store_sk = s_store_sk
+ group by i_brand
+ ,i_brand_id
+ ,i_manufact_id
+ ,i_manufact
+ order by ext_price desc
+ ,i_brand
+ ,i_brand_id
+ ,i_manufact_id
+ ,i_manufact
+ LIMIT 100 ;
+
diff --git a/benchmarks/queries/tpcds/q2.sql b/benchmarks/queries/tpcds/q2.sql
new file mode 100644
index 0000000000..838717836b
--- /dev/null
+++ b/benchmarks/queries/tpcds/q2.sql
@@ -0,0 +1,61 @@
+-- SQLBench-DS query 2 derived from TPC-DS query 2 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with wscs as
+ (select sold_date_sk
+ ,sales_price
+ from (select ws_sold_date_sk sold_date_sk
+ ,ws_ext_sales_price sales_price
+ from web_sales
+ union all
+ select cs_sold_date_sk sold_date_sk
+ ,cs_ext_sales_price sales_price
+ from catalog_sales)),
+ wswscs as
+ (select d_week_seq,
+ sum(case when (d_day_name='Sunday') then sales_price else null end) sun_sales,
+ sum(case when (d_day_name='Monday') then sales_price else null end) mon_sales,
+ sum(case when (d_day_name='Tuesday') then sales_price else null end) tue_sales,
+ sum(case when (d_day_name='Wednesday') then sales_price else null end) wed_sales,
+ sum(case when (d_day_name='Thursday') then sales_price else null end) thu_sales,
+ sum(case when (d_day_name='Friday') then sales_price else null end) fri_sales,
+ sum(case when (d_day_name='Saturday') then sales_price else null end) sat_sales
+ from wscs
+ ,date_dim
+ where d_date_sk = sold_date_sk
+ group by d_week_seq)
+ select d_week_seq1
+ ,round(sun_sales1/sun_sales2,2)
+ ,round(mon_sales1/mon_sales2,2)
+ ,round(tue_sales1/tue_sales2,2)
+ ,round(wed_sales1/wed_sales2,2)
+ ,round(thu_sales1/thu_sales2,2)
+ ,round(fri_sales1/fri_sales2,2)
+ ,round(sat_sales1/sat_sales2,2)
+ from
+ (select wswscs.d_week_seq d_week_seq1
+ ,sun_sales sun_sales1
+ ,mon_sales mon_sales1
+ ,tue_sales tue_sales1
+ ,wed_sales wed_sales1
+ ,thu_sales thu_sales1
+ ,fri_sales fri_sales1
+ ,sat_sales sat_sales1
+ from wswscs,date_dim
+ where date_dim.d_week_seq = wswscs.d_week_seq and
+ d_year = 2000) y,
+ (select wswscs.d_week_seq d_week_seq2
+ ,sun_sales sun_sales2
+ ,mon_sales mon_sales2
+ ,tue_sales tue_sales2
+ ,wed_sales wed_sales2
+ ,thu_sales thu_sales2
+ ,fri_sales fri_sales2
+ ,sat_sales sat_sales2
+ from wswscs
+ ,date_dim
+ where date_dim.d_week_seq = wswscs.d_week_seq and
+ d_year = 2000+1) z
+ where d_week_seq1=d_week_seq2-53
+ order by d_week_seq1;
+
diff --git a/benchmarks/queries/tpcds/q20.sql b/benchmarks/queries/tpcds/q20.sql
new file mode 100644
index 0000000000..ea4747317d
--- /dev/null
+++ b/benchmarks/queries/tpcds/q20.sql
@@ -0,0 +1,31 @@
+-- SQLBench-DS query 20 derived from TPC-DS query 20 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_item_id
+ ,i_item_desc
+ ,i_category
+ ,i_class
+ ,i_current_price
+ ,sum(cs_ext_sales_price) as itemrevenue
+ ,sum(cs_ext_sales_price)*100/sum(sum(cs_ext_sales_price)) over
+ (partition by i_class) as revenueratio
+ from catalog_sales
+ ,item
+ ,date_dim
+ where cs_item_sk = i_item_sk
+ and i_category in ('Children', 'Sports', 'Music')
+ and cs_sold_date_sk = d_date_sk
+ and d_date between cast('2002-04-01' as date)
+ and (cast('2002-04-01' as date) + INTERVAL '30 DAYS')
+ group by i_item_id
+ ,i_item_desc
+ ,i_category
+ ,i_class
+ ,i_current_price
+ order by i_category
+ ,i_class
+ ,i_item_id
+ ,i_item_desc
+ ,revenueratio
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q21.sql b/benchmarks/queries/tpcds/q21.sql
new file mode 100644
index 0000000000..d768fa1428
--- /dev/null
+++ b/benchmarks/queries/tpcds/q21.sql
@@ -0,0 +1,31 @@
+-- SQLBench-DS query 21 derived from TPC-DS query 21 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select *
+ from(select w_warehouse_name
+ ,i_item_id
+ ,sum(case when (cast(d_date as date) < cast ('2000-05-19' as date))
+ then inv_quantity_on_hand
+ else 0 end) as inv_before
+ ,sum(case when (cast(d_date as date) >= cast ('2000-05-19' as date))
+ then inv_quantity_on_hand
+ else 0 end) as inv_after
+ from inventory
+ ,warehouse
+ ,item
+ ,date_dim
+ where i_current_price between 0.99 and 1.49
+ and i_item_sk = inv_item_sk
+ and inv_warehouse_sk = w_warehouse_sk
+ and inv_date_sk = d_date_sk
+ and d_date between (cast ('2000-05-19' as date) - INTERVAL '30 DAYS')
+ and (cast ('2000-05-19' as date) + INTERVAL '30 DAYS')
+ group by w_warehouse_name, i_item_id) x
+ where (case when inv_before > 0
+ then inv_after / inv_before
+ else null
+ end) between 2.0/3.0 and 3.0/2.0
+ order by w_warehouse_name
+ ,i_item_id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q22.sql b/benchmarks/queries/tpcds/q22.sql
new file mode 100644
index 0000000000..c7e1c78181
--- /dev/null
+++ b/benchmarks/queries/tpcds/q22.sql
@@ -0,0 +1,21 @@
+-- SQLBench-DS query 22 derived from TPC-DS query 22 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_product_name
+ ,i_brand
+ ,i_class
+ ,i_category
+ ,avg(inv_quantity_on_hand) qoh
+ from inventory
+ ,date_dim
+ ,item
+ where inv_date_sk=d_date_sk
+ and inv_item_sk=i_item_sk
+ and d_month_seq between 1201 and 1201 + 11
+ group by rollup(i_product_name
+ ,i_brand
+ ,i_class
+ ,i_category)
+order by qoh, i_product_name, i_brand, i_class, i_category
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q23.sql b/benchmarks/queries/tpcds/q23.sql
new file mode 100644
index 0000000000..0dc7f73859
--- /dev/null
+++ b/benchmarks/queries/tpcds/q23.sql
@@ -0,0 +1,108 @@
+-- SQLBench-DS query 23 derived from TPC-DS query 23 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with frequent_ss_items as
+ (select substr(i_item_desc,1,30) itemdesc,i_item_sk item_sk,d_date solddate,count(*) cnt
+ from store_sales
+ ,date_dim
+ ,item
+ where ss_sold_date_sk = d_date_sk
+ and ss_item_sk = i_item_sk
+ and d_year in (2000,2000+1,2000+2,2000+3)
+ group by substr(i_item_desc,1,30),i_item_sk,d_date
+ having count(*) >4),
+ max_store_sales as
+ (select max(csales) tpcds_cmax
+ from (select c_customer_sk,sum(ss_quantity*ss_sales_price) csales
+ from store_sales
+ ,customer
+ ,date_dim
+ where ss_customer_sk = c_customer_sk
+ and ss_sold_date_sk = d_date_sk
+ and d_year in (2000,2000+1,2000+2,2000+3)
+ group by c_customer_sk)),
+ best_ss_customer as
+ (select c_customer_sk,sum(ss_quantity*ss_sales_price) ssales
+ from store_sales
+ ,customer
+ where ss_customer_sk = c_customer_sk
+ group by c_customer_sk
+ having sum(ss_quantity*ss_sales_price) > (95/100.0) * (select
+ *
+from
+ max_store_sales))
+ select sum(sales)
+ from (select cs_quantity*cs_list_price sales
+ from catalog_sales
+ ,date_dim
+ where d_year = 2000
+ and d_moy = 3
+ and cs_sold_date_sk = d_date_sk
+ and cs_item_sk in (select item_sk from frequent_ss_items)
+ and cs_bill_customer_sk in (select c_customer_sk from best_ss_customer)
+ union all
+ select ws_quantity*ws_list_price sales
+ from web_sales
+ ,date_dim
+ where d_year = 2000
+ and d_moy = 3
+ and ws_sold_date_sk = d_date_sk
+ and ws_item_sk in (select item_sk from frequent_ss_items)
+ and ws_bill_customer_sk in (select c_customer_sk from best_ss_customer))
+ LIMIT 100;
+with frequent_ss_items as
+ (select substr(i_item_desc,1,30) itemdesc,i_item_sk item_sk,d_date solddate,count(*) cnt
+ from store_sales
+ ,date_dim
+ ,item
+ where ss_sold_date_sk = d_date_sk
+ and ss_item_sk = i_item_sk
+ and d_year in (2000,2000 + 1,2000 + 2,2000 + 3)
+ group by substr(i_item_desc,1,30),i_item_sk,d_date
+ having count(*) >4),
+ max_store_sales as
+ (select max(csales) tpcds_cmax
+ from (select c_customer_sk,sum(ss_quantity*ss_sales_price) csales
+ from store_sales
+ ,customer
+ ,date_dim
+ where ss_customer_sk = c_customer_sk
+ and ss_sold_date_sk = d_date_sk
+ and d_year in (2000,2000+1,2000+2,2000+3)
+ group by c_customer_sk)),
+ best_ss_customer as
+ (select c_customer_sk,sum(ss_quantity*ss_sales_price) ssales
+ from store_sales
+ ,customer
+ where ss_customer_sk = c_customer_sk
+ group by c_customer_sk
+ having sum(ss_quantity*ss_sales_price) > (95/100.0) * (select
+ *
+ from max_store_sales))
+ select c_last_name,c_first_name,sales
+ from (select c_last_name,c_first_name,sum(cs_quantity*cs_list_price) sales
+ from catalog_sales
+ ,customer
+ ,date_dim
+ where d_year = 2000
+ and d_moy = 3
+ and cs_sold_date_sk = d_date_sk
+ and cs_item_sk in (select item_sk from frequent_ss_items)
+ and cs_bill_customer_sk in (select c_customer_sk from best_ss_customer)
+ and cs_bill_customer_sk = c_customer_sk
+ group by c_last_name,c_first_name
+ union all
+ select c_last_name,c_first_name,sum(ws_quantity*ws_list_price) sales
+ from web_sales
+ ,customer
+ ,date_dim
+ where d_year = 2000
+ and d_moy = 3
+ and ws_sold_date_sk = d_date_sk
+ and ws_item_sk in (select item_sk from frequent_ss_items)
+ and ws_bill_customer_sk in (select c_customer_sk from best_ss_customer)
+ and ws_bill_customer_sk = c_customer_sk
+ group by c_last_name,c_first_name)
+ order by c_last_name,c_first_name,sales
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q24.sql b/benchmarks/queries/tpcds/q24.sql
new file mode 100644
index 0000000000..5d6d2f5053
--- /dev/null
+++ b/benchmarks/queries/tpcds/q24.sql
@@ -0,0 +1,108 @@
+-- SQLBench-DS query 24 derived from TPC-DS query 24 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with ssales as
+(select c_last_name
+ ,c_first_name
+ ,s_store_name
+ ,ca_state
+ ,s_state
+ ,i_color
+ ,i_current_price
+ ,i_manager_id
+ ,i_units
+ ,i_size
+ ,sum(ss_net_profit) netpaid
+from store_sales
+ ,store_returns
+ ,store
+ ,item
+ ,customer
+ ,customer_address
+where ss_ticket_number = sr_ticket_number
+ and ss_item_sk = sr_item_sk
+ and ss_customer_sk = c_customer_sk
+ and ss_item_sk = i_item_sk
+ and ss_store_sk = s_store_sk
+ and c_current_addr_sk = ca_address_sk
+ and c_birth_country <> upper(ca_country)
+ and s_zip = ca_zip
+and s_market_id=10
+group by c_last_name
+ ,c_first_name
+ ,s_store_name
+ ,ca_state
+ ,s_state
+ ,i_color
+ ,i_current_price
+ ,i_manager_id
+ ,i_units
+ ,i_size)
+select c_last_name
+ ,c_first_name
+ ,s_store_name
+ ,sum(netpaid) paid
+from ssales
+where i_color = 'orchid'
+group by c_last_name
+ ,c_first_name
+ ,s_store_name
+having sum(netpaid) > (select 0.05*avg(netpaid)
+ from ssales)
+order by c_last_name
+ ,c_first_name
+ ,s_store_name
+;
+with ssales as
+(select c_last_name
+ ,c_first_name
+ ,s_store_name
+ ,ca_state
+ ,s_state
+ ,i_color
+ ,i_current_price
+ ,i_manager_id
+ ,i_units
+ ,i_size
+ ,sum(ss_net_profit) netpaid
+from store_sales
+ ,store_returns
+ ,store
+ ,item
+ ,customer
+ ,customer_address
+where ss_ticket_number = sr_ticket_number
+ and ss_item_sk = sr_item_sk
+ and ss_customer_sk = c_customer_sk
+ and ss_item_sk = i_item_sk
+ and ss_store_sk = s_store_sk
+ and c_current_addr_sk = ca_address_sk
+ and c_birth_country <> upper(ca_country)
+ and s_zip = ca_zip
+ and s_market_id = 10
+group by c_last_name
+ ,c_first_name
+ ,s_store_name
+ ,ca_state
+ ,s_state
+ ,i_color
+ ,i_current_price
+ ,i_manager_id
+ ,i_units
+ ,i_size)
+select c_last_name
+ ,c_first_name
+ ,s_store_name
+ ,sum(netpaid) paid
+from ssales
+where i_color = 'green'
+group by c_last_name
+ ,c_first_name
+ ,s_store_name
+having sum(netpaid) > (select 0.05*avg(netpaid)
+ from ssales)
+order by c_last_name
+ ,c_first_name
+ ,s_store_name
+;
+
diff --git a/benchmarks/queries/tpcds/q25.sql b/benchmarks/queries/tpcds/q25.sql
new file mode 100644
index 0000000000..b0af0e61dd
--- /dev/null
+++ b/benchmarks/queries/tpcds/q25.sql
@@ -0,0 +1,49 @@
+-- SQLBench-DS query 25 derived from TPC-DS query 25 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ i_item_id
+ ,i_item_desc
+ ,s_store_id
+ ,s_store_name
+ ,min(ss_net_profit) as store_sales_profit
+ ,min(sr_net_loss) as store_returns_loss
+ ,min(cs_net_profit) as catalog_sales_profit
+ from
+ store_sales
+ ,store_returns
+ ,catalog_sales
+ ,date_dim d1
+ ,date_dim d2
+ ,date_dim d3
+ ,store
+ ,item
+ where
+ d1.d_moy = 4
+ and d1.d_year = 2002
+ and d1.d_date_sk = ss_sold_date_sk
+ and i_item_sk = ss_item_sk
+ and s_store_sk = ss_store_sk
+ and ss_customer_sk = sr_customer_sk
+ and ss_item_sk = sr_item_sk
+ and ss_ticket_number = sr_ticket_number
+ and sr_returned_date_sk = d2.d_date_sk
+ and d2.d_moy between 4 and 10
+ and d2.d_year = 2002
+ and sr_customer_sk = cs_bill_customer_sk
+ and sr_item_sk = cs_item_sk
+ and cs_sold_date_sk = d3.d_date_sk
+ and d3.d_moy between 4 and 10
+ and d3.d_year = 2002
+ group by
+ i_item_id
+ ,i_item_desc
+ ,s_store_id
+ ,s_store_name
+ order by
+ i_item_id
+ ,i_item_desc
+ ,s_store_id
+ ,s_store_name
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q26.sql b/benchmarks/queries/tpcds/q26.sql
new file mode 100644
index 0000000000..55ccc8b511
--- /dev/null
+++ b/benchmarks/queries/tpcds/q26.sql
@@ -0,0 +1,22 @@
+-- SQLBench-DS query 26 derived from TPC-DS query 26 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_item_id,
+ avg(cs_quantity) agg1,
+ avg(cs_list_price) agg2,
+ avg(cs_coupon_amt) agg3,
+ avg(cs_sales_price) agg4
+ from catalog_sales, customer_demographics, date_dim, item, promotion
+ where cs_sold_date_sk = d_date_sk and
+ cs_item_sk = i_item_sk and
+ cs_bill_cdemo_sk = cd_demo_sk and
+ cs_promo_sk = p_promo_sk and
+ cd_gender = 'F' and
+ cd_marital_status = 'M' and
+ cd_education_status = '4 yr Degree' and
+ (p_channel_email = 'N' or p_channel_event = 'N') and
+ d_year = 2000
+ group by i_item_id
+ order by i_item_id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q27.sql b/benchmarks/queries/tpcds/q27.sql
new file mode 100644
index 0000000000..6d28e4e663
--- /dev/null
+++ b/benchmarks/queries/tpcds/q27.sql
@@ -0,0 +1,24 @@
+-- SQLBench-DS query 27 derived from TPC-DS query 27 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_item_id,
+ s_state, grouping(s_state) g_state,
+ avg(ss_quantity) agg1,
+ avg(ss_list_price) agg2,
+ avg(ss_coupon_amt) agg3,
+ avg(ss_sales_price) agg4
+ from store_sales, customer_demographics, date_dim, store, item
+ where ss_sold_date_sk = d_date_sk and
+ ss_item_sk = i_item_sk and
+ ss_store_sk = s_store_sk and
+ ss_cdemo_sk = cd_demo_sk and
+ cd_gender = 'M' and
+ cd_marital_status = 'U' and
+ cd_education_status = 'Secondary' and
+ d_year = 2000 and
+ s_state in ('TN','TN', 'TN', 'TN', 'TN', 'TN')
+ group by rollup (i_item_id, s_state)
+ order by i_item_id
+ ,s_state
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q28.sql b/benchmarks/queries/tpcds/q28.sql
new file mode 100644
index 0000000000..6efa7d7d77
--- /dev/null
+++ b/benchmarks/queries/tpcds/q28.sql
@@ -0,0 +1,54 @@
+-- SQLBench-DS query 28 derived from TPC-DS query 28 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select *
+from (select avg(ss_list_price) B1_LP
+ ,count(ss_list_price) B1_CNT
+ ,count(distinct ss_list_price) B1_CNTD
+ from store_sales
+ where ss_quantity between 0 and 5
+ and (ss_list_price between 28 and 28+10
+ or ss_coupon_amt between 12573 and 12573+1000
+ or ss_wholesale_cost between 33 and 33+20)) B1,
+ (select avg(ss_list_price) B2_LP
+ ,count(ss_list_price) B2_CNT
+ ,count(distinct ss_list_price) B2_CNTD
+ from store_sales
+ where ss_quantity between 6 and 10
+ and (ss_list_price between 143 and 143+10
+ or ss_coupon_amt between 5562 and 5562+1000
+ or ss_wholesale_cost between 45 and 45+20)) B2,
+ (select avg(ss_list_price) B3_LP
+ ,count(ss_list_price) B3_CNT
+ ,count(distinct ss_list_price) B3_CNTD
+ from store_sales
+ where ss_quantity between 11 and 15
+ and (ss_list_price between 159 and 159+10
+ or ss_coupon_amt between 2807 and 2807+1000
+ or ss_wholesale_cost between 24 and 24+20)) B3,
+ (select avg(ss_list_price) B4_LP
+ ,count(ss_list_price) B4_CNT
+ ,count(distinct ss_list_price) B4_CNTD
+ from store_sales
+ where ss_quantity between 16 and 20
+ and (ss_list_price between 24 and 24+10
+ or ss_coupon_amt between 3706 and 3706+1000
+ or ss_wholesale_cost between 46 and 46+20)) B4,
+ (select avg(ss_list_price) B5_LP
+ ,count(ss_list_price) B5_CNT
+ ,count(distinct ss_list_price) B5_CNTD
+ from store_sales
+ where ss_quantity between 21 and 25
+ and (ss_list_price between 76 and 76+10
+ or ss_coupon_amt between 2096 and 2096+1000
+ or ss_wholesale_cost between 50 and 50+20)) B5,
+ (select avg(ss_list_price) B6_LP
+ ,count(ss_list_price) B6_CNT
+ ,count(distinct ss_list_price) B6_CNTD
+ from store_sales
+ where ss_quantity between 26 and 30
+ and (ss_list_price between 169 and 169+10
+ or ss_coupon_amt between 10672 and 10672+1000
+ or ss_wholesale_cost between 58 and 58+20)) B6
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q29.sql b/benchmarks/queries/tpcds/q29.sql
new file mode 100644
index 0000000000..8d463f3771
--- /dev/null
+++ b/benchmarks/queries/tpcds/q29.sql
@@ -0,0 +1,48 @@
+-- SQLBench-DS query 29 derived from TPC-DS query 29 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ i_item_id
+ ,i_item_desc
+ ,s_store_id
+ ,s_store_name
+ ,stddev_samp(ss_quantity) as store_sales_quantity
+ ,stddev_samp(sr_return_quantity) as store_returns_quantity
+ ,stddev_samp(cs_quantity) as catalog_sales_quantity
+ from
+ store_sales
+ ,store_returns
+ ,catalog_sales
+ ,date_dim d1
+ ,date_dim d2
+ ,date_dim d3
+ ,store
+ ,item
+ where
+ d1.d_moy = 4
+ and d1.d_year = 1999
+ and d1.d_date_sk = ss_sold_date_sk
+ and i_item_sk = ss_item_sk
+ and s_store_sk = ss_store_sk
+ and ss_customer_sk = sr_customer_sk
+ and ss_item_sk = sr_item_sk
+ and ss_ticket_number = sr_ticket_number
+ and sr_returned_date_sk = d2.d_date_sk
+ and d2.d_moy between 4 and 4 + 3
+ and d2.d_year = 1999
+ and sr_customer_sk = cs_bill_customer_sk
+ and sr_item_sk = cs_item_sk
+ and cs_sold_date_sk = d3.d_date_sk
+ and d3.d_year in (1999,1999+1,1999+2)
+ group by
+ i_item_id
+ ,i_item_desc
+ ,s_store_id
+ ,s_store_name
+ order by
+ i_item_id
+ ,i_item_desc
+ ,s_store_id
+ ,s_store_name
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q3.sql b/benchmarks/queries/tpcds/q3.sql
new file mode 100644
index 0000000000..d6a55cb8cf
--- /dev/null
+++ b/benchmarks/queries/tpcds/q3.sql
@@ -0,0 +1,22 @@
+-- SQLBench-DS query 3 derived from TPC-DS query 3 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select dt.d_year
+ ,item.i_brand_id brand_id
+ ,item.i_brand brand
+ ,sum(ss_net_profit) sum_agg
+ from date_dim dt
+ ,store_sales
+ ,item
+ where dt.d_date_sk = store_sales.ss_sold_date_sk
+ and store_sales.ss_item_sk = item.i_item_sk
+ and item.i_manufact_id = 445
+ and dt.d_moy=12
+ group by dt.d_year
+ ,item.i_brand
+ ,item.i_brand_id
+ order by dt.d_year
+ ,sum_agg desc
+ ,brand_id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q30.sql b/benchmarks/queries/tpcds/q30.sql
new file mode 100644
index 0000000000..7004078a50
--- /dev/null
+++ b/benchmarks/queries/tpcds/q30.sql
@@ -0,0 +1,32 @@
+-- SQLBench-DS query 30 derived from TPC-DS query 30 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with customer_total_return as
+ (select wr_returning_customer_sk as ctr_customer_sk
+ ,ca_state as ctr_state,
+ sum(wr_return_amt) as ctr_total_return
+ from web_returns
+ ,date_dim
+ ,customer_address
+ where wr_returned_date_sk = d_date_sk
+ and d_year =2000
+ and wr_returning_addr_sk = ca_address_sk
+ group by wr_returning_customer_sk
+ ,ca_state)
+ select c_customer_id,c_salutation,c_first_name,c_last_name,c_preferred_cust_flag
+ ,c_birth_day,c_birth_month,c_birth_year,c_birth_country,c_login,c_email_address
+ ,c_last_review_date_sk,ctr_total_return
+ from customer_total_return ctr1
+ ,customer_address
+ ,customer
+ where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2
+ from customer_total_return ctr2
+ where ctr1.ctr_state = ctr2.ctr_state)
+ and ca_address_sk = c_current_addr_sk
+ and ca_state = 'KS'
+ and ctr1.ctr_customer_sk = c_customer_sk
+ order by c_customer_id,c_salutation,c_first_name,c_last_name,c_preferred_cust_flag
+ ,c_birth_day,c_birth_month,c_birth_year,c_birth_country,c_login,c_email_address
+ ,c_last_review_date_sk,ctr_total_return
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q31.sql b/benchmarks/queries/tpcds/q31.sql
new file mode 100644
index 0000000000..89aba18998
--- /dev/null
+++ b/benchmarks/queries/tpcds/q31.sql
@@ -0,0 +1,53 @@
+-- SQLBench-DS query 31 derived from TPC-DS query 31 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with ss as
+ (select ca_county,d_qoy, d_year,sum(ss_ext_sales_price) as store_sales
+ from store_sales,date_dim,customer_address
+ where ss_sold_date_sk = d_date_sk
+ and ss_addr_sk=ca_address_sk
+ group by ca_county,d_qoy, d_year),
+ ws as
+ (select ca_county,d_qoy, d_year,sum(ws_ext_sales_price) as web_sales
+ from web_sales,date_dim,customer_address
+ where ws_sold_date_sk = d_date_sk
+ and ws_bill_addr_sk=ca_address_sk
+ group by ca_county,d_qoy, d_year)
+ select
+ ss1.ca_county
+ ,ss1.d_year
+ ,ws2.web_sales/ws1.web_sales web_q1_q2_increase
+ ,ss2.store_sales/ss1.store_sales store_q1_q2_increase
+ ,ws3.web_sales/ws2.web_sales web_q2_q3_increase
+ ,ss3.store_sales/ss2.store_sales store_q2_q3_increase
+ from
+ ss ss1
+ ,ss ss2
+ ,ss ss3
+ ,ws ws1
+ ,ws ws2
+ ,ws ws3
+ where
+ ss1.d_qoy = 1
+ and ss1.d_year = 1999
+ and ss1.ca_county = ss2.ca_county
+ and ss2.d_qoy = 2
+ and ss2.d_year = 1999
+ and ss2.ca_county = ss3.ca_county
+ and ss3.d_qoy = 3
+ and ss3.d_year = 1999
+ and ss1.ca_county = ws1.ca_county
+ and ws1.d_qoy = 1
+ and ws1.d_year = 1999
+ and ws1.ca_county = ws2.ca_county
+ and ws2.d_qoy = 2
+ and ws2.d_year = 1999
+ and ws1.ca_county = ws3.ca_county
+ and ws3.d_qoy = 3
+ and ws3.d_year =1999
+ and case when ws1.web_sales > 0 then ws2.web_sales/ws1.web_sales else null end
+ > case when ss1.store_sales > 0 then ss2.store_sales/ss1.store_sales else null end
+ and case when ws2.web_sales > 0 then ws3.web_sales/ws2.web_sales else null end
+ > case when ss2.store_sales > 0 then ss3.store_sales/ss2.store_sales else null end
+ order by ss1.ca_county;
+
diff --git a/benchmarks/queries/tpcds/q32.sql b/benchmarks/queries/tpcds/q32.sql
new file mode 100644
index 0000000000..419dcd0b05
--- /dev/null
+++ b/benchmarks/queries/tpcds/q32.sql
@@ -0,0 +1,29 @@
+-- SQLBench-DS query 32 derived from TPC-DS query 32 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select sum(cs_ext_discount_amt) as `excess discount amount`
+from
+ catalog_sales
+ ,item
+ ,date_dim
+where
+i_manufact_id = 283
+and i_item_sk = cs_item_sk
+and d_date between '1999-02-22' and
+ (cast('1999-02-22' as date) + INTERVAL '90 DAYS')
+and d_date_sk = cs_sold_date_sk
+and cs_ext_discount_amt
+ > (
+ select
+ 1.3 * avg(cs_ext_discount_amt)
+ from
+ catalog_sales
+ ,date_dim
+ where
+ cs_item_sk = i_item_sk
+ and d_date between '1999-02-22' and
+ (cast('1999-02-22' as date) + INTERVAL '90 DAYS')
+ and d_date_sk = cs_sold_date_sk
+ )
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q33.sql b/benchmarks/queries/tpcds/q33.sql
new file mode 100644
index 0000000000..1aabc472b7
--- /dev/null
+++ b/benchmarks/queries/tpcds/q33.sql
@@ -0,0 +1,76 @@
+-- SQLBench-DS query 33 derived from TPC-DS query 33 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with ss as (
+ select
+ i_manufact_id,sum(ss_ext_sales_price) total_sales
+ from
+ store_sales,
+ date_dim,
+ customer_address,
+ item
+ where
+ i_manufact_id in (select
+ i_manufact_id
+from
+ item
+where i_category in ('Books'))
+ and ss_item_sk = i_item_sk
+ and ss_sold_date_sk = d_date_sk
+ and d_year = 1999
+ and d_moy = 4
+ and ss_addr_sk = ca_address_sk
+ and ca_gmt_offset = -5
+ group by i_manufact_id),
+ cs as (
+ select
+ i_manufact_id,sum(cs_ext_sales_price) total_sales
+ from
+ catalog_sales,
+ date_dim,
+ customer_address,
+ item
+ where
+ i_manufact_id in (select
+ i_manufact_id
+from
+ item
+where i_category in ('Books'))
+ and cs_item_sk = i_item_sk
+ and cs_sold_date_sk = d_date_sk
+ and d_year = 1999
+ and d_moy = 4
+ and cs_bill_addr_sk = ca_address_sk
+ and ca_gmt_offset = -5
+ group by i_manufact_id),
+ ws as (
+ select
+ i_manufact_id,sum(ws_ext_sales_price) total_sales
+ from
+ web_sales,
+ date_dim,
+ customer_address,
+ item
+ where
+ i_manufact_id in (select
+ i_manufact_id
+from
+ item
+where i_category in ('Books'))
+ and ws_item_sk = i_item_sk
+ and ws_sold_date_sk = d_date_sk
+ and d_year = 1999
+ and d_moy = 4
+ and ws_bill_addr_sk = ca_address_sk
+ and ca_gmt_offset = -5
+ group by i_manufact_id)
+ select i_manufact_id ,sum(total_sales) total_sales
+ from (select * from ss
+ union all
+ select * from cs
+ union all
+ select * from ws) tmp1
+ group by i_manufact_id
+ order by total_sales
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q34.sql b/benchmarks/queries/tpcds/q34.sql
new file mode 100644
index 0000000000..f61caa51a3
--- /dev/null
+++ b/benchmarks/queries/tpcds/q34.sql
@@ -0,0 +1,32 @@
+-- SQLBench-DS query 34 derived from TPC-DS query 34 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select c_last_name
+ ,c_first_name
+ ,c_salutation
+ ,c_preferred_cust_flag
+ ,ss_ticket_number
+ ,cnt from
+ (select ss_ticket_number
+ ,ss_customer_sk
+ ,count(*) cnt
+ from store_sales,date_dim,store,household_demographics
+ where store_sales.ss_sold_date_sk = date_dim.d_date_sk
+ and store_sales.ss_store_sk = store.s_store_sk
+ and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
+ and (date_dim.d_dom between 1 and 3 or date_dim.d_dom between 25 and 28)
+ and (household_demographics.hd_buy_potential = '501-1000' or
+ household_demographics.hd_buy_potential = 'Unknown')
+ and household_demographics.hd_vehicle_count > 0
+ and (case when household_demographics.hd_vehicle_count > 0
+ then household_demographics.hd_dep_count/ household_demographics.hd_vehicle_count
+ else null
+ end) > 1.2
+ and date_dim.d_year in (2000,2000+1,2000+2)
+ and store.s_county in ('Williamson County','Williamson County','Williamson County','Williamson County',
+ 'Williamson County','Williamson County','Williamson County','Williamson County')
+ group by ss_ticket_number,ss_customer_sk) dn,customer
+ where ss_customer_sk = c_customer_sk
+ and cnt between 15 and 20
+ order by c_last_name,c_first_name,c_salutation,c_preferred_cust_flag desc, ss_ticket_number;
+
diff --git a/benchmarks/queries/tpcds/q35.sql b/benchmarks/queries/tpcds/q35.sql
new file mode 100644
index 0000000000..ba0ccf3667
--- /dev/null
+++ b/benchmarks/queries/tpcds/q35.sql
@@ -0,0 +1,59 @@
+-- SQLBench-DS query 35 derived from TPC-DS query 35 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ ca_state,
+ cd_gender,
+ cd_marital_status,
+ cd_dep_count,
+ count(*) cnt1,
+ max(cd_dep_count),
+ stddev_samp(cd_dep_count),
+ stddev_samp(cd_dep_count),
+ cd_dep_employed_count,
+ count(*) cnt2,
+ max(cd_dep_employed_count),
+ stddev_samp(cd_dep_employed_count),
+ stddev_samp(cd_dep_employed_count),
+ cd_dep_college_count,
+ count(*) cnt3,
+ max(cd_dep_college_count),
+ stddev_samp(cd_dep_college_count),
+ stddev_samp(cd_dep_college_count)
+ from
+ customer c,customer_address ca,customer_demographics
+ where
+ c.c_current_addr_sk = ca.ca_address_sk and
+ cd_demo_sk = c.c_current_cdemo_sk and
+ exists (select *
+ from store_sales,date_dim
+ where c.c_customer_sk = ss_customer_sk and
+ ss_sold_date_sk = d_date_sk and
+ d_year = 2000 and
+ d_qoy < 4) and
+ (exists (select *
+ from web_sales,date_dim
+ where c.c_customer_sk = ws_bill_customer_sk and
+ ws_sold_date_sk = d_date_sk and
+ d_year = 2000 and
+ d_qoy < 4) or
+ exists (select *
+ from catalog_sales,date_dim
+ where c.c_customer_sk = cs_ship_customer_sk and
+ cs_sold_date_sk = d_date_sk and
+ d_year = 2000 and
+ d_qoy < 4))
+ group by ca_state,
+ cd_gender,
+ cd_marital_status,
+ cd_dep_count,
+ cd_dep_employed_count,
+ cd_dep_college_count
+ order by ca_state,
+ cd_gender,
+ cd_marital_status,
+ cd_dep_count,
+ cd_dep_employed_count,
+ cd_dep_college_count
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q36.sql b/benchmarks/queries/tpcds/q36.sql
new file mode 100644
index 0000000000..889fff5d14
--- /dev/null
+++ b/benchmarks/queries/tpcds/q36.sql
@@ -0,0 +1,31 @@
+-- SQLBench-DS query 36 derived from TPC-DS query 36 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ sum(ss_net_profit)/sum(ss_ext_sales_price) as gross_margin
+ ,i_category
+ ,i_class
+ ,grouping(i_category)+grouping(i_class) as lochierarchy
+ ,rank() over (
+ partition by grouping(i_category)+grouping(i_class),
+ case when grouping(i_class) = 0 then i_category end
+ order by sum(ss_net_profit)/sum(ss_ext_sales_price) asc) as rank_within_parent
+ from
+ store_sales
+ ,date_dim d1
+ ,item
+ ,store
+ where
+ d1.d_year = 2001
+ and d1.d_date_sk = ss_sold_date_sk
+ and i_item_sk = ss_item_sk
+ and s_store_sk = ss_store_sk
+ and s_state in ('TN','TN','TN','TN',
+ 'TN','TN','TN','TN')
+ group by rollup(i_category,i_class)
+ order by
+ lochierarchy desc
+ ,case when lochierarchy = 0 then i_category end
+ ,rank_within_parent
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q37.sql b/benchmarks/queries/tpcds/q37.sql
new file mode 100644
index 0000000000..bdd12dc82e
--- /dev/null
+++ b/benchmarks/queries/tpcds/q37.sql
@@ -0,0 +1,18 @@
+-- SQLBench-DS query 37 derived from TPC-DS query 37 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_item_id
+ ,i_item_desc
+ ,i_current_price
+ from item, inventory, date_dim, catalog_sales
+ where i_current_price between 26 and 26 + 30
+ and inv_item_sk = i_item_sk
+ and d_date_sk=inv_date_sk
+ and d_date between cast('2001-06-09' as date) and (cast('2001-06-09' as date) + INTERVAL '60 DAYS')
+ and i_manufact_id in (744,884,722,693)
+ and inv_quantity_on_hand between 100 and 500
+ and cs_item_sk = i_item_sk
+ group by i_item_id,i_item_desc,i_current_price
+ order by i_item_id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q38.sql b/benchmarks/queries/tpcds/q38.sql
new file mode 100644
index 0000000000..03e4e07635
--- /dev/null
+++ b/benchmarks/queries/tpcds/q38.sql
@@ -0,0 +1,24 @@
+-- SQLBench-DS query 38 derived from TPC-DS query 38 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select count(*) from (
+ select distinct c_last_name, c_first_name, d_date
+ from store_sales, date_dim, customer
+ where store_sales.ss_sold_date_sk = date_dim.d_date_sk
+ and store_sales.ss_customer_sk = customer.c_customer_sk
+ and d_month_seq between 1190 and 1190 + 11
+ intersect
+ select distinct c_last_name, c_first_name, d_date
+ from catalog_sales, date_dim, customer
+ where catalog_sales.cs_sold_date_sk = date_dim.d_date_sk
+ and catalog_sales.cs_bill_customer_sk = customer.c_customer_sk
+ and d_month_seq between 1190 and 1190 + 11
+ intersect
+ select distinct c_last_name, c_first_name, d_date
+ from web_sales, date_dim, customer
+ where web_sales.ws_sold_date_sk = date_dim.d_date_sk
+ and web_sales.ws_bill_customer_sk = customer.c_customer_sk
+ and d_month_seq between 1190 and 1190 + 11
+) hot_cust
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q39.sql b/benchmarks/queries/tpcds/q39.sql
new file mode 100644
index 0000000000..f49c223eba
--- /dev/null
+++ b/benchmarks/queries/tpcds/q39.sql
@@ -0,0 +1,55 @@
+-- SQLBench-DS query 39 derived from TPC-DS query 39 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with inv as
+(select w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy
+ ,stdev,mean, case mean when 0 then null else stdev/mean end cov
+ from(select w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy
+ ,stddev_samp(inv_quantity_on_hand) stdev,avg(inv_quantity_on_hand) mean
+ from inventory
+ ,item
+ ,warehouse
+ ,date_dim
+ where inv_item_sk = i_item_sk
+ and inv_warehouse_sk = w_warehouse_sk
+ and inv_date_sk = d_date_sk
+ and d_year =2001
+ group by w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy) foo
+ where case mean when 0 then 0 else stdev/mean end > 1)
+select inv1.w_warehouse_sk,inv1.i_item_sk,inv1.d_moy,inv1.mean, inv1.cov
+ ,inv2.w_warehouse_sk,inv2.i_item_sk,inv2.d_moy,inv2.mean, inv2.cov
+from inv inv1,inv inv2
+where inv1.i_item_sk = inv2.i_item_sk
+ and inv1.w_warehouse_sk = inv2.w_warehouse_sk
+ and inv1.d_moy=1
+ and inv2.d_moy=1+1
+order by inv1.w_warehouse_sk,inv1.i_item_sk,inv1.d_moy,inv1.mean,inv1.cov
+ ,inv2.d_moy,inv2.mean, inv2.cov
+;
+with inv as
+(select w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy
+ ,stdev,mean, case mean when 0 then null else stdev/mean end cov
+ from(select w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy
+ ,stddev_samp(inv_quantity_on_hand) stdev,avg(inv_quantity_on_hand) mean
+ from inventory
+ ,item
+ ,warehouse
+ ,date_dim
+ where inv_item_sk = i_item_sk
+ and inv_warehouse_sk = w_warehouse_sk
+ and inv_date_sk = d_date_sk
+ and d_year =2001
+ group by w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy) foo
+ where case mean when 0 then 0 else stdev/mean end > 1)
+select inv1.w_warehouse_sk,inv1.i_item_sk,inv1.d_moy,inv1.mean, inv1.cov
+ ,inv2.w_warehouse_sk,inv2.i_item_sk,inv2.d_moy,inv2.mean, inv2.cov
+from inv inv1,inv inv2
+where inv1.i_item_sk = inv2.i_item_sk
+ and inv1.w_warehouse_sk = inv2.w_warehouse_sk
+ and inv1.d_moy=1
+ and inv2.d_moy=1+1
+ and inv1.cov > 1.5
+order by inv1.w_warehouse_sk,inv1.i_item_sk,inv1.d_moy,inv1.mean,inv1.cov
+ ,inv2.d_moy,inv2.mean, inv2.cov
+;
+
diff --git a/benchmarks/queries/tpcds/q4.sql b/benchmarks/queries/tpcds/q4.sql
new file mode 100644
index 0000000000..08643201a5
--- /dev/null
+++ b/benchmarks/queries/tpcds/q4.sql
@@ -0,0 +1,117 @@
+-- SQLBench-DS query 4 derived from TPC-DS query 4 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with year_total as (
+ select c_customer_id customer_id
+ ,c_first_name customer_first_name
+ ,c_last_name customer_last_name
+ ,c_preferred_cust_flag customer_preferred_cust_flag
+ ,c_birth_country customer_birth_country
+ ,c_login customer_login
+ ,c_email_address customer_email_address
+ ,d_year dyear
+ ,sum(((ss_ext_list_price-ss_ext_wholesale_cost-ss_ext_discount_amt)+ss_ext_sales_price)/2) year_total
+ ,'s' sale_type
+ from customer
+ ,store_sales
+ ,date_dim
+ where c_customer_sk = ss_customer_sk
+ and ss_sold_date_sk = d_date_sk
+ group by c_customer_id
+ ,c_first_name
+ ,c_last_name
+ ,c_preferred_cust_flag
+ ,c_birth_country
+ ,c_login
+ ,c_email_address
+ ,d_year
+ union all
+ select c_customer_id customer_id
+ ,c_first_name customer_first_name
+ ,c_last_name customer_last_name
+ ,c_preferred_cust_flag customer_preferred_cust_flag
+ ,c_birth_country customer_birth_country
+ ,c_login customer_login
+ ,c_email_address customer_email_address
+ ,d_year dyear
+ ,sum((((cs_ext_list_price-cs_ext_wholesale_cost-cs_ext_discount_amt)+cs_ext_sales_price)/2) ) year_total
+ ,'c' sale_type
+ from customer
+ ,catalog_sales
+ ,date_dim
+ where c_customer_sk = cs_bill_customer_sk
+ and cs_sold_date_sk = d_date_sk
+ group by c_customer_id
+ ,c_first_name
+ ,c_last_name
+ ,c_preferred_cust_flag
+ ,c_birth_country
+ ,c_login
+ ,c_email_address
+ ,d_year
+union all
+ select c_customer_id customer_id
+ ,c_first_name customer_first_name
+ ,c_last_name customer_last_name
+ ,c_preferred_cust_flag customer_preferred_cust_flag
+ ,c_birth_country customer_birth_country
+ ,c_login customer_login
+ ,c_email_address customer_email_address
+ ,d_year dyear
+ ,sum((((ws_ext_list_price-ws_ext_wholesale_cost-ws_ext_discount_amt)+ws_ext_sales_price)/2) ) year_total
+ ,'w' sale_type
+ from customer
+ ,web_sales
+ ,date_dim
+ where c_customer_sk = ws_bill_customer_sk
+ and ws_sold_date_sk = d_date_sk
+ group by c_customer_id
+ ,c_first_name
+ ,c_last_name
+ ,c_preferred_cust_flag
+ ,c_birth_country
+ ,c_login
+ ,c_email_address
+ ,d_year
+ )
+ select
+ t_s_secyear.customer_id
+ ,t_s_secyear.customer_first_name
+ ,t_s_secyear.customer_last_name
+ ,t_s_secyear.customer_email_address
+ from year_total t_s_firstyear
+ ,year_total t_s_secyear
+ ,year_total t_c_firstyear
+ ,year_total t_c_secyear
+ ,year_total t_w_firstyear
+ ,year_total t_w_secyear
+ where t_s_secyear.customer_id = t_s_firstyear.customer_id
+ and t_s_firstyear.customer_id = t_c_secyear.customer_id
+ and t_s_firstyear.customer_id = t_c_firstyear.customer_id
+ and t_s_firstyear.customer_id = t_w_firstyear.customer_id
+ and t_s_firstyear.customer_id = t_w_secyear.customer_id
+ and t_s_firstyear.sale_type = 's'
+ and t_c_firstyear.sale_type = 'c'
+ and t_w_firstyear.sale_type = 'w'
+ and t_s_secyear.sale_type = 's'
+ and t_c_secyear.sale_type = 'c'
+ and t_w_secyear.sale_type = 'w'
+ and t_s_firstyear.dyear = 2001
+ and t_s_secyear.dyear = 2001+1
+ and t_c_firstyear.dyear = 2001
+ and t_c_secyear.dyear = 2001+1
+ and t_w_firstyear.dyear = 2001
+ and t_w_secyear.dyear = 2001+1
+ and t_s_firstyear.year_total > 0
+ and t_c_firstyear.year_total > 0
+ and t_w_firstyear.year_total > 0
+ and case when t_c_firstyear.year_total > 0 then t_c_secyear.year_total / t_c_firstyear.year_total else null end
+ > case when t_s_firstyear.year_total > 0 then t_s_secyear.year_total / t_s_firstyear.year_total else null end
+ and case when t_c_firstyear.year_total > 0 then t_c_secyear.year_total / t_c_firstyear.year_total else null end
+ > case when t_w_firstyear.year_total > 0 then t_w_secyear.year_total / t_w_firstyear.year_total else null end
+ order by t_s_secyear.customer_id
+ ,t_s_secyear.customer_first_name
+ ,t_s_secyear.customer_last_name
+ ,t_s_secyear.customer_email_address
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q40.sql b/benchmarks/queries/tpcds/q40.sql
new file mode 100644
index 0000000000..7f54a9bbdf
--- /dev/null
+++ b/benchmarks/queries/tpcds/q40.sql
@@ -0,0 +1,29 @@
+-- SQLBench-DS query 40 derived from TPC-DS query 40 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ w_state
+ ,i_item_id
+ ,sum(case when (cast(d_date as date) < cast ('2002-05-18' as date))
+ then cs_sales_price - coalesce(cr_refunded_cash,0) else 0 end) as sales_before
+ ,sum(case when (cast(d_date as date) >= cast ('2002-05-18' as date))
+ then cs_sales_price - coalesce(cr_refunded_cash,0) else 0 end) as sales_after
+ from
+ catalog_sales left outer join catalog_returns on
+ (cs_order_number = cr_order_number
+ and cs_item_sk = cr_item_sk)
+ ,warehouse
+ ,item
+ ,date_dim
+ where
+ i_current_price between 0.99 and 1.49
+ and i_item_sk = cs_item_sk
+ and cs_warehouse_sk = w_warehouse_sk
+ and cs_sold_date_sk = d_date_sk
+ and d_date between (cast ('2002-05-18' as date) - INTERVAL '30 DAYS')
+ and (cast ('2002-05-18' as date) + INTERVAL '30 DAYS')
+ group by
+ w_state,i_item_id
+ order by w_state,i_item_id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q41.sql b/benchmarks/queries/tpcds/q41.sql
new file mode 100644
index 0000000000..d561cdba50
--- /dev/null
+++ b/benchmarks/queries/tpcds/q41.sql
@@ -0,0 +1,53 @@
+-- SQLBench-DS query 41 derived from TPC-DS query 41 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select distinct(i_product_name)
+ from item i1
+ where i_manufact_id between 668 and 668+40
+ and (select count(*) as item_cnt
+ from item
+ where (i_manufact = i1.i_manufact and
+ ((i_category = 'Women' and
+ (i_color = 'cream' or i_color = 'ghost') and
+ (i_units = 'Ton' or i_units = 'Gross') and
+ (i_size = 'economy' or i_size = 'small')
+ ) or
+ (i_category = 'Women' and
+ (i_color = 'midnight' or i_color = 'burlywood') and
+ (i_units = 'Tsp' or i_units = 'Bundle') and
+ (i_size = 'medium' or i_size = 'extra large')
+ ) or
+ (i_category = 'Men' and
+ (i_color = 'lavender' or i_color = 'azure') and
+ (i_units = 'Each' or i_units = 'Lb') and
+ (i_size = 'large' or i_size = 'N/A')
+ ) or
+ (i_category = 'Men' and
+ (i_color = 'chocolate' or i_color = 'steel') and
+ (i_units = 'N/A' or i_units = 'Dozen') and
+ (i_size = 'economy' or i_size = 'small')
+ ))) or
+ (i_manufact = i1.i_manufact and
+ ((i_category = 'Women' and
+ (i_color = 'floral' or i_color = 'royal') and
+ (i_units = 'Unknown' or i_units = 'Tbl') and
+ (i_size = 'economy' or i_size = 'small')
+ ) or
+ (i_category = 'Women' and
+ (i_color = 'navy' or i_color = 'forest') and
+ (i_units = 'Bunch' or i_units = 'Dram') and
+ (i_size = 'medium' or i_size = 'extra large')
+ ) or
+ (i_category = 'Men' and
+ (i_color = 'cyan' or i_color = 'indian') and
+ (i_units = 'Carton' or i_units = 'Cup') and
+ (i_size = 'large' or i_size = 'N/A')
+ ) or
+ (i_category = 'Men' and
+ (i_color = 'coral' or i_color = 'pale') and
+ (i_units = 'Pallet' or i_units = 'Gram') and
+ (i_size = 'economy' or i_size = 'small')
+ )))) > 0
+ order by i_product_name
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q42.sql b/benchmarks/queries/tpcds/q42.sql
new file mode 100644
index 0000000000..ac91e7cc2b
--- /dev/null
+++ b/benchmarks/queries/tpcds/q42.sql
@@ -0,0 +1,23 @@
+-- SQLBench-DS query 42 derived from TPC-DS query 42 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select dt.d_year
+ ,item.i_category_id
+ ,item.i_category
+ ,sum(ss_ext_sales_price)
+ from date_dim dt
+ ,store_sales
+ ,item
+ where dt.d_date_sk = store_sales.ss_sold_date_sk
+ and store_sales.ss_item_sk = item.i_item_sk
+ and item.i_manager_id = 1
+ and dt.d_moy=11
+ and dt.d_year=1998
+ group by dt.d_year
+ ,item.i_category_id
+ ,item.i_category
+ order by sum(ss_ext_sales_price) desc,dt.d_year
+ ,item.i_category_id
+ ,item.i_category
+ LIMIT 100 ;
+
diff --git a/benchmarks/queries/tpcds/q43.sql b/benchmarks/queries/tpcds/q43.sql
new file mode 100644
index 0000000000..ca09e8e77d
--- /dev/null
+++ b/benchmarks/queries/tpcds/q43.sql
@@ -0,0 +1,20 @@
+-- SQLBench-DS query 43 derived from TPC-DS query 43 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select s_store_name, s_store_id,
+ sum(case when (d_day_name='Sunday') then ss_sales_price else null end) sun_sales,
+ sum(case when (d_day_name='Monday') then ss_sales_price else null end) mon_sales,
+ sum(case when (d_day_name='Tuesday') then ss_sales_price else null end) tue_sales,
+ sum(case when (d_day_name='Wednesday') then ss_sales_price else null end) wed_sales,
+ sum(case when (d_day_name='Thursday') then ss_sales_price else null end) thu_sales,
+ sum(case when (d_day_name='Friday') then ss_sales_price else null end) fri_sales,
+ sum(case when (d_day_name='Saturday') then ss_sales_price else null end) sat_sales
+ from date_dim, store_sales, store
+ where d_date_sk = ss_sold_date_sk and
+ s_store_sk = ss_store_sk and
+ s_gmt_offset = -5 and
+ d_year = 2000
+ group by s_store_name, s_store_id
+ order by s_store_name, s_store_id,sun_sales,mon_sales,tue_sales,wed_sales,thu_sales,fri_sales,sat_sales
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q44.sql b/benchmarks/queries/tpcds/q44.sql
new file mode 100644
index 0000000000..8c635cef49
--- /dev/null
+++ b/benchmarks/queries/tpcds/q44.sql
@@ -0,0 +1,36 @@
+-- SQLBench-DS query 44 derived from TPC-DS query 44 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select asceding.rnk, i1.i_product_name best_performing, i2.i_product_name worst_performing
+from(select *
+ from (select item_sk,rank() over (order by rank_col asc) rnk
+ from (select ss_item_sk item_sk,avg(ss_net_profit) rank_col
+ from store_sales ss1
+ where ss_store_sk = 6
+ group by ss_item_sk
+ having avg(ss_net_profit) > 0.9*(select avg(ss_net_profit) rank_col
+ from store_sales
+ where ss_store_sk = 6
+ and ss_hdemo_sk is null
+ group by ss_store_sk))V1)V11
+ where rnk < 11) asceding,
+ (select *
+ from (select item_sk,rank() over (order by rank_col desc) rnk
+ from (select ss_item_sk item_sk,avg(ss_net_profit) rank_col
+ from store_sales ss1
+ where ss_store_sk = 6
+ group by ss_item_sk
+ having avg(ss_net_profit) > 0.9*(select avg(ss_net_profit) rank_col
+ from store_sales
+ where ss_store_sk = 6
+ and ss_hdemo_sk is null
+ group by ss_store_sk))V2)V21
+ where rnk < 11) descending,
+item i1,
+item i2
+where asceding.rnk = descending.rnk
+ and i1.i_item_sk=asceding.item_sk
+ and i2.i_item_sk=descending.item_sk
+order by asceding.rnk
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q45.sql b/benchmarks/queries/tpcds/q45.sql
new file mode 100644
index 0000000000..682cc9b54d
--- /dev/null
+++ b/benchmarks/queries/tpcds/q45.sql
@@ -0,0 +1,21 @@
+-- SQLBench-DS query 45 derived from TPC-DS query 45 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select ca_zip, ca_city, sum(ws_sales_price)
+ from web_sales, customer, customer_address, date_dim, item
+ where ws_bill_customer_sk = c_customer_sk
+ and c_current_addr_sk = ca_address_sk
+ and ws_item_sk = i_item_sk
+ and ( substr(ca_zip,1,5) in ('85669', '86197','88274','83405','86475', '85392', '85460', '80348', '81792')
+ or
+ i_item_id in (select i_item_id
+ from item
+ where i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 23, 29)
+ )
+ )
+ and ws_sold_date_sk = d_date_sk
+ and d_qoy = 2 and d_year = 2000
+ group by ca_zip, ca_city
+ order by ca_zip, ca_city
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q46.sql b/benchmarks/queries/tpcds/q46.sql
new file mode 100644
index 0000000000..81ae1d5815
--- /dev/null
+++ b/benchmarks/queries/tpcds/q46.sql
@@ -0,0 +1,36 @@
+-- SQLBench-DS query 46 derived from TPC-DS query 46 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select c_last_name
+ ,c_first_name
+ ,ca_city
+ ,bought_city
+ ,ss_ticket_number
+ ,amt,profit
+ from
+ (select ss_ticket_number
+ ,ss_customer_sk
+ ,ca_city bought_city
+ ,sum(ss_coupon_amt) amt
+ ,sum(ss_net_profit) profit
+ from store_sales,date_dim,store,household_demographics,customer_address
+ where store_sales.ss_sold_date_sk = date_dim.d_date_sk
+ and store_sales.ss_store_sk = store.s_store_sk
+ and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
+ and store_sales.ss_addr_sk = customer_address.ca_address_sk
+ and (household_demographics.hd_dep_count = 3 or
+ household_demographics.hd_vehicle_count= 1)
+ and date_dim.d_dow in (6,0)
+ and date_dim.d_year in (1999,1999+1,1999+2)
+ and store.s_city in ('Midway','Fairview','Fairview','Midway','Fairview')
+ group by ss_ticket_number,ss_customer_sk,ss_addr_sk,ca_city) dn,customer,customer_address current_addr
+ where ss_customer_sk = c_customer_sk
+ and customer.c_current_addr_sk = current_addr.ca_address_sk
+ and current_addr.ca_city <> bought_city
+ order by c_last_name
+ ,c_first_name
+ ,ca_city
+ ,bought_city
+ ,ss_ticket_number
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q47.sql b/benchmarks/queries/tpcds/q47.sql
new file mode 100644
index 0000000000..f741fe44cd
--- /dev/null
+++ b/benchmarks/queries/tpcds/q47.sql
@@ -0,0 +1,52 @@
+-- SQLBench-DS query 47 derived from TPC-DS query 47 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with v1 as(
+ select i_category, i_brand,
+ s_store_name, s_company_name,
+ d_year, d_moy,
+ sum(ss_sales_price) sum_sales,
+ avg(sum(ss_sales_price)) over
+ (partition by i_category, i_brand,
+ s_store_name, s_company_name, d_year)
+ avg_monthly_sales,
+ rank() over
+ (partition by i_category, i_brand,
+ s_store_name, s_company_name
+ order by d_year, d_moy) rn
+ from item, store_sales, date_dim, store
+ where ss_item_sk = i_item_sk and
+ ss_sold_date_sk = d_date_sk and
+ ss_store_sk = s_store_sk and
+ (
+ d_year = 2001 or
+ ( d_year = 2001-1 and d_moy =12) or
+ ( d_year = 2001+1 and d_moy =1)
+ )
+ group by i_category, i_brand,
+ s_store_name, s_company_name,
+ d_year, d_moy),
+ v2 as(
+ select v1.i_category, v1.i_brand, v1.s_store_name, v1.s_company_name
+ ,v1.d_year
+ ,v1.avg_monthly_sales
+ ,v1.sum_sales, v1_lag.sum_sales psum, v1_lead.sum_sales nsum
+ from v1, v1 v1_lag, v1 v1_lead
+ where v1.i_category = v1_lag.i_category and
+ v1.i_category = v1_lead.i_category and
+ v1.i_brand = v1_lag.i_brand and
+ v1.i_brand = v1_lead.i_brand and
+ v1.s_store_name = v1_lag.s_store_name and
+ v1.s_store_name = v1_lead.s_store_name and
+ v1.s_company_name = v1_lag.s_company_name and
+ v1.s_company_name = v1_lead.s_company_name and
+ v1.rn = v1_lag.rn + 1 and
+ v1.rn = v1_lead.rn - 1)
+ select *
+ from v2
+ where d_year = 2001 and
+ avg_monthly_sales > 0 and
+ case when avg_monthly_sales > 0 then abs(sum_sales - avg_monthly_sales) / avg_monthly_sales else null end > 0.1
+ order by sum_sales - avg_monthly_sales, nsum
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q48.sql b/benchmarks/queries/tpcds/q48.sql
new file mode 100644
index 0000000000..fb83279b13
--- /dev/null
+++ b/benchmarks/queries/tpcds/q48.sql
@@ -0,0 +1,68 @@
+-- SQLBench-DS query 48 derived from TPC-DS query 48 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select sum (ss_quantity)
+ from store_sales, store, customer_demographics, customer_address, date_dim
+ where s_store_sk = ss_store_sk
+ and ss_sold_date_sk = d_date_sk and d_year = 2001
+ and
+ (
+ (
+ cd_demo_sk = ss_cdemo_sk
+ and
+ cd_marital_status = 'W'
+ and
+ cd_education_status = '2 yr Degree'
+ and
+ ss_sales_price between 100.00 and 150.00
+ )
+ or
+ (
+ cd_demo_sk = ss_cdemo_sk
+ and
+ cd_marital_status = 'S'
+ and
+ cd_education_status = 'Advanced Degree'
+ and
+ ss_sales_price between 50.00 and 100.00
+ )
+ or
+ (
+ cd_demo_sk = ss_cdemo_sk
+ and
+ cd_marital_status = 'D'
+ and
+ cd_education_status = 'Primary'
+ and
+ ss_sales_price between 150.00 and 200.00
+ )
+ )
+ and
+ (
+ (
+ ss_addr_sk = ca_address_sk
+ and
+ ca_country = 'United States'
+ and
+ ca_state in ('IL', 'KY', 'OR')
+ and ss_net_profit between 0 and 2000
+ )
+ or
+ (ss_addr_sk = ca_address_sk
+ and
+ ca_country = 'United States'
+ and
+ ca_state in ('VA', 'FL', 'AL')
+ and ss_net_profit between 150 and 3000
+ )
+ or
+ (ss_addr_sk = ca_address_sk
+ and
+ ca_country = 'United States'
+ and
+ ca_state in ('OK', 'IA', 'TX')
+ and ss_net_profit between 50 and 25000
+ )
+ )
+;
+
diff --git a/benchmarks/queries/tpcds/q49.sql b/benchmarks/queries/tpcds/q49.sql
new file mode 100644
index 0000000000..c97286528b
--- /dev/null
+++ b/benchmarks/queries/tpcds/q49.sql
@@ -0,0 +1,130 @@
+-- SQLBench-DS query 49 derived from TPC-DS query 49 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select channel, item, return_ratio, return_rank, currency_rank from
+ (select
+ 'web' as channel
+ ,web.item
+ ,web.return_ratio
+ ,web.return_rank
+ ,web.currency_rank
+ from (
+ select
+ item
+ ,return_ratio
+ ,currency_ratio
+ ,rank() over (order by return_ratio) as return_rank
+ ,rank() over (order by currency_ratio) as currency_rank
+ from
+ ( select ws.ws_item_sk as item
+ ,(cast(sum(coalesce(wr.wr_return_quantity,0)) as decimal(15,4))/
+ cast(sum(coalesce(ws.ws_quantity,0)) as decimal(15,4) )) as return_ratio
+ ,(cast(sum(coalesce(wr.wr_return_amt,0)) as decimal(15,4))/
+ cast(sum(coalesce(ws.ws_net_paid,0)) as decimal(15,4) )) as currency_ratio
+ from
+ web_sales ws left outer join web_returns wr
+ on (ws.ws_order_number = wr.wr_order_number and
+ ws.ws_item_sk = wr.wr_item_sk)
+ ,date_dim
+ where
+ wr.wr_return_amt > 10000
+ and ws.ws_net_profit > 1
+ and ws.ws_net_paid > 0
+ and ws.ws_quantity > 0
+ and ws_sold_date_sk = d_date_sk
+ and d_year = 2000
+ and d_moy = 12
+ group by ws.ws_item_sk
+ ) in_web
+ ) web
+ where
+ (
+ web.return_rank <= 10
+ or
+ web.currency_rank <= 10
+ )
+ union
+ select
+ 'catalog' as channel
+ ,catalog.item
+ ,catalog.return_ratio
+ ,catalog.return_rank
+ ,catalog.currency_rank
+ from (
+ select
+ item
+ ,return_ratio
+ ,currency_ratio
+ ,rank() over (order by return_ratio) as return_rank
+ ,rank() over (order by currency_ratio) as currency_rank
+ from
+ ( select
+ cs.cs_item_sk as item
+ ,(cast(sum(coalesce(cr.cr_return_quantity,0)) as decimal(15,4))/
+ cast(sum(coalesce(cs.cs_quantity,0)) as decimal(15,4) )) as return_ratio
+ ,(cast(sum(coalesce(cr.cr_return_amount,0)) as decimal(15,4))/
+ cast(sum(coalesce(cs.cs_net_paid,0)) as decimal(15,4) )) as currency_ratio
+ from
+ catalog_sales cs left outer join catalog_returns cr
+ on (cs.cs_order_number = cr.cr_order_number and
+ cs.cs_item_sk = cr.cr_item_sk)
+ ,date_dim
+ where
+ cr.cr_return_amount > 10000
+ and cs.cs_net_profit > 1
+ and cs.cs_net_paid > 0
+ and cs.cs_quantity > 0
+ and cs_sold_date_sk = d_date_sk
+ and d_year = 2000
+ and d_moy = 12
+ group by cs.cs_item_sk
+ ) in_cat
+ ) catalog
+ where
+ (
+ catalog.return_rank <= 10
+ or
+ catalog.currency_rank <=10
+ )
+ union
+ select
+ 'store' as channel
+ ,store.item
+ ,store.return_ratio
+ ,store.return_rank
+ ,store.currency_rank
+ from (
+ select
+ item
+ ,return_ratio
+ ,currency_ratio
+ ,rank() over (order by return_ratio) as return_rank
+ ,rank() over (order by currency_ratio) as currency_rank
+ from
+ ( select sts.ss_item_sk as item
+ ,(cast(sum(coalesce(sr.sr_return_quantity,0)) as decimal(15,4))/cast(sum(coalesce(sts.ss_quantity,0)) as decimal(15,4) )) as return_ratio
+ ,(cast(sum(coalesce(sr.sr_return_amt,0)) as decimal(15,4))/cast(sum(coalesce(sts.ss_net_paid,0)) as decimal(15,4) )) as currency_ratio
+ from
+ store_sales sts left outer join store_returns sr
+ on (sts.ss_ticket_number = sr.sr_ticket_number and sts.ss_item_sk = sr.sr_item_sk)
+ ,date_dim
+ where
+ sr.sr_return_amt > 10000
+ and sts.ss_net_profit > 1
+ and sts.ss_net_paid > 0
+ and sts.ss_quantity > 0
+ and ss_sold_date_sk = d_date_sk
+ and d_year = 2000
+ and d_moy = 12
+ group by sts.ss_item_sk
+ ) in_store
+ ) store
+ where (
+ store.return_rank <= 10
+ or
+ store.currency_rank <= 10
+ )
+ )
+ order by 1,4,5,2
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q5.sql b/benchmarks/queries/tpcds/q5.sql
new file mode 100644
index 0000000000..4f2721634c
--- /dev/null
+++ b/benchmarks/queries/tpcds/q5.sql
@@ -0,0 +1,129 @@
+-- SQLBench-DS query 5 derived from TPC-DS query 5 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with ssr as
+ (select s_store_id,
+ sum(sales_price) as sales,
+ sum(profit) as profit,
+ sum(return_amt) as returns,
+ sum(net_loss) as profit_loss
+ from
+ ( select ss_store_sk as store_sk,
+ ss_sold_date_sk as date_sk,
+ ss_ext_sales_price as sales_price,
+ ss_net_profit as profit,
+ cast(0 as decimal(7,2)) as return_amt,
+ cast(0 as decimal(7,2)) as net_loss
+ from store_sales
+ union all
+ select sr_store_sk as store_sk,
+ sr_returned_date_sk as date_sk,
+ cast(0 as decimal(7,2)) as sales_price,
+ cast(0 as decimal(7,2)) as profit,
+ sr_return_amt as return_amt,
+ sr_net_loss as net_loss
+ from store_returns
+ ) salesreturns,
+ date_dim,
+ store
+ where date_sk = d_date_sk
+ and d_date between cast('2001-08-04' as date)
+ and (cast('2001-08-04' as date) + INTERVAL '14 DAYS')
+ and store_sk = s_store_sk
+ group by s_store_id)
+ ,
+ csr as
+ (select cp_catalog_page_id,
+ sum(sales_price) as sales,
+ sum(profit) as profit,
+ sum(return_amt) as returns,
+ sum(net_loss) as profit_loss
+ from
+ ( select cs_catalog_page_sk as page_sk,
+ cs_sold_date_sk as date_sk,
+ cs_ext_sales_price as sales_price,
+ cs_net_profit as profit,
+ cast(0 as decimal(7,2)) as return_amt,
+ cast(0 as decimal(7,2)) as net_loss
+ from catalog_sales
+ union all
+ select cr_catalog_page_sk as page_sk,
+ cr_returned_date_sk as date_sk,
+ cast(0 as decimal(7,2)) as sales_price,
+ cast(0 as decimal(7,2)) as profit,
+ cr_return_amount as return_amt,
+ cr_net_loss as net_loss
+ from catalog_returns
+ ) salesreturns,
+ date_dim,
+ catalog_page
+ where date_sk = d_date_sk
+ and d_date between cast('2001-08-04' as date)
+ and (cast('2001-08-04' as date) + INTERVAL '14 DAYS')
+ and page_sk = cp_catalog_page_sk
+ group by cp_catalog_page_id)
+ ,
+ wsr as
+ (select web_site_id,
+ sum(sales_price) as sales,
+ sum(profit) as profit,
+ sum(return_amt) as returns,
+ sum(net_loss) as profit_loss
+ from
+ ( select ws_web_site_sk as wsr_web_site_sk,
+ ws_sold_date_sk as date_sk,
+ ws_ext_sales_price as sales_price,
+ ws_net_profit as profit,
+ cast(0 as decimal(7,2)) as return_amt,
+ cast(0 as decimal(7,2)) as net_loss
+ from web_sales
+ union all
+ select ws_web_site_sk as wsr_web_site_sk,
+ wr_returned_date_sk as date_sk,
+ cast(0 as decimal(7,2)) as sales_price,
+ cast(0 as decimal(7,2)) as profit,
+ wr_return_amt as return_amt,
+ wr_net_loss as net_loss
+ from web_returns left outer join web_sales on
+ ( wr_item_sk = ws_item_sk
+ and wr_order_number = ws_order_number)
+ ) salesreturns,
+ date_dim,
+ web_site
+ where date_sk = d_date_sk
+ and d_date between cast('2001-08-04' as date)
+ and (cast('2001-08-04' as date) + INTERVAL '14 DAYS')
+ and wsr_web_site_sk = web_site_sk
+ group by web_site_id)
+ select channel
+ , id
+ , sum(sales) as sales
+ , sum(returns) as returns
+ , sum(profit) as profit
+ from
+ (select 'store channel' as channel
+ , 'store' || s_store_id as id
+ , sales
+ , returns
+ , (profit - profit_loss) as profit
+ from ssr
+ union all
+ select 'catalog channel' as channel
+ , 'catalog_page' || cp_catalog_page_id as id
+ , sales
+ , returns
+ , (profit - profit_loss) as profit
+ from csr
+ union all
+ select 'web channel' as channel
+ , 'web_site' || web_site_id as id
+ , sales
+ , returns
+ , (profit - profit_loss) as profit
+ from wsr
+ ) x
+ group by rollup (channel, id)
+ order by channel
+ ,id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q50.sql b/benchmarks/queries/tpcds/q50.sql
new file mode 100644
index 0000000000..d3dd26a156
--- /dev/null
+++ b/benchmarks/queries/tpcds/q50.sql
@@ -0,0 +1,60 @@
+-- SQLBench-DS query 50 derived from TPC-DS query 50 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ s_store_name
+ ,s_company_id
+ ,s_street_number
+ ,s_street_name
+ ,s_street_type
+ ,s_suite_number
+ ,s_city
+ ,s_county
+ ,s_state
+ ,s_zip
+ ,sum(case when (sr_returned_date_sk - ss_sold_date_sk <= 30 ) then 1 else 0 end) as `30 days`
+ ,sum(case when (sr_returned_date_sk - ss_sold_date_sk > 30) and
+ (sr_returned_date_sk - ss_sold_date_sk <= 60) then 1 else 0 end ) as `31-60 days`
+ ,sum(case when (sr_returned_date_sk - ss_sold_date_sk > 60) and
+ (sr_returned_date_sk - ss_sold_date_sk <= 90) then 1 else 0 end) as `61-90 days`
+ ,sum(case when (sr_returned_date_sk - ss_sold_date_sk > 90) and
+ (sr_returned_date_sk - ss_sold_date_sk <= 120) then 1 else 0 end) as `91-120 days`
+ ,sum(case when (sr_returned_date_sk - ss_sold_date_sk > 120) then 1 else 0 end) as `>120 days`
+from
+ store_sales
+ ,store_returns
+ ,store
+ ,date_dim d1
+ ,date_dim d2
+where
+ d2.d_year = 2002
+and d2.d_moy = 8
+and ss_ticket_number = sr_ticket_number
+and ss_item_sk = sr_item_sk
+and ss_sold_date_sk = d1.d_date_sk
+and sr_returned_date_sk = d2.d_date_sk
+and ss_customer_sk = sr_customer_sk
+and ss_store_sk = s_store_sk
+group by
+ s_store_name
+ ,s_company_id
+ ,s_street_number
+ ,s_street_name
+ ,s_street_type
+ ,s_suite_number
+ ,s_city
+ ,s_county
+ ,s_state
+ ,s_zip
+order by s_store_name
+ ,s_company_id
+ ,s_street_number
+ ,s_street_name
+ ,s_street_type
+ ,s_suite_number
+ ,s_city
+ ,s_county
+ ,s_state
+ ,s_zip
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q51.sql b/benchmarks/queries/tpcds/q51.sql
new file mode 100644
index 0000000000..5aeb3087b4
--- /dev/null
+++ b/benchmarks/queries/tpcds/q51.sql
@@ -0,0 +1,46 @@
+-- SQLBench-DS query 51 derived from TPC-DS query 51 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+WITH web_v1 as (
+select
+ ws_item_sk item_sk, d_date,
+ sum(sum(ws_sales_price))
+ over (partition by ws_item_sk order by d_date rows between unbounded preceding and current row) cume_sales
+from web_sales
+ ,date_dim
+where ws_sold_date_sk=d_date_sk
+ and d_month_seq between 1215 and 1215+11
+ and ws_item_sk is not NULL
+group by ws_item_sk, d_date),
+store_v1 as (
+select
+ ss_item_sk item_sk, d_date,
+ sum(sum(ss_sales_price))
+ over (partition by ss_item_sk order by d_date rows between unbounded preceding and current row) cume_sales
+from store_sales
+ ,date_dim
+where ss_sold_date_sk=d_date_sk
+ and d_month_seq between 1215 and 1215+11
+ and ss_item_sk is not NULL
+group by ss_item_sk, d_date)
+ select *
+from (select item_sk
+ ,d_date
+ ,web_sales
+ ,store_sales
+ ,max(web_sales)
+ over (partition by item_sk order by d_date rows between unbounded preceding and current row) web_cumulative
+ ,max(store_sales)
+ over (partition by item_sk order by d_date rows between unbounded preceding and current row) store_cumulative
+ from (select case when web.item_sk is not null then web.item_sk else store.item_sk end item_sk
+ ,case when web.d_date is not null then web.d_date else store.d_date end d_date
+ ,web.cume_sales web_sales
+ ,store.cume_sales store_sales
+ from web_v1 web full outer join store_v1 store on (web.item_sk = store.item_sk
+ and web.d_date = store.d_date)
+ )x )y
+where web_cumulative > store_cumulative
+order by item_sk
+ ,d_date
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q52.sql b/benchmarks/queries/tpcds/q52.sql
new file mode 100644
index 0000000000..b4d032baec
--- /dev/null
+++ b/benchmarks/queries/tpcds/q52.sql
@@ -0,0 +1,23 @@
+-- SQLBench-DS query 52 derived from TPC-DS query 52 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select dt.d_year
+ ,item.i_brand_id brand_id
+ ,item.i_brand brand
+ ,sum(ss_ext_sales_price) ext_price
+ from date_dim dt
+ ,store_sales
+ ,item
+ where dt.d_date_sk = store_sales.ss_sold_date_sk
+ and store_sales.ss_item_sk = item.i_item_sk
+ and item.i_manager_id = 1
+ and dt.d_moy=11
+ and dt.d_year=2000
+ group by dt.d_year
+ ,item.i_brand
+ ,item.i_brand_id
+ order by dt.d_year
+ ,ext_price desc
+ ,brand_id
+ LIMIT 100 ;
+
diff --git a/benchmarks/queries/tpcds/q53.sql b/benchmarks/queries/tpcds/q53.sql
new file mode 100644
index 0000000000..4c87797741
--- /dev/null
+++ b/benchmarks/queries/tpcds/q53.sql
@@ -0,0 +1,29 @@
+-- SQLBench-DS query 53 derived from TPC-DS query 53 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select * from
+(select i_manufact_id,
+sum(ss_sales_price) sum_sales,
+avg(sum(ss_sales_price)) over (partition by i_manufact_id) avg_quarterly_sales
+from item, store_sales, date_dim, store
+where ss_item_sk = i_item_sk and
+ss_sold_date_sk = d_date_sk and
+ss_store_sk = s_store_sk and
+d_month_seq in (1197,1197+1,1197+2,1197+3,1197+4,1197+5,1197+6,1197+7,1197+8,1197+9,1197+10,1197+11) and
+((i_category in ('Books','Children','Electronics') and
+i_class in ('personal','portable','reference','self-help') and
+i_brand in ('scholaramalgamalg #14','scholaramalgamalg #7',
+ 'exportiunivamalg #9','scholaramalgamalg #9'))
+or(i_category in ('Women','Music','Men') and
+i_class in ('accessories','classical','fragrances','pants') and
+i_brand in ('amalgimporto #1','edu packscholar #1','exportiimporto #1',
+ 'importoamalg #1')))
+group by i_manufact_id, d_qoy ) tmp1
+where case when avg_quarterly_sales > 0
+ then abs (sum_sales - avg_quarterly_sales)/ avg_quarterly_sales
+ else null end > 0.1
+order by avg_quarterly_sales,
+ sum_sales,
+ i_manufact_id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q54.sql b/benchmarks/queries/tpcds/q54.sql
new file mode 100644
index 0000000000..4b382e1abe
--- /dev/null
+++ b/benchmarks/queries/tpcds/q54.sql
@@ -0,0 +1,57 @@
+-- SQLBench-DS query 54 derived from TPC-DS query 54 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with my_customers as (
+ select distinct c_customer_sk
+ , c_current_addr_sk
+ from
+ ( select cs_sold_date_sk sold_date_sk,
+ cs_bill_customer_sk customer_sk,
+ cs_item_sk item_sk
+ from catalog_sales
+ union all
+ select ws_sold_date_sk sold_date_sk,
+ ws_bill_customer_sk customer_sk,
+ ws_item_sk item_sk
+ from web_sales
+ ) cs_or_ws_sales,
+ item,
+ date_dim,
+ customer
+ where sold_date_sk = d_date_sk
+ and item_sk = i_item_sk
+ and i_category = 'Men'
+ and i_class = 'shirts'
+ and c_customer_sk = cs_or_ws_sales.customer_sk
+ and d_moy = 4
+ and d_year = 1998
+ )
+ , my_revenue as (
+ select c_customer_sk,
+ sum(ss_ext_sales_price) as revenue
+ from my_customers,
+ store_sales,
+ customer_address,
+ store,
+ date_dim
+ where c_current_addr_sk = ca_address_sk
+ and ca_county = s_county
+ and ca_state = s_state
+ and ss_sold_date_sk = d_date_sk
+ and c_customer_sk = ss_customer_sk
+ and d_month_seq between (select distinct d_month_seq+1
+ from date_dim where d_year = 1998 and d_moy = 4)
+ and (select distinct d_month_seq+3
+ from date_dim where d_year = 1998 and d_moy = 4)
+ group by c_customer_sk
+ )
+ , segments as
+ (select cast((revenue/50) as int) as segment
+ from my_revenue
+ )
+ select segment, count(*) as num_customers, segment*50 as segment_base
+ from segments
+ group by segment
+ order by segment, num_customers
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q55.sql b/benchmarks/queries/tpcds/q55.sql
new file mode 100644
index 0000000000..5dabcab05f
--- /dev/null
+++ b/benchmarks/queries/tpcds/q55.sql
@@ -0,0 +1,15 @@
+-- SQLBench-DS query 55 derived from TPC-DS query 55 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_brand_id brand_id, i_brand brand,
+ sum(ss_ext_sales_price) ext_price
+ from date_dim, store_sales, item
+ where d_date_sk = ss_sold_date_sk
+ and ss_item_sk = i_item_sk
+ and i_manager_id=20
+ and d_moy=12
+ and d_year=1998
+ group by i_brand, i_brand_id
+ order by ext_price desc, i_brand_id
+ LIMIT 100 ;
+
diff --git a/benchmarks/queries/tpcds/q56.sql b/benchmarks/queries/tpcds/q56.sql
new file mode 100644
index 0000000000..d877d0b8b9
--- /dev/null
+++ b/benchmarks/queries/tpcds/q56.sql
@@ -0,0 +1,70 @@
+-- SQLBench-DS query 56 derived from TPC-DS query 56 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with ss as (
+ select i_item_id,sum(ss_ext_sales_price) total_sales
+ from
+ store_sales,
+ date_dim,
+ customer_address,
+ item
+ where i_item_id in (select
+ i_item_id
+from item
+where i_color in ('powder','goldenrod','bisque'))
+ and ss_item_sk = i_item_sk
+ and ss_sold_date_sk = d_date_sk
+ and d_year = 1998
+ and d_moy = 5
+ and ss_addr_sk = ca_address_sk
+ and ca_gmt_offset = -5
+ group by i_item_id),
+ cs as (
+ select i_item_id,sum(cs_ext_sales_price) total_sales
+ from
+ catalog_sales,
+ date_dim,
+ customer_address,
+ item
+ where
+ i_item_id in (select
+ i_item_id
+from item
+where i_color in ('powder','goldenrod','bisque'))
+ and cs_item_sk = i_item_sk
+ and cs_sold_date_sk = d_date_sk
+ and d_year = 1998
+ and d_moy = 5
+ and cs_bill_addr_sk = ca_address_sk
+ and ca_gmt_offset = -5
+ group by i_item_id),
+ ws as (
+ select i_item_id,sum(ws_ext_sales_price) total_sales
+ from
+ web_sales,
+ date_dim,
+ customer_address,
+ item
+ where
+ i_item_id in (select
+ i_item_id
+from item
+where i_color in ('powder','goldenrod','bisque'))
+ and ws_item_sk = i_item_sk
+ and ws_sold_date_sk = d_date_sk
+ and d_year = 1998
+ and d_moy = 5
+ and ws_bill_addr_sk = ca_address_sk
+ and ca_gmt_offset = -5
+ group by i_item_id)
+ select i_item_id ,sum(total_sales) total_sales
+ from (select * from ss
+ union all
+ select * from cs
+ union all
+ select * from ws) tmp1
+ group by i_item_id
+ order by total_sales,
+ i_item_id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q57.sql b/benchmarks/queries/tpcds/q57.sql
new file mode 100644
index 0000000000..088ddc9eeb
--- /dev/null
+++ b/benchmarks/queries/tpcds/q57.sql
@@ -0,0 +1,49 @@
+-- SQLBench-DS query 57 derived from TPC-DS query 57 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with v1 as(
+ select i_category, i_brand,
+ cc_name,
+ d_year, d_moy,
+ sum(cs_sales_price) sum_sales,
+ avg(sum(cs_sales_price)) over
+ (partition by i_category, i_brand,
+ cc_name, d_year)
+ avg_monthly_sales,
+ rank() over
+ (partition by i_category, i_brand,
+ cc_name
+ order by d_year, d_moy) rn
+ from item, catalog_sales, date_dim, call_center
+ where cs_item_sk = i_item_sk and
+ cs_sold_date_sk = d_date_sk and
+ cc_call_center_sk= cs_call_center_sk and
+ (
+ d_year = 2000 or
+ ( d_year = 2000-1 and d_moy =12) or
+ ( d_year = 2000+1 and d_moy =1)
+ )
+ group by i_category, i_brand,
+ cc_name , d_year, d_moy),
+ v2 as(
+ select v1.cc_name
+ ,v1.d_year, v1.d_moy
+ ,v1.avg_monthly_sales
+ ,v1.sum_sales, v1_lag.sum_sales psum, v1_lead.sum_sales nsum
+ from v1, v1 v1_lag, v1 v1_lead
+ where v1.i_category = v1_lag.i_category and
+ v1.i_category = v1_lead.i_category and
+ v1.i_brand = v1_lag.i_brand and
+ v1.i_brand = v1_lead.i_brand and
+ v1. cc_name = v1_lag. cc_name and
+ v1. cc_name = v1_lead. cc_name and
+ v1.rn = v1_lag.rn + 1 and
+ v1.rn = v1_lead.rn - 1)
+ select *
+ from v2
+ where d_year = 2000 and
+ avg_monthly_sales > 0 and
+ case when avg_monthly_sales > 0 then abs(sum_sales - avg_monthly_sales) / avg_monthly_sales else null end > 0.1
+ order by sum_sales - avg_monthly_sales, psum
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q58.sql b/benchmarks/queries/tpcds/q58.sql
new file mode 100644
index 0000000000..05801ea4b3
--- /dev/null
+++ b/benchmarks/queries/tpcds/q58.sql
@@ -0,0 +1,66 @@
+-- SQLBench-DS query 58 derived from TPC-DS query 58 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with ss_items as
+ (select i_item_id item_id
+ ,sum(ss_ext_sales_price) ss_item_rev
+ from store_sales
+ ,item
+ ,date_dim
+ where ss_item_sk = i_item_sk
+ and d_date in (select d_date
+ from date_dim
+ where d_week_seq = (select d_week_seq
+ from date_dim
+ where d_date = '2000-02-12'))
+ and ss_sold_date_sk = d_date_sk
+ group by i_item_id),
+ cs_items as
+ (select i_item_id item_id
+ ,sum(cs_ext_sales_price) cs_item_rev
+ from catalog_sales
+ ,item
+ ,date_dim
+ where cs_item_sk = i_item_sk
+ and d_date in (select d_date
+ from date_dim
+ where d_week_seq = (select d_week_seq
+ from date_dim
+ where d_date = '2000-02-12'))
+ and cs_sold_date_sk = d_date_sk
+ group by i_item_id),
+ ws_items as
+ (select i_item_id item_id
+ ,sum(ws_ext_sales_price) ws_item_rev
+ from web_sales
+ ,item
+ ,date_dim
+ where ws_item_sk = i_item_sk
+ and d_date in (select d_date
+ from date_dim
+ where d_week_seq =(select d_week_seq
+ from date_dim
+ where d_date = '2000-02-12'))
+ and ws_sold_date_sk = d_date_sk
+ group by i_item_id)
+ select ss_items.item_id
+ ,ss_item_rev
+ ,ss_item_rev/((ss_item_rev+cs_item_rev+ws_item_rev)/3) * 100 ss_dev
+ ,cs_item_rev
+ ,cs_item_rev/((ss_item_rev+cs_item_rev+ws_item_rev)/3) * 100 cs_dev
+ ,ws_item_rev
+ ,ws_item_rev/((ss_item_rev+cs_item_rev+ws_item_rev)/3) * 100 ws_dev
+ ,(ss_item_rev+cs_item_rev+ws_item_rev)/3 average
+ from ss_items,cs_items,ws_items
+ where ss_items.item_id=cs_items.item_id
+ and ss_items.item_id=ws_items.item_id
+ and ss_item_rev between 0.9 * cs_item_rev and 1.1 * cs_item_rev
+ and ss_item_rev between 0.9 * ws_item_rev and 1.1 * ws_item_rev
+ and cs_item_rev between 0.9 * ss_item_rev and 1.1 * ss_item_rev
+ and cs_item_rev between 0.9 * ws_item_rev and 1.1 * ws_item_rev
+ and ws_item_rev between 0.9 * ss_item_rev and 1.1 * ss_item_rev
+ and ws_item_rev between 0.9 * cs_item_rev and 1.1 * cs_item_rev
+ order by item_id
+ ,ss_item_rev
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q59.sql b/benchmarks/queries/tpcds/q59.sql
new file mode 100644
index 0000000000..e10c0dbf61
--- /dev/null
+++ b/benchmarks/queries/tpcds/q59.sql
@@ -0,0 +1,45 @@
+-- SQLBench-DS query 59 derived from TPC-DS query 59 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with wss as
+ (select d_week_seq,
+ ss_store_sk,
+ sum(case when (d_day_name='Sunday') then ss_sales_price else null end) sun_sales,
+ sum(case when (d_day_name='Monday') then ss_sales_price else null end) mon_sales,
+ sum(case when (d_day_name='Tuesday') then ss_sales_price else null end) tue_sales,
+ sum(case when (d_day_name='Wednesday') then ss_sales_price else null end) wed_sales,
+ sum(case when (d_day_name='Thursday') then ss_sales_price else null end) thu_sales,
+ sum(case when (d_day_name='Friday') then ss_sales_price else null end) fri_sales,
+ sum(case when (d_day_name='Saturday') then ss_sales_price else null end) sat_sales
+ from store_sales,date_dim
+ where d_date_sk = ss_sold_date_sk
+ group by d_week_seq,ss_store_sk
+ )
+ select s_store_name1,s_store_id1,d_week_seq1
+ ,sun_sales1/sun_sales2,mon_sales1/mon_sales2
+ ,tue_sales1/tue_sales2,wed_sales1/wed_sales2,thu_sales1/thu_sales2
+ ,fri_sales1/fri_sales2,sat_sales1/sat_sales2
+ from
+ (select s_store_name s_store_name1,wss.d_week_seq d_week_seq1
+ ,s_store_id s_store_id1,sun_sales sun_sales1
+ ,mon_sales mon_sales1,tue_sales tue_sales1
+ ,wed_sales wed_sales1,thu_sales thu_sales1
+ ,fri_sales fri_sales1,sat_sales sat_sales1
+ from wss,store,date_dim d
+ where d.d_week_seq = wss.d_week_seq and
+ ss_store_sk = s_store_sk and
+ d_month_seq between 1206 and 1206 + 11) y,
+ (select s_store_name s_store_name2,wss.d_week_seq d_week_seq2
+ ,s_store_id s_store_id2,sun_sales sun_sales2
+ ,mon_sales mon_sales2,tue_sales tue_sales2
+ ,wed_sales wed_sales2,thu_sales thu_sales2
+ ,fri_sales fri_sales2,sat_sales sat_sales2
+ from wss,store,date_dim d
+ where d.d_week_seq = wss.d_week_seq and
+ ss_store_sk = s_store_sk and
+ d_month_seq between 1206+ 12 and 1206 + 23) x
+ where s_store_id1=s_store_id2
+ and d_week_seq1=d_week_seq2-52
+ order by s_store_name1,s_store_id1,d_week_seq1
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q6.sql b/benchmarks/queries/tpcds/q6.sql
new file mode 100644
index 0000000000..098db850c8
--- /dev/null
+++ b/benchmarks/queries/tpcds/q6.sql
@@ -0,0 +1,27 @@
+-- SQLBench-DS query 6 derived from TPC-DS query 6 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select a.ca_state state, count(*) cnt
+ from customer_address a
+ ,customer c
+ ,store_sales s
+ ,date_dim d
+ ,item i
+ where a.ca_address_sk = c.c_current_addr_sk
+ and c.c_customer_sk = s.ss_customer_sk
+ and s.ss_sold_date_sk = d.d_date_sk
+ and s.ss_item_sk = i.i_item_sk
+ and d.d_month_seq =
+ (select distinct (d_month_seq)
+ from date_dim
+ where d_year = 1998
+ and d_moy = 3 )
+ and i.i_current_price > 1.2 *
+ (select avg(j.i_current_price)
+ from item j
+ where j.i_category = i.i_category)
+ group by a.ca_state
+ having count(*) >= 10
+ order by cnt, a.ca_state
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q60.sql b/benchmarks/queries/tpcds/q60.sql
new file mode 100644
index 0000000000..1e088c1605
--- /dev/null
+++ b/benchmarks/queries/tpcds/q60.sql
@@ -0,0 +1,79 @@
+-- SQLBench-DS query 60 derived from TPC-DS query 60 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with ss as (
+ select
+ i_item_id,sum(ss_ext_sales_price) total_sales
+ from
+ store_sales,
+ date_dim,
+ customer_address,
+ item
+ where
+ i_item_id in (select
+ i_item_id
+from
+ item
+where i_category in ('Shoes'))
+ and ss_item_sk = i_item_sk
+ and ss_sold_date_sk = d_date_sk
+ and d_year = 2001
+ and d_moy = 10
+ and ss_addr_sk = ca_address_sk
+ and ca_gmt_offset = -6
+ group by i_item_id),
+ cs as (
+ select
+ i_item_id,sum(cs_ext_sales_price) total_sales
+ from
+ catalog_sales,
+ date_dim,
+ customer_address,
+ item
+ where
+ i_item_id in (select
+ i_item_id
+from
+ item
+where i_category in ('Shoes'))
+ and cs_item_sk = i_item_sk
+ and cs_sold_date_sk = d_date_sk
+ and d_year = 2001
+ and d_moy = 10
+ and cs_bill_addr_sk = ca_address_sk
+ and ca_gmt_offset = -6
+ group by i_item_id),
+ ws as (
+ select
+ i_item_id,sum(ws_ext_sales_price) total_sales
+ from
+ web_sales,
+ date_dim,
+ customer_address,
+ item
+ where
+ i_item_id in (select
+ i_item_id
+from
+ item
+where i_category in ('Shoes'))
+ and ws_item_sk = i_item_sk
+ and ws_sold_date_sk = d_date_sk
+ and d_year = 2001
+ and d_moy = 10
+ and ws_bill_addr_sk = ca_address_sk
+ and ca_gmt_offset = -6
+ group by i_item_id)
+ select
+ i_item_id
+,sum(total_sales) total_sales
+ from (select * from ss
+ union all
+ select * from cs
+ union all
+ select * from ws) tmp1
+ group by i_item_id
+ order by i_item_id
+ ,total_sales
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q61.sql b/benchmarks/queries/tpcds/q61.sql
new file mode 100644
index 0000000000..6d6c2a5fcb
--- /dev/null
+++ b/benchmarks/queries/tpcds/q61.sql
@@ -0,0 +1,45 @@
+-- SQLBench-DS query 61 derived from TPC-DS query 61 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select promotions,total,cast(promotions as decimal(15,4))/cast(total as decimal(15,4))*100
+from
+ (select sum(ss_ext_sales_price) promotions
+ from store_sales
+ ,store
+ ,promotion
+ ,date_dim
+ ,customer
+ ,customer_address
+ ,item
+ where ss_sold_date_sk = d_date_sk
+ and ss_store_sk = s_store_sk
+ and ss_promo_sk = p_promo_sk
+ and ss_customer_sk= c_customer_sk
+ and ca_address_sk = c_current_addr_sk
+ and ss_item_sk = i_item_sk
+ and ca_gmt_offset = -6
+ and i_category = 'Sports'
+ and (p_channel_dmail = 'Y' or p_channel_email = 'Y' or p_channel_tv = 'Y')
+ and s_gmt_offset = -6
+ and d_year = 2002
+ and d_moy = 11) promotional_sales,
+ (select sum(ss_ext_sales_price) total
+ from store_sales
+ ,store
+ ,date_dim
+ ,customer
+ ,customer_address
+ ,item
+ where ss_sold_date_sk = d_date_sk
+ and ss_store_sk = s_store_sk
+ and ss_customer_sk= c_customer_sk
+ and ca_address_sk = c_current_addr_sk
+ and ss_item_sk = i_item_sk
+ and ca_gmt_offset = -6
+ and i_category = 'Sports'
+ and s_gmt_offset = -6
+ and d_year = 2002
+ and d_moy = 11) all_sales
+order by promotions, total
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q62.sql b/benchmarks/queries/tpcds/q62.sql
new file mode 100644
index 0000000000..d0138e057b
--- /dev/null
+++ b/benchmarks/queries/tpcds/q62.sql
@@ -0,0 +1,36 @@
+-- SQLBench-DS query 62 derived from TPC-DS query 62 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ substr(w_warehouse_name,1,20)
+ ,sm_type
+ ,web_name
+ ,sum(case when (ws_ship_date_sk - ws_sold_date_sk <= 30 ) then 1 else 0 end) as `30 days`
+ ,sum(case when (ws_ship_date_sk - ws_sold_date_sk > 30) and
+ (ws_ship_date_sk - ws_sold_date_sk <= 60) then 1 else 0 end ) as `31-60 days`
+ ,sum(case when (ws_ship_date_sk - ws_sold_date_sk > 60) and
+ (ws_ship_date_sk - ws_sold_date_sk <= 90) then 1 else 0 end) as `61-90 days`
+ ,sum(case when (ws_ship_date_sk - ws_sold_date_sk > 90) and
+ (ws_ship_date_sk - ws_sold_date_sk <= 120) then 1 else 0 end) as `91-120 days`
+ ,sum(case when (ws_ship_date_sk - ws_sold_date_sk > 120) then 1 else 0 end) as `>120 days`
+from
+ web_sales
+ ,warehouse
+ ,ship_mode
+ ,web_site
+ ,date_dim
+where
+ d_month_seq between 1217 and 1217 + 11
+and ws_ship_date_sk = d_date_sk
+and ws_warehouse_sk = w_warehouse_sk
+and ws_ship_mode_sk = sm_ship_mode_sk
+and ws_web_site_sk = web_site_sk
+group by
+ substr(w_warehouse_name,1,20)
+ ,sm_type
+ ,web_name
+order by substr(w_warehouse_name,1,20)
+ ,sm_type
+ ,web_name
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q63.sql b/benchmarks/queries/tpcds/q63.sql
new file mode 100644
index 0000000000..3d85a2e38b
--- /dev/null
+++ b/benchmarks/queries/tpcds/q63.sql
@@ -0,0 +1,30 @@
+-- SQLBench-DS query 63 derived from TPC-DS query 63 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select *
+from (select i_manager_id
+ ,sum(ss_sales_price) sum_sales
+ ,avg(sum(ss_sales_price)) over (partition by i_manager_id) avg_monthly_sales
+ from item
+ ,store_sales
+ ,date_dim
+ ,store
+ where ss_item_sk = i_item_sk
+ and ss_sold_date_sk = d_date_sk
+ and ss_store_sk = s_store_sk
+ and d_month_seq in (1181,1181+1,1181+2,1181+3,1181+4,1181+5,1181+6,1181+7,1181+8,1181+9,1181+10,1181+11)
+ and (( i_category in ('Books','Children','Electronics')
+ and i_class in ('personal','portable','reference','self-help')
+ and i_brand in ('scholaramalgamalg #14','scholaramalgamalg #7',
+ 'exportiunivamalg #9','scholaramalgamalg #9'))
+ or( i_category in ('Women','Music','Men')
+ and i_class in ('accessories','classical','fragrances','pants')
+ and i_brand in ('amalgimporto #1','edu packscholar #1','exportiimporto #1',
+ 'importoamalg #1')))
+group by i_manager_id, d_moy) tmp1
+where case when avg_monthly_sales > 0 then abs (sum_sales - avg_monthly_sales) / avg_monthly_sales else null end > 0.1
+order by i_manager_id
+ ,avg_monthly_sales
+ ,sum_sales
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q64.sql b/benchmarks/queries/tpcds/q64.sql
new file mode 100644
index 0000000000..0350cdc7d0
--- /dev/null
+++ b/benchmarks/queries/tpcds/q64.sql
@@ -0,0 +1,122 @@
+-- SQLBench-DS query 64 derived from TPC-DS query 64 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with cs_ui as
+ (select cs_item_sk
+ ,sum(cs_ext_list_price) as sale,sum(cr_refunded_cash+cr_reversed_charge+cr_store_credit) as refund
+ from catalog_sales
+ ,catalog_returns
+ where cs_item_sk = cr_item_sk
+ and cs_order_number = cr_order_number
+ group by cs_item_sk
+ having sum(cs_ext_list_price)>2*sum(cr_refunded_cash+cr_reversed_charge+cr_store_credit)),
+cross_sales as
+ (select i_product_name product_name
+ ,i_item_sk item_sk
+ ,s_store_name store_name
+ ,s_zip store_zip
+ ,ad1.ca_street_number b_street_number
+ ,ad1.ca_street_name b_street_name
+ ,ad1.ca_city b_city
+ ,ad1.ca_zip b_zip
+ ,ad2.ca_street_number c_street_number
+ ,ad2.ca_street_name c_street_name
+ ,ad2.ca_city c_city
+ ,ad2.ca_zip c_zip
+ ,d1.d_year as syear
+ ,d2.d_year as fsyear
+ ,d3.d_year s2year
+ ,count(*) cnt
+ ,sum(ss_wholesale_cost) s1
+ ,sum(ss_list_price) s2
+ ,sum(ss_coupon_amt) s3
+ FROM store_sales
+ ,store_returns
+ ,cs_ui
+ ,date_dim d1
+ ,date_dim d2
+ ,date_dim d3
+ ,store
+ ,customer
+ ,customer_demographics cd1
+ ,customer_demographics cd2
+ ,promotion
+ ,household_demographics hd1
+ ,household_demographics hd2
+ ,customer_address ad1
+ ,customer_address ad2
+ ,income_band ib1
+ ,income_band ib2
+ ,item
+ WHERE ss_store_sk = s_store_sk AND
+ ss_sold_date_sk = d1.d_date_sk AND
+ ss_customer_sk = c_customer_sk AND
+ ss_cdemo_sk= cd1.cd_demo_sk AND
+ ss_hdemo_sk = hd1.hd_demo_sk AND
+ ss_addr_sk = ad1.ca_address_sk and
+ ss_item_sk = i_item_sk and
+ ss_item_sk = sr_item_sk and
+ ss_ticket_number = sr_ticket_number and
+ ss_item_sk = cs_ui.cs_item_sk and
+ c_current_cdemo_sk = cd2.cd_demo_sk AND
+ c_current_hdemo_sk = hd2.hd_demo_sk AND
+ c_current_addr_sk = ad2.ca_address_sk and
+ c_first_sales_date_sk = d2.d_date_sk and
+ c_first_shipto_date_sk = d3.d_date_sk and
+ ss_promo_sk = p_promo_sk and
+ hd1.hd_income_band_sk = ib1.ib_income_band_sk and
+ hd2.hd_income_band_sk = ib2.ib_income_band_sk and
+ cd1.cd_marital_status <> cd2.cd_marital_status and
+ i_color in ('light','cyan','burnished','green','almond','smoke') and
+ i_current_price between 22 and 22 + 10 and
+ i_current_price between 22 + 1 and 22 + 15
+group by i_product_name
+ ,i_item_sk
+ ,s_store_name
+ ,s_zip
+ ,ad1.ca_street_number
+ ,ad1.ca_street_name
+ ,ad1.ca_city
+ ,ad1.ca_zip
+ ,ad2.ca_street_number
+ ,ad2.ca_street_name
+ ,ad2.ca_city
+ ,ad2.ca_zip
+ ,d1.d_year
+ ,d2.d_year
+ ,d3.d_year
+)
+select cs1.product_name
+ ,cs1.store_name
+ ,cs1.store_zip
+ ,cs1.b_street_number
+ ,cs1.b_street_name
+ ,cs1.b_city
+ ,cs1.b_zip
+ ,cs1.c_street_number
+ ,cs1.c_street_name
+ ,cs1.c_city
+ ,cs1.c_zip
+ ,cs1.syear
+ ,cs1.cnt
+ ,cs1.s1 as s11
+ ,cs1.s2 as s21
+ ,cs1.s3 as s31
+ ,cs2.s1 as s12
+ ,cs2.s2 as s22
+ ,cs2.s3 as s32
+ ,cs2.syear
+ ,cs2.cnt
+from cross_sales cs1,cross_sales cs2
+where cs1.item_sk=cs2.item_sk and
+ cs1.syear = 2001 and
+ cs2.syear = 2001 + 1 and
+ cs2.cnt <= cs1.cnt and
+ cs1.store_name = cs2.store_name and
+ cs1.store_zip = cs2.store_zip
+order by cs1.product_name
+ ,cs1.store_name
+ ,cs2.cnt
+ ,cs1.s1
+ ,cs2.s1;
+
diff --git a/benchmarks/queries/tpcds/q65.sql b/benchmarks/queries/tpcds/q65.sql
new file mode 100644
index 0000000000..0c13a0debf
--- /dev/null
+++ b/benchmarks/queries/tpcds/q65.sql
@@ -0,0 +1,30 @@
+-- SQLBench-DS query 65 derived from TPC-DS query 65 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ s_store_name,
+ i_item_desc,
+ sc.revenue,
+ i_current_price,
+ i_wholesale_cost,
+ i_brand
+ from store, item,
+ (select ss_store_sk, avg(revenue) as ave
+ from
+ (select ss_store_sk, ss_item_sk,
+ sum(ss_sales_price) as revenue
+ from store_sales, date_dim
+ where ss_sold_date_sk = d_date_sk and d_month_seq between 1186 and 1186+11
+ group by ss_store_sk, ss_item_sk) sa
+ group by ss_store_sk) sb,
+ (select ss_store_sk, ss_item_sk, sum(ss_sales_price) as revenue
+ from store_sales, date_dim
+ where ss_sold_date_sk = d_date_sk and d_month_seq between 1186 and 1186+11
+ group by ss_store_sk, ss_item_sk) sc
+ where sb.ss_store_sk = sc.ss_store_sk and
+ sc.revenue <= 0.1 * sb.ave and
+ s_store_sk = sc.ss_store_sk and
+ i_item_sk = sc.ss_item_sk
+ order by s_store_name, i_item_desc
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q66.sql b/benchmarks/queries/tpcds/q66.sql
new file mode 100644
index 0000000000..ba066a561d
--- /dev/null
+++ b/benchmarks/queries/tpcds/q66.sql
@@ -0,0 +1,221 @@
+-- SQLBench-DS query 66 derived from TPC-DS query 66 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ w_warehouse_name
+ ,w_warehouse_sq_ft
+ ,w_city
+ ,w_county
+ ,w_state
+ ,w_country
+ ,ship_carriers
+ ,year
+ ,sum(jan_sales) as jan_sales
+ ,sum(feb_sales) as feb_sales
+ ,sum(mar_sales) as mar_sales
+ ,sum(apr_sales) as apr_sales
+ ,sum(may_sales) as may_sales
+ ,sum(jun_sales) as jun_sales
+ ,sum(jul_sales) as jul_sales
+ ,sum(aug_sales) as aug_sales
+ ,sum(sep_sales) as sep_sales
+ ,sum(oct_sales) as oct_sales
+ ,sum(nov_sales) as nov_sales
+ ,sum(dec_sales) as dec_sales
+ ,sum(jan_sales/w_warehouse_sq_ft) as jan_sales_per_sq_foot
+ ,sum(feb_sales/w_warehouse_sq_ft) as feb_sales_per_sq_foot
+ ,sum(mar_sales/w_warehouse_sq_ft) as mar_sales_per_sq_foot
+ ,sum(apr_sales/w_warehouse_sq_ft) as apr_sales_per_sq_foot
+ ,sum(may_sales/w_warehouse_sq_ft) as may_sales_per_sq_foot
+ ,sum(jun_sales/w_warehouse_sq_ft) as jun_sales_per_sq_foot
+ ,sum(jul_sales/w_warehouse_sq_ft) as jul_sales_per_sq_foot
+ ,sum(aug_sales/w_warehouse_sq_ft) as aug_sales_per_sq_foot
+ ,sum(sep_sales/w_warehouse_sq_ft) as sep_sales_per_sq_foot
+ ,sum(oct_sales/w_warehouse_sq_ft) as oct_sales_per_sq_foot
+ ,sum(nov_sales/w_warehouse_sq_ft) as nov_sales_per_sq_foot
+ ,sum(dec_sales/w_warehouse_sq_ft) as dec_sales_per_sq_foot
+ ,sum(jan_net) as jan_net
+ ,sum(feb_net) as feb_net
+ ,sum(mar_net) as mar_net
+ ,sum(apr_net) as apr_net
+ ,sum(may_net) as may_net
+ ,sum(jun_net) as jun_net
+ ,sum(jul_net) as jul_net
+ ,sum(aug_net) as aug_net
+ ,sum(sep_net) as sep_net
+ ,sum(oct_net) as oct_net
+ ,sum(nov_net) as nov_net
+ ,sum(dec_net) as dec_net
+ from (
+ select
+ w_warehouse_name
+ ,w_warehouse_sq_ft
+ ,w_city
+ ,w_county
+ ,w_state
+ ,w_country
+ ,'FEDEX' || ',' || 'GERMA' as ship_carriers
+ ,d_year as year
+ ,sum(case when d_moy = 1
+ then ws_ext_list_price* ws_quantity else 0 end) as jan_sales
+ ,sum(case when d_moy = 2
+ then ws_ext_list_price* ws_quantity else 0 end) as feb_sales
+ ,sum(case when d_moy = 3
+ then ws_ext_list_price* ws_quantity else 0 end) as mar_sales
+ ,sum(case when d_moy = 4
+ then ws_ext_list_price* ws_quantity else 0 end) as apr_sales
+ ,sum(case when d_moy = 5
+ then ws_ext_list_price* ws_quantity else 0 end) as may_sales
+ ,sum(case when d_moy = 6
+ then ws_ext_list_price* ws_quantity else 0 end) as jun_sales
+ ,sum(case when d_moy = 7
+ then ws_ext_list_price* ws_quantity else 0 end) as jul_sales
+ ,sum(case when d_moy = 8
+ then ws_ext_list_price* ws_quantity else 0 end) as aug_sales
+ ,sum(case when d_moy = 9
+ then ws_ext_list_price* ws_quantity else 0 end) as sep_sales
+ ,sum(case when d_moy = 10
+ then ws_ext_list_price* ws_quantity else 0 end) as oct_sales
+ ,sum(case when d_moy = 11
+ then ws_ext_list_price* ws_quantity else 0 end) as nov_sales
+ ,sum(case when d_moy = 12
+ then ws_ext_list_price* ws_quantity else 0 end) as dec_sales
+ ,sum(case when d_moy = 1
+ then ws_net_profit * ws_quantity else 0 end) as jan_net
+ ,sum(case when d_moy = 2
+ then ws_net_profit * ws_quantity else 0 end) as feb_net
+ ,sum(case when d_moy = 3
+ then ws_net_profit * ws_quantity else 0 end) as mar_net
+ ,sum(case when d_moy = 4
+ then ws_net_profit * ws_quantity else 0 end) as apr_net
+ ,sum(case when d_moy = 5
+ then ws_net_profit * ws_quantity else 0 end) as may_net
+ ,sum(case when d_moy = 6
+ then ws_net_profit * ws_quantity else 0 end) as jun_net
+ ,sum(case when d_moy = 7
+ then ws_net_profit * ws_quantity else 0 end) as jul_net
+ ,sum(case when d_moy = 8
+ then ws_net_profit * ws_quantity else 0 end) as aug_net
+ ,sum(case when d_moy = 9
+ then ws_net_profit * ws_quantity else 0 end) as sep_net
+ ,sum(case when d_moy = 10
+ then ws_net_profit * ws_quantity else 0 end) as oct_net
+ ,sum(case when d_moy = 11
+ then ws_net_profit * ws_quantity else 0 end) as nov_net
+ ,sum(case when d_moy = 12
+ then ws_net_profit * ws_quantity else 0 end) as dec_net
+ from
+ web_sales
+ ,warehouse
+ ,date_dim
+ ,time_dim
+ ,ship_mode
+ where
+ ws_warehouse_sk = w_warehouse_sk
+ and ws_sold_date_sk = d_date_sk
+ and ws_sold_time_sk = t_time_sk
+ and ws_ship_mode_sk = sm_ship_mode_sk
+ and d_year = 2001
+ and t_time between 19072 and 19072+28800
+ and sm_carrier in ('FEDEX','GERMA')
+ group by
+ w_warehouse_name
+ ,w_warehouse_sq_ft
+ ,w_city
+ ,w_county
+ ,w_state
+ ,w_country
+ ,d_year
+ union all
+ select
+ w_warehouse_name
+ ,w_warehouse_sq_ft
+ ,w_city
+ ,w_county
+ ,w_state
+ ,w_country
+ ,'FEDEX' || ',' || 'GERMA' as ship_carriers
+ ,d_year as year
+ ,sum(case when d_moy = 1
+ then cs_sales_price* cs_quantity else 0 end) as jan_sales
+ ,sum(case when d_moy = 2
+ then cs_sales_price* cs_quantity else 0 end) as feb_sales
+ ,sum(case when d_moy = 3
+ then cs_sales_price* cs_quantity else 0 end) as mar_sales
+ ,sum(case when d_moy = 4
+ then cs_sales_price* cs_quantity else 0 end) as apr_sales
+ ,sum(case when d_moy = 5
+ then cs_sales_price* cs_quantity else 0 end) as may_sales
+ ,sum(case when d_moy = 6
+ then cs_sales_price* cs_quantity else 0 end) as jun_sales
+ ,sum(case when d_moy = 7
+ then cs_sales_price* cs_quantity else 0 end) as jul_sales
+ ,sum(case when d_moy = 8
+ then cs_sales_price* cs_quantity else 0 end) as aug_sales
+ ,sum(case when d_moy = 9
+ then cs_sales_price* cs_quantity else 0 end) as sep_sales
+ ,sum(case when d_moy = 10
+ then cs_sales_price* cs_quantity else 0 end) as oct_sales
+ ,sum(case when d_moy = 11
+ then cs_sales_price* cs_quantity else 0 end) as nov_sales
+ ,sum(case when d_moy = 12
+ then cs_sales_price* cs_quantity else 0 end) as dec_sales
+ ,sum(case when d_moy = 1
+ then cs_net_paid * cs_quantity else 0 end) as jan_net
+ ,sum(case when d_moy = 2
+ then cs_net_paid * cs_quantity else 0 end) as feb_net
+ ,sum(case when d_moy = 3
+ then cs_net_paid * cs_quantity else 0 end) as mar_net
+ ,sum(case when d_moy = 4
+ then cs_net_paid * cs_quantity else 0 end) as apr_net
+ ,sum(case when d_moy = 5
+ then cs_net_paid * cs_quantity else 0 end) as may_net
+ ,sum(case when d_moy = 6
+ then cs_net_paid * cs_quantity else 0 end) as jun_net
+ ,sum(case when d_moy = 7
+ then cs_net_paid * cs_quantity else 0 end) as jul_net
+ ,sum(case when d_moy = 8
+ then cs_net_paid * cs_quantity else 0 end) as aug_net
+ ,sum(case when d_moy = 9
+ then cs_net_paid * cs_quantity else 0 end) as sep_net
+ ,sum(case when d_moy = 10
+ then cs_net_paid * cs_quantity else 0 end) as oct_net
+ ,sum(case when d_moy = 11
+ then cs_net_paid * cs_quantity else 0 end) as nov_net
+ ,sum(case when d_moy = 12
+ then cs_net_paid * cs_quantity else 0 end) as dec_net
+ from
+ catalog_sales
+ ,warehouse
+ ,date_dim
+ ,time_dim
+ ,ship_mode
+ where
+ cs_warehouse_sk = w_warehouse_sk
+ and cs_sold_date_sk = d_date_sk
+ and cs_sold_time_sk = t_time_sk
+ and cs_ship_mode_sk = sm_ship_mode_sk
+ and d_year = 2001
+ and t_time between 19072 AND 19072+28800
+ and sm_carrier in ('FEDEX','GERMA')
+ group by
+ w_warehouse_name
+ ,w_warehouse_sq_ft
+ ,w_city
+ ,w_county
+ ,w_state
+ ,w_country
+ ,d_year
+ ) x
+ group by
+ w_warehouse_name
+ ,w_warehouse_sq_ft
+ ,w_city
+ ,w_county
+ ,w_state
+ ,w_country
+ ,ship_carriers
+ ,year
+ order by w_warehouse_name
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q67.sql b/benchmarks/queries/tpcds/q67.sql
new file mode 100644
index 0000000000..7d684e6745
--- /dev/null
+++ b/benchmarks/queries/tpcds/q67.sql
@@ -0,0 +1,45 @@
+-- SQLBench-DS query 67 derived from TPC-DS query 67 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select *
+from (select i_category
+ ,i_class
+ ,i_brand
+ ,i_product_name
+ ,d_year
+ ,d_qoy
+ ,d_moy
+ ,s_store_id
+ ,sumsales
+ ,rank() over (partition by i_category order by sumsales desc) rk
+ from (select i_category
+ ,i_class
+ ,i_brand
+ ,i_product_name
+ ,d_year
+ ,d_qoy
+ ,d_moy
+ ,s_store_id
+ ,sum(coalesce(ss_sales_price*ss_quantity,0)) sumsales
+ from store_sales
+ ,date_dim
+ ,store
+ ,item
+ where ss_sold_date_sk=d_date_sk
+ and ss_item_sk=i_item_sk
+ and ss_store_sk = s_store_sk
+ and d_month_seq between 1194 and 1194+11
+ group by rollup(i_category, i_class, i_brand, i_product_name, d_year, d_qoy, d_moy,s_store_id))dw1) dw2
+where rk <= 100
+order by i_category
+ ,i_class
+ ,i_brand
+ ,i_product_name
+ ,d_year
+ ,d_qoy
+ ,d_moy
+ ,s_store_id
+ ,sumsales
+ ,rk
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q68.sql b/benchmarks/queries/tpcds/q68.sql
new file mode 100644
index 0000000000..242e0dbd93
--- /dev/null
+++ b/benchmarks/queries/tpcds/q68.sql
@@ -0,0 +1,43 @@
+-- SQLBench-DS query 68 derived from TPC-DS query 68 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select c_last_name
+ ,c_first_name
+ ,ca_city
+ ,bought_city
+ ,ss_ticket_number
+ ,extended_price
+ ,extended_tax
+ ,list_price
+ from (select ss_ticket_number
+ ,ss_customer_sk
+ ,ca_city bought_city
+ ,sum(ss_ext_sales_price) extended_price
+ ,sum(ss_ext_list_price) list_price
+ ,sum(ss_ext_tax) extended_tax
+ from store_sales
+ ,date_dim
+ ,store
+ ,household_demographics
+ ,customer_address
+ where store_sales.ss_sold_date_sk = date_dim.d_date_sk
+ and store_sales.ss_store_sk = store.s_store_sk
+ and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
+ and store_sales.ss_addr_sk = customer_address.ca_address_sk
+ and date_dim.d_dom between 1 and 2
+ and (household_demographics.hd_dep_count = 8 or
+ household_demographics.hd_vehicle_count= 3)
+ and date_dim.d_year in (2000,2000+1,2000+2)
+ and store.s_city in ('Midway','Fairview')
+ group by ss_ticket_number
+ ,ss_customer_sk
+ ,ss_addr_sk,ca_city) dn
+ ,customer
+ ,customer_address current_addr
+ where ss_customer_sk = c_customer_sk
+ and customer.c_current_addr_sk = current_addr.ca_address_sk
+ and current_addr.ca_city <> bought_city
+ order by c_last_name
+ ,ss_ticket_number
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q69.sql b/benchmarks/queries/tpcds/q69.sql
new file mode 100644
index 0000000000..4d4030cf59
--- /dev/null
+++ b/benchmarks/queries/tpcds/q69.sql
@@ -0,0 +1,48 @@
+-- SQLBench-DS query 69 derived from TPC-DS query 69 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ cd_gender,
+ cd_marital_status,
+ cd_education_status,
+ count(*) cnt1,
+ cd_purchase_estimate,
+ count(*) cnt2,
+ cd_credit_rating,
+ count(*) cnt3
+ from
+ customer c,customer_address ca,customer_demographics
+ where
+ c.c_current_addr_sk = ca.ca_address_sk and
+ ca_state in ('IN','VA','MS') and
+ cd_demo_sk = c.c_current_cdemo_sk and
+ exists (select *
+ from store_sales,date_dim
+ where c.c_customer_sk = ss_customer_sk and
+ ss_sold_date_sk = d_date_sk and
+ d_year = 2002 and
+ d_moy between 2 and 2+2) and
+ (not exists (select *
+ from web_sales,date_dim
+ where c.c_customer_sk = ws_bill_customer_sk and
+ ws_sold_date_sk = d_date_sk and
+ d_year = 2002 and
+ d_moy between 2 and 2+2) and
+ not exists (select *
+ from catalog_sales,date_dim
+ where c.c_customer_sk = cs_ship_customer_sk and
+ cs_sold_date_sk = d_date_sk and
+ d_year = 2002 and
+ d_moy between 2 and 2+2))
+ group by cd_gender,
+ cd_marital_status,
+ cd_education_status,
+ cd_purchase_estimate,
+ cd_credit_rating
+ order by cd_gender,
+ cd_marital_status,
+ cd_education_status,
+ cd_purchase_estimate,
+ cd_credit_rating
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q7.sql b/benchmarks/queries/tpcds/q7.sql
new file mode 100644
index 0000000000..bb58851616
--- /dev/null
+++ b/benchmarks/queries/tpcds/q7.sql
@@ -0,0 +1,22 @@
+-- SQLBench-DS query 7 derived from TPC-DS query 7 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_item_id,
+ avg(ss_quantity) agg1,
+ avg(ss_list_price) agg2,
+ avg(ss_coupon_amt) agg3,
+ avg(ss_sales_price) agg4
+ from store_sales, customer_demographics, date_dim, item, promotion
+ where ss_sold_date_sk = d_date_sk and
+ ss_item_sk = i_item_sk and
+ ss_cdemo_sk = cd_demo_sk and
+ ss_promo_sk = p_promo_sk and
+ cd_gender = 'M' and
+ cd_marital_status = 'M' and
+ cd_education_status = '4 yr Degree' and
+ (p_channel_email = 'N' or p_channel_event = 'N') and
+ d_year = 2001
+ group by i_item_id
+ order by i_item_id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q70.sql b/benchmarks/queries/tpcds/q70.sql
new file mode 100644
index 0000000000..a8b5f1c99f
--- /dev/null
+++ b/benchmarks/queries/tpcds/q70.sql
@@ -0,0 +1,39 @@
+-- SQLBench-DS query 70 derived from TPC-DS query 70 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ sum(ss_net_profit) as total_sum
+ ,s_state
+ ,s_county
+ ,grouping(s_state)+grouping(s_county) as lochierarchy
+ ,rank() over (
+ partition by grouping(s_state)+grouping(s_county),
+ case when grouping(s_county) = 0 then s_state end
+ order by sum(ss_net_profit) desc) as rank_within_parent
+ from
+ store_sales
+ ,date_dim d1
+ ,store
+ where
+ d1.d_month_seq between 1180 and 1180+11
+ and d1.d_date_sk = ss_sold_date_sk
+ and s_store_sk = ss_store_sk
+ and s_state in
+ ( select s_state
+ from (select s_state as s_state,
+ rank() over ( partition by s_state order by sum(ss_net_profit) desc) as ranking
+ from store_sales, store, date_dim
+ where d_month_seq between 1180 and 1180+11
+ and d_date_sk = ss_sold_date_sk
+ and s_store_sk = ss_store_sk
+ group by s_state
+ ) tmp1
+ where ranking <= 5
+ )
+ group by rollup(s_state,s_county)
+ order by
+ lochierarchy desc
+ ,case when lochierarchy = 0 then s_state end
+ ,rank_within_parent
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q71.sql b/benchmarks/queries/tpcds/q71.sql
new file mode 100644
index 0000000000..90d00806b0
--- /dev/null
+++ b/benchmarks/queries/tpcds/q71.sql
@@ -0,0 +1,41 @@
+-- SQLBench-DS query 71 derived from TPC-DS query 71 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_brand_id brand_id, i_brand brand,t_hour,t_minute,
+ sum(ext_price) ext_price
+ from item, (select ws_ext_sales_price as ext_price,
+ ws_sold_date_sk as sold_date_sk,
+ ws_item_sk as sold_item_sk,
+ ws_sold_time_sk as time_sk
+ from web_sales,date_dim
+ where d_date_sk = ws_sold_date_sk
+ and d_moy=11
+ and d_year=2001
+ union all
+ select cs_ext_sales_price as ext_price,
+ cs_sold_date_sk as sold_date_sk,
+ cs_item_sk as sold_item_sk,
+ cs_sold_time_sk as time_sk
+ from catalog_sales,date_dim
+ where d_date_sk = cs_sold_date_sk
+ and d_moy=11
+ and d_year=2001
+ union all
+ select ss_ext_sales_price as ext_price,
+ ss_sold_date_sk as sold_date_sk,
+ ss_item_sk as sold_item_sk,
+ ss_sold_time_sk as time_sk
+ from store_sales,date_dim
+ where d_date_sk = ss_sold_date_sk
+ and d_moy=11
+ and d_year=2001
+ ) tmp,time_dim
+ where
+ sold_item_sk = i_item_sk
+ and i_manager_id=1
+ and time_sk = t_time_sk
+ and (t_meal_time = 'breakfast' or t_meal_time = 'dinner')
+ group by i_brand, i_brand_id,t_hour,t_minute
+ order by ext_price desc, i_brand_id
+ ;
+
diff --git a/benchmarks/queries/tpcds/q72.sql b/benchmarks/queries/tpcds/q72.sql
new file mode 100644
index 0000000000..0e31057a03
--- /dev/null
+++ b/benchmarks/queries/tpcds/q72.sql
@@ -0,0 +1,30 @@
+-- SQLBench-DS query 72 derived from TPC-DS query 72 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_item_desc
+ ,w_warehouse_name
+ ,d1.d_week_seq
+ ,sum(case when p_promo_sk is null then 1 else 0 end) no_promo
+ ,sum(case when p_promo_sk is not null then 1 else 0 end) promo
+ ,count(*) total_cnt
+from catalog_sales
+join inventory on (cs_item_sk = inv_item_sk)
+join warehouse on (w_warehouse_sk=inv_warehouse_sk)
+join item on (i_item_sk = cs_item_sk)
+join customer_demographics on (cs_bill_cdemo_sk = cd_demo_sk)
+join household_demographics on (cs_bill_hdemo_sk = hd_demo_sk)
+join date_dim d1 on (cs_sold_date_sk = d1.d_date_sk)
+join date_dim d2 on (inv_date_sk = d2.d_date_sk)
+join date_dim d3 on (cs_ship_date_sk = d3.d_date_sk)
+left outer join promotion on (cs_promo_sk=p_promo_sk)
+left outer join catalog_returns on (cr_item_sk = cs_item_sk and cr_order_number = cs_order_number)
+where d1.d_week_seq = d2.d_week_seq
+ and inv_quantity_on_hand < cs_quantity
+ and d3.d_date > d1.d_date + 5
+ and hd_buy_potential = '501-1000'
+ and d1.d_year = 1999
+ and cd_marital_status = 'S'
+group by i_item_desc,w_warehouse_name,d1.d_week_seq
+order by total_cnt desc, i_item_desc, w_warehouse_name, d_week_seq
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q72_optimized.sql b/benchmarks/queries/tpcds/q72_optimized.sql
new file mode 100644
index 0000000000..a98a70e8f0
--- /dev/null
+++ b/benchmarks/queries/tpcds/q72_optimized.sql
@@ -0,0 +1,32 @@
+-- SQLBench-DS query 72 derived from TPC-DS query 72 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+
+-- This is a modified version of q72 that changes the join order to be sensible (the original q72
+-- intentionally has a terrible join order to test database vendors' join-reordering rules).
+
+select i_item_desc
+ ,w_warehouse_name
+ ,d1.d_week_seq
+ ,sum(case when p_promo_sk is null then 1 else 0 end) no_promo
+ ,sum(case when p_promo_sk is not null then 1 else 0 end) promo
+ ,count(*) total_cnt
+from catalog_sales
+ join date_dim d1 on (cs_sold_date_sk = d1.d_date_sk)
+ join customer_demographics on (cs_bill_cdemo_sk = cd_demo_sk)
+ join household_demographics on (cs_bill_hdemo_sk = hd_demo_sk)
+ join item on (i_item_sk = cs_item_sk)
+ join inventory on (cs_item_sk = inv_item_sk)
+ join warehouse on (w_warehouse_sk=inv_warehouse_sk)
+ join date_dim d2 on (inv_date_sk = d2.d_date_sk)
+ join date_dim d3 on (cs_ship_date_sk = d3.d_date_sk)
+ left outer join promotion on (cs_promo_sk=p_promo_sk)
+ left outer join catalog_returns on (cr_item_sk = cs_item_sk and cr_order_number = cs_order_number)
+where d1.d_week_seq = d2.d_week_seq
+ and inv_quantity_on_hand < cs_quantity
+ and d3.d_date > d1.d_date + 5
+ and hd_buy_potential = '501-1000'
+ and d1.d_year = 1999
+ and cd_marital_status = 'S'
+group by i_item_desc,w_warehouse_name,d1.d_week_seq
+order by total_cnt desc, i_item_desc, w_warehouse_name, d_week_seq
+LIMIT 100;
\ No newline at end of file
diff --git a/benchmarks/queries/tpcds/q73.sql b/benchmarks/queries/tpcds/q73.sql
new file mode 100644
index 0000000000..e7879d09ff
--- /dev/null
+++ b/benchmarks/queries/tpcds/q73.sql
@@ -0,0 +1,29 @@
+-- SQLBench-DS query 73 derived from TPC-DS query 73 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select c_last_name
+ ,c_first_name
+ ,c_salutation
+ ,c_preferred_cust_flag
+ ,ss_ticket_number
+ ,cnt from
+ (select ss_ticket_number
+ ,ss_customer_sk
+ ,count(*) cnt
+ from store_sales,date_dim,store,household_demographics
+ where store_sales.ss_sold_date_sk = date_dim.d_date_sk
+ and store_sales.ss_store_sk = store.s_store_sk
+ and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
+ and date_dim.d_dom between 1 and 2
+ and (household_demographics.hd_buy_potential = '1001-5000' or
+ household_demographics.hd_buy_potential = '5001-10000')
+ and household_demographics.hd_vehicle_count > 0
+ and case when household_demographics.hd_vehicle_count > 0 then
+ household_demographics.hd_dep_count/ household_demographics.hd_vehicle_count else null end > 1
+ and date_dim.d_year in (1999,1999+1,1999+2)
+ and store.s_county in ('Williamson County','Williamson County','Williamson County','Williamson County')
+ group by ss_ticket_number,ss_customer_sk) dj,customer
+ where ss_customer_sk = c_customer_sk
+ and cnt between 1 and 5
+ order by cnt desc, c_last_name asc;
+
diff --git a/benchmarks/queries/tpcds/q74.sql b/benchmarks/queries/tpcds/q74.sql
new file mode 100644
index 0000000000..b9829d9d5e
--- /dev/null
+++ b/benchmarks/queries/tpcds/q74.sql
@@ -0,0 +1,62 @@
+-- SQLBench-DS query 74 derived from TPC-DS query 74 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with year_total as (
+ select c_customer_id customer_id
+ ,c_first_name customer_first_name
+ ,c_last_name customer_last_name
+ ,d_year as year
+ ,stddev_samp(ss_net_paid) year_total
+ ,'s' sale_type
+ from customer
+ ,store_sales
+ ,date_dim
+ where c_customer_sk = ss_customer_sk
+ and ss_sold_date_sk = d_date_sk
+ and d_year in (2001,2001+1)
+ group by c_customer_id
+ ,c_first_name
+ ,c_last_name
+ ,d_year
+ union all
+ select c_customer_id customer_id
+ ,c_first_name customer_first_name
+ ,c_last_name customer_last_name
+ ,d_year as year
+ ,stddev_samp(ws_net_paid) year_total
+ ,'w' sale_type
+ from customer
+ ,web_sales
+ ,date_dim
+ where c_customer_sk = ws_bill_customer_sk
+ and ws_sold_date_sk = d_date_sk
+ and d_year in (2001,2001+1)
+ group by c_customer_id
+ ,c_first_name
+ ,c_last_name
+ ,d_year
+ )
+ select
+ t_s_secyear.customer_id, t_s_secyear.customer_first_name, t_s_secyear.customer_last_name
+ from year_total t_s_firstyear
+ ,year_total t_s_secyear
+ ,year_total t_w_firstyear
+ ,year_total t_w_secyear
+ where t_s_secyear.customer_id = t_s_firstyear.customer_id
+ and t_s_firstyear.customer_id = t_w_secyear.customer_id
+ and t_s_firstyear.customer_id = t_w_firstyear.customer_id
+ and t_s_firstyear.sale_type = 's'
+ and t_w_firstyear.sale_type = 'w'
+ and t_s_secyear.sale_type = 's'
+ and t_w_secyear.sale_type = 'w'
+ and t_s_firstyear.year = 2001
+ and t_s_secyear.year = 2001+1
+ and t_w_firstyear.year = 2001
+ and t_w_secyear.year = 2001+1
+ and t_s_firstyear.year_total > 0
+ and t_w_firstyear.year_total > 0
+ and case when t_w_firstyear.year_total > 0 then t_w_secyear.year_total / t_w_firstyear.year_total else null end
+ > case when t_s_firstyear.year_total > 0 then t_s_secyear.year_total / t_s_firstyear.year_total else null end
+ order by 3,2,1
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q75.sql b/benchmarks/queries/tpcds/q75.sql
new file mode 100644
index 0000000000..cec9da56a5
--- /dev/null
+++ b/benchmarks/queries/tpcds/q75.sql
@@ -0,0 +1,71 @@
+-- SQLBench-DS query 75 derived from TPC-DS query 75 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+WITH all_sales AS (
+ SELECT d_year
+ ,i_brand_id
+ ,i_class_id
+ ,i_category_id
+ ,i_manufact_id
+ ,SUM(sales_cnt) AS sales_cnt
+ ,SUM(sales_amt) AS sales_amt
+ FROM (SELECT d_year
+ ,i_brand_id
+ ,i_class_id
+ ,i_category_id
+ ,i_manufact_id
+ ,cs_quantity - COALESCE(cr_return_quantity,0) AS sales_cnt
+ ,cs_ext_sales_price - COALESCE(cr_return_amount,0.0) AS sales_amt
+ FROM catalog_sales JOIN item ON i_item_sk=cs_item_sk
+ JOIN date_dim ON d_date_sk=cs_sold_date_sk
+ LEFT JOIN catalog_returns ON (cs_order_number=cr_order_number
+ AND cs_item_sk=cr_item_sk)
+ WHERE i_category='Shoes'
+ UNION
+ SELECT d_year
+ ,i_brand_id
+ ,i_class_id
+ ,i_category_id
+ ,i_manufact_id
+ ,ss_quantity - COALESCE(sr_return_quantity,0) AS sales_cnt
+ ,ss_ext_sales_price - COALESCE(sr_return_amt,0.0) AS sales_amt
+ FROM store_sales JOIN item ON i_item_sk=ss_item_sk
+ JOIN date_dim ON d_date_sk=ss_sold_date_sk
+ LEFT JOIN store_returns ON (ss_ticket_number=sr_ticket_number
+ AND ss_item_sk=sr_item_sk)
+ WHERE i_category='Shoes'
+ UNION
+ SELECT d_year
+ ,i_brand_id
+ ,i_class_id
+ ,i_category_id
+ ,i_manufact_id
+ ,ws_quantity - COALESCE(wr_return_quantity,0) AS sales_cnt
+ ,ws_ext_sales_price - COALESCE(wr_return_amt,0.0) AS sales_amt
+ FROM web_sales JOIN item ON i_item_sk=ws_item_sk
+ JOIN date_dim ON d_date_sk=ws_sold_date_sk
+ LEFT JOIN web_returns ON (ws_order_number=wr_order_number
+ AND ws_item_sk=wr_item_sk)
+ WHERE i_category='Shoes') sales_detail
+ GROUP BY d_year, i_brand_id, i_class_id, i_category_id, i_manufact_id)
+ SELECT prev_yr.d_year AS prev_year
+ ,curr_yr.d_year AS year
+ ,curr_yr.i_brand_id
+ ,curr_yr.i_class_id
+ ,curr_yr.i_category_id
+ ,curr_yr.i_manufact_id
+ ,prev_yr.sales_cnt AS prev_yr_cnt
+ ,curr_yr.sales_cnt AS curr_yr_cnt
+ ,curr_yr.sales_cnt-prev_yr.sales_cnt AS sales_cnt_diff
+ ,curr_yr.sales_amt-prev_yr.sales_amt AS sales_amt_diff
+ FROM all_sales curr_yr, all_sales prev_yr
+ WHERE curr_yr.i_brand_id=prev_yr.i_brand_id
+ AND curr_yr.i_class_id=prev_yr.i_class_id
+ AND curr_yr.i_category_id=prev_yr.i_category_id
+ AND curr_yr.i_manufact_id=prev_yr.i_manufact_id
+ AND curr_yr.d_year=2000
+ AND prev_yr.d_year=2000-1
+ AND CAST(curr_yr.sales_cnt AS DECIMAL(17,2))/CAST(prev_yr.sales_cnt AS DECIMAL(17,2))<0.9
+ ORDER BY sales_cnt_diff,sales_amt_diff
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q76.sql b/benchmarks/queries/tpcds/q76.sql
new file mode 100644
index 0000000000..931a1334f6
--- /dev/null
+++ b/benchmarks/queries/tpcds/q76.sql
@@ -0,0 +1,25 @@
+-- SQLBench-DS query 76 derived from TPC-DS query 76 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select channel, col_name, d_year, d_qoy, i_category, COUNT(*) sales_cnt, SUM(ext_sales_price) sales_amt FROM (
+ SELECT 'store' as channel, 'ss_customer_sk' col_name, d_year, d_qoy, i_category, ss_ext_sales_price ext_sales_price
+ FROM store_sales, item, date_dim
+ WHERE ss_customer_sk IS NULL
+ AND ss_sold_date_sk=d_date_sk
+ AND ss_item_sk=i_item_sk
+ UNION ALL
+ SELECT 'web' as channel, 'ws_ship_hdemo_sk' col_name, d_year, d_qoy, i_category, ws_ext_sales_price ext_sales_price
+ FROM web_sales, item, date_dim
+ WHERE ws_ship_hdemo_sk IS NULL
+ AND ws_sold_date_sk=d_date_sk
+ AND ws_item_sk=i_item_sk
+ UNION ALL
+ SELECT 'catalog' as channel, 'cs_bill_customer_sk' col_name, d_year, d_qoy, i_category, cs_ext_sales_price ext_sales_price
+ FROM catalog_sales, item, date_dim
+ WHERE cs_bill_customer_sk IS NULL
+ AND cs_sold_date_sk=d_date_sk
+ AND cs_item_sk=i_item_sk) foo
+GROUP BY channel, col_name, d_year, d_qoy, i_category
+ORDER BY channel, col_name, d_year, d_qoy, i_category
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q77.sql b/benchmarks/queries/tpcds/q77.sql
new file mode 100644
index 0000000000..d04bc14bc9
--- /dev/null
+++ b/benchmarks/queries/tpcds/q77.sql
@@ -0,0 +1,109 @@
+-- SQLBench-DS query 77 derived from TPC-DS query 77 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with ss as
+ (select s_store_sk,
+ sum(ss_ext_sales_price) as sales,
+ sum(ss_net_profit) as profit
+ from store_sales,
+ date_dim,
+ store
+ where ss_sold_date_sk = d_date_sk
+ and d_date between cast('2001-08-11' as date)
+ and (cast('2001-08-11' as date) + INTERVAL '30 DAYS')
+ and ss_store_sk = s_store_sk
+ group by s_store_sk)
+ ,
+ sr as
+ (select s_store_sk,
+ sum(sr_return_amt) as returns,
+ sum(sr_net_loss) as profit_loss
+ from store_returns,
+ date_dim,
+ store
+ where sr_returned_date_sk = d_date_sk
+ and d_date between cast('2001-08-11' as date)
+ and (cast('2001-08-11' as date) + INTERVAL '30 DAYS')
+ and sr_store_sk = s_store_sk
+ group by s_store_sk),
+ cs as
+ (select cs_call_center_sk,
+ sum(cs_ext_sales_price) as sales,
+ sum(cs_net_profit) as profit
+ from catalog_sales,
+ date_dim
+ where cs_sold_date_sk = d_date_sk
+ and d_date between cast('2001-08-11' as date)
+ and (cast('2001-08-11' as date) + INTERVAL '30 DAYS')
+ group by cs_call_center_sk
+ ),
+ cr as
+ (select cr_call_center_sk,
+ sum(cr_return_amount) as returns,
+ sum(cr_net_loss) as profit_loss
+ from catalog_returns,
+ date_dim
+ where cr_returned_date_sk = d_date_sk
+ and d_date between cast('2001-08-11' as date)
+ and (cast('2001-08-11' as date) + INTERVAL '30 DAYS')
+ group by cr_call_center_sk
+ ),
+ ws as
+ ( select wp_web_page_sk,
+ sum(ws_ext_sales_price) as sales,
+ sum(ws_net_profit) as profit
+ from web_sales,
+ date_dim,
+ web_page
+ where ws_sold_date_sk = d_date_sk
+ and d_date between cast('2001-08-11' as date)
+ and (cast('2001-08-11' as date) + INTERVAL '30 DAYS')
+ and ws_web_page_sk = wp_web_page_sk
+ group by wp_web_page_sk),
+ wr as
+ (select wp_web_page_sk,
+ sum(wr_return_amt) as returns,
+ sum(wr_net_loss) as profit_loss
+ from web_returns,
+ date_dim,
+ web_page
+ where wr_returned_date_sk = d_date_sk
+ and d_date between cast('2001-08-11' as date)
+ and (cast('2001-08-11' as date) + INTERVAL '30 DAYS')
+ and wr_web_page_sk = wp_web_page_sk
+ group by wp_web_page_sk)
+ select channel
+ , id
+ , sum(sales) as sales
+ , sum(returns) as returns
+ , sum(profit) as profit
+ from
+ (select 'store channel' as channel
+ , ss.s_store_sk as id
+ , sales
+ , coalesce(returns, 0) as returns
+ , (profit - coalesce(profit_loss,0)) as profit
+ from ss left join sr
+ on ss.s_store_sk = sr.s_store_sk
+ union all
+ select 'catalog channel' as channel
+ , cs_call_center_sk as id
+ , sales
+ , returns
+ , (profit - profit_loss) as profit
+ from cs
+ , cr
+ union all
+ select 'web channel' as channel
+ , ws.wp_web_page_sk as id
+ , sales
+ , coalesce(returns, 0) returns
+ , (profit - coalesce(profit_loss,0)) as profit
+ from ws left join wr
+ on ws.wp_web_page_sk = wr.wp_web_page_sk
+ ) x
+ group by rollup (channel, id)
+ order by channel
+ ,id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q78.sql b/benchmarks/queries/tpcds/q78.sql
new file mode 100644
index 0000000000..927ef63561
--- /dev/null
+++ b/benchmarks/queries/tpcds/q78.sql
@@ -0,0 +1,59 @@
+-- SQLBench-DS query 78 derived from TPC-DS query 78 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with ws as
+ (select d_year AS ws_sold_year, ws_item_sk,
+ ws_bill_customer_sk ws_customer_sk,
+ sum(ws_quantity) ws_qty,
+ sum(ws_wholesale_cost) ws_wc,
+ sum(ws_sales_price) ws_sp
+ from web_sales
+ left join web_returns on wr_order_number=ws_order_number and ws_item_sk=wr_item_sk
+ join date_dim on ws_sold_date_sk = d_date_sk
+ where wr_order_number is null
+ group by d_year, ws_item_sk, ws_bill_customer_sk
+ ),
+cs as
+ (select d_year AS cs_sold_year, cs_item_sk,
+ cs_bill_customer_sk cs_customer_sk,
+ sum(cs_quantity) cs_qty,
+ sum(cs_wholesale_cost) cs_wc,
+ sum(cs_sales_price) cs_sp
+ from catalog_sales
+ left join catalog_returns on cr_order_number=cs_order_number and cs_item_sk=cr_item_sk
+ join date_dim on cs_sold_date_sk = d_date_sk
+ where cr_order_number is null
+ group by d_year, cs_item_sk, cs_bill_customer_sk
+ ),
+ss as
+ (select d_year AS ss_sold_year, ss_item_sk,
+ ss_customer_sk,
+ sum(ss_quantity) ss_qty,
+ sum(ss_wholesale_cost) ss_wc,
+ sum(ss_sales_price) ss_sp
+ from store_sales
+ left join store_returns on sr_ticket_number=ss_ticket_number and ss_item_sk=sr_item_sk
+ join date_dim on ss_sold_date_sk = d_date_sk
+ where sr_ticket_number is null
+ group by d_year, ss_item_sk, ss_customer_sk
+ )
+ select
+ss_customer_sk,
+round(ss_qty/(coalesce(ws_qty,0)+coalesce(cs_qty,0)),2) ratio,
+ss_qty store_qty, ss_wc store_wholesale_cost, ss_sp store_sales_price,
+coalesce(ws_qty,0)+coalesce(cs_qty,0) other_chan_qty,
+coalesce(ws_wc,0)+coalesce(cs_wc,0) other_chan_wholesale_cost,
+coalesce(ws_sp,0)+coalesce(cs_sp,0) other_chan_sales_price
+from ss
+left join ws on (ws_sold_year=ss_sold_year and ws_item_sk=ss_item_sk and ws_customer_sk=ss_customer_sk)
+left join cs on (cs_sold_year=ss_sold_year and cs_item_sk=ss_item_sk and cs_customer_sk=ss_customer_sk)
+where (coalesce(ws_qty,0)>0 or coalesce(cs_qty, 0)>0) and ss_sold_year=2001
+order by
+ ss_customer_sk,
+ ss_qty desc, ss_wc desc, ss_sp desc,
+ other_chan_qty,
+ other_chan_wholesale_cost,
+ other_chan_sales_price,
+ ratio
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q79.sql b/benchmarks/queries/tpcds/q79.sql
new file mode 100644
index 0000000000..568444b152
--- /dev/null
+++ b/benchmarks/queries/tpcds/q79.sql
@@ -0,0 +1,24 @@
+-- SQLBench-DS query 79 derived from TPC-DS query 79 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ c_last_name,c_first_name,substr(s_city,1,30),ss_ticket_number,amt,profit
+ from
+ (select ss_ticket_number
+ ,ss_customer_sk
+ ,store.s_city
+ ,sum(ss_coupon_amt) amt
+ ,sum(ss_net_profit) profit
+ from store_sales,date_dim,store,household_demographics
+ where store_sales.ss_sold_date_sk = date_dim.d_date_sk
+ and store_sales.ss_store_sk = store.s_store_sk
+ and store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
+ and (household_demographics.hd_dep_count = 0 or household_demographics.hd_vehicle_count > 4)
+ and date_dim.d_dow = 1
+ and date_dim.d_year in (1999,1999+1,1999+2)
+ and store.s_number_employees between 200 and 295
+ group by ss_ticket_number,ss_customer_sk,ss_addr_sk,store.s_city) ms,customer
+ where ss_customer_sk = c_customer_sk
+ order by c_last_name,c_first_name,substr(s_city,1,30), profit
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q8.sql b/benchmarks/queries/tpcds/q8.sql
new file mode 100644
index 0000000000..0a994b4d21
--- /dev/null
+++ b/benchmarks/queries/tpcds/q8.sql
@@ -0,0 +1,109 @@
+-- SQLBench-DS query 8 derived from TPC-DS query 8 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select s_store_name
+ ,sum(ss_net_profit)
+ from store_sales
+ ,date_dim
+ ,store,
+ (select ca_zip
+ from (
+ SELECT substr(ca_zip,1,5) ca_zip
+ FROM customer_address
+ WHERE substr(ca_zip,1,5) IN (
+ '19100','41548','51640','49699','88329','55986',
+ '85119','19510','61020','95452','26235',
+ '51102','16733','42819','27823','90192',
+ '31905','28865','62197','23750','81398',
+ '95288','45114','82060','12313','25218',
+ '64386','46400','77230','69271','43672',
+ '36521','34217','13017','27936','42766',
+ '59233','26060','27477','39981','93402',
+ '74270','13932','51731','71642','17710',
+ '85156','21679','70840','67191','39214',
+ '35273','27293','17128','15458','31615',
+ '60706','67657','54092','32775','14683',
+ '32206','62543','43053','11297','58216',
+ '49410','14710','24501','79057','77038',
+ '91286','32334','46298','18326','67213',
+ '65382','40315','56115','80162','55956',
+ '81583','73588','32513','62880','12201',
+ '11592','17014','83832','61796','57872',
+ '78829','69912','48524','22016','26905',
+ '48511','92168','63051','25748','89786',
+ '98827','86404','53029','37524','14039',
+ '50078','34487','70142','18697','40129',
+ '60642','42810','62667','57183','46414',
+ '58463','71211','46364','34851','54884',
+ '25382','25239','74126','21568','84204',
+ '13607','82518','32982','36953','86001',
+ '79278','21745','64444','35199','83181',
+ '73255','86177','98043','90392','13882',
+ '47084','17859','89526','42072','20233',
+ '52745','75000','22044','77013','24182',
+ '52554','56138','43440','86100','48791',
+ '21883','17096','15965','31196','74903',
+ '19810','35763','92020','55176','54433',
+ '68063','71919','44384','16612','32109',
+ '28207','14762','89933','10930','27616',
+ '56809','14244','22733','33177','29784',
+ '74968','37887','11299','34692','85843',
+ '83663','95421','19323','17406','69264',
+ '28341','50150','79121','73974','92917',
+ '21229','32254','97408','46011','37169',
+ '18146','27296','62927','68812','47734',
+ '86572','12620','80252','50173','27261',
+ '29534','23488','42184','23695','45868',
+ '12910','23429','29052','63228','30731',
+ '15747','25827','22332','62349','56661',
+ '44652','51862','57007','22773','40361',
+ '65238','19327','17282','44708','35484',
+ '34064','11148','92729','22995','18833',
+ '77528','48917','17256','93166','68576',
+ '71096','56499','35096','80551','82424',
+ '17700','32748','78969','46820','57725',
+ '46179','54677','98097','62869','83959',
+ '66728','19716','48326','27420','53458',
+ '69056','84216','36688','63957','41469',
+ '66843','18024','81950','21911','58387',
+ '58103','19813','34581','55347','17171',
+ '35914','75043','75088','80541','26802',
+ '28849','22356','57721','77084','46385',
+ '59255','29308','65885','70673','13306',
+ '68788','87335','40987','31654','67560',
+ '92309','78116','65961','45018','16548',
+ '67092','21818','33716','49449','86150',
+ '12156','27574','43201','50977','52839',
+ '33234','86611','71494','17823','57172',
+ '59869','34086','51052','11320','39717',
+ '79604','24672','70555','38378','91135',
+ '15567','21606','74994','77168','38607',
+ '27384','68328','88944','40203','37893',
+ '42726','83549','48739','55652','27543',
+ '23109','98908','28831','45011','47525',
+ '43870','79404','35780','42136','49317',
+ '14574','99586','21107','14302','83882',
+ '81272','92552','14916','87533','86518',
+ '17862','30741','96288','57886','30304',
+ '24201','79457','36728','49833','35182',
+ '20108','39858','10804','47042','20439',
+ '54708','59027','82499','75311','26548',
+ '53406','92060','41152','60446','33129',
+ '43979','16903','60319','35550','33887',
+ '25463','40343','20726','44429')
+ intersect
+ select ca_zip
+ from (SELECT substr(ca_zip,1,5) ca_zip,count(*) cnt
+ FROM customer_address, customer
+ WHERE ca_address_sk = c_current_addr_sk and
+ c_preferred_cust_flag='Y'
+ group by ca_zip
+ having count(*) > 10)A1)A2) V1
+ where ss_store_sk = s_store_sk
+ and ss_sold_date_sk = d_date_sk
+ and d_qoy = 1 and d_year = 2000
+ and (substr(s_zip,1,2) = substr(V1.ca_zip,1,2))
+ group by s_store_name
+ order by s_store_name
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q80.sql b/benchmarks/queries/tpcds/q80.sql
new file mode 100644
index 0000000000..29b2f87464
--- /dev/null
+++ b/benchmarks/queries/tpcds/q80.sql
@@ -0,0 +1,97 @@
+-- SQLBench-DS query 80 derived from TPC-DS query 80 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with ssr as
+ (select s_store_id as store_id,
+ sum(ss_ext_sales_price) as sales,
+ sum(coalesce(sr_return_amt, 0)) as returns,
+ sum(ss_net_profit - coalesce(sr_net_loss, 0)) as profit
+ from store_sales left outer join store_returns on
+ (ss_item_sk = sr_item_sk and ss_ticket_number = sr_ticket_number),
+ date_dim,
+ store,
+ item,
+ promotion
+ where ss_sold_date_sk = d_date_sk
+ and d_date between cast('2002-08-04' as date)
+ and (cast('2002-08-04' as date) + INTERVAL '30 DAYS')
+ and ss_store_sk = s_store_sk
+ and ss_item_sk = i_item_sk
+ and i_current_price > 50
+ and ss_promo_sk = p_promo_sk
+ and p_channel_tv = 'N'
+ group by s_store_id)
+ ,
+ csr as
+ (select cp_catalog_page_id as catalog_page_id,
+ sum(cs_ext_sales_price) as sales,
+ sum(coalesce(cr_return_amount, 0)) as returns,
+ sum(cs_net_profit - coalesce(cr_net_loss, 0)) as profit
+ from catalog_sales left outer join catalog_returns on
+ (cs_item_sk = cr_item_sk and cs_order_number = cr_order_number),
+ date_dim,
+ catalog_page,
+ item,
+ promotion
+ where cs_sold_date_sk = d_date_sk
+ and d_date between cast('2002-08-04' as date)
+ and (cast('2002-08-04' as date) + INTERVAL '30 DAYS')
+ and cs_catalog_page_sk = cp_catalog_page_sk
+ and cs_item_sk = i_item_sk
+ and i_current_price > 50
+ and cs_promo_sk = p_promo_sk
+ and p_channel_tv = 'N'
+group by cp_catalog_page_id)
+ ,
+ wsr as
+ (select web_site_id,
+ sum(ws_ext_sales_price) as sales,
+ sum(coalesce(wr_return_amt, 0)) as returns,
+ sum(ws_net_profit - coalesce(wr_net_loss, 0)) as profit
+ from web_sales left outer join web_returns on
+ (ws_item_sk = wr_item_sk and ws_order_number = wr_order_number),
+ date_dim,
+ web_site,
+ item,
+ promotion
+ where ws_sold_date_sk = d_date_sk
+ and d_date between cast('2002-08-04' as date)
+ and (cast('2002-08-04' as date) + INTERVAL '30 DAYS')
+ and ws_web_site_sk = web_site_sk
+ and ws_item_sk = i_item_sk
+ and i_current_price > 50
+ and ws_promo_sk = p_promo_sk
+ and p_channel_tv = 'N'
+group by web_site_id)
+ select channel
+ , id
+ , sum(sales) as sales
+ , sum(returns) as returns
+ , sum(profit) as profit
+ from
+ (select 'store channel' as channel
+ , 'store' || store_id as id
+ , sales
+ , returns
+ , profit
+ from ssr
+ union all
+ select 'catalog channel' as channel
+ , 'catalog_page' || catalog_page_id as id
+ , sales
+ , returns
+ , profit
+ from csr
+ union all
+ select 'web channel' as channel
+ , 'web_site' || web_site_id as id
+ , sales
+ , returns
+ , profit
+ from wsr
+ ) x
+ group by rollup (channel, id)
+ order by channel
+ ,id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q81.sql b/benchmarks/queries/tpcds/q81.sql
new file mode 100644
index 0000000000..8dd4c43067
--- /dev/null
+++ b/benchmarks/queries/tpcds/q81.sql
@@ -0,0 +1,32 @@
+-- SQLBench-DS query 81 derived from TPC-DS query 81 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with customer_total_return as
+ (select cr_returning_customer_sk as ctr_customer_sk
+ ,ca_state as ctr_state,
+ sum(cr_return_amt_inc_tax) as ctr_total_return
+ from catalog_returns
+ ,date_dim
+ ,customer_address
+ where cr_returned_date_sk = d_date_sk
+ and d_year =1998
+ and cr_returning_addr_sk = ca_address_sk
+ group by cr_returning_customer_sk
+ ,ca_state )
+ select c_customer_id,c_salutation,c_first_name,c_last_name,ca_street_number,ca_street_name
+ ,ca_street_type,ca_suite_number,ca_city,ca_county,ca_state,ca_zip,ca_country,ca_gmt_offset
+ ,ca_location_type,ctr_total_return
+ from customer_total_return ctr1
+ ,customer_address
+ ,customer
+ where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2
+ from customer_total_return ctr2
+ where ctr1.ctr_state = ctr2.ctr_state)
+ and ca_address_sk = c_current_addr_sk
+ and ca_state = 'TX'
+ and ctr1.ctr_customer_sk = c_customer_sk
+ order by c_customer_id,c_salutation,c_first_name,c_last_name,ca_street_number,ca_street_name
+ ,ca_street_type,ca_suite_number,ca_city,ca_county,ca_state,ca_zip,ca_country,ca_gmt_offset
+ ,ca_location_type,ctr_total_return
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q82.sql b/benchmarks/queries/tpcds/q82.sql
new file mode 100644
index 0000000000..faea7a2f67
--- /dev/null
+++ b/benchmarks/queries/tpcds/q82.sql
@@ -0,0 +1,18 @@
+-- SQLBench-DS query 82 derived from TPC-DS query 82 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_item_id
+ ,i_item_desc
+ ,i_current_price
+ from item, inventory, date_dim, store_sales
+ where i_current_price between 69 and 69+30
+ and inv_item_sk = i_item_sk
+ and d_date_sk=inv_date_sk
+ and d_date between cast('1998-06-06' as date) and (cast('1998-06-06' as date) + INTERVAL '60 DAYS')
+ and i_manufact_id in (105,513,180,137)
+ and inv_quantity_on_hand between 100 and 500
+ and ss_item_sk = i_item_sk
+ group by i_item_id,i_item_desc,i_current_price
+ order by i_item_id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q83.sql b/benchmarks/queries/tpcds/q83.sql
new file mode 100644
index 0000000000..b2512ed83e
--- /dev/null
+++ b/benchmarks/queries/tpcds/q83.sql
@@ -0,0 +1,68 @@
+-- SQLBench-DS query 83 derived from TPC-DS query 83 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with sr_items as
+ (select i_item_id item_id,
+ sum(sr_return_quantity) sr_item_qty
+ from store_returns,
+ item,
+ date_dim
+ where sr_item_sk = i_item_sk
+ and d_date in
+ (select d_date
+ from date_dim
+ where d_week_seq in
+ (select d_week_seq
+ from date_dim
+ where d_date in ('2000-04-29','2000-09-09','2000-11-02')))
+ and sr_returned_date_sk = d_date_sk
+ group by i_item_id),
+ cr_items as
+ (select i_item_id item_id,
+ sum(cr_return_quantity) cr_item_qty
+ from catalog_returns,
+ item,
+ date_dim
+ where cr_item_sk = i_item_sk
+ and d_date in
+ (select d_date
+ from date_dim
+ where d_week_seq in
+ (select d_week_seq
+ from date_dim
+ where d_date in ('2000-04-29','2000-09-09','2000-11-02')))
+ and cr_returned_date_sk = d_date_sk
+ group by i_item_id),
+ wr_items as
+ (select i_item_id item_id,
+ sum(wr_return_quantity) wr_item_qty
+ from web_returns,
+ item,
+ date_dim
+ where wr_item_sk = i_item_sk
+ and d_date in
+ (select d_date
+ from date_dim
+ where d_week_seq in
+ (select d_week_seq
+ from date_dim
+ where d_date in ('2000-04-29','2000-09-09','2000-11-02')))
+ and wr_returned_date_sk = d_date_sk
+ group by i_item_id)
+ select sr_items.item_id
+ ,sr_item_qty
+ ,sr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 sr_dev
+ ,cr_item_qty
+ ,cr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 cr_dev
+ ,wr_item_qty
+ ,wr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 wr_dev
+ ,(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 average
+ from sr_items
+ ,cr_items
+ ,wr_items
+ where sr_items.item_id=cr_items.item_id
+ and sr_items.item_id=wr_items.item_id
+ order by sr_items.item_id
+ ,sr_item_qty
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q84.sql b/benchmarks/queries/tpcds/q84.sql
new file mode 100644
index 0000000000..a07249b463
--- /dev/null
+++ b/benchmarks/queries/tpcds/q84.sql
@@ -0,0 +1,22 @@
+-- SQLBench-DS query 84 derived from TPC-DS query 84 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select c_customer_id as customer_id
+ , coalesce(c_last_name,'') || ', ' || coalesce(c_first_name,'') as customername
+ from customer
+ ,customer_address
+ ,customer_demographics
+ ,household_demographics
+ ,income_band
+ ,store_returns
+ where ca_city = 'White Oak'
+ and c_current_addr_sk = ca_address_sk
+ and ib_lower_bound >= 45626
+ and ib_upper_bound <= 45626 + 50000
+ and ib_income_band_sk = hd_income_band_sk
+ and cd_demo_sk = c_current_cdemo_sk
+ and hd_demo_sk = c_current_hdemo_sk
+ and sr_cdemo_sk = cd_demo_sk
+ order by c_customer_id
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q85.sql b/benchmarks/queries/tpcds/q85.sql
new file mode 100644
index 0000000000..c529acfe9e
--- /dev/null
+++ b/benchmarks/queries/tpcds/q85.sql
@@ -0,0 +1,85 @@
+-- SQLBench-DS query 85 derived from TPC-DS query 85 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select substr(r_reason_desc,1,20)
+ ,avg(ws_quantity)
+ ,avg(wr_refunded_cash)
+ ,avg(wr_fee)
+ from web_sales, web_returns, web_page, customer_demographics cd1,
+ customer_demographics cd2, customer_address, date_dim, reason
+ where ws_web_page_sk = wp_web_page_sk
+ and ws_item_sk = wr_item_sk
+ and ws_order_number = wr_order_number
+ and ws_sold_date_sk = d_date_sk and d_year = 2001
+ and cd1.cd_demo_sk = wr_refunded_cdemo_sk
+ and cd2.cd_demo_sk = wr_returning_cdemo_sk
+ and ca_address_sk = wr_refunded_addr_sk
+ and r_reason_sk = wr_reason_sk
+ and
+ (
+ (
+ cd1.cd_marital_status = 'D'
+ and
+ cd1.cd_marital_status = cd2.cd_marital_status
+ and
+ cd1.cd_education_status = 'Primary'
+ and
+ cd1.cd_education_status = cd2.cd_education_status
+ and
+ ws_sales_price between 100.00 and 150.00
+ )
+ or
+ (
+ cd1.cd_marital_status = 'U'
+ and
+ cd1.cd_marital_status = cd2.cd_marital_status
+ and
+ cd1.cd_education_status = 'Unknown'
+ and
+ cd1.cd_education_status = cd2.cd_education_status
+ and
+ ws_sales_price between 50.00 and 100.00
+ )
+ or
+ (
+ cd1.cd_marital_status = 'M'
+ and
+ cd1.cd_marital_status = cd2.cd_marital_status
+ and
+ cd1.cd_education_status = 'Advanced Degree'
+ and
+ cd1.cd_education_status = cd2.cd_education_status
+ and
+ ws_sales_price between 150.00 and 200.00
+ )
+ )
+ and
+ (
+ (
+ ca_country = 'United States'
+ and
+ ca_state in ('SC', 'IN', 'VA')
+ and ws_net_profit between 100 and 200
+ )
+ or
+ (
+ ca_country = 'United States'
+ and
+ ca_state in ('WA', 'KS', 'KY')
+ and ws_net_profit between 150 and 300
+ )
+ or
+ (
+ ca_country = 'United States'
+ and
+ ca_state in ('SD', 'WI', 'NE')
+ and ws_net_profit between 50 and 250
+ )
+ )
+group by r_reason_desc
+order by substr(r_reason_desc,1,20)
+ ,avg(ws_quantity)
+ ,avg(wr_refunded_cash)
+ ,avg(wr_fee)
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q86.sql b/benchmarks/queries/tpcds/q86.sql
new file mode 100644
index 0000000000..ed7f4f85d0
--- /dev/null
+++ b/benchmarks/queries/tpcds/q86.sql
@@ -0,0 +1,27 @@
+-- SQLBench-DS query 86 derived from TPC-DS query 86 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ sum(ws_net_paid) as total_sum
+ ,i_category
+ ,i_class
+ ,grouping(i_category)+grouping(i_class) as lochierarchy
+ ,rank() over (
+ partition by grouping(i_category)+grouping(i_class),
+ case when grouping(i_class) = 0 then i_category end
+ order by sum(ws_net_paid) desc) as rank_within_parent
+ from
+ web_sales
+ ,date_dim d1
+ ,item
+ where
+ d1.d_month_seq between 1205 and 1205+11
+ and d1.d_date_sk = ws_sold_date_sk
+ and i_item_sk = ws_item_sk
+ group by rollup(i_category,i_class)
+ order by
+ lochierarchy desc,
+ case when lochierarchy = 0 then i_category end,
+ rank_within_parent
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q87.sql b/benchmarks/queries/tpcds/q87.sql
new file mode 100644
index 0000000000..13e2d8e2e7
--- /dev/null
+++ b/benchmarks/queries/tpcds/q87.sql
@@ -0,0 +1,24 @@
+-- SQLBench-DS query 87 derived from TPC-DS query 87 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select count(*)
+from ((select distinct c_last_name, c_first_name, d_date
+ from store_sales, date_dim, customer
+ where store_sales.ss_sold_date_sk = date_dim.d_date_sk
+ and store_sales.ss_customer_sk = customer.c_customer_sk
+ and d_month_seq between 1189 and 1189+11)
+ except
+ (select distinct c_last_name, c_first_name, d_date
+ from catalog_sales, date_dim, customer
+ where catalog_sales.cs_sold_date_sk = date_dim.d_date_sk
+ and catalog_sales.cs_bill_customer_sk = customer.c_customer_sk
+ and d_month_seq between 1189 and 1189+11)
+ except
+ (select distinct c_last_name, c_first_name, d_date
+ from web_sales, date_dim, customer
+ where web_sales.ws_sold_date_sk = date_dim.d_date_sk
+ and web_sales.ws_bill_customer_sk = customer.c_customer_sk
+ and d_month_seq between 1189 and 1189+11)
+) cool_cust
+;
+
diff --git a/benchmarks/queries/tpcds/q88.sql b/benchmarks/queries/tpcds/q88.sql
new file mode 100644
index 0000000000..8d47334a4e
--- /dev/null
+++ b/benchmarks/queries/tpcds/q88.sql
@@ -0,0 +1,95 @@
+-- SQLBench-DS query 88 derived from TPC-DS query 88 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select *
+from
+ (select count(*) h8_30_to_9
+ from store_sales, household_demographics , time_dim, store
+ where ss_sold_time_sk = time_dim.t_time_sk
+ and ss_hdemo_sk = household_demographics.hd_demo_sk
+ and ss_store_sk = s_store_sk
+ and time_dim.t_hour = 8
+ and time_dim.t_minute >= 30
+ and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or
+ (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or
+ (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2))
+ and store.s_store_name = 'ese') s1,
+ (select count(*) h9_to_9_30
+ from store_sales, household_demographics , time_dim, store
+ where ss_sold_time_sk = time_dim.t_time_sk
+ and ss_hdemo_sk = household_demographics.hd_demo_sk
+ and ss_store_sk = s_store_sk
+ and time_dim.t_hour = 9
+ and time_dim.t_minute < 30
+ and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or
+ (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or
+ (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2))
+ and store.s_store_name = 'ese') s2,
+ (select count(*) h9_30_to_10
+ from store_sales, household_demographics , time_dim, store
+ where ss_sold_time_sk = time_dim.t_time_sk
+ and ss_hdemo_sk = household_demographics.hd_demo_sk
+ and ss_store_sk = s_store_sk
+ and time_dim.t_hour = 9
+ and time_dim.t_minute >= 30
+ and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or
+ (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or
+ (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2))
+ and store.s_store_name = 'ese') s3,
+ (select count(*) h10_to_10_30
+ from store_sales, household_demographics , time_dim, store
+ where ss_sold_time_sk = time_dim.t_time_sk
+ and ss_hdemo_sk = household_demographics.hd_demo_sk
+ and ss_store_sk = s_store_sk
+ and time_dim.t_hour = 10
+ and time_dim.t_minute < 30
+ and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or
+ (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or
+ (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2))
+ and store.s_store_name = 'ese') s4,
+ (select count(*) h10_30_to_11
+ from store_sales, household_demographics , time_dim, store
+ where ss_sold_time_sk = time_dim.t_time_sk
+ and ss_hdemo_sk = household_demographics.hd_demo_sk
+ and ss_store_sk = s_store_sk
+ and time_dim.t_hour = 10
+ and time_dim.t_minute >= 30
+ and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or
+ (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or
+ (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2))
+ and store.s_store_name = 'ese') s5,
+ (select count(*) h11_to_11_30
+ from store_sales, household_demographics , time_dim, store
+ where ss_sold_time_sk = time_dim.t_time_sk
+ and ss_hdemo_sk = household_demographics.hd_demo_sk
+ and ss_store_sk = s_store_sk
+ and time_dim.t_hour = 11
+ and time_dim.t_minute < 30
+ and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or
+ (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or
+ (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2))
+ and store.s_store_name = 'ese') s6,
+ (select count(*) h11_30_to_12
+ from store_sales, household_demographics , time_dim, store
+ where ss_sold_time_sk = time_dim.t_time_sk
+ and ss_hdemo_sk = household_demographics.hd_demo_sk
+ and ss_store_sk = s_store_sk
+ and time_dim.t_hour = 11
+ and time_dim.t_minute >= 30
+ and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or
+ (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or
+ (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2))
+ and store.s_store_name = 'ese') s7,
+ (select count(*) h12_to_12_30
+ from store_sales, household_demographics , time_dim, store
+ where ss_sold_time_sk = time_dim.t_time_sk
+ and ss_hdemo_sk = household_demographics.hd_demo_sk
+ and ss_store_sk = s_store_sk
+ and time_dim.t_hour = 12
+ and time_dim.t_minute < 30
+ and ((household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or
+ (household_demographics.hd_dep_count = 1 and household_demographics.hd_vehicle_count<=1+2) or
+ (household_demographics.hd_dep_count = 4 and household_demographics.hd_vehicle_count<=4+2))
+ and store.s_store_name = 'ese') s8
+;
+
diff --git a/benchmarks/queries/tpcds/q89.sql b/benchmarks/queries/tpcds/q89.sql
new file mode 100644
index 0000000000..ac02b6fe33
--- /dev/null
+++ b/benchmarks/queries/tpcds/q89.sql
@@ -0,0 +1,29 @@
+-- SQLBench-DS query 89 derived from TPC-DS query 89 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select *
+from(
+select i_category, i_class, i_brand,
+ s_store_name, s_company_name,
+ d_moy,
+ sum(ss_sales_price) sum_sales,
+ avg(sum(ss_sales_price)) over
+ (partition by i_category, i_brand, s_store_name, s_company_name)
+ avg_monthly_sales
+from item, store_sales, date_dim, store
+where ss_item_sk = i_item_sk and
+ ss_sold_date_sk = d_date_sk and
+ ss_store_sk = s_store_sk and
+ d_year in (2001) and
+ ((i_category in ('Children','Jewelry','Home') and
+ i_class in ('infants','birdal','flatware')
+ )
+ or (i_category in ('Electronics','Music','Books') and
+ i_class in ('audio','classical','science')
+ ))
+group by i_category, i_class, i_brand,
+ s_store_name, s_company_name, d_moy) tmp1
+where case when (avg_monthly_sales <> 0) then (abs(sum_sales - avg_monthly_sales) / avg_monthly_sales) else null end > 0.1
+order by sum_sales - avg_monthly_sales, s_store_name
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q9.sql b/benchmarks/queries/tpcds/q9.sql
new file mode 100644
index 0000000000..cf723ccf29
--- /dev/null
+++ b/benchmarks/queries/tpcds/q9.sql
@@ -0,0 +1,52 @@
+-- SQLBench-DS query 9 derived from TPC-DS query 9 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select case when (select count(*)
+ from store_sales
+ where ss_quantity between 1 and 20) > 31002
+ then (select avg(ss_ext_discount_amt)
+ from store_sales
+ where ss_quantity between 1 and 20)
+ else (select avg(ss_net_profit)
+ from store_sales
+ where ss_quantity between 1 and 20) end bucket1 ,
+ case when (select count(*)
+ from store_sales
+ where ss_quantity between 21 and 40) > 588
+ then (select avg(ss_ext_discount_amt)
+ from store_sales
+ where ss_quantity between 21 and 40)
+ else (select avg(ss_net_profit)
+ from store_sales
+ where ss_quantity between 21 and 40) end bucket2,
+ case when (select count(*)
+ from store_sales
+ where ss_quantity between 41 and 60) > 2456
+ then (select avg(ss_ext_discount_amt)
+ from store_sales
+ where ss_quantity between 41 and 60)
+ else (select avg(ss_net_profit)
+ from store_sales
+ where ss_quantity between 41 and 60) end bucket3,
+ case when (select count(*)
+ from store_sales
+ where ss_quantity between 61 and 80) > 21645
+ then (select avg(ss_ext_discount_amt)
+ from store_sales
+ where ss_quantity between 61 and 80)
+ else (select avg(ss_net_profit)
+ from store_sales
+ where ss_quantity between 61 and 80) end bucket4,
+ case when (select count(*)
+ from store_sales
+ where ss_quantity between 81 and 100) > 20553
+ then (select avg(ss_ext_discount_amt)
+ from store_sales
+ where ss_quantity between 81 and 100)
+ else (select avg(ss_net_profit)
+ from store_sales
+ where ss_quantity between 81 and 100) end bucket5
+from reason
+where r_reason_sk = 1
+;
+
diff --git a/benchmarks/queries/tpcds/q90.sql b/benchmarks/queries/tpcds/q90.sql
new file mode 100644
index 0000000000..dedf5fd066
--- /dev/null
+++ b/benchmarks/queries/tpcds/q90.sql
@@ -0,0 +1,23 @@
+-- SQLBench-DS query 90 derived from TPC-DS query 90 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select cast(amc as decimal(15,4))/cast(pmc as decimal(15,4)) am_pm_ratio
+ from ( select count(*) amc
+ from web_sales, household_demographics , time_dim, web_page
+ where ws_sold_time_sk = time_dim.t_time_sk
+ and ws_ship_hdemo_sk = household_demographics.hd_demo_sk
+ and ws_web_page_sk = web_page.wp_web_page_sk
+ and time_dim.t_hour between 9 and 9+1
+ and household_demographics.hd_dep_count = 2
+ and web_page.wp_char_count between 5000 and 5200) at,
+ ( select count(*) pmc
+ from web_sales, household_demographics , time_dim, web_page
+ where ws_sold_time_sk = time_dim.t_time_sk
+ and ws_ship_hdemo_sk = household_demographics.hd_demo_sk
+ and ws_web_page_sk = web_page.wp_web_page_sk
+ and time_dim.t_hour between 15 and 15+1
+ and household_demographics.hd_dep_count = 2
+ and web_page.wp_char_count between 5000 and 5200) pt
+ order by am_pm_ratio
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q91.sql b/benchmarks/queries/tpcds/q91.sql
new file mode 100644
index 0000000000..894d41bb2b
--- /dev/null
+++ b/benchmarks/queries/tpcds/q91.sql
@@ -0,0 +1,32 @@
+-- SQLBench-DS query 91 derived from TPC-DS query 91 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ cc_call_center_id Call_Center,
+ cc_name Call_Center_Name,
+ cc_manager Manager,
+ sum(cr_net_loss) Returns_Loss
+from
+ call_center,
+ catalog_returns,
+ date_dim,
+ customer,
+ customer_address,
+ customer_demographics,
+ household_demographics
+where
+ cr_call_center_sk = cc_call_center_sk
+and cr_returned_date_sk = d_date_sk
+and cr_returning_customer_sk= c_customer_sk
+and cd_demo_sk = c_current_cdemo_sk
+and hd_demo_sk = c_current_hdemo_sk
+and ca_address_sk = c_current_addr_sk
+and d_year = 2002
+and d_moy = 11
+and ( (cd_marital_status = 'M' and cd_education_status = 'Unknown')
+ or(cd_marital_status = 'W' and cd_education_status = 'Advanced Degree'))
+and hd_buy_potential like 'Unknown%'
+and ca_gmt_offset = -6
+group by cc_call_center_id,cc_name,cc_manager,cd_marital_status,cd_education_status
+order by sum(cr_net_loss) desc;
+
diff --git a/benchmarks/queries/tpcds/q92.sql b/benchmarks/queries/tpcds/q92.sql
new file mode 100644
index 0000000000..171a968627
--- /dev/null
+++ b/benchmarks/queries/tpcds/q92.sql
@@ -0,0 +1,31 @@
+-- SQLBench-DS query 92 derived from TPC-DS query 92 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ sum(ws_ext_discount_amt) as `Excess Discount Amount`
+from
+ web_sales
+ ,item
+ ,date_dim
+where
+i_manufact_id = 914
+and i_item_sk = ws_item_sk
+and d_date between '2001-01-25' and
+ (cast('2001-01-25' as date) + INTERVAL '90 DAYS')
+and d_date_sk = ws_sold_date_sk
+and ws_ext_discount_amt
+ > (
+ SELECT
+ 1.3 * avg(ws_ext_discount_amt)
+ FROM
+ web_sales
+ ,date_dim
+ WHERE
+ ws_item_sk = i_item_sk
+ and d_date between '2001-01-25' and
+ (cast('2001-01-25' as date) + INTERVAL '90 DAYS')
+ and d_date_sk = ws_sold_date_sk
+ )
+order by sum(ws_ext_discount_amt)
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q93.sql b/benchmarks/queries/tpcds/q93.sql
new file mode 100644
index 0000000000..31ec9e7d4e
--- /dev/null
+++ b/benchmarks/queries/tpcds/q93.sql
@@ -0,0 +1,19 @@
+-- SQLBench-DS query 93 derived from TPC-DS query 93 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select ss_customer_sk
+ ,sum(act_sales) sumsales
+ from (select ss_item_sk
+ ,ss_ticket_number
+ ,ss_customer_sk
+ ,case when sr_return_quantity is not null then (ss_quantity-sr_return_quantity)*ss_sales_price
+ else (ss_quantity*ss_sales_price) end act_sales
+ from store_sales left outer join store_returns on (sr_item_sk = ss_item_sk
+ and sr_ticket_number = ss_ticket_number)
+ ,reason
+ where sr_reason_sk = r_reason_sk
+ and r_reason_desc = 'Did not get it on time') t
+ group by ss_customer_sk
+ order by sumsales, ss_customer_sk
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q94.sql b/benchmarks/queries/tpcds/q94.sql
new file mode 100644
index 0000000000..cf04e14e0d
--- /dev/null
+++ b/benchmarks/queries/tpcds/q94.sql
@@ -0,0 +1,30 @@
+-- SQLBench-DS query 94 derived from TPC-DS query 94 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ count(distinct ws_order_number) as `order count`
+ ,sum(ws_ext_ship_cost) as `total shipping cost`
+ ,sum(ws_net_profit) as `total net profit`
+from
+ web_sales ws1
+ ,date_dim
+ ,customer_address
+ ,web_site
+where
+ d_date between '1999-4-01' and
+ (cast('1999-4-01' as date) + INTERVAL '60 DAYS')
+and ws1.ws_ship_date_sk = d_date_sk
+and ws1.ws_ship_addr_sk = ca_address_sk
+and ca_state = 'WI'
+and ws1.ws_web_site_sk = web_site_sk
+and web_company_name = 'pri'
+and exists (select *
+ from web_sales ws2
+ where ws1.ws_order_number = ws2.ws_order_number
+ and ws1.ws_warehouse_sk <> ws2.ws_warehouse_sk)
+and not exists(select *
+ from web_returns wr1
+ where ws1.ws_order_number = wr1.wr_order_number)
+order by count(distinct ws_order_number)
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q95.sql b/benchmarks/queries/tpcds/q95.sql
new file mode 100644
index 0000000000..2db3e50327
--- /dev/null
+++ b/benchmarks/queries/tpcds/q95.sql
@@ -0,0 +1,33 @@
+-- SQLBench-DS query 95 derived from TPC-DS query 95 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with ws_wh as
+(select ws1.ws_order_number,ws1.ws_warehouse_sk wh1,ws2.ws_warehouse_sk wh2
+ from web_sales ws1,web_sales ws2
+ where ws1.ws_order_number = ws2.ws_order_number
+ and ws1.ws_warehouse_sk <> ws2.ws_warehouse_sk)
+ select
+ count(distinct ws_order_number) as `order count`
+ ,sum(ws_ext_ship_cost) as `total shipping cost`
+ ,sum(ws_net_profit) as `total net profit`
+from
+ web_sales ws1
+ ,date_dim
+ ,customer_address
+ ,web_site
+where
+ d_date between '2002-5-01' and
+ (cast('2002-5-01' as date) + INTERVAL '60 DAYS')
+and ws1.ws_ship_date_sk = d_date_sk
+and ws1.ws_ship_addr_sk = ca_address_sk
+and ca_state = 'MA'
+and ws1.ws_web_site_sk = web_site_sk
+and web_company_name = 'pri'
+and ws1.ws_order_number in (select ws_order_number
+ from ws_wh)
+and ws1.ws_order_number in (select wr_order_number
+ from web_returns,ws_wh
+ where wr_order_number = ws_wh.ws_order_number)
+order by count(distinct ws_order_number)
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q96.sql b/benchmarks/queries/tpcds/q96.sql
new file mode 100644
index 0000000000..63c6fdbf97
--- /dev/null
+++ b/benchmarks/queries/tpcds/q96.sql
@@ -0,0 +1,17 @@
+-- SQLBench-DS query 96 derived from TPC-DS query 96 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select count(*)
+from store_sales
+ ,household_demographics
+ ,time_dim, store
+where ss_sold_time_sk = time_dim.t_time_sk
+ and ss_hdemo_sk = household_demographics.hd_demo_sk
+ and ss_store_sk = s_store_sk
+ and time_dim.t_hour = 8
+ and time_dim.t_minute >= 30
+ and household_demographics.hd_dep_count = 5
+ and store.s_store_name = 'ese'
+order by count(*)
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q97.sql b/benchmarks/queries/tpcds/q97.sql
new file mode 100644
index 0000000000..5741cc9c56
--- /dev/null
+++ b/benchmarks/queries/tpcds/q97.sql
@@ -0,0 +1,26 @@
+-- SQLBench-DS query 97 derived from TPC-DS query 97 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+with ssci as (
+select ss_customer_sk customer_sk
+ ,ss_item_sk item_sk
+from store_sales,date_dim
+where ss_sold_date_sk = d_date_sk
+ and d_month_seq between 1211 and 1211 + 11
+group by ss_customer_sk
+ ,ss_item_sk),
+csci as(
+ select cs_bill_customer_sk customer_sk
+ ,cs_item_sk item_sk
+from catalog_sales,date_dim
+where cs_sold_date_sk = d_date_sk
+ and d_month_seq between 1211 and 1211 + 11
+group by cs_bill_customer_sk
+ ,cs_item_sk)
+ select sum(case when ssci.customer_sk is not null and csci.customer_sk is null then 1 else 0 end) store_only
+ ,sum(case when ssci.customer_sk is null and csci.customer_sk is not null then 1 else 0 end) catalog_only
+ ,sum(case when ssci.customer_sk is not null and csci.customer_sk is not null then 1 else 0 end) store_and_catalog
+from ssci full outer join csci on (ssci.customer_sk=csci.customer_sk
+ and ssci.item_sk = csci.item_sk)
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpcds/q98.sql b/benchmarks/queries/tpcds/q98.sql
new file mode 100644
index 0000000000..86bf08b2a1
--- /dev/null
+++ b/benchmarks/queries/tpcds/q98.sql
@@ -0,0 +1,34 @@
+-- SQLBench-DS query 98 derived from TPC-DS query 98 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select i_item_id
+ ,i_item_desc
+ ,i_category
+ ,i_class
+ ,i_current_price
+ ,sum(ss_ext_sales_price) as itemrevenue
+ ,sum(ss_ext_sales_price)*100/sum(sum(ss_ext_sales_price)) over
+ (partition by i_class) as revenueratio
+from
+ store_sales
+ ,item
+ ,date_dim
+where
+ ss_item_sk = i_item_sk
+ and i_category in ('Shoes', 'Music', 'Men')
+ and ss_sold_date_sk = d_date_sk
+ and d_date between cast('2000-01-05' as date)
+ and (cast('2000-01-05' as date) + INTERVAL '30 DAYS')
+group by
+ i_item_id
+ ,i_item_desc
+ ,i_category
+ ,i_class
+ ,i_current_price
+order by
+ i_category
+ ,i_class
+ ,i_item_id
+ ,i_item_desc
+ ,revenueratio;
+
diff --git a/benchmarks/queries/tpcds/q99.sql b/benchmarks/queries/tpcds/q99.sql
new file mode 100644
index 0000000000..8bd1f6406d
--- /dev/null
+++ b/benchmarks/queries/tpcds/q99.sql
@@ -0,0 +1,36 @@
+-- SQLBench-DS query 99 derived from TPC-DS query 99 under the terms of the TPC Fair Use Policy.
+-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
+-- This query was generated at scale factor 1.
+select
+ substr(w_warehouse_name,1,20)
+ ,sm_type
+ ,cc_name
+ ,sum(case when (cs_ship_date_sk - cs_sold_date_sk <= 30 ) then 1 else 0 end) as `30 days`
+ ,sum(case when (cs_ship_date_sk - cs_sold_date_sk > 30) and
+ (cs_ship_date_sk - cs_sold_date_sk <= 60) then 1 else 0 end ) as `31-60 days`
+ ,sum(case when (cs_ship_date_sk - cs_sold_date_sk > 60) and
+ (cs_ship_date_sk - cs_sold_date_sk <= 90) then 1 else 0 end) as `61-90 days`
+ ,sum(case when (cs_ship_date_sk - cs_sold_date_sk > 90) and
+ (cs_ship_date_sk - cs_sold_date_sk <= 120) then 1 else 0 end) as `91-120 days`
+ ,sum(case when (cs_ship_date_sk - cs_sold_date_sk > 120) then 1 else 0 end) as `>120 days`
+from
+ catalog_sales
+ ,warehouse
+ ,ship_mode
+ ,call_center
+ ,date_dim
+where
+ d_month_seq between 1188 and 1188 + 11
+and cs_ship_date_sk = d_date_sk
+and cs_warehouse_sk = w_warehouse_sk
+and cs_ship_mode_sk = sm_ship_mode_sk
+and cs_call_center_sk = cc_call_center_sk
+group by
+ substr(w_warehouse_name,1,20)
+ ,sm_type
+ ,cc_name
+order by substr(w_warehouse_name,1,20)
+ ,sm_type
+ ,cc_name
+ LIMIT 100;
+
diff --git a/benchmarks/queries/tpch/q1.sql b/benchmarks/queries/tpch/q1.sql
new file mode 100644
index 0000000000..0dc4c3e598
--- /dev/null
+++ b/benchmarks/queries/tpch/q1.sql
@@ -0,0 +1,23 @@
+-- SQLBench-H query 1 derived from TPC-H query 1 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ l_returnflag,
+ l_linestatus,
+ sum(l_quantity) as sum_qty,
+ sum(l_extendedprice) as sum_base_price,
+ sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
+ sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
+ avg(l_quantity) as avg_qty,
+ avg(l_extendedprice) as avg_price,
+ avg(l_discount) as avg_disc,
+ count(*) as count_order
+from
+ lineitem
+where
+ l_shipdate <= date '1998-12-01' - interval '68 days'
+group by
+ l_returnflag,
+ l_linestatus
+order by
+ l_returnflag,
+ l_linestatus;
diff --git a/benchmarks/queries/tpch/q10.sql b/benchmarks/queries/tpch/q10.sql
new file mode 100644
index 0000000000..576338f044
--- /dev/null
+++ b/benchmarks/queries/tpch/q10.sql
@@ -0,0 +1,33 @@
+-- SQLBench-H query 10 derived from TPC-H query 10 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ c_custkey,
+ c_name,
+ sum(l_extendedprice * (1 - l_discount)) as revenue,
+ c_acctbal,
+ n_name,
+ c_address,
+ c_phone,
+ c_comment
+from
+ customer,
+ orders,
+ lineitem,
+ nation
+where
+ c_custkey = o_custkey
+ and l_orderkey = o_orderkey
+ and o_orderdate >= date '1993-07-01'
+ and o_orderdate < date '1993-07-01' + interval '3' month
+ and l_returnflag = 'R'
+ and c_nationkey = n_nationkey
+group by
+ c_custkey,
+ c_name,
+ c_acctbal,
+ c_phone,
+ n_name,
+ c_address,
+ c_comment
+order by
+ revenue desc limit 20;
diff --git a/benchmarks/queries/tpch/q11.sql b/benchmarks/queries/tpch/q11.sql
new file mode 100644
index 0000000000..f4ead457b7
--- /dev/null
+++ b/benchmarks/queries/tpch/q11.sql
@@ -0,0 +1,29 @@
+-- SQLBench-H query 11 derived from TPC-H query 11 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ ps_partkey,
+ sum(ps_supplycost * ps_availqty) as value
+from
+ partsupp,
+ supplier,
+ nation
+where
+ ps_suppkey = s_suppkey
+ and s_nationkey = n_nationkey
+ and n_name = 'ALGERIA'
+group by
+ ps_partkey having
+ sum(ps_supplycost * ps_availqty) > (
+ select
+ sum(ps_supplycost * ps_availqty) * 0.0001000000
+ from
+ partsupp,
+ supplier,
+ nation
+ where
+ ps_suppkey = s_suppkey
+ and s_nationkey = n_nationkey
+ and n_name = 'ALGERIA'
+ )
+order by
+ value desc;
diff --git a/benchmarks/queries/tpch/q12.sql b/benchmarks/queries/tpch/q12.sql
new file mode 100644
index 0000000000..4ab4ea6e3b
--- /dev/null
+++ b/benchmarks/queries/tpch/q12.sql
@@ -0,0 +1,30 @@
+-- SQLBench-H query 12 derived from TPC-H query 12 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ l_shipmode,
+ sum(case
+ when o_orderpriority = '1-URGENT'
+ or o_orderpriority = '2-HIGH'
+ then 1
+ else 0
+ end) as high_line_count,
+ sum(case
+ when o_orderpriority <> '1-URGENT'
+ and o_orderpriority <> '2-HIGH'
+ then 1
+ else 0
+ end) as low_line_count
+from
+ orders,
+ lineitem
+where
+ o_orderkey = l_orderkey
+ and l_shipmode in ('FOB', 'SHIP')
+ and l_commitdate < l_receiptdate
+ and l_shipdate < l_commitdate
+ and l_receiptdate >= date '1995-01-01'
+ and l_receiptdate < date '1995-01-01' + interval '1' year
+group by
+ l_shipmode
+order by
+ l_shipmode;
diff --git a/benchmarks/queries/tpch/q13.sql b/benchmarks/queries/tpch/q13.sql
new file mode 100644
index 0000000000..301e35d193
--- /dev/null
+++ b/benchmarks/queries/tpch/q13.sql
@@ -0,0 +1,22 @@
+-- SQLBench-H query 13 derived from TPC-H query 13 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ c_count,
+ count(*) as custdist
+from
+ (
+ select
+ c_custkey,
+ count(o_orderkey)
+ from
+ customer left outer join orders on
+ c_custkey = o_custkey
+ and o_comment not like '%express%requests%'
+ group by
+ c_custkey
+ ) as c_orders (c_custkey, c_count)
+group by
+ c_count
+order by
+ custdist desc,
+ c_count desc;
diff --git a/benchmarks/queries/tpch/q14.sql b/benchmarks/queries/tpch/q14.sql
new file mode 100644
index 0000000000..6040ac734c
--- /dev/null
+++ b/benchmarks/queries/tpch/q14.sql
@@ -0,0 +1,15 @@
+-- SQLBench-H query 14 derived from TPC-H query 14 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ 100.00 * sum(case
+ when p_type like 'PROMO%'
+ then l_extendedprice * (1 - l_discount)
+ else 0
+ end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue
+from
+ lineitem,
+ part
+where
+ l_partkey = p_partkey
+ and l_shipdate >= date '1995-02-01'
+ and l_shipdate < date '1995-02-01' + interval '1' month;
diff --git a/benchmarks/queries/tpch/q15.sql b/benchmarks/queries/tpch/q15.sql
new file mode 100644
index 0000000000..0fe03a79c0
--- /dev/null
+++ b/benchmarks/queries/tpch/q15.sql
@@ -0,0 +1,33 @@
+-- SQLBench-H query 15 derived from TPC-H query 15 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+create view revenue0 (supplier_no, total_revenue) as
+ select
+ l_suppkey,
+ sum(l_extendedprice * (1 - l_discount))
+ from
+ lineitem
+ where
+ l_shipdate >= date '1996-08-01'
+ and l_shipdate < date '1996-08-01' + interval '3' month
+ group by
+ l_suppkey;
+select
+ s_suppkey,
+ s_name,
+ s_address,
+ s_phone,
+ total_revenue
+from
+ supplier,
+ revenue0
+where
+ s_suppkey = supplier_no
+ and total_revenue = (
+ select
+ max(total_revenue)
+ from
+ revenue0
+ )
+order by
+ s_suppkey;
+drop view revenue0;
diff --git a/benchmarks/queries/tpch/q16.sql b/benchmarks/queries/tpch/q16.sql
new file mode 100644
index 0000000000..7fdf36522a
--- /dev/null
+++ b/benchmarks/queries/tpch/q16.sql
@@ -0,0 +1,32 @@
+-- SQLBench-H query 16 derived from TPC-H query 16 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ p_brand,
+ p_type,
+ p_size,
+ count(distinct ps_suppkey) as supplier_cnt
+from
+ partsupp,
+ part
+where
+ p_partkey = ps_partkey
+ and p_brand <> 'Brand#14'
+ and p_type not like 'SMALL PLATED%'
+ and p_size in (14, 6, 5, 31, 49, 15, 41, 47)
+ and ps_suppkey not in (
+ select
+ s_suppkey
+ from
+ supplier
+ where
+ s_comment like '%Customer%Complaints%'
+ )
+group by
+ p_brand,
+ p_type,
+ p_size
+order by
+ supplier_cnt desc,
+ p_brand,
+ p_type,
+ p_size;
diff --git a/benchmarks/queries/tpch/q17.sql b/benchmarks/queries/tpch/q17.sql
new file mode 100644
index 0000000000..ffa0f15c8a
--- /dev/null
+++ b/benchmarks/queries/tpch/q17.sql
@@ -0,0 +1,19 @@
+-- SQLBench-H query 17 derived from TPC-H query 17 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ sum(l_extendedprice) / 7.0 as avg_yearly
+from
+ lineitem,
+ part
+where
+ p_partkey = l_partkey
+ and p_brand = 'Brand#42'
+ and p_container = 'LG BAG'
+ and l_quantity < (
+ select
+ 0.2 * avg(l_quantity)
+ from
+ lineitem
+ where
+ l_partkey = p_partkey
+ );
diff --git a/benchmarks/queries/tpch/q18.sql b/benchmarks/queries/tpch/q18.sql
new file mode 100644
index 0000000000..f4ab1945e7
--- /dev/null
+++ b/benchmarks/queries/tpch/q18.sql
@@ -0,0 +1,34 @@
+-- SQLBench-H query 18 derived from TPC-H query 18 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ c_name,
+ c_custkey,
+ o_orderkey,
+ o_orderdate,
+ o_totalprice,
+ sum(l_quantity)
+from
+ customer,
+ orders,
+ lineitem
+where
+ o_orderkey in (
+ select
+ l_orderkey
+ from
+ lineitem
+ group by
+ l_orderkey having
+ sum(l_quantity) > 313
+ )
+ and c_custkey = o_custkey
+ and o_orderkey = l_orderkey
+group by
+ c_name,
+ c_custkey,
+ o_orderkey,
+ o_orderdate,
+ o_totalprice
+order by
+ o_totalprice desc,
+ o_orderdate limit 100;
diff --git a/benchmarks/queries/tpch/q19.sql b/benchmarks/queries/tpch/q19.sql
new file mode 100644
index 0000000000..ad5fb7d929
--- /dev/null
+++ b/benchmarks/queries/tpch/q19.sql
@@ -0,0 +1,37 @@
+-- SQLBench-H query 19 derived from TPC-H query 19 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ sum(l_extendedprice* (1 - l_discount)) as revenue
+from
+ lineitem,
+ part
+where
+ (
+ p_partkey = l_partkey
+ and p_brand = 'Brand#21'
+ and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
+ and l_quantity >= 8 and l_quantity <= 8 + 10
+ and p_size between 1 and 5
+ and l_shipmode in ('AIR', 'AIR REG')
+ and l_shipinstruct = 'DELIVER IN PERSON'
+ )
+ or
+ (
+ p_partkey = l_partkey
+ and p_brand = 'Brand#13'
+ and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
+ and l_quantity >= 20 and l_quantity <= 20 + 10
+ and p_size between 1 and 10
+ and l_shipmode in ('AIR', 'AIR REG')
+ and l_shipinstruct = 'DELIVER IN PERSON'
+ )
+ or
+ (
+ p_partkey = l_partkey
+ and p_brand = 'Brand#52'
+ and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
+ and l_quantity >= 30 and l_quantity <= 30 + 10
+ and p_size between 1 and 15
+ and l_shipmode in ('AIR', 'AIR REG')
+ and l_shipinstruct = 'DELIVER IN PERSON'
+ );
diff --git a/benchmarks/queries/tpch/q2.sql b/benchmarks/queries/tpch/q2.sql
new file mode 100644
index 0000000000..2936532889
--- /dev/null
+++ b/benchmarks/queries/tpch/q2.sql
@@ -0,0 +1,45 @@
+-- SQLBench-H query 2 derived from TPC-H query 2 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ s_acctbal,
+ s_name,
+ n_name,
+ p_partkey,
+ p_mfgr,
+ s_address,
+ s_phone,
+ s_comment
+from
+ part,
+ supplier,
+ partsupp,
+ nation,
+ region
+where
+ p_partkey = ps_partkey
+ and s_suppkey = ps_suppkey
+ and p_size = 48
+ and p_type like '%TIN'
+ and s_nationkey = n_nationkey
+ and n_regionkey = r_regionkey
+ and r_name = 'ASIA'
+ and ps_supplycost = (
+ select
+ min(ps_supplycost)
+ from
+ partsupp,
+ supplier,
+ nation,
+ region
+ where
+ p_partkey = ps_partkey
+ and s_suppkey = ps_suppkey
+ and s_nationkey = n_nationkey
+ and n_regionkey = r_regionkey
+ and r_name = 'ASIA'
+ )
+order by
+ s_acctbal desc,
+ n_name,
+ s_name,
+ p_partkey limit 100;
diff --git a/benchmarks/queries/tpch/q20.sql b/benchmarks/queries/tpch/q20.sql
new file mode 100644
index 0000000000..3136ca302c
--- /dev/null
+++ b/benchmarks/queries/tpch/q20.sql
@@ -0,0 +1,39 @@
+-- SQLBench-H query 20 derived from TPC-H query 20 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ s_name,
+ s_address
+from
+ supplier,
+ nation
+where
+ s_suppkey in (
+ select
+ ps_suppkey
+ from
+ partsupp
+ where
+ ps_partkey in (
+ select
+ p_partkey
+ from
+ part
+ where
+ p_name like 'blanched%'
+ )
+ and ps_availqty > (
+ select
+ 0.5 * sum(l_quantity)
+ from
+ lineitem
+ where
+ l_partkey = ps_partkey
+ and l_suppkey = ps_suppkey
+ and l_shipdate >= date '1993-01-01'
+ and l_shipdate < date '1993-01-01' + interval '1' year
+ )
+ )
+ and s_nationkey = n_nationkey
+ and n_name = 'KENYA'
+order by
+ s_name;
diff --git a/benchmarks/queries/tpch/q21.sql b/benchmarks/queries/tpch/q21.sql
new file mode 100644
index 0000000000..01704697c4
--- /dev/null
+++ b/benchmarks/queries/tpch/q21.sql
@@ -0,0 +1,41 @@
+-- SQLBench-H query 21 derived from TPC-H query 21 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ s_name,
+ count(*) as numwait
+from
+ supplier,
+ lineitem l1,
+ orders,
+ nation
+where
+ s_suppkey = l1.l_suppkey
+ and o_orderkey = l1.l_orderkey
+ and o_orderstatus = 'F'
+ and l1.l_receiptdate > l1.l_commitdate
+ and exists (
+ select
+ *
+ from
+ lineitem l2
+ where
+ l2.l_orderkey = l1.l_orderkey
+ and l2.l_suppkey <> l1.l_suppkey
+ )
+ and not exists (
+ select
+ *
+ from
+ lineitem l3
+ where
+ l3.l_orderkey = l1.l_orderkey
+ and l3.l_suppkey <> l1.l_suppkey
+ and l3.l_receiptdate > l3.l_commitdate
+ )
+ and s_nationkey = n_nationkey
+ and n_name = 'ARGENTINA'
+group by
+ s_name
+order by
+ numwait desc,
+ s_name limit 100;
diff --git a/benchmarks/queries/tpch/q22.sql b/benchmarks/queries/tpch/q22.sql
new file mode 100644
index 0000000000..8d528ef6da
--- /dev/null
+++ b/benchmarks/queries/tpch/q22.sql
@@ -0,0 +1,39 @@
+-- SQLBench-H query 22 derived from TPC-H query 22 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ cntrycode,
+ count(*) as numcust,
+ sum(c_acctbal) as totacctbal
+from
+ (
+ select
+ substring(c_phone from 1 for 2) as cntrycode,
+ c_acctbal
+ from
+ customer
+ where
+ substring(c_phone from 1 for 2) in
+ ('24', '34', '16', '30', '33', '14', '13')
+ and c_acctbal > (
+ select
+ avg(c_acctbal)
+ from
+ customer
+ where
+ c_acctbal > 0.00
+ and substring(c_phone from 1 for 2) in
+ ('24', '34', '16', '30', '33', '14', '13')
+ )
+ and not exists (
+ select
+ *
+ from
+ orders
+ where
+ o_custkey = c_custkey
+ )
+ ) as custsale
+group by
+ cntrycode
+order by
+ cntrycode;
diff --git a/benchmarks/queries/tpch/q3.sql b/benchmarks/queries/tpch/q3.sql
new file mode 100644
index 0000000000..b60be7ff69
--- /dev/null
+++ b/benchmarks/queries/tpch/q3.sql
@@ -0,0 +1,24 @@
+-- SQLBench-H query 3 derived from TPC-H query 3 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ l_orderkey,
+ sum(l_extendedprice * (1 - l_discount)) as revenue,
+ o_orderdate,
+ o_shippriority
+from
+ customer,
+ orders,
+ lineitem
+where
+ c_mktsegment = 'BUILDING'
+ and c_custkey = o_custkey
+ and l_orderkey = o_orderkey
+ and o_orderdate < date '1995-03-15'
+ and l_shipdate > date '1995-03-15'
+group by
+ l_orderkey,
+ o_orderdate,
+ o_shippriority
+order by
+ revenue desc,
+ o_orderdate limit 10;
diff --git a/benchmarks/queries/tpch/q4.sql b/benchmarks/queries/tpch/q4.sql
new file mode 100644
index 0000000000..05fae97af9
--- /dev/null
+++ b/benchmarks/queries/tpch/q4.sql
@@ -0,0 +1,23 @@
+-- SQLBench-H query 4 derived from TPC-H query 4 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ o_orderpriority,
+ count(*) as order_count
+from
+ orders
+where
+ o_orderdate >= date '1995-04-01'
+ and o_orderdate < date '1995-04-01' + interval '3' month
+ and exists (
+ select
+ *
+ from
+ lineitem
+ where
+ l_orderkey = o_orderkey
+ and l_commitdate < l_receiptdate
+ )
+group by
+ o_orderpriority
+order by
+ o_orderpriority;
diff --git a/benchmarks/queries/tpch/q5.sql b/benchmarks/queries/tpch/q5.sql
new file mode 100644
index 0000000000..4b97ef0e48
--- /dev/null
+++ b/benchmarks/queries/tpch/q5.sql
@@ -0,0 +1,26 @@
+-- SQLBench-H query 5 derived from TPC-H query 5 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ n_name,
+ sum(l_extendedprice * (1 - l_discount)) as revenue
+from
+ customer,
+ orders,
+ lineitem,
+ supplier,
+ nation,
+ region
+where
+ c_custkey = o_custkey
+ and l_orderkey = o_orderkey
+ and l_suppkey = s_suppkey
+ and c_nationkey = s_nationkey
+ and s_nationkey = n_nationkey
+ and n_regionkey = r_regionkey
+ and r_name = 'AFRICA'
+ and o_orderdate >= date '1994-01-01'
+ and o_orderdate < date '1994-01-01' + interval '1' year
+group by
+ n_name
+order by
+ revenue desc;
diff --git a/benchmarks/queries/tpch/q6.sql b/benchmarks/queries/tpch/q6.sql
new file mode 100644
index 0000000000..f5b4bae70e
--- /dev/null
+++ b/benchmarks/queries/tpch/q6.sql
@@ -0,0 +1,11 @@
+-- SQLBench-H query 6 derived from TPC-H query 6 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ sum(l_extendedprice * l_discount) as revenue
+from
+ lineitem
+where
+ l_shipdate >= date '1994-01-01'
+ and l_shipdate < date '1994-01-01' + interval '1' year
+ and l_discount between 0.04 - 0.01 and 0.04 + 0.01
+ and l_quantity < 24;
diff --git a/benchmarks/queries/tpch/q7.sql b/benchmarks/queries/tpch/q7.sql
new file mode 100644
index 0000000000..f3919be2db
--- /dev/null
+++ b/benchmarks/queries/tpch/q7.sql
@@ -0,0 +1,41 @@
+-- SQLBench-H query 7 derived from TPC-H query 7 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ supp_nation,
+ cust_nation,
+ l_year,
+ sum(volume) as revenue
+from
+ (
+ select
+ n1.n_name as supp_nation,
+ n2.n_name as cust_nation,
+ extract(year from l_shipdate) as l_year,
+ l_extendedprice * (1 - l_discount) as volume
+ from
+ supplier,
+ lineitem,
+ orders,
+ customer,
+ nation n1,
+ nation n2
+ where
+ s_suppkey = l_suppkey
+ and o_orderkey = l_orderkey
+ and c_custkey = o_custkey
+ and s_nationkey = n1.n_nationkey
+ and c_nationkey = n2.n_nationkey
+ and (
+ (n1.n_name = 'GERMANY' and n2.n_name = 'IRAQ')
+ or (n1.n_name = 'IRAQ' and n2.n_name = 'GERMANY')
+ )
+ and l_shipdate between date '1995-01-01' and date '1996-12-31'
+ ) as shipping
+group by
+ supp_nation,
+ cust_nation,
+ l_year
+order by
+ supp_nation,
+ cust_nation,
+ l_year;
diff --git a/benchmarks/queries/tpch/q8.sql b/benchmarks/queries/tpch/q8.sql
new file mode 100644
index 0000000000..7c53e145e4
--- /dev/null
+++ b/benchmarks/queries/tpch/q8.sql
@@ -0,0 +1,39 @@
+-- SQLBench-H query 8 derived from TPC-H query 8 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ o_year,
+ sum(case
+ when nation = 'IRAQ' then volume
+ else 0
+ end) / sum(volume) as mkt_share
+from
+ (
+ select
+ extract(year from o_orderdate) as o_year,
+ l_extendedprice * (1 - l_discount) as volume,
+ n2.n_name as nation
+ from
+ part,
+ supplier,
+ lineitem,
+ orders,
+ customer,
+ nation n1,
+ nation n2,
+ region
+ where
+ p_partkey = l_partkey
+ and s_suppkey = l_suppkey
+ and l_orderkey = o_orderkey
+ and o_custkey = c_custkey
+ and c_nationkey = n1.n_nationkey
+ and n1.n_regionkey = r_regionkey
+ and r_name = 'MIDDLE EAST'
+ and s_nationkey = n2.n_nationkey
+ and o_orderdate between date '1995-01-01' and date '1996-12-31'
+ and p_type = 'LARGE PLATED STEEL'
+ ) as all_nations
+group by
+ o_year
+order by
+ o_year;
diff --git a/benchmarks/queries/tpch/q9.sql b/benchmarks/queries/tpch/q9.sql
new file mode 100644
index 0000000000..2455695618
--- /dev/null
+++ b/benchmarks/queries/tpch/q9.sql
@@ -0,0 +1,34 @@
+-- SQLBench-H query 9 derived from TPC-H query 9 under the terms of the TPC Fair Use Policy.
+-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council.
+select
+ nation,
+ o_year,
+ sum(amount) as sum_profit
+from
+ (
+ select
+ n_name as nation,
+ extract(year from o_orderdate) as o_year,
+ l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
+ from
+ part,
+ supplier,
+ lineitem,
+ partsupp,
+ orders,
+ nation
+ where
+ s_suppkey = l_suppkey
+ and ps_suppkey = l_suppkey
+ and ps_partkey = l_partkey
+ and p_partkey = l_partkey
+ and o_orderkey = l_orderkey
+ and s_nationkey = n_nationkey
+ and p_name like '%moccasin%'
+ ) as profit
+group by
+ nation,
+ o_year
+order by
+ nation,
+ o_year desc;
diff --git a/benchmarks/run.py b/benchmarks/run.py
new file mode 100755
index 0000000000..0632764edc
--- /dev/null
+++ b/benchmarks/run.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Unified benchmark runner wrapper.
+
+Reads .conf files, merges them with precedence (profile < engine < CLI),
+then builds and executes the spark-submit command.
+
+Usage::
+
+ # = comet-tpch.sh
+ python benchmarks/run.py --engine comet --profile standalone-tpch \\
+ --restart-cluster \\
+ -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \\
+ --output . --iterations 1
+
+ # = comet-tpch-iceberg.sh (dynamic catalog via --conf)
+ python benchmarks/run.py --engine comet-iceberg --profile standalone-tpch \\
+ --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \\
+ --conf spark.sql.catalog.local.type=hadoop \\
+ --conf spark.sql.catalog.local.warehouse=$ICEBERG_WAREHOUSE \\
+ --conf spark.sql.defaultCatalog=local \\
+ --restart-cluster \\
+ -- tpc --benchmark tpch --catalog local --database tpch \\
+ --queries $TPCH_QUERIES --output . --iterations 1
+
+ # shuffle benchmark
+ python benchmarks/run.py --engine comet-jvm-shuffle --profile local \\
+ -- shuffle --benchmark shuffle-hash --data /tmp/data --mode jvm \\
+ --output . --iterations 3
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+
+# Allow importing from the repo root so ``from benchmarks.runner.config ...``
+# works when this script is run directly.
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+_REPO_ROOT = os.path.dirname(_SCRIPT_DIR)
+if _REPO_ROOT not in sys.path:
+ sys.path.insert(0, _REPO_ROOT)
+
+from benchmarks.runner.config import merge_configs, split_config
+
+
+def _parse_args():
+ """Parse wrapper-level arguments, splitting on ``--``."""
+ parser = argparse.ArgumentParser(
+ description="Unified benchmark runner — builds and executes spark-submit",
+ usage=(
+ "%(prog)s --engine NAME [--profile NAME] "
+ "[--conf key=value ...] [--restart-cluster] "
+ "[--dry-run] -- SUITE_ARGS..."
+ ),
+ )
+ parser.add_argument("--engine", required=True, help="Engine config name")
+ parser.add_argument("--profile", default=None, help="Profile config name")
+ parser.add_argument(
+ "--conf", action="append", default=[],
+ help="Extra key=value config override (repeatable)",
+ )
+ parser.add_argument(
+ "--restart-cluster", action="store_true",
+ help="Stop and restart Spark standalone master + worker",
+ )
+ parser.add_argument(
+ "--dry-run", action="store_true",
+ help="Print the spark-submit command without executing it",
+ )
+
+ # Split on "--": everything before goes to this parser, everything after
+ # is passed through to the benchmark suite CLI.
+ argv = sys.argv[1:]
+ if "--" in argv:
+ sep = argv.index("--")
+ wrapper_args = argv[:sep]
+ suite_args = argv[sep + 1:]
+ else:
+ wrapper_args = argv
+ suite_args = []
+
+ args = parser.parse_args(wrapper_args)
+ args.suite_args = suite_args
+ return args
+
+
+def _resolve_conf_path(conf_dir, kind, name):
+ """Return the path to a .conf file, or exit with an error."""
+ path = os.path.join(conf_dir, kind, f"{name}.conf")
+ if not os.path.isfile(path):
+ available = sorted(
+ f.removesuffix(".conf")
+ for f in os.listdir(os.path.join(conf_dir, kind))
+ if f.endswith(".conf")
+ )
+ print(
+ f"Error: {kind} config '{name}' not found at {path}\n"
+ f"Available: {', '.join(available)}",
+ file=sys.stderr,
+ )
+ sys.exit(1)
+ return path
+
+
+def _restart_cluster():
+ """Stop and start Spark standalone master + worker."""
+ spark_home = os.environ.get("SPARK_HOME")
+ if not spark_home:
+ print("Error: SPARK_HOME must be set for --restart-cluster", file=sys.stderr)
+ sys.exit(1)
+ spark_master = os.environ.get("SPARK_MASTER")
+ if not spark_master:
+ print("Error: SPARK_MASTER must be set for --restart-cluster", file=sys.stderr)
+ sys.exit(1)
+
+ sbin = os.path.join(spark_home, "sbin")
+ print("Restarting Spark standalone cluster...")
+ subprocess.run([os.path.join(sbin, "stop-master.sh")], stderr=subprocess.DEVNULL, check=False)
+ subprocess.run([os.path.join(sbin, "stop-worker.sh")], stderr=subprocess.DEVNULL, check=False)
+ subprocess.check_call([os.path.join(sbin, "start-master.sh")])
+ subprocess.check_call([os.path.join(sbin, "start-worker.sh"), spark_master])
+
+
+def main():
+ args = _parse_args()
+ conf_dir = os.path.join(_SCRIPT_DIR, "conf")
+
+ # Resolve config file paths
+ engine_path = _resolve_conf_path(conf_dir, "engines", args.engine)
+ profile_path = (
+ _resolve_conf_path(conf_dir, "profiles", args.profile)
+ if args.profile else None
+ )
+
+ # Merge configs: profile < engine < CLI overrides
+ merged = merge_configs(
+ profile_path=profile_path,
+ engine_path=engine_path,
+ cli_overrides=args.conf,
+ )
+ spark_conf, runner_conf = split_config(merged)
+
+ # Export runner.env.* as environment variables
+ for key, value in runner_conf.items():
+ if key.startswith("env."):
+ env_var = key[len("env."):]
+ os.environ[env_var] = value
+ print(f"Exported {env_var}={value}")
+
+ # Restart cluster if requested
+ if args.restart_cluster:
+ _restart_cluster()
+
+ # Build spark-submit command
+ spark_home = os.environ.get("SPARK_HOME", "")
+ if not spark_home:
+ print("Error: SPARK_HOME must be set", file=sys.stderr)
+ sys.exit(1)
+
+ cmd = [os.path.join(spark_home, "bin", "spark-submit")]
+
+ # Master
+ master = runner_conf.get("master")
+ if master:
+ cmd += ["--master", master]
+
+ # JARs
+ jars = runner_conf.get("jars")
+ if jars:
+ cmd += ["--jars", jars]
+ cmd += ["--driver-class-path", jars.replace(",", ":")]
+
+ # Spark configs
+ for key, value in spark_conf.items():
+ cmd += ["--conf", f"{key}={value}"]
+
+ # Python script (the CLI entry point)
+ cmd.append(os.path.join(_SCRIPT_DIR, "runner", "cli.py"))
+
+ # Inject --name from runner.name if not already in suite args.
+ # Insert after the first positional arg (the subcommand) so that
+ # argparse routes it to the correct subparser.
+ runner_name = runner_conf.get("name", args.engine)
+ suite_args = list(args.suite_args)
+ if "--name" not in suite_args and suite_args:
+ suite_args.insert(1, "--name")
+ suite_args.insert(2, runner_name)
+
+ cmd += suite_args
+
+ # Print and execute
+ print()
+ print("spark-submit command:")
+ print(f" {' '.join(cmd)}")
+ print()
+
+ if args.dry_run:
+ return
+
+ os.execvp(cmd[0], cmd)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/runner/__init__.py b/benchmarks/runner/__init__.py
new file mode 100644
index 0000000000..0ccbeeeafb
--- /dev/null
+++ b/benchmarks/runner/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/benchmarks/runner/cli.py b/benchmarks/runner/cli.py
new file mode 100644
index 0000000000..5394cf9599
--- /dev/null
+++ b/benchmarks/runner/cli.py
@@ -0,0 +1,315 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Single CLI entry point for the unified benchmark runner.
+
+Designed to be the Python script passed to ``spark-submit``. Subcommands
+correspond to benchmark suites (currently: ``tpc``, ``shuffle``, ``micro``).
+
+Usage (via spark-submit)::
+
+ spark-submit ... benchmarks/runner/cli.py tpc --benchmark tpch --data /path ...
+ spark-submit ... benchmarks/runner/cli.py shuffle --benchmark shuffle-hash --data /path ...
+ spark-submit ... benchmarks/runner/cli.py micro --benchmark string-expressions --output ...
+"""
+
+import argparse
+import json
+import os
+import sys
+
+from benchmarks.runner.spark_session import create_session
+from benchmarks.suites import tpc
+from benchmarks.suites import shuffle
+from benchmarks.suites import micro
+
+
+# ---------------------------------------------------------------------------
+# Profiling helpers
+# ---------------------------------------------------------------------------
+
+def _maybe_start_profiler(spark, args):
+ """Start profiler if ``--profile`` was passed. Returns profiler or None."""
+ if not getattr(args, "profile", False):
+ return None
+ from benchmarks.runner.profiling import SparkMetricsProfiler
+
+ interval = getattr(args, "profile_interval", 2.0)
+ profiler = SparkMetricsProfiler(spark, interval_secs=interval)
+ profiler.start()
+ return profiler
+
+
+def _maybe_stop_profiler(profiler, output_dir, name, benchmark):
+ """Stop profiler and write CSV if active."""
+ if profiler is None:
+ return
+ profiler.stop()
+ csv_path = os.path.join(output_dir, f"{name}-{benchmark}-metrics.csv")
+ profiler.write_csv(csv_path)
+
+
+def _add_profiling_args(parser):
+ """Add common profiling flags to a subparser."""
+ parser.add_argument(
+ "--profile", action="store_true",
+ help="Enable Level 1 JVM metrics profiling via Spark REST API",
+ )
+ parser.add_argument(
+ "--profile-interval", type=float, default=2.0,
+ help="Profiling poll interval in seconds (default: 2.0)",
+ )
+
+
+# ---------------------------------------------------------------------------
+# TPC subcommand
+# ---------------------------------------------------------------------------
+
+def _add_tpc_subparser(subparsers):
+ """Register the ``tpc`` subcommand with the same args as tpcbench.py."""
+ p = subparsers.add_parser(
+ "tpc",
+ help="Run TPC-H or TPC-DS benchmarks",
+ description="TPC-H/TPC-DS benchmark runner for files or Iceberg tables",
+ )
+ p.add_argument("--benchmark", required=True, help="tpch or tpcds")
+
+ source = p.add_mutually_exclusive_group(required=True)
+ source.add_argument("--data", help="Path to data files")
+ source.add_argument("--catalog", help="Iceberg catalog name")
+
+ p.add_argument(
+ "--format", default="parquet",
+ help="Input file format: parquet, csv, json (only with --data)",
+ )
+ p.add_argument(
+ "--options", type=json.loads, default={},
+ help='Spark reader options as JSON, e.g. \'{"header": "true"}\'',
+ )
+ p.add_argument(
+ "--database", default="tpch",
+ help="Database containing TPC tables (only with --catalog)",
+ )
+ p.add_argument("--queries", required=True, help="Path to query SQL files")
+ p.add_argument("--iterations", type=int, default=1, help="Number of iterations")
+ p.add_argument("--output", required=True, help="Directory for results JSON")
+ p.add_argument("--name", required=True, help="Prefix for result file")
+ p.add_argument("--query", type=int, help="Run a single query (1-based)")
+ p.add_argument("--write", help="Path to save query results as Parquet")
+ _add_profiling_args(p)
+
+
+def _run_tpc(args):
+ """Execute the TPC suite."""
+ spark = create_session(
+ app_name=f"{args.name} benchmark derived from {args.benchmark}",
+ spark_conf={}, # configs already set by spark-submit
+ )
+
+ profiler = _maybe_start_profiler(spark, args)
+
+ using_iceberg = tpc.register_tables(
+ spark,
+ benchmark=args.benchmark,
+ data_path=args.data,
+ catalog=args.catalog,
+ database=args.database,
+ file_format=args.format,
+ reader_options=args.options,
+ )
+
+ timings = tpc.run_queries(
+ spark,
+ benchmark=args.benchmark,
+ query_path=args.queries,
+ iterations=args.iterations,
+ query_num=args.query,
+ write_path=args.write,
+ )
+
+ results = tpc.build_results(
+ spark,
+ benchmark=args.benchmark,
+ query_path=args.queries,
+ data_path=args.data,
+ catalog=args.catalog,
+ database=args.database,
+ using_iceberg=using_iceberg,
+ name=args.name,
+ timings=timings,
+ )
+
+ tpc.write_results(results, args.output, args.name, args.benchmark)
+ _maybe_stop_profiler(profiler, args.output, args.name, args.benchmark)
+ spark.stop()
+
+
+# ---------------------------------------------------------------------------
+# Shuffle subcommand
+# ---------------------------------------------------------------------------
+
+def _add_shuffle_subparser(subparsers):
+ """Register the ``shuffle`` subcommand."""
+ p = subparsers.add_parser(
+ "shuffle",
+ help="Run shuffle benchmarks (hash, round-robin)",
+ description=(
+ "Shuffle benchmark runner. Tests different partitioning strategies "
+ "across Spark, Comet JVM, and Comet Native shuffle implementations."
+ ),
+ )
+ p.add_argument(
+ "--benchmark", required=True,
+ choices=list(shuffle.BENCHMARKS),
+ help="Shuffle benchmark to run",
+ )
+ p.add_argument("--data", required=True, help="Path to input parquet data")
+ p.add_argument(
+ "--mode", required=True,
+ choices=["spark", "jvm", "native"],
+ help="Shuffle mode being tested",
+ )
+ p.add_argument(
+ "--partitions", type=int, default=200,
+ help="Number of shuffle partitions (default: 200)",
+ )
+ p.add_argument("--iterations", type=int, default=1, help="Number of iterations")
+ p.add_argument("--output", required=True, help="Directory for results JSON")
+ p.add_argument("--name", required=True, help="Prefix for result file")
+ _add_profiling_args(p)
+
+
+def _run_shuffle(args):
+ """Execute the shuffle suite."""
+ spark = create_session(
+ app_name=f"{args.name}-{args.benchmark}-{args.mode.upper()}",
+ spark_conf={}, # configs already set by spark-submit
+ )
+
+ profiler = _maybe_start_profiler(spark, args)
+
+ timings = shuffle.run_shuffle(
+ spark,
+ benchmark=args.benchmark,
+ data_path=args.data,
+ mode=args.mode,
+ num_partitions=args.partitions,
+ iterations=args.iterations,
+ )
+
+ results = shuffle.build_results(
+ spark,
+ benchmark=args.benchmark,
+ data_path=args.data,
+ mode=args.mode,
+ name=args.name,
+ timings=timings,
+ )
+
+ shuffle.write_results(results, args.output, args.name, args.benchmark)
+ _maybe_stop_profiler(profiler, args.output, args.name, args.benchmark)
+ spark.stop()
+
+
+# ---------------------------------------------------------------------------
+# Micro subcommand
+# ---------------------------------------------------------------------------
+
+def _add_micro_subparser(subparsers):
+ """Register the ``micro`` subcommand."""
+ p = subparsers.add_parser(
+ "micro",
+ help="Run expression-level microbenchmarks",
+ description=(
+ "Microbenchmark runner. Generates a small dataset and times "
+ "individual SQL expressions."
+ ),
+ )
+ p.add_argument(
+ "--benchmark", required=True,
+ choices=list(micro.BENCHMARKS),
+ help="Microbenchmark to run",
+ )
+ p.add_argument(
+ "--rows", type=int, default=1024,
+ help="Number of rows for data generation (default: 1024)",
+ )
+ p.add_argument("--iterations", type=int, default=3, help="Number of iterations")
+ p.add_argument("--expression", help="Run a single expression by name")
+ p.add_argument("--output", required=True, help="Directory for results JSON")
+ p.add_argument("--name", required=True, help="Prefix for result file")
+ _add_profiling_args(p)
+
+
+def _run_micro(args):
+ """Execute the micro suite."""
+ spark = create_session(
+ app_name=f"{args.name}-{args.benchmark}",
+ spark_conf={}, # configs already set by spark-submit
+ )
+
+ profiler = _maybe_start_profiler(spark, args)
+
+ timings = micro.run_micro(
+ spark,
+ benchmark=args.benchmark,
+ num_rows=args.rows,
+ iterations=args.iterations,
+ expression=args.expression,
+ )
+
+ results = micro.build_results(
+ spark,
+ benchmark=args.benchmark,
+ name=args.name,
+ timings=timings,
+ )
+
+ micro.write_results(results, args.output, args.name, args.benchmark)
+ _maybe_stop_profiler(profiler, args.output, args.name, args.benchmark)
+ spark.stop()
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main(argv=None):
+ parser = argparse.ArgumentParser(
+ prog="benchmark-runner",
+ description="Unified benchmark runner for Apache DataFusion Comet",
+ )
+ subparsers = parser.add_subparsers(dest="suite", required=True)
+ _add_tpc_subparser(subparsers)
+ _add_shuffle_subparser(subparsers)
+ _add_micro_subparser(subparsers)
+
+ args = parser.parse_args(argv)
+
+ if args.suite == "tpc":
+ _run_tpc(args)
+ elif args.suite == "shuffle":
+ _run_shuffle(args)
+ elif args.suite == "micro":
+ _run_micro(args)
+ else:
+ parser.error(f"Unknown suite: {args.suite}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmarks/runner/config.py b/benchmarks/runner/config.py
new file mode 100644
index 0000000000..ff03d622d3
--- /dev/null
+++ b/benchmarks/runner/config.py
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Config loader for the unified benchmark runner.
+
+Reads key=value .conf files, merges them with precedence
+(profile < engine < CLI overrides), and splits into spark vs runner configs.
+
+The ``runner.*`` namespace controls the shell wrapper (JAR paths, env vars,
+result name) without colliding with Spark config keys. Examples:
+ runner.jars=${COMET_JAR}
+ runner.env.TZ=UTC
+ runner.name=comet
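+
+Merging example (a sketch; the file contents shown are hypothetical)::
+
+    # profiles/local.conf:   spark.executor.memory=8g
+    # engines/comet.conf:    spark.executor.memory=16g
+    merged = merge_configs("profiles/local.conf", "engines/comet.conf",
+                           cli_overrides=["spark.executor.memory=24g"])
+    # -> {"spark.executor.memory": "24g"}  (the CLI override wins)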
+"""
+
+import os
+import re
+from typing import Dict, List, Optional, Tuple
+
+
+def load_conf_file(path: str) -> Dict[str, str]:
+ """Read a key=value .conf file.
+
+ - Blank lines and lines starting with ``#`` are skipped.
+ - ``${VAR}`` references are expanded from the environment.
+ - Values may optionally be quoted with single or double quotes.
+ """
+ conf: Dict[str, str] = {}
+ with open(path) as f:
+ for line in f:
+ line = line.strip()
+ if not line or line.startswith("#"):
+ continue
+            key, sep, value = line.partition("=")
+            key = key.strip()
+            value = value.strip()
+            if not key or not sep:
+ continue
+ # Strip optional quotes
+ if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
+ value = value[1:-1]
+ # Expand ${VAR} references from environment
+ value = re.sub(
+ r"\$\{(\w+)\}",
+ lambda m: os.environ.get(m.group(1), m.group(0)),
+ value,
+ )
+ conf[key] = value
+ return conf
+
+
+def merge_configs(
+    profile_path: Optional[str] = None,
+    engine_path: Optional[str] = None,
+    cli_overrides: Optional[List[str]] = None,
+) -> Dict[str, str]:
+ """Merge configs with precedence: profile < engine < CLI overrides."""
+ merged: Dict[str, str] = {}
+ if profile_path:
+ merged.update(load_conf_file(profile_path))
+ if engine_path:
+ merged.update(load_conf_file(engine_path))
+ for override in cli_overrides or []:
+        key, sep, value = override.partition("=")
+        key = key.strip()
+        value = value.strip()
+        if key and sep:
+ # Expand ${VAR} in CLI overrides too
+ value = re.sub(
+ r"\$\{(\w+)\}",
+ lambda m: os.environ.get(m.group(1), m.group(0)),
+ value,
+ )
+ merged[key] = value
+ return merged
+
+
+def split_config(merged: Dict[str, str]) -> Tuple[Dict[str, str], Dict[str, str]]:
+ """Separate ``runner.*`` keys from ``spark.*`` (and other) keys.
+
+ Returns (spark_conf, runner_conf) where runner_conf has the
+ ``runner.`` prefix stripped.
+ """
+ spark_conf: Dict[str, str] = {}
+ runner_conf: Dict[str, str] = {}
+ for key, value in merged.items():
+ if key.startswith("runner."):
+ runner_conf[key[len("runner."):]] = value
+ else:
+ spark_conf[key] = value
+ return spark_conf, runner_conf
diff --git a/benchmarks/runner/profiling.py b/benchmarks/runner/profiling.py
new file mode 100644
index 0000000000..98aa56d493
--- /dev/null
+++ b/benchmarks/runner/profiling.py
@@ -0,0 +1,179 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Level 1 profiling hooks: JVM metrics via the Spark REST API.
+
+Polls ``/api/v1/applications/{appId}/executors`` at a configurable interval
+and records executor memory metrics as a time-series CSV alongside the
+benchmark results.
+
+Usage::
+
+ profiler = SparkMetricsProfiler(spark, interval_secs=2)
+ profiler.start()
+ # ... run benchmark ...
+ profiler.stop()
+ profiler.write_csv("/path/to/output/metrics.csv")
+"""
+
+import csv
+import threading
+import time
+from typing import Any, Dict, List, Optional
+
+from pyspark.sql import SparkSession
+
+try:
+ from urllib.request import urlopen
+ import json as _json
+
+ def _fetch_json(url: str) -> Any:
+ with urlopen(url, timeout=5) as resp:
+ return _json.loads(resp.read().decode())
+except ImportError:
+ _fetch_json = None # type: ignore[assignment]
+
+
+# Metrics we extract per executor from the REST API response
+_EXECUTOR_METRICS = [
+ "memoryUsed",
+ "maxMemory",
+ "totalOnHeapStorageMemory",
+ "usedOnHeapStorageMemory",
+ "totalOffHeapStorageMemory",
+ "usedOffHeapStorageMemory",
+]
+
+# Metrics nested under peakMemoryMetrics (if available)
+_PEAK_MEMORY_METRICS = [
+ "JVMHeapMemory",
+ "JVMOffHeapMemory",
+ "OnHeapExecutionMemory",
+ "OffHeapExecutionMemory",
+ "OnHeapStorageMemory",
+ "OffHeapStorageMemory",
+ "OnHeapUnifiedMemory",
+ "OffHeapUnifiedMemory",
+ "ProcessTreeJVMRSSMemory",
+]
+
+
+class SparkMetricsProfiler:
+ """Periodically polls executor metrics from the Spark REST API."""
+
+ def __init__(
+ self,
+ spark: SparkSession,
+ interval_secs: float = 2.0,
+ ):
+ self._spark = spark
+ self._interval = interval_secs
+ self._samples: List[Dict[str, Any]] = []
+ self._stop_event = threading.Event()
+ self._thread: Optional[threading.Thread] = None
+ self._start_time: float = 0.0
+
+ @property
+ def samples(self) -> List[Dict[str, Any]]:
+ """Return collected samples (each is a flat dict)."""
+ return list(self._samples)
+
+ def _ui_url(self) -> Optional[str]:
+ """Return the Spark UI base URL, or None if unavailable."""
+ url = self._spark.sparkContext.uiWebUrl
+ if url:
+ return url.rstrip("/")
+ return None
+
+ def _app_id(self) -> str:
+ return self._spark.sparkContext.applicationId
+
+ def _poll_once(self) -> None:
+ """Fetch executor metrics and append a timestamped sample."""
+ base = self._ui_url()
+ if base is None or _fetch_json is None:
+ return
+
+ url = f"{base}/api/v1/applications/{self._app_id()}/executors"
+ try:
+ executors = _fetch_json(url)
+ except Exception:
+ return
+
+ elapsed = time.time() - self._start_time
+ for exc in executors:
+ row: Dict[str, Any] = {
+ "elapsed_secs": round(elapsed, 2),
+ "executor_id": exc.get("id", ""),
+ "is_active": exc.get("isActive", True),
+ }
+ for key in _EXECUTOR_METRICS:
+ row[key] = exc.get(key, 0)
+
+ peak = exc.get("peakMemoryMetrics", {})
+ for key in _PEAK_MEMORY_METRICS:
+ row[f"peak_{key}"] = peak.get(key, 0)
+
+ self._samples.append(row)
+
+ def _run(self) -> None:
+ """Background polling loop."""
+ while not self._stop_event.is_set():
+ self._poll_once()
+ self._stop_event.wait(self._interval)
+
+ def start(self) -> None:
+ """Start background polling thread."""
+ if self._thread is not None:
+ return
+ self._start_time = time.time()
+ self._stop_event.clear()
+ self._thread = threading.Thread(
+ target=self._run, name="spark-metrics-profiler", daemon=True
+ )
+ self._thread.start()
+ print(
+ f"Profiler started (interval={self._interval}s, "
+ f"ui={self._ui_url()})"
+ )
+
+ def stop(self) -> None:
+ """Stop the polling thread and collect a final sample."""
+ if self._thread is None:
+ return
+ self._stop_event.set()
+ self._thread.join(timeout=self._interval + 2)
+ self._thread = None
+ # One last poll to capture final state
+ self._poll_once()
+ print(f"Profiler stopped ({len(self._samples)} samples collected)")
+
+ def write_csv(self, path: str) -> str:
+ """Write collected samples to a CSV file. Returns the path."""
+ if not self._samples:
+ print("Profiler: no samples to write")
+ return path
+
+ fieldnames = list(self._samples[0].keys())
+ with open(path, "w", newline="") as f:
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
+ writer.writeheader()
+ for row in self._samples:
+ writer.writerow(row)
+ print(f"Profiler: wrote {len(self._samples)} samples to {path}")
+ return path
diff --git a/benchmarks/runner/spark_session.py b/benchmarks/runner/spark_session.py
new file mode 100644
index 0000000000..f2e21a464a
--- /dev/null
+++ b/benchmarks/runner/spark_session.py
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Thin wrapper around SparkSession.builder."""
+
+from typing import Dict
+
+from pyspark.sql import SparkSession
+
+
+def create_session(app_name: str, spark_conf: Dict[str, str]) -> SparkSession:
+ """Create (or retrieve) a SparkSession with the given config.
+
+ When launched via spark-submit the configs are already set; this just
+ picks up the existing session.
+ """
+ builder = SparkSession.builder.appName(app_name)
+ for key, value in spark_conf.items():
+ builder = builder.config(key, value)
+ return builder.getOrCreate()
diff --git a/benchmarks/suites/MICRO.md b/benchmarks/suites/MICRO.md
new file mode 100644
index 0000000000..41c5fa2bc0
--- /dev/null
+++ b/benchmarks/suites/MICRO.md
@@ -0,0 +1,108 @@
+
+
+# Microbenchmark Suite
+
+Runs expression-level microbenchmarks that generate a small Parquet dataset
+and time individual SQL expressions. Currently supports the string expression
+benchmark (ported from `CometStringExpressionBenchmark.scala`).
+
+## Arguments
+
+| Argument | Required | Default | Description |
+| -------------------- | -------- | ------- | ---------------------------------------------- |
+| `--benchmark` | yes | | `string-expressions` |
+| `--rows` | no | `1024` | Number of rows for data generation |
+| `--iterations` | no | `3` | Number of timed iterations per expression |
+| `--expression` | no | | Run a single expression by name |
+| `--output` | yes | | Directory for results JSON |
+| `--name` | auto | | Result file prefix (auto-injected by `run.py`) |
+| `--profile` | no | | Enable JVM metrics profiling |
+| `--profile-interval` | no | `2.0` | Profiling poll interval in seconds |
+
+## Examples
+
+### String expressions with Comet
+
+```bash
+python benchmarks/run.py --engine comet --profile local \
+ -- micro --benchmark string-expressions --output . --iterations 3
+```
+
+### String expressions with vanilla Spark (baseline)
+
+```bash
+python benchmarks/run.py --engine spark --profile local \
+ -- micro --benchmark string-expressions --output . --iterations 3
+```
+
+### String expressions with Gluten
+
+```bash
+python benchmarks/run.py --engine gluten --profile local \
+ -- micro --benchmark string-expressions --output . --iterations 3
+```
+
+### Run a single expression
+
+```bash
+python benchmarks/run.py --engine comet --profile local \
+ -- micro --benchmark string-expressions --output . --expression ascii
+```
+
+### Compare results across engines
+
+```bash
+# Run each engine
+for engine in comet spark gluten; do
+ python benchmarks/run.py --engine $engine --profile local \
+ -- micro --benchmark string-expressions --output . --iterations 3
+done
+
+# Generate comparison chart
+python -m benchmarks.analysis.compare \
+    comet-string-expressions-*.json spark-string-expressions-*.json \
+    gluten-string-expressions-*.json \
+    --labels Comet Spark Gluten --benchmark string-expressions
+```
+
+## Output Format
+
+Results are written as JSON with the filename `{name}-{benchmark}-{timestamp_millis}.json`:
+
+```json
+{
+ "engine": "datafusion-comet",
+ "benchmark": "string-expressions",
+ "spark_conf": { ... },
+ "ascii": [0.12, 0.10, 0.08],
+ "bit_length": [0.05, 0.04, 0.04],
+ "lower": [0.15, 0.11, 0.07],
+ ...
+}
+```
+
+Expression names are top-level keys, each mapping to a list of elapsed seconds
+per iteration. This format is directly compatible with `analysis/compare.py`.
+
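+To inspect results without generating charts, the JSON can be summarised in a
+few lines of Python. A minimal sketch (the filename is illustrative; use
+whatever path the run printed):
+
+```python
+import json
+import statistics
+
+with open("comet-string-expressions-1752337818039.json") as f:
+    results = json.load(f)
+
+# Skip metadata keys; every other key maps an expression to per-iteration times
+meta = {"engine", "benchmark", "spark_conf"}
+for expr, times in results.items():
+    if expr in meta:
+        continue
+    print(f"{expr:>16}: best {min(times):.4f}s, median {statistics.median(times):.4f}s")
+```
+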
+## Available Expressions (string-expressions)
+
+ascii, bit_length, btrim, chr, concat, concat_ws, contains, endswith, initcap,
+instr, length, like, lower, lpad, ltrim, octet_length, regexp_replace, repeat,
+replace, reverse, rlike, rpad, rtrim, space, startswith, substring, translate,
+trim, upper.
diff --git a/benchmarks/suites/SHUFFLE.md b/benchmarks/suites/SHUFFLE.md
new file mode 100644
index 0000000000..222fa4cc82
--- /dev/null
+++ b/benchmarks/suites/SHUFFLE.md
@@ -0,0 +1,132 @@
+
+
+# Shuffle Benchmark Suite
+
+Compares shuffle file sizes and performance across Spark, Comet JVM, and
+Comet Native shuffle implementations using hash or round-robin partitioning.
+
+## Arguments
+
+| Argument | Required | Default | Description |
+| -------------------- | -------- | ------- | -------------------------------------- |
+| `--benchmark` | yes | | `shuffle-hash` or `shuffle-roundrobin` |
+| `--data` | yes | | Path to input Parquet data |
+| `--mode` | yes | | `spark`, `jvm`, or `native` |
+| `--partitions` | no | `200` | Number of shuffle partitions |
+| `--iterations` | no | `1` | Number of iterations |
+| `--output` | yes | | Directory for results JSON |
+| `--name` | auto | | Result file prefix (auto-injected) |
+| `--profile` | no | | Enable JVM metrics profiling |
+| `--profile-interval` | no | `2.0` | Profiling poll interval in seconds |
+
+## Generating Test Data
+
+Generate a Parquet dataset with a wide schema (100 columns including deeply
+nested structs, arrays, and maps):
+
+```bash
+$SPARK_HOME/bin/spark-submit \
+ --master $SPARK_MASTER \
+ --executor-memory 16g \
+ benchmarks/generate_shuffle_data.py \
+ --output /tmp/shuffle-benchmark-data \
+ --rows 10000000 \
+ --partitions 200
+```
+
+> **Note**: The data generation script is a standalone PySpark job. It can be
+> run with any Spark installation — no engine JARs required.
+
+## Examples
+
+### Hash shuffle — Spark baseline
+
+```bash
+python benchmarks/run.py --engine spark-shuffle --profile local \
+ -- shuffle --benchmark shuffle-hash --data /tmp/shuffle-data \
+ --mode spark --output . --iterations 3
+```
+
+### Hash shuffle — Comet JVM
+
+```bash
+python benchmarks/run.py --engine comet-jvm-shuffle --profile local \
+ -- shuffle --benchmark shuffle-hash --data /tmp/shuffle-data \
+ --mode jvm --output . --iterations 3
+```
+
+### Hash shuffle — Comet Native
+
+```bash
+python benchmarks/run.py --engine comet-native-shuffle --profile local \
+ -- shuffle --benchmark shuffle-hash --data /tmp/shuffle-data \
+ --mode native --output . --iterations 3
+```
+
+### Round-robin shuffle
+
+```bash
+python benchmarks/run.py --engine comet-native-shuffle --profile local \
+ -- shuffle --benchmark shuffle-roundrobin --data /tmp/shuffle-data \
+ --mode native --output . --iterations 3
+```
+
+### Run all three modes back-to-back
+
+```bash
+for engine_mode in "spark-shuffle spark" "comet-jvm-shuffle jvm" "comet-native-shuffle native"; do
+ set -- $engine_mode
+ python benchmarks/run.py --engine "$1" --profile local \
+ -- shuffle --benchmark shuffle-hash --data /tmp/shuffle-data \
+ --mode "$2" --output . --iterations 3
+done
+```
+
+### With profiling
+
+```bash
+python benchmarks/run.py --engine comet-native-shuffle --profile local \
+ -- shuffle --benchmark shuffle-hash --data /tmp/shuffle-data \
+ --mode native --output . --iterations 3 --profile --profile-interval 1.0
+```
+
+## Output Format
+
+Results are written as JSON with the filename `{name}-{benchmark}-{timestamp_millis}.json`:
+
+```json
+{
+ "engine": "datafusion-comet",
+ "benchmark": "shuffle-hash",
+ "mode": "native",
+ "data_path": "/tmp/shuffle-data",
+ "spark_conf": { ... },
+ "shuffle-hash": [
+ {"duration_ms": 12345, "row_count": 10000000, "num_partitions": 200},
+ {"duration_ms": 11234, "row_count": 10000000, "num_partitions": 200},
+ {"duration_ms": 10987, "row_count": 10000000, "num_partitions": 200}
+ ]
+}
+```
+
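+To compare modes numerically, each result file can be reduced to an average
+duration. A minimal sketch (filenames are illustrative):
+
+```python
+import json
+
+def avg_duration_ms(path, benchmark="shuffle-hash"):
+    """Average duration_ms across iterations in one result file."""
+    with open(path) as f:
+        results = json.load(f)
+    runs = results[benchmark]
+    return sum(r["duration_ms"] for r in runs) / len(runs)
+
+spark_ms = avg_duration_ms("spark-shuffle-hash-1752337474344.json")
+native_ms = avg_duration_ms("comet-shuffle-hash-1752338506381.json")
+print(f"Spark:  {spark_ms:,.0f} ms")
+print(f"Native: {native_ms:,.0f} ms ({spark_ms / native_ms:.2f}x speedup)")
+```
+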
+## Checking Results
+
+Open the Spark UI (default: http://localhost:4040) during each benchmark run
+to compare shuffle write sizes in the Stages tab.
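+
+The same numbers are exposed by the Spark monitoring REST API, so shuffle write
+sizes can also be collected without a browser. A sketch against a live local UI
+(the host, port, and field names assume a recent Spark release):
+
+```python
+import json
+from urllib.request import urlopen
+
+base = "http://localhost:4040/api/v1"
+# Take the first (most recent) application served by this UI
+app_id = json.loads(urlopen(f"{base}/applications").read())[0]["id"]
+for stage in json.loads(urlopen(f"{base}/applications/{app_id}/stages").read()):
+    if stage.get("shuffleWriteBytes", 0) > 0:
+        print(f"stage {stage['stageId']}: "
+              f"{stage['shuffleWriteBytes'] / 1e6:.1f} MB shuffle write")
+```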
diff --git a/benchmarks/suites/TPC.md b/benchmarks/suites/TPC.md
new file mode 100644
index 0000000000..7e7ff299b7
--- /dev/null
+++ b/benchmarks/suites/TPC.md
@@ -0,0 +1,139 @@
+
+
+# TPC-H / TPC-DS Benchmark Suite
+
+Runs TPC-H (22 queries) or TPC-DS (99 queries) benchmarks against Parquet
+files or Iceberg tables.
+
+## Arguments
+
+| Argument | Required | Default | Description |
+| -------------------- | -------- | --------- | ------------------------------------------------------- |
+| `--benchmark` | yes | | `tpch` or `tpcds` |
+| `--data` | \* | | Path to Parquet data files |
+| `--catalog` | \* | | Iceberg catalog name (mutually exclusive with `--data`) |
+| `--database` | no | `tpch` | Database name (only with `--catalog`) |
+| `--format` | no | `parquet` | File format: parquet, csv, json (only with `--data`) |
+| `--options` | no | `{}` | Spark reader options as JSON string |
+| `--queries` | yes | | Path to directory containing `q1.sql` ... `qN.sql` |
+| `--iterations` | no | `1` | Number of times to run all queries |
+| `--output` | yes | | Directory for results JSON |
+| `--name` | auto | | Result file prefix (auto-injected from engine config) |
+| `--query` | no | | Run a single query number (1-based) |
+| `--write` | no | | Write query results as Parquet to this path |
+| `--profile` | no | | Enable JVM metrics profiling |
+| `--profile-interval` | no | `2.0` | Profiling poll interval in seconds |
+
+`*` Either `--data` or `--catalog` is required, but not both.
+
+## Examples
+
+### TPC-H with Comet (standalone cluster)
+
+```bash
+export SPARK_HOME=/opt/spark
+export SPARK_MASTER=spark://hostname:7077
+export COMET_JAR=/path/to/comet.jar
+export TPCH_DATA=/mnt/bigdata/tpch/sf100
+export TPCH_QUERIES=/mnt/bigdata/tpch/queries
+
+python benchmarks/run.py \
+ --engine comet --profile standalone-tpch --restart-cluster \
+ -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \
+ --output . --iterations 1
+```
+
+### TPC-H with vanilla Spark (baseline)
+
+```bash
+python benchmarks/run.py \
+ --engine spark --profile standalone-tpch --restart-cluster \
+ -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \
+ --output . --iterations 1
+```
+
+### TPC-H with Iceberg tables
+
+First, create Iceberg tables from Parquet data:
+
+```bash
+export ICEBERG_JAR=/path/to/iceberg-spark-runtime-3.5_2.12-1.8.1.jar
+export ICEBERG_WAREHOUSE=/mnt/bigdata/iceberg-warehouse
+
+$SPARK_HOME/bin/spark-submit \
+ --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1 \
+ --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \
+ --conf spark.sql.catalog.local.type=hadoop \
+ --conf spark.sql.catalog.local.warehouse=$ICEBERG_WAREHOUSE \
+ benchmarks/create-iceberg-tpch.py \
+ --parquet-path $TPCH_DATA --catalog local --database tpch
+```
+
+Then run the benchmark with Comet's native Iceberg scanning:
+
+```bash
+python benchmarks/run.py \
+ --engine comet-iceberg --profile standalone-tpch \
+ --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \
+ --conf spark.sql.catalog.local.type=hadoop \
+ --conf spark.sql.catalog.local.warehouse=$ICEBERG_WAREHOUSE \
+ --conf spark.sql.defaultCatalog=local \
+ --restart-cluster \
+ -- tpc --benchmark tpch --catalog local --database tpch \
+ --queries $TPCH_QUERIES --output . --iterations 1
+```
+
+### Run a single query
+
+```bash
+python benchmarks/run.py --engine comet --profile local \
+ -- tpc --benchmark tpch --data $TPCH_DATA --queries $TPCH_QUERIES \
+ --output . --query 1
+```
+
+## Output Format
+
+Results are written as JSON with the filename `{name}-{benchmark}-{timestamp_millis}.json`:
+
+```json
+{
+ "engine": "datafusion-comet",
+ "benchmark": "tpch",
+ "query_path": "/path/to/queries",
+ "spark_conf": { ... },
+ "data_path": "/path/to/data",
+ "1": [12.34],
+ "2": [5.67],
+ ...
+}
+```
+
+Query keys are integers (serialised as strings by `json.dumps`). Each value
+is a list of elapsed seconds per iteration. This format is compatible with
+`analysis/compare.py` for chart generation.
+
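+For a quick look without charts, the per-query timings can be summarised
+directly. A minimal sketch (the filename is illustrative):
+
+```python
+import json
+
+with open("comet-tpch-1752337818039.json") as f:
+    results = json.load(f)
+
+# Query keys are numeric strings; everything else is run metadata
+queries = {k: v for k, v in results.items() if k.isdigit()}
+for q in sorted(queries, key=int):
+    print(f"q{q}: best of {len(queries[q])} run(s) = {min(queries[q]):.2f}s")
+print(f"Total (best per query): {sum(min(t) for t in queries.values()):.2f}s")
+```
+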
+## Comparing Results
+
+```bash
+python -m benchmarks.analysis.compare \
+ comet-tpch-*.json spark-tpch-*.json \
+ --labels Comet Spark --benchmark tpch \
+ --title "TPC-H SF100" --output-dir ./charts
+```
diff --git a/benchmarks/suites/__init__.py b/benchmarks/suites/__init__.py
new file mode 100644
index 0000000000..0ccbeeeafb
--- /dev/null
+++ b/benchmarks/suites/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/benchmarks/suites/micro.py b/benchmarks/suites/micro.py
new file mode 100644
index 0000000000..58c7a22283
--- /dev/null
+++ b/benchmarks/suites/micro.py
@@ -0,0 +1,172 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Microbenchmark suite.
+
+Ports expression-level benchmarks (e.g. CometStringExpressionBenchmark) to the
+unified runner. Each benchmark generates a small dataset, runs SQL expressions
+in a tight loop, and records per-iteration wall-clock times.
+"""
+
+import json
+import os
+import shutil
+import tempfile
+import time
+from datetime import datetime
+from typing import Dict, List, Optional, Tuple
+
+from pyspark.sql import SparkSession
+
+
+# ---------------------------------------------------------------------------
+# String expression benchmark
+# ---------------------------------------------------------------------------
+
+STRING_EXPRESSIONS: List[Tuple[str, str]] = [
+ ("ascii", "select ascii(c1) from parquetV1Table"),
+ ("bit_length", "select bit_length(c1) from parquetV1Table"),
+ ("btrim", "select btrim(c1) from parquetV1Table"),
+ ("chr", "select chr(c1) from parquetV1Table"),
+ ("concat", "select concat(c1, c1) from parquetV1Table"),
+ ("concat_ws", "select concat_ws(' ', c1, c1) from parquetV1Table"),
+ ("contains", "select contains(c1, '123') from parquetV1Table"),
+ ("endswith", "select endswith(c1, '9') from parquetV1Table"),
+ ("initcap", "select initCap(c1) from parquetV1Table"),
+ ("instr", "select instr(c1, '123') from parquetV1Table"),
+ ("length", "select length(c1) from parquetV1Table"),
+ ("like", "select c1 like '%123%' from parquetV1Table"),
+ ("lower", "select lower(c1) from parquetV1Table"),
+ ("lpad", "select lpad(c1, 150, 'x') from parquetV1Table"),
+ ("ltrim", "select ltrim(c1) from parquetV1Table"),
+ ("octet_length", "select octet_length(c1) from parquetV1Table"),
+ ("regexp_replace", "select regexp_replace(c1, '[0-9]', 'X') from parquetV1Table"),
+ ("repeat", "select repeat(c1, 3) from parquetV1Table"),
+ ("replace", "select replace(c1, '123', 'ab') from parquetV1Table"),
+ ("reverse", "select reverse(c1) from parquetV1Table"),
+ ("rlike", "select c1 rlike '[0-9]+' from parquetV1Table"),
+ ("rpad", "select rpad(c1, 150, 'x') from parquetV1Table"),
+ ("rtrim", "select rtrim(c1) from parquetV1Table"),
+ ("space", "select space(2) from parquetV1Table"),
+ ("startswith", "select startswith(c1, '1') from parquetV1Table"),
+ ("substring", "select substring(c1, 1, 100) from parquetV1Table"),
+ ("translate", "select translate(c1, '123456', 'aBcDeF') from parquetV1Table"),
+ ("trim", "select trim(c1) from parquetV1Table"),
+ ("upper", "select upper(c1) from parquetV1Table"),
+]
+
+BENCHMARKS = {
+ "string-expressions": "String expression microbenchmarks (29 expressions)",
+}
+
+
+def prepare_string_table(
+ spark: SparkSession, num_rows: int, temp_dir: str
+) -> None:
+ """Generate a string column table and register it as ``parquetV1Table``."""
+ path = os.path.join(temp_dir, "string_data")
+ spark.range(num_rows).selectExpr(
+ "REPEAT(CAST(id AS STRING), 10) AS c1"
+ ).write.mode("overwrite").option("compression", "snappy").parquet(path)
+ spark.read.parquet(path).createOrReplaceTempView("parquetV1Table")
+ print(f"Generated {num_rows} rows in {path}")
+
+
+def run_micro(
+ spark: SparkSession,
+ benchmark: str,
+ num_rows: int = 1024,
+ iterations: int = 3,
+ expression: Optional[str] = None,
+) -> Dict[str, List[float]]:
+ """Run a microbenchmark and return ``{expr_name: [elapsed_secs, ...]}``."""
+ if benchmark != "string-expressions":
+ raise ValueError(
+ f"Unknown micro benchmark: {benchmark}. "
+ f"Available: {', '.join(BENCHMARKS)}"
+ )
+
+ temp_dir = tempfile.mkdtemp(prefix="comet-micro-")
+ try:
+ prepare_string_table(spark, num_rows, temp_dir)
+
+ expressions = STRING_EXPRESSIONS
+ if expression is not None:
+ expressions = [(n, sql) for n, sql in expressions if n == expression]
+ if not expressions:
+ valid = [n for n, _ in STRING_EXPRESSIONS]
+ raise ValueError(
+ f"Unknown expression: {expression}. Valid: {', '.join(valid)}"
+ )
+
+ timings: Dict[str, List[float]] = {}
+
+ for expr_name, sql in expressions:
+ print(f"\n{'=' * 60}")
+ print(f"Expression: {expr_name}")
+ print(f"{'=' * 60}")
+
+ for iteration in range(iterations):
+ spark.sparkContext.setJobDescription(
+ f"{benchmark} {expr_name} iter{iteration + 1}"
+ )
+ start = time.time()
+ spark.sql(sql).foreach(lambda _: None)
+ elapsed = time.time() - start
+ print(f" Iteration {iteration + 1}: {elapsed:.4f}s")
+ timings.setdefault(expr_name, []).append(elapsed)
+
+ return timings
+ finally:
+ shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+def build_results(
+ spark: SparkSession,
+ benchmark: str,
+ name: str,
+ timings: Dict[str, List[float]],
+) -> Dict:
+ """Assemble the result dict for micro benchmarks."""
+ conf_dict = {k: v for k, v in spark.sparkContext.getConf().getAll()}
+
+ results: Dict = {
+ "engine": "datafusion-comet",
+ "benchmark": benchmark,
+ "spark_conf": conf_dict,
+ }
+ for expr_name, elapsed_list in timings.items():
+ results[expr_name] = elapsed_list
+
+ return results
+
+
+def write_results(
+ results: Dict,
+ output_dir: str,
+ name: str,
+ benchmark: str,
+) -> str:
+ """Write JSON results file. Returns the path written."""
+ result_str = json.dumps(results, indent=4)
+ current_time_millis = int(datetime.now().timestamp() * 1000)
+ results_path = f"{output_dir}/{name}-{benchmark}-{current_time_millis}.json"
+ print(f"\nWriting results to {results_path}")
+ with open(results_path, "w") as f:
+ f.write(result_str)
+ return results_path
diff --git a/benchmarks/suites/shuffle.py b/benchmarks/suites/shuffle.py
new file mode 100644
index 0000000000..adabecf2be
--- /dev/null
+++ b/benchmarks/suites/shuffle.py
@@ -0,0 +1,156 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Shuffle benchmark suite.
+
+Tests different partitioning strategies (hash, round-robin) across Spark,
+Comet JVM, and Comet Native shuffle implementations.
+"""
+
+import json
+import time
+from datetime import datetime
+from typing import Any, Dict, List
+
+from pyspark.sql import DataFrame, SparkSession
+
+
+BENCHMARKS = {
+ "shuffle-hash": "Shuffle all columns using hash partitioning on group_key",
+ "shuffle-roundrobin": "Shuffle all columns using round-robin partitioning",
+}
+
+
+def _repartition(
+ df: DataFrame, benchmark: str, num_partitions: int
+) -> DataFrame:
+ """Apply the partitioning strategy for the given benchmark."""
+ if benchmark == "shuffle-hash":
+ return df.repartition(num_partitions, "group_key")
+ elif benchmark == "shuffle-roundrobin":
+ return df.repartition(num_partitions)
+ else:
+ raise ValueError(
+ f"Unknown shuffle benchmark: {benchmark}. "
+ f"Available: {', '.join(BENCHMARKS)}"
+ )
+
+
+def run_shuffle(
+ spark: SparkSession,
+ benchmark: str,
+ data_path: str,
+ mode: str,
+ num_partitions: int = 200,
+ iterations: int = 1,
+) -> Dict[str, List[Dict[str, Any]]]:
+ """Run a shuffle benchmark and return per-iteration results.
+
+ Returns ``{benchmark_name: [{duration_ms, row_count, num_partitions}, ...]}``
+ so the structure parallels TPC output (query -> list of timings).
+ """
+ if benchmark not in BENCHMARKS:
+ raise ValueError(
+ f"Unknown shuffle benchmark: {benchmark}. "
+ f"Available: {', '.join(BENCHMARKS)}"
+ )
+
+ results: List[Dict[str, Any]] = []
+
+ # Read input data once
+ df = spark.read.parquet(data_path)
+ row_count = df.count()
+
+ for iteration in range(iterations):
+ print(f"\n{'=' * 60}")
+ print(f"Shuffle benchmark: {benchmark} | Mode: {mode.upper()}")
+ print(f"Iteration {iteration + 1} of {iterations}")
+ print(f"{'=' * 60}")
+ print(f"Data path: {data_path}")
+ print(f"Rows: {row_count:,} | Partitions: {num_partitions}")
+
+ # Print relevant Spark configuration
+ conf = spark.sparkContext.getConf()
+ print(f"Shuffle manager: {conf.get('spark.shuffle.manager', 'default')}")
+ print(f"Comet enabled: {conf.get('spark.comet.enabled', 'false')}")
+ print(
+ f"Comet shuffle enabled: "
+ f"{conf.get('spark.comet.exec.shuffle.enabled', 'false')}"
+ )
+ print(
+ f"Comet shuffle mode: "
+ f"{conf.get('spark.comet.exec.shuffle.mode', 'not set')}"
+ )
+
+ spark.catalog.clearCache()
+ spark.sparkContext.setJobDescription(f"{benchmark} iter{iteration + 1}")
+
+ start_time = time.time()
+
+ repartitioned = _repartition(df, benchmark, num_partitions)
+ output_path = f"/tmp/shuffle-benchmark-output-{mode}-{benchmark}"
+ repartitioned.write.mode("overwrite").parquet(output_path)
+ print(f"Wrote repartitioned data to: {output_path}")
+
+ duration_ms = int((time.time() - start_time) * 1000)
+ print(f"Duration: {duration_ms:,} ms")
+
+ results.append({
+ "duration_ms": duration_ms,
+ "row_count": row_count,
+ "num_partitions": num_partitions,
+ })
+
+ return {benchmark: results}
+
+
+def build_results(
+ spark: SparkSession,
+ benchmark: str,
+ data_path: str,
+ mode: str,
+ name: str,
+ timings: Dict[str, List[Dict[str, Any]]],
+) -> Dict:
+ """Assemble the result dict for shuffle benchmarks."""
+ conf_dict = {k: v for k, v in spark.sparkContext.getConf().getAll()}
+
+ return {
+ "engine": "datafusion-comet",
+ "benchmark": benchmark,
+ "mode": mode,
+ "data_path": data_path,
+ "spark_conf": conf_dict,
+ **timings,
+ }
+
+
+def write_results(
+ results: Dict,
+ output_dir: str,
+ name: str,
+ benchmark: str,
+) -> str:
+ """Write JSON results file. Returns the path written."""
+ result_str = json.dumps(results, indent=4)
+ current_time_millis = int(datetime.now().timestamp() * 1000)
+ results_path = f"{output_dir}/{name}-{benchmark}-{current_time_millis}.json"
+ print(f"\nWriting results to {results_path}")
+ with open(results_path, "w") as f:
+ f.write(result_str)
+ return results_path
diff --git a/benchmarks/suites/tpc.py b/benchmarks/suites/tpc.py
new file mode 100644
index 0000000000..19c020a135
--- /dev/null
+++ b/benchmarks/suites/tpc.py
@@ -0,0 +1,215 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""TPC-H / TPC-DS benchmark suite."""
+
+import json
+import time
+from datetime import datetime
+from typing import Dict, List, Optional
+
+from pyspark.sql import DataFrame, SparkSession
+
+
+# Table definitions per benchmark
+TPCH_TABLES = [
+ "customer", "lineitem", "nation", "orders",
+ "part", "partsupp", "region", "supplier",
+]
+
+TPCDS_TABLES = [
+ "call_center", "catalog_page", "catalog_returns", "catalog_sales",
+ "customer", "customer_address", "customer_demographics", "date_dim",
+ "time_dim", "household_demographics", "income_band", "inventory",
+ "item", "promotion", "reason", "ship_mode", "store", "store_returns",
+ "store_sales", "warehouse", "web_page", "web_returns", "web_sales",
+ "web_site",
+]
+
+BENCHMARK_META = {
+ "tpch": {"num_queries": 22, "tables": TPCH_TABLES},
+ "tpcds": {"num_queries": 99, "tables": TPCDS_TABLES},
+}
+
+
+def dedup_columns(df: DataFrame) -> DataFrame:
+ """Rename duplicate column aliases: a, a, b, b -> a, a_1, b, b_1."""
+ counts: Dict[str, int] = {}
+ new_cols: List[str] = []
+ for c in df.columns:
+ if c not in counts:
+ counts[c] = 0
+ new_cols.append(c)
+ else:
+ counts[c] += 1
+ new_cols.append(f"{c}_{counts[c]}")
+ return df.toDF(*new_cols)
+
+
+def register_tables(
+ spark: SparkSession,
+ benchmark: str,
+ data_path: Optional[str],
+ catalog: Optional[str],
+ database: str,
+ file_format: str,
+ reader_options: Optional[Dict[str, str]],
+) -> bool:
+ """Register TPC tables as temp views.
+
+ Returns True when using Iceberg catalog, False for file-based tables.
+ """
+ if benchmark not in BENCHMARK_META:
+ raise ValueError(f"Invalid benchmark: {benchmark}")
+ tables = BENCHMARK_META[benchmark]["tables"]
+ using_iceberg = catalog is not None
+
+ for table in tables:
+ if using_iceberg:
+ source = f"{catalog}.{database}.{table}"
+ print(f"Registering table {table} from {source}")
+ df = spark.table(source)
+ else:
+ source = f"{data_path}/{table}.{file_format}"
+ print(f"Registering table {table} from {source}")
+ df = spark.read.format(file_format).options(**(reader_options or {})).load(source)
+ df.createOrReplaceTempView(table)
+
+ return using_iceberg
+
+
+def run_queries(
+ spark: SparkSession,
+ benchmark: str,
+ query_path: str,
+ iterations: int,
+ query_num: Optional[int] = None,
+ write_path: Optional[str] = None,
+) -> Dict[int, List[float]]:
+ """Execute TPC queries and return {query_num: [elapsed_secs_per_iter]}."""
+ meta = BENCHMARK_META[benchmark]
+ num_queries = meta["num_queries"]
+ results: Dict[int, List[float]] = {}
+
+ for iteration in range(iterations):
+ print(f"\n{'=' * 60}")
+ print(f"Starting iteration {iteration + 1} of {iterations}")
+ print(f"{'=' * 60}")
+ iter_start_time = time.time()
+
+ if query_num is not None:
+ if query_num < 1 or query_num > num_queries:
+ raise ValueError(
+ f"Query number {query_num} out of range. "
+ f"Valid: 1-{num_queries} for {benchmark}"
+ )
+ queries_to_run = [query_num]
+ else:
+ queries_to_run = range(1, num_queries + 1)
+
+ for query in queries_to_run:
+ spark.sparkContext.setJobDescription(f"{benchmark} q{query}")
+ path = f"{query_path}/q{query}.sql"
+ print(f"\nRunning query {query} from {path}")
+
+ with open(path, "r") as f:
+ text = f.read()
+ queries_sql = text.split(";")
+
+ start_time = time.time()
+ for sql in queries_sql:
+ sql = sql.strip().replace("create view", "create temp view")
+ if len(sql) > 0:
+ print(f"Executing: {sql[:100]}...")
+ df = spark.sql(sql)
+ df.explain("formatted")
+
+ if write_path is not None:
+ if len(df.columns) > 0:
+ output_path = f"{write_path}/q{query}"
+ deduped = dedup_columns(df)
+ deduped.orderBy(*deduped.columns).coalesce(1).write.mode(
+ "overwrite"
+ ).parquet(output_path)
+ print(f"Results written to {output_path}")
+ else:
+ rows = df.collect()
+ print(f"Query {query} returned {len(rows)} rows")
+
+ end_time = time.time()
+ elapsed = end_time - start_time
+ print(f"Query {query} took {elapsed:.2f} seconds")
+
+ results.setdefault(query, []).append(elapsed)
+
+ iter_end_time = time.time()
+ print(
+ f"\nIteration {iteration + 1} took "
+ f"{iter_end_time - iter_start_time:.2f} seconds"
+ )
+
+ return results
+
+
+def build_results(
+ spark: SparkSession,
+ benchmark: str,
+ query_path: str,
+ data_path: Optional[str],
+ catalog: Optional[str],
+ database: str,
+ using_iceberg: bool,
+ name: str,
+ timings: Dict[int, List[float]],
+) -> Dict:
+ """Assemble the result dict with the same schema as tpcbench.py."""
+ conf_dict = {k: v for k, v in spark.sparkContext.getConf().getAll()}
+
+ results: Dict = {
+ "engine": "datafusion-comet",
+ "benchmark": benchmark,
+ "query_path": query_path,
+ "spark_conf": conf_dict,
+ }
+ if using_iceberg:
+ results["catalog"] = catalog
+ results["database"] = database
+ else:
+ results["data_path"] = data_path
+
+    # Integer query keys are serialised as strings by json.dumps, matching
+    # the string-keyed format that the comparison tooling looks up via str(query).
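+    # e.g. json.dumps({1: [2.5]}) == '{"1": [2.5]}'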
+ for query, elapsed_list in timings.items():
+ results[query] = elapsed_list
+
+ return results
+
+
+def write_results(
+ results: Dict,
+ output_dir: str,
+ name: str,
+ benchmark: str,
+) -> str:
+ """Write JSON results file. Returns the path written."""
+ result_str = json.dumps(results, indent=4)
+ current_time_millis = int(datetime.now().timestamp() * 1000)
+ results_path = f"{output_dir}/{name}-{benchmark}-{current_time_millis}.json"
+ print(f"\nWriting results to {results_path}")
+ with open(results_path, "w") as f:
+ f.write(result_str)
+ return results_path
diff --git a/dev/benchmarks/.gitignore b/dev/benchmarks/.gitignore
deleted file mode 100644
index 477aaef0c3..0000000000
--- a/dev/benchmarks/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*.json
-*.png
\ No newline at end of file
diff --git a/dev/benchmarks/README.md b/dev/benchmarks/README.md
deleted file mode 100644
index b3ea674199..0000000000
--- a/dev/benchmarks/README.md
+++ /dev/null
@@ -1,151 +0,0 @@
-
-
-# Comet Benchmarking Scripts
-
-This directory contains scripts used for generating benchmark results that are published in this repository and in
-the Comet documentation.
-
-For full instructions on running these benchmarks on an EC2 instance, see the [Comet Benchmarking on EC2 Guide].
-
-[Comet Benchmarking on EC2 Guide]: https://datafusion.apache.org/comet/contributor-guide/benchmarking_aws_ec2.html
-
-## Example usage
-
-Set Spark environment variables:
-
-```shell
-export SPARK_HOME=/opt/spark-3.5.3-bin-hadoop3/
-export SPARK_MASTER=spark://yourhostname:7077
-```
-
-Set path to queries and data:
-
-```shell
-export TPCH_QUERIES=/mnt/bigdata/tpch/queries/
-export TPCH_DATA=/mnt/bigdata/tpch/sf100/
-```
-
-Run Spark benchmark:
-
-```shell
-export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
-sudo ./drop-caches.sh
-./spark-tpch.sh
-```
-
-Run Comet benchmark:
-
-```shell
-export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
-export COMET_JAR=/opt/comet/comet-spark-spark3.5_2.12-0.10.0.jar
-sudo ./drop-caches.sh
-./comet-tpch.sh
-```
-
-Run Gluten benchmark:
-
-```shell
-export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
-export GLUTEN_JAR=/opt/gluten/gluten-velox-bundle-spark3.5_2.12-linux_amd64-1.4.0.jar
-sudo ./drop-caches.sh
-./gluten-tpch.sh
-```
-
-Generating charts:
-
-```shell
-python3 generate-comparison.py --benchmark tpch --labels "Spark 3.5.3" "Comet 0.9.0" "Gluten 1.4.0" --title "TPC-H @ 100 GB (single executor, 8 cores, local Parquet files)" spark-tpch-1752338506381.json comet-tpch-1752337818039.json gluten-tpch-1752337474344.json
-```
-
-## Iceberg Benchmarking
-
-Comet includes native Iceberg support via iceberg-rust integration. This enables benchmarking TPC-H queries
-against Iceberg tables with native scan acceleration.
-
-### Prerequisites
-
-Download the Iceberg Spark runtime JAR (required for running the benchmark):
-
-```shell
-wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.8.1/iceberg-spark-runtime-3.5_2.12-1.8.1.jar
-export ICEBERG_JAR=/path/to/iceberg-spark-runtime-3.5_2.12-1.8.1.jar
-```
-
-Note: Table creation uses `--packages` which auto-downloads the dependency.
-
-### Create Iceberg TPC-H tables
-
-Convert existing Parquet TPC-H data to Iceberg format:
-
-```shell
-export ICEBERG_WAREHOUSE=/mnt/bigdata/iceberg-warehouse
-export ICEBERG_CATALOG=${ICEBERG_CATALOG:-local}
-
-$SPARK_HOME/bin/spark-submit \
- --master $SPARK_MASTER \
- --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1 \
- --conf spark.driver.memory=8G \
- --conf spark.executor.instances=1 \
- --conf spark.executor.cores=8 \
- --conf spark.cores.max=8 \
- --conf spark.executor.memory=16g \
- --conf spark.sql.catalog.${ICEBERG_CATALOG}=org.apache.iceberg.spark.SparkCatalog \
- --conf spark.sql.catalog.${ICEBERG_CATALOG}.type=hadoop \
- --conf spark.sql.catalog.${ICEBERG_CATALOG}.warehouse=$ICEBERG_WAREHOUSE \
- create-iceberg-tpch.py \
- --parquet-path $TPCH_DATA \
- --catalog $ICEBERG_CATALOG \
- --database tpch
-```
-
-### Run Iceberg benchmark
-
-```shell
-export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
-export COMET_JAR=/opt/comet/comet-spark-spark3.5_2.12-0.10.0.jar
-export ICEBERG_JAR=/path/to/iceberg-spark-runtime-3.5_2.12-1.8.1.jar
-export ICEBERG_WAREHOUSE=/mnt/bigdata/iceberg-warehouse
-export TPCH_QUERIES=/mnt/bigdata/tpch/queries/
-sudo ./drop-caches.sh
-./comet-tpch-iceberg.sh
-```
-
-The benchmark uses `spark.comet.scan.icebergNative.enabled=true` to enable Comet's native iceberg-rust
-integration. Verify native scanning is active by checking for `CometIcebergNativeScanExec` in the
-physical plan output.
-
-### Iceberg-specific options
-
-| Environment Variable | Default | Description |
-| -------------------- | ---------- | ----------------------------------- |
-| `ICEBERG_CATALOG` | `local` | Iceberg catalog name |
-| `ICEBERG_DATABASE` | `tpch` | Database containing TPC-H tables |
-| `ICEBERG_WAREHOUSE` | (required) | Path to Iceberg warehouse directory |
-
-### Comparing Parquet vs Iceberg performance
-
-Run both benchmarks and compare:
-
-```shell
-python3 generate-comparison.py --benchmark tpch \
- --labels "Comet (Parquet)" "Comet (Iceberg)" \
- --title "TPC-H @ 100 GB: Parquet vs Iceberg" \
- comet-tpch-*.json comet-iceberg-tpch-*.json
-```
diff --git a/dev/benchmarks/blaze-tpcds.sh b/dev/benchmarks/blaze-tpcds.sh
deleted file mode 100755
index 90a4a48468..0000000000
--- a/dev/benchmarks/blaze-tpcds.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-$SPARK_HOME/sbin/stop-master.sh
-$SPARK_HOME/sbin/stop-worker.sh
-
-$SPARK_HOME/sbin/start-master.sh
-$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
-
-$SPARK_HOME/bin/spark-submit \
- --master $SPARK_MASTER \
- --jars $BLAZE_JAR \
- --driver-class-path $BLAZE_JAR \
- --conf spark.driver.memory=8G \
- --conf spark.executor.instances=2 \
- --conf spark.executor.cores=8 \
- --conf spark.cores.max=16 \
- --conf spark.executor.memory=16g \
- --conf spark.executor.memoryOverhead=16g \
- --conf spark.memory.offHeap.enabled=false \
- --conf spark.eventLog.enabled=true \
- --conf spark.driver.extraClassPath=$BLAZE_JAR \
- --conf spark.executor.extraClassPath=$BLAZE_JAR \
- --conf spark.sql.extensions=org.apache.spark.sql.blaze.BlazeSparkSessionExtension \
- --conf spark.shuffle.manager=org.apache.spark.sql.execution.blaze.shuffle.BlazeShuffleManager \
- --conf spark.blaze.enable=true \
- --conf spark.blaze.forceShuffledHashJoin=true \
- --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
- --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
- tpcbench.py \
- --name blaze \
- --benchmark tpcds \
- --data $TPCDS_DATA \
- --queries $TPCDS_QUERIES \
- --output . \
- --iterations 1
diff --git a/dev/benchmarks/blaze-tpch.sh b/dev/benchmarks/blaze-tpch.sh
deleted file mode 100755
index 2c6878737d..0000000000
--- a/dev/benchmarks/blaze-tpch.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-$SPARK_HOME/sbin/stop-master.sh
-$SPARK_HOME/sbin/stop-worker.sh
-
-$SPARK_HOME/sbin/start-master.sh
-$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
-
-$SPARK_HOME/bin/spark-submit \
- --master $SPARK_MASTER \
- --jars $BLAZE_JAR \
- --driver-class-path $BLAZE_JAR \
- --conf spark.driver.memory=8G \
- --conf spark.executor.instances=1 \
- --conf spark.executor.cores=8 \
- --conf spark.cores.max=8 \
- --conf spark.executor.memory=16g \
- --conf spark.executor.memoryOverhead=16g \
- --conf spark.memory.offHeap.enabled=false \
- --conf spark.eventLog.enabled=true \
- --conf spark.driver.extraClassPath=$BLAZE_JAR \
- --conf spark.executor.extraClassPath=$BLAZE_JAR \
- --conf spark.sql.extensions=org.apache.spark.sql.blaze.BlazeSparkSessionExtension \
- --conf spark.shuffle.manager=org.apache.spark.sql.execution.blaze.shuffle.BlazeShuffleManager \
- --conf spark.blaze.enable=true \
- --conf spark.blaze.forceShuffledHashJoin=true \
- --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
- --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
- tpcbench.py \
- --name blaze \
- --benchmark tpch \
- --data $TPCH_DATA \
- --queries $TPCH_QUERIES \
- --output . \
- --iterations 1
diff --git a/dev/benchmarks/comet-tpcds.sh b/dev/benchmarks/comet-tpcds.sh
deleted file mode 100755
index b55b27188c..0000000000
--- a/dev/benchmarks/comet-tpcds.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-$SPARK_HOME/sbin/stop-master.sh
-$SPARK_HOME/sbin/stop-worker.sh
-
-$SPARK_HOME/sbin/start-master.sh
-$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
-
-$SPARK_HOME/bin/spark-submit \
- --master $SPARK_MASTER \
- --jars $COMET_JAR \
- --driver-class-path $COMET_JAR \
- --conf spark.driver.memory=8G \
- --conf spark.executor.instances=2 \
- --conf spark.executor.cores=8 \
- --conf spark.cores.max=16 \
- --conf spark.executor.memory=16g \
- --conf spark.memory.offHeap.enabled=true \
- --conf spark.memory.offHeap.size=16g \
- --conf spark.eventLog.enabled=true \
- --conf spark.driver.extraClassPath=$COMET_JAR \
- --conf spark.executor.extraClassPath=$COMET_JAR \
- --conf spark.plugins=org.apache.spark.CometPlugin \
- --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
- --conf spark.comet.scan.impl=native_datafusion \
- --conf spark.comet.expression.Cast.allowIncompatible=true \
- --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
- --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
- tpcbench.py \
- --name comet \
- --benchmark tpcds \
- --data $TPCDS_DATA \
- --queries $TPCDS_QUERIES \
- --output . \
- --iterations 1
diff --git a/dev/benchmarks/comet-tpch-iceberg.sh b/dev/benchmarks/comet-tpch-iceberg.sh
deleted file mode 100755
index 7907125c82..0000000000
--- a/dev/benchmarks/comet-tpch-iceberg.sh
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-# TPC-H benchmark using Iceberg tables with Comet's native iceberg-rust integration.
-#
-# Required environment variables:
-# SPARK_HOME - Path to Spark installation
-# SPARK_MASTER - Spark master URL (e.g., spark://localhost:7077)
-# COMET_JAR - Path to Comet JAR
-# ICEBERG_JAR - Path to Iceberg Spark runtime JAR
-# ICEBERG_WAREHOUSE - Path to Iceberg warehouse directory
-# TPCH_QUERIES - Path to TPC-H query files
-#
-# Optional:
-# ICEBERG_CATALOG - Catalog name (default: local)
-# ICEBERG_DATABASE - Database name (default: tpch)
-#
-# Setup (run once to create Iceberg tables from Parquet):
-# $SPARK_HOME/bin/spark-submit \
-# --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1 \
-# --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \
-# --conf spark.sql.catalog.local.type=hadoop \
-# --conf spark.sql.catalog.local.warehouse=$ICEBERG_WAREHOUSE \
-# create-iceberg-tpch.py \
-# --parquet-path $TPCH_DATA \
-# --catalog local \
-# --database tpch
-
-set -e
-
-# Defaults
-ICEBERG_CATALOG=${ICEBERG_CATALOG:-local}
-ICEBERG_DATABASE=${ICEBERG_DATABASE:-tpch}
-
-# Validate required variables
-if [ -z "$SPARK_HOME" ]; then
- echo "Error: SPARK_HOME is not set"
- exit 1
-fi
-if [ -z "$COMET_JAR" ]; then
- echo "Error: COMET_JAR is not set"
- exit 1
-fi
-if [ -z "$ICEBERG_JAR" ]; then
- echo "Error: ICEBERG_JAR is not set"
- echo "Download from: https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.8.1/"
- exit 1
-fi
-if [ -z "$ICEBERG_WAREHOUSE" ]; then
- echo "Error: ICEBERG_WAREHOUSE is not set"
- exit 1
-fi
-if [ -z "$TPCH_QUERIES" ]; then
- echo "Error: TPCH_QUERIES is not set"
- exit 1
-fi
-
-$SPARK_HOME/sbin/stop-master.sh 2>/dev/null || true
-$SPARK_HOME/sbin/stop-worker.sh 2>/dev/null || true
-
-$SPARK_HOME/sbin/start-master.sh
-$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
-
-$SPARK_HOME/bin/spark-submit \
- --master $SPARK_MASTER \
- --jars $COMET_JAR,$ICEBERG_JAR \
- --driver-class-path $COMET_JAR:$ICEBERG_JAR \
- --conf spark.driver.memory=8G \
- --conf spark.executor.instances=1 \
- --conf spark.executor.cores=8 \
- --conf spark.cores.max=8 \
- --conf spark.executor.memory=16g \
- --conf spark.memory.offHeap.enabled=true \
- --conf spark.memory.offHeap.size=16g \
- --conf spark.eventLog.enabled=true \
- --conf spark.driver.extraClassPath=$COMET_JAR:$ICEBERG_JAR \
- --conf spark.executor.extraClassPath=$COMET_JAR:$ICEBERG_JAR \
- --conf spark.plugins=org.apache.spark.CometPlugin \
- --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
- --conf spark.comet.exec.replaceSortMergeJoin=true \
- --conf spark.comet.expression.Cast.allowIncompatible=true \
- --conf spark.comet.enabled=true \
- --conf spark.comet.exec.enabled=true \
- --conf spark.comet.scan.icebergNative.enabled=true \
- --conf spark.comet.explainFallback.enabled=true \
- --conf spark.sql.catalog.${ICEBERG_CATALOG}=org.apache.iceberg.spark.SparkCatalog \
- --conf spark.sql.catalog.${ICEBERG_CATALOG}.type=hadoop \
- --conf spark.sql.catalog.${ICEBERG_CATALOG}.warehouse=$ICEBERG_WAREHOUSE \
- --conf spark.sql.defaultCatalog=${ICEBERG_CATALOG} \
- tpcbench.py \
- --name comet-iceberg \
- --benchmark tpch \
- --catalog $ICEBERG_CATALOG \
- --database $ICEBERG_DATABASE \
- --queries $TPCH_QUERIES \
- --output . \
- --iterations 1
diff --git a/dev/benchmarks/comet-tpch.sh b/dev/benchmarks/comet-tpch.sh
deleted file mode 100755
index a748a02319..0000000000
--- a/dev/benchmarks/comet-tpch.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-$SPARK_HOME/sbin/stop-master.sh
-$SPARK_HOME/sbin/stop-worker.sh
-
-$SPARK_HOME/sbin/start-master.sh
-$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
-
-$SPARK_HOME/bin/spark-submit \
- --master $SPARK_MASTER \
- --jars $COMET_JAR \
- --driver-class-path $COMET_JAR \
- --conf spark.driver.memory=8G \
- --conf spark.executor.instances=1 \
- --conf spark.executor.cores=8 \
- --conf spark.cores.max=8 \
- --conf spark.executor.memory=16g \
- --conf spark.memory.offHeap.enabled=true \
- --conf spark.memory.offHeap.size=16g \
- --conf spark.eventLog.enabled=true \
- --conf spark.driver.extraClassPath=$COMET_JAR \
- --conf spark.executor.extraClassPath=$COMET_JAR \
- --conf spark.plugins=org.apache.spark.CometPlugin \
- --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
- --conf spark.comet.scan.impl=native_datafusion \
- --conf spark.comet.exec.replaceSortMergeJoin=true \
- --conf spark.comet.expression.Cast.allowIncompatible=true \
- --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
- --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
- tpcbench.py \
- --name comet \
- --benchmark tpch \
- --data $TPCH_DATA \
- --queries $TPCH_QUERIES \
- --output . \
- --iterations 1 \
- --format parquet
diff --git a/dev/benchmarks/generate-comparison.py b/dev/benchmarks/generate-comparison.py
deleted file mode 100644
index eb57cc1e45..0000000000
--- a/dev/benchmarks/generate-comparison.py
+++ /dev/null
@@ -1,229 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import json
-import matplotlib.pyplot as plt
-import numpy as np
-
-def geomean(data):
- return np.prod(data) ** (1 / len(data))
-
-def generate_query_rel_speedup_chart(baseline, comparison, label1: str, label2: str, benchmark: str, title: str):
- results = []
- for query in range(1, query_count(benchmark)+1):
- if query == 999:
- continue
- a = np.median(np.array(baseline[str(query)]))
- b = np.median(np.array(comparison[str(query)]))
- if a > b:
- speedup = a/b-1
- else:
- speedup = -(1/(a/b)-1)
- results.append(("q" + str(query), round(speedup*100, 0)))
-
- results = sorted(results, key=lambda x: -x[1])
-
- queries, speedups = zip(*results)
-
- # Create figure and axis
- if benchmark == "tpch":
- fig, ax = plt.subplots(figsize=(10, 6))
- else:
- fig, ax = plt.subplots(figsize=(35, 10))
-
- # Create bar chart
- bars = ax.bar(queries, speedups, color='skyblue')
-
- # Add text annotations
- for bar, speedup in zip(bars, speedups):
- yval = bar.get_height()
- if yval >= 0:
- ax.text(bar.get_x() + bar.get_width() / 2.0, min(800, yval+5), f'{yval:.0f}%', va='bottom', ha='center', fontsize=8,
- color='blue', rotation=90)
- else:
- ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval:.0f}%', va='top', ha='center', fontsize=8,
- color='blue', rotation=90)
-
- # Add title and labels
- ax.set_title(label2 + " speedup over " + label1 + " (" + title + ")")
- ax.set_ylabel('Speedup Percentage (100% speedup = 2x faster)')
- ax.set_xlabel('Query')
-
- # Customize the y-axis to handle both positive and negative values better
- ax.axhline(0, color='black', linewidth=0.8)
- min_value = (min(speedups) // 100) * 100
- max_value = ((max(speedups) // 100) + 1) * 100 + 50
- if benchmark == "tpch":
- ax.set_ylim(min_value, max_value)
- else:
- # TODO improve this
- ax.set_ylim(-250, 300)
-
- # Show grid for better readability
- ax.yaxis.grid(True)
-
- # Save the plot as an image file
- plt.savefig(f'{benchmark}_queries_speedup_rel.png', format='png')
-
-def generate_query_abs_speedup_chart(baseline, comparison, label1: str, label2: str, benchmark: str, title: str):
- results = []
- for query in range(1, query_count(benchmark)+1):
- if query == 999:
- continue
- a = np.median(np.array(baseline[str(query)]))
- b = np.median(np.array(comparison[str(query)]))
- speedup = a-b
- results.append(("q" + str(query), round(speedup, 1)))
-
- results = sorted(results, key=lambda x: -x[1])
-
- queries, speedups = zip(*results)
-
- # Create figure and axis
- if benchmark == "tpch":
- fig, ax = plt.subplots(figsize=(10, 6))
- else:
- fig, ax = plt.subplots(figsize=(35, 10))
-
- # Create bar chart
- bars = ax.bar(queries, speedups, color='skyblue')
-
- # Add text annotations
- for bar, speedup in zip(bars, speedups):
- yval = bar.get_height()
- if yval >= 0:
- ax.text(bar.get_x() + bar.get_width() / 2.0, min(800, yval+5), f'{yval:.1f}', va='bottom', ha='center', fontsize=8,
- color='blue', rotation=90)
- else:
- ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval:.1f}', va='top', ha='center', fontsize=8,
- color='blue', rotation=90)
-
- # Add title and labels
- ax.set_title(label2 + " speedup over " + label1 + " (" + title + ")")
- ax.set_ylabel('Speedup (in seconds)')
- ax.set_xlabel('Query')
-
- # Customize the y-axis to handle both positive and negative values better
- ax.axhline(0, color='black', linewidth=0.8)
- min_value = min(speedups) * 2 - 20
- max_value = max(speedups) * 1.5
- ax.set_ylim(min_value, max_value)
-
- # Show grid for better readability
- ax.yaxis.grid(True)
-
- # Save the plot as an image file
- plt.savefig(f'{benchmark}_queries_speedup_abs.png', format='png')
-
-def generate_query_comparison_chart(results, labels, benchmark: str, title: str):
- queries = []
- benches = []
- for _ in results:
- benches.append([])
- for query in range(1, query_count(benchmark)+1):
- if query == 999:
- continue
- queries.append("q" + str(query))
- for i in range(0, len(results)):
- benches[i].append(np.median(np.array(results[i][str(query)])))
-
- # Define the width of the bars
- bar_width = 0.3
-
- # Define the positions of the bars on the x-axis
- index = np.arange(len(queries)) * 1.5
-
- # Create a bar chart
- if benchmark == "tpch":
- fig, ax = plt.subplots(figsize=(15, 6))
- else:
- fig, ax = plt.subplots(figsize=(35, 6))
-
- for i in range(0, len(results)):
- bar = ax.bar(index + i * bar_width, benches[i], bar_width, label=labels[i])
-
- # Add labels, title, and legend
- ax.set_title(title)
- ax.set_xlabel('Queries')
- ax.set_ylabel('Query Time (seconds)')
- ax.set_xticks(index + bar_width / 2)
- ax.set_xticklabels(queries)
- ax.legend()
-
- # Save the plot as an image file
- plt.savefig(f'{benchmark}_queries_compare.png', format='png')
-
-def generate_summary(results, labels, benchmark: str, title: str):
- timings = []
- for _ in results:
- timings.append(0)
-
- num_queries = query_count(benchmark)
- for query in range(1, num_queries + 1):
- if query == 999:
- continue
- for i in range(0, len(results)):
- timings[i] += np.median(np.array(results[i][str(query)]))
-
- # Create figure and axis
- fig, ax = plt.subplots()
- fig.set_size_inches(10, 6)
-
- # Add title and labels
- ax.set_title(title)
- ax.set_ylabel(f'Time in seconds to run all {num_queries} {benchmark} queries (lower is better)')
-
- times = [round(x,0) for x in timings]
-
- # Create bar chart
- bars = ax.bar(labels, times, color='skyblue', width=0.8)
-
- # Add text annotations
- for bar in bars:
- yval = bar.get_height()
- ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom') # va: vertical alignment
-
- plt.savefig(f'{benchmark}_allqueries.png', format='png')
-
-def query_count(benchmark: str):
- if benchmark == "tpch":
- return 22
- elif benchmark == "tpcds":
- return 99
- else:
- raise "invalid benchmark name"
-
-def main(files, labels, benchmark: str, title: str):
- results = []
- for filename in files:
- with open(filename) as f:
- results.append(json.load(f))
- generate_summary(results, labels, benchmark, title)
- generate_query_comparison_chart(results, labels, benchmark, title)
- if len(files) == 2:
- generate_query_abs_speedup_chart(results[0], results[1], labels[0], labels[1], benchmark, title)
- generate_query_rel_speedup_chart(results[0], results[1], labels[0], labels[1], benchmark, title)
-
-if __name__ == '__main__':
- argparse = argparse.ArgumentParser(description='Generate comparison')
- argparse.add_argument('filenames', nargs='+', type=str, help='JSON result files')
- argparse.add_argument('--labels', nargs='+', type=str, help='Labels')
- argparse.add_argument('--benchmark', type=str, help='Benchmark name (tpch or tpcds)')
- argparse.add_argument('--title', type=str, help='Chart title')
- args = argparse.parse_args()
- main(args.filenames, args.labels, args.benchmark, args.title)
diff --git a/dev/benchmarks/gluten-tpcds.sh b/dev/benchmarks/gluten-tpcds.sh
deleted file mode 100755
index 7c475c79c0..0000000000
--- a/dev/benchmarks/gluten-tpcds.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-export TZ=UTC
-
-$SPARK_HOME/sbin/stop-master.sh
-$SPARK_HOME/sbin/stop-worker.sh
-
-$SPARK_HOME/sbin/start-master.sh
-$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
-
-$SPARK_HOME/bin/spark-submit \
- --master $SPARK_MASTER \
- --conf spark.driver.memory=8G \
- --conf spark.executor.instances=2 \
- --conf spark.executor.memory=16G \
- --conf spark.executor.cores=8 \
- --conf spark.cores.max=16 \
- --conf spark.eventLog.enabled=true \
- --jars $GLUTEN_JAR \
- --conf spark.plugins=org.apache.gluten.GlutenPlugin \
- --conf spark.driver.extraClassPath=${GLUTEN_JAR} \
- --conf spark.executor.extraClassPath=${GLUTEN_JAR} \
- --conf spark.memory.offHeap.enabled=true \
- --conf spark.memory.offHeap.size=16g \
- --conf spark.gluten.sql.columnar.forceShuffledHashJoin=true \
- --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \
- --conf spark.sql.session.timeZone=UTC \
- --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
- --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
- tpcbench.py \
- --name gluten \
- --benchmark tpcds \
- --data $TPCDS_DATA \
- --queries $TPCDS_QUERIES \
- --output . \
- --iterations 1
diff --git a/dev/benchmarks/gluten-tpch.sh b/dev/benchmarks/gluten-tpch.sh
deleted file mode 100755
index 46c3ed7527..0000000000
--- a/dev/benchmarks/gluten-tpch.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-export TZ=UTC
-
-$SPARK_HOME/sbin/stop-master.sh
-$SPARK_HOME/sbin/stop-worker.sh
-
-$SPARK_HOME/sbin/start-master.sh
-$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
-
-$SPARK_HOME/bin/spark-submit \
- --master $SPARK_MASTER \
- --conf spark.driver.memory=8G \
- --conf spark.executor.instances=1 \
- --conf spark.executor.memory=16G \
- --conf spark.executor.cores=8 \
- --conf spark.cores.max=8 \
- --conf spark.eventLog.enabled=true \
- --jars $GLUTEN_JAR \
- --conf spark.plugins=org.apache.gluten.GlutenPlugin \
- --conf spark.driver.extraClassPath=${GLUTEN_JAR} \
- --conf spark.executor.extraClassPath=${GLUTEN_JAR} \
- --conf spark.memory.offHeap.enabled=true \
- --conf spark.memory.offHeap.size=16g \
- --conf spark.gluten.sql.columnar.forceShuffledHashJoin=true \
- --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \
- --conf spark.sql.session.timeZone=UTC \
- --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
- --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
- tpcbench.py \
- --name gluten \
- --benchmark tpch \
- --data $TPCH_DATA \
- --queries $TPCH_QUERIES \
- --output . \
- --iterations 1
diff --git a/dev/benchmarks/spark-tpcds.sh b/dev/benchmarks/spark-tpcds.sh
deleted file mode 100755
index dad079ba23..0000000000
--- a/dev/benchmarks/spark-tpcds.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-$SPARK_HOME/sbin/stop-master.sh
-$SPARK_HOME/sbin/stop-worker.sh
-
-$SPARK_HOME/sbin/start-master.sh
-$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
-
-$SPARK_HOME/bin/spark-submit \
- --master $SPARK_MASTER \
- --conf spark.driver.memory=8G \
- --conf spark.executor.instances=2 \
- --conf spark.executor.cores=8 \
- --conf spark.cores.max=16 \
- --conf spark.executor.memory=16g \
- --conf spark.memory.offHeap.enabled=true \
- --conf spark.memory.offHeap.size=16g \
- --conf spark.eventLog.enabled=true \
- --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
- --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
- tpcbench.py \
- --name spark \
- --benchmark tpcds \
- --data $TPCDS_DATA \
- --queries $TPCDS_QUERIES \
- --output . \
- --iterations 1
diff --git a/dev/benchmarks/spark-tpch.sh b/dev/benchmarks/spark-tpch.sh
deleted file mode 100755
index ae359f049f..0000000000
--- a/dev/benchmarks/spark-tpch.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-$SPARK_HOME/sbin/stop-master.sh
-$SPARK_HOME/sbin/stop-worker.sh
-
-$SPARK_HOME/sbin/start-master.sh
-$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
-
-$SPARK_HOME/bin/spark-submit \
- --master $SPARK_MASTER \
- --conf spark.driver.memory=8G \
- --conf spark.executor.instances=1 \
- --conf spark.executor.cores=8 \
- --conf spark.cores.max=8 \
- --conf spark.executor.memory=16g \
- --conf spark.memory.offHeap.enabled=true \
- --conf spark.memory.offHeap.size=16g \
- --conf spark.eventLog.enabled=true \
- --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
- --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.DefaultAWSCredentialsProviderChain \
- tpcbench.py \
- --name spark \
- --benchmark tpch \
- --data $TPCH_DATA \
- --queries $TPCH_QUERIES \
- --output . \
- --iterations 1 \
- --format parquet
diff --git a/dev/benchmarks/tpcbench.py b/dev/benchmarks/tpcbench.py
deleted file mode 100644
index 400ccd175a..0000000000
--- a/dev/benchmarks/tpcbench.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-TPC-H / TPC-DS benchmark runner.
-
-Supports two data sources:
- - Files: use --data with --format (parquet, csv, json) and optional --options
- - Iceberg tables: use --catalog and --database to specify the catalog location
-"""
-
-import argparse
-from datetime import datetime
-import json
-from pyspark.sql import SparkSession
-import time
-from typing import Dict
-
-
-def dedup_columns(df):
- """Rename duplicate column aliases: a, a, b, b -> a, a_1, b, b_1"""
- counts = {}
- new_cols = []
- for c in df.columns:
- if c not in counts:
- counts[c] = 0
- new_cols.append(c)
- else:
- counts[c] += 1
- new_cols.append(f"{c}_{counts[c]}")
- return df.toDF(*new_cols)
-
-
-def main(
- benchmark: str,
- data_path: str,
- catalog: str,
- database: str,
- query_path: str,
- iterations: int,
- output: str,
- name: str,
- format: str,
- query_num: int = None,
- write_path: str = None,
- options: Dict[str, str] = None
-):
- if options is None:
- options = {}
-
- spark = SparkSession.builder \
- .appName(f"{name} benchmark derived from {benchmark}") \
- .getOrCreate()
-
- # Define tables for each benchmark
- if benchmark == "tpch":
- num_queries = 22
- table_names = [
- "customer", "lineitem", "nation", "orders",
- "part", "partsupp", "region", "supplier"
- ]
- elif benchmark == "tpcds":
- num_queries = 99
- table_names = [
- "call_center", "catalog_page", "catalog_returns", "catalog_sales",
- "customer", "customer_address", "customer_demographics", "date_dim",
- "time_dim", "household_demographics", "income_band", "inventory",
- "item", "promotion", "reason", "ship_mode", "store", "store_returns",
- "store_sales", "warehouse", "web_page", "web_returns", "web_sales",
- "web_site"
- ]
- else:
- raise ValueError(f"Invalid benchmark: {benchmark}")
-
- # Register tables from either files or Iceberg catalog
- using_iceberg = catalog is not None
- for table in table_names:
- if using_iceberg:
- source = f"{catalog}.{database}.{table}"
- print(f"Registering table {table} from {source}")
- df = spark.table(source)
- else:
- source = f"{data_path}/{table}.{format}"
- print(f"Registering table {table} from {source}")
- df = spark.read.format(format).options(**options).load(source)
- df.createOrReplaceTempView(table)
-
- conf_dict = {k: v for k, v in spark.sparkContext.getConf().getAll()}
-
- results = {
- 'engine': 'datafusion-comet',
- 'benchmark': benchmark,
- 'query_path': query_path,
- 'spark_conf': conf_dict,
- }
- if using_iceberg:
- results['catalog'] = catalog
- results['database'] = database
- else:
- results['data_path'] = data_path
-
- for iteration in range(iterations):
- print(f"\n{'='*60}")
- print(f"Starting iteration {iteration + 1} of {iterations}")
- print(f"{'='*60}")
- iter_start_time = time.time()
-
- # Determine which queries to run
- if query_num is not None:
- if query_num < 1 or query_num > num_queries:
- raise ValueError(
- f"Query number {query_num} out of range. "
- f"Valid: 1-{num_queries} for {benchmark}"
- )
- queries_to_run = [query_num]
- else:
- queries_to_run = range(1, num_queries + 1)
-
- for query in queries_to_run:
- spark.sparkContext.setJobDescription(f"{benchmark} q{query}")
-
- path = f"{query_path}/q{query}.sql"
- print(f"\nRunning query {query} from {path}")
-
- with open(path, "r") as f:
- text = f.read()
- queries = text.split(";")
-
- start_time = time.time()
- for sql in queries:
- sql = sql.strip().replace("create view", "create temp view")
- if len(sql) > 0:
- print(f"Executing: {sql[:100]}...")
- df = spark.sql(sql)
- df.explain("formatted")
-
- if write_path is not None:
- if len(df.columns) > 0:
- output_path = f"{write_path}/q{query}"
- deduped = dedup_columns(df)
- deduped.orderBy(*deduped.columns).coalesce(1).write.mode("overwrite").parquet(output_path)
- print(f"Results written to {output_path}")
- else:
- rows = df.collect()
- print(f"Query {query} returned {len(rows)} rows")
-
- end_time = time.time()
- elapsed = end_time - start_time
- print(f"Query {query} took {elapsed:.2f} seconds")
-
- query_timings = results.setdefault(query, [])
- query_timings.append(elapsed)
-
- iter_end_time = time.time()
- print(f"\nIteration {iteration + 1} took {iter_end_time - iter_start_time:.2f} seconds")
-
- # Write results
- result_str = json.dumps(results, indent=4)
- current_time_millis = int(datetime.now().timestamp() * 1000)
- results_path = f"{output}/{name}-{benchmark}-{current_time_millis}.json"
- print(f"\nWriting results to {results_path}")
- with open(results_path, "w") as f:
- f.write(result_str)
-
- spark.stop()
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description="TPC-H/TPC-DS benchmark runner for files or Iceberg tables"
- )
- parser.add_argument(
- "--benchmark", required=True,
- help="Benchmark to run (tpch or tpcds)"
- )
-
- # Data source - mutually exclusive: either file path or Iceberg catalog
- source_group = parser.add_mutually_exclusive_group(required=True)
- source_group.add_argument(
- "--data",
- help="Path to data files"
- )
- source_group.add_argument(
- "--catalog",
- help="Iceberg catalog name"
- )
-
- # Options for file-based reading
- parser.add_argument(
- "--format", default="parquet",
- help="Input file format: parquet, csv, json (only used with --data)"
- )
- parser.add_argument(
- "--options", type=json.loads, default={},
- help='Spark reader options as JSON string, e.g., \'{"header": "true"}\' (only used with --data)'
- )
-
- # Options for Iceberg
- parser.add_argument(
- "--database", default="tpch",
- help="Database containing TPC tables (only used with --catalog)"
- )
-
- parser.add_argument(
- "--queries", required=True,
- help="Path to query SQL files"
- )
- parser.add_argument(
- "--iterations", type=int, default=1,
- help="Number of iterations"
- )
- parser.add_argument(
- "--output", required=True,
- help="Path to write results JSON"
- )
- parser.add_argument(
- "--name", required=True,
- help="Prefix for result file"
- )
- parser.add_argument(
- "--query", type=int,
- help="Specific query number (1-based). If omitted, run all."
- )
- parser.add_argument(
- "--write",
- help="Path to save query results as Parquet"
- )
- args = parser.parse_args()
-
- main(
- args.benchmark,
- args.data,
- args.catalog,
- args.database,
- args.queries,
- args.iterations,
- args.output,
- args.name,
- args.format,
- args.query,
- args.write,
- args.options
- )
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
index 4ea10c1dff..ee80a51eeb 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -28,3 +28,5 @@ spark/src/test/resources/tpch-extended/q*.sql
spark/src/test/resources/test-data/*.csv
spark/src/test/resources/test-data/*.ndjson
spark/inspections/CometTPC*results.txt
+benchmarks/queries/tpch/q*.sql
+benchmarks/queries/tpcds/q*.sql
diff --git a/pom.xml b/pom.xml
index 1b33fc4757..6ece20552d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1058,6 +1058,7 @@ under the License.
dev/deploy-file
**/test/resources/**
**/benchmarks/*.txt
+ benchmarks/queries/**/*.sql
**/inspections/*.txt
tpcds-kit/**
tpcds-sf-1/**