apache · Shekharrajak · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026
diff --git a/.github/actions/setup-builder/action.yaml b/.github/actions/setup-builder/action.yaml
@@ -32,9 +32,8 @@ runs:
     - name: Install Build Dependencies
       shell: bash
       run: |
-        apt-get update
-        apt-get install -y protobuf-compiler
-        apt-get install -y clang
+        sudo apt-get update
+        sudo apt-get install -y protobuf-compiler clang
 
     - name: Install JDK ${{inputs.jdk-version}}
       uses: actions/setup-java@v4

diff --git a/.github/workflows/k8s_benchmark.yml b/.github/workflows/k8s_benchmark.yml
@@ -0,0 +1,153 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: K8s Benchmark CI
+
+# Runs are grouped per repository, branch (or commit SHA when
+# github.head_ref is empty) and workflow; starting a new run in a group
+# cancels the one that is still in progress.
+concurrency:
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
+  cancel-in-progress: true
+
+on:
+  # The path filter below is commented out, so no path restriction is
+  # applied to pull_request events.
+  pull_request:
+    # paths:
+    #   - "native/**/*.rs"
+    #   - "spark/**/*.scala"
+    #   - "spark/**/*.java"
+  # Manual trigger; the inputs override the benchmark parameters in `env`.
+  workflow_dispatch:
+    inputs:
+      scale_factor:
+        description: 'TPC-H scale factor'
+        default: '1'
+      query:
+        description: 'TPC-H query'
+        default: 'q1'
+        type: choice
+        options: [q1, q6]
+      min_speedup:
+        description: 'Minimum speedup'
+        default: '1.1'
+
+# Workflow-wide environment variables, visible to every step.
+# TPCH_SCALE_FACTOR, TPCH_QUERY and MIN_SPEEDUP take the matching
+# workflow_dispatch input when it is set and otherwise the literal after `||`.
+env:
+  RUST_VERSION: stable
+  K8S_VERSION: "1.32.0"
+  SPARK_VERSION: "3.5"
+  SCALA_VERSION: "2.12"
+  TPCH_SCALE_FACTOR: ${{ github.event.inputs.scale_factor || '1' }}
+  TPCH_QUERY: ${{ github.event.inputs.query || 'q1' }}
+  MIN_SPEEDUP: ${{ github.event.inputs.min_speedup || '1.1' }}
+
+jobs:
+  # Builds Comet, runs one TPC-H query on a Kind cluster once with plain
+  # Spark and once with Comet, then compares the two timings.
+  k8s-benchmark:
+    name: K8s Benchmark
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Free disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet /usr/local/share/boost /opt/ghc
+          docker system prune -af || true
+
+      - name: Install K8s tools
+        run: |
+          # -f makes curl fail on an HTTP error instead of saving the error
+          # page, which would otherwise be installed as the binary.
+          curl -fLo ./kind "https://kind.sigs.k8s.io/dl/v0.26.0/kind-linux-amd64"
+          chmod +x ./kind && sudo mv ./kind /usr/local/bin/kind
+          # Pin kubectl to K8S_VERSION so the client does not drift with
+          # whatever the upstream "stable" release happens to be.
+          curl -fLO "https://dl.k8s.io/release/v${K8S_VERSION}/bin/linux/amd64/kubectl"
+          chmod +x ./kubectl && sudo mv ./kubectl /usr/local/bin/kubectl
+          curl -sSfL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+
+      - name: Setup Rust
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: ${{ env.RUST_VERSION }}
+          jdk-version: 17
+
+      - name: Cache Cargo
+        uses: actions/cache@v5
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            native/target
+          key: ${{ runner.os }}-cargo-bench-${{ hashFiles('native/**/Cargo.lock') }}
+
+      - name: Cache Maven
+        uses: actions/cache@v5
+        with:
+          path: ~/.m2/repository
+          key: ${{ runner.os }}-maven-bench-${{ hashFiles('**/pom.xml') }}
+
+      - name: Build Comet
+        run: make release PROFILES="-Pspark-${{ env.SPARK_VERSION }} -Pscala-${{ env.SCALA_VERSION }}"
+        timeout-minutes: 20
+
+      - name: Setup Kind cluster
+        run: ./hack/k8s-benchmark-setup.sh
+        timeout-minutes: 10
+
+      - name: Build benchmark image
+        run: |
+          COMET_JAR=$(find spark/target -name "comet-spark-spark${{ env.SPARK_VERSION }}_${{ env.SCALA_VERSION }}-*.jar" -not -name "*sources*" -not -name "*javadoc*" | head -1)
+          # Stop here with a clear message rather than letting docker build
+          # fail later on an empty COMET_JAR build argument.
+          if [ -z "$COMET_JAR" ]; then
+            echo "No Comet JAR found under spark/target" >&2
+            exit 1
+          fi
+          echo "Using Comet JAR: $COMET_JAR"
+          docker build -t comet-bench:local \
+            --build-arg COMET_JAR="$COMET_JAR" \
+            -f benchmarks/Dockerfile.k8s .
+          kind load docker-image comet-bench:local --name comet-bench
+        timeout-minutes: 10
+
+      # The steps below read the user-controllable parameters from the
+      # environment (set in the workflow-level `env`) instead of expanding
+      # `${{ ... }}` inside the script, so an input value is never parsed
+      # as shell code.
+      - name: Generate TPC-H data
+        run: ./benchmarks/scripts/generate-tpch-data.sh "$TPCH_SCALE_FACTOR"
+        timeout-minutes: 10
+
+      - name: Run Spark baseline
+        id: spark_bench
+        run: ./benchmarks/scripts/run-k8s-benchmark.sh spark "$TPCH_QUERY"
+        timeout-minutes: 15
+
+      - name: Run Comet benchmark
+        id: comet_bench
+        run: ./benchmarks/scripts/run-k8s-benchmark.sh comet "$TPCH_QUERY"
+        timeout-minutes: 15
+
+      - name: Validate performance
+        run: |
+          python3 benchmarks/scripts/compare-results.py \
+            --spark "/tmp/comet-bench-results/spark_${TPCH_QUERY}_result.json" \
+            --comet "/tmp/comet-bench-results/comet_${TPCH_QUERY}_result.json" \
+            --min-speedup "$MIN_SPEEDUP" \
+            --output /tmp/comet-bench-results/comparison.json \
+            --strict
+
+      # Results are uploaded even when an earlier step failed.
+      - name: Upload results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: benchmark-results
+          path: /tmp/comet-bench-results/
+
+      # Best-effort diagnostics; `|| true` keeps a failing kubectl call from
+      # masking the original failure.
+      - name: Collect logs on failure
+        if: failure()
+        run: |
+          kubectl get pods -A || true
+          kubectl logs -n comet-bench comet-bench-driver --tail=100 2>/dev/null || true
+
+      - name: Cleanup
+        if: always()
+        run: kind delete cluster --name comet-bench || true
diff --git a/benchmarks/Dockerfile.k8s b/benchmarks/Dockerfile.k8s
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Benchmark image: the Apache Spark base image plus the Comet JAR, the
+# TPC-H data generator and the benchmark scripts/configuration.
+FROM apache/spark:3.5.8
+
+# The package installation and the writes under /opt below run as root.
+# No later USER instruction switches back, so root stays the image's
+# default user.
+USER root
+
+# Build tools for tpch-dbgen plus Python for the benchmark scripts; the apt
+# lists are removed in the same layer to keep the image smaller.
+RUN apt-get update \
+    && apt-get install -y python3 python3-pip git curl make gcc \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Path of the Comet JAR inside the build context. It has no default, so it
+# must be supplied with `--build-arg COMET_JAR=...`.
+ARG COMET_JAR
+COPY ${COMET_JAR} $SPARK_HOME/jars/
+
+# Fetch and compile the TPC-H data generator into /opt/tpch-dbgen.
+RUN cd /opt \
+    && git clone --depth 1 https://github.com/databricks/tpch-dbgen.git \
+    && cd tpch-dbgen \
+    && make
+
+COPY benchmarks/scripts /opt/comet-bench/scripts
+COPY benchmarks/conf /opt/comet-bench/conf
+
+WORKDIR /opt/comet-bench
+
+# Directory that holds the Comet JAR copied above.
+ENV COMET_JAR_PATH="$SPARK_HOME/jars"
+ENV PYTHONUNBUFFERED=1
+
+# NOTE(review): looks like a placeholder default command; presumably the
+# benchmark launcher overrides it - confirm.
+CMD ["python3", "--version"]
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -17,10 +17,42 @@ specific language governing permissions and limitations
 under the License.
 -->
 
-# Running Comet Benchmarks in Microk8s
+# Comet Benchmarks
 
-This guide explains how to run benchmarks derived from TPC-H and TPC-DS in Apache DataFusion Comet deployed in a
-local Microk8s cluster.
+## GitHub CI (Kind)
+
+The CI runs TPC-H benchmarks on a Kind cluster for PRs that modify `native/**/*.rs`, `spark/**/*.scala`, or `spark/**/*.java`.
+Target: Comet >= 1.1x speedup over Spark.
+
+## Local Development (Kind)
+
+```bash
+# Setup
+./hack/k8s-benchmark-setup.sh
+
+# Build
+make release PROFILES="-Pspark-3.5 -Pscala-2.12"
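+COMET_JAR=$(find spark/target -name "comet-spark-spark3.5_2.12-*.jar" -not -name "*sources*" -not -name "*javadoc*" | head -1)
+docker build -t comet-bench:local --build-arg COMET_JAR="$COMET_JAR" -f benchmarks/Dockerfile.k8s .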
+kind load docker-image comet-bench:local --name comet-bench
+
+# Generate data
+./benchmarks/scripts/generate-tpch-data.sh 1
+
+# Run benchmarks
+./benchmarks/scripts/run-k8s-benchmark.sh spark q1
+./benchmarks/scripts/run-k8s-benchmark.sh comet q1
+
+# Compare
+python3 benchmarks/scripts/compare-results.py \
+    --spark /tmp/comet-bench-results/spark_q1_result.json \
+    --comet /tmp/comet-bench-results/comet_q1_result.json \
+    --min-speedup 1.1
+
+# Cleanup
+./hack/k8s-benchmark-setup.sh --delete
+```
+
+## Microk8s Deployment
 
 ## Use Microk8s locally
 

diff --git a/benchmarks/conf/k8s.conf b/benchmarks/conf/k8s.conf
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Kubernetes target: namespace, container image and service account.
+spark.kubernetes.namespace=comet-bench
+spark.kubernetes.container.image=comet-bench:local
+spark.kubernetes.authenticate.driver.serviceAccountName=spark
+# Driver sizing and a fixed driver pod name.
+spark.driver.memory=2g
+spark.kubernetes.driver.pod.name=comet-bench-driver
+# Executor sizing: a fixed pool of 2 executors with 2 cores and 2g each.
+spark.executor.instances=2
+spark.executor.memory=2g
+spark.executor.cores=2
+# hostPath volume "data": /tmp/comet-bench-data is mounted at /data in
+# both the driver and the executor pods.
+spark.kubernetes.driver.volumes.hostPath.data.mount.path=/data
+spark.kubernetes.driver.volumes.hostPath.data.options.path=/tmp/comet-bench-data
+spark.kubernetes.executor.volumes.hostPath.data.mount.path=/data
+spark.kubernetes.executor.volumes.hostPath.data.options.path=/tmp/comet-bench-data
+# Adaptive query execution on; dynamic allocation off.
+spark.sql.adaptive.enabled=true
+spark.dynamicAllocation.enabled=false
diff --git a/benchmarks/scripts/compare-results.py b/benchmarks/scripts/compare-results.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+def load_result(filepath: str) -> dict:
+    """Read the JSON benchmark result stored at ``filepath`` and return it."""
+    with open(filepath, 'r') as result_file:
+        raw_text = result_file.read()
+    return json.loads(raw_text)
+
+
+def calculate_speedup(spark_duration: float, comet_duration: float) -> float:
+    """Return how many times faster Comet ran than Spark.
+
+    The result is ``spark_duration / comet_duration``; a zero or negative
+    ``comet_duration`` yields positive infinity instead of dividing.
+    """
+    return float('inf') if comet_duration <= 0 else spark_duration / comet_duration
+
+
+def print_comparison(spark_result: dict, comet_result: dict, speedup: float, min_speedup: float):
+    """Print a human-readable report of the Spark vs. Comet comparison.
+
+    Shows the query name (``'unknown'`` when the Spark result has none),
+    both durations, the measured and the required speedup, and a PASS/FAIL
+    line depending on whether ``speedup`` reaches ``min_speedup``.
+    """
+    heavy_rule = "=" * 50
+    light_rule = "-" * 50
+    spark_duration = spark_result['duration_seconds']
+    comet_duration = comet_result['duration_seconds']
+
+    print("\n" + heavy_rule)
+    print("BENCHMARK COMPARISON")
+    print(heavy_rule)
+    print(f"Query: {spark_result.get('query', 'unknown')}")
+    # Each row is formatted only when it is printed, one after the other.
+    rows = (
+        ("Spark", spark_duration, "s"),
+        ("Comet", comet_duration, "s"),
+        ("Speedup", speedup, "x"),
+        ("Required", min_speedup, "x"),
+    )
+    for label, value, unit in rows:
+        print(f"{label}: {value:.2f}{unit}")
+    print(light_rule)
+
+    if speedup < min_speedup:
+        print(f"FAIL: {speedup:.2f}x < {min_speedup:.2f}x")
+    else:
+        print(f"PASS: {speedup:.2f}x >= {min_speedup:.2f}x")
+    print(heavy_rule + "\n")
+
+
+def main():
+    """Compare a Spark and a Comet benchmark result and report the speedup.
+
+    Command-line options: ``--spark`` and ``--comet`` (result JSON files,
+    required), ``--min-speedup`` (default 1.1), ``--output`` (optional
+    summary JSON) and ``--strict``.
+
+    Returns the process exit code: 0 on success; 1 when a result file is
+    missing or cannot be parsed, when a result has no positive, finite
+    ``duration_seconds``, or when ``--strict`` is given and the speedup is
+    below ``--min-speedup``.
+    """
+    parser = argparse.ArgumentParser(description="Compare benchmark results")
+    parser.add_argument("--spark", "-s", required=True, help="Spark result JSON")
+    parser.add_argument("--comet", "-c", required=True, help="Comet result JSON")
+    parser.add_argument("--min-speedup", type=float, default=1.1, help="Min speedup (default: 1.1)")
+    parser.add_argument("--output", "-o", help="Output summary JSON")
+    parser.add_argument("--strict", action="store_true", help="Exit with error if below threshold")
+
+    args = parser.parse_args()
+
+    results = {}
+    for engine, filepath in (("spark", args.spark), ("comet", args.comet)):
+        if not Path(filepath).exists():
+            print(f"Error: {filepath} not found", file=sys.stderr)
+            return 1
+
+        try:
+            result = load_result(filepath)
+        except (OSError, ValueError) as exc:
+            # json.JSONDecodeError is a ValueError subclass.
+            print(f"Error: cannot read {filepath}: {exc}", file=sys.stderr)
+            return 1
+
+        # A zero, negative, NaN, infinite or missing duration means the run
+        # did not produce a usable measurement. Without this check a zero
+        # Comet duration would yield an infinite speedup, pass the gate and
+        # put the non-standard token "Infinity" into the summary JSON.
+        duration = result.get('duration_seconds') if isinstance(result, dict) else None
+        if (isinstance(duration, bool)
+                or not isinstance(duration, (int, float))
+                or not 0 < duration < float('inf')):
+            print(f"Error: {filepath} has no positive finite 'duration_seconds'", file=sys.stderr)
+            return 1
+
+        results[engine] = result
+
+    spark_result = results["spark"]
+    comet_result = results["comet"]
+
+    speedup = calculate_speedup(
+        spark_result['duration_seconds'],
+        comet_result['duration_seconds']
+    )
+
+    passed = speedup >= args.min_speedup
+    print_comparison(spark_result, comet_result, speedup, args.min_speedup)
+
+    if args.output:
+        summary = {
+            "spark": spark_result,
+            "comet": comet_result,
+            "speedup": speedup,
+            "min_speedup": args.min_speedup,
+            "passed": passed
+        }
+        with open(args.output, 'w') as f:
+            json.dump(summary, f, indent=2)
+
+    # Without --strict a result below the threshold is reported but does
+    # not fail the process.
+    if args.strict and not passed:
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())