From 990c46e7c575b76b235df1c9b18918778cbd6a32 Mon Sep 17 00:00:00 2001
From: Evgenii Kniazev <evgenii.kniazev@databricks.com>
Date: Fri, 5 Dec 2025 16:58:01 +0000
Subject: [PATCH 01/13] Skeleton for nightly run of Apps Codegen Evals

---
 experimental/apps-mcp/evals/README.md         |  80 ++++++
 experimental/apps-mcp/evals/databricks.yml    |  50 ++++
 experimental/apps-mcp/evals/pyproject.toml    |  26 ++
 .../evals/resources/apps_eval_job.job.yml     |  60 +++++
 experimental/apps-mcp/evals/src/__init__.py   |   1 +
 experimental/apps-mcp/evals/src/run_evals.py  | 241 ++++++++++++++++++
 6 files changed, 458 insertions(+)
 create mode 100644 experimental/apps-mcp/evals/README.md
 create mode 100644 experimental/apps-mcp/evals/databricks.yml
 create mode 100644 experimental/apps-mcp/evals/pyproject.toml
 create mode 100644 experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
 create mode 100644 experimental/apps-mcp/evals/src/__init__.py
 create mode 100644 experimental/apps-mcp/evals/src/run_evals.py

diff --git a/experimental/apps-mcp/evals/README.md b/experimental/apps-mcp/evals/README.md
new file mode 100644
index 0000000000..c92eca624c
--- /dev/null
+++ b/experimental/apps-mcp/evals/README.md
@@ -0,0 +1,80 @@
+# Apps-MCP Continuous Evals
+
+Databricks Asset Bundle for running continuous evaluations of the Apps-MCP code generation system.
+
+## Overview
+
+This bundle deploys a scheduled Databricks job that:
+1. Runs the klaudbiusz evaluation framework
+2. Logs results to MLflow for tracking
+3. Alerts on failures or long-running evaluations
+
+## Quick Start
+
+```bash
+# Validate the bundle
+databricks bundle validate -t dev
+
+# Deploy to dev workspace
+databricks bundle deploy -t dev
+
+# Run manually
+databricks bundle run -t dev apps_eval_job
+
+# View results in MLflow
+# Navigate to: ML → Experiments → /Shared/apps-mcp-evaluations-staging
+```
+
+## Configuration
+
+### Variables
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `catalog` | Unity Catalog for results | `main` |
+| `schema` | Schema for eval tables | `${workspace.current_user.short_name}` (dev) |
+| `mlflow_experiment` | MLflow experiment path | `/Shared/apps-mcp-evaluations` |
+| `eval_parallelism` | Parallel eval workers | `4` |
+
+### Targets
+
+- **dev**: Development mode with personal schema, staging MLflow experiment
+- **prod**: Production mode with shared schema, service principal identity
+
+## Schedule
+
+The job runs nightly at 2am UTC. Manual runs can be triggered via:
+
+```bash
+databricks bundle run -t dev apps_eval_job
+```
+
+## Monitoring
+
+- **MLflow**: View metrics trends at `/Shared/apps-mcp-evaluations`
+- **Health Alerts**: Job alerts if runtime exceeds 2 hours
+- **Email**: Failures notify apps-mcp-team@databricks.com
+
+## Development
+
+```bash
+# Build wheel locally
+uv build --wheel
+
+# Run evals locally (outside Databricks)
+uv run python -m src.run_evals --mode=eval_only --parallelism=4
+```
+
+## Architecture
+
+```
+evals/
+├── databricks.yml           # Bundle configuration
+├── resources/
+│   └── apps_eval_job.job.yml  # Job definition
+├── src/
+│   ├── __init__.py
+│   └── run_evals.py         # Main orchestrator
+├── pyproject.toml           # Python package config
+└── README.md
+```
diff --git a/experimental/apps-mcp/evals/databricks.yml b/experimental/apps-mcp/evals/databricks.yml
new file mode 100644
index 0000000000..63404eacf2
--- /dev/null
+++ b/experimental/apps-mcp/evals/databricks.yml
@@ -0,0 +1,50 @@
+# Databricks Asset Bundle for Apps-MCP Continuous Evals
+# See https://docs.databricks.com/dev-tools/bundles/index.html
+bundle:
+  name: apps-mcp-evals
+  uuid: 80e50a10-c2da-4b59-99d6-e101b1bcf485
+
+include:
+  - resources/*.yml
+
+artifacts:
+  apps_mcp_evals:
+    type: whl
+    build: uv build --wheel
+    path: .
+
+variables:
+  catalog:
+    description: Unity Catalog for eval results
+    default: main
+  schema:
+    description: Schema for eval tables
+  mlflow_experiment:
+    description: MLflow experiment path for tracking
+    default: /Shared/apps-mcp-evaluations
+  klaudbiusz_git_url:
+    description: Git URL for klaudbiusz eval framework
+    default: https://github.com/databricks/klaudbiusz.git
+  eval_parallelism:
+    description: Number of parallel eval workers
+    default: "4"
+
+targets:
+  dev:
+    mode: development
+    default: true
+    workspace:
+      host: https://6177827686947384.4.gcp.databricks.com
+    variables:
+      schema: ${workspace.current_user.short_name}
+      mlflow_experiment: /Shared/apps-mcp-evaluations-staging
+
+  prod:
+    mode: production
+    workspace:
+      host: https://6177827686947384.4.gcp.databricks.com
+      root_path: /Workspace/Users/${workspace.current_user.user_name}/.bundle/${bundle.name}/${bundle.target}
+    variables:
+      schema: evals
+    run_as:
+      service_principal_name: apps-mcp-eval-sp
diff --git a/experimental/apps-mcp/evals/pyproject.toml b/experimental/apps-mcp/evals/pyproject.toml
new file mode 100644
index 0000000000..d47a05578e
--- /dev/null
+++ b/experimental/apps-mcp/evals/pyproject.toml
@@ -0,0 +1,26 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src"]
+
+[project]
+name = "apps_mcp_evals"
+version = "0.1.0"
+description = "Continuous evaluation framework for Apps-MCP code generation"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "mlflow>=2.15.0",
+    "databricks-sdk>=0.70.0",
+    "fire>=0.7.1",
+    "tqdm>=4.66.0",
+]
+
+[project.scripts]
+main = "src.run_evals:main"
+
+[tool.ruff]
+line-length = 120
+target-version = "py311"
diff --git a/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml b/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
new file mode 100644
index 0000000000..584f35f183
--- /dev/null
+++ b/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
@@ -0,0 +1,60 @@
+# Apps-MCP Evaluation Job
+# Runs nightly + supports manual trigger via: databricks bundle run -t dev apps_eval_job
+
+resources:
+  jobs:
+    apps_eval_job:
+      name: "[${bundle.target}] Apps-MCP Continuous Evals"
+
+      # Nightly schedule (2am UTC)
+      trigger:
+        periodic:
+          interval: 1
+          unit: DAYS
+
+      # Health monitoring - alert if eval takes > 2 hours
+      health:
+        rules:
+          - metric: RUN_DURATION_SECONDS
+            op: GREATER_THAN
+            value: 7200
+
+      email_notifications:
+        on_failure:
+          - apps-mcp-team@databricks.com
+
+      parameters:
+        - name: catalog
+          default: ${var.catalog}
+        - name: schema
+          default: ${var.schema}
+        - name: mlflow_experiment
+          default: ${var.mlflow_experiment}
+        - name: eval_mode
+          default: full
+        - name: parallelism
+          default: ${var.eval_parallelism}
+
+      tasks:
+        - task_key: run_evals
+          python_wheel_task:
+            package_name: apps_mcp_evals
+            entry_point: main
+            parameters:
+              - --catalog
+              - ${var.catalog}
+              - --schema
+              - ${var.schema}
+              - --mlflow-experiment
+              - ${var.mlflow_experiment}
+              - --parallelism
+              - ${var.eval_parallelism}
+
+          environment_key: default
+
+      environments:
+        - environment_key: default
+          spec:
+            environment_version: "1"
+            dependencies:
+              - ../dist/*.whl
diff --git a/experimental/apps-mcp/evals/src/__init__.py b/experimental/apps-mcp/evals/src/__init__.py
new file mode 100644
index 0000000000..0a8e6c04aa
--- /dev/null
+++ b/experimental/apps-mcp/evals/src/__init__.py
@@ -0,0 +1 @@
+"""Apps-MCP Evaluation Framework."""
diff --git a/experimental/apps-mcp/evals/src/run_evals.py b/experimental/apps-mcp/evals/src/run_evals.py
new file mode 100644
index 0000000000..78c0660209
--- /dev/null
+++ b/experimental/apps-mcp/evals/src/run_evals.py
@@ -0,0 +1,241 @@
+#!/usr/bin/env python3
+"""
+Apps-MCP Evaluation Runner for Databricks Jobs.
+
+Orchestrates the klaudbiusz evaluation framework to run as a scheduled Databricks job.
+Results are logged to MLflow for tracking and comparison.
+"""
+
+import json
+import os
+import subprocess
+import sys
+import tempfile
+from datetime import datetime
+from pathlib import Path
+
+import fire
+import mlflow
+from databricks.sdk import WorkspaceClient
+from tqdm import tqdm
+
+
+def setup_mlflow(experiment_name: str) -> None:
+    """Configure MLflow to use Databricks tracking."""
+    mlflow.set_tracking_uri("databricks")
+    mlflow.set_experiment(experiment_name)
+
+
+def clone_klaudbiusz(git_url: str, target_dir: Path) -> Path:
+    """Clone or update klaudbiusz repository."""
+    if target_dir.exists():
+        subprocess.run(["git", "-C", str(target_dir), "pull"], check=True)
+    else:
+        subprocess.run(["git", "clone", "--depth", "1", git_url, str(target_dir)], check=True)
+    return target_dir
+
+
+def run_generation(
+    klaudbiusz_dir: Path,
+    output_dir: Path,
+    mcp_binary: str,
+    backend: str = "claude",
+    model: str | None = None,
+    prompt_set: str = "databricks",
+) -> dict:
+    """Run app generation using klaudbiusz bulk_run."""
+    cmd = [
+        sys.executable,
+        "-m",
+        "cli.generation.bulk_run",
+        "--mcp_binary",
+        mcp_binary,
+        "--output_dir",
+        str(output_dir),
+        "--prompts",
+        prompt_set,
+        "--backend",
+        backend,
+    ]
+    if model:
+        cmd.extend(["--model", model])
+
+    env = os.environ.copy()
+    env["PYTHONPATH"] = str(klaudbiusz_dir)
+
+    result = subprocess.run(cmd, cwd=klaudbiusz_dir, env=env, capture_output=True, text=True)
+
+    if result.returncode != 0:
+        print(f"Generation failed: {result.stderr}")
+        raise RuntimeError(f"Generation failed with code {result.returncode}")
+
+    results_files = sorted(output_dir.glob("bulk_run_results_*.json"), reverse=True)
+    if results_files:
+        return json.loads(results_files[0].read_text())
+    return {}
+
+
+def run_evaluation(
+    klaudbiusz_dir: Path,
+    apps_dir: Path,
+    parallelism: int = 4,
+    fast_mode: bool = False,
+) -> dict:
+    """Run evaluation on generated apps."""
+    cmd = [
+        sys.executable,
+        "-m",
+        "cli.evaluation.evaluate_all",
+        "--dir",
+        str(apps_dir),
+        "--parallel",
+        str(parallelism),
+    ]
+    if fast_mode:
+        cmd.append("--fast")
+
+    env = os.environ.copy()
+    env["PYTHONPATH"] = str(klaudbiusz_dir)
+
+    result = subprocess.run(cmd, cwd=klaudbiusz_dir, env=env, capture_output=True, text=True)
+
+    if result.returncode != 0:
+        print(f"Evaluation output: {result.stdout}")
+        print(f"Evaluation errors: {result.stderr}")
+
+    eval_dir = klaudbiusz_dir / "cli" / "app-eval"
+    report_file = eval_dir / "evaluation_report.json"
+    if report_file.exists():
+        return json.loads(report_file.read_text())
+    return {}
+
+
+def log_results_to_mlflow(
+    evaluation_report: dict,
+    generation_results: dict | None = None,
+    run_name: str | None = None,
+) -> str:
+    """Log evaluation results to MLflow."""
+    if not run_name:
+        run_name = f"eval-{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}"
+
+    with mlflow.start_run(run_name=run_name) as run:
+        mlflow.set_tag("framework", "apps-mcp-evals")
+        mlflow.set_tag("run_type", "scheduled")
+
+        summary = evaluation_report.get("summary", {})
+        mlflow.log_param("total_apps", summary.get("total_apps", 0))
+        mlflow.log_param("timestamp", summary.get("evaluated_at", ""))
+
+        metrics = summary.get("metrics_summary", {})
+        if metrics:
+            mlflow.log_metric("avg_appeval_100", metrics.get("avg_appeval_100", 0))
+            if metrics.get("avg_eff_units") is not None:
+                mlflow.log_metric("avg_eff_units", metrics["avg_eff_units"])
+            mlflow.log_metric(
+                "build_success_rate", metrics.get("build_success", 0) / max(summary.get("total_apps", 1), 1)
+            )
+            mlflow.log_metric(
+                "runtime_success_rate", metrics.get("runtime_success", 0) / max(summary.get("total_apps", 1), 1)
+            )
+            mlflow.log_metric("local_runability_avg", metrics.get("local_runability_avg", 0))
+            mlflow.log_metric("deployability_avg", metrics.get("deployability_avg", 0))
+
+        if generation_results:
+            gen_metrics = generation_results.get("generation_metrics", {})
+            if gen_metrics.get("total_cost_usd"):
+                mlflow.log_metric("generation_cost_usd", gen_metrics["total_cost_usd"])
+            if gen_metrics.get("avg_turns"):
+                mlflow.log_metric("avg_turns_per_app", gen_metrics["avg_turns"])
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+            json.dump(evaluation_report, f, indent=2)
+            mlflow.log_artifact(f.name, "reports")
+
+        return run.info.run_id
+
+
+def main(
+    catalog: str = "main",
+    schema: str = "evals",
+    mlflow_experiment: str = "/Shared/apps-mcp-evaluations",
+    mode: str = "eval_only",
+    parallelism: int = 4,
+    klaudbiusz_git_url: str = "https://github.com/databricks/klaudbiusz.git",
+    mcp_binary: str | None = None,
+    fast: bool = False,
+) -> None:
+    """
+    Run Apps-MCP evaluations.
+
+    Args:
+        catalog: Unity Catalog name
+        schema: Schema for results
+        mlflow_experiment: MLflow experiment path
+        mode: "full" (generate + eval), "eval_only" (eval existing apps), "quick" (subset)
+        parallelism: Number of parallel workers
+        klaudbiusz_git_url: Git URL for klaudbiusz
+        mcp_binary: Path to MCP binary (required for full mode)
+        fast: Skip slow LLM checks
+    """
+    print(f"Starting Apps-MCP Evaluation")
+    print(f"  Mode: {mode}")
+    print(f"  MLflow Experiment: {mlflow_experiment}")
+    print(f"  Parallelism: {parallelism}")
+    print("=" * 60)
+
+    setup_mlflow(mlflow_experiment)
+
+    work_dir = Path(tempfile.mkdtemp(prefix="apps-mcp-evals-"))
+    klaudbiusz_dir = work_dir / "klaudbiusz"
+    apps_dir = work_dir / "apps"
+    apps_dir.mkdir(exist_ok=True)
+
+    print(f"\nCloning klaudbiusz to {klaudbiusz_dir}...")
+    clone_klaudbiusz(klaudbiusz_git_url, klaudbiusz_dir)
+
+    generation_results = None
+    if mode == "full":
+        if not mcp_binary:
+            raise ValueError("--mcp_binary required for full mode")
+        print("\nRunning app generation...")
+        generation_results = run_generation(
+            klaudbiusz_dir=klaudbiusz_dir,
+            output_dir=apps_dir,
+            mcp_binary=mcp_binary,
+        )
+
+    print("\nRunning evaluation...")
+    eval_apps_dir = apps_dir if mode == "full" else klaudbiusz_dir / "app"
+    evaluation_report = run_evaluation(
+        klaudbiusz_dir=klaudbiusz_dir,
+        apps_dir=eval_apps_dir,
+        parallelism=parallelism,
+        fast_mode=fast or mode == "quick",
+    )
+
+    if evaluation_report:
+        print("\nLogging results to MLflow...")
+        run_id = log_results_to_mlflow(evaluation_report, generation_results)
+        print(f"MLflow Run ID: {run_id}")
+
+        summary = evaluation_report.get("summary", {})
+        metrics = summary.get("metrics_summary", {})
+        print("\n" + "=" * 60)
+        print("EVALUATION SUMMARY")
+        print("=" * 60)
+        print(f"Total Apps: {summary.get('total_apps', 0)}")
+        print(f"Avg AppEval Score: {metrics.get('avg_appeval_100', 0):.1f}/100")
+        print(f"Build Success: {metrics.get('build_success', 0)}")
+        print(f"Runtime Success: {metrics.get('runtime_success', 0)}")
+        print(f"Local Runability: {metrics.get('local_runability_avg', 0):.1f}/5")
+        print(f"Deployability: {metrics.get('deployability_avg', 0):.1f}/5")
+    else:
+        print("No evaluation results generated")
+        sys.exit(1)
+
+    print("\nEvaluation complete!")
+
+
+if __name__ == "__main__":
+    fire.Fire(main)

From 5a42d634c3beacd5235decc6369b232fbde9741f Mon Sep 17 00:00:00 2001
From: Evgenii Kniazev <evgenii.kniazev@databricks.com>
Date: Fri, 5 Dec 2025 17:06:05 +0000
Subject: [PATCH 02/13] Fix Python version compatibility for serverless compute

- Change requires-python from >=3.11 to >=3.10
- Replace str | None union syntax with Optional[str] for 3.10 compat
- Remove unused databricks-sdk and tqdm dependencies

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 experimental/apps-mcp/evals/pyproject.toml   |  6 ++----
 experimental/apps-mcp/evals/src/run_evals.py | 11 +++++------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/experimental/apps-mcp/evals/pyproject.toml b/experimental/apps-mcp/evals/pyproject.toml
index d47a05578e..7c388f3b95 100644
--- a/experimental/apps-mcp/evals/pyproject.toml
+++ b/experimental/apps-mcp/evals/pyproject.toml
@@ -10,12 +10,10 @@ name = "apps_mcp_evals"
 version = "0.1.0"
 description = "Continuous evaluation framework for Apps-MCP code generation"
 readme = "README.md"
-requires-python = ">=3.11"
+requires-python = ">=3.10"
 dependencies = [
     "mlflow>=2.15.0",
-    "databricks-sdk>=0.70.0",
     "fire>=0.7.1",
-    "tqdm>=4.66.0",
 ]
 
 [project.scripts]
@@ -23,4 +21,4 @@ main = "src.run_evals:main"
 
 [tool.ruff]
 line-length = 120
-target-version = "py311"
+target-version = "py310"
diff --git a/experimental/apps-mcp/evals/src/run_evals.py b/experimental/apps-mcp/evals/src/run_evals.py
index 78c0660209..c34d162c83 100644
--- a/experimental/apps-mcp/evals/src/run_evals.py
+++ b/experimental/apps-mcp/evals/src/run_evals.py
@@ -13,11 +13,10 @@
 import tempfile
 from datetime import datetime
 from pathlib import Path
+from typing import Optional
 
 import fire
 import mlflow
-from databricks.sdk import WorkspaceClient
-from tqdm import tqdm
 
 
 def setup_mlflow(experiment_name: str) -> None:
@@ -40,7 +39,7 @@ def run_generation(
     output_dir: Path,
     mcp_binary: str,
     backend: str = "claude",
-    model: str | None = None,
+    model: Optional[str] = None,
     prompt_set: str = "databricks",
 ) -> dict:
     """Run app generation using klaudbiusz bulk_run."""
@@ -112,8 +111,8 @@ def run_evaluation(
 
 def log_results_to_mlflow(
     evaluation_report: dict,
-    generation_results: dict | None = None,
-    run_name: str | None = None,
+    generation_results: Optional[dict] = None,
+    run_name: Optional[str] = None,
 ) -> str:
     """Log evaluation results to MLflow."""
     if not run_name:
@@ -162,7 +161,7 @@ def main(
     mode: str = "eval_only",
     parallelism: int = 4,
     klaudbiusz_git_url: str = "https://github.com/databricks/klaudbiusz.git",
-    mcp_binary: str | None = None,
+    mcp_binary: Optional[str] = None,
     fast: bool = False,
 ) -> None:
     """

From 8c1e66556be46961bd484868568ad9e3e0406b3f Mon Sep 17 00:00:00 2001
From: Evgenii Kniazev <evgenii.kniazev@databricks.com>
Date: Fri, 5 Dec 2025 17:20:17 +0000
Subject: [PATCH 03/13] Proper url for evals repo

---
 experimental/apps-mcp/evals/databricks.yml    |  6 ++--
 .../evals/resources/apps_eval_job.job.yml     |  6 ++--
 experimental/apps-mcp/evals/src/run_evals.py  | 35 ++++++++++++-------
 3 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/experimental/apps-mcp/evals/databricks.yml b/experimental/apps-mcp/evals/databricks.yml
index 63404eacf2..e5b6b91475 100644
--- a/experimental/apps-mcp/evals/databricks.yml
+++ b/experimental/apps-mcp/evals/databricks.yml
@@ -22,9 +22,9 @@ variables:
   mlflow_experiment:
     description: MLflow experiment path for tracking
     default: /Shared/apps-mcp-evaluations
-  klaudbiusz_git_url:
-    description: Git URL for klaudbiusz eval framework
-    default: https://github.com/databricks/klaudbiusz.git
+  evals_git_url:
+    description: Git URL for appdotbuild-agent eval framework
+    default: https://github.com/neondatabase/appdotbuild-agent.git
   eval_parallelism:
     description: Number of parallel eval workers
     default: "4"
diff --git a/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml b/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
index 584f35f183..eb13270f2b 100644
--- a/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
+++ b/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
@@ -30,10 +30,10 @@ resources:
           default: ${var.schema}
         - name: mlflow_experiment
           default: ${var.mlflow_experiment}
-        - name: eval_mode
-          default: full
         - name: parallelism
           default: ${var.eval_parallelism}
+        - name: evals_git_url
+          default: ${var.evals_git_url}
 
       tasks:
         - task_key: run_evals
@@ -49,6 +49,8 @@ resources:
               - ${var.mlflow_experiment}
               - --parallelism
               - ${var.eval_parallelism}
+              - --evals-git-url
+              - ${var.evals_git_url}
 
           environment_key: default
 
diff --git a/experimental/apps-mcp/evals/src/run_evals.py b/experimental/apps-mcp/evals/src/run_evals.py
index c34d162c83..746abada68 100644
--- a/experimental/apps-mcp/evals/src/run_evals.py
+++ b/experimental/apps-mcp/evals/src/run_evals.py
@@ -25,12 +25,20 @@ def setup_mlflow(experiment_name: str) -> None:
     mlflow.set_experiment(experiment_name)
 
 
-def clone_klaudbiusz(git_url: str, target_dir: Path) -> Path:
-    """Clone or update klaudbiusz repository."""
+def clone_evals_repo(git_url: str, target_dir: Path) -> Path:
+    """Clone or update appdotbuild-agent repository."""
     if target_dir.exists():
-        subprocess.run(["git", "-C", str(target_dir), "pull"], check=True)
+        subprocess.run(["git", "-C", str(target_dir), "pull"], check=True, capture_output=True)
     else:
-        subprocess.run(["git", "clone", "--depth", "1", git_url, str(target_dir)], check=True)
+        result = subprocess.run(
+            ["git", "clone", "--depth", "1", git_url, str(target_dir)],
+            check=False,
+            capture_output=True,
+            text=True,
+        )
+        if result.returncode != 0:
+            print(f"Git clone stderr: {result.stderr}")
+            raise RuntimeError(f"Failed to clone {git_url}: {result.stderr}")
     return target_dir
 
 
@@ -160,7 +168,7 @@ def main(
     mlflow_experiment: str = "/Shared/apps-mcp-evaluations",
     mode: str = "eval_only",
     parallelism: int = 4,
-    klaudbiusz_git_url: str = "https://github.com/databricks/klaudbiusz.git",
+    evals_git_url: str = "https://github.com/neondatabase/appdotbuild-agent.git",
     mcp_binary: Optional[str] = None,
     fast: bool = False,
 ) -> None:
@@ -173,25 +181,26 @@ def main(
         mlflow_experiment: MLflow experiment path
         mode: "full" (generate + eval), "eval_only" (eval existing apps), "quick" (subset)
         parallelism: Number of parallel workers
-        klaudbiusz_git_url: Git URL for klaudbiusz
+        evals_git_url: Git URL for appdotbuild-agent eval framework
         mcp_binary: Path to MCP binary (required for full mode)
         fast: Skip slow LLM checks
     """
-    print(f"Starting Apps-MCP Evaluation")
+    print("Starting Apps-MCP Evaluation")
     print(f"  Mode: {mode}")
     print(f"  MLflow Experiment: {mlflow_experiment}")
+    print(f"  Evals Repo: {evals_git_url}")
     print(f"  Parallelism: {parallelism}")
     print("=" * 60)
 
     setup_mlflow(mlflow_experiment)
 
     work_dir = Path(tempfile.mkdtemp(prefix="apps-mcp-evals-"))
-    klaudbiusz_dir = work_dir / "klaudbiusz"
+    evals_dir = work_dir / "appdotbuild-agent"
     apps_dir = work_dir / "apps"
     apps_dir.mkdir(exist_ok=True)
 
-    print(f"\nCloning klaudbiusz to {klaudbiusz_dir}...")
-    clone_klaudbiusz(klaudbiusz_git_url, klaudbiusz_dir)
+    print(f"\nCloning evals repo to {evals_dir}...")
+    clone_evals_repo(evals_git_url, evals_dir)
 
     generation_results = None
     if mode == "full":
@@ -199,15 +208,15 @@ def main(
             raise ValueError("--mcp_binary required for full mode")
         print("\nRunning app generation...")
         generation_results = run_generation(
-            klaudbiusz_dir=klaudbiusz_dir,
+            klaudbiusz_dir=evals_dir,
             output_dir=apps_dir,
             mcp_binary=mcp_binary,
         )
 
     print("\nRunning evaluation...")
-    eval_apps_dir = apps_dir if mode == "full" else klaudbiusz_dir / "app"
+    eval_apps_dir = apps_dir if mode == "full" else evals_dir / "app"
     evaluation_report = run_evaluation(
-        klaudbiusz_dir=klaudbiusz_dir,
+        klaudbiusz_dir=evals_dir,
         apps_dir=eval_apps_dir,
         parallelism=parallelism,
         fast_mode=fast or mode == "quick",

From bdf3d35a88808b54efecbfddc58ec6fb897247fc Mon Sep 17 00:00:00 2001
From: Evgenii Kniazev <evgenii.kniazev@databricks.com>
Date: Mon, 8 Dec 2025 10:37:28 +0000
Subject: [PATCH 04/13] Simplify eval runner - clone repo and run evaluation

- Remove bundle run dependency (databricks CLI not available in serverless)
- Clone appdotbuild-agent repo and install klaudbiusz deps
- Handle case of no apps gracefully - log sample metrics to MLflow
- Job successfully validates infrastructure and logs to MLflow

Note: Full eval requires Python 3.12+ or pre-populated apps

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 .../evals/resources/apps_eval_job.job.yml     |   8 -
 experimental/apps-mcp/evals/src/run_evals.py  | 170 ++++++++----------
 2 files changed, 77 insertions(+), 101 deletions(-)

diff --git a/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml b/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
index eb13270f2b..3653924051 100644
--- a/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
+++ b/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
@@ -24,10 +24,6 @@ resources:
           - apps-mcp-team@databricks.com
 
       parameters:
-        - name: catalog
-          default: ${var.catalog}
-        - name: schema
-          default: ${var.schema}
         - name: mlflow_experiment
           default: ${var.mlflow_experiment}
         - name: parallelism
@@ -41,10 +37,6 @@ resources:
             package_name: apps_mcp_evals
             entry_point: main
             parameters:
-              - --catalog
-              - ${var.catalog}
-              - --schema
-              - ${var.schema}
               - --mlflow-experiment
               - ${var.mlflow_experiment}
               - --parallelism
diff --git a/experimental/apps-mcp/evals/src/run_evals.py b/experimental/apps-mcp/evals/src/run_evals.py
index 746abada68..f3aec4339f 100644
--- a/experimental/apps-mcp/evals/src/run_evals.py
+++ b/experimental/apps-mcp/evals/src/run_evals.py
@@ -2,8 +2,7 @@
 """
 Apps-MCP Evaluation Runner for Databricks Jobs.
 
-Orchestrates the klaudbiusz evaluation framework to run as a scheduled Databricks job.
-Results are logged to MLflow for tracking and comparison.
+Runs bundle deploy/run to generate apps, then evaluates and logs to MLflow.
 """
 
 import json
@@ -26,7 +25,7 @@ def setup_mlflow(experiment_name: str) -> None:
 
 
 def clone_evals_repo(git_url: str, target_dir: Path) -> Path:
-    """Clone or update appdotbuild-agent repository."""
+    """Clone appdotbuild-agent repository."""
     if target_dir.exists():
         subprocess.run(["git", "-C", str(target_dir), "pull"], check=True, capture_output=True)
     else:
@@ -42,53 +41,33 @@ def clone_evals_repo(git_url: str, target_dir: Path) -> Path:
     return target_dir
 
 
-def run_generation(
-    klaudbiusz_dir: Path,
-    output_dir: Path,
-    mcp_binary: str,
-    backend: str = "claude",
-    model: Optional[str] = None,
-    prompt_set: str = "databricks",
-) -> dict:
-    """Run app generation using klaudbiusz bulk_run."""
-    cmd = [
-        sys.executable,
-        "-m",
-        "cli.generation.bulk_run",
-        "--mcp_binary",
-        mcp_binary,
-        "--output_dir",
-        str(output_dir),
-        "--prompts",
-        prompt_set,
-        "--backend",
-        backend,
-    ]
-    if model:
-        cmd.extend(["--model", model])
-
-    env = os.environ.copy()
-    env["PYTHONPATH"] = str(klaudbiusz_dir)
-
-    result = subprocess.run(cmd, cwd=klaudbiusz_dir, env=env, capture_output=True, text=True)
+def install_klaudbiusz_deps(evals_dir: Path) -> None:
+    """Install klaudbiusz dependencies using pip."""
+    klaudbiusz_dir = evals_dir / "klaudbiusz"
+    if not klaudbiusz_dir.exists():
+        print(f"klaudbiusz directory not found at {klaudbiusz_dir}")
+        return
 
+    print("Installing klaudbiusz dependencies...")
+    result = subprocess.run(
+        [sys.executable, "-m", "pip", "install", "-e", str(klaudbiusz_dir)],
+        capture_output=True,
+        text=True,
+    )
     if result.returncode != 0:
-        print(f"Generation failed: {result.stderr}")
-        raise RuntimeError(f"Generation failed with code {result.returncode}")
-
-    results_files = sorted(output_dir.glob("bulk_run_results_*.json"), reverse=True)
-    if results_files:
-        return json.loads(results_files[0].read_text())
-    return {}
+        print(f"pip install output: {result.stdout}")
+        print(f"pip install errors: {result.stderr}")
 
 
 def run_evaluation(
-    klaudbiusz_dir: Path,
+    evals_dir: Path,
     apps_dir: Path,
     parallelism: int = 4,
-    fast_mode: bool = False,
+    fast_mode: bool = True,
 ) -> dict:
-    """Run evaluation on generated apps."""
+    """Run evaluation on generated apps using klaudbiusz."""
+    klaudbiusz_dir = evals_dir / "klaudbiusz"
+
     cmd = [
         sys.executable,
         "-m",
@@ -104,14 +83,18 @@ def run_evaluation(
     env = os.environ.copy()
     env["PYTHONPATH"] = str(klaudbiusz_dir)
 
+    print(f"Running: {' '.join(cmd)}")
+    print(f"Working dir: {klaudbiusz_dir}")
+    print(f"Apps dir: {apps_dir}")
+
     result = subprocess.run(cmd, cwd=klaudbiusz_dir, env=env, capture_output=True, text=True)
 
+    print(f"Evaluation stdout: {result.stdout[:2000] if result.stdout else 'empty'}")
     if result.returncode != 0:
-        print(f"Evaluation output: {result.stdout}")
-        print(f"Evaluation errors: {result.stderr}")
+        print(f"Evaluation errors: {result.stderr[:2000] if result.stderr else 'empty'}")
 
-    eval_dir = klaudbiusz_dir / "cli" / "app-eval"
-    report_file = eval_dir / "evaluation_report.json"
+    eval_output_dir = klaudbiusz_dir / "cli" / "app-eval"
+    report_file = eval_output_dir / "evaluation_report.json"
     if report_file.exists():
         return json.loads(report_file.read_text())
     return {}
@@ -163,30 +146,19 @@ def log_results_to_mlflow(
 
 
 def main(
-    catalog: str = "main",
-    schema: str = "evals",
     mlflow_experiment: str = "/Shared/apps-mcp-evaluations",
-    mode: str = "eval_only",
     parallelism: int = 4,
     evals_git_url: str = "https://github.com/neondatabase/appdotbuild-agent.git",
-    mcp_binary: Optional[str] = None,
-    fast: bool = False,
 ) -> None:
     """
     Run Apps-MCP evaluations.
 
     Args:
-        catalog: Unity Catalog name
-        schema: Schema for results
         mlflow_experiment: MLflow experiment path
-        mode: "full" (generate + eval), "eval_only" (eval existing apps), "quick" (subset)
         parallelism: Number of parallel workers
         evals_git_url: Git URL for appdotbuild-agent eval framework
-        mcp_binary: Path to MCP binary (required for full mode)
-        fast: Skip slow LLM checks
     """
     print("Starting Apps-MCP Evaluation")
-    print(f"  Mode: {mode}")
     print(f"  MLflow Experiment: {mlflow_experiment}")
     print(f"  Evals Repo: {evals_git_url}")
     print(f"  Parallelism: {parallelism}")
@@ -196,51 +168,63 @@ def main(
 
     work_dir = Path(tempfile.mkdtemp(prefix="apps-mcp-evals-"))
     evals_dir = work_dir / "appdotbuild-agent"
-    apps_dir = work_dir / "apps"
-    apps_dir.mkdir(exist_ok=True)
 
     print(f"\nCloning evals repo to {evals_dir}...")
     clone_evals_repo(evals_git_url, evals_dir)
 
-    generation_results = None
-    if mode == "full":
-        if not mcp_binary:
-            raise ValueError("--mcp_binary required for full mode")
-        print("\nRunning app generation...")
-        generation_results = run_generation(
-            klaudbiusz_dir=evals_dir,
-            output_dir=apps_dir,
-            mcp_binary=mcp_binary,
-        )
+    print("\nInstalling dependencies...")
+    install_klaudbiusz_deps(evals_dir)
+
+    print("\n" + "=" * 60)
+    print("RUNNING EVALUATION")
+    print("=" * 60)
+
+    klaudbiusz_dir = evals_dir / "klaudbiusz"
+    apps_dir = klaudbiusz_dir / "app"
+
+    if not apps_dir.exists():
+        print(f"Apps directory not found at {apps_dir}")
+        print("Creating empty apps dir for sample run...")
+        apps_dir.mkdir(parents=True, exist_ok=True)
 
-    print("\nRunning evaluation...")
-    eval_apps_dir = apps_dir if mode == "full" else evals_dir / "app"
     evaluation_report = run_evaluation(
-        klaudbiusz_dir=evals_dir,
-        apps_dir=eval_apps_dir,
+        evals_dir=evals_dir,
+        apps_dir=apps_dir,
         parallelism=parallelism,
-        fast_mode=fast or mode == "quick",
     )
 
-    if evaluation_report:
-        print("\nLogging results to MLflow...")
-        run_id = log_results_to_mlflow(evaluation_report, generation_results)
-        print(f"MLflow Run ID: {run_id}")
-
-        summary = evaluation_report.get("summary", {})
-        metrics = summary.get("metrics_summary", {})
-        print("\n" + "=" * 60)
-        print("EVALUATION SUMMARY")
-        print("=" * 60)
-        print(f"Total Apps: {summary.get('total_apps', 0)}")
-        print(f"Avg AppEval Score: {metrics.get('avg_appeval_100', 0):.1f}/100")
-        print(f"Build Success: {metrics.get('build_success', 0)}")
-        print(f"Runtime Success: {metrics.get('runtime_success', 0)}")
-        print(f"Local Runability: {metrics.get('local_runability_avg', 0):.1f}/5")
-        print(f"Deployability: {metrics.get('deployability_avg', 0):.1f}/5")
-    else:
-        print("No evaluation results generated")
-        sys.exit(1)
+    if not evaluation_report:
+        print("No apps found - creating sample report for infrastructure validation")
+        evaluation_report = {
+            "summary": {
+                "total_apps": 0,
+                "evaluated_at": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
+                "metrics_summary": {
+                    "avg_appeval_100": 0,
+                    "build_success": 0,
+                    "runtime_success": 0,
+                    "local_runability_avg": 0,
+                    "deployability_avg": 0,
+                },
+            },
+            "apps": [],
+        }
+
+    print("\nLogging results to MLflow...")
+    run_id = log_results_to_mlflow(evaluation_report)
+    print(f"MLflow Run ID: {run_id}")
+
+    summary = evaluation_report.get("summary", {})
+    metrics = summary.get("metrics_summary", {})
+    print("\n" + "=" * 60)
+    print("EVALUATION SUMMARY")
+    print("=" * 60)
+    print(f"Total Apps: {summary.get('total_apps', 0)}")
+    print(f"Avg AppEval Score: {metrics.get('avg_appeval_100', 0):.1f}/100")
+    print(f"Build Success: {metrics.get('build_success', 0)}")
+    print(f"Runtime Success: {metrics.get('runtime_success', 0)}")
+    print(f"Local Runability: {metrics.get('local_runability_avg', 0):.1f}/5")
+    print(f"Deployability: {metrics.get('deployability_avg', 0):.1f}/5")
 
     print("\nEvaluation complete!")
 

From 6602986befda8fff892151d2e99c95587be9434c Mon Sep 17 00:00:00 2001
From: Evgenii Kniazev <evgenii.kniazev@databricks.com>
Date: Mon, 8 Dec 2025 14:52:25 +0000
Subject: [PATCH 05/13] Add app generation job using CLI as MCP server

- Add apps_generation_job.job.yml with single-node Docker cluster
- Add generate_apps.py orchestrator using klaudbiusz framework
- Add init/setup_generation.sh to install Dagger and Python deps
- Update run_evals.py to read apps from UC Volume
- Add variables for CLI binary and generated apps volumes

Generation uses databricks experimental apps-mcp as the MCP server,
built from this repo for Linux x86_64.

Prerequisites:
- Create secret: databricks secrets put-secret apps-mcp-evals anthropic-api-key
- Upload CLI: GOOS=linux GOARCH=amd64 go build -o databricks-linux .
             databricks fs cp databricks-linux /Volumes/main/evals/artifacts/

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 experimental/apps-mcp/evals/databricks.yml    |  12 ++
 .../apps-mcp/evals/init/setup_generation.sh   |  15 ++
 .../evals/resources/apps_eval_job.job.yml     |   4 +
 .../resources/apps_generation_job.job.yml     |  37 ++++
 .../apps-mcp/evals/src/generate_apps.py       | 171 ++++++++++++++++++
 experimental/apps-mcp/evals/src/run_evals.py  |  23 ++-
 6 files changed, 261 insertions(+), 1 deletion(-)
 create mode 100644 experimental/apps-mcp/evals/init/setup_generation.sh
 create mode 100644 experimental/apps-mcp/evals/resources/apps_generation_job.job.yml
 create mode 100644 experimental/apps-mcp/evals/src/generate_apps.py

diff --git a/experimental/apps-mcp/evals/databricks.yml b/experimental/apps-mcp/evals/databricks.yml
index e5b6b91475..dedb2f7893 100644
--- a/experimental/apps-mcp/evals/databricks.yml
+++ b/experimental/apps-mcp/evals/databricks.yml
@@ -28,6 +28,18 @@ variables:
   eval_parallelism:
     description: Number of parallel eval workers
     default: "4"
+  cli_binary_volume:
+    description: UC Volume path for CLI binary
+    default: /Volumes/main/evals/artifacts
+  apps_volume:
+    description: UC Volume path for generated apps
+    default: /Volumes/main/evals/generated_apps
+  generation_parallelism:
+    description: Number of parallel app generations
+    default: "4"
+  prompts:
+    description: Prompt set for generation (databricks, databricks_v2, test)
+    default: databricks
 
 targets:
   dev:
diff --git a/experimental/apps-mcp/evals/init/setup_generation.sh b/experimental/apps-mcp/evals/init/setup_generation.sh
new file mode 100644
index 0000000000..5cdce5dcf6
--- /dev/null
+++ b/experimental/apps-mcp/evals/init/setup_generation.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -e
+
+echo "=== Setting up generation environment ==="
+
+# Install Dagger (required for klaudbiusz container orchestration)
+echo "Installing Dagger..."
+curl -fsSL https://dl.dagger.io/dagger/install.sh | sh
+export PATH=$PATH:/root/.local/bin
+
+# Install Python dependencies for klaudbiusz
+echo "Installing Python dependencies..."
+pip install --quiet dagger-io fire tqdm python-dotenv claude-agent-sdk litellm joblib tenacity
+
+echo "=== Setup complete ==="
diff --git a/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml b/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
index 3653924051..3c0129dcd8 100644
--- a/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
+++ b/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
@@ -30,6 +30,8 @@ resources:
           default: ${var.eval_parallelism}
         - name: evals_git_url
           default: ${var.evals_git_url}
+        - name: apps_volume
+          default: ${var.apps_volume}
 
       tasks:
         - task_key: run_evals
@@ -43,6 +45,8 @@ resources:
               - ${var.eval_parallelism}
               - --evals-git-url
               - ${var.evals_git_url}
+              - --apps-volume
+              - ${var.apps_volume}
 
           environment_key: default
 
diff --git a/experimental/apps-mcp/evals/resources/apps_generation_job.job.yml b/experimental/apps-mcp/evals/resources/apps_generation_job.job.yml
new file mode 100644
index 0000000000..7242ea3b43
--- /dev/null
+++ b/experimental/apps-mcp/evals/resources/apps_generation_job.job.yml
@@ -0,0 +1,37 @@
+resources:
+  jobs:
+    apps_generation_job:
+      name: "[${bundle.target}] Apps-MCP Generation"
+
+      job_clusters:
+        - job_cluster_key: generation_cluster
+          new_cluster:
+            spark_version: "15.4.x-scala2.12"
+            node_type_id: "n2-standard-8"
+            num_workers: 0
+            data_security_mode: SINGLE_USER
+            spark_conf:
+              spark.databricks.cluster.profile: singleNode
+              spark.master: "local[*]"
+            custom_tags:
+              ResourceClass: SingleNode
+            spark_env_vars:
+              ANTHROPIC_API_KEY: "{{secrets/apps-mcp-evals/anthropic-api-key}}"
+            init_scripts:
+              - workspace:
+                  destination: ${workspace.file_path}/init/setup_generation.sh
+
+      tasks:
+        - task_key: generate_apps
+          job_cluster_key: generation_cluster
+          spark_python_task:
+            python_file: ${workspace.file_path}/src/generate_apps.py
+            parameters:
+              - --mcp-binary
+              - ${var.cli_binary_volume}/databricks-linux
+              - --output-volume
+              - ${var.apps_volume}
+              - --prompts
+              - ${var.prompts}
+              - --max-concurrency
+              - ${var.generation_parallelism}
diff --git a/experimental/apps-mcp/evals/src/generate_apps.py b/experimental/apps-mcp/evals/src/generate_apps.py
new file mode 100644
index 0000000000..e715d9d810
--- /dev/null
+++ b/experimental/apps-mcp/evals/src/generate_apps.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+"""Generate apps using klaudbiusz with CLI-built MCP server."""
+
+import os
+import shutil
+import subprocess
+import sys
+from datetime import datetime
+from pathlib import Path
+
+import fire
+
+
+def clone_klaudbiusz(work_dir: Path) -> Path:
+    """Clone the klaudbiusz generation framework."""
+    repo_dir = work_dir / "appdotbuild-agent"
+    if repo_dir.exists():
+        shutil.rmtree(repo_dir)
+
+    print("Cloning appdotbuild-agent repository...")
+    subprocess.run(
+        [
+            "git",
+            "clone",
+            "--depth",
+            "1",
+            "https://github.com/neondatabase/appdotbuild-agent.git",
+            str(repo_dir),
+        ],
+        check=True,
+    )
+    return repo_dir
+
+
+def install_klaudbiusz_deps(klaudbiusz_dir: Path) -> None:
+    """Install klaudbiusz Python dependencies."""
+    print("Installing klaudbiusz dependencies...")
+    result = subprocess.run(
+        [sys.executable, "-m", "pip", "install", "-e", str(klaudbiusz_dir)],
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0:
+        print(f"Warning: pip install had issues: {result.stderr[:500]}")
+
+
+def run_generation(
+    klaudbiusz_dir: Path,
+    mcp_binary: str,
+    output_dir: Path,
+    prompts: str,
+    max_concurrency: int,
+) -> None:
+    """Run bulk app generation using klaudbiusz."""
+    print(f"\nStarting app generation...")
+    print(f"  MCP binary: {mcp_binary}")
+    print(f"  Prompts: {prompts}")
+    print(f"  Max concurrency: {max_concurrency}")
+    print(f"  Output dir: {output_dir}")
+
+    env = os.environ.copy()
+    env["PYTHONPATH"] = str(klaudbiusz_dir)
+
+    cmd = [
+        sys.executable,
+        "-m",
+        "cli.generation.bulk_run",
+        f"--prompts={prompts}",
+        f"--mcp_binary={mcp_binary}",
+        '--mcp_args=["experimental", "apps-mcp"]',
+        f"--max_concurrency={max_concurrency}",
+        f"--output_dir={output_dir}",
+    ]
+
+    print(f"\nRunning: {' '.join(cmd)}")
+    result = subprocess.run(cmd, cwd=klaudbiusz_dir, env=env)
+
+    if result.returncode != 0:
+        print(f"Generation completed with return code: {result.returncode}")
+
+
+def upload_to_volume(local_dir: Path, volume_path: str) -> int:
+    """Upload generated apps to UC Volume."""
+    if not local_dir.exists():
+        print(f"No apps directory found at {local_dir}")
+        return 0
+
+    apps = list(local_dir.iterdir())
+    if not apps:
+        print("No apps generated")
+        return 0
+
+    print(f"\nUploading {len(apps)} apps to {volume_path}...")
+
+    volume_dir = Path(volume_path)
+    volume_dir.mkdir(parents=True, exist_ok=True)
+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    dest_dir = volume_dir / f"run_{timestamp}"
+
+    shutil.copytree(local_dir, dest_dir)
+    print(f"Uploaded to {dest_dir}")
+
+    latest_link = volume_dir / "latest"
+    if latest_link.exists():
+        latest_link.unlink()
+    latest_link.symlink_to(dest_dir.name)
+
+    return len(apps)
+
+
+def main(
+    mcp_binary: str,
+    output_volume: str,
+    prompts: str = "databricks",
+    max_concurrency: int = 4,
+) -> None:
+    """
+    Generate apps using klaudbiusz with the Databricks CLI as MCP server.
+
+    Args:
+        mcp_binary: Path to databricks-linux binary in UC Volume
+        output_volume: UC Volume path for generated apps
+        prompts: Prompt set (databricks, databricks_v2, test)
+        max_concurrency: Number of parallel generations
+    """
+    print("=" * 60)
+    print("Apps-MCP Generation")
+    print("=" * 60)
+    print(f"  MCP Binary: {mcp_binary}")
+    print(f"  Output Volume: {output_volume}")
+    print(f"  Prompts: {prompts}")
+    print(f"  Max Concurrency: {max_concurrency}")
+
+    if not Path(mcp_binary).exists():
+        print(f"\nError: MCP binary not found at {mcp_binary}")
+        print("Please upload the databricks-linux binary to the UC Volume first.")
+        sys.exit(1)
+
+    subprocess.run(["chmod", "+x", mcp_binary], check=True)
+
+    work_dir = Path("/tmp/apps-generation")
+    work_dir.mkdir(exist_ok=True)
+
+    repo_dir = clone_klaudbiusz(work_dir)
+    klaudbiusz_dir = repo_dir / "klaudbiusz"
+
+    install_klaudbiusz_deps(klaudbiusz_dir)
+
+    local_output = work_dir / "generated_apps"
+    local_output.mkdir(exist_ok=True)
+
+    run_generation(
+        klaudbiusz_dir=klaudbiusz_dir,
+        mcp_binary=mcp_binary,
+        output_dir=local_output,
+        prompts=prompts,
+        max_concurrency=max_concurrency,
+    )
+
+    app_count = upload_to_volume(local_output, output_volume)
+
+    print("\n" + "=" * 60)
+    print("Generation Complete")
+    print("=" * 60)
+    print(f"  Apps generated: {app_count}")
+    print(f"  Output location: {output_volume}")
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
diff --git a/experimental/apps-mcp/evals/src/run_evals.py b/experimental/apps-mcp/evals/src/run_evals.py
index f3aec4339f..b314e9e5ee 100644
--- a/experimental/apps-mcp/evals/src/run_evals.py
+++ b/experimental/apps-mcp/evals/src/run_evals.py
@@ -149,6 +149,7 @@ def main(
     mlflow_experiment: str = "/Shared/apps-mcp-evaluations",
     parallelism: int = 4,
     evals_git_url: str = "https://github.com/neondatabase/appdotbuild-agent.git",
+    apps_volume: Optional[str] = None,
 ) -> None:
     """
     Run Apps-MCP evaluations.
@@ -157,11 +158,13 @@ def main(
         mlflow_experiment: MLflow experiment path
         parallelism: Number of parallel workers
         evals_git_url: Git URL for appdotbuild-agent eval framework
+        apps_volume: UC Volume path containing generated apps (optional)
     """
     print("Starting Apps-MCP Evaluation")
     print(f"  MLflow Experiment: {mlflow_experiment}")
     print(f"  Evals Repo: {evals_git_url}")
     print(f"  Parallelism: {parallelism}")
+    print(f"  Apps Volume: {apps_volume or 'not specified'}")
     print("=" * 60)
 
     setup_mlflow(mlflow_experiment)
@@ -180,7 +183,25 @@ def main(
     print("=" * 60)
 
     klaudbiusz_dir = evals_dir / "klaudbiusz"
-    apps_dir = klaudbiusz_dir / "app"
+
+    if apps_volume:
+        volume_path = Path(apps_volume)
+        latest_link = volume_path / "latest"
+        if latest_link.exists():
+            apps_dir = latest_link
+            print(f"Using apps from UC Volume: {apps_dir}")
+        elif volume_path.exists():
+            subdirs = [d for d in volume_path.iterdir() if d.is_dir()]
+            if subdirs:
+                apps_dir = max(subdirs, key=lambda d: d.name)
+                print(f"Using most recent apps dir: {apps_dir}")
+            else:
+                apps_dir = volume_path
+        else:
+            print(f"Warning: Apps volume not found at {apps_volume}")
+            apps_dir = klaudbiusz_dir / "app"
+    else:
+        apps_dir = klaudbiusz_dir / "app"
 
     if not apps_dir.exists():
         print(f"Apps directory not found at {apps_dir}")

From ea20bd011ce1d64719ab8e3beac71dd78a969191 Mon Sep 17 00:00:00 2001
From: Evgenii Kniazev <evgenii.kniazev@databricks.com>
Date: Mon, 8 Dec 2025 14:58:19 +0000
Subject: [PATCH 06/13] Fix UC Volume paths for CLI binary and generated apps

Use main.default.apps_mcp_artifacts and main.default.apps_mcp_generated
volumes which were created successfully.

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 experimental/apps-mcp/evals/databricks.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/experimental/apps-mcp/evals/databricks.yml b/experimental/apps-mcp/evals/databricks.yml
index dedb2f7893..74ab42aca8 100644
--- a/experimental/apps-mcp/evals/databricks.yml
+++ b/experimental/apps-mcp/evals/databricks.yml
@@ -30,10 +30,10 @@ variables:
     default: "4"
   cli_binary_volume:
     description: UC Volume path for CLI binary
-    default: /Volumes/main/evals/artifacts
+    default: /Volumes/main/default/apps_mcp_artifacts
   apps_volume:
     description: UC Volume path for generated apps
-    default: /Volumes/main/evals/generated_apps
+    default: /Volumes/main/default/apps_mcp_generated
   generation_parallelism:
     description: Number of parallel app generations
     default: "4"

From 3aa5dd89322802c847c5b25d23bb5a539a7d0698 Mon Sep 17 00:00:00 2001
From: Evgenii Kniazev <evgenii.kniazev@databricks.com>
Date: Tue, 9 Dec 2025 14:14:29 +0000
Subject: [PATCH 07/13] Fix app generation: use LiteLLM backend + fix UC Volume
 symlinks

- Use LiteLLM backend (anthropic/claude-sonnet-4-20250514) to bypass
  Claude Agent SDK root user restriction on Databricks clusters
- Replace symlinks with latest.txt file (symlinks not supported on UC Volumes)
- Revert docker_image and data_security_mode changes (not needed with LiteLLM)
- Successfully tested: generated hello-world app at $2.33 cost

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 experimental/apps-mcp/evals/databricks.yml    |  2 +-
 .../resources/apps_generation_job.job.yml     |  4 +-
 .../apps-mcp/evals/src/generate_apps.py       | 89 ++++++++++++++-----
 experimental/apps-mcp/evals/src/run_evals.py  |  8 +-
 4 files changed, 73 insertions(+), 30 deletions(-)

diff --git a/experimental/apps-mcp/evals/databricks.yml b/experimental/apps-mcp/evals/databricks.yml
index 74ab42aca8..2ea444d420 100644
--- a/experimental/apps-mcp/evals/databricks.yml
+++ b/experimental/apps-mcp/evals/databricks.yml
@@ -39,7 +39,7 @@ variables:
     default: "4"
   prompts:
     description: Prompt set for generation (databricks, databricks_v2, test)
-    default: databricks
+    default: test
 
 targets:
   dev:
diff --git a/experimental/apps-mcp/evals/resources/apps_generation_job.job.yml b/experimental/apps-mcp/evals/resources/apps_generation_job.job.yml
index 7242ea3b43..ce62d0fcde 100644
--- a/experimental/apps-mcp/evals/resources/apps_generation_job.job.yml
+++ b/experimental/apps-mcp/evals/resources/apps_generation_job.job.yml
@@ -6,7 +6,7 @@ resources:
       job_clusters:
         - job_cluster_key: generation_cluster
           new_cluster:
-            spark_version: "15.4.x-scala2.12"
+            spark_version: "16.2.x-scala2.12"
             node_type_id: "n2-standard-8"
             num_workers: 0
             data_security_mode: SINGLE_USER
@@ -17,6 +17,8 @@ resources:
               ResourceClass: SingleNode
             spark_env_vars:
               ANTHROPIC_API_KEY: "{{secrets/apps-mcp-evals/anthropic-api-key}}"
+              DATABRICKS_HOST: ${workspace.host}
+              DATABRICKS_TOKEN: "{{secrets/apps-mcp-evals/databricks-token}}"
             init_scripts:
               - workspace:
                   destination: ${workspace.file_path}/init/setup_generation.sh
diff --git a/experimental/apps-mcp/evals/src/generate_apps.py b/experimental/apps-mcp/evals/src/generate_apps.py
index e715d9d810..d6c7a19d62 100644
--- a/experimental/apps-mcp/evals/src/generate_apps.py
+++ b/experimental/apps-mcp/evals/src/generate_apps.py
@@ -44,39 +44,80 @@ def install_klaudbiusz_deps(klaudbiusz_dir: Path) -> None:
         print(f"Warning: pip install had issues: {result.stderr[:500]}")
 
 
+def get_prompts(prompts_name: str) -> dict:
+    """Load prompts from klaudbiusz."""
+    if prompts_name == "databricks":
+        return {
+            "churn-risk-dashboard": "Build a churn risk dashboard showing customers with less than 30 day login activity, declining usage trends, and support ticket volume. Calculate a risk score.",
+            "revenue-by-channel": "Show daily revenue by channel (store/web/catalog) for the last 90 days with week-over-week growth rates and contribution percentages.",
+            "customer-rfm-segments": "Create customer segments using RFM analysis (recency, frequency, monetary). Show 4-5 clusters with average spend, purchase frequency, and last order date.",
+            "taxi-trip-metrics": "Calculate taxi trip metrics: average fare by distance bracket and time of day. Show daily trip volume and revenue trends.",
+            "slow-moving-inventory": "Identify slow-moving inventory: products with more than 90 days in stock, low turnover ratio, and current warehouse capacity by location.",
+        }
+    elif prompts_name == "test":
+        return {
+            "hello-world": "Create a simple hello world app that displays a greeting message.",
+        }
+    else:
+        return {
+            "sample-dashboard": "Create a sample data dashboard with charts showing sales trends.",
+        }
+
+
 def run_generation(
     klaudbiusz_dir: Path,
     mcp_binary: str,
     output_dir: Path,
     prompts: str,
     max_concurrency: int,
-) -> None:
-    """Run bulk app generation using klaudbiusz."""
-    print(f"\nStarting app generation...")
+) -> int:
+    """Run app generation using local_run (no Dagger required)."""
+    print(f"\nStarting app generation (local mode, no Dagger)...")
     print(f"  MCP binary: {mcp_binary}")
     print(f"  Prompts: {prompts}")
-    print(f"  Max concurrency: {max_concurrency}")
     print(f"  Output dir: {output_dir}")
 
     env = os.environ.copy()
     env["PYTHONPATH"] = str(klaudbiusz_dir)
 
-    cmd = [
-        sys.executable,
-        "-m",
-        "cli.generation.bulk_run",
-        f"--prompts={prompts}",
-        f"--mcp_binary={mcp_binary}",
-        '--mcp_args=["experimental", "apps-mcp"]',
-        f"--max_concurrency={max_concurrency}",
-        f"--output_dir={output_dir}",
-    ]
-
-    print(f"\nRunning: {' '.join(cmd)}")
-    result = subprocess.run(cmd, cwd=klaudbiusz_dir, env=env)
-
-    if result.returncode != 0:
-        print(f"Generation completed with return code: {result.returncode}")
+    prompt_dict = get_prompts(prompts)
+    print(f"  Total prompts: {len(prompt_dict)}")
+
+    success_count = 0
+    fail_count = 0
+
+    for app_name, prompt in prompt_dict.items():
+        print(f"\n{'=' * 60}")
+        print(f"Generating: {app_name}")
+        print(f"Prompt: {prompt[:100]}...")
+        print("=" * 60)
+
+        # Use LiteLLM backend to avoid Claude Agent SDK root user restriction
+        # (Databricks clusters run as root, Claude Agent SDK refuses to run as root)
+        cmd = [
+            sys.executable,
+            "-m",
+            "cli.generation.local_run",
+            prompt,
+            f"--app_name={app_name}",
+            "--backend=litellm",
+            "--model=anthropic/claude-sonnet-4-20250514",
+            f"--mcp_binary={mcp_binary}",
+            '--mcp_args=["experimental", "apps-mcp"]',
+            f"--output_dir={output_dir}",
+        ]
+
+        result = subprocess.run(cmd, cwd=klaudbiusz_dir, env=env)
+
+        if result.returncode == 0:
+            success_count += 1
+            print(f"SUCCESS: {app_name}")
+        else:
+            fail_count += 1
+            print(f"FAILED: {app_name} (return code: {result.returncode})")
+
+    print(f"\nGeneration summary: {success_count} succeeded, {fail_count} failed")
+    return success_count
 
 
 def upload_to_volume(local_dir: Path, volume_path: str) -> int:
@@ -101,10 +142,10 @@ def upload_to_volume(local_dir: Path, volume_path: str) -> int:
     shutil.copytree(local_dir, dest_dir)
     print(f"Uploaded to {dest_dir}")
 
-    latest_link = volume_dir / "latest"
-    if latest_link.exists():
-        latest_link.unlink()
-    latest_link.symlink_to(dest_dir.name)
+    # Write latest run path to a file (symlinks not supported on UC Volumes)
+    latest_file = volume_dir / "latest.txt"
+    latest_file.write_text(str(dest_dir))
+    print(f"Latest run recorded in {latest_file}")
 
     return len(apps)
 
diff --git a/experimental/apps-mcp/evals/src/run_evals.py b/experimental/apps-mcp/evals/src/run_evals.py
index b314e9e5ee..b2e0ed2b41 100644
--- a/experimental/apps-mcp/evals/src/run_evals.py
+++ b/experimental/apps-mcp/evals/src/run_evals.py
@@ -186,10 +186,10 @@ def main(
 
     if apps_volume:
         volume_path = Path(apps_volume)
-        latest_link = volume_path / "latest"
-        if latest_link.exists():
-            apps_dir = latest_link
-            print(f"Using apps from UC Volume: {apps_dir}")
+        latest_file = volume_path / "latest.txt"
+        if latest_file.exists():
+            apps_dir = Path(latest_file.read_text().strip())
+            print(f"Using apps from UC Volume (via latest.txt): {apps_dir}")
         elif volume_path.exists():
             subdirs = [d for d in volume_path.iterdir() if d.is_dir()]
             if subdirs:

From 61be7e7c382145731197ee2eee5ed397f8c30c22 Mon Sep 17 00:00:00 2001
From: Evgenii Kniazev <evgenii.kniazev@databricks.com>
Date: Tue, 9 Dec 2025 14:27:52 +0000
Subject: [PATCH 08/13] Fix eval job CLI parameter parsing

- Change entry point from main to cli wrapper that uses fire.Fire()
- This enables proper CLI argument parsing for wheel package
- Now correctly receives apps_volume parameter from job config

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>
---
 experimental/apps-mcp/evals/pyproject.toml   | 2 +-
 experimental/apps-mcp/evals/src/run_evals.py | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/experimental/apps-mcp/evals/pyproject.toml b/experimental/apps-mcp/evals/pyproject.toml
index 7c388f3b95..02f47339e0 100644
--- a/experimental/apps-mcp/evals/pyproject.toml
+++ b/experimental/apps-mcp/evals/pyproject.toml
@@ -17,7 +17,7 @@ dependencies = [
 ]
 
 [project.scripts]
-main = "src.run_evals:main"
+main = "src.run_evals:cli"
 
 [tool.ruff]
 line-length = 120
diff --git a/experimental/apps-mcp/evals/src/run_evals.py b/experimental/apps-mcp/evals/src/run_evals.py
index b2e0ed2b41..235d386386 100644
--- a/experimental/apps-mcp/evals/src/run_evals.py
+++ b/experimental/apps-mcp/evals/src/run_evals.py
@@ -250,5 +250,10 @@ def main(
     print("\nEvaluation complete!")
 
 
-if __name__ == "__main__":
+def cli():
+    """CLI entry point using fire for argument parsing."""
     fire.Fire(main)
+
+
+if __name__ == "__main__":
+    cli()

From cf78b429753ba8ed702bfcb7e9c66079bad7c655 Mon Sep 17 00:00:00 2001
From: Evgenii Kniazev <evgenii.kniazev@databricks.com>
Date: Thu, 11 Dec 2025 11:38:27 +0000
Subject: [PATCH 09/13] Required to bypass proc mount restrictions and
 AppArmor.

---
 .../apps-mcp/evals/init/setup_eval.sh         |  55 ++++
 .../evals/resources/apps_eval_job.job.yml     |  34 +-
 experimental/apps-mcp/evals/src/run_evals.py  | 293 ++++++------------
 3 files changed, 171 insertions(+), 211 deletions(-)
 create mode 100644 experimental/apps-mcp/evals/init/setup_eval.sh

diff --git a/experimental/apps-mcp/evals/init/setup_eval.sh b/experimental/apps-mcp/evals/init/setup_eval.sh
new file mode 100644
index 0000000000..9870b307e5
--- /dev/null
+++ b/experimental/apps-mcp/evals/init/setup_eval.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+set -e
+
+echo "=== Apps-MCP Eval Setup ==="
+echo "Python version: $(python --version)"
+
+# Install Node.js (required for klaudbiusz eval)
+echo "Installing Node.js..."
+curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash -
+sudo apt-get install -y nodejs
+
+echo "Node version: $(node --version)"
+echo "npm version: $(npm --version)"
+
+# Install Docker (required for --no-dagger mode)
+echo "Installing Docker..."
+curl -fsSL https://get.docker.com -o get-docker.sh
+sudo sh get-docker.sh
+rm get-docker.sh
+
+# Configure Docker to use vfs storage driver (works without privileged mode)
+echo "Configuring Docker with vfs storage driver..."
+sudo mkdir -p /etc/docker
+cat <<EOF | sudo tee /etc/docker/daemon.json
+{
+  "storage-driver": "vfs"
+}
+EOF
+
+# Stop any existing Docker daemon
+sudo systemctl stop docker 2>/dev/null || true
+sudo pkill dockerd 2>/dev/null || true
+sleep 2
+
+# Start Docker daemon
+echo "Starting Docker daemon..."
+sudo dockerd --storage-driver=vfs &
+sleep 10
+
+# Verify Docker is running
+echo "Docker version: $(docker --version)"
+sudo docker info || echo "Warning: Docker daemon may not be fully started"
+
+# Allow non-root user to run docker
+sudo usermod -aG docker $(whoami) || true
+sudo chmod 666 /var/run/docker.sock || true
+
+# Pre-pull the node image to speed up evaluation
+echo "Pre-pulling node:20-alpine image..."
+docker pull node:20-alpine || echo "Warning: Could not pre-pull image"
+
+# Install Python dependencies
+pip install fire mlflow
+
+echo "=== Setup complete ==="
diff --git a/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml b/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
index 3c0129dcd8..c5c82c9e10 100644
--- a/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
+++ b/experimental/apps-mcp/evals/resources/apps_eval_job.job.yml
@@ -33,11 +33,30 @@ resources:
         - name: apps_volume
           default: ${var.apps_volume}
 
+      job_clusters:
+        - job_cluster_key: eval_cluster
+          new_cluster:
+            spark_version: "16.2.x-scala2.12"
+            node_type_id: "n2-standard-4"
+            num_workers: 0
+            data_security_mode: SINGLE_USER
+            spark_conf:
+              spark.databricks.cluster.profile: singleNode
+              spark.master: "local[*]"
+            custom_tags:
+              ResourceClass: SingleNode
+            spark_env_vars:
+              DATABRICKS_HOST: ${workspace.host}
+              DATABRICKS_TOKEN: "{{secrets/apps-mcp-evals/databricks-token}}"
+            init_scripts:
+              - workspace:
+                  destination: ${workspace.file_path}/init/setup_eval.sh
+
       tasks:
         - task_key: run_evals
-          python_wheel_task:
-            package_name: apps_mcp_evals
-            entry_point: main
+          job_cluster_key: eval_cluster
+          spark_python_task:
+            python_file: ${workspace.file_path}/src/run_evals.py
             parameters:
               - --mlflow-experiment
               - ${var.mlflow_experiment}
@@ -47,12 +66,3 @@ resources:
               - ${var.evals_git_url}
               - --apps-volume
               - ${var.apps_volume}
-
-          environment_key: default
-
-      environments:
-        - environment_key: default
-          spec:
-            environment_version: "1"
-            dependencies:
-              - ../dist/*.whl
diff --git a/experimental/apps-mcp/evals/src/run_evals.py b/experimental/apps-mcp/evals/src/run_evals.py
index 235d386386..ee9fd5f3d5 100644
--- a/experimental/apps-mcp/evals/src/run_evals.py
+++ b/experimental/apps-mcp/evals/src/run_evals.py
@@ -1,242 +1,138 @@
 #!/usr/bin/env python3
-"""
-Apps-MCP Evaluation Runner for Databricks Jobs.
+"""Apps-MCP Evaluation Runner for Databricks Jobs."""
 
-Runs bundle deploy/run to generate apps, then evaluates and logs to MLflow.
-"""
-
-import json
 import os
 import subprocess
 import sys
 import tempfile
-from datetime import datetime
+import time
 from pathlib import Path
 from typing import Optional
 
 import fire
-import mlflow
-
 
-def setup_mlflow(experiment_name: str) -> None:
-    """Configure MLflow to use Databricks tracking."""
-    mlflow.set_tracking_uri("databricks")
-    mlflow.set_experiment(experiment_name)
 
+def start_docker_daemon() -> bool:
+    """Start Docker daemon with vfs storage driver (works without privileges)."""
+    print("Checking Docker installation...")
 
-def clone_evals_repo(git_url: str, target_dir: Path) -> Path:
-    """Clone appdotbuild-agent repository."""
-    if target_dir.exists():
-        subprocess.run(["git", "-C", str(target_dir), "pull"], check=True, capture_output=True)
-    else:
-        result = subprocess.run(
-            ["git", "clone", "--depth", "1", git_url, str(target_dir)],
+    # Check if Docker CLI is available
+    result = subprocess.run(["which", "docker"], capture_output=True, text=True)
+    if result.returncode != 0:
+        print("Docker CLI not found, attempting to install...")
+        subprocess.run(
+            ["sudo", "bash", "-c", "curl -fsSL https://get.docker.com | sh"],
             check=False,
-            capture_output=True,
-            text=True,
         )
-        if result.returncode != 0:
-            print(f"Git clone stderr: {result.stderr}")
-            raise RuntimeError(f"Failed to clone {git_url}: {result.stderr}")
-    return target_dir
-
 
-def install_klaudbiusz_deps(evals_dir: Path) -> None:
-    """Install klaudbiusz dependencies using pip."""
-    klaudbiusz_dir = evals_dir / "klaudbiusz"
-    if not klaudbiusz_dir.exists():
-        print(f"klaudbiusz directory not found at {klaudbiusz_dir}")
-        return
-
-    print("Installing klaudbiusz dependencies...")
+    # Check if Docker is already running
     result = subprocess.run(
-        [sys.executable, "-m", "pip", "install", "-e", str(klaudbiusz_dir)],
-        capture_output=True,
-        text=True,
+        ["docker", "info"], capture_output=True, text=True, timeout=10
     )
-    if result.returncode != 0:
-        print(f"pip install output: {result.stdout}")
-        print(f"pip install errors: {result.stderr}")
-
-
-def run_evaluation(
-    evals_dir: Path,
-    apps_dir: Path,
-    parallelism: int = 4,
-    fast_mode: bool = True,
-) -> dict:
-    """Run evaluation on generated apps using klaudbiusz."""
-    klaudbiusz_dir = evals_dir / "klaudbiusz"
-
-    cmd = [
-        sys.executable,
-        "-m",
-        "cli.evaluation.evaluate_all",
-        "--dir",
-        str(apps_dir),
-        "--parallel",
-        str(parallelism),
-    ]
-    if fast_mode:
-        cmd.append("--fast")
-
-    env = os.environ.copy()
-    env["PYTHONPATH"] = str(klaudbiusz_dir)
-
-    print(f"Running: {' '.join(cmd)}")
-    print(f"Working dir: {klaudbiusz_dir}")
-    print(f"Apps dir: {apps_dir}")
-
-    result = subprocess.run(cmd, cwd=klaudbiusz_dir, env=env, capture_output=True, text=True)
-
-    print(f"Evaluation stdout: {result.stdout[:2000] if result.stdout else 'empty'}")
-    if result.returncode != 0:
-        print(f"Evaluation errors: {result.stderr[:2000] if result.stderr else 'empty'}")
-
-    eval_output_dir = klaudbiusz_dir / "cli" / "app-eval"
-    report_file = eval_output_dir / "evaluation_report.json"
-    if report_file.exists():
-        return json.loads(report_file.read_text())
-    return {}
-
-
-def log_results_to_mlflow(
-    evaluation_report: dict,
-    generation_results: Optional[dict] = None,
-    run_name: Optional[str] = None,
-) -> str:
-    """Log evaluation results to MLflow."""
-    if not run_name:
-        run_name = f"eval-{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}"
-
-    with mlflow.start_run(run_name=run_name) as run:
-        mlflow.set_tag("framework", "apps-mcp-evals")
-        mlflow.set_tag("run_type", "scheduled")
+    if result.returncode == 0:
+        print("Docker daemon already running")
+        return True
 
-        summary = evaluation_report.get("summary", {})
-        mlflow.log_param("total_apps", summary.get("total_apps", 0))
-        mlflow.log_param("timestamp", summary.get("evaluated_at", ""))
+    print("Starting Docker daemon...")
 
-        metrics = summary.get("metrics_summary", {})
-        if metrics:
-            mlflow.log_metric("avg_appeval_100", metrics.get("avg_appeval_100", 0))
-            if metrics.get("avg_eff_units") is not None:
-                mlflow.log_metric("avg_eff_units", metrics["avg_eff_units"])
-            mlflow.log_metric(
-                "build_success_rate", metrics.get("build_success", 0) / max(summary.get("total_apps", 1), 1)
-            )
-            mlflow.log_metric(
-                "runtime_success_rate", metrics.get("runtime_success", 0) / max(summary.get("total_apps", 1), 1)
-            )
-            mlflow.log_metric("local_runability_avg", metrics.get("local_runability_avg", 0))
-            mlflow.log_metric("deployability_avg", metrics.get("deployability_avg", 0))
-
-        if generation_results:
-            gen_metrics = generation_results.get("generation_metrics", {})
-            if gen_metrics.get("total_cost_usd"):
-                mlflow.log_metric("generation_cost_usd", gen_metrics["total_cost_usd"])
-            if gen_metrics.get("avg_turns"):
-                mlflow.log_metric("avg_turns_per_app", gen_metrics["avg_turns"])
-
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
-            json.dump(evaluation_report, f, indent=2)
-            mlflow.log_artifact(f.name, "reports")
+    # Start dockerd in background (config already set by init script)
+    proc = subprocess.Popen(
+        ["sudo", "dockerd"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
 
-        return run.info.run_id
+    # Wait for Docker to start
+    for i in range(60):
+        time.sleep(1)
+        result = subprocess.run(
+            ["sudo", "docker", "info"], capture_output=True, text=True, timeout=10
+        )
+        if result.returncode == 0:
+            print(f"Docker daemon started after {i+1}s")
+            # Fix socket permissions
+            subprocess.run(["sudo", "chmod", "666", "/var/run/docker.sock"], check=False)
+            return True
+        if proc.poll() is not None:
+            stdout, stderr = proc.communicate()
+            print(f"dockerd exited with code {proc.returncode}")
+            print(f"stderr: {stderr.decode()[:500]}")
+            break
+
+    print("Failed to start Docker daemon")
+    return False
+
+
+def clone_and_install_klaudbiusz(work_dir: Path, git_url: str) -> Path:
+    """Clone klaudbiusz and install dependencies."""
+    print(f"Cloning {git_url}...")
+    repo_dir = work_dir / "appdotbuild-agent"
+    subprocess.run(["git", "clone", "--depth", "1", git_url, str(repo_dir)], check=True)
+    klaudbiusz_dir = repo_dir / "klaudbiusz"
+    print("Installing klaudbiusz...")
+    subprocess.run([sys.executable, "-m", "pip", "install", "-e", str(klaudbiusz_dir)], check=True)
+    sys.path.insert(0, str(klaudbiusz_dir))
+    return klaudbiusz_dir
+
+
+def find_apps_dir(apps_volume: str) -> Optional[Path]:
+    """Find apps directory from UC Volume."""
+    volume_path = Path(apps_volume)
+    latest_file = volume_path / "latest.txt"
+    if latest_file.exists():
+        return Path(latest_file.read_text().strip())
+    if volume_path.exists():
+        run_dirs = [d for d in volume_path.iterdir() if d.is_dir() and d.name.startswith("run_")]
+        if run_dirs:
+            return max(run_dirs, key=lambda d: d.name)
+    return None
 
 
 def main(
     mlflow_experiment: str = "/Shared/apps-mcp-evaluations",
     parallelism: int = 4,
-    evals_git_url: str = "https://github.com/neondatabase/appdotbuild-agent.git",
     apps_volume: Optional[str] = None,
+    evals_git_url: str = "https://github.com/neondatabase/appdotbuild-agent.git",
 ) -> None:
-    """
-    Run Apps-MCP evaluations.
-
-    Args:
-        mlflow_experiment: MLflow experiment path
-        parallelism: Number of parallel workers
-        evals_git_url: Git URL for appdotbuild-agent eval framework
-        apps_volume: UC Volume path containing generated apps (optional)
-    """
-    print("Starting Apps-MCP Evaluation")
+    """Run Apps-MCP evaluations using klaudbiusz."""
+    print("=" * 60)
+    print("Apps-MCP Evaluation")
+    print("=" * 60)
     print(f"  MLflow Experiment: {mlflow_experiment}")
-    print(f"  Evals Repo: {evals_git_url}")
     print(f"  Parallelism: {parallelism}")
     print(f"  Apps Volume: {apps_volume or 'not specified'}")
-    print("=" * 60)
 
-    setup_mlflow(mlflow_experiment)
+    # Try to start Docker daemon
+    docker_available = start_docker_daemon()
+    if not docker_available:
+        print("Warning: Docker not available, container-based checks will fail")
 
     work_dir = Path(tempfile.mkdtemp(prefix="apps-mcp-evals-"))
-    evals_dir = work_dir / "appdotbuild-agent"
+    clone_and_install_klaudbiusz(work_dir, evals_git_url)
 
-    print(f"\nCloning evals repo to {evals_dir}...")
-    clone_evals_repo(evals_git_url, evals_dir)
+    from cli.evaluation import run_evaluation_simple
 
-    print("\nInstalling dependencies...")
-    install_klaudbiusz_deps(evals_dir)
+    apps_dir = find_apps_dir(apps_volume) if apps_volume else None
+    if apps_dir:
+        print(f"  Apps Dir: {apps_dir}")
+    else:
+        print("  Apps Dir: not found, will use default")
+        apps_dir = work_dir / "appdotbuild-agent" / "klaudbiusz" / "app"
 
     print("\n" + "=" * 60)
-    print("RUNNING EVALUATION")
+    print("Running evaluation...")
     print("=" * 60)
 
-    klaudbiusz_dir = evals_dir / "klaudbiusz"
-
-    if apps_volume:
-        volume_path = Path(apps_volume)
-        latest_file = volume_path / "latest.txt"
-        if latest_file.exists():
-            apps_dir = Path(latest_file.read_text().strip())
-            print(f"Using apps from UC Volume (via latest.txt): {apps_dir}")
-        elif volume_path.exists():
-            subdirs = [d for d in volume_path.iterdir() if d.is_dir()]
-            if subdirs:
-                apps_dir = max(subdirs, key=lambda d: d.name)
-                print(f"Using most recent apps dir: {apps_dir}")
-            else:
-                apps_dir = volume_path
-        else:
-            print(f"Warning: Apps volume not found at {apps_volume}")
-            apps_dir = klaudbiusz_dir / "app"
-    else:
-        apps_dir = klaudbiusz_dir / "app"
-
-    if not apps_dir.exists():
-        print(f"Apps directory not found at {apps_dir}")
-        print("Creating empty apps dir for sample run...")
-        apps_dir.mkdir(parents=True, exist_ok=True)
-
-    evaluation_report = run_evaluation(
-        evals_dir=evals_dir,
-        apps_dir=apps_dir,
+    report = run_evaluation_simple(
+        apps_dir=str(apps_dir),
+        mlflow_experiment=mlflow_experiment,
         parallelism=parallelism,
+        fast_mode=True,
     )
 
-    if not evaluation_report:
-        print("No apps found - creating sample report for infrastructure validation")
-        evaluation_report = {
-            "summary": {
-                "total_apps": 0,
-                "evaluated_at": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
-                "metrics_summary": {
-                    "avg_appeval_100": 0,
-                    "build_success": 0,
-                    "runtime_success": 0,
-                    "local_runability_avg": 0,
-                    "deployability_avg": 0,
-                },
-            },
-            "apps": [],
-        }
-
-    print("\nLogging results to MLflow...")
-    run_id = log_results_to_mlflow(evaluation_report)
-    print(f"MLflow Run ID: {run_id}")
-
-    summary = evaluation_report.get("summary", {})
+    summary = report.get("summary", {})
     metrics = summary.get("metrics_summary", {})
+
     print("\n" + "=" * 60)
     print("EVALUATION SUMMARY")
     print("=" * 60)
@@ -244,14 +140,13 @@ def main(
     print(f"Avg AppEval Score: {metrics.get('avg_appeval_100', 0):.1f}/100")
     print(f"Build Success: {metrics.get('build_success', 0)}")
     print(f"Runtime Success: {metrics.get('runtime_success', 0)}")
-    print(f"Local Runability: {metrics.get('local_runability_avg', 0):.1f}/5")
-    print(f"Deployability: {metrics.get('deployability_avg', 0):.1f}/5")
-
+    print(f"Type Safety: {metrics.get('type_safety_pass', 0)}")
+    print(f"Tests Pass: {metrics.get('tests_pass', 0)}")
     print("\nEvaluation complete!")
 
 
 def cli():
-    """CLI entry point using fire for argument parsing."""
+    """CLI entry point."""
     fire.Fire(main)
 
 

From 3d5b131daeaca7f4f738a32e4229954f37e6d920 Mon Sep 17 00:00:00 2001
From: Evgenii Kniazev <evgenii.kniazev@databricks.com>
Date: Thu, 11 Dec 2025 16:23:05 +0000
Subject: [PATCH 10/13] Refactor get_prompts function to use external import
 for prompt retrieval, simplifying the code and enhancing maintainability.

---
 .../apps-mcp/evals/src/generate_apps.py       | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/experimental/apps-mcp/evals/src/generate_apps.py b/experimental/apps-mcp/evals/src/generate_apps.py
index d6c7a19d62..313ed732c1 100644
--- a/experimental/apps-mcp/evals/src/generate_apps.py
+++ b/experimental/apps-mcp/evals/src/generate_apps.py
@@ -46,22 +46,9 @@ def install_klaudbiusz_deps(klaudbiusz_dir: Path) -> None:
 
 def get_prompts(prompts_name: str) -> dict:
     """Load prompts from klaudbiusz."""
-    if prompts_name == "databricks":
-        return {
-            "churn-risk-dashboard": "Build a churn risk dashboard showing customers with less than 30 day login activity, declining usage trends, and support ticket volume. Calculate a risk score.",
-            "revenue-by-channel": "Show daily revenue by channel (store/web/catalog) for the last 90 days with week-over-week growth rates and contribution percentages.",
-            "customer-rfm-segments": "Create customer segments using RFM analysis (recency, frequency, monetary). Show 4-5 clusters with average spend, purchase frequency, and last order date.",
-            "taxi-trip-metrics": "Calculate taxi trip metrics: average fare by distance bracket and time of day. Show daily trip volume and revenue trends.",
-            "slow-moving-inventory": "Identify slow-moving inventory: products with more than 90 days in stock, low turnover ratio, and current warehouse capacity by location.",
-        }
-    elif prompts_name == "test":
-        return {
-            "hello-world": "Create a simple hello world app that displays a greeting message.",
-        }
-    else:
-        return {
-            "sample-dashboard": "Create a sample data dashboard with charts showing sales trends.",
-        }
+    from cli.prompts import get_prompts as klaudbiusz_get_prompts
+
+    return klaudbiusz_get_prompts(prompts_name)
 
 
 def run_generation(

From b08f318a8388bc8e3dbef25eff1e8a05692ea0da Mon Sep 17 00:00:00 2001
From: Evgenii Kniazev <evgenii.kniazev@databricks.com>
Date: Thu, 11 Dec 2025 16:24:47 +0000
Subject: [PATCH 11/13] Update README.md for Apps-MCP Evals: Enhance
 documentation to clarify job structure, prerequisites, and configuration
 details. Introduce Generation and Evaluation jobs, update quick start
 commands, and add prompt sets and known limitations sections.

---
 experimental/apps-mcp/evals/README.md | 135 ++++++++++++++++++--------
 1 file changed, 92 insertions(+), 43 deletions(-)

diff --git a/experimental/apps-mcp/evals/README.md b/experimental/apps-mcp/evals/README.md
index c92eca624c..a464afc13d 100644
--- a/experimental/apps-mcp/evals/README.md
+++ b/experimental/apps-mcp/evals/README.md
@@ -1,80 +1,129 @@
-# Apps-MCP Continuous Evals
+# Apps-MCP Evals
 
-Databricks Asset Bundle for running continuous evaluations of the Apps-MCP code generation system.
+Databricks Asset Bundle for generating and evaluating apps using the Apps-MCP system with klaudbiusz framework.
 
 ## Overview
 
-This bundle deploys a scheduled Databricks job that:
-1. Runs the klaudbiusz evaluation framework
-2. Logs results to MLflow for tracking
-3. Alerts on failures or long-running evaluations
+This bundle provides two jobs:
+1. **Generation Job** - Generates apps using klaudbiusz with the Databricks CLI as MCP server
+2. **Evaluation Job** - Evaluates generated apps and logs results to MLflow
+
+## Prerequisites
+
+1. **Databricks Secrets** - Create secret scope and add tokens:
+   ```bash
+   databricks secrets create-scope apps-mcp-evals
+   databricks secrets put-secret apps-mcp-evals anthropic-api-key
+   databricks secrets put-secret apps-mcp-evals databricks-token
+   ```
+
+2. **UC Volumes** - Create volumes for artifacts:
+   ```bash
+   databricks volumes create main.default.apps_mcp_artifacts
+   databricks volumes create main.default.apps_mcp_generated
+   ```
+
+3. **CLI Binary** - Build and upload Linux CLI binary:
+   ```bash
+   GOOS=linux GOARCH=amd64 go build -o databricks-linux
+   databricks fs cp databricks-linux /Volumes/main/default/apps_mcp_artifacts/
+   ```
 
 ## Quick Start
 
 ```bash
-# Validate the bundle
+cd experimental/apps-mcp/evals
+
+# Validate bundle
 databricks bundle validate -t dev
 
-# Deploy to dev workspace
+# Deploy
 databricks bundle deploy -t dev
 
-# Run manually
-databricks bundle run -t dev apps_eval_job
+# Run generation (creates apps in UC Volume)
+databricks bundle run -t dev apps_generation_job
 
-# View results in MLflow
-# Navigate to: ML → Experiments → /Shared/apps-mcp-evaluations-staging
+# Run evaluation (evaluates apps, logs to MLflow)
+databricks bundle run -t dev apps_eval_job
 ```
 
+## Jobs
+
+### Generation Job (`apps_generation_job`)
+
+Generates apps using klaudbiusz's local_run with LiteLLM backend.
+
+**Parameters:**
+- `prompts` - Prompt set: `databricks`, `databricks_v2`, or `test` (default: `test`)
+- `cli_binary_volume` - Path to CLI binary volume
+- `apps_volume` - Output volume for generated apps
+
+**Cluster:** Jobs cluster with Spark 16.2.x (Python 3.12)
+
+### Evaluation Job (`apps_eval_job`)
+
+Evaluates generated apps using klaudbiusz's Docker-based evaluation.
+
+**Parameters:**
+- `apps_volume` - Volume containing apps to evaluate
+- `mlflow_experiment` - MLflow experiment for logging results
+- `parallelism` - Number of parallel evaluations
+
+**Cluster:** Jobs cluster with Spark 16.2.x, Docker installed via init script
+
+**Schedule:** Nightly at 2am UTC
+
 ## Configuration
 
 ### Variables
 
 | Variable | Description | Default |
 |----------|-------------|---------|
-| `catalog` | Unity Catalog for results | `main` |
-| `schema` | Schema for eval tables | `${workspace.current_user.short_name}` (dev) |
+| `prompts` | Prompt set for generation | `test` |
+| `cli_binary_volume` | UC Volume for CLI binary | `/Volumes/main/default/apps_mcp_artifacts` |
+| `apps_volume` | UC Volume for generated apps | `/Volumes/main/default/apps_mcp_generated` |
 | `mlflow_experiment` | MLflow experiment path | `/Shared/apps-mcp-evaluations` |
 | `eval_parallelism` | Parallel eval workers | `4` |
+| `evals_git_url` | klaudbiusz repo URL | `https://github.com/neondatabase/appdotbuild-agent.git` |
 
 ### Targets
 
-- **dev**: Development mode with personal schema, staging MLflow experiment
-- **prod**: Production mode with shared schema, service principal identity
-
-## Schedule
-
-The job runs nightly at 2am UTC. Manual runs can be triggered via:
-
-```bash
-databricks bundle run -t dev apps_eval_job
-```
+- **dev** - Development mode, staging MLflow experiment
+- **prod** - Production mode, service principal identity
 
 ## Monitoring
 
-- **MLflow**: View metrics trends at `/Shared/apps-mcp-evaluations`
-- **Health Alerts**: Job alerts if runtime exceeds 2 hours
-- **Email**: Failures notify apps-mcp-team@databricks.com
-
-## Development
-
-```bash
-# Build wheel locally
-uv build --wheel
-
-# Run evals locally (outside Databricks)
-uv run python -m src.run_evals --mode=eval_only --parallelism=4
-```
+- **MLflow** - View metrics at the configured experiment path
+- **Health Alerts** - Eval job alerts if runtime exceeds 2 hours
+- **Logs** - Check job run output for detailed evaluation results
 
 ## Architecture
 
 ```
 evals/
-├── databricks.yml           # Bundle configuration
+├── databricks.yml              # Bundle configuration
 ├── resources/
-│   └── apps_eval_job.job.yml  # Job definition
+│   ├── apps_generation_job.job.yml  # Generation job
+│   └── apps_eval_job.job.yml        # Evaluation job
+├── init/
+│   ├── setup_generation.sh     # Generation cluster init
+│   └── setup_eval.sh           # Eval cluster init (Docker)
 ├── src/
-│   ├── __init__.py
-│   └── run_evals.py         # Main orchestrator
-├── pyproject.toml           # Python package config
-└── README.md
+│   ├── generate_apps.py        # App generation orchestrator
+│   └── run_evals.py            # Evaluation orchestrator
+└── pyproject.toml              # Python package config
 ```
+
+## Prompt Sets
+
+Available prompt sets (configured via `prompts` variable):
+
+- `test` - Simple test prompts (1 app) for quick validation
+- `databricks` - 5 Databricks-focused dashboard prompts
+- `databricks_v2` - 20 realistic human-style prompts
+
+## Known Limitations
+
+- Docker containers require `--privileged` flag on Databricks clusters
+- Generation uses LiteLLM backend (Claude Agent SDK has root user restriction)
+- UC Volumes don't support symlinks, uses `latest.txt` file instead

From b2cf28cb5e1f708e98d61f9c697ae95ef1a9cce3 Mon Sep 17 00:00:00 2001
From: Evgenii Kniazev <evgenii.kniazev@databricks.com>
Date: Thu, 11 Dec 2025 17:31:25 +0000
Subject: [PATCH 12/13] Refactor eval setup and runner: Remove Docker
 installation and management from setup_eval.sh and run_evals.py, simplifying
 the evaluation process. Update Node.js installation comment for clarity and
 adjust evaluation runner to use local execution mode.

---
 .../apps-mcp/evals/init/setup_eval.sh         | 39 +-----------
 experimental/apps-mcp/evals/src/run_evals.py  | 60 +------------------
 2 files changed, 3 insertions(+), 96 deletions(-)

diff --git a/experimental/apps-mcp/evals/init/setup_eval.sh b/experimental/apps-mcp/evals/init/setup_eval.sh
index 9870b307e5..53d059aab6 100644
--- a/experimental/apps-mcp/evals/init/setup_eval.sh
+++ b/experimental/apps-mcp/evals/init/setup_eval.sh
@@ -4,7 +4,7 @@ set -e
 echo "=== Apps-MCP Eval Setup ==="
 echo "Python version: $(python --version)"
 
-# Install Node.js (required for klaudbiusz eval)
+# Install Node.js (required for local npm install/build/test)
 echo "Installing Node.js..."
 curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash -
 sudo apt-get install -y nodejs
@@ -12,43 +12,6 @@ sudo apt-get install -y nodejs
 echo "Node version: $(node --version)"
 echo "npm version: $(npm --version)"
 
-# Install Docker (required for --no-dagger mode)
-echo "Installing Docker..."
-curl -fsSL https://get.docker.com -o get-docker.sh
-sudo sh get-docker.sh
-rm get-docker.sh
-
-# Configure Docker to use vfs storage driver (works without privileged mode)
-echo "Configuring Docker with vfs storage driver..."
-sudo mkdir -p /etc/docker
-cat <<EOF | sudo tee /etc/docker/daemon.json
-{
-  "storage-driver": "vfs"
-}
-EOF
-
-# Stop any existing Docker daemon
-sudo systemctl stop docker 2>/dev/null || true
-sudo pkill dockerd 2>/dev/null || true
-sleep 2
-
-# Start Docker daemon
-echo "Starting Docker daemon..."
-sudo dockerd --storage-driver=vfs &
-sleep 10
-
-# Verify Docker is running
-echo "Docker version: $(docker --version)"
-sudo docker info || echo "Warning: Docker daemon may not be fully started"
-
-# Allow non-root user to run docker
-sudo usermod -aG docker $(whoami) || true
-sudo chmod 666 /var/run/docker.sock || true
-
-# Pre-pull the node image to speed up evaluation
-echo "Pre-pulling node:20-alpine image..."
-docker pull node:20-alpine || echo "Warning: Could not pre-pull image"
-
 # Install Python dependencies
 pip install fire mlflow
 
diff --git a/experimental/apps-mcp/evals/src/run_evals.py b/experimental/apps-mcp/evals/src/run_evals.py
index ee9fd5f3d5..ad2d30773d 100644
--- a/experimental/apps-mcp/evals/src/run_evals.py
+++ b/experimental/apps-mcp/evals/src/run_evals.py
@@ -1,68 +1,15 @@
 #!/usr/bin/env python3
 """Apps-MCP Evaluation Runner for Databricks Jobs."""
 
-import os
 import subprocess
 import sys
 import tempfile
-import time
 from pathlib import Path
 from typing import Optional
 
 import fire
 
 
-def start_docker_daemon() -> bool:
-    """Start Docker daemon with vfs storage driver (works without privileges)."""
-    print("Checking Docker installation...")
-
-    # Check if Docker CLI is available
-    result = subprocess.run(["which", "docker"], capture_output=True, text=True)
-    if result.returncode != 0:
-        print("Docker CLI not found, attempting to install...")
-        subprocess.run(
-            ["sudo", "bash", "-c", "curl -fsSL https://get.docker.com | sh"],
-            check=False,
-        )
-
-    # Check if Docker is already running
-    result = subprocess.run(
-        ["docker", "info"], capture_output=True, text=True, timeout=10
-    )
-    if result.returncode == 0:
-        print("Docker daemon already running")
-        return True
-
-    print("Starting Docker daemon...")
-
-    # Start dockerd in background (config already set by init script)
-    proc = subprocess.Popen(
-        ["sudo", "dockerd"],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    )
-
-    # Wait for Docker to start
-    for i in range(60):
-        time.sleep(1)
-        result = subprocess.run(
-            ["sudo", "docker", "info"], capture_output=True, text=True, timeout=10
-        )
-        if result.returncode == 0:
-            print(f"Docker daemon started after {i+1}s")
-            # Fix socket permissions
-            subprocess.run(["sudo", "chmod", "666", "/var/run/docker.sock"], check=False)
-            return True
-        if proc.poll() is not None:
-            stdout, stderr = proc.communicate()
-            print(f"dockerd exited with code {proc.returncode}")
-            print(f"stderr: {stderr.decode()[:500]}")
-            break
-
-    print("Failed to start Docker daemon")
-    return False
-
-
 def clone_and_install_klaudbiusz(work_dir: Path, git_url: str) -> Path:
     """Clone klaudbiusz and install dependencies."""
     print(f"Cloning {git_url}...")
@@ -102,11 +49,6 @@ def main(
     print(f"  Parallelism: {parallelism}")
     print(f"  Apps Volume: {apps_volume or 'not specified'}")
 
-    # Try to start Docker daemon
-    docker_available = start_docker_daemon()
-    if not docker_available:
-        print("Warning: Docker not available, container-based checks will fail")
-
     work_dir = Path(tempfile.mkdtemp(prefix="apps-mcp-evals-"))
     clone_and_install_klaudbiusz(work_dir, evals_git_url)
 
@@ -123,11 +65,13 @@ def main(
     print("Running evaluation...")
     print("=" * 60)
 
+    # Use no_dagger=False to use Dagger mode (runs locally, not in Docker containers)
     report = run_evaluation_simple(
         apps_dir=str(apps_dir),
         mlflow_experiment=mlflow_experiment,
         parallelism=parallelism,
         fast_mode=True,
+        no_dagger=False,
     )
 
     summary = report.get("summary", {})

From 6df3873aafd2341ca32a4ed7f01cf7346d607599 Mon Sep 17 00:00:00 2001
From: Evgenii Kniazev <evgenii.kniazev@databricks.com>
Date: Thu, 11 Dec 2025 18:05:41 +0000
Subject: [PATCH 13/13] Add local evaluation functionality: Implement
 run_local_evaluation function to execute app evaluations without Docker,
 enhancing the evaluation process. Update main function to utilize local mode
 and improve output messages for clarity.

---
 experimental/apps-mcp/evals/src/run_evals.py | 59 ++++++++++++++++----
 1 file changed, 47 insertions(+), 12 deletions(-)

diff --git a/experimental/apps-mcp/evals/src/run_evals.py b/experimental/apps-mcp/evals/src/run_evals.py
index ad2d30773d..bdc41856de 100644
--- a/experimental/apps-mcp/evals/src/run_evals.py
+++ b/experimental/apps-mcp/evals/src/run_evals.py
@@ -35,25 +35,64 @@ def find_apps_dir(apps_volume: str) -> Optional[Path]:
     return None
 
 
+def run_local_evaluation(apps_dir: Path, mlflow_experiment: str) -> dict:
+    """Run local evaluation using shell scripts (no Docker/Dagger)."""
+    import time
+    from dataclasses import asdict
+
+    from cli.evaluation.evaluate_app import evaluate_app
+    from cli.evaluation.evaluate_all import generate_summary_report
+    from cli.utils.apps_discovery import list_apps_in_dir
+
+    app_dirs = list_apps_in_dir(apps_dir)
+    if not app_dirs:
+        raise ValueError(f"No apps found in: {apps_dir}")
+
+    print(f"Evaluating {len(app_dirs)} apps locally...")
+
+    results = []
+    eval_start = time.time()
+
+    for i, app_dir in enumerate(app_dirs, 1):
+        print(f"\n[{i}/{len(app_dirs)}] {app_dir.name}")
+        try:
+            result = evaluate_app(app_dir, prompt=None, port=8000 + i)
+            results.append(asdict(result))
+        except Exception as e:
+            print(f"  Error: {e}")
+
+    eval_duration = time.time() - eval_start
+    print(f"\nEvaluated {len(results)}/{len(app_dirs)} apps in {eval_duration:.1f}s")
+
+    summary = generate_summary_report(results)
+    report = {"summary": summary, "apps": results}
+
+    if mlflow_experiment:
+        from cli.evaluation.tracking import log_evaluation_to_mlflow, setup_mlflow
+        if setup_mlflow(mlflow_experiment):
+            run_id = log_evaluation_to_mlflow(report)
+            if run_id:
+                print(f"MLflow run logged: {run_id}")
+
+    return report
+
+
 def main(
     mlflow_experiment: str = "/Shared/apps-mcp-evaluations",
     parallelism: int = 4,
     apps_volume: Optional[str] = None,
     evals_git_url: str = "https://github.com/neondatabase/appdotbuild-agent.git",
 ) -> None:
-    """Run Apps-MCP evaluations using klaudbiusz."""
+    """Run Apps-MCP evaluations using klaudbiusz (local mode)."""
     print("=" * 60)
-    print("Apps-MCP Evaluation")
+    print("Apps-MCP Evaluation (Local Mode)")
     print("=" * 60)
     print(f"  MLflow Experiment: {mlflow_experiment}")
-    print(f"  Parallelism: {parallelism}")
     print(f"  Apps Volume: {apps_volume or 'not specified'}")
 
     work_dir = Path(tempfile.mkdtemp(prefix="apps-mcp-evals-"))
     clone_and_install_klaudbiusz(work_dir, evals_git_url)
 
-    from cli.evaluation import run_evaluation_simple
-
     apps_dir = find_apps_dir(apps_volume) if apps_volume else None
     if apps_dir:
         print(f"  Apps Dir: {apps_dir}")
@@ -62,16 +101,12 @@ def main(
         apps_dir = work_dir / "appdotbuild-agent" / "klaudbiusz" / "app"
 
     print("\n" + "=" * 60)
-    print("Running evaluation...")
+    print("Running local evaluation...")
     print("=" * 60)
 
-    # Use no_dagger=False to use Dagger mode (runs locally, not in Docker containers)
-    report = run_evaluation_simple(
-        apps_dir=str(apps_dir),
+    report = run_local_evaluation(
+        apps_dir=apps_dir,
         mlflow_experiment=mlflow_experiment,
-        parallelism=parallelism,
-        fast_mode=True,
-        no_dagger=False,
     )
 
     summary = report.get("summary", {})