diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
new file mode 100644
index 0000000000..47376addc6
--- /dev/null
+++ b/.github/workflows/eval.yml
@@ -0,0 +1,176 @@
+name: AI Evals
+
+on:
+  workflow_dispatch: # Manual trigger only
+    inputs:
+      model:
+        description: 'Model to use for evaluation'
+        required: false
+        default: 'claude-sonnet-4-5-20250929'
+        type: string
+
+jobs:
+  eval:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write # For posting PR comments
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: '20'
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          # No requirements.txt needed for check_eval_scores.py (uses stdlib only)
+
+      - name: Run Evaluations
+        env:
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          LLM_AUTH_TOKEN: ${{ secrets.LLM_AUTH_TOKEN }}
+          LLM_MODEL: ${{ github.event.inputs.model || 'claude-sonnet-4-5-20250929' }}
+        run: |
+          chmod +x ./evals/scripts/run-promptfoo-eval.sh
+          ./evals/scripts/run-promptfoo-eval.sh --json
+
+      - name: Check Quality Thresholds
+        id: check_thresholds
+        run: |
+          python3 evals/scripts/check_eval_scores.py \
+            --results eval-results.json \
+            --min-score 0.70 \
+            --min-pass-rate 0.70 \
+            --verbose || echo "threshold_failed=true" >> $GITHUB_OUTPUT
+
+      - name: Generate Summary
+        if: always()
+        id: summary
+        run: |
+          if [ -f eval-results.json ]; then
+            python3 << 'EOF'
+          import json
+          import os
+
+          with open('eval-results.json', 'r') as f:
+              data = json.load(f)
+
+          results = data.get('results', {})
+          stats = results.get('stats', {})
+
+          total = stats.get('successes', 0) + stats.get('failures', 0)
+          passed = stats.get('successes', 0)
+          failed = stats.get('failures', 0)
+          pass_rate = (passed / total * 100) if total > 0 else 0
+
+          # Token usage
+          tokens = stats.get('tokenUsage', {})
+          total_tokens = tokens.get('total', 0)
+          cached_tokens = tokens.get('cached', 0)
+
+          summary = f"""## 📊 Eval Results
+
+          **Overall:** {passed}/{total} tests passed ({pass_rate:.0f}%)
+
+          | Metric | Value |
+          |--------|-------|
+          | ✅ Passed | {passed} |
+          | ❌ Failed | {failed} |
+          | 📈 Pass Rate | {pass_rate:.0f}% |
+          | 🪙 Total Tokens | {total_tokens:,} |
+          | 💾 Cached Tokens | {cached_tokens:,} |
+
+          """
+
+          # List failed tests
+          if failed > 0:
+              summary += "\n### ❌ Failed Tests\n\n"
+              for result in results.get('results', []):
+                  if not result.get('success', False):
+                      test_name = result.get('description', 'Unknown')
+                      score = result.get('score', 0)
+                      summary += f"- {test_name} (score: {score:.2f})\n"
+
+          # Success message
+          if pass_rate >= 70:
+              summary += "\n✅ **Quality thresholds met!**"
+          else:
+              summary += "\n⚠️ **Quality thresholds not met.** Please review failures."
+
+          # Write to output file for PR comment
+          with open('eval_summary.txt', 'w') as f:
+              f.write(summary)
+
+          print(summary)
+          EOF
+          else
+            echo "⚠️ No evaluation results found" > eval_summary.txt
+          fi
+
+      - name: Comment PR with Results
+        if: github.event_name == 'pull_request' && always()
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+
+            let summary = '## 📊 Eval Results\n\n⚠️ Evaluation failed to complete.';
+
+            if (fs.existsSync('eval_summary.txt')) {
+              summary = fs.readFileSync('eval_summary.txt', 'utf8');
+            }
+
+            // Find existing comment
+            const { data: comments } = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+            });
+
+            const botComment = comments.find(comment =>
+              comment.user.type === 'Bot' &&
+              comment.body.includes('📊 Eval Results')
+            );
+
+            if (botComment) {
+              // Update existing comment
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: botComment.id,
+                body: summary
+              });
+            } else {
+              // Create new comment
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body: summary
+              });
+            }
+
+      - name: Upload Results Artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-results
+          path: |
+            eval-results*.json
+            eval_summary.txt
+          retention-days: 30
+
+      - name: Fail if thresholds not met
+        if: steps.check_thresholds.outputs.threshold_failed == 'true'
+        run: |
+          echo "❌ Quality thresholds not met"
+          exit 1
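+
+# Manual trigger example (a sketch; assumes the GitHub CLI and that this workflow exists on the default branch):
+#   gh workflow run eval.yml -f model=claude-sonnet-4-5-20250929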
diff --git a/.github/workflows/scripts/create-github-release.sh b/.github/workflows/scripts/create-github-release.sh
index 1030bbef4c..64973aeff3 100644
--- a/.github/workflows/scripts/create-github-release.sh
+++ b/.github/workflows/scripts/create-github-release.sh
@@ -12,43 +12,27 @@ fi
VERSION="$1"
-# Remove 'v' prefix from version for release title
+# Tag prefix must remain in sync with get-next-version
+TAG_PREFIX="agentic-sdlc-v"
+
+# Remove prefix from version for release title
-VERSION_NO_V=${VERSION#v}
+VERSION_NO_V=${VERSION#${TAG_PREFIX}}
-gh release create "$VERSION" \
- .genreleases/spec-kit-template-copilot-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-copilot-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-claude-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-claude-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-gemini-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-gemini-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-cursor-agent-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-cursor-agent-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-opencode-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-opencode-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-qwen-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-qwen-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-windsurf-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-windsurf-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-codex-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-codex-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-kilocode-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-kilocode-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-auggie-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-auggie-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-roo-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-roo-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-codebuddy-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-codebuddy-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-qoder-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-qoder-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-amp-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-amp-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-shai-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-shai-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-q-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-q-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-bob-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-bob-ps-"$VERSION".zip \
- --title "Spec Kit Templates - $VERSION_NO_V" \
+ASSETS=()
+AGENTS=(copilot claude gemini cursor-agent opencode qwen windsurf codex kilocode auggie roo codebuddy qoder amp shai q bob)
+SCRIPTS=(sh ps)
+
+for agent in "${AGENTS[@]}"; do
+ for script in "${SCRIPTS[@]}"; do
+ asset_path=".genreleases/agentic-sdlc-spec-kit-template-${agent}-${script}-${VERSION}.zip"
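+    # e.g. .genreleases/agentic-sdlc-spec-kit-template-claude-sh-agentic-sdlc-v0.1.0.zip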
+ if [[ ! -f "$asset_path" ]]; then
+ echo "Error: missing release asset $asset_path" >&2
+ exit 1
+ fi
+ ASSETS+=("$asset_path")
+ done
+done
+
+gh release create "$VERSION" "${ASSETS[@]}" \
+ --title "Agentic SDLC Spec Kit Templates - $VERSION_NO_V" \
--notes-file release_notes.md
diff --git a/.github/workflows/scripts/create-release-packages.sh b/.github/workflows/scripts/create-release-packages.sh
index 48678282e1..921eb007a9 100755
--- a/.github/workflows/scripts/create-release-packages.sh
+++ b/.github/workflows/scripts/create-release-packages.sh
@@ -18,8 +18,10 @@ if [[ $# -ne 1 ]]; then
exit 1
fi
NEW_VERSION="$1"
-if [[ ! $NEW_VERSION =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
- echo "Version must look like v0.0.0" >&2
+TAG_PREFIX="agentic-sdlc-v"
+
+if [[ ! $NEW_VERSION =~ ^${TAG_PREFIX}[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+ echo "Version must look like ${TAG_PREFIX}0.0.0" >&2
exit 1
fi
@@ -218,8 +220,8 @@ build_variant() {
mkdir -p "$base_dir/.bob/commands"
generate_commands bob md "\$ARGUMENTS" "$base_dir/.bob/commands" "$script" ;;
esac
- ( cd "$base_dir" && zip -r "../spec-kit-template-${agent}-${script}-${NEW_VERSION}.zip" . )
- echo "Created $GENRELEASES_DIR/spec-kit-template-${agent}-${script}-${NEW_VERSION}.zip"
+ ( cd "$base_dir" && zip -r "../agentic-sdlc-spec-kit-template-${agent}-${script}-${NEW_VERSION}.zip" . )
+ echo "Created $GENRELEASES_DIR/agentic-sdlc-spec-kit-template-${agent}-${script}-${NEW_VERSION}.zip"
}
# Determine agent list
@@ -269,5 +271,4 @@ for agent in "${AGENT_LIST[@]}"; do
done
echo "Archives in $GENRELEASES_DIR:"
-ls -1 "$GENRELEASES_DIR"/spec-kit-template-*-"${NEW_VERSION}".zip
-
+ls -1 "$GENRELEASES_DIR"/agentic-sdlc-spec-kit-template-*-"${NEW_VERSION}".zip
diff --git a/.github/workflows/scripts/generate-release-notes.sh b/.github/workflows/scripts/generate-release-notes.sh
index d8f5dab1fc..0886c31a8a 100644
--- a/.github/workflows/scripts/generate-release-notes.sh
+++ b/.github/workflows/scripts/generate-release-notes.sh
@@ -10,11 +10,12 @@ if [[ $# -ne 2 ]]; then
exit 1
fi
+TAG_PREFIX="agentic-sdlc-v"
NEW_VERSION="$1"
LAST_TAG="$2"
# Get commits since last tag
-if [ "$LAST_TAG" = "v0.0.0" ]; then
+if [ "$LAST_TAG" = "${TAG_PREFIX}0.0.0" ]; then
# Check how many commits we have and use that as the limit
COMMIT_COUNT=$(git rev-list --count HEAD)
if [ "$COMMIT_COUNT" -gt 10 ]; then
diff --git a/.github/workflows/scripts/get-next-version.sh b/.github/workflows/scripts/get-next-version.sh
index 9770b9fdc3..c66437cc37 100644
--- a/.github/workflows/scripts/get-next-version.sh
+++ b/.github/workflows/scripts/get-next-version.sh
@@ -5,12 +5,18 @@ set -euo pipefail
# Calculate the next version based on the latest git tag and output GitHub Actions variables
# Usage: get-next-version.sh
-# Get the latest tag, or use v0.0.0 if no tags exist
-LATEST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0")
+# Prefix all fork-specific tags to avoid upstream conflicts
+TAG_PREFIX="agentic-sdlc-v"
+
+# Get the latest prefixed tag, or fall back to the prefixed zero version
+LATEST_TAG=$(git tag --list "${TAG_PREFIX}*" --sort=-v:refname | head -n 1)
+if [[ -z "${LATEST_TAG}" ]]; then
+ LATEST_TAG="${TAG_PREFIX}0.0.0"
+fi
echo "latest_tag=$LATEST_TAG" >> $GITHUB_OUTPUT
# Extract version number and increment
-VERSION=$(echo $LATEST_TAG | sed 's/v//')
+VERSION=${LATEST_TAG#${TAG_PREFIX}}
IFS='.' read -ra VERSION_PARTS <<< "$VERSION"
MAJOR=${VERSION_PARTS[0]:-0}
MINOR=${VERSION_PARTS[1]:-0}
@@ -18,7 +24,7 @@ PATCH=${VERSION_PARTS[2]:-0}
# Increment patch version
PATCH=$((PATCH + 1))
-NEW_VERSION="v$MAJOR.$MINOR.$PATCH"
+NEW_VERSION="${TAG_PREFIX}$MAJOR.$MINOR.$PATCH"
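+# e.g. LATEST_TAG=agentic-sdlc-v0.4.7 -> VERSION=0.4.7 -> NEW_VERSION=agentic-sdlc-v0.4.8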
echo "new_version=$NEW_VERSION" >> $GITHUB_OUTPUT
echo "New version will be: $NEW_VERSION"
diff --git a/.github/workflows/scripts/update-version.sh b/.github/workflows/scripts/update-version.sh
index 12bd9cd1d6..de1a247f06 100644
--- a/.github/workflows/scripts/update-version.sh
+++ b/.github/workflows/scripts/update-version.sh
@@ -12,8 +12,10 @@ fi
VERSION="$1"
-# Remove 'v' prefix for Python versioning
-PYTHON_VERSION=${VERSION#v}
+TAG_PREFIX="agentic-sdlc-v"
+
+# Remove custom prefix for Python versioning
+PYTHON_VERSION=${VERSION#${TAG_PREFIX}}
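+# e.g. VERSION=agentic-sdlc-v1.2.3 -> PYTHON_VERSION=1.2.3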
if [ -f "pyproject.toml" ]; then
sed -i "s/version = \".*\"/version = \"$PYTHON_VERSION\"/" pyproject.toml
diff --git a/.gitignore b/.gitignore
index 1ed573622d..02dfff054d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,9 @@ env/
.DS_Store
*.tmp
+# GitHub
+.github/workflows/.secrets
+
# Project specific
*.log
.env
@@ -43,4 +46,26 @@ env/
# Spec Kit-specific files
.genreleases/
*.zip
-sdd-*/
+sdd-*
+
+# Evaluation artifacts
+eval-results*.json
+*.backup
+.promptfoo/
+evals/.venv/
+evals/notebooks/.venv/
+evals/notebooks/.ipynb_checkpoints/
+evals/datasets/analysis-results/*.csv
+evals/datasets/analysis-results/*.txt
+evals/datasets/analysis-results/automated-analysis-*.csv
+evals/datasets/analysis-results/plan-analysis-*.csv
+evals/datasets/analysis-results/plan-eval-analysis-*.txt
+evals/datasets/analysis-results/plan-summary-*.txt
+evals/datasets/analysis-results/promptfoo-analysis-*.csv
+evals/datasets/analysis-results/summary-*.txt
+
+# Annotation tool artifacts
+evals/annotation-tool/.venv/
+evals/annotation-tool/annotations.json
+evals/annotation-tool/annotations_export_*.json
+evals/annotation-tool/.sesskey
diff --git a/.markdownlint-cli2.jsonc b/.markdownlint-cli2.jsonc
index d6db02777c..941cd3236c 100644
--- a/.markdownlint-cli2.jsonc
+++ b/.markdownlint-cli2.jsonc
@@ -22,6 +22,7 @@
}
},
"ignores": [
- ".genreleases/"
+ ".genreleases/",
+ "evals/docs"
]
-}
\ No newline at end of file
+}
diff --git a/.markdownlintignore b/.markdownlintignore
new file mode 100644
index 0000000000..f86f0edcc5
--- /dev/null
+++ b/.markdownlintignore
@@ -0,0 +1 @@
+evals/docs
diff --git a/AGENTS.md b/AGENTS.md
index d7360487b8..bff55dbe9b 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,10 +1,10 @@
# AGENTS.md
-## About Spec Kit and Specify
+## About Agentic SDLC Spec Kit and Specify
-**GitHub Spec Kit** is a comprehensive toolkit for implementing Spec-Driven Development (SDD) - a methodology that emphasizes creating clear specifications before implementation. The toolkit includes templates, scripts, and workflows that guide development teams through a structured approach to building software.
+**Agentic SDLC Spec Kit** is a comprehensive toolkit for implementing Spec-Driven Development (SDD) - a methodology that emphasizes creating clear specifications before implementation. The toolkit includes templates, scripts, and workflows that guide development teams through a structured approach to building software.
-**Specify CLI** is the command-line interface that bootstraps projects with the Spec Kit framework. It sets up the necessary directory structures, templates, and AI agent integrations to support the Spec-Driven Development workflow.
+**Specify CLI** is the command-line interface that bootstraps projects with the Agentic SDLC Spec Kit framework. It sets up the necessary directory structures, templates, and AI agent integrations to support the Spec-Driven Development workflow.
The toolkit supports multiple AI coding assistants, allowing teams to use their preferred tools while maintaining consistent project structure and development practices.
@@ -132,8 +132,8 @@ Modify `.github/workflows/scripts/create-github-release.sh` to include the new a
```bash
gh release create "$VERSION" \
# ... existing packages ...
- .genreleases/spec-kit-template-windsurf-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-windsurf-ps-"$VERSION".zip \
+ .genreleases/agentic-sdlc-spec-kit-template-windsurf-sh-"$VERSION".zip \
+ .genreleases/agentic-sdlc-spec-kit-template-windsurf-ps-"$VERSION".zip \
# Add new agent packages here
```
@@ -364,6 +364,22 @@ Command content with {SCRIPT} and {{args}} placeholders.
"""
```
+## Spec-Code Synchronization
+
+The `--spec-sync` option enables automatic spec-code synchronization via git hooks. This feature is supported for both bash (Linux/macOS) and PowerShell (Windows) environments:
+
+### Bash Implementation
+
+- **Scripts**: `scripts/bash/spec-hooks-install.sh`, `spec-sync-pre-commit.sh`, `spec-sync-post-commit.sh`, `spec-sync-pre-push.sh`
+- **Hooks**: Creates bash-based git hooks that invoke the bash scripts directly
+
+### PowerShell Implementation
+
+- **Scripts**: `scripts/powershell/spec-hooks-install.ps1`, `spec-sync-pre-commit.ps1`, `spec-sync-post-commit.ps1`, `spec-sync-pre-push.ps1`
+- **Hooks**: Creates bash-based git hooks that call the PowerShell scripts using `powershell -ExecutionPolicy Bypass -File`
+
+The CLI automatically selects the appropriate implementation based on the `--script` parameter (sh/ps) or platform default.
+
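+For illustration, the installed hooks are thin shims that delegate to the matching sync script. A minimal sketch of a generated bash pre-commit hook (assuming the scripts land under `.specify/scripts/`; the real content is written by `spec-hooks-install.sh`):
+
+```bash
+#!/usr/bin/env bash
+# .git/hooks/pre-commit (generated shim) - run the spec-sync check before each commit
+exec "$(git rev-parse --show-toplevel)/.specify/scripts/bash/spec-sync-pre-commit.sh" "$@"
+```
+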
## Directory Conventions
- **CLI agents**: Usually `./commands/`
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2b42e8fd61..361e5e5d7b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,6 +1,6 @@
-# Contributing to Spec Kit
+# Contributing to Agentic SDLC Spec Kit
-Hi there! We're thrilled that you'd like to contribute to Spec Kit. Contributions to this project are [released](https://help.github.com/articles/github-terms-of-service/#6-contributions-under-repository-license) to the public under the [project's open source license](LICENSE).
+Hi there! We're thrilled that you'd like to contribute to Agentic SDLC Spec Kit. Contributions to this project are [released](https://help.github.com/articles/github-terms-of-service/#6-contributions-under-repository-license) to the public under the [project's open source license](LICENSE).
Please note that this project is released with a [Contributor Code of Conduct](CODE_OF_CONDUCT.md). By participating in this project you agree to abide by its terms.
@@ -30,6 +30,25 @@ On [GitHub Codespaces](https://github.com/features/codespaces) it's even simpler
+## Repository setup
+
+This repository is a fork of [github/spec-kit](https://github.com/github/spec-kit). To keep your fork in sync:
+
+```bash
+# Add upstream remote (one-time setup)
+git remote add upstream https://github.com/github/spec-kit.git
+
+# Fetch upstream changes
+git fetch upstream
+
+# Merge upstream changes into your main branch
+git checkout main
+git merge upstream/main
+
+# Push the merged changes
+git push origin main
+```
+
## Submitting a pull request
> [!NOTE]
@@ -55,7 +74,7 @@ Here are a few things you can do that will increase the likelihood of your pull
## Development workflow
-When working on spec-kit:
+When working on Agentic SDLC Spec Kit:
1. Test changes with the `specify` CLI commands (`/speckit.specify`, `/speckit.plan`, `/speckit.tasks`) in your coding agent of choice
2. Verify templates are working correctly in `templates/` directory
@@ -85,16 +104,16 @@ To test your templates, commands, and other changes locally, follow these steps:
Navigate to your test project folder and open the agent to verify your implementation.
-## AI contributions in Spec Kit
+## AI contributions in Agentic SDLC Spec Kit
> [!IMPORTANT]
>
-> If you are using **any kind of AI assistance** to contribute to Spec Kit,
+> If you are using **any kind of AI assistance** to contribute to Agentic SDLC Spec Kit,
> it must be disclosed in the pull request or issue.
-We welcome and encourage the use of AI tools to help improve Spec Kit! Many valuable contributions have been enhanced with AI assistance for code generation, issue detection, and feature definition.
+We welcome and encourage the use of AI tools to help improve Agentic SDLC Spec Kit! Many valuable contributions have been enhanced with AI assistance for code generation, issue detection, and feature definition.
-That being said, if you are using any kind of AI assistance (e.g., agents, ChatGPT) while contributing to Spec Kit,
+That being said, if you are using any kind of AI assistance (e.g., agents, ChatGPT) while contributing to Agentic SDLC Spec Kit,
**this must be disclosed in the pull request or issue**, along with the extent to which AI assistance was used (e.g., documentation comments vs. code generation).
If your PR responses or comments are being generated by an AI, disclose that as well.
@@ -122,7 +141,7 @@ When submitting AI-assisted contributions, please ensure they include:
- **Clear disclosure of AI use** - You are transparent about AI use and degree to which you're using it for the contribution
- **Human understanding and testing** - You've personally tested the changes and understand what they do
-- **Clear rationale** - You can explain why the change is needed and how it fits within Spec Kit's goals
+- **Clear rationale** - You can explain why the change is needed and how it fits within Agentic SDLC Spec Kit's goals
- **Concrete evidence** - Include test cases, scenarios, or examples that demonstrate the improvement
- **Your own analysis** - Share your thoughts on the end-to-end developer experience
@@ -131,7 +150,7 @@ When submitting AI-assisted contributions, please ensure they include:
We reserve the right to close contributions that appear to be:
- Untested changes submitted without verification
-- Generic suggestions that don't address specific Spec Kit needs
+- Generic suggestions that don't address specific Agentic SDLC Spec Kit needs
- Bulk submissions that show no human review or understanding
### Guidelines for success
diff --git a/README.md b/README.md
index 76149512f6..8d0cecbaf4 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-
-
🌱 Spec Kit
+
+
🚀 Agentic SDLC Spec Kit
Build high-quality software faster.
@@ -8,10 +8,34 @@
An open source toolkit that allows you to focus on product scenarios and predictable outcomes instead of vibe coding every piece from scratch.
+## 🎯 Project Vision
+
+This fork combines the [Agentic SDLC 12 Factors](https://tikalk.github.io/agentic-sdlc-12-factors/) methodology with Spec-Driven Development to create a comprehensive framework for AI-assisted software development. The 12 Factors provide the strategic foundation and operational principles, while Spec-Driven Development delivers the practical implementation workflow.
+
+### Why This Combination?
+
+**Agentic SDLC 12 Factors** establish the philosophical and strategic principles for building software with AI coding agents, covering aspects like strategic mindset, context scaffolding, dual execution loops, and team capability.
+
+**Spec-Driven Development** provides the concrete, actionable process for implementing these principles through structured specification, planning, task breakdown, and iterative implementation phases.
+
+Together, they form a complete methodology that transforms how organizations approach AI-assisted development, moving from ad-hoc AI usage to systematic, high-quality software production.
+
+### Why This Fork?
+
+The original [github/spec-kit](https://github.com/github/spec-kit) repository focused on the core Spec-Driven Development process. This fork extends that foundation by:
+
+- **Integrating the 12 Factors methodology** as the strategic layer above the tactical Spec-Driven process
+- **Adding enterprise-grade features** like team AI directives, MCP server integration, and gateway configuration
+- **Enhancing tooling** with advanced CLI options, async agent support, comprehensive issue tracker integration, and spec-code synchronization
+- **Implementing AI session context management** through the levelup command that creates reusable knowledge packets and analyzes contributions to team directives
+- **Providing team templates** and best practices for scaling AI-assisted development across teams
+
+This fork represents the evolution from a development process to a complete organizational methodology for AI-native software development, with sophisticated knowledge management and cross-project learning capabilities.
+
-
-
-
+
+
+
@@ -21,7 +45,6 @@
- [🤔 What is Spec-Driven Development?](#-what-is-spec-driven-development)
- [⚡ Get Started](#-get-started)
-- [📽️ Video Overview](#️-video-overview)
- [🤖 Supported AI Agents](#-supported-ai-agents)
- [🔧 Specify CLI Reference](#-specify-cli-reference)
- [📚 Core Philosophy](#-core-philosophy)
@@ -51,7 +74,7 @@ Choose your preferred installation method:
Install once and use everywhere:
```bash
-uv tool install specify-cli --from git+https://github.com/github/spec-kit.git
+uv tool install agentic-sdlc-specify-cli --from git+https://github.com/tikalk/agentic-sdlc-spec-kit.git
```
Then use the tool directly:
@@ -72,7 +95,7 @@ specify check
To upgrade Specify, see the [Upgrade Guide](./docs/upgrade.md) for detailed instructions. Quick upgrade:
```bash
-uv tool install specify-cli --force --from git+https://github.com/github/spec-kit.git
+uv tool install agentic-sdlc-specify-cli --force --from git+https://github.com/tikalk/agentic-sdlc-spec-kit.git
```
#### Option 2: One-time Usage
@@ -80,7 +103,7 @@ uv tool install specify-cli --force --from git+https://github.com/github/spec-ki
Run directly without installing:
```bash
-uvx --from git+https://github.com/github/spec-kit.git specify init
+uvx --from git+https://github.com/tikalk/agentic-sdlc-spec-kit.git specify init
```
**Benefits of persistent installation:**
@@ -90,6 +113,207 @@ uvx --from git+https://github.com/github/spec-kit.git specify init
-| `<project-name>` | Argument | Name for your new project directory (optional if using `--here`, or use `.` for current directory) |
+| Argument/Option | Type | Description |
+|------------------------------|----------|-----------------------------------------------------------------------------|
+| `<project-name>` | Argument | Name for your new project directory (optional if using `--here`, or use `.` for current directory) |
| `--ai` | Option | AI assistant to use: `claude`, `gemini`, `copilot`, `cursor-agent`, `qwen`, `opencode`, `codex`, `windsurf`, `kilocode`, `auggie`, `roo`, `codebuddy`, `amp`, `shai`, `q`, `bob`, or `qoder` |
-| `--script` | Option | Script variant to use: `sh` (bash/zsh) or `ps` (PowerShell) |
-| `--ignore-agent-tools` | Flag | Skip checks for AI agent tools like Claude Code |
-| `--no-git` | Flag | Skip git repository initialization |
-| `--here` | Flag | Initialize project in the current directory instead of creating a new one |
-| `--force` | Flag | Force merge/overwrite when initializing in current directory (skip confirmation) |
-| `--skip-tls` | Flag | Skip SSL/TLS verification (not recommended) |
-| `--debug` | Flag | Enable detailed debug output for troubleshooting |
-| `--github-token` | Option | GitHub token for API requests (or set GH_TOKEN/GITHUB_TOKEN env variable) |
+| `--script` | Option | Script type: `sh` (POSIX) or `ps` (PowerShell) |
+| `--ignore-agent-tools` | Flag | Skip checks for AI agent tools like Claude Code |
+| `--no-git` | Flag | Skip git repository initialization |
+| `--here` | Flag | Initialize project in the current directory instead of creating a new one |
+| `--force` | Flag | Force merge/overwrite when initializing in current directory (skip confirmation) |
+| `--skip-tls` | Flag | Skip SSL/TLS verification (not recommended) |
+| `--debug` | Flag | Enable detailed debug output for troubleshooting |
+| `--github-token` | Option | GitHub token for API requests (or set GH_TOKEN/GITHUB_TOKEN env variable) |
+| `--team-ai-directives` | Option | Path or URL to team-ai-directives repository |
+| `--issue-tracker` | Option | Issue tracker MCP: `github`, `jira`, `linear`, `gitlab` |
+| `--async-agent` | Option | Async agent MCP: `jules`, `async-copilot`, `async-codex` |
+| `--git-platform` | Option | Git platform MCP for PR operations: `github`, `gitlab` |
+| `--gateway-url` | Option | Central LLM gateway URL |
+| `--gateway-token` | Option | Gateway authentication token |
+| `--gateway-suppress-warning` | Flag | Suppress gateway warning messages |
+| `--spec-sync` | Flag | Enable automatic spec-code synchronization (keeps specs/*.md files updated with code changes) |
+
+### `/mode` Arguments & Options
+
+| Argument/Option | Type | Description |
+|-----------------|----------|-----------------------------------------------------------------------------|
+| `<mode>` | Argument | Workflow mode: `build` (lightweight) or `spec` (comprehensive) - leave empty to show current mode |
+| `--tdd/--no-tdd` | Option | Enable/disable TDD (Test-Driven Development) |
+| `--contracts/--no-contracts` | Option | Enable/disable API contract generation |
+| `--data-models/--no-data-models` | Option | Enable/disable data model generation |
+| `--risk-tests/--no-risk-tests` | Option | Enable/disable risk-based test generation |
+| `--reset-opinions` | Option | Reset framework opinions to mode defaults |
+| `--info`, `-i` | Flag | Show detailed information about available modes |
### Examples
@@ -238,8 +485,26 @@ specify init my-project --ai claude --debug
# Use GitHub token for API requests (helpful for corporate environments)
specify init my-project --ai claude --github-token ghp_your_token_here
+# Initialize with team AI directives and issue tracker integration
+specify init my-project --ai claude --team-ai-directives https://github.com/your-org/team-ai-directives.git --issue-tracker github
+
+# Initialize with async agent support for autonomous task execution
+specify init my-project --ai claude --async-agent jules
+
+# Initialize with central LLM gateway configuration
+specify init my-project --ai claude --gateway-url https://proxy.internal --gateway-token $TOKEN
+
+# Complex example: Enterprise setup with all integrations
+specify init enterprise-app --ai claude --script sh --team-ai-directives https://github.com/company/team-ai-directives.git --issue-tracker jira --async-agent jules --gateway-url https://llm-gateway.company.com --gateway-token $GATEWAY_TOKEN --github-token $GH_TOKEN
+
# Check system requirements
specify check
+
+# Workflow mode management
+/mode # Show current mode
+/mode build # Switch to lightweight build mode
+/mode spec # Switch to comprehensive spec mode
+/mode --info # Show detailed mode information
```
### Available Slash Commands
@@ -250,13 +515,14 @@ After running `specify init`, your AI coding agent will have access to these sla
Essential commands for the Spec-Driven Development workflow:
-| Command | Description |
-| ----------------------- | ------------------------------------------------------------------------ |
-| `/speckit.constitution` | Create or update project governing principles and development guidelines |
-| `/speckit.specify` | Define what you want to build (requirements and user stories) |
-| `/speckit.plan` | Create technical implementation plans with your chosen tech stack |
-| `/speckit.tasks` | Generate actionable task lists for implementation |
-| `/speckit.implement` | Execute all tasks to build the feature according to the plan |
+| Command | Description |
+|--------------------------|-----------------------------------------------------------------------|
+| `/speckit.constitution` | Create or update project governing principles and development guidelines |
+| `/speckit.specify` | Define what you want to build (requirements and user stories) |
+| `/speckit.plan` | Create technical implementation plans with your chosen tech stack & SYNC/ASYNC triage |
+| `/speckit.tasks` | Generate actionable task lists for implementation |
+| `/speckit.implement` | Execute all tasks to build the feature according to the plan with dual execution loops (SYNC/ASYNC modes) |
+| `/speckit.levelup` | Capture learnings and contribute to team knowledge repository |
#### Optional Commands
@@ -283,6 +549,23 @@ Spec-Driven Development is a structured process that emphasizes:
- **Multi-step refinement** rather than one-shot code generation from prompts
- **Heavy reliance** on advanced AI model capabilities for specification interpretation
+### Alignment with Agentic SDLC 12 Factors
+
+This methodology aligns with the [Agentic SDLC 12 Factors](https://tikalk.github.io/agentic-sdlc-12-factors/) framework, which provides foundational principles for building software with AI coding agents. Key alignments include:
+
+- **Factor I: Strategic Mindset** - Intent-driven development with clear specifications
+- **Factor II: Context Scaffolding** - Rich organizational principles and guardrails
+- **Factor III: Mission Definition** - Structured specification creation process
+- **Factor IV: Structured Planning** - Multi-step refinement with technical planning
+- **Factor V: Dual Execution Loops** - SYNC/ASYNC execution modes for different development phases
+- **Factor VI: The Great Filter** - Quality gates and validation checkpoints
+- **Factor VII: Adaptive Quality Gates** - Flexible quality assurance based on project needs
+- **Factor VIII: AI-Augmented, Risk-Based Testing** - Intelligent testing strategies
+- **Factor IX: Traceability** - End-to-end artifact traceability
+- **Factor X: Strategic Tooling** - Purpose-built tools for AI-assisted development
+- **Factor XI: Directives as Code** - Team AI directives for consistent behavior
+- **Factor XII: Team Capability** - Knowledge sharing and continuous improvement
+
## 🌟 Development Phases
| Phase | Focus | Key Activities |
@@ -638,12 +921,18 @@ rm gcm-linux_amd64.2.6.1.deb
## 👥 Maintainers
+### Original Repository
+
- Den Delimarsky ([@localden](https://github.com/localden))
- John Lam ([@jflam](https://github.com/jflam))
+### Fork Maintainers (tikalk/agentic-sdlc-spec-kit)
+
+- Lior Kanfi ([@kanfil](https://github.com/kanfil))
+
## 💬 Support
-For support, please open a [GitHub issue](https://github.com/github/spec-kit/issues/new). We welcome bug reports, feature requests, and questions about using Spec-Driven Development.
+For support, please open a [GitHub issue](https://github.com/tikalk/agentic-sdlc-spec-kit/issues/new). We welcome bug reports, feature requests, and questions about using Spec-Driven Development.
## 🙏 Acknowledgements
diff --git a/SUPPORT.md b/SUPPORT.md
index c6acf76e05..24469f99aa 100644
--- a/SUPPORT.md
+++ b/SUPPORT.md
@@ -12,7 +12,7 @@ For help or questions about using this project, please:
## Project Status
-**Spec Kit** is under active development and maintained by GitHub staff **AND THE COMMUNITY**. We will do our best to respond to support, feature requests, and community questions in a timely manner.
+**Agentic SDLC Spec Kit** is under active development and maintained by GitHub staff **AND THE COMMUNITY**. We will do our best to respond to support, feature requests, and community questions in a timely manner.
## GitHub Support Policy
diff --git a/docs/README.md b/docs/README.md
index a9edc81dc9..ab9c84bef8 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,6 +1,6 @@
# Documentation
-This folder contains the documentation source files for Spec Kit, built using [DocFX](https://dotnet.github.io/docfx/).
+This folder contains the documentation source files for Agentic SDLC Spec Kit, built using [DocFX](https://dotnet.github.io/docfx/). Agentic SDLC Spec Kit is built upon the [Agentic SDLC 12 Factors](https://tikalk.github.io/agentic-sdlc-12-factors/) methodology for structured AI-assisted development.
## Building Locally
diff --git a/docs/docfx.json b/docs/docfx.json
index dca3f0f578..790f951d43 100644
--- a/docs/docfx.json
+++ b/docs/docfx.json
@@ -56,9 +56,9 @@
"cleanupCacheHistory": false,
"disableGitFeatures": false,
"globalMetadata": {
- "_appTitle": "Spec Kit Documentation",
- "_appName": "Spec Kit",
- "_appFooter": "Spec Kit - A specification-driven development toolkit",
+ "_appTitle": "Agentic SDLC Spec Kit Documentation",
+ "_appName": "Agentic SDLC Spec Kit",
+ "_appFooter": "Agentic SDLC Spec Kit - A specification-driven development toolkit",
"_enableSearch": true,
"_disableContribution": false,
"_gitContribute": {
diff --git a/docs/index.md b/docs/index.md
index a56fcc1761..3df4422667 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,4 +1,4 @@
-# Spec Kit
+# Agentic SDLC Spec Kit
*Build high-quality software faster.*
@@ -58,6 +58,25 @@ Our research and experimentation focus on:
- Provide robust iterative feature development workflows
- Extend processes to handle upgrades and modernization tasks
+## Alignment with Agentic SDLC 12 Factors
+
+Spec-Driven Development is built upon the [Agentic SDLC 12 Factors](https://tikalk.github.io/agentic-sdlc-12-factors/) methodology, a comprehensive framework for software development with AI coding agents. The 12 factors provide the foundational principles that guide our approach:
+
+- [I. Strategic Mindset](https://tikalk.github.io/agentic-sdlc-12-factors/content/strategic-mindset.html): Developer as Orchestrator, AI as Intern
+- [II. Context Scaffolding](https://tikalk.github.io/agentic-sdlc-12-factors/content/context-scaffolding.html): Treat Context as a Dependency
+- [III. Mission Definition](https://tikalk.github.io/agentic-sdlc-12-factors/content/mission-definition.html): From Intent to Specification
+- [IV. Structured Planning](https://tikalk.github.io/agentic-sdlc-12-factors/content/structured-planning.html): Decompose and Triage Tasks
+- [V. Dual Execution Loops](https://tikalk.github.io/agentic-sdlc-12-factors/content/dual-execution-loops.html): Pair Program or Delegate Toil
+- [VI. The Great Filter](https://tikalk.github.io/agentic-sdlc-12-factors/content/great-filter.html): Apply Irreplaceable Human Judgment
+- [VII. Adaptive Quality Gates](https://tikalk.github.io/agentic-sdlc-12-factors/content/adaptive-quality-gates.html): Review Appropriately for Each Workflow
+- [VIII. AI-Augmented, Risk-Based Testing](https://tikalk.github.io/agentic-sdlc-12-factors/content/ai-augmented-testing.html): AI-Augmented, Risk-Based Testing
+- [IX. Traceability](https://tikalk.github.io/agentic-sdlc-12-factors/content/traceability.html): Linking the 'Why' to the 'How'
+- [X. Strategic Tooling](https://tikalk.github.io/agentic-sdlc-12-factors/content/strategic-tooling.html): Manage a Federated, Governed Stack
+- [XI. Directives as Code](https://tikalk.github.io/agentic-sdlc-12-factors/content/directives-as-code.html): Version and Share AI Behavior
+- [XII. Team Capability](https://tikalk.github.io/agentic-sdlc-12-factors/content/team-capability.html): Systematize Learning and Improvement
+
+This alignment ensures that Agentic SDLC Spec Kit provides a structured, high-velocity approach to AI-assisted development while maintaining quality and accountability.
+
## Contributing
Please see our [Contributing Guide](https://github.com/github/spec-kit/blob/main/CONTRIBUTING.md) for information on how to contribute to this project.
diff --git a/docs/installation.md b/docs/installation.md
index 6daff24315..e8bb108319 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -1,5 +1,7 @@
# Installation Guide
+> Agentic SDLC Spec Kit implements the [Agentic SDLC 12 Factors](https://tikalk.github.io/agentic-sdlc-12-factors/) methodology for structured AI-assisted development.
+
## Prerequisites
- **Linux/macOS** (or Windows; PowerShell scripts now supported without WSL)
@@ -15,15 +17,15 @@
The easiest way to get started is to initialize a new project:
```bash
-uvx --from git+https://github.com/github/spec-kit.git specify init
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init
```
Or initialize in the current directory:
```bash
-uvx --from git+https://github.com/github/spec-kit.git specify init .
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init .
# or use the --here flag
-uvx --from git+https://github.com/github/spec-kit.git specify init --here
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --here
```
### Specify AI Agent
@@ -31,10 +33,10 @@ uvx --from git+https://github.com/github/spec-kit.git specify init --here
You can proactively specify your AI agent during initialization:
```bash
-uvx --from git+https://github.com/github/spec-kit.git specify init --ai claude
-uvx --from git+https://github.com/github/spec-kit.git specify init --ai gemini
-uvx --from git+https://github.com/github/spec-kit.git specify init --ai copilot
-uvx --from git+https://github.com/github/spec-kit.git specify init --ai codebuddy
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --ai claude
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --ai gemini
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --ai copilot
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --ai codebuddy
```
### Specify Script Type (Shell vs PowerShell)
@@ -50,8 +52,8 @@ Auto behavior:
Force a specific script type:
```bash
-uvx --from git+https://github.com/github/spec-kit.git specify init --script sh
-uvx --from git+https://github.com/github/spec-kit.git specify init --script ps
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --script sh
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --script ps
```
### Ignore Agent Tools Check
@@ -59,18 +61,69 @@
-uvx --from git+https://github.com/github/spec-kit.git specify init --ai claude --ignore-agent-tools
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --ai claude --ignore-agent-tools
+```
+
+### Configure Team AI Directives
+
+Connect to shared team knowledge and standards:
+
+```bash
+# Use local team-ai-directives directory
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --team-ai-directives ~/workspace/team-ai-directives
+
+# Clone from remote repository
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --team-ai-directives https://github.com/your-org/team-ai-directives.git
+```
+
+### Enable Issue Tracker Integration
+
+Configure MCP servers for project management integration:
+
+```bash
+# GitHub Issues
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --issue-tracker github
+
+# Jira, Linear, or GitLab
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --issue-tracker jira
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --issue-tracker linear
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --issue-tracker gitlab
+```
+
+### Enable Async Agent Support
+
+Configure autonomous coding agents for delegated task execution:
+
+```bash
+# Jules, Async Copilot, or Async Codex
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --async-agent jules
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --async-agent async-copilot
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --async-agent async-codex
+```
+
+### Configure Central LLM Gateway
+
+Set up proxy configuration for centralized AI model access:
+
+```bash
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --gateway-url https://proxy.internal --gateway-token $TOKEN
```
## Verification
After initialization, you should see the following commands available in your AI agent:
+- `/speckit.constitution` - Establish project principles and assemble constitution
- `/speckit.specify` - Create specifications
-- `/speckit.plan` - Generate implementation plans
+- `/speckit.plan` - Generate implementation plans
- `/speckit.tasks` - Break down into actionable tasks
+- `/speckit.implement` - Execute implementation with SYNC/ASYNC dual execution loops
+- `/speckit.levelup` - Capture learnings and contribute to team knowledge
+- `/speckit.analyze` - Cross-artifact consistency and alignment reports
+- `/speckit.checklist` - Generate quality checklists for requirements validation
+- `/speckit.clarify` - Structured questions to de-risk ambiguous areas
-The `.specify/scripts` directory will contain both `.sh` and `.ps1` scripts.
+The `.specify/scripts` directory will contain both `.sh` and `.ps1` scripts for automation, and `.mcp.json` will be configured for issue tracker and async agent integration if specified.
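+
+For reference, a generated `.mcp.json` entry might look roughly like the sketch below (illustrative only; the actual server command, package, and environment variables depend on the tracker you select):
+
+```json
+{
+  "mcpServers": {
+    "github": {
+      "command": "npx",
+      "args": ["-y", "@modelcontextprotocol/server-github"],
+      "env": { "GITHUB_PERSONAL_ACCESS_TOKEN": "${GH_TOKEN}" }
+    }
+  }
+}
+```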
## Troubleshooting
diff --git a/docs/local-development.md b/docs/local-development.md
index 7fac06adf4..814c455293 100644
--- a/docs/local-development.md
+++ b/docs/local-development.md
@@ -1,13 +1,13 @@
# Local Development Guide
-This guide shows how to iterate on the `specify` CLI locally without publishing a release or committing to `main` first.
+This guide shows how to iterate on the `specify` CLI locally without publishing a release or committing to `main` first. This process aligns with [X. Strategic Tooling](https://tikalk.github.io/agentic-sdlc-12-factors/content/strategic-tooling.html), managing a governed stack of development tools for controlled and efficient workflows.
> Scripts now have both Bash (`.sh`) and PowerShell (`.ps1`) variants. The CLI auto-selects based on OS unless you pass `--script sh|ps`.
## 1. Clone and Switch Branches
```bash
-git clone https://github.com/github/spec-kit.git
+git clone https://github.com/github/agentic-sdlc-spec-kit.git
-cd spec-kit
+cd agentic-sdlc-spec-kit
# Work on a feature branch
git checkout -b your-feature-branch
@@ -60,7 +60,7 @@ You can also point uvx at a specific branch without merging:
```bash
# Push your working branch first
git push origin your-feature-branch
-uvx --from git+https://github.com/github/spec-kit.git@your-feature-branch specify init demo-branch-test --script ps
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git@your-feature-branch specify init demo-branch-test --script ps
```
### 4a. Absolute Path uvx (Run From Anywhere)
@@ -68,21 +68,21 @@ uvx --from git+https://github.com/github/spec-kit.git@your-feature-branch specif
If you're in another directory, use an absolute path instead of `.`:
```bash
-uvx --from /mnt/c/GitHub/spec-kit specify --help
-uvx --from /mnt/c/GitHub/spec-kit specify init demo-anywhere --ai copilot --ignore-agent-tools --script sh
+uvx --from /mnt/c/GitHub/agentic-sdlc-spec-kit specify --help
+uvx --from /mnt/c/GitHub/agentic-sdlc-spec-kit specify init demo-anywhere --ai copilot --ignore-agent-tools --script sh
```
Set an environment variable for convenience:
```bash
-export SPEC_KIT_SRC=/mnt/c/GitHub/spec-kit
+export SPEC_KIT_SRC=/mnt/c/GitHub/agentic-sdlc-spec-kit
uvx --from "$SPEC_KIT_SRC" specify init demo-env --ai copilot --ignore-agent-tools --script ps
```
(Optional) Define a shell function:
```bash
-specify-dev() { uvx --from /mnt/c/GitHub/spec-kit specify "$@"; }
+specify-dev() { uvx --from /mnt/c/GitHub/agentic-sdlc-spec-kit specify "$@"; }
# Then
specify-dev --help
```
@@ -146,7 +146,7 @@ specify init demo --skip-tls --ai gemini --ignore-agent-tools --script ps
| Run CLI directly | `python -m src.specify_cli --help` |
| Editable install | `uv pip install -e .` then `specify ...` |
| Local uvx run (repo root) | `uvx --from . specify ...` |
-| Local uvx run (abs path) | `uvx --from /mnt/c/GitHub/spec-kit specify ...` |
+| Local uvx run (abs path) | `uvx --from /mnt/c/GitHub/agentic-sdlc-spec-kit specify ...` |
| Git branch uvx | `uvx --from git+URL@branch specify ...` |
| Build wheel | `uv build` |
diff --git a/docs/quickstart.md b/docs/quickstart.md
index 4d3b863b35..1456b49fa3 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -1,10 +1,144 @@
# Quick Start Guide
-This guide will help you get started with Spec-Driven Development using Spec Kit.
+This guide will help you get started with Spec-Driven Development using Agentic SDLC Spec Kit.
> [!NOTE]
> All automation scripts now provide both Bash (`.sh`) and PowerShell (`.ps1`) variants. The `specify` CLI auto-selects based on OS unless you pass `--script sh|ps`.
+## Stage 0: Foundation & Setup
+
+**Goal:** Establish the foundational rules and configure the development environment so every later stage aligns with the project's architectural and security principles.
+**Note:** Run these steps in a standard terminal before opening the Intelligent IDE.
+**Alignment with 12 Factors:** This stage establishes the foundation guided by [I. Strategic Mindset](https://tikalk.github.io/agentic-sdlc-12-factors/content/strategic-mindset.html) and [II. Context Scaffolding](https://tikalk.github.io/agentic-sdlc-12-factors/content/context-scaffolding.html), positioning the developer as orchestrator and assembling necessary context for AI collaboration.
+
+### Choose Your Workflow Mode
+
+Specify supports two workflow modes that control development complexity, plus configurable framework opinions:
+
+- **`spec` mode (default)**: Full structured development with comprehensive requirements, research, and validation
+- **`build` mode**: Lightweight approach focused on quick implementation and exploration
+
+**Framework Opinions** (configurable within each mode):
+
+- **TDD**: Test-Driven Development (tests before implementation)
+- **API Contracts**: Automatic contract generation during planning
+- **Data Models**: Automatic data model generation during planning
+
+```bash
+# Check current mode and opinions
+/mode
+
+# Switch to lightweight mode for prototyping
+/mode build
+
+# Switch to comprehensive mode for production features
+/mode spec
+
+# Customize framework opinions
+/mode --no-contracts # Disable API contracts
+/mode --tdd # Enable TDD
+/mode --risk-tests # Enable risk-based testing
+/mode --reset-opinions # Reset to mode defaults
+
+# Learn more about modes and opinions
+/mode --info
+```
+
+**Recommendation:** Start with `build` mode for exploration, switch to `spec` mode when features become complex or need thorough documentation.
+
+1. **Project Initialization (`/init`)**
+ **Action:** From the project root, run the Agentic SDLC Spec Kit `init` command (e.g., `specify init --team-ai-directives https://github.com/your-org/team-ai-directives.git`) to configure local settings and clone the shared `team-ai-directives` modules.
+ **Purpose:** Creates the handshake that brings the repository into the managed Agentic SDLC ecosystem, wiring credentials, endpoints, and shared knowledge needed for subsequent commands.
+2. **Establishing the Constitution (`/constitution`)**
+ **Action:** Within the IDE, execute `/constitution`, importing relevant modules from `team-ai-directives` and adding any project-specific principles.
+ **Purpose:** Generates `memory/constitution.md`, the immutable ruleset automatically injected into `/specify`, `/plan`, and other workflows so every response honors project standards.
+
+**Example Command:**
+
+```text
+/constitution "Assemble the constitution for this service. Import principles from @team/context_modules/principles/stateless_services.md and @team/context_modules/principles/zero_trust_security_model.md. Add the custom principle: 'All public APIs must be versioned.'"
+```
+
+**Outcome:** The IDE is fully integrated with the Orchestration Hub, and a committed `constitution.md` anchors all future automation.
+
+## Stage 1: Feature Specification
+
+**Goal:** Produce a committed `spec.md` that captures the feature's intent, constraints, and acceptance criteria.
+**Note:** From Stage 1 onward, all work happens inside the Intelligent IDE with the context automatically assembled by Agentic SDLC Spec Kit.
+**Alignment with 12 Factors:** This stage focuses on [III. Mission Definition](https://tikalk.github.io/agentic-sdlc-12-factors/content/mission-definition.html), translating intent into formal, version-controlled specifications.
+
+1. **Craft the Directive (`/specify`)**
+ **Action:** Author a single, comprehensive natural-language directive that blends the issue tracker mission, personas, constraints, and any clarifications.
+ **Purpose:** Front-load human judgment so the AI can draft an accurate `spec.md` aligned with the constitution.
+2. **Execute the Command**
+ **Action:** Run `/specify` in the IDE; Agentic SDLC Spec Kit loads `memory/constitution.md`, resolves `@team/...` references against the directives repo, and captures any `@issue-tracker ISSUE-###` reference in the prompt so the resulting spec links back to the originating ticket.
+   **Purpose:** Generates the structured specification artifact under `specs/<feature>/spec.md` with shared principles and traceability already in context.
+3. **Review and Commit**
+ **Action:** Perform a macro-review of the generated `spec.md`, refine if needed, then commit it.
+ **Purpose:** Locks in the requirements that all later stages will honor.
+
+**Example Command:**
+
+```text
+/specify "Generate the specification for the feature in @issue-tracker ISSUE-123. The target user is the @team/personas/data_analyst.md. The operation must be asynchronous to handle large dashboards. The PDF title must include the dashboard name and an export timestamp."
+```
+
+**Outcome:** A committed `spec.md` ready to drive planning in Stage 2.
+
+## Stage 2: Planning & Task Management
+
+**Goal:** Convert the committed `spec.md` into a human-approved `plan.md` and a synced task list that routes work through the issue tracker.
+**Note:** `/plan` and `/tasks` run inside the IDE, reusing the constitution and the locally cloned `team-ai-directives` modules.
+**Alignment with 12 Factors:** This stage implements [IV. Structured Planning](https://tikalk.github.io/agentic-sdlc-12-factors/content/structured-planning.html) and [V. Dual Execution Loops](https://tikalk.github.io/agentic-sdlc-12-factors/content/dual-execution-loops.html), decomposing tasks and triaging them for synchronous or asynchronous execution.
+
+1. **Generate the Plan (`/plan`)**
+ **Action:** Execute `/plan` with a directive that covers tech stack, risk considerations, testing focus, and any implementation preferences. Agentic SDLC Spec Kit loads `memory/constitution.md`, references in `team-ai-directives`, and copies the plan template before executing automation.
+   **Purpose:** Guides the AI toward a comprehensive, strategically sound first draft of `plan.md`. Front-loading human judgment yields more robust outputs: the AI produces technical steps with preliminary [SYNC]/[ASYNC] triage suggestions while emitting `plan.md`, `research.md`, `data-model.md`, `quickstart.md`, and contract stubs aligned with the constitution.
+2. **Macro-Review and Commit**
+ **Action:** Review the generated artifacts, adjust as needed, decide [SYNC]/[ASYNC] triage, then commit.
+ **Purpose:** Locks in an execution strategy that downstream stages must respect.
+3. **Sync Tasks (`/tasks`)**
+ **Action:** Run `/tasks` to transform the validated plan into numbered tasks, ensuring each contract, test, and implementation step is represented. The command requires the committed plan artifacts and will surface gaps if prerequisites are missing.
+ **Purpose:** Creates `tasks.md` and mirrors it to the issue tracker for execution visibility.
+
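+**Example Command** (illustrative; adapt the stack and risk notes to your feature):
+
+```text
+/plan "Create the implementation plan for the dashboard export feature from ISSUE-123. Use FastAPI with an async worker for PDF rendering. Suggest [ASYNC] triage for long-running render tasks and keep API schema changes [SYNC] for pair review."
+```
+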
+**Outcome:** A constitution-compliant `plan.md`, supporting design artifacts, and an actionable task list synchronized with project management.
+
+## Stage 3: Implementation
+
+**Goal:** Execute the validated plan, honoring the `[SYNC]/[ASYNC]` execution modes and completing every task in `tasks.md`.
+**Note:** Use `/implement` within the IDE; the command enforces the TDD order, dependency rules, and execution modes captured in Stages 1-2.
+**Alignment with 12 Factors:** This stage applies [VI. The Great Filter](https://tikalk.github.io/agentic-sdlc-12-factors/content/great-filter.html), [VII. Adaptive Quality Gates](https://tikalk.github.io/agentic-sdlc-12-factors/content/adaptive-quality-gates.html), and [VIII. AI-Augmented, Risk-Based Testing](https://tikalk.github.io/agentic-sdlc-12-factors/content/ai-augmented-testing.html), ensuring human judgment filters AI output with appropriate review processes and targeted testing.
+
+1. **Execute Tasks (`/implement`)**
+ **Action:** Run `/implement` to load `plan.md`, `tasks.md`, and supporting artifacts. Follow the phase-by-phase flow, completing risk-based tests before implementation and respecting `[SYNC]/[ASYNC]` modes and `[P]` parallel markers for efficient execution.
+ **Purpose:** Produces production-ready code with targeted testing based on identified risks, marks tasks as `[X]`, and preserves the execution trace for Stage 4.
+2. **Review & Validate**
+ **Action:** Ensure all `[SYNC]` tasks received micro-reviews, all `[ASYNC]` work underwent macro-review, and the test suite passes before moving on.
+ **Purpose:** Guarantees the feature matches the spec and plan with traceable quality gates.
+
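+**Example Command** (illustrative):
+
+```text
+/implement "Execute the tasks in tasks.md phase by phase. Pair with me on the [SYNC] tasks, delegate the [ASYNC] tasks, and run the risk-based tests before each implementation step."
+```
+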
+**Outcome:** A completed feature branch with passing tests and an updated `tasks.md` documenting execution status and modes.
+
+## Stage 4: Leveling Up
+
+**Goal:** Capture best practices from the completed feature, draft a reusable knowledge asset in `team-ai-directives`, and generate traceability notes for the original issue.
+**Note:** `/levelup` runs inside the IDE and relies on the locally cloned directives repository from Stage 0.
+**Alignment with 12 Factors:** This stage encompasses [IX. Traceability](https://tikalk.github.io/agentic-sdlc-12-factors/content/traceability.html), [X. Strategic Tooling](https://tikalk.github.io/agentic-sdlc-12-factors/content/strategic-tooling.html), [XI. Directives as Code](https://tikalk.github.io/agentic-sdlc-12-factors/content/directives-as-code.html), and [XII. Team Capability](https://tikalk.github.io/agentic-sdlc-12-factors/content/team-capability.html), linking artifacts, managing tools, versioning AI behavior, and systematizing learning.
+
+1. **Run Level-Up Workflow (`/levelup`)**
+ **Action:** Invoke `/levelup` with a strategic directive (e.g., highlight what should become reusable). Agentic SDLC Spec Kit gathers spec/plan/tasks metadata, validates the directives repo, and prompts you to synthesize a knowledge asset plus PR/issue summaries.
+ **Purpose:** Produces a draft markdown asset under `.specify/memory/team-ai-directives/drafts/`, along with a pull-request description and trace comment for review.
+2. **Review & Publish**
+ **Action:** Inspect the generated asset and summaries. When satisfied, confirm inside `/levelup` to let it create a `levelup/{slug}` branch, commit the asset, push (when remotes are configured), open a PR via `gh pr create` (or emit the command), and post the trace comment (or provide the text if automation is unavailable).
+ **Purpose:** Ensures lessons learned become part of the team's shared brain and closes the loop with traceability artifacts without manual branching overhead.
+
+**Example Command:**
+
+```text
+/levelup "Capture the FastAPI error-handling patterns we refined while closing ISSUE-123. Summarize why the retry strategy works, when to apply it, and provide links to the final implementation."
+```
+
+**Outcome:** A knowledge asset ready for PR, a drafted trace comment for the issue tracker, and clear next steps for team review.
+
## The 6-Step Process
> [!TIP]
@@ -14,19 +148,20 @@ This guide will help you get started with Spec-Driven Development using Spec Kit
**In your terminal**, run the `specify` CLI command to initialize your project:
+> **Note:** All slash commands adapt their behavior based on your current workflow mode. Use `/mode` to check or change modes.
+
```bash
# Create a new project directory
-uvx --from git+https://github.com/github/spec-kit.git specify init
-
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init
# OR initialize in the current directory
-uvx --from git+https://github.com/github/spec-kit.git specify init .
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init .
```
Pick script type explicitly (optional):
```bash
-uvx --from git+https://github.com/github/spec-kit.git specify init --script ps # Force PowerShell
-uvx --from git+https://github.com/github/spec-kit.git specify init --script sh # Force POSIX shell
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --script ps # Force PowerShell
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --script sh # Force POSIX shell
```
### Step 2: Define Your Constitution
@@ -156,6 +291,37 @@ Finally, implement the solution:
- **Iterate and refine** your specifications before implementation
- **Validate** the plan before coding begins
- **Let the AI agent handle** the implementation details
+- **Choose your complexity level** with workflow modes (build for speed, spec for thoroughness)
+
+## Mode Transitions
+
+Your development needs may change as features evolve:
+
+### When to Switch from Build to Spec Mode
+
+```bash
+/mode spec
+```
+
+**Indicators:**
+
+- Feature scope is growing beyond initial expectations
+- Multiple stakeholders need detailed documentation
+- Production deployment requires comprehensive testing
+- Integration with existing systems becomes complex
+
+### When to Switch from Spec to Build Mode
+
+```bash
+/mode build
+```
+
+**Indicators:**
+
+- Shifting to exploratory prototyping
+- Need to quickly validate a technical approach
+- Working on throwaway proof-of-concepts
+- Time pressure requires a simplified process
+
## Next Steps
diff --git a/docs/triage-framework.md b/docs/triage-framework.md
new file mode 100644
index 0000000000..0d930c6fc5
--- /dev/null
+++ b/docs/triage-framework.md
@@ -0,0 +1,307 @@
+# Triage Framework: [SYNC] vs [ASYNC] Task Classification
+
+## Overview
+
+The Triage Framework provides structured guidance for classifying implementation tasks as [SYNC] (human-reviewed execution) or [ASYNC] (autonomous agent delegation). This classification optimizes the dual execution loop by routing tasks to the most appropriate execution method.
+
+## Core Principles
+
+1. **Quality First**: Complex tasks requiring human judgment are classified as [SYNC]
+2. **Efficiency Optimization**: Routine tasks are delegated to agents as [ASYNC]
+3. **Risk Mitigation**: Critical paths maintain human oversight
+4. **Continuous Learning**: Triage decisions improve over time through metrics
+
+## Decision Tree: Task Classification
+
+```text
+START: New Implementation Task
+    │
+    ├── Is this task security-critical?
+    │   ├── YES → [SYNC] (Security classification)
+    │   └── NO → Continue
+    │
+    ├── Does this task involve external integrations?
+    │   ├── YES → [SYNC] (Integration classification)
+    │   └── NO → Continue
+    │
+    ├── Is the requirement ambiguous or unclear?
+    │   ├── YES → [SYNC] (Clarity classification)
+    │   └── NO → Continue
+    │
+    ├── Does this task require architectural decisions?
+    │   ├── YES → [SYNC] (Architecture classification)
+    │   └── NO → Continue
+    │
+    ├── Is this complex business logic?
+    │   ├── YES → [SYNC] (Complexity classification)
+    │   └── NO → Continue
+    │
+    ├── Is this well-defined CRUD/standard pattern?
+    │   ├── YES → [ASYNC] (Standard pattern classification)
+    │   └── NO → Continue
+    │
+    ├── Does this have comprehensive test coverage?
+    │   ├── YES → [ASYNC] (Test coverage classification)
+    │   └── NO → [SYNC] (Insufficient testing safeguards)
+    │
+    └── [ASYNC] (Default classification)
+```
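+
+Teams that want to encode the tree in tooling can express the walk as a first-match-wins chain. The sketch below is illustrative, not part of the kit; the boolean flags on `Task` are hypothetical inputs you would fill in during Phase 2 assessment.
+
+```python
+from dataclasses import dataclass
+
+@dataclass
+class Task:
+    # Hypothetical flags derived during classification assessment
+    security_critical: bool = False
+    external_integration: bool = False
+    ambiguous_requirements: bool = False
+    architectural_decision: bool = False
+    complex_logic: bool = False
+    standard_pattern: bool = False
+    comprehensive_tests: bool = False
+
+def triage(task: Task) -> tuple[str, str]:
+    """Walk the decision tree top to bottom; the first match wins."""
+    if task.security_critical:
+        return "[SYNC]", "Security classification"
+    if task.external_integration:
+        return "[SYNC]", "Integration classification"
+    if task.ambiguous_requirements:
+        return "[SYNC]", "Clarity classification"
+    if task.architectural_decision:
+        return "[SYNC]", "Architecture classification"
+    if task.complex_logic:
+        return "[SYNC]", "Complexity classification"
+    if task.standard_pattern:
+        return "[ASYNC]", "Standard pattern classification"
+    if task.comprehensive_tests:
+        return "[ASYNC]", "Test coverage classification"
+    # Mirroring the tree strictly: the test-coverage question routes
+    # uncovered tasks to [SYNC] before the [ASYNC] default is reached.
+    return "[SYNC]", "Insufficient testing safeguards"
+```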
+
+## Detailed Classification Criteria
+
+### [SYNC] Classifications (Human Execution Required)
+
+#### 1. Security Classification
+
+**When to Apply**: Tasks involving authentication, authorization, encryption, data protection, or compliance requirements.
+
+**Examples**:
+
+- User authentication flows
+- API key management
+- Data encryption/decryption
+- Access control logic
+- GDPR/privacy compliance
+
+**Rationale**: Security-critical code requires human expertise and cannot be delegated due to liability and compliance risks.
+
+#### 2. Integration Classification
+
+**When to Apply**: Tasks involving external APIs, legacy systems, third-party services, or complex data transformations.
+
+**Examples**:
+
+- Payment gateway integration
+- External API consumption
+- Legacy system migration
+- Data import/export pipelines
+- Webhook implementations
+
+**Rationale**: Integration complexity often requires domain expertise and error handling that agents cannot reliably implement.
+
+#### 3. Clarity Classification
+
+**When to Apply**: Tasks with ambiguous requirements, unclear acceptance criteria, or multiple interpretation possibilities.
+
+**Examples**:
+
+- Vague user story requirements
+- Missing edge case specifications
+- Conflicting stakeholder expectations
+- Novel feature implementations
+- Research-dependent tasks
+
+**Rationale**: Ambiguous requirements need human clarification and interpretation before implementation.
+
+#### 4. Architecture Classification
+
+**When to Apply**: Tasks involving system design decisions, component boundaries, or architectural patterns.
+
+**Examples**:
+
+- Database schema design
+- API contract definition
+- Component architecture
+- Design pattern selection
+- Performance optimization strategies
+
+**Rationale**: Architectural decisions have long-term impact and require experienced design judgment.
+
+#### 5. Complexity Classification
+
+**When to Apply**: Tasks involving non-trivial algorithms, state machines, or complex business logic.
+
+**Examples**:
+
+- Custom algorithms
+- State machine implementations
+- Complex validation logic
+- Mathematical computations
+- Multi-step business processes
+
+**Rationale**: Complex logic requires deep understanding and careful implementation that benefits from human review.
+
+### [ASYNC] Classifications (Agent Delegation Suitable)
+
+#### 1. Standard Pattern Classification
+
+**When to Apply**: Tasks following well-established patterns, frameworks, or standard implementations.
+
+**Examples**:
+
+- RESTful API endpoints
+- Standard CRUD operations
+- Form validation
+- Basic error handling
+- Standard authentication flows
+
+**Rationale**: Standard patterns have predictable implementations that agents can reliably generate.
+
+#### 2. Test Coverage Classification
+
+**When to Apply**: Tasks with comprehensive automated test coverage providing execution safeguards.
+
+**Examples**:
+
+- Components with >80% test coverage
+- TDD-developed features
+- Well-tested utility functions
+- Components with integration tests
+
+**Rationale**: Comprehensive testing provides confidence that agent-generated code meets requirements.
+
+#### 3. Independent Component Classification
+
+**When to Apply**: Tasks implementing self-contained components with minimal external dependencies.
+
+**Examples**:
+
+- Utility libraries
+- Standalone services
+- Independent UI components
+- Pure functions
+- Data transformation helpers
+
+**Rationale**: Independent components have limited blast radius and can be safely delegated.
+
+## Triage Process Workflow
+
+### Phase 1: Task Identification
+
+1. Break down feature into discrete, implementable tasks
+2. Estimate effort and dependencies for each task
+3. Identify task boundaries and interfaces
+
+### Phase 2: Classification Assessment
+
+1. Apply decision tree to each task
+2. Document primary classification criteria
+3. Assess risk level of misclassification
+4. Record rationale for each decision
+
+### Phase 3: Review and Validation
+
+1. Peer review of triage decisions
+2. Validate classification consistency
+3. Confirm risk assessments
+4. Document any overrides or exceptions
+
+### Phase 4: Execution Planning
+
+1. Group tasks by classification
+2. Plan [SYNC] review checkpoints
+3. Configure [ASYNC] agent delegation
+4. Establish monitoring and rollback procedures
+
+## Triage Metrics and Improvement
+
+### Effectiveness Metrics
+
+#### Classification Accuracy
+
+- **Measurement**: Percentage of tasks correctly classified (validated post-implementation)
+- **Target**: >90% accuracy
+- **Tracking**: Monthly review of misclassifications
+
+#### Review Efficiency
+
+- **Measurement**: Time spent on [SYNC] reviews vs time saved by [ASYNC] delegation
+- **Target**: Net positive efficiency gain
+- **Tracking**: Per-feature analysis
+
+#### Quality Impact
+
+- **Measurement**: Defect rates by classification type
+- **Target**: [SYNC] defects <5%, [ASYNC] defects <15%
+- **Tracking**: Post-implementation defect analysis
+
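+One way these numbers might be computed from post-implementation records is sketched below; the record fields are hypothetical, not a format the framework defines.
+
+```python
+def triage_metrics(records: list[dict]) -> dict:
+    """Each record: {"predicted": "[SYNC]" or "[ASYNC]", "correct": bool, "defect": bool}."""
+    total = len(records)
+
+    def defect_rate(rows: list[dict]) -> float:
+        return sum(r["defect"] for r in rows) / len(rows) if rows else 0.0
+
+    sync_rows = [r for r in records if r["predicted"] == "[SYNC]"]
+    async_rows = [r for r in records if r["predicted"] == "[ASYNC]"]
+    return {
+        # Target: >0.90 classification accuracy
+        "classification_accuracy": sum(r["correct"] for r in records) / total if total else 0.0,
+        "sync_defect_rate": defect_rate(sync_rows),    # target < 0.05
+        "async_defect_rate": defect_rate(async_rows),  # target < 0.15
+    }
+```
+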
+### Continuous Improvement
+
+#### Learning Opportunities
+
+- **Common Patterns**: Identify frequently misclassified task types
+- **Training Updates**: Update decision trees based on lessons learned
+- **Tool Improvements**: Enhance triage guidance based on metrics
+
+#### Feedback Integration
+
+- **Developer Feedback**: Collect classification experience reports
+- **Review Feedback**: Analyze review findings for triage improvements
+- **Quality Metrics**: Use defect data to refine classification criteria
+
+## Training Module: Triage Decision Making
+
+### Module 1: Understanding Classifications
+
+**Objective**: Understand the difference between [SYNC] and [ASYNC] tasks
+**Content**:
+
+- Classification criteria with examples
+- Risk assessment frameworks
+- Common misclassification patterns
+
+### Module 2: Decision Tree Application
+
+**Objective**: Practice applying the triage decision tree
+**Content**:
+
+- Interactive decision tree walkthrough
+- Real-world task classification exercises
+- Peer review of classification decisions
+
+### Module 3: Risk Assessment
+
+**Objective**: Learn to assess misclassification risks
+**Content**:
+
+- Risk level determination
+- Impact analysis techniques
+- Mitigation strategy development
+
+### Module 4: Metrics and Improvement
+
+**Objective**: Understand triage effectiveness measurement
+**Content**:
+
+- Metrics definition and calculation
+- Improvement opportunity identification
+- Feedback integration processes
+
+## Implementation Checklist
+
+### For Each Feature
+
+- [ ] All tasks classified as [SYNC] or [ASYNC]
+- [ ] Classification rationale documented
+- [ ] Risk assessment completed
+- [ ] Peer review conducted
+- [ ] Triage decisions approved
+
+### For Each Sprint/Iteration
+
+- [ ] Classification accuracy measured
+- [ ] Review efficiency analyzed
+- [ ] Quality metrics collected
+- [ ] Improvement opportunities identified
+- [ ] Training modules updated
+
+## Appendix: Common Misclassification Patterns
+
+### False Positives ([SYNC] when should be [ASYNC])
+
+- Over-classifying standard CRUD operations
+- Treating well-tested components as high-risk
+- Misinterpreting "complex" as requiring human review
+
+### False Negatives ([ASYNC] when should be [SYNC])
+
+- Underestimating integration complexity
+- Ignoring security implications
+- Delegating ambiguous requirements
+
+### Mitigation Strategies
+
+- Regular calibration sessions
+- Peer review of classifications
+- Metrics-driven refinement
+- Clear escalation procedures
diff --git a/evals/.markdownlintignore b/evals/.markdownlintignore
new file mode 100644
index 0000000000..77f12ae2e5
--- /dev/null
+++ b/evals/.markdownlintignore
@@ -0,0 +1 @@
+docs/
diff --git a/evals/README.md b/evals/README.md
new file mode 100644
index 0000000000..375eabf29a
--- /dev/null
+++ b/evals/README.md
@@ -0,0 +1,83 @@
+# Spec-Kit Evaluation Framework
+
+[![AI Evals](https://github.com/tikalk/agentic-sdlc-spec-kit/actions/workflows/eval.yml/badge.svg)](https://github.com/tikalk/agentic-sdlc-spec-kit/actions/workflows/eval.yml)
+
+Comprehensive evaluation infrastructure for testing spec-kit template quality using PromptFoo with Claude.
+
+## Current Evaluation Results (Updated: 2026-01-14)
+
+**Overall Pass Rate: 100% (10/10 tests passing)** ✅
+
+| Test Suite | Pass Rate | Status |
+|------------|-----------|--------|
+| **Spec Template** | 8/8 (100%) | ✅ |
+| **Plan Template** | 2/2 (100%) | ✅ |
+| **Total** | **10/10 (100%)** | ✅ |
+
+## Quick Start
+
+> **New to the eval framework?** Check out [docs/QUICK_REFERENCE.md](./docs/QUICK_REFERENCE.md) for a one-page overview of all commands, files, and workflows!
+
+### 1. Prerequisites
+
+```bash
+# Install Node.js (if not already installed)
+# macOS:
+brew install node
+
+# Verify installation
+node --version # Should be v18+
+npx --version # Comes with Node.js
+```
+
+### 2. Configure Environment Variables
+
+```bash
+# Add to ~/.bashrc or ~/.zshrc
+export LLM_BASE_URL="your-llm-base-url"
+export LLM_AUTH_TOKEN="your-api-key"
+export LLM_MODEL="claude-sonnet-4-5-20250929" # Optional, defaults to Sonnet 4.5
+
+# Reload shell
+source ~/.zshrc # or source ~/.bashrc
+```
+
+### 3. Run Evaluations
+
+```bash
+# From repo root - run all PromptFoo tests
+./evals/scripts/run-promptfoo-eval.sh
+
+# Run with JSON output
+./evals/scripts/run-promptfoo-eval.sh --json
+
+# Run and open web UI
+./evals/scripts/run-promptfoo-eval.sh --view
+
+# Use a specific model (overrides LLM_MODEL env var)
+./evals/scripts/run-promptfoo-eval.sh --model claude-opus-4-5-20251101
+```
+
+## Test Suite
+
+The evaluation includes **10 automated tests** covering:
+
+- **Spec Template (8 tests)**: Structure, clarity, security, completeness
+- **Plan Template (2 tests)**: Simplicity, constitution compliance
+
+For more details on the test suite and individual tests, see the `tests` array in the `promptfooconfig.js` files.
+
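+To run a single suite directly with the PromptFoo CLI rather than the wrapper script (a generic invocation; the wrapper may pass additional flags):
+
+```bash
+npx promptfoo@latest eval -c evals/configs/promptfooconfig-spec.js
+```
+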
+## Advanced Workflows
+
+For more advanced use cases, see our detailed workflow guides:
+
+- **[Error Analysis & Annotation](docs/WORKFLOWS.md)**: Deep dive into manual and automated error analysis, and how to use the annotation tool.
+- **[CI/CD & Local Testing](docs/GITHUB_ACTIONS_SETUP.md)**: Set up and run evaluations in GitHub Actions or test them locally with `act`.
+
+## Support
+
+For evaluation framework issues:
+
+- PromptFoo Discord:
+- PromptFoo GitHub: <https://github.com/promptfoo/promptfoo>
+
+For spec-kit specific questions:
+
+- Open issue: <https://github.com/tikalk/agentic-sdlc-spec-kit/issues>
diff --git a/evals/annotation-tool/README.md b/evals/annotation-tool/README.md
new file mode 100644
index 0000000000..d74c84f695
--- /dev/null
+++ b/evals/annotation-tool/README.md
@@ -0,0 +1,127 @@
+# Spec Annotation Tool
+
+A fast, keyboard-driven interface for reviewing generated specs, built with FastHTML: every review action is a single keystroke, and annotations save automatically.
+
+## Features
+
+- **Keyboard-driven navigation**: N (next), P (previous), 1 (pass), 2 (fail)
+- **Progress tracking**: See how many specs reviewed, passed, failed
+- **Notes**: Add notes for each spec
+- **Auto-save**: Annotations automatically saved to JSON
+- **Export**: Export all annotations with statistics
+- **Beautiful rendering**: Markdown specs rendered with syntax highlighting
+
+## Quick Start
+
+```bash
+# Run using the provided script (from project root)
+./evals/scripts/run-annotation-tool.sh
+```
+
+Or manually:
+
+```bash
+# Install dependencies with uv
+cd evals/annotation-tool
+uv venv
+source .venv/bin/activate # On Windows: .venv\Scripts\activate
+uv pip install python-fasthtml markdown
+
+# Run the app
+python app.py
+```
+
+Then open your browser to `http://localhost:5001` (or the port shown in the terminal).
+
+## Usage
+
+### Keyboard Shortcuts
+
+- **N** - Next spec
+- **P** - Previous spec
+- **1** - Mark as Pass (and advance to next)
+- **2** - Mark as Fail (and advance to next)
+
+### Workflow
+
+1. Review the spec content displayed
+2. Add notes in the text area (optional)
+3. Mark as Pass (1) or Fail (2)
+4. The tool automatically advances to the next spec
+5. Use Export button to save all annotations with timestamp
+
+### Output
+
+Annotations are saved to:
+
+- `annotations.json` - Current annotations (auto-saved)
+- `annotations_export_YYYYMMDD_HHMMSS.json` - Exported snapshots
+
+## Data Structure
+
+```json
+{
+ "exported_at": "2026-01-08T14:30:00",
+ "statistics": {
+ "total": 17,
+ "passed": 12,
+ "failed": 3,
+ "pending": 2,
+ "progress": 88.2
+ },
+ "annotations": {
+ "spec-001.md": {
+ "status": "pass",
+ "notes": "Good structure, all sections present",
+ "timestamp": "2026-01-08T14:25:00"
+ },
+ "spec-002.md": {
+ "status": "fail",
+ "notes": "Missing acceptance criteria",
+ "timestamp": "2026-01-08T14:26:00"
+ }
+ }
+}
+```
+
+## Customization
+
+### Change Specs Directory
+
+Edit `SPECS_DIR` in `app.py`:
+
+```python
+SPECS_DIR = Path("path/to/your/specs")
+```
+
+### Add Filtering
+
+The MVP doesn't include filtering by status yet. To add it (a sketch follows the list):
+
+1. Add filter buttons in the UI
+2. Modify `get_current_spec()` to filter specs list
+3. Reset index when filter changes
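+
+A minimal sketch of step 2, assuming the `AnnotationState` instance from `app.py` (the `status_filter` argument is new, not existing code):
+
+```python
+def filtered_specs(state, status_filter=None):
+    """Return spec paths whose annotation status matches the filter.
+
+    `state` is the AnnotationState from app.py; `status_filter` is one of
+    "pass", "fail", "pending", or None to disable filtering.
+    """
+    if status_filter is None:
+        return state.specs
+    return [
+        spec for spec in state.specs
+        if state.get_annotation(spec.name).get("status", "pending") == status_filter
+    ]
+```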
+
+### Add Semantic Search
+
+For advanced features, you can extend with:
+
+- Vector embeddings for similar spec clustering
+- AI-powered categorization suggestions
+- Bulk operations
+
+## Architecture
+
+- **FastHTML**: Lightweight web framework with HTMX
+- **Pico CSS**: Minimal, beautiful styling
+- **JSON storage**: Simple file-based persistence
+- **Markdown**: Renders spec content with code highlighting
+
+## Next Steps
+
+After using this tool for initial reviews:
+
+1. Export annotations for error analysis in Jupyter
+2. Use findings to extend PromptFoo tests
+3. Build LLM-as-Judge evaluators based on failure patterns
+4. Add discovered failure modes to CI/CD pipeline
diff --git a/evals/annotation-tool/app.py b/evals/annotation-tool/app.py
new file mode 100644
index 0000000000..438fbefc07
--- /dev/null
+++ b/evals/annotation-tool/app.py
@@ -0,0 +1,385 @@
+#!/usr/bin/env python3
+"""
+FastHTML Annotation Tool for Spec Review
+Provides a fast, keyboard-driven interface for reviewing generated specs.
+"""
+
+from fasthtml.common import *
+from pathlib import Path
+import json
+from datetime import datetime
+import markdown
+
+# Initialize FastHTML app with Pico CSS
+app, rt = fast_app(pico=True)
+
+# Configuration
+SPECS_DIR = Path("../datasets/real-specs")
+ANNOTATIONS_FILE = Path("annotations.json")
+
+# Global state (in production, use proper state management)
+class AnnotationState:
+ def __init__(self):
+ self.load_specs()
+ self.load_annotations()
+ self.current_index = 0
+
+    def load_specs(self):
+        """Load all spec files (glob already yields an empty list when nothing matches)"""
+        self.specs = sorted(SPECS_DIR.glob("spec-*.md"))
+
+ def load_annotations(self):
+ """Load existing annotations"""
+ if ANNOTATIONS_FILE.exists():
+ with open(ANNOTATIONS_FILE, 'r') as f:
+ self.annotations = json.load(f)
+ else:
+ self.annotations = {}
+
+ def save_annotations(self):
+ """Save annotations to JSON"""
+ with open(ANNOTATIONS_FILE, 'w') as f:
+ json.dump(self.annotations, f, indent=2)
+
+ def get_current_spec(self):
+ """Get current spec content"""
+ if not self.specs or self.current_index >= len(self.specs):
+ return None, None
+
+ spec_path = self.specs[self.current_index]
+ with open(spec_path, 'r') as f:
+ content = f.read()
+
+ return spec_path.name, content
+
+ def get_annotation(self, spec_name):
+ """Get annotation for a spec"""
+ return self.annotations.get(spec_name, {
+ "status": "pending",
+ "notes": "",
+ "timestamp": None
+ })
+
+ def set_annotation(self, spec_name, status, notes=""):
+ """Set annotation for a spec"""
+ self.annotations[spec_name] = {
+ "status": status,
+ "notes": notes,
+ "timestamp": datetime.now().isoformat()
+ }
+ self.save_annotations()
+
+ def get_stats(self):
+ """Get annotation statistics"""
+ total = len(self.specs)
+ passed = sum(1 for a in self.annotations.values() if a["status"] == "pass")
+ failed = sum(1 for a in self.annotations.values() if a["status"] == "fail")
+ pending = total - passed - failed
+
+ return {
+ "total": total,
+ "passed": passed,
+ "failed": failed,
+ "pending": pending,
+ "progress": (passed + failed) / total * 100 if total > 0 else 0
+ }
+
+# Initialize state
+state = AnnotationState()
+
+@rt("/")
+def get():
+ """Main page"""
+ spec_name, content = state.get_current_spec()
+
+ if spec_name is None:
+ return Html(
+ Head(Title("Spec Annotation Tool")),
+ Body(
+ Main(
+ H1("No specs found"),
+ P(f"Please add spec files to {SPECS_DIR}"),
+ cls="container"
+ )
+ )
+ )
+
+ annotation = state.get_annotation(spec_name)
+ stats = state.get_stats()
+
+ # Convert markdown to HTML
+ html_content = markdown.markdown(content, extensions=['fenced_code', 'tables'])
+
+ # Status badge color
+ status_colors = {
+ "pass": "green",
+ "fail": "red",
+ "pending": "gray"
+ }
+
+ return Html(
+ Head(
+ Title("Spec Annotation Tool"),
+ Script(src="https://unpkg.com/htmx.org@1.9.10"),
+ Link(rel="stylesheet", href="https://cdn.jsdelivr.net/npm/@picocss/pico@2/css/pico.min.css"),
+ Style("""
+ .spec-content {
+ padding: 1rem;
+ background: var(--pico-background-color);
+ border-radius: 0.5rem;
+ margin: 1rem 0;
+ max-height: 60vh;
+ overflow-y: auto;
+ }
+ .controls {
+ display: flex;
+ gap: 1rem;
+ margin: 1rem 0;
+ flex-wrap: wrap;
+ }
+ .stats {
+ display: grid;
+ grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
+ gap: 1rem;
+ margin: 1rem 0;
+ }
+ .stat-card {
+ padding: 1rem;
+ background: var(--pico-card-background-color);
+ border-radius: 0.5rem;
+ text-align: center;
+ }
+ .stat-value {
+ font-size: 2rem;
+ font-weight: bold;
+ }
+ .stat-label {
+ font-size: 0.9rem;
+ opacity: 0.8;
+ }
+ .progress-bar {
+ width: 100%;
+ height: 8px;
+ background: var(--pico-muted-color);
+ border-radius: 4px;
+ overflow: hidden;
+ margin: 1rem 0;
+ }
+ .progress-fill {
+ height: 100%;
+ background: var(--pico-primary);
+ transition: width 0.3s ease;
+ }
+ .keyboard-help {
+ font-size: 0.9rem;
+ opacity: 0.7;
+ padding: 1rem;
+ background: var(--pico-card-background-color);
+ border-radius: 0.5rem;
+ margin: 1rem 0;
+ }
+ .status-badge {
+ display: inline-block;
+ padding: 0.25rem 0.75rem;
+ border-radius: 1rem;
+ font-size: 0.9rem;
+ font-weight: bold;
+ }
+ .status-pass { background: #2ecc71; color: white; }
+ .status-fail { background: #e74c3c; color: white; }
+ .status-pending { background: #95a5a6; color: white; }
+ """)
+ ),
+ Body(
+ Main(
+ H1("Spec Annotation Tool"),
+
+ # Stats
+ Div(
+ Div(
+ Div(str(stats['total']), cls="stat-value"),
+ Div("Total", cls="stat-label"),
+ cls="stat-card"
+ ),
+ Div(
+ Div(str(stats['passed']), cls="stat-value"),
+ Div("Passed", cls="stat-label"),
+ cls="stat-card"
+ ),
+ Div(
+ Div(str(stats['failed']), cls="stat-value"),
+ Div("Failed", cls="stat-label"),
+ cls="stat-card"
+ ),
+ Div(
+ Div(str(stats['pending']), cls="stat-value"),
+ Div("Pending", cls="stat-label"),
+ cls="stat-card"
+ ),
+ cls="stats"
+ ),
+
+ # Progress bar
+ Div(
+ Div(style=f"width: {stats['progress']}%", cls="progress-fill"),
+ cls="progress-bar"
+ ),
+
+ # Current spec info
+ H2(f"Spec {state.current_index + 1} of {len(state.specs)}: {spec_name}"),
+ Span(annotation['status'].upper(), cls=f"status-badge status-{annotation['status']}"),
+
+ # Spec content
+ Article(
+ NotStr(html_content),
+ cls="spec-content"
+ ),
+
+ # Notes
+ Form(
+ Textarea(
+ annotation.get('notes', ''),
+ name="notes",
+ placeholder="Add notes about this spec...",
+ rows="3"
+ ),
+ Input(type="hidden", name="spec_name", value=spec_name),
+ Button("Save Notes", type="submit"),
+ hx_post="/save-notes",
+ hx_target="#message",
+ hx_swap="innerHTML"
+ ),
+ Div(id="message"),
+
+ # Controls
+ Div(
+                    Button("← Previous (P)",
+                           hx_post="/prev",
+                           hx_target="body",
+                           hx_swap="outerHTML",
+                           accesskey="p"),
+                    Button("Next (N) →",
+                           hx_post="/next",
+                           hx_target="body",
+                           hx_swap="outerHTML",
+                           accesskey="n"),
+                    Button("✓ Pass (1)",
+                           hx_post=f"/annotate/pass/{spec_name}",
+                           hx_target="body",
+                           hx_swap="outerHTML",
+                           accesskey="1",
+                           cls="contrast"),
+                    Button("✗ Fail (2)",
+                           hx_post=f"/annotate/fail/{spec_name}",
+                           hx_target="body",
+                           hx_swap="outerHTML",
+                           accesskey="2"),
+ Button("Export JSON",
+ hx_get="/export",
+ hx_target="#message"),
+ cls="controls"
+ ),
+
+ # Keyboard help
+ Details(
+ Summary("Keyboard Shortcuts"),
+ Div(
+ P(Strong("N"), " - Next spec"),
+ P(Strong("P"), " - Previous spec"),
+ P(Strong("1"), " - Mark as Pass"),
+ P(Strong("2"), " - Mark as Fail"),
+ cls="keyboard-help"
+ )
+ ),
+
+ cls="container"
+ ),
+ # JavaScript for additional keyboard shortcuts
+ Script("""
+ document.addEventListener('keydown', function(e) {
+ // Ignore if user is typing in textarea
+ if (e.target.tagName === 'TEXTAREA' || e.target.tagName === 'INPUT') {
+ return;
+ }
+
+ switch(e.key.toLowerCase()) {
+ case 'n':
+ document.querySelector('button[accesskey="n"]').click();
+ break;
+ case 'p':
+ document.querySelector('button[accesskey="p"]').click();
+ break;
+ case '1':
+ document.querySelector('button[accesskey="1"]').click();
+ break;
+ case '2':
+ document.querySelector('button[accesskey="2"]').click();
+ break;
+ }
+ });
+ """)
+ )
+ )
+
+@rt("/next")
+def post():
+ """Navigate to next spec"""
+ if state.current_index < len(state.specs) - 1:
+ state.current_index += 1
+ return get()
+
+@rt("/prev")
+def post():
+ """Navigate to previous spec"""
+ if state.current_index > 0:
+ state.current_index -= 1
+ return get()
+
+@rt("/annotate/{status}/{spec_name}")
+def post(status: str, spec_name: str):
+ """Annotate spec with pass/fail"""
+ annotation = state.get_annotation(spec_name)
+ state.set_annotation(spec_name, status, annotation.get('notes', ''))
+
+ # Auto-advance to next spec
+ if state.current_index < len(state.specs) - 1:
+ state.current_index += 1
+
+ # Return redirect header for HTMX
+ return Response(status_code=200, headers={'HX-Redirect': '/'})
+
+@rt("/save-notes")
+def post(spec_name: str, notes: str):
+ """Save notes for current spec"""
+ annotation = state.get_annotation(spec_name)
+ state.set_annotation(spec_name, annotation.get('status', 'pending'), notes)
+ return Div(
+ P("Notes saved!", style="color: green; margin-top: 0.5rem;"),
+ id="message"
+ )
+
+@rt("/export")
+def get():
+ """Export annotations to JSON"""
+ stats = state.get_stats()
+ export_data = {
+ "exported_at": datetime.now().isoformat(),
+ "statistics": stats,
+ "annotations": state.annotations
+ }
+
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ export_path = Path(f"annotations_export_{timestamp}.json")
+
+ with open(export_path, 'w') as f:
+ json.dump(export_data, f, indent=2)
+
+ return Div(
+ P(f"Exported to {export_path}", style="color: green; margin-top: 0.5rem;"),
+ id="message"
+ )
+
+if __name__ == "__main__":
+ serve()
diff --git a/evals/configs/promptfooconfig-plan.js b/evals/configs/promptfooconfig-plan.js
new file mode 100644
index 0000000000..3c34d1875e
--- /dev/null
+++ b/evals/configs/promptfooconfig-plan.js
@@ -0,0 +1,63 @@
+// PromptFoo configuration for Plan Template tests only
+module.exports = {
+ description: 'Plan Template Quality Evaluation',
+
+ // Plan prompt only
+ prompts: ['file://../prompts/plan-prompt.txt'],
+
+ // Configure LLM provider using OpenAI-compatible endpoint
+ providers: [
+ {
+ id: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+ label: `Claude ${process.env.LLM_MODEL || 'Sonnet 4.5'} (via AI API Gateway)`,
+ config: {
+ apiBaseUrl: process.env.LLM_BASE_URL,
+ apiKey: process.env.LLM_AUTH_TOKEN,
+ temperature: 0.7,
+ max_tokens: 4000,
+ },
+ env: {
+ OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,
+ OPENAI_BASE_URL: process.env.LLM_BASE_URL,
+ },
+ },
+ ],
+
+ defaultTest: {
+ options: {
+ provider: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+ },
+ },
+
+ tests: [
+ // Test 7: Plan Template - Simplicity Gate
+ {
+      description: 'Plan Template: Simple app should have ≤3 projects',
+ vars: {
+ user_input: 'Plan a simple todo app with Node.js and SQLite',
+ },
+ assert: [
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_simplicity_gate' },
+ // Note: Removed not-icontains checks - our custom grader is context-aware
+ // and handles "no microservices" vs "use microservices" correctly
+ ],
+ },
+
+ // Test 8: Plan Template - Constitution Compliance
+ {
+ description: 'Plan Template: No over-engineering for basic CRUD API',
+ vars: {
+ user_input: 'Plan a basic REST API for CRUD operations on a todo list',
+ },
+ assert: [
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_constitution_compliance' },
+ {
+ type: 'llm-rubric',
+ value:
+ 'Is the architecture appropriately simple for a basic CRUD API?\nCheck for:\n- No unnecessary complexity (service mesh, event sourcing, CQRS)\n- No over-engineered infrastructure (Kubernetes for simple app)\n- Direct framework usage (no unnecessary wrappers)\nReturn 1.0 if appropriately simple, 0.0 if over-engineered.',
+ threshold: 0.8,
+ },
+ ],
+ },
+ ],
+};
diff --git a/evals/configs/promptfooconfig-spec.js b/evals/configs/promptfooconfig-spec.js
new file mode 100644
index 0000000000..4f3d1002f0
--- /dev/null
+++ b/evals/configs/promptfooconfig-spec.js
@@ -0,0 +1,162 @@
+// PromptFoo configuration for Spec Template tests only
+module.exports = {
+ description: 'Spec Template Quality Evaluation',
+
+ // Spec prompt only
+ prompts: ['file://../prompts/spec-prompt.txt'],
+
+ // Configure LLM provider using OpenAI-compatible endpoint
+ providers: [
+ {
+ id: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+ label: `Claude ${process.env.LLM_MODEL || 'Sonnet 4.5'} (via AI API Gateway)`,
+ config: {
+ apiBaseUrl: process.env.LLM_BASE_URL,
+ apiKey: process.env.LLM_AUTH_TOKEN,
+ temperature: 0.7,
+ max_tokens: 4000,
+ },
+ env: {
+ OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,
+ OPENAI_BASE_URL: process.env.LLM_BASE_URL,
+ },
+ },
+ ],
+
+ defaultTest: {
+ options: {
+ provider: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+ },
+ },
+
+ tests: [
+ // Test 1: Basic Spec Structure
+ {
+ description: 'Spec Template: Basic CRUD app - Structure validation',
+ vars: {
+ user_input:
+ 'Build a task management app where users can create, edit, delete, and view tasks. Each task has a title, description, due date, and priority (low/medium/high).',
+ },
+ assert: [
+ { type: 'icontains', value: 'overview' },
+ { type: 'icontains', value: 'functional requirements' },
+ { type: 'icontains', value: 'user stor' },
+ { type: 'icontains', value: 'non-functional' },
+ { type: 'icontains', value: 'edge case' },
+ ],
+ },
+
+ // Test 2: No Premature Tech Stack
+ {
+ description: 'Spec Template: Should not include tech stack details',
+ vars: {
+ user_input: 'Build a REST API for managing user profiles with CRUD operations',
+ },
+ assert: [
+ { type: 'not-icontains', value: 'React' },
+ { type: 'not-icontains', value: 'Node.js' },
+ { type: 'not-icontains', value: 'Express' },
+ { type: 'not-icontains', value: 'MongoDB' },
+ {
+ type: 'llm-rubric',
+ value:
+ 'Check if this specification avoids technical implementation details.\nIt should focus on WHAT needs to be built, not HOW to build it.\nReturn 1.0 if no tech stack is mentioned, 0.5 if some mentioned, 0.0 if heavy tech details.',
+ threshold: 0.8,
+ },
+ ],
+ },
+
+ // Test 3: Quality - User Stories
+ {
+ description: 'Spec Template: Has clear user stories with acceptance criteria',
+ vars: {
+ user_input: 'Create an authentication system with email/password login and social OAuth',
+ },
+ assert: [
+ {
+ type: 'llm-rubric',
+ value:
+ 'Grade the specification on user story quality (0-1):\n1. Are there 5+ user stories?\n2. Do stories follow "As a [role], I want [feature], so that [benefit]" format?\n3. Does each story have clear acceptance criteria?\n4. Are the criteria measurable and testable?\nReturn average score 0-1.',
+ threshold: 0.75,
+ },
+ ],
+ },
+
+ // Test 4: Clarity - No Vague Terms
+ {
+ description: 'Spec Template: Flags vague requirements',
+ vars: {
+ user_input: 'Build a fast, scalable, user-friendly dashboard with good performance',
+ },
+ assert: [
+ {
+ type: 'llm-rubric',
+ value:
+ 'Check if vague terms like "fast", "scalable", "user-friendly", "good performance"\nare either:\n1. Quantified with specific metrics (e.g., "response time < 200ms")\n2. Marked with [NEEDS CLARIFICATION] or similar flags\n\nReturn 1.0 if all vague terms are handled properly, 0.0 if none are.',
+ threshold: 0.7,
+ },
+ ],
+ },
+
+ // Test 5: Security Requirements
+ {
+ description: 'Spec Template: Security-critical features include security requirements',
+ vars: {
+ user_input: 'Create a payment processing system with credit card handling and transaction history',
+ },
+ assert: [
+ { type: 'icontains', value: 'security' },
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_security_completeness' },
+ ],
+ },
+
+ // Test 6: Edge Cases Coverage
+ {
+ description: 'Spec Template: Includes edge cases and error scenarios',
+ vars: {
+ user_input: 'Build a file upload system supporting multiple file types up to 100MB',
+ },
+ assert: [
+ { type: 'icontains', value: 'edge case' },
+ // Using Python grader instead of LLM rubric for more reliable results
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_edge_cases_coverage' },
+ ],
+ },
+
+ // Test 9: Completeness Score
+ {
+ description: 'Spec Template: E-commerce checkout has comprehensive requirements',
+ vars: {
+ user_input: 'Build an e-commerce checkout flow with cart, payment, and order confirmation',
+ },
+ assert: [
+ {
+ type: 'llm-rubric',
+ value:
+ 'Grade completeness (0-1):\n1. Are functional requirements complete? (cart operations, payment, confirmation)\n2. Are user stories covering main flows?\n3. Are non-functional requirements specified? (performance, security)\n4. Are edge cases identified? (payment failures, session timeout)\nReturn average score 0-1.',
+ threshold: 0.75,
+ },
+ ],
+ },
+
+ // Test 10: Regression Test - Basic Structure
+ {
+ description: 'Regression: Spec template maintains required sections',
+ vars: {
+ user_input: 'Simple feature: Add a search bar to existing app',
+ },
+ assert: [
+ { type: 'icontains', value: 'functional requirements' },
+ { type: 'icontains', value: 'user stor' },
+ {
+ type: 'javascript',
+ value: `
+ // Count major sections (## or #)
+ const sections = output.split(/^#{1,2} /gm).length - 1;
+ return sections >= 4;
+ `,
+ },
+ ],
+ },
+ ],
+};
diff --git a/evals/configs/promptfooconfig.js b/evals/configs/promptfooconfig.js
new file mode 100644
index 0000000000..5ed518dc64
--- /dev/null
+++ b/evals/configs/promptfooconfig.js
@@ -0,0 +1,224 @@
+// PromptFoo configuration using JavaScript for environment variable support
+module.exports = {
+ description: 'Spec-Kit Quality Evaluation',
+
+ // Configure LLM provider using OpenAI-compatible endpoint
+ providers: [
+ {
+ id: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+ label: `Claude ${process.env.LLM_MODEL || 'Sonnet 4.5'} (via AI API Gateway)`,
+ config: {
+ // AI API Gateway exposes an OpenAI-compatible endpoint at /chat/completions
+ apiBaseUrl: process.env.LLM_BASE_URL,
+ apiKey: process.env.LLM_AUTH_TOKEN,
+ temperature: 0.7,
+ max_tokens: 4000,
+ },
+ // Also set the env vars that PromptFoo looks for
+ env: {
+ OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,
+ OPENAI_BASE_URL: process.env.LLM_BASE_URL,
+ },
+ },
+ ],
+
+ // Default test configuration
+ defaultTest: {
+ options: {
+ provider: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+ },
+ },
+
+ // Test suite
+ tests: [
+ // ================================
+ // Test 1: Basic Spec Structure
+ // ================================
+ {
+ description: 'Spec Template: Basic CRUD app - Structure validation',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input:
+ 'Build a task management app where users can create, edit, delete, and view tasks. Each task has a title, description, due date, and priority (low/medium/high).',
+ },
+ assert: [
+ { type: 'icontains', value: 'overview' },
+ { type: 'icontains', value: 'functional requirements' },
+ { type: 'icontains', value: 'user stor' }, // Matches "User Stories" or "User Story"
+ { type: 'icontains', value: 'non-functional' },
+ { type: 'icontains', value: 'edge case' },
+ ],
+ },
+
+ // ================================
+ // Test 2: No Premature Tech Stack
+ // ================================
+ {
+ description: 'Spec Template: Should not include tech stack details',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Build a REST API for managing user profiles with CRUD operations',
+ },
+ assert: [
+ { type: 'not-icontains', value: 'React' },
+ { type: 'not-icontains', value: 'Node.js' },
+ { type: 'not-icontains', value: 'Express' },
+ { type: 'not-icontains', value: 'MongoDB' },
+ {
+ type: 'llm-rubric',
+ value:
+ 'Check if this specification avoids technical implementation details.\nIt should focus on WHAT needs to be built, not HOW to build it.\nReturn 1.0 if no tech stack is mentioned, 0.5 if some mentioned, 0.0 if heavy tech details.',
+ threshold: 0.8,
+ },
+ ],
+ },
+
+ // ================================
+ // Test 3: Quality - User Stories
+ // ================================
+ {
+ description: 'Spec Template: Has clear user stories with acceptance criteria',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Create an authentication system with email/password login and social OAuth',
+ },
+ assert: [
+ {
+ type: 'llm-rubric',
+ value:
+ 'Grade the specification on user story quality (0-1):\n1. Are there 5+ user stories?\n2. Do stories follow "As a [role], I want [feature], so that [benefit]" format?\n3. Does each story have clear acceptance criteria?\n4. Are the criteria measurable and testable?\nReturn average score 0-1.',
+ threshold: 0.75,
+ },
+ ],
+ },
+
+ // ================================
+ // Test 4: Clarity - No Vague Terms
+ // ================================
+ {
+ description: 'Spec Template: Flags vague requirements',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Build a fast, scalable, user-friendly dashboard with good performance',
+ },
+ assert: [
+ {
+ type: 'llm-rubric',
+ value:
+ 'Check if vague terms like "fast", "scalable", "user-friendly", "good performance"\nare either:\n1. Quantified with specific metrics (e.g., "response time < 200ms")\n2. Marked with [NEEDS CLARIFICATION] or similar flags\n\nReturn 1.0 if all vague terms are handled properly, 0.0 if none are.',
+ threshold: 0.7,
+ },
+ ],
+ },
+
+ // ================================
+ // Test 5: Security Requirements
+ // ================================
+ {
+ description: 'Spec Template: Security-critical features include security requirements',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Create a payment processing system with credit card handling and transaction history',
+ },
+ assert: [
+ { type: 'icontains', value: 'security' },
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_security_completeness' },
+ ],
+ },
+
+ // ================================
+ // Test 6: Edge Cases Coverage
+ // ================================
+ {
+ description: 'Spec Template: Includes edge cases and error scenarios',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Build a file upload system supporting multiple file types up to 100MB',
+ },
+ assert: [
+ { type: 'icontains', value: 'edge case' },
+ // Using Python grader instead of LLM rubric for more reliable results
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_edge_cases_coverage' },
+ ],
+ },
+
+ // ================================
+ // Test 7: Plan Template - Simplicity Gate
+ // ================================
+ {
+      description: 'Plan Template: Simple app should have ≤3 projects',
+ prompt: 'file://../prompts/plan-prompt.txt',
+ vars: {
+ user_input: 'Plan a simple todo app with Node.js and SQLite',
+ },
+ assert: [
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_simplicity_gate' },
+ // Note: Removed not-icontains checks - our custom grader is context-aware
+ // and handles "no microservices" vs "use microservices" correctly
+ ],
+ },
+
+ // ================================
+ // Test 8: Plan Template - Constitution Compliance
+ // ================================
+ {
+ description: 'Plan Template: No over-engineering for basic CRUD API',
+ prompt: 'file://../prompts/plan-prompt.txt',
+ vars: {
+ user_input: 'Plan a basic REST API for CRUD operations on a todo list',
+ },
+ assert: [
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_constitution_compliance' },
+ {
+ type: 'llm-rubric',
+ value:
+ 'Is the architecture appropriately simple for a basic CRUD API?\nCheck for:\n- No unnecessary complexity (service mesh, event sourcing, CQRS)\n- No over-engineered infrastructure (Kubernetes for simple app)\n- Direct framework usage (no unnecessary wrappers)\nReturn 1.0 if appropriately simple, 0.0 if over-engineered.',
+ threshold: 0.8,
+ },
+ ],
+ },
+
+ // ================================
+ // Test 9: Completeness Score
+ // ================================
+ {
+ description: 'Spec Template: E-commerce checkout has comprehensive requirements',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Build an e-commerce checkout flow with cart, payment, and order confirmation',
+ },
+ assert: [
+ {
+ type: 'llm-rubric',
+ value:
+ 'Grade completeness (0-1):\n1. Are functional requirements complete? (cart operations, payment, confirmation)\n2. Are user stories covering main flows?\n3. Are non-functional requirements specified? (performance, security)\n4. Are edge cases identified? (payment failures, session timeout)\nReturn average score 0-1.',
+ threshold: 0.75,
+ },
+ ],
+ },
+
+ // ================================
+ // Test 10: Regression Test - Basic Structure
+ // ================================
+ {
+ description: 'Regression: Spec template maintains required sections',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Simple feature: Add a search bar to existing app',
+ },
+ assert: [
+ { type: 'icontains', value: 'functional requirements' },
+ { type: 'icontains', value: 'user stor' },
+ // Even simple features should have structure
+ {
+ type: 'javascript',
+ value: `
+ // Count major sections (## or #)
+ const sections = output.split(/^#{1,2} /gm).length - 1;
+ return sections >= 4; // Lowered from 5 to be more flexible
+ `,
+ },
+ ],
+ },
+ ],
+};
diff --git a/evals/datasets/real-plans/plan-001.md b/evals/datasets/real-plans/plan-001.md
new file mode 100644
index 0000000000..a4eda957bd
--- /dev/null
+++ b/evals/datasets/real-plans/plan-001.md
@@ -0,0 +1,313 @@
+# Test Plan 1
+
+## Prompt
+
+Plan a simple todo app with Node.js and SQLite
+
+## Generated Plan
+
+## Implementation Plan: Simple Todo App with Node.js and SQLite
+
+### 1. Overview
+
+A straightforward todo application built with Node.js and SQLite that allows users to create, read, update, and delete todo items. The application will feature a web-based interface with a RESTful API backend, all integrated into a single cohesive project for simplicity.
+
+**Technology Stack:**
+
+- Backend: Node.js with Express.js
+- Database: SQLite3
+- Frontend: HTML, CSS, vanilla JavaScript (served by Express)
+- No authentication (can be added later if needed)
+
+## 2. Project Structure
+
+### Single Project: `todo-app`
+
+**Rationale:** This is a simple CRUD application that doesn't require separation of concerns. Combining frontend and backend into one project reduces complexity, simplifies deployment, and maintains all code in one repository.
+
+**Project Count: 1** ✅ (within limit)
+
+---
+
+## 3. Project Details
+
+### Project: `todo-app`
+
+**Scope:**
+
+- RESTful API endpoints for todo operations
+- SQLite database for data persistence
+- Static frontend interface served by Express
+- Basic error handling and validation
+
+**Boundaries:**
+
+- Single-user application (no multi-tenancy)
+- No authentication/authorization (Phase 1)
+- No external integrations
+- Local deployment only (initially)
+
+#### Key Tasks and Milestones
+
+#### Milestone 1: Project Setup (Day 1)
+
+- [ ] Initialize Node.js project with npm
+- [ ] Install dependencies: express, sqlite3, body-parser
+- [ ] Create folder structure:
+
+```text
+ todo-app/
+ ├── server.js          # Express server entry point
+ ├── database.js        # SQLite connection and queries
+ ├── routes/
+ │   └── todos.js       # Todo API routes
+ ├── public/
+ │   ├── index.html     # Frontend interface
+ │   ├── styles.css     # Styling
+ │   └── app.js         # Frontend JavaScript
+ ├── package.json
+ └── .gitignore
+```
+
+- [ ] Set up basic Express server
+- [ ] Configure SQLite database connection
+
+#### Milestone 2: Database Layer (Day 1-2)
+
+- [ ] Create todos table schema:
+
+```sql
+ CREATE TABLE todos (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ title TEXT NOT NULL,
+ description TEXT,
+ completed BOOLEAN DEFAULT 0,
+ created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+ updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
+ );
+```
+
+- [ ] Implement database initialization function
+- [ ] Create CRUD query functions:
+ - `getAllTodos()`
+ - `getTodoById(id)`
+ - `createTodo(title, description)`
+ - `updateTodo(id, updates)`
+ - `deleteTodo(id)`
+
+#### Milestone 3: API Endpoints (Day 2-3)
+
+- [ ] `GET /api/todos` - Retrieve all todos
+- [ ] `GET /api/todos/:id` - Retrieve single todo
+- [ ] `POST /api/todos` - Create new todo
+- [ ] `PUT /api/todos/:id` - Update existing todo
+- [ ] `DELETE /api/todos/:id` - Delete todo
+- [ ] Add input validation middleware
+- [ ] Implement error handling middleware
+
+#### Milestone 4: Frontend Interface (Day 3-4)
+
+- [ ] Create HTML structure with:
+ - Input form for new todos
+ - List display for existing todos
+ - Edit/Delete buttons per todo
+ - Checkbox for completion status
+- [ ] Style with CSS (clean, responsive design)
+- [ ] Implement JavaScript functionality:
+ - Fetch and display todos on load
+ - Add new todo via form submission
+ - Toggle completion status
+ - Edit todo inline
+ - Delete todo with confirmation
+ - Handle API errors gracefully
+
+#### Milestone 5: Testing & Polish (Day 4-5)
+
+- [ ] Manual testing of all CRUD operations
+- [ ] Test edge cases (empty inputs, special characters)
+- [ ] Add loading states in UI
+- [ ] Improve error messages
+- [ ] Add README with setup instructions
+- [ ] Code cleanup and comments
+
+#### Dependencies
+
+- None (standalone project)
+
+#### Testing Strategy
+
+**Manual Testing:**
+
+1. **Database Operations:**
+ - Verify table creation on first run
+ - Test each CRUD function directly
+ - Check data persistence across server restarts
+
+2. **API Testing:**
+ - Use Postman or curl to test each endpoint
+ - Verify correct status codes (200, 201, 404, 500)
+ - Test with valid and invalid inputs
+ - Check JSON response formats
+
+3. **Frontend Testing:**
+ - Test in Chrome, Firefox, Safari
+ - Verify responsive design on mobile
+ - Test all user interactions
+ - Check console for JavaScript errors
+
+**Optional: Automated Testing (if time permits):**
+
+- Unit tests for database functions using Jest
+- API endpoint tests using Supertest
+- Simple integration tests
+
+---
+
+## 4. Development Approach
+
+### Phase 1: Core Functionality (Days 1-5)
+
+- Complete all milestones listed above
+- Focus on working features over perfection
+- Keep code simple and readable
+
+### Phase 2: Enhancements (Optional, Future)
+
+- Add user authentication
+- Implement todo categories/tags
+- Add due dates and priorities
+- Search and filter functionality
+- Data export/import
+
+### Deployment Strategy
+
+**Local Development:**
+
+```bash
+npm install
+npm start
+# Access at http://localhost:3000
+```
+
+**Simple Production Deployment:**
+
+- Option 1: VPS with Node.js (DigitalOcean, Linode)
+- Option 2: Platform-as-a-Service (Heroku, Render.com)
+- Use environment variables for configuration
+- SQLite file persisted in mounted volume
+
+**Deployment Steps:**
+
+1. Set `NODE_ENV=production`
+2. Configure port via environment variable
+3. Ensure SQLite database directory is writable
+4. Use process manager (PM2) for production
+5. Set up basic logging
+
+---
+
+## 5. Verification Steps
+
+### Project Verification Checklist
+
+**✅ Setup Verification:**
+
+- [ ] `npm install` runs without errors
+- [ ] Server starts on configured port
+- [ ] Database file is created automatically
+- [ ] Frontend loads at root URL
+
+**✅ Database Verification:**
+
+- [ ] Todos table exists with correct schema
+- [ ] Can insert data directly via SQL
+- [ ] Data persists after server restart
+- [ ] No SQL injection vulnerabilities
+
+**✅ API Verification:**
+
+- [ ] All endpoints return correct status codes
+- [ ] POST creates todo and returns new ID
+- [ ] GET retrieves all todos correctly
+- [ ] PUT updates only specified fields
+- [ ] DELETE removes todo from database
+- [ ] Invalid IDs return 404
+- [ ] Malformed requests return 400
+
+**✅ Frontend Verification:**
+
+- [ ] Page loads without console errors
+- [ ] Can add new todo via form
+- [ ] Todos display immediately after creation
+- [ ] Can toggle completion status
+- [ ] Can edit todo title/description
+- [ ] Can delete todo with confirmation
+- [ ] UI updates reflect database state
+- [ ] Error messages display appropriately
+
+**✅ Integration Verification:**
+
+- [ ] Frontend and backend communicate correctly
+- [ ] CORS issues resolved (if applicable)
+- [ ] Data consistency between UI and database
+- [ ] No race conditions in concurrent operations
+
+**✅ Code Quality Verification:**
+
+- [ ] Code is properly commented
+- [ ] No hardcoded configuration values
+- [ ] Error handling covers edge cases
+- [ ] README includes setup instructions
+- [ ] Git repository has meaningful commits
+
+---
+
+## 6. Success Criteria
+
+The implementation is complete when:
+
+1. ✅ User can perform all CRUD operations via the web interface
+2. ✅ Data persists across application restarts
+3. ✅ Application runs without errors in console/logs
+4. ✅ Code is clean, commented, and maintainable
+5. ✅ README provides clear setup instructions
+6. ✅ All verification steps pass
+
+---
+
+## 7. Estimated Timeline
+
+- **Total Duration:** 4-5 days
+- **Lines of Code (estimated):** 500-700
+- **Complexity:** Low to Medium
+
+**Daily Breakdown:**
+
+- Day 1: Setup + Database (4-6 hours)
+- Day 2: API Development (4-6 hours)
+- Day 3: Frontend Development (4-6 hours)
+- Day 4: Integration + Testing (3-4 hours)
+- Day 5: Polish + Documentation (2-3 hours)
+
+---
+
+## 8. Risk Mitigation
+
+| Risk | Impact | Mitigation |
+|------|--------|------------|
+| SQLite file corruption | High | Regular backups, proper connection handling |
+| Concurrent write conflicts | Medium | SQLite handles via locking, keep operations simple |
+| Frontend-backend sync issues | Medium | Use proper HTTP status codes, implement retry logic |
+| Scope creep | Medium | Stick to core features, document future enhancements |
+
+---
+
+## Conclusion
+
+This plan delivers a fully functional todo application using a **single-project architecture** that maintains simplicity while providing all essential features. The monolithic approach is appropriate for this use case and allows for rapid development and easy maintenance. Future enhancements can be added incrementally without requiring architectural changes.
+
+## Review Notes
+
+- [ ] Pass/Fail: Pass (from automated eval)
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-plans/plan-002.md b/evals/datasets/real-plans/plan-002.md
new file mode 100644
index 0000000000..4808f0caaf
--- /dev/null
+++ b/evals/datasets/real-plans/plan-002.md
@@ -0,0 +1,433 @@
+# Test Plan 2
+
+## Prompt
+
+Plan a basic REST API for CRUD operations on a todo list
+
+## Generated Plan
+
+## Implementation Plan: Todo List REST API
+
+### 1. Overview
+
+A simple REST API service for managing a todo list with full CRUD (Create, Read, Update, Delete) operations. The API will allow users to create todos, retrieve them (individually or as a list), update their status and content, and delete them. This will be implemented as a single, straightforward backend service with persistent storage.
+
+**Technology Stack:**
+
+- **Runtime:** Node.js with Express.js (simple, well-documented, ideal for REST APIs)
+- **Database:** SQLite (file-based, zero configuration, perfect for todo apps)
+- **Validation:** Express-validator (built-in request validation)
+- **Testing:** Jest + Supertest (API endpoint testing)
+
+## 2. Project Structure
+
+### Project 1: Todo API Service
+
+**Single monolithic backend service** - All API logic, data access, and business rules in one deployable unit.
+
+**Rationale:** A todo list CRUD API is a straightforward application with no complex domain separation. Splitting into multiple services would introduce unnecessary complexity, deployment overhead, and inter-service communication without any real benefit.
+
+---
+
+## 3. Project Details
+
+### Project 1: Todo API Service
+
+#### Scope and Boundaries
+
+- RESTful API endpoints for todo management
+- Data persistence layer (SQLite database)
+- Input validation and error handling
+- Basic logging for debugging
+- API documentation (inline comments + README)
+
+#### File Structure
+
+```text
+todo-api/
+├── src/
+│   ├── index.js              # Entry point, server setup
+│   ├── routes/
+│   │   └── todos.js          # Todo route handlers
+│   ├── models/
+│   │   └── todo.js           # Data access layer
+│   ├── middleware/
+│   │   ├── validation.js     # Request validators
+│   │   └── errorHandler.js   # Global error handling
+│   └── db/
+│       └── database.js       # Database initialization
+├── tests/
+│   └── todos.test.js         # API endpoint tests
+├── package.json
+├── .env.example
+└── README.md
+```
+
+#### API Endpoints
+
+| Method | Endpoint | Description |
+|--------|----------|-------------|
+| GET | `/api/todos` | Get all todos (with optional filters: ?completed=true) |
+| GET | `/api/todos/:id` | Get a specific todo by ID |
+| POST | `/api/todos` | Create a new todo |
+| PUT | `/api/todos/:id` | Update an existing todo |
+| DELETE | `/api/todos/:id` | Delete a todo |
+
+#### Data Model
+
+```javascript
+Todo {
+ id: INTEGER (auto-increment, primary key)
+ title: TEXT (required, max 200 chars)
+ description: TEXT (optional, max 1000 chars)
+ completed: BOOLEAN (default: false)
+ createdAt: DATETIME (auto-generated)
+ updatedAt: DATETIME (auto-updated)
+}
+```
+
+#### Key Tasks and Milestones
+
+##### Phase 1: Foundation (Day 1)
+
+- [ ] Initialize Node.js project with Express
+- [ ] Set up SQLite database connection
+- [ ] Create todos table schema with migration
+- [ ] Implement basic server with health check endpoint (`/health`)
+- [ ] Configure environment variables (port, database path)
+
+##### Phase 2: Core CRUD Operations (Day 2)
+
+- [ ] Implement POST `/api/todos` - Create todo
+- [ ] Implement GET `/api/todos` - List all todos
+- [ ] Implement GET `/api/todos/:id` - Get single todo
+- [ ] Implement PUT `/api/todos/:id` - Update todo
+- [ ] Implement DELETE `/api/todos/:id` - Delete todo
+- [ ] Add data access layer for database operations
+
+##### Phase 3: Validation & Error Handling (Day 3)
+
+- [ ] Add input validation middleware (title required, length limits)
+- [ ] Implement global error handler
+- [ ] Add 404 handling for non-existent resources
+- [ ] Validate ID parameters (numeric, exists in DB)
+- [ ] Return appropriate HTTP status codes (200, 201, 204, 400, 404, 500)
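+
+A plausible shape for the validation middleware, using express-validator's chain API; the length limits mirror the data model above:
+
+```javascript
+// src/middleware/validation.js: request-validation sketch with express-validator
+const { body, param, validationResult } = require('express-validator');
+
+// Respond with 400 if any preceding validator in the chain failed
+const handleValidationErrors = (req, res, next) => {
+  const errors = validationResult(req);
+  if (!errors.isEmpty()) {
+    return res.status(400).json({ errors: errors.array() });
+  }
+  next();
+};
+
+const validateCreateTodo = [
+  body('title').isString().trim().notEmpty().isLength({ max: 200 }),
+  body('description').optional().isString().isLength({ max: 1000 }),
+  handleValidationErrors,
+];
+
+const validateIdParam = [
+  param('id').isInt({ min: 1 }).toInt(), // rejects "invalid", coerces "1" to 1
+  handleValidationErrors,
+];
+
+module.exports = { validateCreateTodo, validateIdParam };
+```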
+
+##### Phase 4: Testing & Documentation (Day 4)
+
+- [ ] Write unit tests for data access layer
+- [ ] Write integration tests for all API endpoints
+- [ ] Test error scenarios (invalid input, missing resources)
+- [ ] Document API endpoints in README
+- [ ] Add example requests/responses
+
+#### Dependencies
+
+- **External:** None (standalone service)
+- **NPM Packages:**
+ - `express` - Web framework
+ - `better-sqlite3` - Synchronous SQLite client
+ - `express-validator` - Request validation
+ - `dotenv` - Environment configuration
+ - `jest` & `supertest` - Testing (dev dependencies)
+ - `nodemon` - Auto-reload during development (dev dependency)
+
+#### Testing Strategy
+
+**Unit Tests:**
+
+- Database operations (CRUD functions in isolation)
+- Validation logic
+- Error handling utilities
+
+**Integration Tests:**
+
+- Full request/response cycle for each endpoint
+- Test successful operations (200, 201 responses)
+- Test error cases (400, 404 responses)
+- Test edge cases (empty list, updating non-existent todo)
+- Test query parameters (filtering by completed status)
+
+**Test Coverage Goals:**
+
+- Minimum 80% code coverage
+- 100% endpoint coverage
+- All error paths tested
+
+**Testing Tools:**
+
+```bash
+# Run all tests
+npm test
+
+# Run with coverage report
+npm run test:coverage
+
+# Run in watch mode during development
+npm run test:watch
+```
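+
+For reference, one possible shape for the endpoint tests; it assumes `src/index.js` exports the Express app (as in the entry-point sketch earlier) so Supertest can drive it without binding a port:
+
+```javascript
+// tests/todos.test.js: integration-test sketch with Jest + Supertest
+const request = require('supertest');
+const app = require('../src/index');
+
+describe('POST /api/todos', () => {
+  it('creates a new todo and returns 201', async () => {
+    const res = await request(app)
+      .post('/api/todos')
+      .send({ title: 'Test todo' })
+      .expect(201);
+    expect(res.body.title).toBe('Test todo');
+    expect(res.body.completed).toBe(false);
+  });
+
+  it('rejects a missing title with 400', async () => {
+    await request(app).post('/api/todos').send({}).expect(400);
+  });
+});
+```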
+
+---
+
+## 4. Development Approach
+
+### Phasing Strategy
+
+#### Phase 1: MVP (Minimum Viable Product) - Days 1-2
+
+- Basic CRUD operations working
+- Data persists in SQLite
+- Manual testing via Postman/curl
+- **Deliverable:** Functional API with all endpoints
+
+#### Phase 2: Hardening - Days 3-4
+
+- Add validation and error handling
+- Write comprehensive tests
+- Document API
+- **Deliverable:** Production-ready service with tests
+
+#### Phase 3: Enhancements (Optional/Future)
+
+- Add pagination for GET `/api/todos`
+- Add search/filter capabilities
+- Add due dates to todos
+- Add priority levels
+
+### Development Workflow
+
+1. Create feature branch from `main`
+2. Implement functionality
+3. Write tests (aim for test-first when possible)
+4. Run tests locally (`npm test`)
+5. Manual testing with REST client
+6. Merge to `main` when tests pass
+
+### Deployment Strategy
+
+**Local Development:**
+
+```bash
+npm install
+npm run dev # Uses nodemon for auto-reload
+```
+
+**Production Deployment:**
+
+```bash
+npm install --production
+npm start
+```
+
+**Environment Configuration:**
+
+- `PORT` - Server port (default: 3000)
+- `DATABASE_PATH` - SQLite file location (default: `./todos.db`)
+- `NODE_ENV` - Environment (development/production)
+
+**Deployment Options:**
+
+- **Simple:** Run directly on a VPS with the PM2 process manager
+- **Containerized:** Docker container (single Dockerfile)
+- **Platform:** Deploy to Heroku, Railway, or Render (all support Node.js natively)
+
+**Docker Setup (Optional):**
+
+```dockerfile
+FROM node:18-alpine
+WORKDIR /app
+COPY package*.json ./
+RUN npm ci --production
+COPY src/ ./src/
+EXPOSE 3000
+CMD ["node", "src/index.js"]
+```
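+
+With this Dockerfile, the standard `docker build -t todo-api .` and `docker run -p 3000:3000 todo-api` pair is enough to run the service. Note that the SQLite file then lives inside the container, so point `DATABASE_PATH` at a mounted volume if todos must survive container replacement.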
+
+---
+
+## 5. Verification Steps
+
+### Project 1: Todo API Service
+
+#### Functional Verification
+
+##### Step 1: Health Check
+
+```bash
+curl http://localhost:3000/health
+# Expected: {"status": "ok"}
+```
+
+##### Step 2: Create Todo
+
+```bash
+curl -X POST http://localhost:3000/api/todos \
+ -H "Content-Type: application/json" \
+ -d '{"title": "Test todo", "description": "Test description"}'
+# Expected: 201 status, returns created todo with ID
+```
+
+##### Step 3: List Todos
+
+```bash
+curl http://localhost:3000/api/todos
+# Expected: 200 status, array containing created todo
+```
+
+##### Step 4: Get Single Todo
+
+```bash
+curl http://localhost:3000/api/todos/1
+# Expected: 200 status, returns todo with ID 1
+```
+
+##### Step 5: Update Todo
+
+```bash
+curl -X PUT http://localhost:3000/api/todos/1 \
+ -H "Content-Type: application/json" \
+ -d '{"completed": true}'
+# Expected: 200 status, returns updated todo
+```
+
+##### Step 6: Delete Todo
+
+```bash
+curl -X DELETE http://localhost:3000/api/todos/1
+# Expected: 204 status, no content
+```
+
+##### Step 7: Verify Deletion
+
+```bash
+curl http://localhost:3000/api/todos/1
+# Expected: 404 status, error message
+```
+
+#### Error Handling Verification
+
+**Test Invalid Input:**
+
+```bash
+curl -X POST http://localhost:3000/api/todos \
+ -H "Content-Type: application/json" \
+ -d '{}'
+# Expected: 400 status, validation error for missing title
+```
+
+**Test Non-existent Resource:**
+
+```bash
+curl http://localhost:3000/api/todos/99999
+# Expected: 404 status, "Todo not found" message
+```
+
+**Test Invalid ID:**
+
+```bash
+curl http://localhost:3000/api/todos/invalid
+# Expected: 400 status, "Invalid ID" message
+```
+
+#### Automated Testing Verification
+
+```bash
+# Run test suite
+npm test
+
+# Expected output:
+# ✓ GET /api/todos returns empty array initially
+# ✓ POST /api/todos creates new todo
+# ✓ POST /api/todos validates required fields
+# ✓ GET /api/todos/:id returns todo
+# ✓ GET /api/todos/:id returns 404 for non-existent
+# ✓ PUT /api/todos/:id updates todo
+# ✓ DELETE /api/todos/:id removes todo
+# ... (all tests passing)
+```
+
+#### Performance Verification (Basic)
+
+```bash
+# Test response time for list endpoint
+time curl http://localhost:3000/api/todos
+# Expected: < 100ms for small datasets
+
+# Create 100 todos and test performance
+for i in {1..100}; do
+ curl -X POST http://localhost:3000/api/todos \
+ -H "Content-Type: application/json" \
+ -d "{\"title\": \"Todo $i\"}" &
+done
+wait
+
+time curl http://localhost:3000/api/todos
+# Expected: < 500ms for 100 items
+```
+
+#### Data Persistence Verification
+
+```bash
+# Create a todo
+curl -X POST http://localhost:3000/api/todos \
+ -H "Content-Type: application/json" \
+ -d '{"title": "Persistence test"}'
+
+# Stop server
+# Restart server
+
+# Verify todo still exists
+curl http://localhost:3000/api/todos
+# Expected: Previously created todo is still in the list
+```
+
+---
+
+## 6. Success Criteria
+
+The implementation is considered complete when:
+
+- ✅ All 5 CRUD endpoints are functional
+- ✅ Data persists across server restarts
+- ✅ Input validation prevents invalid data
+- ✅ Appropriate HTTP status codes are returned
+- ✅ All automated tests pass (minimum 80% coverage)
+- ✅ API documentation is complete in README
+- ✅ Error handling covers common failure scenarios
+- ✅ Manual verification steps all pass
+- ✅ Service can be deployed and run in a production environment
+
+---
+
+## 7. Timeline Estimate
+
+**Total Duration:** 4 days (assuming 1 developer working full-time)
+
+- **Day 1:** Project setup, database, basic server (Phase 1: Foundation)
+- **Day 2:** Implement all CRUD endpoints (Phase 2: Core CRUD Operations)
+- **Day 3:** Add validation and error handling (Phase 3: Validation & Error Handling)
+- **Day 4:** Testing, documentation, verification (Phase 4: Testing & Documentation)
+
+**Note:** This is a conservative estimate. An experienced developer could complete this in 2-3 days.
+
+---
+
+## 8. Future Enhancements (Out of Scope)
+
+These are explicitly NOT part of the initial implementation but could be added later:
+
+- User authentication and authorization
+- Multiple todo lists per user
+- Tags/categories for todos
+- Due dates and reminders
+- File attachments
+- Real-time updates (WebSocket)
+- Frontend web application
+- Mobile app
+
+**Principle:** Ship the simple version first, validate usage, then enhance based on actual needs.
+
+## Review Notes
+
+- [x] Pass/Fail: Pass (from automated eval)
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-001.md b/evals/datasets/real-specs/spec-001.md
new file mode 100644
index 0000000000..c1289f882c
--- /dev/null
+++ b/evals/datasets/real-specs/spec-001.md
@@ -0,0 +1,231 @@
+# Test Spec 1
+
+## Prompt
+
+Build a simple todo list web application with task creation, editing, and deletion
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+[NEEDS CLARIFICATION: No user requirements provided]
+
+**Feature Name:** [Awaiting Input]
+
+**Description:**
+This section will contain a brief description of the feature once user requirements are provided. The overview should explain:
+
+- What the feature does
+- Who will use it
+- Why it's needed
+- High-level scope and boundaries
+
+**Business Value:**
+[To be defined based on user input]
+
+**Stakeholders:**
+[To be identified based on user input]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Must Have
+
+**US-001: [Awaiting User Input]**
+
+- **As a** [role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action occurs]
+ **Then** [expected outcome]
+
+---
+
+### Priority 2 (P2) - Should Have
+
+**US-002: [Awaiting User Input]**
+
+- **As a** [role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action occurs]
+ **Then** [expected outcome]
+
+---
+
+### Priority 3 (P3) - Nice to Have
+
+**US-003: [Awaiting User Input]**
+
+- **As a** [role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action occurs]
+ **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [Awaiting specification]
+
+**FR-002:** [Awaiting specification]
+
+**FR-003:** [Awaiting specification]
+
+**FR-004:** [Awaiting specification]
+
+**FR-005:** [Awaiting specification]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-001 to NFR-003)
+
+**NFR-001:** [Performance requirement - response time, throughput, etc.]
+
+**NFR-002:** [Capacity requirement - concurrent users, data volume, etc.]
+
+**NFR-003:** [Resource utilization requirement]
+
+---
+
+### Security (NFR-004 to NFR-006)
+
+**NFR-004:** [Authentication requirement]
+
+**NFR-005:** [Authorization requirement]
+
+**NFR-006:** [Data protection requirement]
+
+---
+
+### Scalability (NFR-007 to NFR-008)
+
+**NFR-007:** [Horizontal/vertical scaling requirement]
+
+**NFR-008:** [Growth accommodation requirement]
+
+---
+
+### Usability (NFR-009 to NFR-010)
+
+**NFR-009:** [User experience requirement]
+
+**NFR-010:** [Accessibility requirement]
+
+---
+
+### Reliability (NFR-011 to NFR-012)
+
+**NFR-011:** [Uptime/availability requirement]
+
+**NFR-012:** [Error handling requirement]
+
+---
+
+## 5. Edge Cases
+
+### Boundary Conditions
+
+**EC-001:** [Edge case description and expected behavior]
+
+**EC-002:** [Edge case description and expected behavior]
+
+**EC-003:** [Edge case description and expected behavior]
+
+---
+
+### Error Scenarios
+
+**ES-001:** [Error scenario and expected system response]
+
+**ES-002:** [Error scenario and expected system response]
+
+**ES-003:** [Error scenario and expected system response]
+
+---
+
+### Data Validation
+
+**DV-001:** [Invalid input scenario and expected handling]
+
+**DV-002:** [Invalid input scenario and expected handling]
+
+---
+
+## 6. Success Criteria
+
+**SC-001:** [Measurable success metric with target value]
+
+**SC-002:** [Measurable success metric with target value]
+
+**SC-003:** [Measurable success metric with target value]
+
+**SC-004:** [Measurable success metric with target value]
+
+**SC-005:** [Measurable success metric with target value]
+
+---
+
+## 7. Out of Scope
+
+[Items explicitly excluded from this feature]
+
+---
+
+## 8. Dependencies
+
+[Prerequisites, integrations, or external dependencies]
+
+---
+
+## 9. Assumptions
+
+[Assumptions made during specification]
+
+---
+
+## 10. Open Questions
+
+[NEEDS CLARIFICATION: User requirements needed]
+
+1. What is the feature being requested?
+2. Who are the target users?
+3. What problem does this solve?
+4. Are there any existing systems this integrates with?
+5. What are the key constraints or limitations?
+
+---
+
+**Document Status:** Draft - Awaiting User Requirements
+
+**Last Updated:** [Current Date]
+
+**Version:** 0.1
+
+---
+
+**Note:** Please provide your user requirements so I can create a comprehensive feature specification document tailored to your specific needs.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-002.md b/evals/datasets/real-specs/spec-002.md
new file mode 100644
index 0000000000..d5a441ea38
--- /dev/null
+++ b/evals/datasets/real-specs/spec-002.md
@@ -0,0 +1,378 @@
+# Test Spec 2
+
+## Prompt
+
+Create a personal blog platform with posts, comments, and basic authentication
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - Feature name not provided]
+
+**Description:**
+[NEEDS CLARIFICATION - No user requirements were provided in the input. Please provide the feature requirements you would like documented.]
+
+**Purpose:**
+[NEEDS CLARIFICATION - Business objective and target users need to be specified]
+
+**Scope:**
+
+- In Scope: [NEEDS CLARIFICATION]
+- Out of Scope: [NEEDS CLARIFICATION]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Must Have
+
+#### US-001: [NEEDS CLARIFICATION]
+
+**As a** [role not specified]
+**I want** [feature not specified]
+**So that** [benefit not specified]
+
+**Acceptance Criteria:**
+
+- **Given** [NEEDS CLARIFICATION - initial state]
+ **When** [NEEDS CLARIFICATION - action taken]
+ **Then** [NEEDS CLARIFICATION - expected outcome]
+
+**Priority:** P1
+**Story Points:** [NEEDS CLARIFICATION]
+
+---
+
+#### US-002: [NEEDS CLARIFICATION]
+
+**As a** [role not specified]
+**I want** [feature not specified]
+**So that** [benefit not specified]
+
+**Acceptance Criteria:**
+
+- **Given** [NEEDS CLARIFICATION]
+ **When** [NEEDS CLARIFICATION]
+ **Then** [NEEDS CLARIFICATION]
+
+**Priority:** P1
+**Story Points:** [NEEDS CLARIFICATION]
+
+---
+
+### Priority 2 (P2) - Should Have
+
+#### US-003: [NEEDS CLARIFICATION]
+
+**As a** [role not specified]
+**I want** [feature not specified]
+**So that** [benefit not specified]
+
+**Acceptance Criteria:**
+
+- **Given** [NEEDS CLARIFICATION]
+ **When** [NEEDS CLARIFICATION]
+ **Then** [NEEDS CLARIFICATION]
+
+**Priority:** P2
+**Story Points:** [NEEDS CLARIFICATION]
+
+---
+
+#### US-004: [NEEDS CLARIFICATION]
+
+**As a** [role not specified]
+**I want** [feature not specified]
+**So that** [benefit not specified]
+
+**Acceptance Criteria:**
+
+- **Given** [NEEDS CLARIFICATION]
+ **When** [NEEDS CLARIFICATION]
+ **Then** [NEEDS CLARIFICATION]
+
+**Priority:** P2
+**Story Points:** [NEEDS CLARIFICATION]
+
+---
+
+### Priority 3 (P3) - Nice to Have
+
+#### US-005: [NEEDS CLARIFICATION]
+
+**As a** [role not specified]
+**I want** [feature not specified]
+**So that** [benefit not specified]
+
+**Acceptance Criteria:**
+
+- **Given** [NEEDS CLARIFICATION]
+ **When** [NEEDS CLARIFICATION]
+ **Then** [NEEDS CLARIFICATION]
+
+**Priority:** P3
+**Story Points:** [NEEDS CLARIFICATION]
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [NEEDS CLARIFICATION - Core functional requirement not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**FR-002:** [NEEDS CLARIFICATION - Core functional requirement not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### Data Requirements
+
+**FR-003:** [NEEDS CLARIFICATION - Data handling requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**FR-004:** [NEEDS CLARIFICATION - Data validation requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### User Interface Requirements
+
+**FR-005:** [NEEDS CLARIFICATION - UI/UX requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### Integration Requirements
+
+**FR-006:** [NEEDS CLARIFICATION - Integration points not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-P)
+
+**NFR-P-001:** [NEEDS CLARIFICATION - Response time requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., Response time < X seconds for Y% of requests]
+
+**NFR-P-002:** [NEEDS CLARIFICATION - Throughput requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., System must handle X concurrent users]
+
+**NFR-P-003:** [NEEDS CLARIFICATION - Load time requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., Initial load time < X seconds]
+
+### Security (NFR-S)
+
+**NFR-S-001:** [NEEDS CLARIFICATION - Authentication requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**NFR-S-002:** [NEEDS CLARIFICATION - Authorization requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**NFR-S-003:** [NEEDS CLARIFICATION - Data encryption requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**NFR-S-004:** [NEEDS CLARIFICATION - Audit logging requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### Scalability (NFR-SC)
+
+**NFR-SC-001:** [NEEDS CLARIFICATION - User growth requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., System must scale to X users within Y timeframe]
+
+**NFR-SC-002:** [NEEDS CLARIFICATION - Data volume requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### Availability & Reliability (NFR-A)
+
+**NFR-A-001:** [NEEDS CLARIFICATION - Uptime requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., 99.9% uptime]
+
+**NFR-A-002:** [NEEDS CLARIFICATION - Recovery time objective not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., RTO < X hours]
+
+**NFR-A-003:** [NEEDS CLARIFICATION - Backup requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### Usability (NFR-U)
+
+**NFR-U-001:** [NEEDS CLARIFICATION - Accessibility requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., WCAG 2.1 Level AA compliance]
+
+**NFR-U-002:** [NEEDS CLARIFICATION - Browser/device support not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**NFR-U-003:** [NEEDS CLARIFICATION - Internationalization requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### Maintainability (NFR-M)
+
+**NFR-M-001:** [NEEDS CLARIFICATION - Documentation requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**NFR-M-002:** [NEEDS CLARIFICATION - Monitoring requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Input Validation
+
+**EC-001:** [NEEDS CLARIFICATION - Invalid input handling not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-002:** [NEEDS CLARIFICATION - Missing required data handling not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-003:** [NEEDS CLARIFICATION - Data format validation not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+### Boundary Conditions
+
+**EC-004:** [NEEDS CLARIFICATION - Maximum capacity scenarios not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-005:** [NEEDS CLARIFICATION - Minimum threshold scenarios not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-006:** [NEEDS CLARIFICATION - Empty state handling not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+### System Failures
+
+**EC-007:** [NEEDS CLARIFICATION - Network failure scenarios not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-008:** [NEEDS CLARIFICATION - Service unavailability handling not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-009:** [NEEDS CLARIFICATION - Timeout scenarios not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+### Concurrency Issues
+
+**EC-010:** [NEEDS CLARIFICATION - Simultaneous user actions not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-011:** [NEEDS CLARIFICATION - Data conflict resolution not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+---
+
+## 6. Success Criteria
+
+### User Adoption Metrics
+
+**SC-001:** [NEEDS CLARIFICATION - User adoption target not specified]
+**Target:** [NEEDS CLARIFICATION - e.g., X% of target users actively using feature within Y days]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+**SC-002:** [NEEDS CLARIFICATION - User engagement target not specified]
+**Target:** [NEEDS CLARIFICATION]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+### Performance Metrics
+
+**SC-003:** [NEEDS CLARIFICATION - Performance benchmark not specified]
+**Target:** [NEEDS CLARIFICATION]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+**SC-004:** [NEEDS CLARIFICATION - Error rate target not specified]
+**Target:** [NEEDS CLARIFICATION - e.g., < X% error rate]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+### Business Metrics
+
+**SC-005:** [NEEDS CLARIFICATION - Business impact target not specified]
+**Target:** [NEEDS CLARIFICATION]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+**SC-006:** [NEEDS CLARIFICATION - ROI or cost savings target not specified]
+**Target:** [NEEDS CLARIFICATION]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+### Quality Metrics
+
+**SC-007:** [NEEDS CLARIFICATION - User satisfaction target not specified]
+**Target:** [NEEDS CLARIFICATION - e.g., X+ satisfaction score]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+**SC-008:** [NEEDS CLARIFICATION - Support ticket reduction target not specified]
+**Target:** [NEEDS CLARIFICATION]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+---
+
+## 7. Dependencies & Assumptions
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - External dependencies not specified]
+- [NEEDS CLARIFICATION - Internal dependencies not specified]
+- [NEEDS CLARIFICATION - Third-party service dependencies not specified]
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - Assumptions about user behavior not specified]
+- [NEEDS CLARIFICATION - Assumptions about system environment not specified]
+- [NEEDS CLARIFICATION - Assumptions about data availability not specified]
+
+---
+
+## 8. Risks & Mitigation
+
+| Risk ID | Risk Description | Impact | Likelihood | Mitigation Strategy |
+|---------|-----------------|--------|------------|-------------------|
+| R-001 | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] |
+| R-002 | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] |
+| R-003 | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] |
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION - What is the specific feature being requested?]
+2. [NEEDS CLARIFICATION - Who are the target users?]
+3. [NEEDS CLARIFICATION - What problem does this feature solve?]
+4. [NEEDS CLARIFICATION - What are the business objectives?]
+5. [NEEDS CLARIFICATION - What are the timeline and budget constraints?]
+6. [NEEDS CLARIFICATION - Are there any regulatory or compliance requirements?]
+7. [NEEDS CLARIFICATION - What are the integration points with existing systems?]
+
+---
+
+## 10. Approval & Sign-off
+
+| Role | Name | Signature | Date |
+|------|------|-----------|------|
+| Product Owner | [NEEDS CLARIFICATION] | | |
+| Engineering Lead | [NEEDS CLARIFICATION] | | |
+| Design Lead | [NEEDS CLARIFICATION] | | |
+| QA Lead | [NEEDS CLARIFICATION] | | |
+
+---
+
+**Document Version:** 1.0
+**Last Updated:** [Current Date]
+**Status:** DRAFT - AWAITING USER REQUIREMENTS
+
+---
+
+## Next Steps
+
+To complete this feature specification, please provide:
+
+1. Detailed description of the feature you want to build
+2. Target users and use cases
+3. Business objectives and success criteria
+4. Any specific requirements or constraints
+5. Integration points or dependencies
+
+Once this information is provided, I can create a comprehensive, actionable feature specification document.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-003.md b/evals/datasets/real-specs/spec-003.md
new file mode 100644
index 0000000000..8747e2f2c5
--- /dev/null
+++ b/evals/datasets/real-specs/spec-003.md
@@ -0,0 +1,531 @@
+# Test Spec 3
+
+## Prompt
+
+Develop a simple calculator web app with basic arithmetic operations
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature described in user requirements]
+
+**Description:**
+No user requirements were provided in the input. This template demonstrates the structure for a comprehensive feature specification document.
+
+**Purpose:**
+[NEEDS CLARIFICATION - Awaiting user requirements to define feature purpose and value proposition]
+
+**Target Users:**
+[NEEDS CLARIFICATION - User personas and roles to be defined based on requirements]
+
+**Business Value:**
+[NEEDS CLARIFICATION - Expected business impact and ROI to be defined]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Must Have
+
+**US-001: [NEEDS CLARIFICATION - Core user story needed]**
+
+- **As a** [role]
+- **I want** [feature/capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger]
+- **Then** [expected outcome]
+
+**Testability:** [Define how this story can be independently tested]
+
+---
+
+**US-002: [NEEDS CLARIFICATION - Core user story needed]**
+
+- **As a** [role]
+- **I want** [feature/capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger]
+- **Then** [expected outcome]
+
+**Testability:** [Define how this story can be independently tested]
+
+---
+
+### Priority 2 (P2) - Should Have
+
+**US-003: [NEEDS CLARIFICATION - Secondary user story needed]**
+
+- **As a** [role]
+- **I want** [feature/capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger]
+- **Then** [expected outcome]
+
+**Testability:** [Define how this story can be independently tested]
+
+---
+
+**US-004: [NEEDS CLARIFICATION - Secondary user story needed]**
+
+- **As a** [role]
+- **I want** [feature/capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger]
+- **Then** [expected outcome]
+
+**Testability:** [Define how this story can be independently tested]
+
+---
+
+### Priority 3 (P3) - Nice to Have
+
+**US-005: [NEEDS CLARIFICATION - Enhancement user story needed]**
+
+- **As a** [role]
+- **I want** [feature/capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger]
+- **Then** [expected outcome]
+
+**Testability:** [Define how this story can be independently tested]
+
+---
+
+**US-006: [NEEDS CLARIFICATION - Enhancement user story needed]**
+
+- **As a** [role]
+- **I want** [feature/capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger]
+- **Then** [expected outcome]
+
+**Testability:** [Define how this story can be independently tested]
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [NEEDS CLARIFICATION - Define primary functional requirement]
+
+- **Description:** [What the system must do]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+**FR-002:** [NEEDS CLARIFICATION - Define primary functional requirement]
+
+- **Description:** [What the system must do]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+**FR-003:** [NEEDS CLARIFICATION - Define primary functional requirement]
+
+- **Description:** [What the system must do]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+---
+
+### Data Requirements
+
+**FR-004:** [NEEDS CLARIFICATION - Define data handling requirements]
+
+- **Description:** [Data that must be captured, stored, or processed]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+**FR-005:** [NEEDS CLARIFICATION - Define data validation requirements]
+
+- **Description:** [Data validation rules and constraints]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+---
+
+### User Interface Requirements
+
+**FR-006:** [NEEDS CLARIFICATION - Define UI/UX requirements]
+
+- **Description:** [User interface elements and interactions needed]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+**FR-007:** [NEEDS CLARIFICATION - Define accessibility requirements]
+
+- **Description:** [Accessibility standards and requirements]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+---
+
+### Integration Requirements
+
+**FR-008:** [NEEDS CLARIFICATION - Define system integration needs]
+
+- **Description:** [External systems or services that must integrate]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** Response Time
+
+- **Requirement:** [NEEDS CLARIFICATION - Define acceptable response times]
+- **Measurement:** [How to measure: e.g., "95th percentile response time under X seconds"]
+
+**NFR-002:** Throughput
+
+- **Requirement:** [NEEDS CLARIFICATION - Define transaction volume requirements]
+- **Measurement:** [How to measure: e.g., "System must handle X transactions per second"]
+
+**NFR-003:** Concurrent Users
+
+- **Requirement:** [NEEDS CLARIFICATION - Define concurrent user capacity]
+- **Measurement:** [How to measure: e.g., "Support X concurrent users without degradation"]
+
+---
+
+### Security Requirements
+
+**NFR-004:** Authentication
+
+- **Requirement:** [NEEDS CLARIFICATION - Define authentication requirements]
+- **Measurement:** [How to verify security controls are implemented]
+
+**NFR-005:** Authorization
+
+- **Requirement:** [NEEDS CLARIFICATION - Define access control requirements]
+- **Measurement:** [How to verify authorization mechanisms]
+
+**NFR-006:** Data Protection
+
+- **Requirement:** [NEEDS CLARIFICATION - Define data encryption and protection needs]
+- **Measurement:** [How to verify data protection compliance]
+
+**NFR-007:** Audit Logging
+
+- **Requirement:** [NEEDS CLARIFICATION - Define audit trail requirements]
+- **Measurement:** [How to verify logging completeness]
+
+---
+
+### Scalability Requirements
+
+**NFR-008:** Horizontal Scalability
+
+- **Requirement:** [NEEDS CLARIFICATION - Define scaling requirements]
+- **Measurement:** [How to measure scalability capacity]
+
+**NFR-009:** Data Growth
+
+- **Requirement:** [NEEDS CLARIFICATION - Define data volume projections]
+- **Measurement:** [How to verify system handles projected growth]
+
+---
+
+### Reliability Requirements
+
+**NFR-010:** Availability
+
+- **Requirement:** [NEEDS CLARIFICATION - Define uptime requirements (e.g., 99.9% availability)]
+- **Measurement:** [How to measure: "System uptime percentage over 30-day period"]
+
+**NFR-011:** Fault Tolerance
+
+- **Requirement:** [NEEDS CLARIFICATION - Define error recovery requirements]
+- **Measurement:** [How to verify system recovers from failures]
+
+**NFR-012:** Data Integrity
+
+- **Requirement:** [NEEDS CLARIFICATION - Define data consistency requirements]
+- **Measurement:** [How to verify data remains accurate and consistent]
+
+---
+
+### Usability Requirements
+
+**NFR-013:** Learning Curve
+
+- **Requirement:** [NEEDS CLARIFICATION - Define ease-of-use expectations]
+- **Measurement:** [How to measure: "New users can complete core tasks within X minutes"]
+
+**NFR-014:** Accessibility Standards
+
+- **Requirement:** [NEEDS CLARIFICATION - Define accessibility compliance needs (e.g., WCAG 2.1 Level AA)]
+- **Measurement:** [How to verify accessibility compliance]
+
+---
+
+### Maintainability Requirements
+
+**NFR-015:** Documentation
+
+- **Requirement:** [NEEDS CLARIFICATION - Define documentation requirements]
+- **Measurement:** [How to verify documentation completeness]
+
+**NFR-016:** Monitoring
+
+- **Requirement:** [NEEDS CLARIFICATION - Define system monitoring needs]
+- **Measurement:** [How to verify monitoring coverage]
+
+---
+
+### Compliance Requirements
+
+**NFR-017:** Regulatory Compliance
+
+- **Requirement:** [NEEDS CLARIFICATION - Define applicable regulations (e.g., GDPR, HIPAA, SOC2)]
+- **Measurement:** [How to verify compliance]
+
+---
+
+## 5. Edge Cases and Error Scenarios
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION - Define minimum input boundaries]
+
+- **Scenario:** [Describe edge case]
+- **Expected Behavior:** [How system should respond]
+- **Error Handling:** [What feedback user receives]
+
+**EC-002:** [NEEDS CLARIFICATION - Define maximum input boundaries]
+
+- **Scenario:** [Describe edge case]
+- **Expected Behavior:** [How system should respond]
+- **Error Handling:** [What feedback user receives]
+
+**EC-003:** [NEEDS CLARIFICATION - Define empty/null state handling]
+
+- **Scenario:** [Describe edge case]
+- **Expected Behavior:** [How system should respond]
+- **Error Handling:** [What feedback user receives]
+
+---
+
+### Error Scenarios
+
+**EC-004:** [NEEDS CLARIFICATION - Define invalid input handling]
+
+- **Scenario:** [Describe error condition]
+- **Expected Behavior:** [How system should respond]
+- **Error Message:** [User-friendly error message]
+- **Recovery Path:** [How user can correct the error]
+
+**EC-005:** [NEEDS CLARIFICATION - Define network failure handling]
+
+- **Scenario:** [Describe error condition]
+- **Expected Behavior:** [How system should respond]
+- **Error Message:** [User-friendly error message]
+- **Recovery Path:** [How user can retry or recover]
+
+**EC-006:** [NEEDS CLARIFICATION - Define timeout scenarios]
+
+- **Scenario:** [Describe error condition]
+- **Expected Behavior:** [How system should respond]
+- **Error Message:** [User-friendly error message]
+- **Recovery Path:** [How user can proceed]
+
+**EC-007:** [NEEDS CLARIFICATION - Define concurrent access conflicts]
+
+- **Scenario:** [Describe conflict condition]
+- **Expected Behavior:** [How system should respond]
+- **Error Message:** [User-friendly error message]
+- **Resolution:** [How conflict is resolved]
+
+---
+
+### Data Integrity Edge Cases
+
+**EC-008:** [NEEDS CLARIFICATION - Define duplicate data handling]
+
+- **Scenario:** [Describe edge case]
+- **Expected Behavior:** [How system should respond]
+- **Prevention:** [How system prevents data issues]
+
+**EC-009:** [NEEDS CLARIFICATION - Define data migration scenarios]
+
+- **Scenario:** [Describe edge case]
+- **Expected Behavior:** [How system should respond]
+- **Rollback Plan:** [How to revert if issues occur]
+
+---
+
+## 6. Success Criteria
+
+### Functional Success Metrics
+
+**SC-001:** Feature Completeness
+
+- **Metric:** [NEEDS CLARIFICATION - Define completion criteria]
+- **Target:** [e.g., "100% of P1 user stories completed and accepted"]
+- **Measurement Method:** [How to verify]
+
+**SC-002:** Acceptance Testing
+
+- **Metric:** [NEEDS CLARIFICATION - Define acceptance test pass rate]
+- **Target:** [e.g., "100% of acceptance criteria passing"]
+- **Measurement Method:** [How to verify]
+
+---
+
+### Performance Success Metrics
+
+**SC-003:** Response Time Achievement
+
+- **Metric:** [NEEDS CLARIFICATION - Define performance target]
+- **Target:** [e.g., "95th percentile response time < X seconds"]
+- **Measurement Method:** [Performance testing methodology]
+
+**SC-004:** Load Capacity
+
+- **Metric:** [NEEDS CLARIFICATION - Define load handling target]
+- **Target:** [e.g., "Successfully handle X concurrent users"]
+- **Measurement Method:** [Load testing methodology]
+
+---
+
+### Quality Success Metrics
+
+**SC-005:** Defect Rate
+
+- **Metric:** [NEEDS CLARIFICATION - Define acceptable defect rate]
+- **Target:** [e.g., "Less than X critical defects in production after 30 days"]
+- **Measurement Method:** [Defect tracking methodology]
+
+**SC-006:** Test Coverage
+
+- **Metric:** [NEEDS CLARIFICATION - Define test coverage requirements]
+- **Target:** [e.g., "Automated test coverage of X% for critical paths"]
+- **Measurement Method:** [Coverage analysis tools]
+
+---
+
+### User Adoption Success Metrics
+
+**SC-007:** User Engagement
+
+- **Metric:** [NEEDS CLARIFICATION - Define engagement metrics]
+- **Target:** [e.g., "X% of target users actively using feature within 30 days"]
+- **Measurement Method:** [Analytics tracking]
+
+**SC-008:** User Satisfaction
+
+- **Metric:** [NEEDS CLARIFICATION - Define satisfaction measurement]
+- **Target:** [e.g., "Average satisfaction score of X/10 or higher"]
+- **Measurement Method:** [User feedback surveys]
+
+**SC-009:** Task Completion Rate
+
+- **Metric:** [NEEDS CLARIFICATION - Define task success rate]
+- **Target:** [e.g., "X% of users successfully complete primary workflow"]
+- **Measurement Method:** [User analytics and session tracking]
+
+---
+
+### Business Success Metrics
+
+**SC-010:** Business Impact
+
+- **Metric:** [NEEDS CLARIFICATION - Define business value metrics]
+- **Target:** [e.g., "Achieve X% increase in [specific business metric]"]
+- **Measurement Method:** [Business analytics]
+
+**SC-011:** ROI Achievement
+
+- **Metric:** [NEEDS CLARIFICATION - Define ROI expectations]
+- **Target:** [e.g., "Achieve positive ROI within X months"]
+- **Measurement Method:** [Financial analysis]
+
+---
+
+## 7. Dependencies and Assumptions
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - List external dependencies that must be available]
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - List assumptions made during specification]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - List known constraints affecting the feature]
+
+---
+
+## 8. Out of Scope
+
+The following items are explicitly out of scope for this feature:
+
+- [NEEDS CLARIFICATION - Define what is NOT included]
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION - List unresolved questions requiring stakeholder input]
+
+---
+
+## 10. Approval and Sign-off
+
+| Role | Name | Date | Signature |
+|------|------|------|-----------|
+| Product Owner | [TBD] | [TBD] | [TBD] |
+| Business Stakeholder | [TBD] | [TBD] | [TBD] |
+| Engineering Lead | [TBD] | [TBD] | [TBD] |
+| QA Lead | [TBD] | [TBD] | [TBD] |
+
+---
+
+**Document Version:** 1.0
+**Last Updated:** [Date]
+**Next Review Date:** [Date]
+
+---
+
+## Instructions for Use
+
+**To complete this specification, please provide:**
+
+1. **Clear feature description** - What problem are you solving?
+2. **Target users** - Who will use this feature?
+3. **User workflows** - What tasks should users be able to complete?
+4. **Business objectives** - What business goals does this support?
+5. **Performance expectations** - What are acceptable response times, capacity, etc.?
+6. **Security and compliance needs** - What standards must be met?
+7. **Success metrics** - How will you measure success?
+
+Once requirements are provided, all sections marked with [NEEDS CLARIFICATION] will be completed with specific, measurable, and testable requirements.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-004.md b/evals/datasets/real-specs/spec-004.md
new file mode 100644
index 0000000000..0736c7d5d7
--- /dev/null
+++ b/evals/datasets/real-specs/spec-004.md
@@ -0,0 +1,301 @@
+# Test Spec 4
+
+## Prompt
+
+Build an e-commerce platform with product catalog, shopping cart, and checkout flow
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+### Feature Name
+
+[NEEDS CLARIFICATION: No user requirements provided]
+
+### Description
+
+This feature specification template is ready to be populated once user requirements are provided. The document will detail the complete functional and non-functional requirements, user stories, acceptance criteria, edge cases, and success metrics for the requested feature.
+
+### Business Justification
+
+[NEEDS CLARIFICATION: Business value and justification pending user input]
+
+### Scope
+
+**In Scope:**
+
+- [To be defined based on user requirements]
+
+**Out of Scope:**
+
+- [To be defined based on user requirements]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (Critical - Must Have)
+
+#### US-001: [Title Pending]
+
+**As a** [role]
+**I want** [capability]
+**So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+**Priority:** P1
+**Story Points:** [TBD]
+
+---
+
+### Priority 2 (Important - Should Have)
+
+#### US-002: [Title Pending]
+
+**As a** [role]
+**I want** [capability]
+**So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+**Priority:** P2
+**Story Points:** [TBD]
+
+---
+
+### Priority 3 (Nice to Have - Could Have)
+
+#### US-003: [Title Pending]
+
+**As a** [role]
+**I want** [capability]
+**So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+**Priority:** P3
+**Story Points:** [TBD]
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [NEEDS CLARIFICATION: Requirement description pending]
+
+- **Testability Criteria:** [How to verify this requirement]
+
+**FR-002:** [NEEDS CLARIFICATION: Requirement description pending]
+
+- **Testability Criteria:** [How to verify this requirement]
+
+### Data Requirements
+
+**FR-003:** [NEEDS CLARIFICATION: Data handling requirements pending]
+
+- **Testability Criteria:** [How to verify this requirement]
+
+### User Interface Requirements
+
+**FR-004:** [NEEDS CLARIFICATION: UI/UX requirements pending]
+
+- **Testability Criteria:** [How to verify this requirement]
+
+### Integration Requirements
+
+**FR-005:** [NEEDS CLARIFICATION: System integration requirements pending]
+
+- **Testability Criteria:** [How to verify this requirement]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION: Performance benchmarks pending]
+
+- **Measurement Method:** [How performance will be measured]
+
+**NFR-002:** [NEEDS CLARIFICATION: Response time requirements pending]
+
+- **Measurement Method:** [How response times will be measured]
+
+### Security Requirements
+
+**NFR-003:** [NEEDS CLARIFICATION: Security and authentication requirements pending]
+
+- **Verification Method:** [How security will be verified]
+
+**NFR-004:** [NEEDS CLARIFICATION: Data protection requirements pending]
+
+- **Verification Method:** [How data protection will be verified]
+
+### Scalability Requirements
+
+**NFR-005:** [NEEDS CLARIFICATION: Scalability requirements pending]
+
+- **Measurement Method:** [How scalability will be measured]
+
+### Usability Requirements
+
+**NFR-006:** [NEEDS CLARIFICATION: Usability standards pending]
+
+- **Measurement Method:** [How usability will be measured]
+
+### Reliability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION: Uptime and reliability requirements pending]
+
+- **Measurement Method:** [How reliability will be measured]
+
+### Accessibility Requirements
+
+**NFR-008:** [NEEDS CLARIFICATION: Accessibility standards pending]
+
+- **Verification Method:** [How accessibility compliance will be verified]
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION: Minimum/maximum value handling]
+
+- **Expected Behavior:** [How system should respond]
+
+**EC-002:** [NEEDS CLARIFICATION: Empty or null input handling]
+
+- **Expected Behavior:** [How system should respond]
+
+### Error Scenarios
+
+**ES-001:** [NEEDS CLARIFICATION: Invalid input handling]
+
+- **Expected Behavior:** [Error message and recovery process]
+
+**ES-002:** [NEEDS CLARIFICATION: System unavailability]
+
+- **Expected Behavior:** [Fallback mechanism]
+
+**ES-003:** [NEEDS CLARIFICATION: Concurrent access conflicts]
+
+- **Expected Behavior:** [Conflict resolution approach]
+
+### Network & Connectivity
+
+**ES-004:** [NEEDS CLARIFICATION: Network failure handling]
+
+- **Expected Behavior:** [Offline capability or error handling]
+
+**ES-005:** [NEEDS CLARIFICATION: Timeout scenarios]
+
+- **Expected Behavior:** [Timeout handling and user notification]
+
+---
+
+## 6. Success Criteria
+
+### Functional Success Metrics
+
+**SC-001:** [NEEDS CLARIFICATION: Feature completion criteria]
+
+- **Target:** [Measurable target value]
+- **Measurement Method:** [How success will be measured]
+
+**SC-002:** [NEEDS CLARIFICATION: User adoption metrics]
+
+- **Target:** [Measurable target value]
+- **Measurement Method:** [How success will be measured]
+
+### Performance Success Metrics
+
+**SC-003:** [NEEDS CLARIFICATION: Performance benchmarks]
+
+- **Target:** [Specific numeric target]
+- **Measurement Method:** [How performance will be tracked]
+
+### Business Success Metrics
+
+**SC-004:** [NEEDS CLARIFICATION: Business impact metrics]
+
+- **Target:** [Measurable business outcome]
+- **Measurement Method:** [How business impact will be measured]
+
+**SC-005:** [NEEDS CLARIFICATION: User satisfaction metrics]
+
+- **Target:** [Satisfaction score or rating]
+- **Measurement Method:** [Survey or feedback mechanism]
+
+---
+
+## 7. Assumptions & Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION: List assumptions about user environment, behavior, or system state]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION: External systems, data sources, or third-party services required]
+
+### Constraints
+
+- [NEEDS CLARIFICATION: Known limitations or restrictions]
+
+---
+
+## 8. Open Questions
+
+1. [NEEDS CLARIFICATION: What are the specific user requirements for this feature?]
+2. [NEEDS CLARIFICATION: Who are the target users/personas?]
+3. [NEEDS CLARIFICATION: What problem does this feature solve?]
+4. [NEEDS CLARIFICATION: What are the expected usage volumes?]
+5. [NEEDS CLARIFICATION: Are there any regulatory or compliance requirements?]
+
+---
+
+## 9. Appendix
+
+### Glossary
+
+- **Term:** Definition
+
+### Related Documents
+
+- [Link to related specifications or requirements documents]
+
+---
+
+## Document Control
+
+| Version | Date | Author | Changes |
+|---------|------|--------|---------|
+| 0.1 | [Date] | [Author] | Initial template created |
+
+---
+
+**Note:** This specification document requires user input to be completed. Please provide detailed requirements including the feature description, target users, business goals, and any specific functionality needed. Once provided, all sections marked with [NEEDS CLARIFICATION] will be populated with specific, measurable, and testable requirements.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-005.md b/evals/datasets/real-specs/spec-005.md
new file mode 100644
index 0000000000..d2b88214b5
--- /dev/null
+++ b/evals/datasets/real-specs/spec-005.md
@@ -0,0 +1,315 @@
+# Test Spec 5
+
+## Prompt
+
+Create a customer relationship management (CRM) system with contact management and sales pipeline
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+[NEEDS CLARIFICATION: No user requirements were provided in the input]
+
+**Feature Name:** [Pending User Input]
+
+**Description:**
+This section will contain a brief description of the feature once user requirements are provided. The overview should explain the purpose of the feature, the problem it solves, and the value it delivers to users and the business.
+
+**Target Users:** [To be defined based on user input]
+
+**Business Value:** [To be defined based on user input]
+
+---
+
+## 2. User Stories
+
+### High Priority (P1)
+
+**US-001: [Story Title - Pending]**
+
+- **Priority:** P1
+- **As a** [user role],
+- **I want** [feature capability],
+- **So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+**US-002: [Story Title - Pending]**
+
+- **Priority:** P1
+- **As a** [user role],
+- **I want** [feature capability],
+- **So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+### Medium Priority (P2)
+
+**US-003: [Story Title - Pending]**
+
+- **Priority:** P2
+- **As a** [user role],
+- **I want** [feature capability],
+- **So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+**US-004: [Story Title - Pending]**
+
+- **Priority:** P2
+- **As a** [user role],
+- **I want** [feature capability],
+- **So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+### Low Priority (P3)
+
+**US-005: [Story Title - Pending]**
+
+- **Priority:** P3
+- **As a** [user role],
+- **I want** [feature capability],
+- **So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [Requirement description - Pending user input]
+
+- **Testability:** [How this will be verified]
+- **Priority:** [P1/P2/P3]
+
+**FR-002:** [Requirement description - Pending user input]
+
+- **Testability:** [How this will be verified]
+- **Priority:** [P1/P2/P3]
+
+**FR-003:** [Requirement description - Pending user input]
+
+- **Testability:** [How this will be verified]
+- **Priority:** [P1/P2/P3]
+
+**FR-004:** [Requirement description - Pending user input]
+
+- **Testability:** [How this will be verified]
+- **Priority:** [P1/P2/P3]
+
+**FR-005:** [Requirement description - Pending user input]
+
+- **Testability:** [How this will be verified]
+- **Priority:** [P1/P2/P3]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-001 to NFR-003)
+
+**NFR-001:** [Performance requirement - Pending user input]
+
+- **Measurement:** [Specific metrics and thresholds]
+- **Priority:** [P1/P2/P3]
+
+**NFR-002:** [Response time requirement - Pending user input]
+
+- **Measurement:** [Specific metrics and thresholds]
+- **Priority:** [P1/P2/P3]
+
+**NFR-003:** [Throughput requirement - Pending user input]
+
+- **Measurement:** [Specific metrics and thresholds]
+- **Priority:** [P1/P2/P3]
+
+---
+
+### Security (NFR-004 to NFR-006)
+
+**NFR-004:** [Security requirement - Pending user input]
+
+- **Measurement:** [Specific security controls and verification methods]
+- **Priority:** [P1/P2/P3]
+
+**NFR-005:** [Data protection requirement - Pending user input]
+
+- **Measurement:** [Specific security controls and verification methods]
+- **Priority:** [P1/P2/P3]
+
+**NFR-006:** [Access control requirement - Pending user input]
+
+- **Measurement:** [Specific security controls and verification methods]
+- **Priority:** [P1/P2/P3]
+
+---
+
+### Scalability (NFR-007 to NFR-008)
+
+**NFR-007:** [Scalability requirement - Pending user input]
+
+- **Measurement:** [Specific capacity metrics]
+- **Priority:** [P1/P2/P3]
+
+**NFR-008:** [Growth handling requirement - Pending user input]
+
+- **Measurement:** [Specific capacity metrics]
+- **Priority:** [P1/P2/P3]
+
+---
+
+### Usability (NFR-009 to NFR-010)
+
+**NFR-009:** [Usability requirement - Pending user input]
+
+- **Measurement:** [Specific usability metrics]
+- **Priority:** [P1/P2/P3]
+
+**NFR-010:** [Accessibility requirement - Pending user input]
+
+- **Measurement:** [Specific accessibility standards]
+- **Priority:** [P1/P2/P3]
+
+---
+
+### Reliability (NFR-011 to NFR-012)
+
+**NFR-011:** [Availability requirement - Pending user input]
+
+- **Measurement:** [Uptime percentage and measurement period]
+- **Priority:** [P1/P2/P3]
+
+**NFR-012:** [Error handling requirement - Pending user input]
+
+- **Measurement:** [Error rate thresholds]
+- **Priority:** [P1/P2/P3]
+
+---
+
+## 5. Edge Cases
+
+**EC-001: [Edge case title - Pending user input]**
+
+- **Scenario:** [Description of boundary condition]
+- **Expected Behavior:** [How the system should respond]
+- **Priority:** [P1/P2/P3]
+
+**EC-002: [Edge case title - Pending user input]**
+
+- **Scenario:** [Description of boundary condition]
+- **Expected Behavior:** [How the system should respond]
+- **Priority:** [P1/P2/P3]
+
+**EC-003: [Edge case title - Pending user input]**
+
+- **Scenario:** [Description of error scenario]
+- **Expected Behavior:** [How the system should respond]
+- **Priority:** [P1/P2/P3]
+
+**EC-004: [Edge case title - Pending user input]**
+
+- **Scenario:** [Description of error scenario]
+- **Expected Behavior:** [How the system should respond]
+- **Priority:** [P1/P2/P3]
+
+**EC-005: [Edge case title - Pending user input]**
+
+- **Scenario:** [Description of concurrent operation scenario]
+- **Expected Behavior:** [How the system should respond]
+- **Priority:** [P1/P2/P3]
+
+---
+
+## 6. Success Criteria
+
+**SC-001:** [Success metric - Pending user input]
+
+- **Measurement Method:** [How this will be measured]
+- **Target:** [Specific numerical target]
+- **Timeline:** [When this should be achieved]
+
+**SC-002:** [Success metric - Pending user input]
+
+- **Measurement Method:** [How this will be measured]
+- **Target:** [Specific numerical target]
+- **Timeline:** [When this should be achieved]
+
+**SC-003:** [Success metric - Pending user input]
+
+- **Measurement Method:** [How this will be measured]
+- **Target:** [Specific numerical target]
+- **Timeline:** [When this should be achieved]
+
+**SC-004:** [Success metric - Pending user input]
+
+- **Measurement Method:** [How this will be measured]
+- **Target:** [Specific numerical target]
+- **Timeline:** [When this should be achieved]
+
+**SC-005:** [Success metric - Pending user input]
+
+- **Measurement Method:** [How this will be measured]
+- **Target:** [Specific numerical target]
+- **Timeline:** [When this should be achieved]
+
+---
+
+## Appendix
+
+### Assumptions
+
+[NEEDS CLARIFICATION: Assumptions will be documented once user requirements are provided]
+
+### Dependencies
+
+[NEEDS CLARIFICATION: Dependencies will be documented once user requirements are provided]
+
+### Out of Scope
+
+[NEEDS CLARIFICATION: Out of scope items will be documented once user requirements are provided]
+
+### Open Questions
+
+1. [NEEDS CLARIFICATION: User requirements input is required to complete this specification]
+2. [Additional questions will be added based on user input]
+
+---
+
+**Note:** Please provide the user requirements to generate a complete and detailed feature specification document. The template above demonstrates the structure and level of detail that will be included once requirements are specified.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-006.md b/evals/datasets/real-specs/spec-006.md
new file mode 100644
index 0000000000..aec54f5cad
--- /dev/null
+++ b/evals/datasets/real-specs/spec-006.md
@@ -0,0 +1,418 @@
+# Test Spec 6
+
+## Prompt
+
+Develop a project management dashboard with task tracking, team collaboration, and reporting
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+### Feature Name
+
+[NEEDS CLARIFICATION - Feature name not provided]
+
+### Description
+
+[NEEDS CLARIFICATION - No user requirements were provided in the input. This specification template is ready to be populated once feature requirements are submitted.]
+
+### Business Value
+
+[NEEDS CLARIFICATION - Business objectives and value proposition need to be defined]
+
+### Target Users
+
+[NEEDS CLARIFICATION - User personas and target audience need to be identified]
+
+### Scope
+
+**In Scope:**
+
+- [NEEDS CLARIFICATION]
+
+**Out of Scope:**
+
+- [NEEDS CLARIFICATION]
+
+---
+
+## 2. User Stories
+
+### User Story Template (Ready for Population)
+
+#### **US-001: [Story Title] [Priority: P1/P2/P3]**
+
+**As a** [user role]
+**I want** [capability]
+**So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action performed]
+ **Then** [expected outcome]
+
+- **Given** [initial context]
+ **When** [action performed]
+ **Then** [expected outcome]
+
+**Definition of Done:**
+
+- [ ] All acceptance criteria met
+- [ ] Edge cases handled
+- [ ] Validation rules implemented
+- [ ] User feedback mechanism in place
+- [ ] Accessible to users with disabilities
+- [ ] Testable independently
+
+---
+
+[NEEDS CLARIFICATION - Please provide user requirements to generate specific user stories. The template will include 5+ prioritized stories covering the areas below; a hypothetical filled-in example follows this list:]
+
+- Primary user workflows (P1)
+- Secondary features (P2)
+- Nice-to-have enhancements (P3)
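+
+To illustrate the intended level of detail, a hypothetical populated story for this prompt (a project management dashboard) might read as follows; the role, task states, and outcomes are assumptions, not confirmed requirements:
+
+```gherkin
+# Hypothetical example; actual roles and workflows await user input
+Scenario: Team member completes an assigned task
+  Given a team member is viewing a task assigned to them
+  When they change the task status to "Done"
+  Then the task moves to the completed list
+  And the project's completion percentage is updated in the reporting view
+```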
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [NEEDS CLARIFICATION]
+**Description:** System must [specific, measurable requirement]
+**Priority:** [P1/P2/P3]
+**Dependencies:** None
+
+**FR-002:** [NEEDS CLARIFICATION]
+**Description:** System must [specific, measurable requirement]
+**Priority:** [P1/P2/P3]
+**Dependencies:** FR-001
+
+### Data Management
+
+**FR-003:** [NEEDS CLARIFICATION]
+**Description:** System must support [data requirements]
+**Priority:** [P1/P2/P3]
+
+### User Interface
+
+**FR-004:** [NEEDS CLARIFICATION]
+**Description:** Interface must provide [UI requirement]
+**Priority:** [P1/P2/P3]
+
+### Integration
+
+**FR-005:** [NEEDS CLARIFICATION]
+**Description:** System must integrate with [integration requirement]
+**Priority:** [P1/P2/P3]
+
+### Validation & Error Handling
+
+**FR-006:** [NEEDS CLARIFICATION]
+**Description:** System must validate [validation requirement]
+**Priority:** [P1/P2/P3]
+
+**FR-007:** [NEEDS CLARIFICATION]
+**Description:** System must display error messages when [condition]
+**Priority:** [P1/P2/P3]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-001 to NFR-005)
+
+**NFR-001:** Response Time
+**Requirement:** System must respond to user actions within [X] seconds under normal load conditions
+**Measurement:** 95th percentile response time < [X] seconds
+
+**NFR-002:** Throughput
+**Requirement:** System must support [X] concurrent users without performance degradation
+**Measurement:** Load testing with [X] concurrent users maintaining response times per NFR-001
+
+**NFR-003:** Data Processing
+**Requirement:** System must process [X] records/transactions per [time unit]
+**Measurement:** Batch processing completion time < [X] minutes for [Y] records
+
+**NFR-004:** Availability
+**Requirement:** System must maintain [X]% uptime during business hours
+**Measurement:** Monthly uptime monitoring excluding planned maintenance
+
+**NFR-005:** Capacity
+**Requirement:** System must handle [X] volume of data/transactions
+**Measurement:** Storage and processing capacity tests
+
+### Security (NFR-006 to NFR-010)
+
+**NFR-006:** Authentication
+**Requirement:** System must authenticate all users before granting access
+**Measurement:** 100% of access attempts require valid authentication
+
+**NFR-007:** Authorization
+**Requirement:** System must enforce role-based access controls
+**Measurement:** Users can only access features permitted by their role
+
+**NFR-008:** Data Protection
+**Requirement:** System must protect sensitive data at rest and in transit
+**Measurement:** Security audit confirms encryption standards are met
+
+**NFR-009:** Audit Logging
+**Requirement:** System must log all user actions and system events
+**Measurement:** 100% of critical actions are logged with timestamp, user, and action details
+
+**NFR-010:** Session Management
+**Requirement:** System must terminate inactive sessions after [X] minutes
+**Measurement:** Automated logout occurs at specified time interval
+
+### Usability (NFR-011 to NFR-014)
+
+**NFR-011:** Accessibility
+**Requirement:** System must comply with WCAG 2.1 Level AA standards
+**Measurement:** Accessibility audit confirms compliance
+
+**NFR-012:** User Interface Consistency
+**Requirement:** System must maintain consistent UI patterns across all screens
+**Measurement:** UI review confirms adherence to design standards
+
+**NFR-013:** Error Messages
+**Requirement:** System must provide clear, actionable error messages
+**Measurement:** User testing confirms 90% of users understand error messages
+
+**NFR-014:** Learning Curve
+**Requirement:** New users must be able to complete core tasks within [X] minutes of first use
+**Measurement:** User testing with [Y] participants completing [Z] core tasks
+
+### Compatibility (NFR-015 to NFR-017)
+
+**NFR-015:** Browser Support
+**Requirement:** System must function correctly on supported browsers
+**Measurement:** Testing confirms functionality on specified browser versions
+
+**NFR-016:** Device Compatibility
+**Requirement:** System must be accessible from [desktop/mobile/tablet] devices
+**Measurement:** Testing confirms responsive behavior across device types
+
+**NFR-017:** Data Format Support
+**Requirement:** System must support [specified file formats/data types]
+**Measurement:** Import/export testing with all supported formats
+
+### Scalability (NFR-018 to NFR-019)
+
+**NFR-018:** User Growth
+**Requirement:** System architecture must support [X]% growth in user base without redesign
+**Measurement:** Capacity planning confirms headroom for growth
+
+**NFR-019:** Data Growth
+**Requirement:** System must handle [X]% annual data growth
+**Measurement:** Storage and query performance maintained with increased data volume
+
+### Reliability (NFR-020 to NFR-021)
+
+**NFR-020:** Data Integrity
+**Requirement:** System must maintain data accuracy and consistency
+**Measurement:** Data validation tests confirm 100% accuracy
+
+**NFR-021:** Recovery
+**Requirement:** System must recover from failures within [X] minutes
+**Measurement:** Disaster recovery testing confirms recovery time objective
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Input Validation Edge Cases
+
+**EC-001:** [NEEDS CLARIFICATION]
+**Scenario:** User enters [boundary condition]
+**Expected Behavior:** System should [expected response]
+
+**EC-002:** [NEEDS CLARIFICATION]
+**Scenario:** User submits empty/null values
+**Expected Behavior:** System should [expected response]
+
+**EC-003:** [NEEDS CLARIFICATION]
+**Scenario:** User enters special characters/invalid format
+**Expected Behavior:** System should [expected response]
+
+**EC-004:** [NEEDS CLARIFICATION]
+**Scenario:** User exceeds maximum input length/size
+**Expected Behavior:** System should [expected response]
+
+### Concurrency Edge Cases
+
+**EC-005:** [NEEDS CLARIFICATION]
+**Scenario:** Multiple users attempt to modify the same record simultaneously
+**Expected Behavior:** System should [expected response]
+
+**EC-006:** [NEEDS CLARIFICATION]
+**Scenario:** User performs rapid repeated actions
+**Expected Behavior:** System should [expected response]
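+
+As a sketch only, EC-005 could eventually be expressed as a concrete scenario like the one below; the conflict-handling behavior shown is an assumption pending requirements:
+
+```gherkin
+# Assumed conflict-handling strategy; to be confirmed with stakeholders
+Scenario: Two users edit the same task simultaneously
+  Given two users have the same task open for editing
+  When both users submit changes
+  Then the first save succeeds
+  And the second user sees a conflict notice with options to reload or merge
+```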
+
+### System State Edge Cases
+
+**EC-007:** [NEEDS CLARIFICATION]
+**Scenario:** System reaches maximum capacity
+**Expected Behavior:** System should [expected response]
+
+**EC-008:** [NEEDS CLARIFICATION]
+**Scenario:** External dependency is unavailable
+**Expected Behavior:** System should [expected response]
+
+**EC-009:** [NEEDS CLARIFICATION]
+**Scenario:** User session expires during operation
+**Expected Behavior:** System should [expected response]
+
+### Data Edge Cases
+
+**EC-010:** [NEEDS CLARIFICATION]
+**Scenario:** Data set is empty/contains zero records
+**Expected Behavior:** System should [expected response]
+
+**EC-011:** [NEEDS CLARIFICATION]
+**Scenario:** Data contains extreme values (min/max boundaries)
+**Expected Behavior:** System should [expected response]
+
+**EC-012:** [NEEDS CLARIFICATION]
+**Scenario:** Data contains duplicates
+**Expected Behavior:** System should [expected response]
+
+### Network & Connectivity Edge Cases
+
+**EC-013:** [NEEDS CLARIFICATION]
+**Scenario:** Network connection is lost during operation
+**Expected Behavior:** System should [expected response]
+
+**EC-014:** [NEEDS CLARIFICATION]
+**Scenario:** Request times out
+**Expected Behavior:** System should [expected response]
+
+---
+
+## 6. Success Criteria
+
+### User Adoption Metrics
+
+**SC-001:** User Engagement
+**Criteria:** [X]% of target users actively using the feature within [Y] days of launch
+**Measurement Method:** Analytics tracking active users
+
+**SC-002:** Task Completion Rate
+**Criteria:** [X]% of users successfully complete primary workflow without assistance
+**Measurement Method:** User session analysis and funnel tracking
+
+**SC-003:** User Satisfaction
+**Criteria:** Average user satisfaction score of [X] out of [Y] in post-launch survey
+**Measurement Method:** User satisfaction survey (NPS, CSAT, or similar)
+
+### Performance Metrics
+
+**SC-004:** Response Time
+**Criteria:** 95th percentile response time remains below [X] seconds
+**Measurement Method:** Performance monitoring tools
+
+**SC-005:** Error Rate
+**Criteria:** Error rate remains below [X]% of all transactions
+**Measurement Method:** Error logging and monitoring
+
+**SC-006:** System Uptime
+**Criteria:** System maintains [X]% uptime in first [Y] months post-launch
+**Measurement Method:** Uptime monitoring service
+
+### Business Metrics
+
+**SC-007:** [NEEDS CLARIFICATION - Business KPI]
+**Criteria:** Achieve [specific business outcome] within [timeframe]
+**Measurement Method:** [Measurement approach]
+
+**SC-008:** Cost Efficiency [if applicable]
+**Criteria:** Feature reduces [process/cost] by [X]%
+**Measurement Method:** Before/after comparison analysis
+
+### Quality Metrics
+
+**SC-009:** Defect Rate
+**Criteria:** Post-launch critical defects < [X] per [time period]
+**Measurement Method:** Defect tracking system
+
+**SC-010:** Accessibility Compliance
+**Criteria:** Zero critical accessibility violations in audit
+**Measurement Method:** Accessibility testing tools and manual audit
+
+### Adoption Milestones
+
+**SC-011:** Initial Adoption
+**Criteria:** [X] users/transactions in first week
+**Measurement Method:** Usage analytics
+
+**SC-012:** Sustained Adoption
+**Criteria:** [X]% month-over-month growth for first [Y] months
+**Measurement Method:** Monthly usage reports
+
+---
+
+## 7. Assumptions & Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - List assumptions about user behavior, system environment, etc.]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - List dependencies on other features, systems, or external factors]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - List any known constraints (budget, time, resources)]
+
+---
+
+## 8. Open Questions
+
+1. [NEEDS CLARIFICATION - What are the specific user requirements for this feature?]
+2. [NEEDS CLARIFICATION - Who are the target users and their roles?]
+3. [NEEDS CLARIFICATION - What problem does this feature solve?]
+4. [NEEDS CLARIFICATION - What are the key workflows users need to complete?]
+5. [NEEDS CLARIFICATION - Are there existing systems this needs to work with?]
+6. [NEEDS CLARIFICATION - What are the success metrics from a business perspective?]
+7. [NEEDS CLARIFICATION - What is the expected launch timeline?]
+8. [NEEDS CLARIFICATION - What are the priority levels for different aspects?]
+
+---
+
+## 9. Approval & Sign-off
+
+**Document Version:** 0.1 (Template - Awaiting Requirements)
+**Last Updated:** [Current Date]
+**Status:** Draft - Awaiting User Input
+
+**Stakeholder Approval:**
+
+- [ ] Product Owner
+- [ ] Business Stakeholder
+- [ ] User Experience Lead
+- [ ] Quality Assurance Lead
+
+---
+
+## Instructions for Completing This Specification
+
+To complete this feature specification document, please provide:
+
+1. **Feature description** - What is being built and why?
+2. **User roles** - Who will use this feature?
+3. **Core workflows** - What tasks do users need to accomplish?
+4. **Business requirements** - What business problems does this solve?
+5. **Integration needs** - What other systems are involved?
+6. **Success metrics** - How will success be measured?
+7. **Constraints** - Any known limitations or requirements?
+
+Once requirements are provided, this template will be populated with specific, measurable, and testable specifications following the structure above.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-007.md b/evals/datasets/real-specs/spec-007.md
new file mode 100644
index 0000000000..78b4c99446
--- /dev/null
+++ b/evals/datasets/real-specs/spec-007.md
@@ -0,0 +1,272 @@
+# Test Spec 7
+
+## Prompt
+
+Build a distributed microservices architecture for a real-time analytics platform
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature description provided]
+
+**Description:**
+No user requirements were provided in the input. This specification template is ready to be populated once feature requirements are submitted.
+
+**Target Users:** [NEEDS CLARIFICATION]
+
+**Business Objective:** [NEEDS CLARIFICATION]
+
+**Scope:** [NEEDS CLARIFICATION]
+
+---
+
+## 2. User Stories
+
+### P1 - Critical Priority
+
+**US-001: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [feature capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger occurs]
+- **Then** [expected outcome]
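+
+Purely for illustration, a populated P1 scenario for the stated prompt (a real-time analytics platform) might resemble the following; the workflow and freshness expectation are invented placeholders:
+
+```gherkin
+# Invented workflow for illustration only
+Scenario: Analyst opens a live metrics dashboard
+  Given events are streaming into the analytics platform
+  When an analyst opens the live dashboard
+  Then the displayed aggregates include events received in the last few seconds
+```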
+
+---
+
+**US-002: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [feature capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger occurs]
+- **Then** [expected outcome]
+
+---
+
+### P2 - High Priority
+
+**US-003: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [feature capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger occurs]
+- **Then** [expected outcome]
+
+---
+
+**US-004: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [feature capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger occurs]
+- **Then** [expected outcome]
+
+---
+
+### P3 - Medium Priority
+
+**US-005: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [feature capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger occurs]
+- **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Specific functional behavior required]
+
+**FR-002:** [NEEDS CLARIFICATION - Specific functional behavior required]
+
+**FR-003:** [NEEDS CLARIFICATION - Specific functional behavior required]
+
+**FR-004:** [NEEDS CLARIFICATION - Specific functional behavior required]
+
+**FR-005:** [NEEDS CLARIFICATION - Specific functional behavior required]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION - Response time requirements]
+
+- Example: System shall respond to user actions within X seconds under normal load
+
+**NFR-002:** [NEEDS CLARIFICATION - Throughput requirements]
+
+- Example: System shall support X concurrent users
+
+**NFR-003:** [NEEDS CLARIFICATION - Load capacity]
+
+- Example: System shall handle X transactions per second
+
+### Security Requirements
+
+**NFR-004:** [NEEDS CLARIFICATION - Authentication requirements]
+
+**NFR-005:** [NEEDS CLARIFICATION - Authorization requirements]
+
+**NFR-006:** [NEEDS CLARIFICATION - Data protection requirements]
+
+### Scalability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION - Growth capacity]
+
+**NFR-008:** [NEEDS CLARIFICATION - Resource scaling]
+
+### Reliability Requirements
+
+**NFR-009:** [NEEDS CLARIFICATION - Uptime requirements]
+
+- Example: System shall maintain 99.X% uptime
+
+**NFR-010:** [NEEDS CLARIFICATION - Data integrity requirements]
+
+### Usability Requirements
+
+**NFR-011:** [NEEDS CLARIFICATION - User experience standards]
+
+**NFR-012:** [NEEDS CLARIFICATION - Accessibility requirements]
+
+### Maintainability Requirements
+
+**NFR-013:** [NEEDS CLARIFICATION - Support and maintenance needs]
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Input Validation
+
+**EC-001:** [NEEDS CLARIFICATION - Invalid input handling]
+
+**EC-002:** [NEEDS CLARIFICATION - Missing required data]
+
+**EC-003:** [NEEDS CLARIFICATION - Data format errors]
+
+### System Boundaries
+
+**EC-004:** [NEEDS CLARIFICATION - Maximum capacity scenarios]
+
+**EC-005:** [NEEDS CLARIFICATION - Minimum threshold scenarios]
+
+**EC-006:** [NEEDS CLARIFICATION - Timeout scenarios]
+
+### Concurrent Operations
+
+**EC-007:** [NEEDS CLARIFICATION - Simultaneous user actions]
+
+**EC-008:** [NEEDS CLARIFICATION - Race conditions]
+
+### Integration Points
+
+**EC-009:** [NEEDS CLARIFICATION - External system failures]
+
+**EC-010:** [NEEDS CLARIFICATION - Network connectivity issues]
+
+### Data Scenarios
+
+**EC-011:** [NEEDS CLARIFICATION - Empty state handling]
+
+**EC-012:** [NEEDS CLARIFICATION - Duplicate data handling]
+
+---
+
+## 6. Success Criteria
+
+**SC-001:** [NEEDS CLARIFICATION - Measurable adoption metric]
+
+- Example: X% of target users complete primary workflow within first week
+
+**SC-002:** [NEEDS CLARIFICATION - Measurable performance metric]
+
+- Example: X% reduction in time to complete task Y
+
+**SC-003:** [NEEDS CLARIFICATION - Measurable quality metric]
+
+- Example: Error rate below X%
+
+**SC-004:** [NEEDS CLARIFICATION - Measurable satisfaction metric]
+
+- Example: User satisfaction score of X or higher
+
+**SC-005:** [NEEDS CLARIFICATION - Measurable business metric]
+
+- Example: X% increase in conversion/completion rate
+
+---
+
+## 7. Assumptions & Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - List assumptions about user behavior, environment, or context]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - List dependencies on other systems, teams, or features]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - List known limitations or constraints]
+
+---
+
+## 8. Open Questions
+
+1. [NEEDS CLARIFICATION - What is the feature being specified?]
+2. [NEEDS CLARIFICATION - Who are the target users?]
+3. [NEEDS CLARIFICATION - What problem does this solve?]
+4. [NEEDS CLARIFICATION - What are the key workflows?]
+5. [NEEDS CLARIFICATION - What are the priority requirements?]
+
+---
+
+## Next Steps
+
+**To complete this specification, please provide:**
+
+1. Detailed description of the feature or capability to be built
+2. Target user roles and personas
+3. Key user workflows and scenarios
+4. Business objectives and success metrics
+5. Any known constraints or requirements
+
+Once user requirements are provided, this document will be fully populated with specific, measurable, and testable requirements.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-008.md b/evals/datasets/real-specs/spec-008.md
new file mode 100644
index 0000000000..e5c5fc400b
--- /dev/null
+++ b/evals/datasets/real-specs/spec-008.md
@@ -0,0 +1,221 @@
+# Test Spec 8
+
+## Prompt
+
+Create an enterprise inventory management system with multi-warehouse support and compliance tracking
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature description provided]
+
+**Description:**
+No user requirements were provided in the input. This specification document template is ready to be populated once feature requirements are submitted.
+
+**Purpose:**
+[NEEDS CLARIFICATION - Awaiting user input to define feature purpose and business objectives]
+
+**Target Users:**
+[NEEDS CLARIFICATION - User roles and personas to be defined]
+
+**Business Value:**
+[NEEDS CLARIFICATION - Expected business impact and value proposition to be defined]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Must Have
+
+**US-001: [NEEDS CLARIFICATION]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+- **Given** [context to be defined]
+- **When** [action to be defined]
+- **Then** [expected outcome to be defined]
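+
+For illustration, a hypothetical populated scenario for the inventory prompt could look like this; the warehouse names and audit behavior are assumptions:
+
+```gherkin
+# Warehouse names and audit behavior are placeholders
+Scenario: Manager transfers stock between warehouses
+  Given an item has available quantity in warehouse A
+  When the manager records a transfer of ten units to warehouse B
+  Then the on-hand quantities of both warehouses are updated
+  And the transfer is recorded in the compliance audit trail
+```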
+
+---
+
+**US-002: [NEEDS CLARIFICATION]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+- **Given** [context to be defined]
+- **When** [action to be defined]
+- **Then** [expected outcome to be defined]
+
+---
+
+### Priority 2 (P2) - Should Have
+
+**US-003: [NEEDS CLARIFICATION]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+- **Given** [context to be defined]
+- **When** [action to be defined]
+- **Then** [expected outcome to be defined]
+
+---
+
+**US-004: [NEEDS CLARIFICATION]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+- **Given** [context to be defined]
+- **When** [action to be defined]
+- **Then** [expected outcome to be defined]
+
+---
+
+### Priority 3 (P3) - Nice to Have
+
+**US-005: [NEEDS CLARIFICATION]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+- **Given** [context to be defined]
+- **When** [action to be defined]
+- **Then** [expected outcome to be defined]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Core functionality to be defined]
+
+**FR-002:** [NEEDS CLARIFICATION - User interaction requirements to be defined]
+
+**FR-003:** [NEEDS CLARIFICATION - Data handling requirements to be defined]
+
+**FR-004:** [NEEDS CLARIFICATION - Validation rules to be defined]
+
+**FR-005:** [NEEDS CLARIFICATION - Business logic requirements to be defined]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-P)
+
+**NFR-P-001:** [NEEDS CLARIFICATION - Response time requirements to be defined]
+
+**NFR-P-002:** [NEEDS CLARIFICATION - Throughput requirements to be defined]
+
+**NFR-P-003:** [NEEDS CLARIFICATION - Resource utilization limits to be defined]
+
+### Security (NFR-S)
+
+**NFR-S-001:** [NEEDS CLARIFICATION - Authentication requirements to be defined]
+
+**NFR-S-002:** [NEEDS CLARIFICATION - Authorization requirements to be defined]
+
+**NFR-S-003:** [NEEDS CLARIFICATION - Data protection requirements to be defined]
+
+### Scalability (NFR-SC)
+
+**NFR-SC-001:** [NEEDS CLARIFICATION - Concurrent user capacity to be defined]
+
+**NFR-SC-002:** [NEEDS CLARIFICATION - Data volume handling to be defined]
+
+### Usability (NFR-U)
+
+**NFR-U-001:** [NEEDS CLARIFICATION - Accessibility standards to be defined]
+
+**NFR-U-002:** [NEEDS CLARIFICATION - User experience requirements to be defined]
+
+### Reliability (NFR-R)
+
+**NFR-R-001:** [NEEDS CLARIFICATION - Availability requirements to be defined]
+
+**NFR-R-002:** [NEEDS CLARIFICATION - Error handling requirements to be defined]
+
+---
+
+## 5. Edge Cases
+
+**EC-001:** [NEEDS CLARIFICATION - Boundary conditions to be defined]
+
+**EC-002:** [NEEDS CLARIFICATION - Invalid input scenarios to be defined]
+
+**EC-003:** [NEEDS CLARIFICATION - Concurrent operation conflicts to be defined]
+
+**EC-004:** [NEEDS CLARIFICATION - System unavailability scenarios to be defined]
+
+**EC-005:** [NEEDS CLARIFICATION - Data inconsistency scenarios to be defined]
+
+---
+
+## 6. Success Criteria
+
+**SC-001:** [NEEDS CLARIFICATION - User adoption metrics to be defined]
+
+**SC-002:** [NEEDS CLARIFICATION - Performance benchmarks to be defined]
+
+**SC-003:** [NEEDS CLARIFICATION - Quality metrics to be defined]
+
+**SC-004:** [NEEDS CLARIFICATION - Business outcome measurements to be defined]
+
+**SC-005:** [NEEDS CLARIFICATION - User satisfaction targets to be defined]
+
+---
+
+## 7. Assumptions
+
+[NEEDS CLARIFICATION - List assumptions about user behavior, system context, and constraints]
+
+---
+
+## 8. Dependencies
+
+[NEEDS CLARIFICATION - Identify dependencies on other systems, features, or external services]
+
+---
+
+## 9. Out of Scope
+
+[NEEDS CLARIFICATION - Explicitly define what is NOT included in this feature]
+
+---
+
+## Next Steps
+
+To complete this specification, please provide:
+
+1. **Feature description**: What problem does this feature solve?
+2. **Target users**: Who will use this feature?
+3. **Key capabilities**: What should users be able to do?
+4. **Constraints**: Any limitations, compliance requirements, or specific needs?
+5. **Success metrics**: How will we measure if this feature is successful?
+
+Once these details are provided, this specification can be fully populated with specific, measurable, and testable requirements.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-009.md b/evals/datasets/real-specs/spec-009.md
new file mode 100644
index 0000000000..134a218b4a
--- /dev/null
+++ b/evals/datasets/real-specs/spec-009.md
@@ -0,0 +1,355 @@
+# Test Spec 9
+
+## Prompt
+
+Develop a multi-tenant SaaS application with role-based access control and API integrations
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature description provided]
+
+**Description:**
+[NEEDS CLARIFICATION - Please provide the user requirements/feature description to create a comprehensive specification document]
+
+**Purpose:**
+[To be defined based on user input]
+
+**Target Users:**
+[To be defined based on user input]
+
+**Business Value:**
+[To be defined based on user input]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Critical
+
+**US-001: [NEEDS CLARIFICATION]**
+
+- **As a** [user role],
+- **I want** [feature/capability],
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/event occurs]
+- **Then** [expected outcome]
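+
+A hypothetical populated version of US-001 for the multi-tenant RBAC prompt is sketched below; the role and permission names are placeholders, not confirmed requirements:
+
+```gherkin
+# Role and permission names are placeholders
+Scenario: Tenant admin restricts exports to managers
+  Given a tenant admin is editing role permissions for their tenant
+  When they remove the "export data" permission from the Member role
+  Then members of that tenant can no longer start exports
+  And users in all other tenants are unaffected
+```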
+
+**US-002: [NEEDS CLARIFICATION]**
+
+- **As a** [user role],
+- **I want** [feature/capability],
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/event occurs]
+- **Then** [expected outcome]
+
+**US-003: [NEEDS CLARIFICATION]**
+
+- **As a** [user role],
+- **I want** [feature/capability],
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/event occurs]
+- **Then** [expected outcome]
+
+### Priority 2 (P2) - Important
+
+**US-004: [NEEDS CLARIFICATION]**
+
+- **As a** [user role],
+- **I want** [feature/capability],
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/event occurs]
+- **Then** [expected outcome]
+
+**US-005: [NEEDS CLARIFICATION]**
+
+- **As a** [user role],
+- **I want** [feature/capability],
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/event occurs]
+- **Then** [expected outcome]
+
+### Priority 3 (P3) - Nice to Have
+
+**US-006: [NEEDS CLARIFICATION]**
+
+- **As a** [user role],
+- **I want** [feature/capability],
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/event occurs]
+- **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Specific functional requirement]
+
+- **Description:** [Detailed description]
+- **Priority:** [P1/P2/P3]
+- **Dependencies:** [Related requirements]
+
+**FR-002:** [NEEDS CLARIFICATION - Specific functional requirement]
+
+- **Description:** [Detailed description]
+- **Priority:** [P1/P2/P3]
+- **Dependencies:** [Related requirements]
+
+**FR-003:** [NEEDS CLARIFICATION - Specific functional requirement]
+
+- **Description:** [Detailed description]
+- **Priority:** [P1/P2/P3]
+- **Dependencies:** [Related requirements]
+
+**FR-004:** [NEEDS CLARIFICATION - Specific functional requirement]
+
+- **Description:** [Detailed description]
+- **Priority:** [P1/P2/P3]
+- **Dependencies:** [Related requirements]
+
+**FR-005:** [NEEDS CLARIFICATION - Specific functional requirement]
+
+- **Description:** [Detailed description]
+- **Priority:** [P1/P2/P3]
+- **Dependencies:** [Related requirements]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION - Performance metric]
+
+- **Description:** [Specific measurable performance requirement]
+- **Target:** [Numeric threshold]
+- **Measurement Method:** [How to measure]
+
+**NFR-002:** [NEEDS CLARIFICATION - Response time requirement]
+
+- **Description:** [Specific measurable requirement]
+- **Target:** [Numeric threshold]
+- **Measurement Method:** [How to measure]
+
+### Security Requirements
+
+**NFR-003:** [NEEDS CLARIFICATION - Security requirement]
+
+- **Description:** [Specific security control or measure]
+- **Compliance:** [Relevant standards if applicable]
+
+**NFR-004:** [NEEDS CLARIFICATION - Data protection requirement]
+
+- **Description:** [Specific data security requirement]
+- **Compliance:** [Relevant standards if applicable]
+
+### Scalability Requirements
+
+**NFR-005:** [NEEDS CLARIFICATION - Scalability requirement]
+
+- **Description:** [Specific scalability metric]
+- **Target:** [Numeric threshold or growth capacity]
+
+### Usability Requirements
+
+**NFR-006:** [NEEDS CLARIFICATION - Usability requirement]
+
+- **Description:** [Specific usability standard]
+- **Target:** [Measurable usability metric]
+
+### Reliability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION - Availability requirement]
+
+- **Description:** [Uptime or availability requirement]
+- **Target:** [Percentage or time-based metric]
+
+### Accessibility Requirements
+
+**NFR-008:** [NEEDS CLARIFICATION - Accessibility standard]
+
+- **Description:** [Specific accessibility compliance requirement]
+- **Standard:** [e.g., WCAG 2.1 Level AA]
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION - Boundary condition]
+
+- **Scenario:** [Description of edge case]
+- **Expected Behavior:** [How system should handle this case]
+
+**EC-002:** [NEEDS CLARIFICATION - Maximum capacity scenario]
+
+- **Scenario:** [Description of edge case]
+- **Expected Behavior:** [How system should handle this case]
+
+**EC-003:** [NEEDS CLARIFICATION - Minimum value scenario]
+
+- **Scenario:** [Description of edge case]
+- **Expected Behavior:** [How system should handle this case]
+
+### Error Scenarios
+
+**EC-004:** [NEEDS CLARIFICATION - Invalid input scenario]
+
+- **Scenario:** [Description of error case]
+- **Expected Behavior:** [Error handling approach]
+- **User Feedback:** [What user should see/experience]
+
+**EC-005:** [NEEDS CLARIFICATION - System unavailability scenario]
+
+- **Scenario:** [Description of error case]
+- **Expected Behavior:** [Error handling approach]
+- **User Feedback:** [What user should see/experience]
+
+**EC-006:** [NEEDS CLARIFICATION - Timeout scenario]
+
+- **Scenario:** [Description of error case]
+- **Expected Behavior:** [Error handling approach]
+- **User Feedback:** [What user should see/experience]
+
+### Data Integrity Scenarios
+
+**EC-007:** [NEEDS CLARIFICATION - Concurrent access scenario]
+
+- **Scenario:** [Description of edge case]
+- **Expected Behavior:** [How system should handle this case]
+
+**EC-008:** [NEEDS CLARIFICATION - Data corruption scenario]
+
+- **Scenario:** [Description of edge case]
+- **Expected Behavior:** [How system should handle this case]
+
+---
+
+## 6. Success Criteria
+
+### User Adoption Metrics
+
+**SC-001:** [NEEDS CLARIFICATION - Adoption metric]
+
+- **Metric:** [Specific measurable indicator]
+- **Target:** [Numeric goal]
+- **Timeline:** [When to achieve]
+- **Measurement Method:** [How to track]
+
+**SC-002:** [NEEDS CLARIFICATION - User satisfaction metric]
+
+- **Metric:** [Specific measurable indicator]
+- **Target:** [Numeric goal]
+- **Timeline:** [When to achieve]
+- **Measurement Method:** [How to track]
+
+### Business Impact Metrics
+
+**SC-003:** [NEEDS CLARIFICATION - Business outcome metric]
+
+- **Metric:** [Specific measurable indicator]
+- **Target:** [Numeric goal]
+- **Timeline:** [When to achieve]
+- **Measurement Method:** [How to track]
+
+**SC-004:** [NEEDS CLARIFICATION - Efficiency metric]
+
+- **Metric:** [Specific measurable indicator]
+- **Target:** [Numeric goal]
+- **Timeline:** [When to achieve]
+- **Measurement Method:** [How to track]
+
+### Quality Metrics
+
+**SC-005:** [NEEDS CLARIFICATION - Error rate metric]
+
+- **Metric:** [Specific measurable indicator]
+- **Target:** [Numeric goal]
+- **Timeline:** [When to achieve]
+- **Measurement Method:** [How to track]
+
+**SC-006:** [NEEDS CLARIFICATION - Performance metric]
+
+- **Metric:** [Specific measurable indicator]
+- **Target:** [Numeric goal]
+- **Timeline:** [When to achieve]
+- **Measurement Method:** [How to track]
+
+---
+
+## 7. Assumptions & Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - List key assumptions]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - List external dependencies]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - List known constraints]
+
+---
+
+## 8. Out of Scope
+
+The following items are explicitly out of scope for this feature:
+
+- [NEEDS CLARIFICATION - Items not included in this release]
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION - Question requiring stakeholder input]
+2. [NEEDS CLARIFICATION - Question requiring stakeholder input]
+3. [NEEDS CLARIFICATION - Question requiring stakeholder input]
+
+---
+
+## Document Control
+
+**Version:** 0.1 (Draft - Awaiting User Requirements)
+**Last Updated:** [Current Date]
+**Status:** Incomplete - Requires user input
+**Next Review Date:** [To be scheduled after requirements provided]
+
+---
+
+**Note:** This document is a template awaiting your specific feature requirements. Please provide the details of the feature you want specified, and I will create a complete, detailed specification document tailored to your needs.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-010.md b/evals/datasets/real-specs/spec-010.md
new file mode 100644
index 0000000000..b27cdd7520
--- /dev/null
+++ b/evals/datasets/real-specs/spec-010.md
@@ -0,0 +1,199 @@
+# Test Spec 10
+
+## Prompt
+
+Design a RESTful API for a social media platform with user profiles, posts, and messaging
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature specified]
+
+**Description:**
+No user requirements were provided in the input. This specification template is ready to be populated once feature requirements are submitted.
+
+**Business Value:**
+[NEEDS CLARIFICATION - Awaiting feature description]
+
+**Target Users:**
+[NEEDS CLARIFICATION - Awaiting user persona details]
+
+**Scope:**
+[NEEDS CLARIFICATION - Awaiting feature boundaries]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (Must Have)
+
+**US-001: [NEEDS CLARIFICATION]**
+
+- **As a** [role]
+- **I want** [feature]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
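+
+As an illustrative sketch for the social media API prompt, a populated scenario might read as follows; the endpoint behavior and response details are assumptions:
+
+```gherkin
+# Endpoint behavior shown is an assumption
+Scenario: User publishes a post through the API
+  Given an authenticated user with a complete profile
+  When the client submits a valid create-post request
+  Then the API returns the created post with a success status
+  And the post appears in the author's feed
+```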
+
+---
+
+### Priority 2 (Should Have)
+
+**US-002: [NEEDS CLARIFICATION]**
+
+- **As a** [role]
+- **I want** [feature]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+### Priority 3 (Nice to Have)
+
+**US-003: [NEEDS CLARIFICATION]**
+
+- **As a** [role]
+- **I want** [feature]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Awaiting feature requirements]
+
+**FR-002:** [NEEDS CLARIFICATION - Awaiting feature requirements]
+
+**FR-003:** [NEEDS CLARIFICATION - Awaiting feature requirements]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance
+
+**NFR-001:** [NEEDS CLARIFICATION - Awaiting performance expectations]
+
+**NFR-002:** [NEEDS CLARIFICATION - Awaiting response time requirements]
+
+### Security
+
+**NFR-003:** [NEEDS CLARIFICATION - Awaiting security requirements]
+
+**NFR-004:** [NEEDS CLARIFICATION - Awaiting authentication/authorization requirements]
+
+### Scalability
+
+**NFR-005:** [NEEDS CLARIFICATION - Awaiting scalability requirements]
+
+### Usability
+
+**NFR-006:** [NEEDS CLARIFICATION - Awaiting usability requirements]
+
+### Accessibility
+
+**NFR-007:** [NEEDS CLARIFICATION - Awaiting accessibility standards]
+
+### Reliability
+
+**NFR-008:** [NEEDS CLARIFICATION - Awaiting uptime/availability requirements]
+
+---
+
+## 5. Edge Cases
+
+**EC-001:** [NEEDS CLARIFICATION - Awaiting boundary conditions]
+
+**EC-002:** [NEEDS CLARIFICATION - Awaiting error scenarios]
+
+**EC-003:** [NEEDS CLARIFICATION - Awaiting invalid input handling]
+
+**EC-004:** [NEEDS CLARIFICATION - Awaiting concurrent usage scenarios]
+
+**EC-005:** [NEEDS CLARIFICATION - Awaiting data limit scenarios]
+
+---
+
+## 6. Success Criteria
+
+**SC-001:** [NEEDS CLARIFICATION - Awaiting measurable success metrics]
+
+**SC-002:** [NEEDS CLARIFICATION - Awaiting adoption targets]
+
+**SC-003:** [NEEDS CLARIFICATION - Awaiting performance benchmarks]
+
+**SC-004:** [NEEDS CLARIFICATION - Awaiting user satisfaction metrics]
+
+---
+
+## 7. Assumptions
+
+[NEEDS CLARIFICATION - Awaiting feature context to identify assumptions]
+
+---
+
+## 8. Dependencies
+
+[NEEDS CLARIFICATION - Awaiting feature details to identify dependencies]
+
+---
+
+## 9. Out of Scope
+
+[NEEDS CLARIFICATION - Awaiting feature boundaries]
+
+---
+
+## 10. Open Questions
+
+1. What is the primary feature being requested?
+2. Who are the target users and what problems are they experiencing?
+3. What are the business goals for this feature?
+4. Are there any existing systems or processes this feature should integrate with?
+5. What are the expected usage patterns and volume?
+6. Are there any compliance or regulatory requirements?
+7. What is the timeline or release deadline?
+
+---
+
+## Next Steps
+
+To complete this specification, please provide:
+
+- Detailed description of the desired feature
+- Target user personas and their pain points
+- Business objectives and expected outcomes
+- Any existing workflows or systems to consider
+- Constraints (budget, timeline, resources)
+
+---
+
+**Document Status:** DRAFT - Awaiting Requirements
+**Last Updated:** [Current Date]
+**Author:** Product Specification Team
+
+---
+
+*Please provide your feature requirements, and I will generate a comprehensive specification document tailored to your needs.*
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-011.md b/evals/datasets/real-specs/spec-011.md
new file mode 100644
index 0000000000..719da907c2
--- /dev/null
+++ b/evals/datasets/real-specs/spec-011.md
@@ -0,0 +1,549 @@
+# Test Spec 11
+
+## Prompt
+
+Build a GraphQL API for a content management system with flexible querying
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No user input provided]
+
+**Description:**
+This feature specification document is a template awaiting user requirements. Once requirements are provided, this section will contain:
+
+- Brief summary of the feature's purpose
+- Primary problem it solves
+- Target user base
+- High-level scope boundaries
+
+**Document Version:** 1.0
+**Last Updated:** [Current Date]
+**Status:** Draft - Awaiting Requirements
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Must Have
+
+**US-001: [Awaiting Requirements]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is taken]
+Then [expected outcome]
+
+Given [alternative context]
+When [action is taken]
+Then [expected outcome]
+```
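+
+For comparison, a hypothetical populated scenario for the CMS prompt might look like the sketch below; the content types and fields are assumed, not specified:
+
+```gherkin
+# Content types and fields are assumptions
+Scenario: Editor requests only the fields they need
+  Given published articles exist in the CMS
+  When an editor queries article titles and author names in a single request
+  Then the response contains exactly the requested fields for each article
+```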
+
+---
+
+**US-002: [Awaiting Requirements]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is taken]
+Then [expected outcome]
+```
+
+---
+
+### Priority 2 (P2) - Should Have
+
+**US-003: [Awaiting Requirements]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is taken]
+Then [expected outcome]
+```
+
+---
+
+**US-004: [Awaiting Requirements]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is taken]
+Then [expected outcome]
+```
+
+---
+
+### Priority 3 (P3) - Nice to Have
+
+**US-005: [Awaiting Requirements]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is taken]
+Then [expected outcome]
+```
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [NEEDS CLARIFICATION - Awaiting user requirements]
+
+- **Description:** The system shall [specific behavior]
+- **Priority:** P1
+- **Testability Criteria:** [How to verify this requirement]
+
+**FR-002:** [NEEDS CLARIFICATION - Awaiting user requirements]
+
+- **Description:** The system shall [specific behavior]
+- **Priority:** P1
+- **Testability Criteria:** [How to verify this requirement]
+
+**FR-003:** [NEEDS CLARIFICATION - Awaiting user requirements]
+
+- **Description:** The system shall [specific behavior]
+- **Priority:** P2
+- **Testability Criteria:** [How to verify this requirement]
+
+### Data Management
+
+**FR-004:** [NEEDS CLARIFICATION - Awaiting user requirements]
+
+- **Description:** The system shall [specific behavior]
+- **Priority:** P1
+- **Testability Criteria:** [How to verify this requirement]
+
+### User Interface
+
+**FR-005:** [NEEDS CLARIFICATION - Awaiting user requirements]
+
+- **Description:** The system shall [specific behavior]
+- **Priority:** P2
+- **Testability Criteria:** [How to verify this requirement]
+
+### Integration Points
+
+**FR-006:** [NEEDS CLARIFICATION - Awaiting user requirements]
+
+- **Description:** The system shall [specific behavior]
+- **Priority:** P2
+- **Testability Criteria:** [How to verify this requirement]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** Response Time
+
+- **Description:** [NEEDS CLARIFICATION] The system shall respond to user actions within [X] seconds under normal load conditions
+- **Measurement:** Average response time measured across [specify measurement period]
+- **Priority:** P1
+
+**NFR-002:** Throughput
+
+- **Description:** [NEEDS CLARIFICATION] The system shall support [X] concurrent users performing [specific actions]
+- **Measurement:** System maintains acceptable performance with specified user load
+- **Priority:** P1
+
+**NFR-003:** Data Processing
+
+- **Description:** [NEEDS CLARIFICATION] The system shall process [X] transactions/records per [time unit]
+- **Measurement:** Processing rate during peak operations
+- **Priority:** P2
+
+### Scalability Requirements
+
+**NFR-004:** User Growth
+
+- **Description:** [NEEDS CLARIFICATION] The system shall support growth to [X] total users within [timeframe]
+- **Measurement:** System performance metrics remain within acceptable thresholds at specified scale
+- **Priority:** P1
+
+**NFR-005:** Data Volume
+
+- **Description:** [NEEDS CLARIFICATION] The system shall handle up to [X] amount of data without performance degradation
+- **Measurement:** Response times and throughput at maximum data capacity
+- **Priority:** P2
+
+### Security Requirements
+
+**NFR-006:** Authentication
+
+- **Description:** [NEEDS CLARIFICATION] The system shall require user authentication before accessing protected features
+- **Measurement:** All protected endpoints reject unauthenticated requests
+- **Priority:** P1
+
+**NFR-007:** Authorization
+
+- **Description:** [NEEDS CLARIFICATION] The system shall enforce role-based access controls for all user actions
+- **Measurement:** Users can only perform actions authorized for their role
+- **Priority:** P1
+
+**NFR-008:** Data Protection
+
+- **Description:** [NEEDS CLARIFICATION] The system shall protect sensitive data at rest and in transit
+- **Measurement:** Security audit confirms data protection standards are met
+- **Priority:** P1
+
+**NFR-009:** Audit Logging
+
+- **Description:** [NEEDS CLARIFICATION] The system shall log all security-relevant events
+- **Measurement:** Audit logs capture specified events with required information
+- **Priority:** P2
+
+### Reliability Requirements
+
+**NFR-010:** Availability
+
+- **Description:** [NEEDS CLARIFICATION] The system shall maintain [X]% uptime during business hours
+- **Measurement:** Actual uptime percentage over specified period
+- **Priority:** P1
+
+**NFR-011:** Data Integrity
+
+- **Description:** [NEEDS CLARIFICATION] The system shall ensure zero data loss during normal operations
+- **Measurement:** Data validation checks confirm no data corruption or loss
+- **Priority:** P1
+
+**NFR-012:** Error Recovery
+
+- **Description:** [NEEDS CLARIFICATION] The system shall recover from errors within [X] minutes
+- **Measurement:** Time from error detection to full service restoration
+- **Priority:** P2
+
+### Usability Requirements
+
+**NFR-013:** Accessibility
+
+- **Description:** [NEEDS CLARIFICATION] The system shall comply with [accessibility standard, e.g., WCAG 2.1 Level AA]
+- **Measurement:** Accessibility audit confirms compliance
+- **Priority:** P1
+
+**NFR-014:** User Training
+
+- **Description:** [NEEDS CLARIFICATION] New users shall be able to complete core tasks within [X] minutes of initial use
+- **Measurement:** User testing demonstrates task completion within timeframe
+- **Priority:** P2
+
+**NFR-015:** Error Messaging
+
+- **Description:** The system shall provide clear, actionable error messages to users
+- **Measurement:** Error messages include explanation and next steps
+- **Priority:** P2
+
+### Maintainability Requirements
+
+**NFR-016:** Documentation
+
+- **Description:** [NEEDS CLARIFICATION] The system shall include user documentation for all features
+- **Measurement:** Documentation exists and covers specified feature set
+- **Priority:** P2
+
+**NFR-017:** Monitoring
+
+- **Description:** [NEEDS CLARIFICATION] The system shall provide operational monitoring capabilities
+- **Measurement:** Key system metrics are observable and measurable
+- **Priority:** P1
+
+### Compliance Requirements
+
+**NFR-018:** Regulatory Compliance
+
+- **Description:** [NEEDS CLARIFICATION] The system shall comply with [relevant regulations, e.g., GDPR, HIPAA, SOC 2]
+- **Measurement:** Compliance audit confirms adherence to specified regulations
+- **Priority:** P1
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Input Validation
+
+**EC-001:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** User provides invalid input format
+- **Expected Behavior:** System displays clear error message and does not process invalid data
+- **Recovery:** User can correct input and resubmit
+
+**EC-002:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** User provides input exceeding maximum allowed size/length
+- **Expected Behavior:** System rejects input with descriptive error
+- **Recovery:** User is informed of limits and can adjust input
+
+**EC-003:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** User provides empty/null required fields
+- **Expected Behavior:** System identifies missing required fields
+- **Recovery:** User is prompted to provide required information
+
+### Boundary Conditions
+
+**EC-004:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** System reaches maximum capacity (users, data, transactions)
+- **Expected Behavior:** System gracefully handles limit
+- **Recovery:** Clear messaging about limitations and alternatives
+
+**EC-005:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** Operation times out due to duration
+- **Expected Behavior:** System cancels operation and notifies user
+- **Recovery:** User can retry or choose alternative action
+
+**EC-006:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** Minimum boundary values are used
+- **Expected Behavior:** System processes minimum valid values correctly
+- **Recovery:** N/A - normal operation
+
+### Concurrent Operations
+
+**EC-007:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** Multiple users attempt to modify the same resource simultaneously
+- **Expected Behavior:** System handles conflicts appropriately
+- **Recovery:** Users are notified of conflicts with options to resolve
+
+**EC-008:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** User initiates same operation multiple times rapidly
+- **Expected Behavior:** System prevents duplicate processing
+- **Recovery:** User receives confirmation of single operation
+
+### System Resource Constraints
+
+**EC-009:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** System experiences temporary resource constraints
+- **Expected Behavior:** System queues or delays non-critical operations
+- **Recovery:** Operations complete when resources available
+
+**EC-010:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** External dependency is unavailable
+- **Expected Behavior:** System continues operating in degraded mode or provides clear error
+- **Recovery:** System resumes full functionality when dependency restored
+
+### Data Consistency
+
+**EC-011:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** Data synchronization fails between components
+- **Expected Behavior:** System detects inconsistency and alerts operators
+- **Recovery:** System provides reconciliation mechanism
+
+**EC-012:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** User session expires during operation
+- **Expected Behavior:** System safely terminates operation or saves state
+- **Recovery:** User can re-authenticate and resume or restart
+
+### Network & Connectivity
+
+**EC-013:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** Network connection is lost during operation
+- **Expected Behavior:** System detects connection loss and handles gracefully
+- **Recovery:** System retries operation or notifies user to retry
+
+**EC-014:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** Partial data transmission occurs
+- **Expected Behavior:** System validates data completeness
+- **Recovery:** System requests retransmission of incomplete data
+
+---
+
+## 6. Success Criteria
+
+### User Adoption Metrics
+
+**SC-001:** User Engagement
+
+- **Metric:** [NEEDS CLARIFICATION] [X]% of target users actively use the feature within [timeframe]
+- **Measurement Method:** Usage analytics tracking active users
+- **Target:** [Specific percentage and timeframe]
+
+**SC-002:** Task Completion Rate
+
+- **Metric:** [NEEDS CLARIFICATION] [X]% of users successfully complete primary user flows
+- **Measurement Method:** Analytics tracking successful task completions
+- **Target:** [Specific percentage]
+
+**SC-003:** User Satisfaction
+
+- **Metric:** [NEEDS CLARIFICATION] Average user satisfaction score of [X] out of [Y]
+- **Measurement Method:** User surveys and feedback collection
+- **Target:** [Specific score]
+
+### Performance Metrics
+
+**SC-004:** System Performance
+
+- **Metric:** [NEEDS CLARIFICATION] [X]% of requests complete within target response time
+- **Measurement Method:** Performance monitoring tools
+- **Target:** [Specific percentage and response time]
+
+**SC-005:** System Reliability
+
+- **Metric:** [NEEDS CLARIFICATION] System maintains [X]% uptime
+- **Measurement Method:** Uptime monitoring over specified period
+- **Target:** [Specific uptime percentage]
+
+**SC-006:** Error Rate
+
+- **Metric:** [NEEDS CLARIFICATION] Error rate below [X]% of total operations
+- **Measurement Method:** Error logging and monitoring
+- **Target:** [Specific error rate threshold]
+
+### Business Metrics
+
+**SC-007:** Business Value
+
+- **Metric:** [NEEDS CLARIFICATION] Feature generates/saves [specific business outcome]
+- **Measurement Method:** [Business metric tracking method]
+- **Target:** [Specific measurable outcome]
+
+**SC-008:** Efficiency Gain
+
+- **Metric:** [NEEDS CLARIFICATION] Reduces time for [specific task] by [X]%
+- **Measurement Method:** Time tracking before and after implementation
+- **Target:** [Specific percentage improvement]
+
+### Quality Metrics
+
+**SC-009:** Defect Rate
+
+- **Metric:** [NEEDS CLARIFICATION] Fewer than [X] critical defects in first [timeframe] post-launch
+- **Measurement Method:** Defect tracking system
+- **Target:** [Specific number of defects]
+
+**SC-010:** Accessibility Compliance
+
+- **Metric:** [NEEDS CLARIFICATION] Achieve [X]% accessibility compliance score
+- **Measurement Method:** Automated and manual accessibility testing
+- **Target:** [Specific compliance percentage]
+
+### Operational Metrics
+
+**SC-011:** Support Tickets
+
+- **Metric:** [NEEDS CLARIFICATION] Support tickets related to feature remain below [X] per [time period]
+- **Measurement Method:** Support ticket tracking system
+- **Target:** [Specific number of tickets]
+
+**SC-012:** Training Effectiveness
+
+- **Metric:** [NEEDS CLARIFICATION] [X]% of users require no additional support after initial training
+- **Measurement Method:** Support request tracking correlated with user training
+- **Target:** [Specific percentage]
+
+---
+
+## 7. Dependencies & Assumptions
+
+### Dependencies
+
+[NEEDS CLARIFICATION - Awaiting requirements to identify:]
+
+- External systems or services required
+- Data sources needed
+- Third-party integrations
+- Organizational resources required
+
+### Assumptions
+
+[NEEDS CLARIFICATION - Awaiting requirements to document:]
+
+- Assumed user behaviors
+- Assumed technical capabilities available
+- Assumed business constraints
+- Assumed timeline and resource availability
+
+---
+
+## 8. Out of Scope
+
+[NEEDS CLARIFICATION - To be defined based on user requirements]
+
+The following items are explicitly excluded from this feature:
+
+- [Item 1]
+- [Item 2]
+- [Item 3]
+
+---
+
+## 9. Open Questions
+
+**OQ-001:** [NEEDS CLARIFICATION] What are the specific user requirements for this feature?
+
+**OQ-002:** [NEEDS CLARIFICATION] Who are the target users and what are their roles?
+
+**OQ-003:** [NEEDS CLARIFICATION] What problem is this feature intended to solve?
+
+**OQ-004:** [NEEDS CLARIFICATION] What are the primary user workflows?
+
+**OQ-005:** [NEEDS CLARIFICATION] What are the regulatory or compliance requirements?
+
+**OQ-006:** [NEEDS CLARIFICATION] What are the performance expectations?
+
+**OQ-007:** [NEEDS CLARIFICATION] What are the integration requirements?
+
+**OQ-008:** [NEEDS CLARIFICATION] What is the expected timeline for delivery?
+
+---
+
+## 10. Approval & Sign-off
+
+| Role | Name | Signature | Date |
+|------|------|-----------|------|
+| Product Owner | [Name] | | |
+| Technical Lead | [Name] | | |
+| QA Lead | [Name] | | |
+| UX Lead | [Name] | | |
+| Stakeholder | [Name] | | |
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-012.md b/evals/datasets/real-specs/spec-012.md
new file mode 100644
index 0000000000..784322b313
--- /dev/null
+++ b/evals/datasets/real-specs/spec-012.md
@@ -0,0 +1,360 @@
+# Test Spec 12
+
+## Prompt
+
+Create a .NET Core web application for employee onboarding with document management
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature described in user input]
+
+**Feature Description:**
+No user requirements were provided in the input. This template demonstrates the structure and format for a comprehensive feature specification document.
+
+**Business Value:**
+[NEEDS CLARIFICATION - Business objectives and value proposition undefined]
+
+**Target Users:**
+[NEEDS CLARIFICATION - User personas and roles undefined]
+
+**Scope:**
+[NEEDS CLARIFICATION - Feature boundaries and limitations undefined]
+
+---
+
+## 2. User Stories
+
+### P1 (Must Have) Stories
+
+**US-001: [NEEDS CLARIFICATION - Core user story undefined]**
+
+- **As a** [role undefined]
+- **I want** [feature undefined]
+- **So that** [benefit undefined]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context undefined]
+- **When** [action undefined]
+- **Then** [expected outcome undefined]
+
+---
+
+### P2 (Should Have) Stories
+
+**US-002: [NEEDS CLARIFICATION - Secondary user story undefined]**
+
+- **As a** [role undefined]
+- **I want** [feature undefined]
+- **So that** [benefit undefined]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context undefined]
+- **When** [action undefined]
+- **Then** [expected outcome undefined]
+
+---
+
+### P3 (Nice to Have) Stories
+
+**US-003: [NEEDS CLARIFICATION - Enhancement user story undefined]**
+
+- **As a** [role undefined]
+- **I want** [feature undefined]
+- **So that** [benefit undefined]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context undefined]
+- **When** [action undefined]
+- **Then** [expected outcome undefined]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Core functionality undefined]
+
+- **Priority:** P1
+- **Testability:** [Test criteria undefined]
+
+**FR-002:** [NEEDS CLARIFICATION - Data handling requirements undefined]
+
+- **Priority:** P1
+- **Testability:** [Test criteria undefined]
+
+**FR-003:** [NEEDS CLARIFICATION - User interaction requirements undefined]
+
+- **Priority:** P2
+- **Testability:** [Test criteria undefined]
+
+**FR-004:** [NEEDS CLARIFICATION - Integration requirements undefined]
+
+- **Priority:** P2
+- **Testability:** [Test criteria undefined]
+
+**FR-005:** [NEEDS CLARIFICATION - Notification/feedback requirements undefined]
+
+- **Priority:** P3
+- **Testability:** [Test criteria undefined]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION - Response time requirements undefined]
+
+- **Metric:** [Undefined - e.g., "95% of requests complete within X seconds"]
+- **Priority:** P1
+
+**NFR-002:** [NEEDS CLARIFICATION - Throughput requirements undefined]
+
+- **Metric:** [Undefined - e.g., "System handles X concurrent users"]
+- **Priority:** P1
+
+**NFR-003:** [NEEDS CLARIFICATION - Load time requirements undefined]
+
+- **Metric:** [Undefined - e.g., "Initial load completes within X seconds"]
+- **Priority:** P2
+
+### Security Requirements
+
+**NFR-004:** [NEEDS CLARIFICATION - Authentication requirements undefined]
+
+- **Metric:** [Undefined - e.g., "All sensitive operations require authentication"]
+- **Priority:** P1
+
+**NFR-005:** [NEEDS CLARIFICATION - Data protection requirements undefined]
+
+- **Metric:** [Undefined - e.g., "Sensitive data encrypted at rest and in transit"]
+- **Priority:** P1
+
+**NFR-006:** [NEEDS CLARIFICATION - Authorization requirements undefined]
+
+- **Metric:** [Undefined - e.g., "Role-based access control implemented"]
+- **Priority:** P1
+
+### Scalability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION - User growth requirements undefined]
+
+- **Metric:** [Undefined - e.g., "System scales to X users within Y months"]
+- **Priority:** P2
+
+**NFR-008:** [NEEDS CLARIFICATION - Data volume requirements undefined]
+
+- **Metric:** [Undefined - e.g., "System handles X records without degradation"]
+- **Priority:** P2
+
+### Usability Requirements
+
+**NFR-009:** [NEEDS CLARIFICATION - Accessibility requirements undefined]
+
+- **Metric:** [Undefined - e.g., "Meets WCAG 2.1 Level AA standards"]
+- **Priority:** P2
+
+**NFR-010:** [NEEDS CLARIFICATION - Device compatibility requirements undefined]
+
+- **Metric:** [Undefined - e.g., "Functions on devices with screen sizes X to Y"]
+- **Priority:** P1
+
+### Reliability Requirements
+
+**NFR-011:** [NEEDS CLARIFICATION - Availability requirements undefined]
+
+- **Metric:** [Undefined - e.g., "99.9% uptime during business hours"]
+- **Priority:** P1
+
+**NFR-012:** [NEEDS CLARIFICATION - Error handling requirements undefined]
+
+- **Metric:** [Undefined - e.g., "All errors logged with user-friendly messages"]
+- **Priority:** P1
+
+---
+
+## 5. Edge Cases and Error Scenarios
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION - Minimum input scenarios undefined]
+
+- **Scenario:** [Undefined - e.g., "User enters minimum allowed value"]
+- **Expected Behavior:** [Undefined]
+
+**EC-002:** [NEEDS CLARIFICATION - Maximum input scenarios undefined]
+
+- **Scenario:** [Undefined - e.g., "User enters maximum allowed value"]
+- **Expected Behavior:** [Undefined]
+
+**EC-003:** [NEEDS CLARIFICATION - Empty state scenarios undefined]
+
+- **Scenario:** [Undefined - e.g., "User accesses feature with no data"]
+- **Expected Behavior:** [Undefined]
+
+**EC-004:** [NEEDS CLARIFICATION - Concurrent operation scenarios undefined]
+
+- **Scenario:** [Undefined - e.g., "Multiple users modify same resource"]
+- **Expected Behavior:** [Undefined]
+
+### Error Scenarios
+
+**ES-001:** [NEEDS CLARIFICATION - Invalid input handling undefined]
+
+- **Trigger:** [Undefined - e.g., "User submits invalid data format"]
+- **Expected Response:** [Undefined - e.g., "Clear error message with guidance"]
+
+**ES-002:** [NEEDS CLARIFICATION - Network failure handling undefined]
+
+- **Trigger:** [Undefined - e.g., "Connection lost during operation"]
+- **Expected Response:** [Undefined - e.g., "Graceful degradation with retry option"]
+
+**ES-003:** [NEEDS CLARIFICATION - Permission denial handling undefined]
+
+- **Trigger:** [Undefined - e.g., "User attempts unauthorized action"]
+- **Expected Response:** [Undefined - e.g., "Access denied message"]
+
+**ES-004:** [NEEDS CLARIFICATION - Timeout handling undefined]
+
+- **Trigger:** [Undefined - e.g., "Operation exceeds time limit"]
+- **Expected Response:** [Undefined - e.g., "Timeout notification with options"]
+
+**ES-005:** [NEEDS CLARIFICATION - Resource exhaustion handling undefined]
+
+- **Trigger:** [Undefined - e.g., "System reaches capacity limits"]
+- **Expected Response:** [Undefined - e.g., "Capacity message with alternatives"]
+
+---
+
+## 6. Success Criteria
+
+### Quantitative Metrics
+
+**SC-001:** [NEEDS CLARIFICATION - Adoption metrics undefined]
+
+- **Metric:** [Undefined - e.g., "X% of target users adopt feature within Y days"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+**SC-002:** [NEEDS CLARIFICATION - Usage metrics undefined]
+
+- **Metric:** [Undefined - e.g., "Average of X sessions per user per week"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+**SC-003:** [NEEDS CLARIFICATION - Performance metrics undefined]
+
+- **Metric:** [Undefined - e.g., "Average task completion time under X minutes"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+**SC-004:** [NEEDS CLARIFICATION - Error rate metrics undefined]
+
+- **Metric:** [Undefined - e.g., "Error rate below X%"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+### Qualitative Metrics
+
+**SC-005:** [NEEDS CLARIFICATION - User satisfaction metrics undefined]
+
+- **Metric:** [Undefined - e.g., "User satisfaction score of X/10 or higher"]
+- **Measurement Method:** [Undefined - e.g., "Post-feature survey"]
+- **Target:** [Undefined]
+
+**SC-006:** [NEEDS CLARIFICATION - Task completion metrics undefined]
+
+- **Metric:** [Undefined - e.g., "X% task completion rate without assistance"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+### Business Metrics
+
+**SC-007:** [NEEDS CLARIFICATION - Business impact metrics undefined]
+
+- **Metric:** [Undefined - e.g., "X% reduction in support tickets"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+**SC-008:** [NEEDS CLARIFICATION - ROI metrics undefined]
+
+- **Metric:** [Undefined - e.g., "Positive ROI within X months"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+---
+
+## 7. Assumptions and Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - Project assumptions undefined]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - External dependencies undefined]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - Project constraints undefined]
+
+---
+
+## 8. Out of Scope
+
+- [NEEDS CLARIFICATION - Explicitly excluded features undefined]
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION - What is the specific feature being requested?]
+2. [NEEDS CLARIFICATION - Who are the target users and their roles?]
+3. [NEEDS CLARIFICATION - What problem does this feature solve?]
+4. [NEEDS CLARIFICATION - What are the key user workflows?]
+5. [NEEDS CLARIFICATION - What are the integration requirements?]
+6. [NEEDS CLARIFICATION - What are the business objectives?]
+7. [NEEDS CLARIFICATION - What is the expected timeline?]
+8. [NEEDS CLARIFICATION - What are the resource constraints?]
+
+---
+
+## 10. Approval and Sign-off
+
+**Document Version:** 1.0 (Template)
+
+**Prepared By:** [Name/Role]
+
+**Date:** [Date]
+
+**Stakeholder Approvals:**
+
+- Product Owner: _________________ Date: _______
+- Engineering Lead: _________________ Date: _______
+- Design Lead: _________________ Date: _______
+- QA Lead: _________________ Date: _______
+
+---
+
+## Notes
+
+**This is a template document.** Please provide specific user requirements to generate a complete and actionable feature specification. The actual specification should include:
+
+- Detailed descriptions of desired functionality
+- User roles and personas
+- Specific workflows and interactions
+- Business objectives and constraints
+- Integration requirements
+- Any existing system context
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-013.md b/evals/datasets/real-specs/spec-013.md
new file mode 100644
index 0000000000..79918e8b3d
--- /dev/null
+++ b/evals/datasets/real-specs/spec-013.md
@@ -0,0 +1,443 @@
+# Test Spec 13
+
+## Prompt
+
+Build a Python Flask API for data processing and machine learning model serving
+
+## Generated Spec
+
+## Feature Specification Document
+
+### 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature description provided]
+
+**Description:**
+This section requires user input to define the feature being specified. The feature specification document framework is ready to document:
+
+- Core functionality and purpose
+- Target users and use cases
+- Business value and objectives
+- High-level scope and boundaries
+
+[NEEDS CLARIFICATION - Please provide details about the feature you want specified, including:
+
+- What problem does this feature solve?
+- Who are the primary users?
+- What are the main capabilities needed?
+- What business goals does this support?]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Critical
+
+**US-001: [NEEDS CLARIFICATION - Core user story]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit/value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context/precondition]
+When [action/trigger]
+Then [expected outcome]
+And [additional outcome]
+```
+
+---
+
+**US-002: [NEEDS CLARIFICATION - Essential functionality]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit/value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context/precondition]
+When [action/trigger]
+Then [expected outcome]
+```
+
+---
+
+### Priority 2 (P2) - Important
+
+**US-003: [NEEDS CLARIFICATION - Important enhancement]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit/value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context/precondition]
+When [action/trigger]
+Then [expected outcome]
+```
+
+---
+
+**US-004: [NEEDS CLARIFICATION - Supporting functionality]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit/value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context/precondition]
+When [action/trigger]
+Then [expected outcome]
+```
+
+---
+
+### Priority 3 (P3) - Nice to Have
+
+**US-005: [NEEDS CLARIFICATION - Additional feature]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit/value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context/precondition]
+When [action/trigger]
+Then [expected outcome]
+```
+
+---
+
+**US-006: [NEEDS CLARIFICATION - Optional enhancement]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit/value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context/precondition]
+When [action/trigger]
+Then [expected outcome]
+```
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Core functionality requirement]
+
+- **Description:** The system shall [specific capability]
+- **Priority:** P1
+- **Related User Stories:** US-001
+
+**FR-002:** [NEEDS CLARIFICATION - Data handling requirement]
+
+- **Description:** The system shall [specific capability]
+- **Priority:** P1
+- **Related User Stories:** US-001, US-002
+
+**FR-003:** [NEEDS CLARIFICATION - User interaction requirement]
+
+- **Description:** The system shall [specific capability]
+- **Priority:** P2
+- **Related User Stories:** US-003
+
+**FR-004:** [NEEDS CLARIFICATION - Validation requirement]
+
+- **Description:** The system shall [specific capability]
+- **Priority:** P2
+- **Related User Stories:** US-004
+
+**FR-005:** [NEEDS CLARIFICATION - Reporting/feedback requirement]
+
+- **Description:** The system shall [specific capability]
+- **Priority:** P3
+- **Related User Stories:** US-005
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-P)
+
+**NFR-P-001:** Response Time
+
+- The system shall respond to user actions within [X] seconds under normal load conditions
+- **Priority:** P1
+- **Measurement:** 95th percentile response time
+
+**NFR-P-002:** Throughput
+
+- The system shall support [X] concurrent users without performance degradation
+- **Priority:** P1
+- **Measurement:** Load testing results
+
+**NFR-P-003:** Data Processing
+
+- The system shall process [X] transactions per second
+- **Priority:** P2
+- **Measurement:** Transaction processing metrics
+
+---
+
+### Security (NFR-S)
+
+**NFR-S-001:** Authentication
+
+- The system shall require user authentication for all protected operations
+- **Priority:** P1
+- **Measurement:** Security audit compliance
+
+**NFR-S-002:** Authorization
+
+- The system shall enforce role-based access controls for all sensitive operations
+- **Priority:** P1
+- **Measurement:** Access control testing results
+
+**NFR-S-003:** Data Protection
+
+- The system shall protect sensitive data at rest and in transit
+- **Priority:** P1
+- **Measurement:** Security scanning results
+
+**NFR-S-004:** Audit Logging
+
+- The system shall log all security-relevant events with timestamp, user, and action details
+- **Priority:** P2
+- **Measurement:** Audit log completeness
+
+---
+
+### Scalability (NFR-SC)
+
+**NFR-SC-001:** Horizontal Scaling
+
+- The system shall support scaling to handle [X]% increase in load without architectural changes
+- **Priority:** P2
+- **Measurement:** Load testing at scale
+
+**NFR-SC-002:** Data Volume
+
+- The system shall handle up to [X] records without performance degradation
+- **Priority:** P2
+- **Measurement:** Benchmark testing results
+
+---
+
+### Reliability (NFR-R)
+
+**NFR-R-001:** Availability
+
+- The system shall maintain [X]% uptime during business hours
+- **Priority:** P1
+- **Measurement:** Uptime monitoring
+
+**NFR-R-002:** Error Handling
+
+- The system shall handle error conditions gracefully without data loss
+- **Priority:** P1
+- **Measurement:** Error recovery testing
+
+**NFR-R-003:** Data Integrity
+
+- The system shall maintain data consistency across all operations
+- **Priority:** P1
+- **Measurement:** Data validation testing
+
+---
+
+### Usability (NFR-U)
+
+**NFR-U-001:** Accessibility
+
+- The system shall comply with WCAG 2.1 Level AA accessibility standards
+- **Priority:** P1
+- **Measurement:** Accessibility audit results
+
+**NFR-U-002:** Learnability
+
+- New users shall be able to complete core tasks within [X] minutes of first use
+- **Priority:** P2
+- **Measurement:** User testing observations
+
+**NFR-U-003:** Error Messages
+
+- The system shall provide clear, actionable error messages for all user-facing errors
+- **Priority:** P2
+- **Measurement:** User comprehension testing
+
+---
+
+### Maintainability (NFR-M)
+
+**NFR-M-001:** Logging
+
+- The system shall log sufficient information to diagnose issues in production
+- **Priority:** P1
+- **Measurement:** Incident resolution time
+
+**NFR-M-002:** Monitoring
+
+- The system shall expose health check endpoints and operational metrics
+- **Priority:** P1
+- **Measurement:** Monitoring coverage
+
+---
+
+## 5. Edge Cases
+
+### EC-001: [NEEDS CLARIFICATION - Boundary condition]
+
+**Scenario:** [Description of edge case]
+**Expected Behavior:** The system shall [specific handling]
+**Priority:** P1
+
+### EC-002: [NEEDS CLARIFICATION - Invalid input]
+
+**Scenario:** User provides invalid or malformed input
+**Expected Behavior:** The system shall reject the input and provide clear feedback
+**Priority:** P1
+
+### EC-003: [NEEDS CLARIFICATION - Resource limits]
+
+**Scenario:** System reaches maximum capacity/limits
+**Expected Behavior:** The system shall [specific handling]
+**Priority:** P2
+
+### EC-004: [NEEDS CLARIFICATION - Concurrent operations]
+
+**Scenario:** Multiple users attempt to modify the same resource simultaneously
+**Expected Behavior:** The system shall [specific handling] to prevent conflicts
+**Priority:** P2
+
+### EC-005: [NEEDS CLARIFICATION - Empty/null states]
+
+**Scenario:** No data available or null values encountered
+**Expected Behavior:** The system shall display appropriate empty state messaging
+**Priority:** P2
+
+### EC-006: [NEEDS CLARIFICATION - Network/service failure]
+
+**Scenario:** External dependency becomes unavailable
+**Expected Behavior:** The system shall degrade gracefully and notify users appropriately
+**Priority:** P1
+
+### EC-007: [NEEDS CLARIFICATION - Session timeout]
+
+**Scenario:** User session expires during active use
+**Expected Behavior:** The system shall preserve user work and prompt for re-authentication
+**Priority:** P2
+
+---
+
+## 6. Success Criteria
+
+**SC-001: User Adoption** [NEEDS CLARIFICATION - Target metrics]
+
+- **Metric:** [X]% of target users actively using the feature within [timeframe]
+- **Measurement Method:** User analytics
+- **Target:** [Specific number/percentage]
+
+**SC-002: Task Completion Rate** [NEEDS CLARIFICATION - Target metrics]
+
+- **Metric:** [X]% of users successfully complete primary workflows
+- **Measurement Method:** User behavior tracking
+- **Target:** [Specific percentage]
+
+**SC-003: Performance Benchmarks** [NEEDS CLARIFICATION - Target metrics]
+
+- **Metric:** [X]% of operations complete within target time
+- **Measurement Method:** Performance monitoring
+- **Target:** [Specific percentage]
+
+**SC-004: Error Rate** [NEEDS CLARIFICATION - Target metrics]
+
+- **Metric:** System error rate below [X]%
+- **Measurement Method:** Error logging and monitoring
+- **Target:** [Specific percentage]
+
+**SC-005: User Satisfaction** [NEEDS CLARIFICATION - Target metrics]
+
+- **Metric:** User satisfaction score of [X]/10 or higher
+- **Measurement Method:** User surveys/feedback
+- **Target:** [Specific score]
+
+**SC-006: Business Impact** [NEEDS CLARIFICATION - Target metrics]
+
+- **Metric:** [Specific business metric improvement]
+- **Measurement Method:** Business analytics
+- **Target:** [Specific improvement percentage]
+
+---
+
+## 7. Dependencies and Constraints
+
+### Dependencies [NEEDS CLARIFICATION]
+
+- External system integration requirements
+- Data availability requirements
+- User authentication/authorization systems
+- Third-party service dependencies
+
+### Constraints [NEEDS CLARIFICATION]
+
+- Regulatory compliance requirements
+- Budget limitations
+- Timeline constraints
+- Resource availability
+
+---
+
+## 8. Assumptions [NEEDS CLARIFICATION]
+
+- User technical proficiency level
+- Available infrastructure
+- Data quality and availability
+- User access patterns
+
+---
+
+## 9. Out of Scope
+
+[NEEDS CLARIFICATION - Define what is explicitly not included in this feature]
+
+---
+
+## 10. Open Questions
+
+1. [NEEDS CLARIFICATION - What is the specific feature being built?]
+2. [NEEDS CLARIFICATION - Who are the target users?]
+3. [NEEDS CLARIFICATION - What are the specific business objectives?]
+4. [NEEDS CLARIFICATION - What are the success metrics?]
+5. [NEEDS CLARIFICATION - Are there any regulatory or compliance requirements?]
+6. [NEEDS CLARIFICATION - What is the expected timeline?]
+
+---
+
+## Document Control
+
+| Version | Date | Author | Changes |
+|---------|------|--------|---------|
+| 0.1 | [Date] | [Author] | Initial template created - awaiting requirements |
+
+---
+
+**Note:** This document requires substantial input to complete. Please provide detailed information about the feature you want specified, and I will populate all sections with specific, measurable, and testable requirements.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-014.md b/evals/datasets/real-specs/spec-014.md
new file mode 100644
index 0000000000..a7b018b4e7
--- /dev/null
+++ b/evals/datasets/real-specs/spec-014.md
@@ -0,0 +1,348 @@
+# Test Spec 14
+
+## Prompt
+
+Develop a Node.js/Express backend with real-time WebSocket communication
+
+## Generated Spec
+
+## Feature Specification Document
+
+### 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - Feature name not provided]
+
+**Description:**
+[NEEDS CLARIFICATION - No user requirements were provided in the input. This specification template is ready to be populated once user requirements are submitted.]
+
+**Purpose:**
+[NEEDS CLARIFICATION - Purpose and objectives need to be defined based on user requirements]
+
+**Target Users:**
+[NEEDS CLARIFICATION - User segments and personas need to be identified]
+
+**Business Value:**
+[NEEDS CLARIFICATION - Expected business impact and value proposition need to be defined]
+
+---
+
+## 2. User Stories
+
+### P1 - Critical Priority
+
+**US-001: [NEEDS CLARIFICATION - Story Title]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+1. **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+2. **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+---
+
+**US-002: [NEEDS CLARIFICATION - Story Title]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+1. **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+---
+
+### P2 - High Priority
+
+**US-003: [NEEDS CLARIFICATION - Story Title]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+1. **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+---
+
+**US-004: [NEEDS CLARIFICATION - Story Title]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+1. **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+---
+
+### P3 - Medium Priority
+
+**US-005: [NEEDS CLARIFICATION - Story Title]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+1. **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Functional requirement description]
+
+- **Measurable Criteria:** [Specific testable condition]
+- **Priority:** [P1/P2/P3]
+
+**FR-002:** [NEEDS CLARIFICATION - Functional requirement description]
+
+- **Measurable Criteria:** [Specific testable condition]
+- **Priority:** [P1/P2/P3]
+
+**FR-003:** [NEEDS CLARIFICATION - Functional requirement description]
+
+- **Measurable Criteria:** [Specific testable condition]
+- **Priority:** [P1/P2/P3]
+
+**FR-004:** [NEEDS CLARIFICATION - Functional requirement description]
+
+- **Measurable Criteria:** [Specific testable condition]
+- **Priority:** [P1/P2/P3]
+
+**FR-005:** [NEEDS CLARIFICATION - Functional requirement description]
+
+- **Measurable Criteria:** [Specific testable condition]
+- **Priority:** [P1/P2/P3]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION - Performance requirement]
+
+- **Metric:** [Specific measurement]
+- **Target:** [Numerical target value]
+- **Measurement Method:** [How this will be verified]
+
+**NFR-002:** [NEEDS CLARIFICATION - Response time requirement]
+
+- **Metric:** [Specific measurement]
+- **Target:** [Numerical target value]
+- **Measurement Method:** [How this will be verified]
+
+### Security Requirements
+
+**NFR-003:** [NEEDS CLARIFICATION - Security requirement]
+
+- **Compliance Standard:** [Applicable standards]
+- **Verification Method:** [How compliance will be tested]
+
+**NFR-004:** [NEEDS CLARIFICATION - Data protection requirement]
+
+- **Compliance Standard:** [Applicable standards]
+- **Verification Method:** [How compliance will be tested]
+
+### Scalability Requirements
+
+**NFR-005:** [NEEDS CLARIFICATION - Scalability requirement]
+
+- **Metric:** [Specific measurement]
+- **Target:** [Numerical target value]
+- **Measurement Method:** [How this will be verified]
+
+### Usability Requirements
+
+**NFR-006:** [NEEDS CLARIFICATION - Usability requirement]
+
+- **Metric:** [Specific measurement]
+- **Target:** [Numerical target value]
+- **Measurement Method:** [How this will be verified]
+
+### Reliability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION - Availability requirement]
+
+- **Metric:** [Uptime percentage or MTBF]
+- **Target:** [Numerical target value]
+- **Measurement Method:** [How this will be verified]
+
+### Compatibility Requirements
+
+**NFR-008:** [NEEDS CLARIFICATION - Platform/device compatibility]
+
+- **Supported Platforms:** [List of platforms]
+- **Verification Method:** [How compatibility will be tested]
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION - Edge case description]
+
+- **Scenario:** [Describe the boundary condition]
+- **Expected Behavior:** [How the system should handle it]
+- **User Feedback:** [What message/feedback is provided]
+
+**EC-002:** [NEEDS CLARIFICATION - Edge case description]
+
+- **Scenario:** [Describe the boundary condition]
+- **Expected Behavior:** [How the system should handle it]
+- **User Feedback:** [What message/feedback is provided]
+
+### Error Scenarios
+
+**EC-003:** [NEEDS CLARIFICATION - Error scenario]
+
+- **Trigger Condition:** [What causes this error]
+- **Expected Behavior:** [How the system should respond]
+- **Recovery Path:** [How the user can recover]
+- **User Feedback:** [Error message or guidance provided]
+
+**EC-004:** [NEEDS CLARIFICATION - Error scenario]
+
+- **Trigger Condition:** [What causes this error]
+- **Expected Behavior:** [How the system should respond]
+- **Recovery Path:** [How the user can recover]
+- **User Feedback:** [Error message or guidance provided]
+
+### Data Validation Edge Cases
+
+**EC-005:** [NEEDS CLARIFICATION - Validation edge case]
+
+- **Invalid Input Type:** [Description]
+- **System Response:** [Expected handling]
+- **User Guidance:** [Feedback provided]
+
+---
+
+## 6. Success Criteria
+
+**SC-001:** [NEEDS CLARIFICATION - Success metric]
+
+- **Baseline:** [Current state measurement]
+- **Target:** [Desired outcome with numerical value]
+- **Measurement Period:** [Timeframe for measurement]
+- **Measurement Method:** [How this will be tracked]
+
+**SC-002:** [NEEDS CLARIFICATION - Success metric]
+
+- **Baseline:** [Current state measurement]
+- **Target:** [Desired outcome with numerical value]
+- **Measurement Period:** [Timeframe for measurement]
+- **Measurement Method:** [How this will be tracked]
+
+**SC-003:** [NEEDS CLARIFICATION - Success metric]
+
+- **Baseline:** [Current state measurement]
+- **Target:** [Desired outcome with numerical value]
+- **Measurement Period:** [Timeframe for measurement]
+- **Measurement Method:** [How this will be tracked]
+
+**SC-004:** [NEEDS CLARIFICATION - Success metric]
+
+- **Baseline:** [Current state measurement]
+- **Target:** [Desired outcome with numerical value]
+- **Measurement Period:** [Timeframe for measurement]
+- **Measurement Method:** [How this will be tracked]
+
+**SC-005:** [NEEDS CLARIFICATION - Success metric]
+
+- **Baseline:** [Current state measurement]
+- **Target:** [Desired outcome with numerical value]
+- **Measurement Period:** [Timeframe for measurement]
+- **Measurement Method:** [How this will be tracked]
+
+---
+
+## 7. Assumptions & Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - List assumptions about user behavior, environment, etc.]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - List external dependencies, prerequisites, or required integrations]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - List any known constraints or limitations]
+
+---
+
+## 8. Out of Scope
+
+[NEEDS CLARIFICATION - Explicitly define what is NOT included in this feature]
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION - Question requiring stakeholder input]
+2. [NEEDS CLARIFICATION - Question requiring stakeholder input]
+3. [NEEDS CLARIFICATION - Question requiring stakeholder input]
+
+---
+
+## 10. Approval & Sign-off
+
+| Role | Name | Date | Signature |
+|------|------|------|-----------|
+| Product Owner | | | |
+| Business Stakeholder | | | |
+| QA Lead | | | |
+| Development Lead | | | |
+
+---
+
+**Document Version:** 1.0
+**Last Updated:** [Date]
+**Created By:** [Author]
+**Status:** DRAFT - Awaiting User Requirements
+
+---
+
+## Instructions for Completion
+
+**To complete this specification, please provide:**
+
+1. **Feature description**: What problem are you solving? What capability should users have?
+2. **Target users**: Who will use this feature?
+3. **User goals**: What should users be able to accomplish?
+4. **Business objectives**: What business outcomes are expected?
+5. **Constraints**: Any budget, timeline, or regulatory constraints?
+6. **Integration needs**: Does this feature need to work with existing systems?
+7. **Success metrics**: How will you measure if this feature is successful?
+
+Once requirements are provided, this template will be populated with specific, measurable, and testable specifications.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-015.md b/evals/datasets/real-specs/spec-015.md
new file mode 100644
index 0000000000..00bc981aca
--- /dev/null
+++ b/evals/datasets/real-specs/spec-015.md
@@ -0,0 +1,322 @@
+# Test Spec 15
+
+## Prompt
+
+Build a HIPAA-compliant healthcare appointment scheduling system
+
+## Generated Spec
+
+## Feature Specification Document
+
+### 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No user requirements provided]
+
+**Description:**
+No user input was provided for this feature specification. This document serves as a template structure that should be populated once user requirements are submitted.
+
+**Problem Statement:** [NEEDS CLARIFICATION]
+
+**Target Users:** [NEEDS CLARIFICATION]
+
+**Business Value:** [NEEDS CLARIFICATION]
+
+---
+
+## 2. User Stories
+
+### P1 - Critical Priority
+
+**US-001: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+**US-002: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+### P2 - High Priority
+
+**US-003: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+**US-004: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+### P3 - Medium Priority
+
+**US-005: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION] - The system shall [specific functionality]
+
+**FR-002:** [NEEDS CLARIFICATION] - The system shall [specific functionality]
+
+**FR-003:** [NEEDS CLARIFICATION] - The system shall [specific functionality]
+
+**FR-004:** [NEEDS CLARIFICATION] - The system shall [specific functionality]
+
+**FR-005:** [NEEDS CLARIFICATION] - The system shall [specific functionality]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION] - The system shall respond to user actions within [X seconds/milliseconds] under normal load conditions
+
+**NFR-002:** [NEEDS CLARIFICATION] - The system shall support [X] concurrent users without performance degradation
+
+**NFR-003:** [NEEDS CLARIFICATION] - The system shall process [X] transactions per second
+
+---
+
+### Security Requirements
+
+**NFR-004:** [NEEDS CLARIFICATION] - The system shall authenticate all users before granting access
+
+**NFR-005:** [NEEDS CLARIFICATION] - The system shall encrypt sensitive data at rest and in transit
+
+**NFR-006:** [NEEDS CLARIFICATION] - The system shall maintain audit logs of all user actions for [X] duration
+
+---
+
+### Scalability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION] - The system shall scale to accommodate [X%] growth in user base over [time period]
+
+**NFR-008:** [NEEDS CLARIFICATION] - The system shall handle peak loads of [X times] normal traffic
+
+---
+
+### Availability Requirements
+
+**NFR-009:** [NEEDS CLARIFICATION] - The system shall maintain [X%] uptime during business hours
+
+**NFR-010:** [NEEDS CLARIFICATION] - The system shall recover from failures within [X] minutes
+
+---
+
+### Usability Requirements
+
+**NFR-011:** [NEEDS CLARIFICATION] - The system shall be accessible to users with disabilities per [specific standard]
+
+**NFR-012:** [NEEDS CLARIFICATION] - New users shall be able to complete [core task] within [X] minutes without training
+
+---
+
+### Compatibility Requirements
+
+**NFR-013:** [NEEDS CLARIFICATION] - The system shall function on [specified browsers/devices/platforms]
+
+**NFR-014:** [NEEDS CLARIFICATION] - The system shall integrate with [external systems] via standard protocols
+
+---
+
+## 5. Edge Cases
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION] - System behavior when [minimum input value] is provided
+
+**EC-002:** [NEEDS CLARIFICATION] - System behavior when [maximum input value] is provided
+
+**EC-003:** [NEEDS CLARIFICATION] - System behavior when [empty/null input] is provided
+
+**EC-004:** [NEEDS CLARIFICATION] - System behavior when [special characters/invalid format] is provided
+
+---
+
+### Error Scenarios
+
+**EC-005:** [NEEDS CLARIFICATION] - System behavior when [external service] is unavailable
+
+**EC-006:** [NEEDS CLARIFICATION] - System behavior when user session expires during operation
+
+**EC-007:** [NEEDS CLARIFICATION] - System behavior when [resource limit] is exceeded
+
+**EC-008:** [NEEDS CLARIFICATION] - System behavior during network interruption
+
+**EC-009:** [NEEDS CLARIFICATION] - System behavior with concurrent conflicting operations
+
+**EC-010:** [NEEDS CLARIFICATION] - System behavior when duplicate entries are attempted
+
+---
+
+### Data Integrity Scenarios
+
+**EC-011:** [NEEDS CLARIFICATION] - System behavior when data format changes mid-process
+
+**EC-012:** [NEEDS CLARIFICATION] - System behavior with missing required dependencies
+
+---
+
+## 6. Success Criteria
+
+### User Adoption Metrics
+
+**SC-001:** [NEEDS CLARIFICATION] - Achieve [X%] user adoption rate within [timeframe]
+
+**SC-002:** [NEEDS CLARIFICATION] - Achieve [X%] user satisfaction score (measured via survey/feedback)
+
+**SC-003:** [NEEDS CLARIFICATION] - Reduce user support tickets related to [area] by [X%]
+
+---
+
+### Performance Metrics
+
+**SC-004:** [NEEDS CLARIFICATION] - Reduce time to complete [task] from [current time] to [target time]
+
+**SC-005:** [NEEDS CLARIFICATION] - Process [X] transactions per time period successfully
+
+**SC-006:** [NEEDS CLARIFICATION] - Maintain page load time under [X] seconds for [Y%] of requests
+
+---
+
+### Business Metrics
+
+**SC-007:** [NEEDS CLARIFICATION] - Increase [business metric] by [X%] within [timeframe]
+
+**SC-008:** [NEEDS CLARIFICATION] - Reduce operational costs by [X amount/percentage]
+
+**SC-009:** [NEEDS CLARIFICATION] - Achieve ROI of [X%] within [timeframe]
+
+---
+
+### Quality Metrics
+
+**SC-010:** [NEEDS CLARIFICATION] - Maintain defect rate below [X] per [time period/release]
+
+**SC-011:** [NEEDS CLARIFICATION] - Achieve [X%] test coverage for critical paths
+
+**SC-012:** [NEEDS CLARIFICATION] - Zero critical security vulnerabilities at launch
+
+---
+
+## 7. Assumptions and Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION] - Assumption about user environment
+- [NEEDS CLARIFICATION] - Assumption about data availability
+- [NEEDS CLARIFICATION] - Assumption about user capabilities
+
+### Dependencies
+
+- [NEEDS CLARIFICATION] - Dependency on external system/service
+- [NEEDS CLARIFICATION] - Dependency on internal team/resource
+- [NEEDS CLARIFICATION] - Dependency on third-party provider
+
+---
+
+## 8. Out of Scope
+
+- [NEEDS CLARIFICATION] - Feature/capability explicitly not included in this release
+- [NEEDS CLARIFICATION] - Future enhancement consideration
+- [NEEDS CLARIFICATION] - Related but separate initiative
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION] - Question about user requirements
+2. [NEEDS CLARIFICATION] - Question about business rules
+3. [NEEDS CLARIFICATION] - Question about integration points
+4. [NEEDS CLARIFICATION] - Question about data handling
+5. [NEEDS CLARIFICATION] - Question about user workflows
+
+---
+
+## 10. Approval and Sign-off
+
+| Role | Name | Date | Signature |
+|------|------|------|-----------|
+| Product Owner | [NEEDS CLARIFICATION] | | |
+| Business Stakeholder | [NEEDS CLARIFICATION] | | |
+| Technical Lead | [NEEDS CLARIFICATION] | | |
+| QA Lead | [NEEDS CLARIFICATION] | | |
+
+---
+
+**Document Version:** 1.0
+**Last Updated:** [Date]
+**Next Review Date:** [NEEDS CLARIFICATION]
+
+---
+
+## Instructions for Completion
+
+To complete this feature specification, please provide detailed user requirements including:
+
+1. **Feature description and purpose**
+2. **Target user personas and their needs**
+3. **Core functionality and capabilities required**
+4. **Business goals and expected outcomes**
+5. **Known constraints or limitations**
+6. **Integration requirements with existing systems**
+7. **Regulatory or compliance requirements**
+8. **Timeline and priority considerations**
+
+Once requirements are provided, this template will be populated with specific, measurable, and testable specifications.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-016.md b/evals/datasets/real-specs/spec-016.md
new file mode 100644
index 0000000000..38b98cbe66
--- /dev/null
+++ b/evals/datasets/real-specs/spec-016.md
@@ -0,0 +1,325 @@
+# Test Spec 16
+
+## Prompt
+
+Create a financial transaction processing system with audit trails and compliance reporting
+
+## Generated Spec
+
+## Feature Specification Document
+
+### 1. Overview Section
+
+[NEEDS CLARIFICATION: No user requirements have been provided in the input]
+
+**Feature Name:** [Pending User Input]
+
+**Description:** This specification document is ready to be populated with detailed requirements once the user provides their feature requirements. The template below demonstrates the structure and level of detail that will be included.
+
+**Target Users:** [To be determined based on user input]
+
+**Business Value:** [To be determined based on user input]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Must Have
+
+**US-001: [Story Title]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is performed]
+Then [expected outcome]
+
+Given [alternative context]
+When [action is performed]
+Then [expected outcome]
+```
+
+**US-002: [Story Title]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is performed]
+Then [expected outcome]
+```
+
+### Priority 2 (P2) - Should Have
+
+**US-003: [Story Title]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is performed]
+Then [expected outcome]
+```
+
+### Priority 3 (P3) - Nice to Have
+
+**US-004: [Story Title]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is performed]
+Then [expected outcome]
+```
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [Requirement description]
+
+- **Priority:** P1
+- **Testability:** [How to verify this requirement]
+
+**FR-002:** [Requirement description]
+
+- **Priority:** P1
+- **Testability:** [How to verify this requirement]
+
+### Data Management
+
+**FR-003:** [Requirement description]
+
+- **Priority:** P2
+- **Testability:** [How to verify this requirement]
+
+### User Interface
+
+**FR-004:** [Requirement description]
+
+- **Priority:** P1
+- **Testability:** [How to verify this requirement]
+
+### Integration Points
+
+**FR-005:** [Requirement description]
+
+- **Priority:** P2
+- **Testability:** [How to verify this requirement]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-P)
+
+**NFR-P-001:** [Performance requirement with specific metrics]
+
+- **Metric:** [Measurable target]
+- **Testing Method:** [How to measure]
+
+**NFR-P-002:** [Performance requirement with specific metrics]
+
+- **Metric:** [Measurable target]
+- **Testing Method:** [How to measure]
+
+### Security (NFR-S)
+
+**NFR-S-001:** [Security requirement]
+
+- **Compliance:** [Relevant standards]
+- **Testing Method:** [How to verify]
+
+**NFR-S-002:** [Security requirement]
+
+- **Compliance:** [Relevant standards]
+- **Testing Method:** [How to verify]
+
+### Scalability (NFR-SC)
+
+**NFR-SC-001:** [Scalability requirement]
+
+- **Metric:** [Measurable target]
+- **Testing Method:** [How to measure]
+
+### Usability (NFR-U)
+
+**NFR-U-001:** [Usability requirement]
+
+- **Metric:** [Measurable target]
+- **Testing Method:** [How to measure]
+
+### Reliability (NFR-R)
+
+**NFR-R-001:** [Reliability requirement]
+
+- **Metric:** [Measurable target (e.g., 99.9% uptime)]
+- **Testing Method:** [How to measure]
+
+### Maintainability (NFR-M)
+
+**NFR-M-001:** [Maintainability requirement]
+
+- **Metric:** [Measurable target]
+- **Testing Method:** [How to measure]
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Boundary Conditions
+
+**EC-001: [Edge case description]**
+
+- **Scenario:** [What happens at the boundary]
+- **Expected Behavior:** [How the system should respond]
+- **Test Case:** [How to reproduce]
+
+**EC-002: [Edge case description]**
+
+- **Scenario:** [What happens at the boundary]
+- **Expected Behavior:** [How the system should respond]
+- **Test Case:** [How to reproduce]
+
+### Error Scenarios
+
+**ES-001: [Error scenario]**
+
+- **Trigger:** [What causes this error]
+- **Expected Behavior:** [How the system should handle it]
+- **User Feedback:** [What message/feedback the user receives]
+
+**ES-002: [Error scenario]**
+
+- **Trigger:** [What causes this error]
+- **Expected Behavior:** [How the system should handle it]
+- **User Feedback:** [What message/feedback the user receives]
+
+### Data Validation
+
+**DV-001: [Validation scenario]**
+
+- **Invalid Input:** [Description of invalid data]
+- **Expected Behavior:** [How the system validates and responds]
+
+---
+
+## 6. Success Criteria
+
+### Quantitative Metrics
+
+**SC-001:** [Measurable outcome with specific target]
+
+- **Baseline:** [Current state]
+- **Target:** [Desired state]
+- **Measurement Method:** [How to track]
+
+**SC-002:** [Measurable outcome with specific target]
+
+- **Baseline:** [Current state]
+- **Target:** [Desired state]
+- **Measurement Method:** [How to track]
+
+**SC-003:** [Measurable outcome with specific target]
+
+- **Baseline:** [Current state]
+- **Target:** [Desired state]
+- **Measurement Method:** [How to track]
+
+### Qualitative Metrics
+
+**SC-004:** [User satisfaction or experience metric]
+
+- **Measurement Method:** [Survey, feedback, etc.]
+- **Target:** [Desired outcome]
+
+**SC-005:** [Business value metric]
+
+- **Measurement Method:** [How to evaluate]
+- **Target:** [Desired outcome]
+
+### Acceptance Criteria for Release
+
+**SC-006:** All P1 user stories must be completed and verified
+**SC-007:** All functional requirements marked as P1 must pass testing
+**SC-008:** All non-functional requirements must meet specified thresholds
+**SC-009:** No critical or high-priority defects remain unresolved
+**SC-010:** [Feature-specific acceptance criterion]
+
+---
+
+## 7. Assumptions & Dependencies
+
+### Assumptions
+
+- [Assumption 1]
+- [Assumption 2]
+- [Assumption 3]
+
+### Dependencies
+
+- [Dependency 1]
+- [Dependency 2]
+- [Dependency 3]
+
+---
+
+## 8. Out of Scope
+
+The following items are explicitly out of scope for this feature:
+
+- [Out of scope item 1]
+- [Out of scope item 2]
+- [Out of scope item 3]
+
+---
+
+## 9. Open Questions
+
+**[NEEDS CLARIFICATION]**
+
+1. [Question requiring stakeholder input]
+2. [Question requiring stakeholder input]
+3. [Question requiring stakeholder input]
+
+---
+
+## Document Information
+
+| Field | Value |
+|-------|-------|
+| **Document Version** | 1.0 (Draft) |
+| **Last Updated** | [Date] |
+| **Author** | [Name] |
+| **Stakeholders** | [List] |
+| **Status** | Awaiting User Requirements |
+
+---
+
+**Note:** This is a template structure. Please provide your specific user requirements to generate a complete, detailed feature specification document tailored to your needs.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-017.md b/evals/datasets/real-specs/spec-017.md
new file mode 100644
index 0000000000..41aa5a3a07
--- /dev/null
+++ b/evals/datasets/real-specs/spec-017.md
@@ -0,0 +1,370 @@
+# Test Spec 17
+
+## Prompt
+
+Develop a legacy system integration middleware using REST and SOAP protocols
+
+## Generated Spec
+
+## Feature Specification Document
+
+### 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - Feature name not provided]
+
+**Description:**
+[NEEDS CLARIFICATION - No user requirements were provided in the input. This specification document template is ready to be populated once specific feature requirements are shared.]
+
+**Target Users:** [NEEDS CLARIFICATION]
+
+**Business Value:** [NEEDS CLARIFICATION]
+
+**Scope:** [NEEDS CLARIFICATION]
+
+---
+
+## 2. User Stories
+
+### P1 (Must Have) Stories
+
+#### US-001: [NEEDS CLARIFICATION]
+
+**As a** [user role]
+**I want** [feature capability]
+**So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action taken]
+ **Then** [expected outcome]
+
+- **Given** [initial context]
+ **When** [action taken]
+ **Then** [expected outcome]
+
+**Priority:** P1
+**Story Points:** [TBD]
+
+---
+
+#### US-002: [NEEDS CLARIFICATION]
+
+**As a** [user role]
+**I want** [feature capability]
+**So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action taken]
+ **Then** [expected outcome]
+
+**Priority:** P1
+**Story Points:** [TBD]
+
+---
+
+### P2 (Should Have) Stories
+
+#### US-003: [NEEDS CLARIFICATION]
+
+**As a** [user role]
+**I want** [feature capability]
+**So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action taken]
+ **Then** [expected outcome]
+
+**Priority:** P2
+**Story Points:** [TBD]
+
+---
+
+#### US-004: [NEEDS CLARIFICATION]
+
+**As a** [user role]
+**I want** [feature capability]
+**So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action taken]
+ **Then** [expected outcome]
+
+**Priority:** P2
+**Story Points:** [TBD]
+
+---
+
+### P3 (Nice to Have) Stories
+
+#### US-005: [NEEDS CLARIFICATION]
+
+**As a** [user role]
+**I want** [feature capability]
+**So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action taken]
+ **Then** [expected outcome]
+
+**Priority:** P3
+**Story Points:** [TBD]
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [NEEDS CLARIFICATION - Core feature requirement]
+**Testability:** [Define how this will be verified]
+
+**FR-002:** [NEEDS CLARIFICATION - Core feature requirement]
+**Testability:** [Define how this will be verified]
+
+### User Interface Requirements
+
+**FR-003:** [NEEDS CLARIFICATION - UI requirement]
+**Testability:** [Define how this will be verified]
+
+**FR-004:** [NEEDS CLARIFICATION - UI requirement]
+**Testability:** [Define how this will be verified]
+
+### Data Requirements
+
+**FR-005:** [NEEDS CLARIFICATION - Data requirement]
+**Testability:** [Define how this will be verified]
+
+**FR-006:** [NEEDS CLARIFICATION - Data requirement]
+**Testability:** [Define how this will be verified]
+
+### Integration Requirements
+
+**FR-007:** [NEEDS CLARIFICATION - Integration requirement]
+**Testability:** [Define how this will be verified]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION - Response time requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-002:** [NEEDS CLARIFICATION - Throughput requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-003:** [NEEDS CLARIFICATION - Load capacity requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+### Security Requirements
+
+**NFR-004:** [NEEDS CLARIFICATION - Authentication requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-005:** [NEEDS CLARIFICATION - Authorization requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-006:** [NEEDS CLARIFICATION - Data protection requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+### Scalability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION - Concurrent users requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-008:** [NEEDS CLARIFICATION - Data volume requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+### Reliability Requirements
+
+**NFR-009:** [NEEDS CLARIFICATION - Uptime requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-010:** [NEEDS CLARIFICATION - Error rate requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+### Usability Requirements
+
+**NFR-011:** [NEEDS CLARIFICATION - Accessibility requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-012:** [NEEDS CLARIFICATION - User learning curve requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+### Maintainability Requirements
+
+**NFR-013:** [NEEDS CLARIFICATION - Support requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+---
+
+## 5. Edge Cases and Error Scenarios
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION - Minimum input boundary]
+**Expected Behavior:** [Define system response]
+
+**EC-002:** [NEEDS CLARIFICATION - Maximum input boundary]
+**Expected Behavior:** [Define system response]
+
+**EC-003:** [NEEDS CLARIFICATION - Empty/null state]
+**Expected Behavior:** [Define system response]
+
+### Error Scenarios
+
+**ES-001:** [NEEDS CLARIFICATION - Invalid input error]
+**Expected Behavior:** [Define error handling and user feedback]
+
+**ES-002:** [NEEDS CLARIFICATION - System unavailability error]
+**Expected Behavior:** [Define error handling and user feedback]
+
+**ES-003:** [NEEDS CLARIFICATION - Timeout error]
+**Expected Behavior:** [Define error handling and user feedback]
+
+**ES-004:** [NEEDS CLARIFICATION - Permission denied error]
+**Expected Behavior:** [Define error handling and user feedback]
+
+### Concurrent Operations
+
+**CO-001:** [NEEDS CLARIFICATION - Simultaneous access scenario]
+**Expected Behavior:** [Define system response]
+
+**CO-002:** [NEEDS CLARIFICATION - Race condition scenario]
+**Expected Behavior:** [Define system response]
+
+### Data Integrity
+
+**DI-001:** [NEEDS CLARIFICATION - Data validation failure]
+**Expected Behavior:** [Define system response]
+
+**DI-002:** [NEEDS CLARIFICATION - Data corruption scenario]
+**Expected Behavior:** [Define system response]
+
+---
+
+## 6. Success Criteria
+
+### Functional Success Metrics
+
+**SC-001:** [NEEDS CLARIFICATION - Feature adoption metric]
+**Target:** [Define specific, measurable target]
+**Measurement Method:** [Define how this will be measured]
+
+**SC-002:** [NEEDS CLARIFICATION - Feature completion metric]
+**Target:** [Define specific, measurable target]
+**Measurement Method:** [Define how this will be measured]
+
+### User Experience Metrics
+
+**SC-003:** [NEEDS CLARIFICATION - User satisfaction metric]
+**Target:** [Define specific, measurable target]
+**Measurement Method:** [Define how this will be measured]
+
+**SC-004:** [NEEDS CLARIFICATION - Task completion rate metric]
+**Target:** [Define specific, measurable target]
+**Measurement Method:** [Define how this will be measured]
+
+**SC-005:** [NEEDS CLARIFICATION - Time to complete task metric]
+**Target:** [Define specific, measurable target]
+**Measurement Method:** [Define how this will be measured]
+
+### Performance Metrics
+
+**SC-006:** [NEEDS CLARIFICATION - Response time metric]
+
+- **Target:** [Define specific, measurable target]
+- **Measurement Method:** [Define how this will be measured]
+
+**SC-007:** [NEEDS CLARIFICATION - Error rate metric]
+
+- **Target:** [Define specific, measurable target]
+- **Measurement Method:** [Define how this will be measured]
+
+### Business Metrics
+
+**SC-008:** [NEEDS CLARIFICATION - ROI or business value metric]
+
+- **Target:** [Define specific, measurable target]
+- **Measurement Method:** [Define how this will be measured]
+
+**SC-009:** [NEEDS CLARIFICATION - Cost efficiency metric]
+
+- **Target:** [Define specific, measurable target]
+- **Measurement Method:** [Define how this will be measured]
+
+---
+
+## 7. Assumptions and Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - List assumptions about user behavior, environment, etc.]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - List dependencies on other systems, teams, or features]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - List known constraints (budget, time, resources, etc.)]
+
+---
+
+## 8. Out of Scope
+
+The following items are explicitly **NOT** included in this feature:
+
+- [NEEDS CLARIFICATION - List items that are out of scope]
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION - What are the specific user requirements for this feature?]
+2. [NEEDS CLARIFICATION - Who are the target users and their roles?]
+3. [NEEDS CLARIFICATION - What problem is this feature solving?]
+4. [NEEDS CLARIFICATION - What are the business goals and KPIs?]
+5. [NEEDS CLARIFICATION - Are there any existing systems this needs to integrate with?]
+6. [NEEDS CLARIFICATION - What are the compliance or regulatory requirements?]
+7. [NEEDS CLARIFICATION - What is the expected launch timeline?]
+
+---
+
+## Document Control
+
+**Version:** 0.1 (Draft - Awaiting Requirements)
+**Last Updated:** [Current Date]
+**Author:** [TBD]
+**Stakeholders:** [TBD]
+**Review Status:** Awaiting User Requirements
+
+---
+
+## Next Steps
+
+1. **Gather Requirements:** Conduct stakeholder interviews to define feature requirements
+2. **Populate Specification:** Fill in all [NEEDS CLARIFICATION] sections with specific details
+3. **Review & Validate:** Share with stakeholders for feedback and approval
+4. **Refine & Finalize:** Incorporate feedback and create final specification
+5. **Handoff:** Share approved specification with development team
+
+---
+
+**Note:** This is a template specification document. Please provide specific user requirements to generate a complete, actionable feature specification.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/docs/AI-EVALS-WORKPLAN.md b/evals/docs/AI-EVALS-WORKPLAN.md
new file mode 100644
index 0000000000..a2593a0b35
--- /dev/null
+++ b/evals/docs/AI-EVALS-WORKPLAN.md
@@ -0,0 +1,147 @@
+# AI Evals Work Plan for Agentic SDLC Spec Kit
+
+**Purpose:** Apply proven AI evaluation practices to our Agentic SDLC Spec Kit project.
+
+---
+
+## Implementation Progress (MVP Approach)
+
+**Strategy:** Implement minimum viable features for each step, expand later as time permits.
+
+### Current Status Summary
+
+**Overall Progress:** 5/5 core weeks completed (100%) ✅
+
+| Phase | Status | Pass Rate |
+|-------|--------|-----------|
+| Week 1: Error Analysis Foundation | ✅ Complete | **Plan Analysis: 100% (2/2)** |
+| Week 2-3: Custom Annotation Tool | ✅ Complete | - |
+| Week 4: Extend PromptFoo | ✅ Complete | **100% (10/10 tests)** |
+| Week 5: GitHub Actions CI/CD | ✅ Complete | - |
+| Week 5-6: Production Monitoring | Optional | - |
+
+**Latest Evaluation Results:**
+- **Spec Template Tests:** 8/8 passed (100%)
+- **Plan Template Tests:** 2/2 passed (100%)
+- **Overall:** 10/10 tests passing (100%)
+
+### Week 1: Error Analysis Foundation ✅ **COMPLETED**
+- **Directory structure** created: `evals/notebooks/`, `evals/datasets/`
+- **Test data generation script** created: `evals/scripts/generate-test-data.sh`
+- **Error analysis notebook** created: `evals/notebooks/error-analysis.ipynb`
+- **Environment setup** using uv: `evals/scripts/run-error-analysis.sh`
+- **Automated error analysis for specs** created: `evals/scripts/run-auto-error-analysis.sh`
+- **Automated error analysis for plans** created: `evals/scripts/run-auto-plan-analysis.sh`
+- **Real test specs generated** - 17 specs in `evals/datasets/real-specs/`
+- **Real test plans generated** - 2 plans in `evals/datasets/real-plans/` (expandable)
+- **First error analysis session** - Completed through PromptFoo evaluation runs
+- **Plan error analysis results** - 100% pass rate on initial test cases
+- **Document findings** - Results documented in this file and README
+
+### Week 2-3: Custom Annotation Tool ✅ **COMPLETED**
+- Basic FastHTML annotation app (MVP)
+- Keyboard shortcuts (N, P, 1, 2)
+- Export to JSON
+- Progress tracking with statistics
+- Auto-save functionality
+- Beautiful markdown rendering
+- Notes support for each spec
+- Launch script in evals/scripts/
+
+### Week 4: Extend PromptFoo ✅ **COMPLETED**
+- **Ran comprehensive evaluation** - All 10 tests executed
+- **Identified failures** - 3 initial failures documented
+- **Fixed prompts iteratively** - 2 rounds of refinements
+- **Achieved 100% pass rate** - 10/10 tests passing
+
+### Week 5-6: Production Monitoring **TODO (NOT MANDATORY)**
+- [ ] Async evaluation script (vanilla Python)
+- [ ] Simple alerting mechanism
+
+### Week 5: GitHub Actions Integration ✅ **COMPLETED**
+- [x] Add GitHub Action for running evals on PR
+- [x] Add threshold checks to fail PR if quality drops
+- [x] Add automated reporting
+- [x] Create comprehensive setup documentation
+- [x] Add status badge to README
+- [x] Configure PR commenting for results
+
+---
+
+## Table of Contents
+
+1. [Tool Strategy & Current State](#1-tool-strategy--current-state) ← **START HERE**
+2. [Foundation & Strategy](#2-foundation--strategy)
+3. [Evaluation Infrastructure](#3-evaluation-infrastructure)
+4. [Data & Datasets](#4-data--datasets)
+5. [Error Analysis & Debugging](#5-error-analysis--debugging)
+6. [Evaluators & Metrics](#6-evaluators--metrics)
+7. [RAG & Retrieval Evaluation](#7-rag--retrieval-evaluation)
+8. [Agentic Workflows](#8-agentic-workflows)
+9. [CI/CD & Production Monitoring](#9-cicd--production-monitoring) ← **GitHub Actions + Optional Production**
+10. [Team & Process](#10-team--process)
+11. [Tools & Vendors](#11-tools--vendors)
+
+---
+
+## 1. Tool Strategy & Current State
+
+### Hybrid Tool Strategy
+
+Use **different tools for different purposes**:
+
+| Purpose | Tool | Why | Priority |
+|---------|------|-----|----------|
+| **Error Analysis** | Jupyter Notebooks | Flexible, exploratory, visualization | **CRITICAL** |
+| **Fast Annotation** | Custom FastHTML App | 10x faster than unassisted review | **HIGH** |
+| **CI/CD Template Testing** | PromptFoo (keep existing) | Fast, deterministic regression tests | ✅ **Keep** |
+| **Production Monitoring** | Vanilla Python + Vendor | Async, streaming, alerting | **HIGH** |
+| **LLM-as-Judge Building** | Notebooks → PromptFoo | Prototype → Productionize | **MEDIUM** |
+
+### What NOT to Do ❌
+
+Per the FAQ, **avoid these common mistakes**:
+
+1. ❌ **Don't rely only on PromptFoo**
+2. ❌ **Don't use generic eval metrics from PromptFoo**
+3. ❌ **Don't skip error analysis**
+4. ❌ **Don't build annotation UI in PromptFoo**
+
+### Quick Start: What to Do Now
+
+**Week 1 (START HERE):**
+```bash
+# 1. Generate REAL test data (not synthetic prompts)
+mkdir -p evals/datasets/real-specs
+for i in {1..100}; do
+ # Use diverse prompts
+ specify init "test-$i" --ai claude
+ cp test-$i/.specify/specs/*/spec.md evals/datasets/real-specs/spec-$i.md
+done
+
+# 2. Create error analysis notebook
+jupyter lab evals/notebooks/error-analysis.ipynb
+# - Load 100 real specs
+# - Domain expert notes failures (open coding)
+# - Categorize into taxonomy (axial coding)
+# - Count failure modes
+
+# 3. Keep running PromptFoo for CI
+npx promptfoo eval # This tests TEMPLATES, not real outputs
+```
+
+**Week 2-3:**
+```bash
+# Build custom annotation tool
+cd evals/annotation-tool
+uv venv
+uv pip install python-fasthtml  # PyPI package name for FastHTML
+python app.py # Start annotation server
+```
+
+**Week 4:**
+```bash
+# Extend PromptFoo based on discovered failures
+# Add new tests to evals/configs/promptfooconfig-spec.js or promptfooconfig-plan.js
+# Add new graders to evals/graders/custom_graders.py
+```
diff --git a/evals/docs/AI-Evals-Frequently-Asked-Questions.pdf b/evals/docs/AI-Evals-Frequently-Asked-Questions.pdf
new file mode 100644
index 0000000000..18a00b44fa
Binary files /dev/null and b/evals/docs/AI-Evals-Frequently-Asked-Questions.pdf differ
diff --git a/evals/docs/EVAL.md b/evals/docs/EVAL.md
new file mode 100644
index 0000000000..39a4054f88
--- /dev/null
+++ b/evals/docs/EVAL.md
@@ -0,0 +1,265 @@
+# Annotation Evaluation System
+
+This document explains the annotation evaluation framework for testing spec-kit template quality.
+
+## What are the Annotation Evals?
+
+The annotation evals are a **multi-layered evaluation framework** for testing the quality of spec-kit templates (specifications and implementation plans) that are generated by AI prompts. The system has three main components:
+
+## 1. Automated Testing (PromptFoo)
+
+**10 automated regression tests** that run against the spec and plan generation templates.
+
+**Current Status:** 100% pass rate (10/10 tests passing)
+
+### Test Categories
+
+#### Spec Template Tests (8 tests)
+- **Basic Structure**: Validates required sections (Overview, Requirements, User Stories, etc.)
+- **No Premature Tech Stack**: Ensures spec focuses on WHAT, not HOW
+- **Quality User Stories**: Checks for proper format and acceptance criteria
+- **Clarity & Vague Terms**: Flags unmeasurable requirements needing quantification
+- **Security Requirements**: Security-critical features include security considerations
+- **Edge Cases Coverage**: Validates error scenarios and boundary conditions
+- **Completeness**: Comprehensive requirements for complex features
+- **Regression**: Even simple features maintain proper structure
+
+#### Plan Template Tests (2 tests)
+- **Simplicity Gate**: Simple apps should have ≤3 projects (Constitution Article VII)
+- **Constitution Compliance**: No over-engineering or unnecessary abstractions
+
+### Running Automated Tests
+
+```bash
+# Run all tests
+./evals/scripts/run-promptfoo-eval.sh
+
+# Run with JSON output
+./evals/scripts/run-promptfoo-eval.sh --json
+
+# Run and open web UI
+./evals/scripts/run-promptfoo-eval.sh --view
+
+# Filter specific tests
+./evals/scripts/run-promptfoo-eval.sh --filter "Spec Template"
+```
+
+## 2. Custom Annotation Tool
+
+Located in `evals/annotation-tool/`, this is a **FastHTML-based web interface** for manual spec review, designed to be roughly 10x faster than reviewing raw files by hand.
+
+### Features
+- **Keyboard-driven interface**: N (next), P (previous), 1 (pass), 2 (fail)
+- **Progress tracking**: Visual statistics on review progress
+- **Note-taking**: Add observations for each spec
+- **Auto-save**: Annotations automatically saved to JSON
+- **Export**: Timestamped exports for analysis
+
+### Purpose
+- Human reviewers manually evaluate generated specs for quality
+- Binary pass/fail assessment (no Likert scales)
+- Identify failure patterns that automated tests miss
+- Build understanding of quality dimensions
+
+### Running the Annotation Tool
+
+```bash
+# Launch the web interface
+./evals/scripts/run-annotation-tool.sh
+
+# Open browser to http://localhost:5001
+```
+
+### Output
+Annotations are saved to:
+- `evals/annotation-tool/annotations.json` - Current state (auto-saved)
+- `evals/annotation-tool/annotations_export_YYYYMMDD_HHMMSS.json` - Timestamped exports
+
+## 3. Error Analysis Workflows
+
+Two approaches for deeper investigation of failures:
+
+### a) Automated Analysis (using Claude API)
+
+Uses Claude API to automatically evaluate specs/plans and categorize failures.
+
+```bash
+# For specs
+./evals/scripts/run-auto-error-analysis.sh
+
+# For plans
+./evals/scripts/run-auto-plan-analysis.sh
+```
+
+**Features:**
+- Evaluates all specs automatically using Claude
+- Binary pass/fail with reasoning
+- Categorizes failures automatically
+- Generates comprehensive CSV reports and summaries
+- Saves time on initial review
+
+**Output:**
+- `evals/datasets/analysis-results/automated-analysis-<timestamp>.csv`
+- `evals/datasets/analysis-results/summary-<timestamp>.txt`
+- `evals/datasets/analysis-results/plan-analysis-<timestamp>.csv`
+- `evals/datasets/analysis-results/plan-summary-<timestamp>.txt`
+
+### b) Manual Analysis (using Jupyter)
+
+Deep investigation following qualitative coding methodology.
+
+```bash
+# Launch Jupyter Lab
+./evals/scripts/run-error-analysis.sh
+```
+
+**Process:**
+1. **Open Coding** - Domain expert reviews 10-20 real specs/plans, notes issues without categorization
+2. **Axial Coding** - Group similar failures into categories, count frequency
+3. **Fix & Iterate** - Address high-frequency failures, add automated checks
+
+**Location:** `evals/notebooks/error-analysis.ipynb`
+
+## What Do They Test?
+
+The evaluations test the quality of **AI-generated specifications and implementation plans**.
+
+### Test Data
+
+- **17 diverse spec templates**: `evals/datasets/real-specs/spec-001.md` through `spec-017.md`
+- **2 plan templates**: `evals/datasets/real-plans/plan-001.md` and `plan-002.md` (expandable)
+
+### Quality Criteria
+
+#### For Specifications
+- Required sections present (Overview, Requirements, User Stories, etc.)
+- No premature technical decisions (focuses on WHAT, not HOW)
+- Proper user story format with acceptance criteria
+- Measurable requirements (flags vague terms)
+- Security considerations for security-critical features
+- Edge cases and error scenarios
+- Completeness for complex features
+- Proper structure even for simple features
+
+#### For Implementation Plans
+- **Simplicity gate**: ≤3 projects for simple apps (CRITICAL - Constitution Article VII)
+- No over-engineering or microservices for simple apps
+- Clear project boundaries and tasks
+- Testing strategy included
+- Appropriate architecture
+- No premature optimization
+- Testability and verification steps
+
+### Common Failure Categories
+
+**For Specs:**
+- Incomplete requirements
+- Ambiguous specifications
+- Missing acceptance criteria
+- Security considerations missing
+- Over-engineering indicators
+- Vague or unmeasurable terms
+- Missing edge cases
+
+**For Plans:**
+- Too many projects (>3)
+- Over-engineering
+- Unclear project boundaries
+- Missing verification steps
+- Microservices for simple app
+- Premature optimization
+- Missing testing strategy
+- Tech stack mismatch
+
+## The Complete Workflow
+
+```
+1. Generate Specs/Plans (using prompt templates)
+        ↓
+2. PromptFoo Tests (automated regression checks)
+        ↓
+3. Manual Annotation Tool (human review, pass/fail)
+        ↓
+4. Error Analysis (categorize failures, find patterns)
+        ↓
+5. Fix Templates (address high-frequency failures)
+        ↓
+6. Add Automated Checks (extend PromptFoo)
+        ↓
+7. Repeat (continuous improvement)
+```
+
+## The 80/20 Rule
+
+According to AI evaluation best practices:
+
+> "We spent 60-80% of our development time on error analysis and evaluation. Expect most of your effort to go toward understanding failures (i.e. looking at data) rather than building automated checks."
+
+**80% of value comes from:**
+1. Manual annotation tool for human review
+2. Error analysis (Jupyter notebooks + automated analysis)
+3. PromptFoo for CI/CD regression testing
+
+**20% of value comes from:**
+- Production monitoring (planned)
+- Advanced features (clustering, AI assistance)
+
+## Key Insight
+
+**Manual review via the annotation tool is the most valuable activity**, with automated tests serving as regression checks to prevent known failures from recurring. The workflow emphasizes:
+
+1. Human judgment first (annotation tool)
+2. Pattern discovery (error analysis)
+3. Automation second (PromptFoo tests)
+
+This mirrors industry best practices where teams spend 60-80% of evaluation time on manual review and understanding failures rather than building automated checks.
+
+## Directory Structure
+
+```
+evals/
+├── EVAL.md                          # This file - evaluation system overview
+├── README.md                        # Setup and usage instructions
+├── configs/                         # PromptFoo configuration files
+│   ├── promptfooconfig.js           # Main config (all 10 tests)
+│   ├── promptfooconfig-spec.js      # Spec template tests only
+│   └── promptfooconfig-plan.js      # Plan template tests only
+├── prompts/                         # Templates under test
+│   ├── spec-prompt.txt              # Specification generation template
+│   └── plan-prompt.txt              # Implementation plan template
+├── graders/                         # Custom evaluation logic
+│   └── custom_graders.py            # Python-based quality checks
+├── scripts/                         # Test execution utilities
+│   ├── run-promptfoo-eval.sh        # PromptFoo test runner
+│   ├── run-error-analysis.sh        # Jupyter error analysis launcher
+│   ├── run-auto-error-analysis.sh   # Automated spec analysis
+│   ├── run-auto-plan-analysis.sh    # Automated plan analysis
+│   └── run-annotation-tool.sh       # Annotation tool launcher
+├── annotation-tool/                 # FastHTML annotation interface
+│   ├── app.py                       # Web application
+│   ├── README.md                    # Tool documentation
+│   └── annotations.json             # Saved annotations
+├── notebooks/                       # Jupyter notebooks
+│   └── error-analysis.ipynb         # Manual review workflow
+└── datasets/                        # Test data and results
+    ├── real-specs/                  # Generated specs (17 templates)
+    ├── real-plans/                  # Generated plans (2 templates)
+    └── analysis-results/            # Analysis outputs (CSV, summaries)
+```
+
+## Quick Reference
+
+| Task | Command |
+|------|---------|
+| Run all automated tests | `./evals/scripts/run-promptfoo-eval.sh` |
+| Launch annotation tool | `./evals/scripts/run-annotation-tool.sh` |
+| Run automated spec analysis | `./evals/scripts/run-auto-error-analysis.sh` |
+| Run automated plan analysis | `./evals/scripts/run-auto-plan-analysis.sh` |
+| Manual error analysis | `./evals/scripts/run-error-analysis.sh` |
+| View test results in browser | `./evals/scripts/run-promptfoo-eval.sh --view` |
+
+## See Also
+
+- [README.md](../README.md) - Complete setup and configuration guide
+- [annotation-tool/README.md](../annotation-tool/README.md) - Annotation tool detailed documentation
+- [AI-EVALS-WORKPLAN.md](AI-EVALS-WORKPLAN.md) - Implementation roadmap
\ No newline at end of file
diff --git a/evals/docs/GITHUB_ACTIONS_SETUP.md b/evals/docs/GITHUB_ACTIONS_SETUP.md
new file mode 100644
index 0000000000..c644c1e8c6
--- /dev/null
+++ b/evals/docs/GITHUB_ACTIONS_SETUP.md
@@ -0,0 +1,57 @@
+# GitHub Actions Setup for AI Evaluations
+
+This guide explains how to set up GitHub Actions for manual evaluation runs.
+
+## Overview
+
+The GitHub Actions workflow (`.github/workflows/eval.yml`) provides:
+- **Manual execution** via GitHub Actions interface
+- Quality threshold checks (minimum 70% pass rate)
+- Detailed evaluation reports with pass/fail status
+- Result artifacts stored for 30 days
+- On-demand quality validation
+
+## Required Secrets
+
+The workflow requires two secrets to be configured in your GitHub repository:
+
+### 1. LLM_BASE_URL
+
+Your LiteLLM proxy URL or other LLM provider API base URL.
+
+### 2. LLM_AUTH_TOKEN
+
+Your authentication token for the API.
+
+## Setting Up Secrets
+
+1. Go to your repository on GitHub
+2. Click on **Settings** (top menu)
+3. In the left sidebar, click **Secrets and variables** ā **Actions**
+4. Click **New repository secret**
+5. Enter the secret name (e.g., `LLM_BASE_URL`)
+6. Paste the secret value
+7. Click **Add secret**
+
+Repeat for both `LLM_BASE_URL` and `LLM_AUTH_TOKEN`.
+
+## Running the Workflow
+
+The workflow is configured for **manual execution only**.
+
+1. Go to **Actions** tab in your repository
+2. Select **AI Evals** workflow from the left sidebar
+3. Click **Run workflow** button (top right)
+4. Select the branch to run against (usually `main`)
+5. (Optional) Enter the model name in the **Model** input field.
+6. Click the green **Run workflow** button
+
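+If you prefer to trigger runs from a script, the same dispatch can be sent through the GitHub REST API. A minimal sketch in Python; it assumes `requests` is installed and a `GITHUB_TOKEN` environment variable holding a token with the `workflow` scope, and the `model` input mirrors the workflow's default:
+
+```python
+# dispatch_eval.py -- trigger the AI Evals workflow without the UI.
+import os
+
+import requests
+
+REPO = "tikalk/agentic-sdlc-spec-kit"  # owner/repo
+WORKFLOW = "eval.yml"                  # workflow file name
+
+resp = requests.post(
+    f"https://api.github.com/repos/{REPO}/actions/workflows/{WORKFLOW}/dispatches",
+    headers={
+        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
+        "Accept": "application/vnd.github+json",
+    },
+    # "ref" is the branch to run against; "inputs" maps to workflow_dispatch inputs
+    json={"ref": "main", "inputs": {"model": "claude-sonnet-4-5-20250929"}},
+)
+resp.raise_for_status()  # GitHub returns 204 No Content on success
+print("Workflow dispatched")
+```
+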
+## Viewing Results
+
+After the workflow completes:
+
+1. Click on the completed workflow run
+2. Click on the **eval** job to see detailed logs
+3. Scroll to **Artifacts** section at the bottom
+4. Download `eval-results` to get detailed JSON reports
+5. View the summary in the workflow logs
diff --git a/evals/docs/LOCAL_TESTING.md b/evals/docs/LOCAL_TESTING.md
new file mode 100644
index 0000000000..94325b4bfa
--- /dev/null
+++ b/evals/docs/LOCAL_TESTING.md
@@ -0,0 +1,463 @@
+# Testing GitHub Actions Locally
+
+Guide for testing the AI Evals workflow locally before pushing to GitHub.
+
+## Quick Start (Easiest Method)
+
+We provide a helper script that handles everything for you:
+
+```bash
+# 1. Install act (if not already installed)
+brew install act # macOS
+# or: curl https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash
+
+# 2. Run the helper script
+./evals/scripts/test-workflow-locally.sh
+
+# The script will:
+# - Check prerequisites (act, Docker)
+# - Create secrets template if needed
+# - Run the workflow locally
+# - Display results
+
+# Options:
+./evals/scripts/test-workflow-locally.sh --list # Dry run (list steps)
+./evals/scripts/test-workflow-locally.sh --verbose # Show details
+./evals/scripts/test-workflow-locally.sh --reuse # Faster iterations
+./evals/scripts/test-workflow-locally.sh --skip-pr-comment # Skip PR comment step
+./evals/scripts/test-workflow-locally.sh --help # Show all options
+```
+
+That's it! The script handles secrets setup, Docker checks, and runs the workflow.
+
+---
+
+## Manual Setup (Advanced)
+
+If you prefer manual control or need to customize the setup:
+
+## Prerequisites
+
+1. **Docker** - Required for `act` to run workflows in containers
+ ```bash
+ # Verify Docker is installed and running
+ docker --version
+ docker ps
+ ```
+
+2. **act** - Tool for running GitHub Actions locally
+ ```bash
+ # macOS
+ brew install act
+
+ # Linux
+ curl https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash
+
+ # Verify installation
+ act --version
+ ```
+
+## Setup
+
+### 1. Create Secrets File
+
+Create `.github/workflows/.secrets` (gitignored):
+
+```bash
+# Create the secrets file
+cat > .github/workflows/.secrets << 'EOF'
+LLM_BASE_URL=your-llm-base-url
+LLM_AUTH_TOKEN=your-api-key
+EOF
+
+# Secure the file
+chmod 600 .github/workflows/.secrets
+```
+
+**Important:** Never commit this file! It's already in `.gitignore`.
+
+### 2. Add to .gitignore
+
+Ensure `.github/workflows/.secrets` is in your `.gitignore`:
+
+```bash
+# Add to .gitignore if not already there
+echo ".github/workflows/.secrets" >> .gitignore
+```
+
+## Running Tests
+
+### Basic Commands
+
+```bash
+# List all jobs and steps (dry run)
+act pull_request --list
+
+# Run the full workflow
+act pull_request --secret-file .github/workflows/.secrets
+
+# Run with verbose output
+act pull_request --secret-file .github/workflows/.secrets -v
+
+# Run specific job
+act pull_request -j eval --secret-file .github/workflows/.secrets
+```
+
+### Simulating Different Events
+
+```bash
+# Simulate pull_request event (default)
+act pull_request --secret-file .github/workflows/.secrets
+
+# Simulate push to main
+act push --secret-file .github/workflows/.secrets
+
+# Simulate schedule (cron)
+act schedule --secret-file .github/workflows/.secrets
+
+# Simulate manual workflow_dispatch
+act workflow_dispatch --secret-file .github/workflows/.secrets
+```
+
+### Advanced Options
+
+```bash
+# Use smaller runner image (faster)
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ -P ubuntu-latest=catthehacker/ubuntu:act-latest
+
+# Run with specific event payload
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ --eventpath .github/workflows/test-event.json
+
+# Skip steps that can't run locally (e.g., PR comments)
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ --job eval \
+ --skip-steps "Comment PR with Results"
+
+# Reuse containers (faster for repeated runs)
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ --reuse
+```
+
+### Using Environment Variables Instead
+
+```bash
+# Export secrets as environment variables
+export LLM_BASE_URL="your-url"
+export LLM_AUTH_TOKEN="your-token"
+
+# Run with -s flag for each secret
+act pull_request \
+ -s LLM_BASE_URL \
+ -s LLM_AUTH_TOKEN
+```
+
+## Testing Specific Scenarios
+
+### Test Only Evaluation Steps
+
+```bash
+# Skip setup, just run evals
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ --matrix ubuntu-latest:catthehacker/ubuntu:act-latest \
+ --job eval
+```
+
+### Test Threshold Failures
+
+Temporarily modify `check_eval_scores.py` to fail:
+
+```bash
+# Edit the threshold to impossible value
+# In .github/workflows/eval.yml, change:
+# --min-pass-rate 0.70
+# To:
+# --min-pass-rate 0.99
+
+# Then test
+act pull_request --secret-file .github/workflows/.secrets
+```
+
+### Test PR Comment Logic
+
+The PR comment step won't work locally (requires GitHub API), but you can verify the summary generation:
+
+```bash
+# Run up to summary generation
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ --skip-steps "Comment PR with Results"
+```
+
+## Limitations of Local Testing
+
+### What Works ✅
+
+- Job execution
+- Step execution
+- Environment variables and secrets
+- Docker container actions
+- Artifacts (stored in local directory)
+- Most shell commands
+- Python/Node.js setup
+
+### What Doesn't Work ❌
+
+- **GitHub API interactions**: PR comments, issue updates
+- **GitHub context**: Some `github.*` variables may be missing
+- **Exact GitHub runner environment**: Uses Docker images that approximate GitHub runners
+- **Caching between runs**: Less effective than GitHub's cache
+- **Concurrent job execution**: Runs sequentially locally
+
+### Workarounds
+
+1. **PR Comments**: Test summary generation, skip actual commenting
+ ```bash
+ act pull_request --secret-file .github/workflows/.secrets --skip-steps "Comment PR"
+ ```
+
+2. **GitHub Context**: Create mock event file
+ ```json
+ {
+ "pull_request": {
+ "number": 123,
+ "head": {
+ "ref": "test-branch"
+ }
+ }
+ }
+ ```
+
+3. **Artifacts**: Check local `.artifacts/` directory
+
+## Debugging
+
+### View Detailed Logs
+
+```bash
+# Maximum verbosity
+act pull_request --secret-file .github/workflows/.secrets -v -v
+
+# Show Docker commands
+act pull_request --secret-file .github/workflows/.secrets --verbose
+```
+
+### Interactive Debugging
+
+```bash
+# Start interactive shell in workflow container
+act pull_request --secret-file .github/workflows/.secrets --shell
+
+# Then manually run commands to debug
+cd /github/workspace
+./evals/scripts/run-promptfoo-eval.sh --json
+```
+
+### Check Container Logs
+
+```bash
+# Keep containers after run
+act pull_request --secret-file .github/workflows/.secrets --reuse
+
+# List containers
+docker ps -a | grep act-
+
+# View container logs
+docker logs <container-id>
+
+# Exec into container
+docker exec -it <container-id> /bin/bash
+```
+
+## Best Practices
+
+### 1. Use Smaller Test Suite Locally
+
+Create a minimal config for local testing:
+
+```bash
+# Create evals/configs/promptfooconfig-test.js
+# With just 1-2 tests for quick iteration
+
+# Modify workflow to use test config locally
+# Or pass --filter flag to run-promptfoo-eval.sh
+```
+
+### 2. Cache Dependencies
+
+```bash
+# Use --reuse flag for faster iterations
+act pull_request --secret-file .github/workflows/.secrets --reuse
+
+# This keeps containers running between tests
+```
+
+### 3. Test Incrementally
+
+```bash
+# Test just the setup steps first
+act pull_request --secret-file .github/workflows/.secrets --list
+
+# Then test evaluation steps
+act pull_request --secret-file .github/workflows/.secrets -j eval
+
+# Finally test full workflow
+act pull_request --secret-file .github/workflows/.secrets
+```
+
+### 4. Use Smaller Runner Image
+
+The default `ubuntu-latest` image is large (~18GB). Use a smaller one:
+
+```bash
+# Create .actrc in repo root
+echo "-P ubuntu-latest=catthehacker/ubuntu:act-latest" > .actrc
+
+# Now act will use smaller image by default
+act pull_request --secret-file .github/workflows/.secrets
+```
+
+## Common Issues
+
+### Issue: Docker Not Running
+
+```
+Error: Cannot connect to Docker daemon
+```
+
+**Solution:**
+```bash
+# Start Docker Desktop (macOS)
+open -a Docker
+
+# Or start Docker service (Linux)
+sudo systemctl start docker
+
+# Verify
+docker ps
+```
+
+### Issue: Secrets Not Found
+
+```
+Error: LLM_BASE_URL not set
+```
+
+**Solution:**
+```bash
+# Verify secrets file exists and has correct format
+cat .github/workflows/.secrets
+
+# Use absolute path
+act pull_request --secret-file $(pwd)/.github/workflows/.secrets
+```
+
+### Issue: Node.js/Python Not Found
+
+```
+Error: node: command not found
+```
+
+**Solution:**
+```bash
+# Use official runner images (larger but more compatible)
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ -P ubuntu-latest=ghcr.io/catthehacker/ubuntu:runner-latest
+```
+
+### Issue: Workflow Takes Too Long
+
+```
+# Runs forever or very slow
+```
+
+**Solution:**
+```bash
+# Use smaller image
+act pull_request --secret-file .github/workflows/.secrets \
+ -P ubuntu-latest=catthehacker/ubuntu:act-latest
+
+# Skip non-essential steps
+act pull_request --secret-file .github/workflows/.secrets \
+ --skip-steps "Upload Results Artifact"
+
+# Run specific job only
+act pull_request -j eval --secret-file .github/workflows/.secrets
+```
+
+## Quick Reference
+
+```bash
+# Complete local test workflow
+# 1. Ensure Docker is running
+docker ps
+
+# 2. Verify secrets file
+cat .github/workflows/.secrets
+
+# 3. List jobs (dry run)
+act pull_request --list
+
+# 4. Run workflow
+act pull_request --secret-file .github/workflows/.secrets
+
+# 5. Check results
+ls -la eval-results*.json
+
+# 6. View summary
+cat eval_summary.txt
+```
+
+## Integration with Development Workflow
+
+### Pre-commit Hook (Optional)
+
+Create `.git/hooks/pre-push`:
+
+```bash
+#!/bin/bash
+echo "Running local GitHub Actions tests..."
+
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ -P ubuntu-latest=catthehacker/ubuntu:act-latest \
+ --quiet
+
+if [ $? -ne 0 ]; then
+ echo "❌ Workflow tests failed. Fix issues before pushing."
+ exit 1
+fi
+
+echo "✅ Workflow tests passed."
+```
+
+Make it executable:
+```bash
+chmod +x .git/hooks/pre-push
+```
+
+## Resources
+
+- **act Documentation**: https://github.com/nektos/act
+- **act Runner Images**: https://github.com/catthehacker/docker_images
+- **GitHub Actions Docs**: https://docs.github.com/en/actions
+- **Docker Installation**: https://docs.docker.com/get-docker/
+
+## Next Steps
+
+After testing locally:
+
+1. ✅ Fix any issues found during local testing
+2. ✅ Push changes to GitHub
+3. ✅ Verify workflow runs successfully on GitHub
+4. ✅ Set up GitHub secrets (if not already done)
+5. ✅ Test with real PR
+
+See [GITHUB_ACTIONS_SETUP.md](GITHUB_ACTIONS_SETUP.md) for production deployment.
diff --git a/evals/docs/QUICK_REFERENCE.md b/evals/docs/QUICK_REFERENCE.md
new file mode 100644
index 0000000000..e363dbccc0
--- /dev/null
+++ b/evals/docs/QUICK_REFERENCE.md
@@ -0,0 +1,137 @@
+# GitHub Actions Eval Workflow - Quick Reference
+
+## Quick Commands
+
+### Local Testing
+
+```bash
+# Install act (first time only)
+brew install act # macOS
+
+# Test workflow locally (easiest)
+./evals/scripts/test-workflow-locally.sh
+
+# With options
+./evals/scripts/test-workflow-locally.sh --list # Dry run
+./evals/scripts/test-workflow-locally.sh --verbose # Detailed output
+./evals/scripts/test-workflow-locally.sh --reuse # Faster iterations
+```
+
+### Manual Testing
+
+```bash
+# Run all evaluations
+./evals/scripts/run-promptfoo-eval.sh
+
+# Run with JSON output
+./evals/scripts/run-promptfoo-eval.sh --json
+
+# Run specific tests
+./evals/scripts/run-promptfoo-eval.sh --filter "Spec"
+
+# Check scores
+python3 evals/scripts/check_eval_scores.py \
+ --results eval-results.json \
+ --min-pass-rate 0.70
+```
+
+## Important Files
+
+| File | Purpose |
+|------|---------|
+| `.github/workflows/eval.yml` | Main GitHub Actions workflow |
+| `.github/workflows/.secrets` | Local testing secrets (gitignored) |
+| `evals/configs/promptfooconfig.js` | All evaluation tests |
+| `evals/scripts/check_eval_scores.py` | Threshold validation |
+| `evals/scripts/run-promptfoo-eval.sh` | Evaluation runner |
+| `evals/scripts/test-workflow-locally.sh` | Local testing helper |
+
+## Setup Checklist
+
+### For GitHub Actions (Production)
+
+- [ ] Add `LLM_BASE_URL` secret to GitHub
+- [ ] Add `LLM_AUTH_TOKEN` secret to GitHub
+- [ ] Run workflow manually from Actions tab
+- [ ] (Optional) Set `LLM_MODEL` in the workflow dispatch UI.
+- [ ] Verify results in workflow logs
+- [ ] Download and review artifacts
+
+**Guide:** See [GITHUB_ACTIONS_SETUP.md](GITHUB_ACTIONS_SETUP.md)
+
+### For Local Testing
+
+- [ ] Install `act`: `brew install act`
+- [ ] Ensure Docker is running
+- [ ] Run: `./evals/scripts/test-workflow-locally.sh`
+- [ ] Edit `.github/workflows/.secrets` with your values
+- [ ] Run again to test
+
+**Guide:** See [LOCAL_TESTING.md](LOCAL_TESTING.md)
+
+## Workflow Triggers
+
+| Trigger | When | Purpose |
+|---------|------|---------|
+| **workflow_dispatch** | Manual trigger only | On-demand quality validation |
+
+**Note:** The workflow does NOT run automatically on PRs, pushes, or schedules.
+Run manually from GitHub Actions tab when you need quality validation.
+
+## Quality Thresholds
+
+| Metric | Threshold | Action if Failed |
+|--------|-----------|------------------|
+| Average Score | ≥ 0.70 | ❌ Workflow fails |
+| Pass Rate | ≥ 70% | ❌ Workflow fails |
+| Overall | 10/10 tests pass | ✅ Currently passing |
+
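+The check itself is simple arithmetic over PromptFoo's stats block. A minimal sketch of the idea (field names follow the PromptFoo JSON the workflow consumes; this is not the exact `check_eval_scores.py` code):
+
+```python
+# threshold_check.py -- recompute the pass-rate gate from eval-results.json.
+import json
+
+with open("eval-results.json") as f:
+    stats = json.load(f)["results"]["stats"]
+
+passed, failed = stats["successes"], stats["failures"]
+total = passed + failed
+pass_rate = passed / total if total else 0.0
+
+print(f"Pass rate: {pass_rate:.0%} ({passed}/{total})")
+if pass_rate < 0.70:
+    raise SystemExit("Quality threshold not met (pass rate < 70%)")
+```
+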
+## Troubleshooting
+
+### Workflow Fails Locally
+
+```bash
+# Check Docker is running
+docker ps
+
+# Verify secrets
+cat .github/workflows/.secrets
+
+# Run with verbose output
+./evals/scripts/test-workflow-locally.sh --verbose
+
+# Check logs
+act pull_request --secret-file .github/workflows/.secrets -v
+```
+
+### Workflow Fails on GitHub
+
+- Check **Actions** tab for error logs
+- Verify secrets are set in repository settings
+- Test locally first with `act`
+- Ensure API credentials are valid
+
+## Learning Resources
+
+| Topic | Link |
+|-------|------|
+| **Full Setup Guide** | [GITHUB_ACTIONS_SETUP.md](GITHUB_ACTIONS_SETUP.md) |
+| **Local Testing** | [LOCAL_TESTING.md](LOCAL_TESTING.md) |
+| **Eval Framework** | [README.md](../README.md) |
+| **act Documentation** | https://github.com/nektos/act |
+| **GitHub Actions** | https://docs.github.com/en/actions |
+
+## Getting Help
+
+1. Check [LOCAL_TESTING.md](LOCAL_TESTING.md) troubleshooting section
+2. Check [GITHUB_ACTIONS_SETUP.md](GITHUB_ACTIONS_SETUP.md) common issues
+3. Review workflow logs in GitHub Actions tab
+4. Open an issue: https://github.com/tikalk/agentic-sdlc-spec-kit/issues
+
+## ✅ Current Status
+
+[![AI Evals](https://github.com/tikalk/agentic-sdlc-spec-kit/actions/workflows/eval.yml/badge.svg)](https://github.com/tikalk/agentic-sdlc-spec-kit/actions/workflows/eval.yml)
+
+- **Pass Rate:** 100% (10/10 tests)
+- **Status:** ✅ Production ready
+- **Last Updated:** 2026-01-14
diff --git a/evals/docs/WORKFLOWS.md b/evals/docs/WORKFLOWS.md
new file mode 100644
index 0000000000..8eda0d1f80
--- /dev/null
+++ b/evals/docs/WORKFLOWS.md
@@ -0,0 +1,258 @@
+# Advanced Workflows
+
+This document provides detailed information on the error analysis and annotation workflows used in the Spec-Kit Evaluation Framework.
+
+## Error Analysis Workflow
+
+PromptFoo provides automated regression testing, but **error analysis on real outputs** is the most important evaluation activity. According to AI evaluation best practices:
+
+> "We spent 60-80% of our development time on error analysis and evaluation. Expect most of your effort to go toward understanding failures (i.e. looking at data) rather than building automated checks."
+
+### Quick Start
+
+```bash
+# 1. Generate test data
+cd evals/scripts
+./generate-test-data.sh # Creates 17 diverse test case templates
+
+# 2. Run error analysis workflow (sets up environment + launches Jupyter)
+./run-error-analysis.sh
+
+# 3. Run error analysis session (30-60 minutes)
+# - Load specs
+# - Review and annotate (pass/fail, issues, categories)
+# - Categorize failures
+# - Prioritize fixes
+```
+
+### Directory Structure
+
+```text
+evals/
+├── notebooks/                          # Error analysis
+│   ├── error-analysis.ipynb            # Main analysis notebook (manual)
+│   └── .venv/                          # Virtual environment
+├── scripts/                            # Automation scripts
+│   ├── run-promptfoo-eval.sh           # PromptFoo test runner
+│   ├── run-error-analysis.sh           # Manual error analysis (Jupyter)
+│   ├── run-auto-error-analysis.sh      # Automated error analysis for specs (Claude API)
+│   ├── run-automated-error-analysis.py # Python script for spec automation
+│   ├── run-auto-plan-analysis.sh       # Automated error analysis for plans
+│   ├── run-automated-plan-analysis.py  # Python script for plan automation
+│   ├── generate-real-plans.py          # Generate plan test data
+│   └── check_eval_scores.py            # Score validation
+└── datasets/                           # Test data
+    ├── real-specs/                     # Generated specs for review (17 templates)
+    ├── real-plans/                     # Generated plans for review (2 templates, expandable)
+    └── analysis-results/               # Analysis output (CSV, summaries)
+        ├── automated-analysis-*.csv    # Automated spec eval results
+        ├── plan-analysis-*.csv         # Automated plan eval results
+        ├── plan-eval-analysis-*.txt    # Plan evaluation analysis reports
+        ├── summary-*.txt               # Summary reports
+        └── error-analysis-results.csv  # Manual review results
+```
+
+### The 80/20 Rule
+
+**80% of value** comes from:
+
+1. ✅ Jupyter notebooks for error analysis (manual review)
+2. ✅ Custom annotation tool (FastHTML app)
+3. ✅ PromptFoo for CI/CD (already working)
+
+**20% of value** comes from:
+
+- Production monitoring (planned - Week 5-6)
+- Advanced features (clustering, AI assistance)
+
+### Error Analysis Process
+
+You can run error analysis in two ways:
+
+#### Option 1: Automated Analysis (Using Claude API)
+
+Uses Claude API to automatically evaluate specs and categorize failures:
+
+```bash
+# Run automated error analysis (uses default model)
+./evals/scripts/run-auto-error-analysis.sh
+
+# Use a specific model
+./evals/scripts/run-auto-error-analysis.sh --model claude-opus-4-5-20251101
+
+# Requirements:
+# - ANTHROPIC_API_KEY environment variable set
+# - Generated specs in evals/datasets/real-specs/
+
+# Output:
+# - Detailed CSV: evals/datasets/analysis-results/automated-analysis-<timestamp>.csv
+# - Summary report: evals/datasets/analysis-results/summary-<timestamp>.txt
+```
+
+**Features:**
+
+- Evaluates all specs automatically using Claude
+- Binary pass/fail with reasoning
+- Categorizes failures (incomplete requirements, ambiguous specs, etc.)
+- Generates comprehensive reports
+- Saves time on initial review
+
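+Each spec receives a single binary judgment from Claude. A minimal sketch of that call using the Anthropic Python SDK -- the prompt wording and model name are illustrative, and the real logic lives in `run-automated-error-analysis.py`:
+
+```python
+# judge_spec.py -- sketch of the automated pass/fail evaluation of one spec.
+from pathlib import Path
+
+import anthropic
+
+client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
+
+spec = Path("evals/datasets/real-specs/spec-001.md").read_text()
+message = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=512,
+    messages=[{
+        "role": "user",
+        "content": (
+            "Evaluate this specification. Reply PASS or FAIL on the first "
+            "line, then one failure category and a short reason.\n\n" + spec
+        ),
+    }],
+)
+print(message.content[0].text)  # e.g. "FAIL\nAmbiguous specifications: ..."
+```
+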
+#### Option 2: Manual Analysis (Using Jupyter Notebook)
+
+Traditional error analysis workflow for deep investigation:
+
+```bash
+# Launch Jupyter Lab
+./evals/scripts/run-error-analysis.sh
+
+# In Jupyter:
+# 1. Load specs from datasets/real-specs/
+# 2. Review and annotate manually
+# 3. Categorize failures
+# 4. Export results
+```
+
+**Process:**
+
+1. **Open Coding** (Week 1)
+ - Domain expert reviews 10-20 real specs/plans
+ - Notes issues without categorization yet
+ - Binary pass/fail (no Likert scales)
+
+2. **Axial Coding** (Week 1-2)
+ - Group similar failures into categories
+ - Count frequency of each failure mode
+ - Prioritize by impact
+
+3. **Fix & Iterate** (Ongoing)
+ - Fix high-frequency failure modes
+ - Add automated checks to PromptFoo
+ - Re-run error analysis monthly
+
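+The axial-coding step above reduces to a frequency count once each annotation carries a category. A sketch using pandas -- the CSV name matches the manual review output listed in the directory structure, while the `status` and `failure_category` column names are assumptions about its layout:
+
+```python
+# failure_frequency.py -- count failure modes from manual review results.
+import pandas as pd
+
+df = pd.read_csv("evals/datasets/analysis-results/error-analysis-results.csv")
+
+# Tally categories among failed specs, most frequent first
+failures = df[df["status"] == "fail"]
+counts = failures["failure_category"].value_counts()
+print(counts)
+
+# Fix the top few categories first, then add PromptFoo checks for them
+print("Top priorities:", list(counts.head(3).index))
+```
+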
+### Plan Error Analysis (NEW)
+
+In addition to spec evaluation, we now support error analysis for **implementation plans**.
+
+#### Quick Start - Plan Analysis
+
+```bash
+# 1. Generate plan test data
+cd evals/scripts
+LLM_BASE_URL="your-url" \
+LLM_AUTH_TOKEN="your-key" \
+../.venv/bin/python generate-real-plans.py  # venv lives at evals/.venv
+
+# 2. Run automated plan analysis (uses default model)
+export ANTHROPIC_API_KEY="your-anthropic-key"
+./run-auto-plan-analysis.sh
+
+# Or use a specific model
+./run-auto-plan-analysis.sh --model claude-opus-4-5-20251101
+
+# Output:
+# - evals/datasets/analysis-results/plan-analysis-<timestamp>.csv
+# - evals/datasets/analysis-results/plan-summary-<timestamp>.txt
+```
+
+#### Plan Analysis Features
+
+**Evaluation Criteria:**
+
+- **Simplicity Gate**: ≤3 projects (CRITICAL - Constitution compliance)
+- **Completeness**: All necessary components and phases defined
+- **Clarity**: Project boundaries, tasks, and milestones clear
+- **Appropriateness**: Simple architecture, no over-engineering
+- **Constitution Compliance**: No microservices for simple apps, no premature optimization
+- **Testability**: Testing strategy and verification steps included
+
+**Common Failure Categories for Plans:**
+
+- Too many projects (>3)
+- Over-engineering
+- Missing verification steps
+- Unclear project boundaries
+- Microservices for simple app
+- Premature optimization
+- Missing testing strategy
+- Tech stack mismatch
+- Incomplete milestones
+
+**Current Status:**
+
+- Plan evaluation integrated with PromptFoo (100% pass rate on 2 test cases)
+- Error analysis infrastructure ready for expansion
+- Support for both automated (Claude API) and manual review workflows
+
+## Custom Annotation Tool
+
+A fast, keyboard-driven web interface for reviewing generated specs, designed to be roughly **10x faster than reviewing raw files by hand**.
+
+### Quick Start
+
+```bash
+# Run the annotation tool
+./evals/scripts/run-annotation-tool.sh
+```
+
+Open your browser to `http://localhost:5001` and start reviewing specs.
+
+### Features
+
+- **Keyboard shortcuts**: N (next), P (previous), 1 (pass), 2 (fail)
+- **Progress tracking**: Visual progress bar with statistics
+- **Notes**: Add observations for each spec
+- **Auto-save**: Annotations saved automatically to JSON
+- **Export**: Export all annotations with timestamps
+
+### Workflow
+
+1. Review the spec displayed on the page
+2. Evaluate quality (structure, completeness, clarity)
+3. Add notes about any issues (optional)
+4. Press **1** for Pass or **2** for Fail
+5. Tool automatically advances to next spec
+6. Click "Export JSON" when done
+
+### Output
+
+Annotations are saved to:
+
+- `evals/annotation-tool/annotations.json` - Auto-saved current state
+- `evals/annotation-tool/annotations_export_YYYYMMDD_HHMMSS.json` - Timestamped exports
+
+Example output structure:
+
+```json
+{
+ "exported_at": "2026-01-08T14:30:00",
+ "statistics": {
+ "total": 17,
+ "passed": 12,
+ "failed": 3,
+ "pending": 2,
+ "progress": 88.2
+ },
+ "annotations": {
+ "spec-001.md": {
+ "status": "pass",
+ "notes": "Good structure, all sections present",
+ "timestamp": "2026-01-08T14:25:00"
+ }
+ }
+}
+```
+
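+Because the export is plain JSON, post-processing takes only a few lines. A minimal sketch for summarizing an export with the structure shown above (the filename is illustrative):
+
+```python
+# summarize_annotations.py -- roll up an annotation export.
+import json
+
+with open("annotations_export_20260108_143000.json") as f:
+    export = json.load(f)
+
+stats = export["statistics"]
+print(f"Reviewed {stats['total']} specs: {stats['passed']} passed, "
+      f"{stats['failed']} failed, {stats['pending']} pending")
+
+# List failing specs with reviewer notes for follow-up
+for name, ann in export["annotations"].items():
+    if ann["status"] == "fail":
+        print(f"- {name}: {ann.get('notes') or '(no notes)'}")
+```
+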
+### What to Look For
+
+Common failure patterns to note during review:
+
+- Missing required sections
+- Vague or unmeasurable requirements
+- Premature technical decisions
+- Missing acceptance criteria
+- Incomplete user stories
+- Security considerations missing
+- Over-engineering indicators
+
+See [annotation-tool/README.md](../annotation-tool/README.md) for detailed documentation.
diff --git a/evals/graders/custom_graders.py b/evals/graders/custom_graders.py
new file mode 100644
index 0000000000..31e90628b1
--- /dev/null
+++ b/evals/graders/custom_graders.py
@@ -0,0 +1,383 @@
+"""
+Custom graders for Spec-Kit evaluation.
+
+These graders check specific quality criteria for specifications and plans
+generated by the spec-kit templates.
+"""
+
+
+def check_security_completeness(output: str, context: dict) -> dict:
+ """
+ Check if security-critical features include proper security requirements.
+
+ Args:
+ output: The generated specification or plan text
+ context: Additional context (unused but required by PromptFoo)
+
+ Returns:
+ dict with 'pass', 'score', and 'reason' keys
+ """
+ required_security_topics = [
+ 'authentication',
+ 'authorization',
+ 'encryption',
+ 'session management',
+ 'data protection',
+ 'input validation'
+ ]
+
+ output_lower = output.lower()
+ found_topics = [
+ topic for topic in required_security_topics
+ if topic in output_lower
+ ]
+
+ score = len(found_topics) / len(required_security_topics)
+
+ return {
+ 'pass': score >= 0.5, # At least 50% of security topics
+ 'score': score,
+ 'reason': f'Found {len(found_topics)}/{len(required_security_topics)} security topics: {", ".join(found_topics) if found_topics else "none"}'
+ }
+
+
+def check_simplicity_gate(output: str, context: dict) -> dict:
+ """
+ Check if plan follows simplicity gate (Article VII: ≤3 projects).
+
+ Args:
+ output: The generated plan text
+ context: Additional context (unused but required by PromptFoo)
+
+ Returns:
+ dict with 'pass', 'score', and 'reason' keys
+ """
+ # Count unique projects in the plan
+ # Look for patterns like "Project 1:", "## Project", etc.
+ import re
+
+ # Extract project numbers to avoid counting duplicates
+ # Pattern: "Project" followed by a number
+ project_number_pattern = re.compile(r'Project\s+(\d+)', re.IGNORECASE)
+ project_numbers = project_number_pattern.findall(output)
+
+ # Get unique project numbers
+ unique_projects = set(project_numbers)
+ project_count = len(unique_projects)
+
+ # If no numbered projects found, look for "Project Structure" section
+ # and try to extract count from table or list
+ if project_count == 0:
+ # Look for table format: "| Project 1" or "| **Project 1"
+ table_project_pattern = re.compile(r'\|\s*\*?\*?Project\s+(\d+)', re.IGNORECASE)
+ table_numbers = table_project_pattern.findall(output)
+ if table_numbers:
+ unique_projects = set(table_numbers)
+ project_count = len(unique_projects)
+
+ # If still no projects found, look for explicit project count in text
+ if project_count == 0:
+ count_pattern = re.compile(r'(\d+)\s+projects?', re.IGNORECASE)
+ count_matches = count_pattern.findall(output)
+ if count_matches:
+ # Take the first explicit count mentioned
+ project_count = int(count_matches[0])
+ else:
+ # Assume single project if nothing found
+ project_count = 1
+
+ passed = project_count <= 3
+ score = 1.0 if passed else max(0, 1 - (project_count - 3) * 0.2)
+
+ return {
+ 'pass': passed,
+ 'score': score,
+ 'reason': f'Found {project_count} projects (expected ≤3 for simplicity)'
+ }
+
+
+def check_constitution_compliance(output: str, context: dict) -> dict:
+ """
+ Check if plan violates constitution principles.
+
+ Checks:
+ - Article VII: Simplicity (ā¤3 projects)
+ - Article VIII: Anti-Abstraction (no unnecessary wrappers)
+ - Over-engineering detection
+
+ Args:
+ output: The generated plan text
+ context: Additional context (unused but required by PromptFoo)
+
+ Returns:
+ dict with 'pass', 'score', and 'reason' keys
+ """
+ violations = []
+ output_lower = output.lower()
+
+ # Check simplicity gate
+ simplicity_result = check_simplicity_gate(output, context)
+ if not simplicity_result['pass']:
+ violations.append(simplicity_result['reason'])
+
+ # Check for over-engineering patterns (context-aware)
+ # Only flag if NOT in a negative context (e.g., "no microservices", "avoiding kubernetes")
+ import re
+
+ over_engineering_terms = [
+ 'microservices',
+ 'kubernetes',
+ 'k8s',
+ 'service mesh',
+ 'event sourcing',
+ 'cqrs',
+ 'saga pattern',
+ 'message queue' if 'simple' in context.get('vars', {}).get('user_input', '').lower() else None
+ ]
+ over_engineering_terms = [t for t in over_engineering_terms if t] # Remove None
+
+ found_overengineering = []
+ for term in over_engineering_terms:
+ if term not in output_lower:
+ continue
+
+ # Check if term is in a negative context
+ # Look for patterns like "no X", "avoid X", "not X", "without X"
+ negative_patterns = [
+ rf'\b(no|avoid|avoiding|not|without)\s+\w*\s*{re.escape(term)}',
+ rf'{re.escape(term)}\s*\w*\s*(avoided|rejected|unnecessary)'
+ ]
+
+ is_negative = False
+ for pattern in negative_patterns:
+ if re.search(pattern, output_lower, re.IGNORECASE):
+ is_negative = True
+ break
+
+ # Only flag if NOT in negative context
+ if not is_negative:
+ found_overengineering.append(term)
+
+ if found_overengineering:
+ violations.append(f"Over-engineering detected: {', '.join(found_overengineering)}")
+
+ # Check for unnecessary abstractions/wrappers
+ abstraction_terms = [
+ 'wrapper',
+ 'facade',
+ 'adapter layer',
+ 'abstraction layer'
+ ]
+
+ found_abstractions = [
+ term for term in abstraction_terms
+ if term in output_lower
+ ]
+
+ if found_abstractions:
+ violations.append(f"Unnecessary abstractions: {', '.join(found_abstractions)}")
+
+ # Calculate score
+ if not violations:
+ score = 1.0
+ else:
+ # Deduct 0.3 per violation, minimum 0
+ score = max(0, 1.0 - len(violations) * 0.3)
+
+ return {
+ 'pass': score >= 0.7,
+ 'score': score,
+ 'reason': '; '.join(violations) if violations else 'Constitution compliant'
+ }
+
+
+def check_vague_terms(output: str, context: dict) -> dict:
+ """
+ Check for vague, unmeasurable terms that need clarification.
+
+ Args:
+ output: The generated specification text
+ context: Additional context (unused but required by PromptFoo)
+
+ Returns:
+ dict with 'pass', 'score', and 'reason' keys
+ """
+ import re
+
+ vague_terms = [
+ 'fast',
+ 'quick',
+ 'scalable',
+ 'secure',
+ 'intuitive',
+ 'robust',
+ 'performant',
+ 'user-friendly',
+ 'easy',
+ 'simple',
+ 'good performance',
+ 'high availability'
+ ]
+
+ output_lower = output.lower()
+ vague_found = [term for term in vague_terms if term in output_lower]
+
+ if not vague_found:
+ return {
+ 'pass': True,
+ 'score': 1.0,
+ 'reason': 'No vague terms found'
+ }
+
+ # Check if vague terms are quantified or flagged
+ quantified_count = 0
+ for term in vague_found:
+ # Look for the term followed by quantification or clarification markers
+ term_index = output_lower.find(term)
+ if term_index == -1:
+ continue
+
+ # Check 200 chars after the term
+ context_window = output_lower[term_index:term_index + 200]
+
+ # Check for quantification patterns
+ quantification_patterns = [
+ r'\d+\s*(ms|milliseconds|seconds|minutes)', # time
+ r'\d+\s*(mb|gb|requests|users)', # size/count
+ r'<\s*\d+', # less than X
+ r'>\s*\d+', # greater than X
+ r'\[needs clarification\]',
+ r'\[tbd\]',
+ r'\[todo\]'
+ ]
+
+ if any(re.search(pattern, context_window) for pattern in quantification_patterns):
+ quantified_count += 1
+
+ quantified_ratio = quantified_count / len(vague_found) if vague_found else 1.0
+
+ return {
+ 'pass': quantified_ratio >= 0.7,
+ 'score': quantified_ratio,
+ 'reason': f'Found {len(vague_found)} vague terms, {quantified_count} properly quantified/flagged'
+ }
+
+
+def check_edge_cases_coverage(output: str, context: dict) -> dict:
+ """
+ Check if edge cases section has comprehensive coverage.
+
+ Args:
+ output: The generated specification text
+ context: Additional context with vars (user_input)
+
+ Returns:
+ dict with 'pass', 'score', and 'reason' keys
+ """
+
+ output_lower = output.lower()
+
+ # Define categories of edge cases to check
+ edge_case_categories = {
+ 'boundary_values': [
+ 'empty', 'min', 'max', 'limit', 'boundary', 'zero', 'negative',
+ 'very large', 'exceed'
+ ],
+ 'invalid_inputs': [
+ 'invalid', 'malformed', 'incorrect', 'wrong', 'unsupported',
+ 'malicious', 'corrupt'
+ ],
+ 'network_failures': [
+ 'network', 'timeout', 'connection', 'disconnect', 'offline',
+ 'latency', 'fail'
+ ],
+ 'concurrent_actions': [
+ 'concurrent', 'simultaneous', 'parallel', 'race condition',
+ 'multiple users'
+ ],
+ 'state_issues': [
+ 'session', 'expire', 'recovery', 'rollback', 'partial',
+ 'inconsistent', 'state'
+ ]
+ }
+
+ # Count how many categories are covered
+ covered_categories = 0
+ found_terms = []
+
+ for category, terms in edge_case_categories.items():
+ for term in terms:
+ if term in output_lower:
+ covered_categories += 1
+ found_terms.append(f"{category}: {term}")
+ break # Count each category only once
+
+ # Calculate score based on coverage
+ total_categories = len(edge_case_categories)
+ score = covered_categories / total_categories
+
+ # Pass if at least 3 out of 5 categories covered
+ passed = covered_categories >= 3
+
+ return {
+ 'pass': passed,
+ 'score': score,
+ 'reason': f'Covered {covered_categories}/{total_categories} edge case categories ({", ".join(found_terms[:3])}{"..." if len(found_terms) > 3 else ""})'
+ }
+
+
+def check_testability(output: str, context: dict) -> dict:
+ """
+ Check if requirements are testable with clear acceptance criteria.
+
+ Args:
+ output: The generated specification text
+ context: Additional context (unused but required by PromptFoo)
+
+ Returns:
+ dict with 'pass', 'score', and 'reason' keys
+ """
+ # Look for acceptance criteria patterns
+ import re
+
+ # Find user stories (should have acceptance criteria)
+ user_story_pattern = re.compile(r'\*\*As a .+?\*\*', re.IGNORECASE)
+ user_stories = user_story_pattern.findall(output)
+
+ if not user_stories:
+ return {
+ 'pass': False,
+ 'score': 0.0,
+ 'reason': 'No user stories found'
+ }
+
+ # Look for acceptance criteria after each user story
+ acceptance_patterns = [
+ r'acceptance criteria',
+ r'given .+? when .+? then', # BDD format
+ r'should .+?',
+ r'must .+?',
+ r'verify that',
+ r'confirm that'
+ ]
+
+ stories_with_criteria = 0
+ for story in user_stories:
+ # Find the story position
+ story_index = output.lower().find(story.lower())
+ if story_index == -1:
+ continue
+
+ # Check next 500 chars for acceptance criteria
+ context_window = output.lower()[story_index:story_index + 500]
+
+ if any(re.search(pattern, context_window, re.IGNORECASE) for pattern in acceptance_patterns):
+ stories_with_criteria += 1
+
+ testability_ratio = stories_with_criteria / len(user_stories)
+
+ return {
+ 'pass': testability_ratio >= 0.7,
+ 'score': testability_ratio,
+ 'reason': f'{stories_with_criteria}/{len(user_stories)} user stories have testable acceptance criteria'
+ }
diff --git a/evals/notebooks/error-analysis.ipynb b/evals/notebooks/error-analysis.ipynb
new file mode 100644
index 0000000000..2bbb2d6e14
--- /dev/null
+++ b/evals/notebooks/error-analysis.ipynb
@@ -0,0 +1,998 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Error Analysis Notebook\n",
+ "\n",
+ "**Purpose:** Manual review and error analysis of generated specs/plans\n",
+ "\n",
+ "**Process:**\n",
+ "1. **Open Coding:** Review outputs and note issues\n",
+ "2. **Axial Coding:** Group similar failures into categories\n",
+ "3. **Prioritization:** Count frequency and prioritize fixes\n",
+ "\n",
+ "**Time:** 30-60 minutes per session\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ā Setup complete\n",
+ "š Dataset directory: ../datasets/real-specs\n",
+ "š Results directory: ../datasets/analysis-results\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Setup\n",
+ "import os\n",
+ "import glob\n",
+ "import pandas as pd\n",
+ "from pathlib import Path\n",
+ "import re\n",
+ "\n",
+ "# Configuration\n",
+ "DATASET_DIR = Path('../datasets/real-specs')\n",
+ "RESULTS_DIR = Path('../datasets/analysis-results')\n",
+ "RESULTS_DIR.mkdir(exist_ok=True)\n",
+ "\n",
+ "print(f\"ā Setup complete\")\n",
+ "print(f\"š Dataset directory: {DATASET_DIR}\")\n",
+ "print(f\"š Results directory: {RESULTS_DIR}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Load Test Specs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "š Found 17 spec files\n",
+ "\n",
+ "ā Loaded 17 specs into dataframe\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "